{
"best_metric": 0.00065021,
"best_model_checkpoint": "/mnt/si0001694oxp/default/vlm_sft/outputs/output/deepseek-vl-7b-chat/v32-20250613-154734/checkpoint-3000",
"epoch": 1.0993219717793659,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003665017408832692,
"grad_norm": 16.33839225769043,
"learning_rate": 9.999999631609428e-06,
"loss": 3.0264194011688232,
"memory(GiB)": 149.2,
"step": 1,
"token_acc": 0.4675925925925926,
"train_speed(iter/s)": 0.033836
},
{
"epoch": 0.001832508704416346,
"grad_norm": 6.188778400421143,
"learning_rate": 9.999990790238409e-06,
"loss": 0.6836232542991638,
"memory(GiB)": 158.4,
"step": 5,
"token_acc": 0.8406651231319722,
"train_speed(iter/s)": 0.042635
},
{
"epoch": 0.003665017408832692,
"grad_norm": 0.4842391312122345,
"learning_rate": 9.999963160987561e-06,
"loss": 0.05034670829772949,
"memory(GiB)": 158.4,
"step": 10,
"token_acc": 0.9856121161127471,
"train_speed(iter/s)": 0.04407
},
{
"epoch": 0.005497526113249038,
"grad_norm": 0.1827951818704605,
"learning_rate": 9.99991711234924e-06,
"loss": 0.01651783734560013,
"memory(GiB)": 158.4,
"step": 15,
"token_acc": 0.9920074036681811,
"train_speed(iter/s)": 0.044607
},
{
"epoch": 0.007330034817665384,
"grad_norm": 0.09308009594678879,
"learning_rate": 9.999852644493086e-06,
"loss": 0.014441253244876861,
"memory(GiB)": 158.4,
"step": 20,
"token_acc": 0.9914947368421053,
"train_speed(iter/s)": 0.044685
},
{
"epoch": 0.00916254352208173,
"grad_norm": 0.13165982067584991,
"learning_rate": 9.999769757656593e-06,
"loss": 0.013714964687824249,
"memory(GiB)": 158.4,
"step": 25,
"token_acc": 0.9919225915018931,
"train_speed(iter/s)": 0.044897
},
{
"epoch": 0.010995052226498075,
"grad_norm": 0.136412113904953,
"learning_rate": 9.999668452145104e-06,
"loss": 0.010563116520643234,
"memory(GiB)": 158.4,
"step": 30,
"token_acc": 0.9947824623411596,
"train_speed(iter/s)": 0.04502
},
{
"epoch": 0.012827560930914422,
"grad_norm": 0.2637465298175812,
"learning_rate": 9.999548728331825e-06,
"loss": 0.008089790493249894,
"memory(GiB)": 158.4,
"step": 35,
"token_acc": 0.9959606160060591,
"train_speed(iter/s)": 0.045028
},
{
"epoch": 0.014660069635330768,
"grad_norm": 0.2768152952194214,
"learning_rate": 9.999410586657801e-06,
"loss": 0.005358598381280899,
"memory(GiB)": 158.4,
"step": 40,
"token_acc": 0.9978118161925602,
"train_speed(iter/s)": 0.045061
},
{
"epoch": 0.016492578339747113,
"grad_norm": 0.09677782654762268,
"learning_rate": 9.999254027631938e-06,
"loss": 0.003943501785397529,
"memory(GiB)": 158.4,
"step": 45,
"token_acc": 0.9986528584659425,
"train_speed(iter/s)": 0.044994
},
{
"epoch": 0.01832508704416346,
"grad_norm": 0.3623986840248108,
"learning_rate": 9.99907905183098e-06,
"loss": 0.0031241703778505324,
"memory(GiB)": 158.4,
"step": 50,
"token_acc": 0.9987373737373737,
"train_speed(iter/s)": 0.04505
},
{
"epoch": 0.020157595748579806,
"grad_norm": 0.496895432472229,
"learning_rate": 9.998885659899524e-06,
"loss": 0.002511710487306118,
"memory(GiB)": 158.4,
"step": 55,
"token_acc": 0.9988217471806093,
"train_speed(iter/s)": 0.045107
},
{
"epoch": 0.02199010445299615,
"grad_norm": 0.1918005645275116,
"learning_rate": 9.998673852550007e-06,
"loss": 0.002556230500340462,
"memory(GiB)": 158.4,
"step": 60,
"token_acc": 0.9994104270192875,
"train_speed(iter/s)": 0.045164
},
{
"epoch": 0.0238226131574125,
"grad_norm": 0.16670851409435272,
"learning_rate": 9.998443630562707e-06,
"loss": 0.0034642994403839113,
"memory(GiB)": 158.4,
"step": 65,
"token_acc": 0.9989904938167746,
"train_speed(iter/s)": 0.045187
},
{
"epoch": 0.025655121861828844,
"grad_norm": 0.04445331171154976,
"learning_rate": 9.99819499478574e-06,
"loss": 0.00226197075098753,
"memory(GiB)": 158.4,
"step": 70,
"token_acc": 0.9994109231675503,
"train_speed(iter/s)": 0.045194
},
{
"epoch": 0.02748763056624519,
"grad_norm": 0.13421526551246643,
"learning_rate": 9.997927946135055e-06,
"loss": 0.0026616916060447694,
"memory(GiB)": 158.4,
"step": 75,
"token_acc": 0.998989558773998,
"train_speed(iter/s)": 0.04522
},
{
"epoch": 0.029320139270661537,
"grad_norm": 0.09873384982347488,
"learning_rate": 9.997642485594436e-06,
"loss": 0.0017027700319886207,
"memory(GiB)": 158.4,
"step": 80,
"token_acc": 0.9993260887878022,
"train_speed(iter/s)": 0.04525
},
{
"epoch": 0.03115264797507788,
"grad_norm": 0.03224126249551773,
"learning_rate": 9.997338614215492e-06,
"loss": 0.0017118226736783982,
"memory(GiB)": 158.4,
"step": 85,
"token_acc": 0.9993263725159987,
"train_speed(iter/s)": 0.04528
},
{
"epoch": 0.032985156679494226,
"grad_norm": 0.3803243637084961,
"learning_rate": 9.997016333117655e-06,
"loss": 0.0019580798223614694,
"memory(GiB)": 158.4,
"step": 90,
"token_acc": 0.9993265993265993,
"train_speed(iter/s)": 0.045299
},
{
"epoch": 0.034817665383910575,
"grad_norm": 0.3237900733947754,
"learning_rate": 9.996675643488177e-06,
"loss": 0.002880098670721054,
"memory(GiB)": 158.4,
"step": 95,
"token_acc": 0.9990737622094982,
"train_speed(iter/s)": 0.045329
},
{
"epoch": 0.03665017408832692,
"grad_norm": 0.1465182900428772,
"learning_rate": 9.99631654658213e-06,
"loss": 0.0028293343260884286,
"memory(GiB)": 158.4,
"step": 100,
"token_acc": 0.9990743857287109,
"train_speed(iter/s)": 0.045355
},
{
"epoch": 0.038482682792743264,
"grad_norm": 0.24748782813549042,
"learning_rate": 9.995939043722388e-06,
"loss": 0.0018339043483138085,
"memory(GiB)": 158.4,
"step": 105,
"token_acc": 0.9994106255788499,
"train_speed(iter/s)": 0.045379
},
{
"epoch": 0.04031519149715961,
"grad_norm": 0.04621001332998276,
"learning_rate": 9.995543136299636e-06,
"loss": 0.0019403379410505295,
"memory(GiB)": 158.4,
"step": 110,
"token_acc": 0.9994108735903047,
"train_speed(iter/s)": 0.045398
},
{
"epoch": 0.04214770020157596,
"grad_norm": 0.06725554913282394,
"learning_rate": 9.995128825772365e-06,
"loss": 0.0010762955993413926,
"memory(GiB)": 158.4,
"step": 115,
"token_acc": 0.9995792308339645,
"train_speed(iter/s)": 0.045421
},
{
"epoch": 0.0439802089059923,
"grad_norm": 0.16836291551589966,
"learning_rate": 9.99469611366685e-06,
"loss": 0.0029191805049777033,
"memory(GiB)": 158.4,
"step": 120,
"token_acc": 0.9990743857287109,
"train_speed(iter/s)": 0.045438
},
{
"epoch": 0.04581271761040865,
"grad_norm": 0.19015128910541534,
"learning_rate": 9.994245001577163e-06,
"loss": 0.0029153132811188696,
"memory(GiB)": 158.4,
"step": 125,
"token_acc": 0.9988206553786538,
"train_speed(iter/s)": 0.045454
},
{
"epoch": 0.047645226314825,
"grad_norm": 0.2672649919986725,
"learning_rate": 9.993775491165157e-06,
"loss": 0.0028599994257092476,
"memory(GiB)": 158.4,
"step": 130,
"token_acc": 0.9989905787348586,
"train_speed(iter/s)": 0.045477
},
{
"epoch": 0.04947773501924134,
"grad_norm": 0.09613120555877686,
"learning_rate": 9.993287584160462e-06,
"loss": 0.001117743458598852,
"memory(GiB)": 158.4,
"step": 135,
"token_acc": 0.9996634129922585,
"train_speed(iter/s)": 0.045484
},
{
"epoch": 0.05131024372365769,
"grad_norm": 0.08400937169790268,
"learning_rate": 9.992781282360486e-06,
"loss": 0.0014099805615842343,
"memory(GiB)": 158.4,
"step": 140,
"token_acc": 0.9995794785534062,
"train_speed(iter/s)": 0.045497
},
{
"epoch": 0.053142752428074036,
"grad_norm": 0.2961122989654541,
"learning_rate": 9.992256587630392e-06,
"loss": 0.0026107219979166984,
"memory(GiB)": 158.4,
"step": 145,
"token_acc": 0.9993264292329713,
"train_speed(iter/s)": 0.045509
},
{
"epoch": 0.05497526113249038,
"grad_norm": 0.11588957160711288,
"learning_rate": 9.991713501903107e-06,
"loss": 0.0020393442362546923,
"memory(GiB)": 158.4,
"step": 150,
"token_acc": 0.9991583908432924,
"train_speed(iter/s)": 0.045518
},
{
"epoch": 0.056807769836906725,
"grad_norm": 0.04025767371058464,
"learning_rate": 9.991152027179307e-06,
"loss": 0.001108243688941002,
"memory(GiB)": 158.4,
"step": 155,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.045528
},
{
"epoch": 0.058640278541323074,
"grad_norm": 0.26148226857185364,
"learning_rate": 9.990572165527413e-06,
"loss": 0.003043392114341259,
"memory(GiB)": 158.4,
"step": 160,
"token_acc": 0.9991581074254925,
"train_speed(iter/s)": 0.045537
},
{
"epoch": 0.060472787245739415,
"grad_norm": 0.02609323337674141,
"learning_rate": 9.989973919083576e-06,
"loss": 0.003145371749997139,
"memory(GiB)": 158.4,
"step": 165,
"token_acc": 0.9989058160087535,
"train_speed(iter/s)": 0.045548
},
{
"epoch": 0.06230529595015576,
"grad_norm": 0.08112650364637375,
"learning_rate": 9.989357290051681e-06,
"loss": 0.0019015805795788766,
"memory(GiB)": 158.4,
"step": 170,
"token_acc": 0.9991585324806462,
"train_speed(iter/s)": 0.045556
},
{
"epoch": 0.06413780465457211,
"grad_norm": 0.012307146564126015,
"learning_rate": 9.98872228070333e-06,
"loss": 0.0017634263262152673,
"memory(GiB)": 158.4,
"step": 175,
"token_acc": 0.9994951619688683,
"train_speed(iter/s)": 0.045559
},
{
"epoch": 0.06597031335898845,
"grad_norm": 0.22926685214042664,
"learning_rate": 9.988068893377841e-06,
"loss": 0.0008580862544476986,
"memory(GiB)": 158.4,
"step": 180,
"token_acc": 0.9996634413125789,
"train_speed(iter/s)": 0.045562
},
{
"epoch": 0.06780282206340481,
"grad_norm": 0.07493411749601364,
"learning_rate": 9.987397130482224e-06,
"loss": 0.001726461760699749,
"memory(GiB)": 158.4,
"step": 185,
"token_acc": 0.9994107744107744,
"train_speed(iter/s)": 0.045574
},
{
"epoch": 0.06963533076782115,
"grad_norm": 0.11616482585668564,
"learning_rate": 9.986706994491194e-06,
"loss": 0.0020760688930749893,
"memory(GiB)": 158.4,
"step": 190,
"token_acc": 0.999494779386999,
"train_speed(iter/s)": 0.04558
},
{
"epoch": 0.07146783947223749,
"grad_norm": 0.1130843311548233,
"learning_rate": 9.985998487947143e-06,
"loss": 0.003568219020962715,
"memory(GiB)": 158.4,
"step": 195,
"token_acc": 0.9988221436984688,
"train_speed(iter/s)": 0.045588
},
{
"epoch": 0.07330034817665385,
"grad_norm": 0.03086119331419468,
"learning_rate": 9.985271613460144e-06,
"loss": 0.0014082181267440319,
"memory(GiB)": 158.4,
"step": 200,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.045593
},
{
"epoch": 0.07513285688107019,
"grad_norm": 0.10936316847801208,
"learning_rate": 9.984526373707933e-06,
"loss": 0.0023099591955542563,
"memory(GiB)": 158.4,
"step": 205,
"token_acc": 0.999242615501136,
"train_speed(iter/s)": 0.045599
},
{
"epoch": 0.07696536558548653,
"grad_norm": 0.17849738895893097,
"learning_rate": 9.983762771435902e-06,
"loss": 0.0017316842451691628,
"memory(GiB)": 158.4,
"step": 210,
"token_acc": 0.9995793016407236,
"train_speed(iter/s)": 0.0456
},
{
"epoch": 0.07879787428990288,
"grad_norm": 0.07379074394702911,
"learning_rate": 9.982980809457088e-06,
"loss": 0.001504539605230093,
"memory(GiB)": 158.4,
"step": 215,
"token_acc": 0.99949499200404,
"train_speed(iter/s)": 0.045601
},
{
"epoch": 0.08063038299431922,
"grad_norm": 0.20956623554229736,
"learning_rate": 9.982180490652165e-06,
"loss": 0.001286138966679573,
"memory(GiB)": 158.4,
"step": 220,
"token_acc": 0.9997476022211005,
"train_speed(iter/s)": 0.045606
},
{
"epoch": 0.08246289169873557,
"grad_norm": 0.36039137840270996,
"learning_rate": 9.981361817969433e-06,
"loss": 0.0015822691842913628,
"memory(GiB)": 158.4,
"step": 225,
"token_acc": 0.999494779386999,
"train_speed(iter/s)": 0.045612
},
{
"epoch": 0.08429540040315192,
"grad_norm": 0.05167197808623314,
"learning_rate": 9.9805247944248e-06,
"loss": 0.0016318798065185548,
"memory(GiB)": 158.4,
"step": 230,
"token_acc": 0.9994951194883878,
"train_speed(iter/s)": 0.045618
},
{
"epoch": 0.08612790910756826,
"grad_norm": 0.0602310486137867,
"learning_rate": 9.979669423101784e-06,
"loss": 0.0017338620498776435,
"memory(GiB)": 158.4,
"step": 235,
"token_acc": 0.9992421690804985,
"train_speed(iter/s)": 0.045622
},
{
"epoch": 0.0879604178119846,
"grad_norm": 0.03006557747721672,
"learning_rate": 9.978795707151492e-06,
"loss": 0.0005913118831813336,
"memory(GiB)": 158.4,
"step": 240,
"token_acc": 0.9997476659096644,
"train_speed(iter/s)": 0.045626
},
{
"epoch": 0.08979292651640096,
"grad_norm": 0.1851363480091095,
"learning_rate": 9.977903649792606e-06,
"loss": 0.0013333003968000411,
"memory(GiB)": 158.4,
"step": 245,
"token_acc": 0.9995793016407236,
"train_speed(iter/s)": 0.04562
},
{
"epoch": 0.0916254352208173,
"grad_norm": 0.16427940130233765,
"learning_rate": 9.976993254311385e-06,
"loss": 0.0022492580115795135,
"memory(GiB)": 158.4,
"step": 250,
"token_acc": 0.999326259053394,
"train_speed(iter/s)": 0.045566
},
{
"epoch": 0.09345794392523364,
"grad_norm": 0.07113044708967209,
"learning_rate": 9.976064524061637e-06,
"loss": 0.0023244613781571387,
"memory(GiB)": 158.4,
"step": 255,
"token_acc": 0.9994107744107744,
"train_speed(iter/s)": 0.04552
},
{
"epoch": 0.09529045262965,
"grad_norm": 0.0672680214047432,
"learning_rate": 9.975117462464716e-06,
"loss": 0.0020451253280043603,
"memory(GiB)": 158.4,
"step": 260,
"token_acc": 0.9994105263157895,
"train_speed(iter/s)": 0.045509
},
{
"epoch": 0.09712296133406634,
"grad_norm": 0.09312908351421356,
"learning_rate": 9.974152073009506e-06,
"loss": 0.0018878720700740814,
"memory(GiB)": 158.4,
"step": 265,
"token_acc": 0.9994954167017072,
"train_speed(iter/s)": 0.045482
},
{
"epoch": 0.09895547003848268,
"grad_norm": 0.06397019326686859,
"learning_rate": 9.973168359252411e-06,
"loss": 0.0020165286958217623,
"memory(GiB)": 158.4,
"step": 270,
"token_acc": 0.9994108735903047,
"train_speed(iter/s)": 0.045476
},
{
"epoch": 0.10078797874289903,
"grad_norm": 0.15306073427200317,
"learning_rate": 9.972166324817338e-06,
"loss": 0.0017529357224702834,
"memory(GiB)": 158.4,
"step": 275,
"token_acc": 0.9997474109623642,
"train_speed(iter/s)": 0.045455
},
{
"epoch": 0.10262048744731538,
"grad_norm": 0.13208770751953125,
"learning_rate": 9.971145973395685e-06,
"loss": 0.001645715907216072,
"memory(GiB)": 158.4,
"step": 280,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.045452
},
{
"epoch": 0.10445299615173172,
"grad_norm": 0.0297766774892807,
"learning_rate": 9.97010730874633e-06,
"loss": 0.0012823720462620258,
"memory(GiB)": 158.4,
"step": 285,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.045422
},
{
"epoch": 0.10628550485614807,
"grad_norm": 0.16176588833332062,
"learning_rate": 9.969050334695619e-06,
"loss": 0.001742975413799286,
"memory(GiB)": 158.4,
"step": 290,
"token_acc": 0.9995788764423482,
"train_speed(iter/s)": 0.045417
},
{
"epoch": 0.10811801356056441,
"grad_norm": 0.10822831094264984,
"learning_rate": 9.967975055137335e-06,
"loss": 0.002227822504937649,
"memory(GiB)": 158.4,
"step": 295,
"token_acc": 0.9994103773584906,
"train_speed(iter/s)": 0.045373
},
{
"epoch": 0.10995052226498075,
"grad_norm": 0.1328648328781128,
"learning_rate": 9.966881474032711e-06,
"loss": 0.0017272233963012695,
"memory(GiB)": 158.4,
"step": 300,
"token_acc": 0.9994105759514988,
"train_speed(iter/s)": 0.045362
},
{
"epoch": 0.11178303096939711,
"grad_norm": 0.11945555359125137,
"learning_rate": 9.965769595410395e-06,
"loss": 0.0011399961076676846,
"memory(GiB)": 158.4,
"step": 305,
"token_acc": 0.9995791954216462,
"train_speed(iter/s)": 0.045363
},
{
"epoch": 0.11361553967381345,
"grad_norm": 0.2175164371728897,
"learning_rate": 9.964639423366442e-06,
"loss": 0.0025836611166596413,
"memory(GiB)": 158.4,
"step": 310,
"token_acc": 0.9990738401953355,
"train_speed(iter/s)": 0.045357
},
{
"epoch": 0.11544804837822979,
"grad_norm": 0.035975273698568344,
"learning_rate": 9.963490962064297e-06,
"loss": 0.0006968880537897348,
"memory(GiB)": 158.4,
"step": 315,
"token_acc": 0.9997475385003787,
"train_speed(iter/s)": 0.04536
},
{
"epoch": 0.11728055708264615,
"grad_norm": 0.14850489795207977,
"learning_rate": 9.962324215734782e-06,
"loss": 0.0017726331949234008,
"memory(GiB)": 158.4,
"step": 320,
"token_acc": 0.999242615501136,
"train_speed(iter/s)": 0.045365
},
{
"epoch": 0.11911306578706249,
"grad_norm": 0.03455163165926933,
"learning_rate": 9.96113918867608e-06,
"loss": 0.0013269748538732528,
"memory(GiB)": 158.4,
"step": 325,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.045365
},
{
"epoch": 0.12094557449147883,
"grad_norm": 0.23186658322811127,
"learning_rate": 9.959935885253715e-06,
"loss": 0.0010508694685995579,
"memory(GiB)": 158.4,
"step": 330,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.045369
},
{
"epoch": 0.12277808319589519,
"grad_norm": 0.06666416674852371,
"learning_rate": 9.958714309900546e-06,
"loss": 0.0009142296388745308,
"memory(GiB)": 158.4,
"step": 335,
"token_acc": 0.9995789119083712,
"train_speed(iter/s)": 0.045376
},
{
"epoch": 0.12461059190031153,
"grad_norm": 0.014640443958342075,
"learning_rate": 9.957474467116739e-06,
"loss": 0.0024377334862947463,
"memory(GiB)": 158.4,
"step": 340,
"token_acc": 0.9992424880060601,
"train_speed(iter/s)": 0.045382
},
{
"epoch": 0.12644310060472788,
"grad_norm": 0.15044739842414856,
"learning_rate": 9.956216361469755e-06,
"loss": 0.002022208273410797,
"memory(GiB)": 158.4,
"step": 345,
"token_acc": 0.9994952893674294,
"train_speed(iter/s)": 0.045388
},
{
"epoch": 0.12827560930914422,
"grad_norm": 0.012829025276005268,
"learning_rate": 9.954939997594335e-06,
"loss": 0.003057861886918545,
"memory(GiB)": 158.4,
"step": 350,
"token_acc": 0.9992422966829433,
"train_speed(iter/s)": 0.045394
},
{
"epoch": 0.13010811801356056,
"grad_norm": 0.02966240420937538,
"learning_rate": 9.953645380192485e-06,
"loss": 0.0017476610839366913,
"memory(GiB)": 158.4,
"step": 355,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.045399
},
{
"epoch": 0.1319406267179769,
"grad_norm": 0.0715402215719223,
"learning_rate": 9.952332514033449e-06,
"loss": 0.0023743031546473504,
"memory(GiB)": 158.4,
"step": 360,
"token_acc": 0.9991585324806462,
"train_speed(iter/s)": 0.045407
},
{
"epoch": 0.13377313542239325,
"grad_norm": 0.07701452821493149,
"learning_rate": 9.9510014039537e-06,
"loss": 0.0022863084450364113,
"memory(GiB)": 158.4,
"step": 365,
"token_acc": 0.9994110222970131,
"train_speed(iter/s)": 0.04541
},
{
"epoch": 0.13560564412680962,
"grad_norm": 0.09453430771827698,
"learning_rate": 9.949652054856924e-06,
"loss": 0.0019000820815563203,
"memory(GiB)": 158.4,
"step": 370,
"token_acc": 0.9993265426382693,
"train_speed(iter/s)": 0.045415
},
{
"epoch": 0.13743815283122596,
"grad_norm": 0.0394257977604866,
"learning_rate": 9.948284471713994e-06,
"loss": 0.0016634922474622726,
"memory(GiB)": 158.4,
"step": 375,
"token_acc": 0.9994104766717197,
"train_speed(iter/s)": 0.045419
},
{
"epoch": 0.1392706615356423,
"grad_norm": 0.04517311230301857,
"learning_rate": 9.94689865956295e-06,
"loss": 0.0017285166308283807,
"memory(GiB)": 158.4,
"step": 380,
"token_acc": 0.9994948644552955,
"train_speed(iter/s)": 0.045425
},
{
"epoch": 0.14110317024005864,
"grad_norm": 0.07294133305549622,
"learning_rate": 9.945494623509003e-06,
"loss": 0.000422241585329175,
"memory(GiB)": 158.4,
"step": 385,
"token_acc": 0.9999158390843292,
"train_speed(iter/s)": 0.045427
},
{
"epoch": 0.14293567894447498,
"grad_norm": 0.06523015350103378,
"learning_rate": 9.944072368724476e-06,
"loss": 0.0024235062301158905,
"memory(GiB)": 158.4,
"step": 390,
"token_acc": 0.9994953318193288,
"train_speed(iter/s)": 0.045433
},
{
"epoch": 0.14476818764889132,
"grad_norm": 0.0444883331656456,
"learning_rate": 9.942631900448827e-06,
"loss": 0.0009868125431239604,
"memory(GiB)": 158.4,
"step": 395,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.045437
},
{
"epoch": 0.1466006963533077,
"grad_norm": 0.01692277006804943,
"learning_rate": 9.941173223988603e-06,
"loss": 0.0023114632815122603,
"memory(GiB)": 158.4,
"step": 400,
"token_acc": 0.9993263725159987,
"train_speed(iter/s)": 0.045442
},
{
"epoch": 0.14843320505772403,
"grad_norm": 0.02756733074784279,
"learning_rate": 9.939696344717427e-06,
"loss": 0.0015292948111891747,
"memory(GiB)": 158.4,
"step": 405,
"token_acc": 0.9994107744107744,
"train_speed(iter/s)": 0.045444
},
{
"epoch": 0.15026571376214037,
"grad_norm": 0.09074392169713974,
"learning_rate": 9.938201268075982e-06,
"loss": 0.0020554307848215103,
"memory(GiB)": 158.4,
"step": 410,
"token_acc": 0.9992423604680528,
"train_speed(iter/s)": 0.045448
},
{
"epoch": 0.15209822246655672,
"grad_norm": 0.07123276591300964,
"learning_rate": 9.936687999571987e-06,
"loss": 0.0014599796384572982,
"memory(GiB)": 158.4,
"step": 415,
"token_acc": 0.9994952044422009,
"train_speed(iter/s)": 0.045449
},
{
"epoch": 0.15393073117097306,
"grad_norm": 0.07088897377252579,
"learning_rate": 9.935156544780183e-06,
"loss": 0.0010397397913038731,
"memory(GiB)": 158.4,
"step": 420,
"token_acc": 0.9996633846671716,
"train_speed(iter/s)": 0.045448
},
{
"epoch": 0.1557632398753894,
"grad_norm": 0.1305522322654724,
"learning_rate": 9.9336069093423e-06,
"loss": 0.0015219044871628284,
"memory(GiB)": 158.4,
"step": 425,
"token_acc": 0.9994950770007573,
"train_speed(iter/s)": 0.045451
},
{
"epoch": 0.15759574857980577,
"grad_norm": 0.03542817756533623,
"learning_rate": 9.932039098967046e-06,
"loss": 0.002127250283956528,
"memory(GiB)": 158.4,
"step": 430,
"token_acc": 0.9994949494949495,
"train_speed(iter/s)": 0.045456
},
{
"epoch": 0.1594282572842221,
"grad_norm": 0.14930537343025208,
"learning_rate": 9.930453119430086e-06,
"loss": 0.000645923474803567,
"memory(GiB)": 158.4,
"step": 435,
"token_acc": 0.9997474960020201,
"train_speed(iter/s)": 0.045458
},
{
"epoch": 0.16126076598863845,
"grad_norm": 0.10225468873977661,
"learning_rate": 9.92884897657402e-06,
"loss": 0.000911066122353077,
"memory(GiB)": 158.4,
"step": 440,
"token_acc": 0.9997473471450228,
"train_speed(iter/s)": 0.045415
},
{
"epoch": 0.1630932746930548,
"grad_norm": 0.05018873140215874,
"learning_rate": 9.927226676308354e-06,
"loss": 0.00166127011179924,
"memory(GiB)": 158.4,
"step": 445,
"token_acc": 0.9997476871320438,
"train_speed(iter/s)": 0.045381
},
{
"epoch": 0.16492578339747113,
"grad_norm": 0.17071396112442017,
"learning_rate": 9.925586224609489e-06,
"loss": 0.0025668978691101075,
"memory(GiB)": 158.4,
"step": 450,
"token_acc": 0.9994110718492344,
"train_speed(iter/s)": 0.045381
},
{
"epoch": 0.16675829210188747,
"grad_norm": 0.008416908793151379,
"learning_rate": 9.923927627520694e-06,
"loss": 0.000798144843429327,
"memory(GiB)": 158.4,
"step": 455,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.045382
},
{
"epoch": 0.16859080080630384,
"grad_norm": 0.1326538473367691,
"learning_rate": 9.922250891152078e-06,
"loss": 0.0013757062144577504,
"memory(GiB)": 158.4,
"step": 460,
"token_acc": 0.9994102780117945,
"train_speed(iter/s)": 0.045388
},
{
"epoch": 0.17042330951072018,
"grad_norm": 0.10151144862174988,
"learning_rate": 9.92055602168058e-06,
"loss": 0.0008957336656749248,
"memory(GiB)": 158.4,
"step": 465,
"token_acc": 0.9996634696281339,
"train_speed(iter/s)": 0.045392
},
{
"epoch": 0.17225581821513652,
"grad_norm": 0.09111111611127853,
"learning_rate": 9.918843025349941e-06,
"loss": 0.0013033418916165828,
"memory(GiB)": 158.4,
"step": 470,
"token_acc": 0.9995792308339645,
"train_speed(iter/s)": 0.045396
},
{
"epoch": 0.17408832691955287,
"grad_norm": 0.029473107308149338,
"learning_rate": 9.917111908470673e-06,
"loss": 0.0013312675058841706,
"memory(GiB)": 158.4,
"step": 475,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.0454
},
{
"epoch": 0.1759208356239692,
"grad_norm": 0.1001836434006691,
"learning_rate": 9.915362677420045e-06,
"loss": 0.0019773678854107858,
"memory(GiB)": 158.4,
"step": 480,
"token_acc": 0.9991580365412142,
"train_speed(iter/s)": 0.045406
},
{
"epoch": 0.17775334432838555,
"grad_norm": 0.047665633261203766,
"learning_rate": 9.913595338642059e-06,
"loss": 0.0014092091470956803,
"memory(GiB)": 158.4,
"step": 485,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.045408
},
{
"epoch": 0.17958585303280192,
"grad_norm": 0.02579871006309986,
"learning_rate": 9.91180989864742e-06,
"loss": 0.0007158961612731219,
"memory(GiB)": 158.4,
"step": 490,
"token_acc": 0.9996631862579993,
"train_speed(iter/s)": 0.045412
},
{
"epoch": 0.18141836173721826,
"grad_norm": 0.028310472145676613,
"learning_rate": 9.910006364013522e-06,
"loss": 0.0007194250822067261,
"memory(GiB)": 158.4,
"step": 495,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.045414
},
{
"epoch": 0.1832508704416346,
"grad_norm": 0.12511947751045227,
"learning_rate": 9.908184741384412e-06,
"loss": 0.0015858769416809081,
"memory(GiB)": 158.4,
"step": 500,
"token_acc": 0.999663242970197,
"train_speed(iter/s)": 0.045418
},
{
"epoch": 0.1832508704416346,
"eval_loss": 0.0010450058616697788,
"eval_runtime": 172.5985,
"eval_samples_per_second": 2.549,
"eval_steps_per_second": 2.549,
"eval_token_acc": 0.9996786189798448,
"step": 500
},
{
"epoch": 0.18508337914605094,
"grad_norm": 0.007651148363947868,
"learning_rate": 9.906345037470776e-06,
"loss": 0.0017563182860612868,
"memory(GiB)": 160.86,
"step": 505,
"token_acc": 0.9996503360658923,
"train_speed(iter/s)": 0.0421
},
{
"epoch": 0.18691588785046728,
"grad_norm": 0.06049024686217308,
"learning_rate": 9.904487259049907e-06,
"loss": 0.0015754606574773788,
"memory(GiB)": 160.86,
"step": 510,
"token_acc": 0.9991582491582491,
"train_speed(iter/s)": 0.0421
},
{
"epoch": 0.18874839655488362,
"grad_norm": 0.06416209042072296,
"learning_rate": 9.902611412965681e-06,
"loss": 0.0016123156994581223,
"memory(GiB)": 160.86,
"step": 515,
"token_acc": 0.9994950770007573,
"train_speed(iter/s)": 0.042128
},
{
"epoch": 0.1905809052593,
"grad_norm": 0.028774991631507874,
"learning_rate": 9.90071750612854e-06,
"loss": 0.001327525917440653,
"memory(GiB)": 160.86,
"step": 520,
"token_acc": 0.999326825984517,
"train_speed(iter/s)": 0.042161
},
{
"epoch": 0.19241341396371633,
"grad_norm": 0.01806553080677986,
"learning_rate": 9.898805545515455e-06,
"loss": 0.0018014278262853622,
"memory(GiB)": 160.86,
"step": 525,
"token_acc": 0.999494779386999,
"train_speed(iter/s)": 0.042191
},
{
"epoch": 0.19424592266813268,
"grad_norm": 0.022810854017734528,
"learning_rate": 9.896875538169906e-06,
"loss": 0.0012151801958680153,
"memory(GiB)": 160.86,
"step": 530,
"token_acc": 0.9996629876147949,
"train_speed(iter/s)": 0.042224
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.11561686545610428,
"learning_rate": 9.894927491201856e-06,
"loss": 0.0021266091614961626,
"memory(GiB)": 160.86,
"step": 535,
"token_acc": 0.9994109727364524,
"train_speed(iter/s)": 0.042231
},
{
"epoch": 0.19791094007696536,
"grad_norm": 0.06175706535577774,
"learning_rate": 9.892961411787725e-06,
"loss": 0.0011159414425492287,
"memory(GiB)": 160.86,
"step": 540,
"token_acc": 0.9996632146164857,
"train_speed(iter/s)": 0.042227
},
{
"epoch": 0.1997434487813817,
"grad_norm": 0.05753181502223015,
"learning_rate": 9.890977307170362e-06,
"loss": 0.001347663253545761,
"memory(GiB)": 160.86,
"step": 545,
"token_acc": 0.9994108240047134,
"train_speed(iter/s)": 0.042253
},
{
"epoch": 0.20157595748579807,
"grad_norm": 0.02328096143901348,
"learning_rate": 9.888975184659018e-06,
"loss": 0.0003634607419371605,
"memory(GiB)": 160.86,
"step": 550,
"token_acc": 1.0,
"train_speed(iter/s)": 0.042275
},
{
"epoch": 0.2034084661902144,
"grad_norm": 0.06188211217522621,
"learning_rate": 9.886955051629322e-06,
"loss": 0.001550444681197405,
"memory(GiB)": 160.86,
"step": 555,
"token_acc": 0.9994948219247285,
"train_speed(iter/s)": 0.042291
},
{
"epoch": 0.20524097489463075,
"grad_norm": 0.1453787237405777,
"learning_rate": 9.88491691552325e-06,
"loss": 0.001519276574254036,
"memory(GiB)": 160.86,
"step": 560,
"token_acc": 0.9992421052631579,
"train_speed(iter/s)": 0.042293
},
{
"epoch": 0.2070734835990471,
"grad_norm": 0.023789288476109505,
"learning_rate": 9.882860783849106e-06,
"loss": 0.00029240711592137814,
"memory(GiB)": 160.86,
"step": 565,
"token_acc": 0.9999158107425492,
"train_speed(iter/s)": 0.042306
},
{
"epoch": 0.20890599230346343,
"grad_norm": 0.01045987755060196,
"learning_rate": 9.880786664181477e-06,
"loss": 0.0012256539426743983,
"memory(GiB)": 160.86,
"step": 570,
"token_acc": 0.999579018270607,
"train_speed(iter/s)": 0.042302
},
{
"epoch": 0.21073850100787977,
"grad_norm": 0.011777155101299286,
"learning_rate": 9.878694564161227e-06,
"loss": 0.00046466137282550333,
"memory(GiB)": 160.86,
"step": 575,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.042318
},
{
"epoch": 0.21257100971229614,
"grad_norm": 0.23171444237232208,
"learning_rate": 9.876584491495448e-06,
"loss": 0.0011185991577804088,
"memory(GiB)": 160.86,
"step": 580,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.042309
},
{
"epoch": 0.21440351841671249,
"grad_norm": 0.049548666924238205,
"learning_rate": 9.87445645395745e-06,
"loss": 0.0009535157121717929,
"memory(GiB)": 160.86,
"step": 585,
"token_acc": 0.9995791954216462,
"train_speed(iter/s)": 0.042334
},
{
"epoch": 0.21623602712112883,
"grad_norm": 0.022135065868496895,
"learning_rate": 9.87231045938672e-06,
"loss": 0.0012145033106207848,
"memory(GiB)": 160.86,
"step": 590,
"token_acc": 0.999663129526697,
"train_speed(iter/s)": 0.042348
},
{
"epoch": 0.21806853582554517,
"grad_norm": 0.20922328531742096,
"learning_rate": 9.870146515688896e-06,
"loss": 0.0015425698831677437,
"memory(GiB)": 160.86,
"step": 595,
"token_acc": 0.999578947368421,
"train_speed(iter/s)": 0.04231
},
{
"epoch": 0.2199010445299615,
"grad_norm": 0.027032975107431412,
"learning_rate": 9.867964630835742e-06,
"loss": 0.00022940777707844973,
"memory(GiB)": 160.86,
"step": 600,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.042103
},
{
"epoch": 0.22173355323437785,
"grad_norm": 0.016112059354782104,
"learning_rate": 9.865764812865113e-06,
"loss": 0.0013837903738021851,
"memory(GiB)": 160.86,
"step": 605,
"token_acc": 0.9996631862579993,
"train_speed(iter/s)": 0.042119
},
{
"epoch": 0.22356606193879422,
"grad_norm": 0.03569135442376137,
"learning_rate": 9.863547069880928e-06,
"loss": 0.002841825969517231,
"memory(GiB)": 160.86,
"step": 610,
"token_acc": 0.9993265993265993,
"train_speed(iter/s)": 0.042143
},
{
"epoch": 0.22539857064321056,
"grad_norm": 0.04555279016494751,
"learning_rate": 9.86131141005314e-06,
"loss": 0.012712681293487548,
"memory(GiB)": 160.86,
"step": 615,
"token_acc": 0.9986531986531987,
"train_speed(iter/s)": 0.042163
},
{
"epoch": 0.2272310793476269,
"grad_norm": 0.09330299496650696,
"learning_rate": 9.859057841617709e-06,
"loss": 0.007313913106918335,
"memory(GiB)": 160.86,
"step": 620,
"token_acc": 0.9966310115387855,
"train_speed(iter/s)": 0.042185
},
{
"epoch": 0.22906358805204324,
"grad_norm": 0.04176206886768341,
"learning_rate": 9.856786372876565e-06,
"loss": 0.0030346425250172616,
"memory(GiB)": 160.86,
"step": 625,
"token_acc": 0.9989054475035783,
"train_speed(iter/s)": 0.042207
},
{
"epoch": 0.23089609675645958,
"grad_norm": 0.0391584113240242,
"learning_rate": 9.854497012197581e-06,
"loss": 0.0021283647045493128,
"memory(GiB)": 160.86,
"step": 630,
"token_acc": 0.999494779386999,
"train_speed(iter/s)": 0.042231
},
{
"epoch": 0.23272860546087593,
"grad_norm": 0.06570518761873245,
"learning_rate": 9.852189768014547e-06,
"loss": 0.0012692485004663467,
"memory(GiB)": 160.86,
"step": 635,
"token_acc": 0.9994950770007573,
"train_speed(iter/s)": 0.042249
},
{
"epoch": 0.2345611141652923,
"grad_norm": 0.04750160127878189,
"learning_rate": 9.849864648827126e-06,
"loss": 0.001050265971571207,
"memory(GiB)": 160.86,
"step": 640,
"token_acc": 0.9996630727762803,
"train_speed(iter/s)": 0.042274
},
{
"epoch": 0.23639362286970864,
"grad_norm": 0.012142885476350784,
"learning_rate": 9.847521663200837e-06,
"loss": 0.00046721328981220723,
"memory(GiB)": 160.86,
"step": 645,
"token_acc": 0.9998315363881402,
"train_speed(iter/s)": 0.042296
},
{
"epoch": 0.23822613157412498,
"grad_norm": 0.0755368173122406,
"learning_rate": 9.845160819767017e-06,
"loss": 0.0013550316914916038,
"memory(GiB)": 160.86,
"step": 650,
"token_acc": 0.9995790891489182,
"train_speed(iter/s)": 0.042321
},
{
"epoch": 0.24005864027854132,
"grad_norm": 0.07237580418586731,
"learning_rate": 9.842782127222786e-06,
"loss": 0.002187203988432884,
"memory(GiB)": 160.86,
"step": 655,
"token_acc": 0.9994101786316144,
"train_speed(iter/s)": 0.042344
},
{
"epoch": 0.24189114898295766,
"grad_norm": 0.043931830674409866,
"learning_rate": 9.840385594331022e-06,
"loss": 0.0009523511864244938,
"memory(GiB)": 160.86,
"step": 660,
"token_acc": 0.9997474534893509,
"train_speed(iter/s)": 0.042366
},
{
"epoch": 0.243723657687374,
"grad_norm": 0.008748149499297142,
"learning_rate": 9.837971229920324e-06,
"loss": 0.0016139259561896325,
"memory(GiB)": 160.86,
"step": 665,
"token_acc": 0.9994108240047134,
"train_speed(iter/s)": 0.042378
},
{
"epoch": 0.24555616639179037,
"grad_norm": 0.12863993644714355,
"learning_rate": 9.83553904288498e-06,
"loss": 0.001357206143438816,
"memory(GiB)": 160.86,
"step": 670,
"token_acc": 0.9993265993265993,
"train_speed(iter/s)": 0.042358
},
{
"epoch": 0.2473886750962067,
"grad_norm": 0.08388248831033707,
"learning_rate": 9.833089042184933e-06,
"loss": 0.0016548488289117812,
"memory(GiB)": 160.86,
"step": 675,
"token_acc": 0.9994950345059754,
"train_speed(iter/s)": 0.042379
},
{
"epoch": 0.24922118380062305,
"grad_norm": 0.09960606694221497,
"learning_rate": 9.830621236845755e-06,
"loss": 0.0014729213900864125,
"memory(GiB)": 160.86,
"step": 680,
"token_acc": 0.9994103773584906,
"train_speed(iter/s)": 0.042403
},
{
"epoch": 0.2510536925050394,
"grad_norm": 0.07054334878921509,
"learning_rate": 9.828135635958602e-06,
"loss": 0.0012276002205908298,
"memory(GiB)": 160.86,
"step": 685,
"token_acc": 0.99949499200404,
"train_speed(iter/s)": 0.042425
},
{
"epoch": 0.25288620120945576,
"grad_norm": 0.011227499693632126,
"learning_rate": 9.825632248680195e-06,
"loss": 0.0014451307244598866,
"memory(GiB)": 160.86,
"step": 690,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.042448
},
{
"epoch": 0.2547187099138721,
"grad_norm": 0.09235574305057526,
"learning_rate": 9.82311108423277e-06,
"loss": 0.001263285707682371,
"memory(GiB)": 160.86,
"step": 695,
"token_acc": 0.9995789119083712,
"train_speed(iter/s)": 0.04247
},
{
"epoch": 0.25655121861828845,
"grad_norm": 0.045791253447532654,
"learning_rate": 9.82057215190406e-06,
"loss": 0.0009290166199207306,
"memory(GiB)": 160.86,
"step": 700,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.042485
},
{
"epoch": 0.25838372732270476,
"grad_norm": 0.07074666768312454,
"learning_rate": 9.818015461047246e-06,
"loss": 0.0015341023914515971,
"memory(GiB)": 160.86,
"step": 705,
"token_acc": 0.99949499200404,
"train_speed(iter/s)": 0.042504
},
{
"epoch": 0.26021623602712113,
"grad_norm": 0.1540241241455078,
"learning_rate": 9.815441021080935e-06,
"loss": 0.0007845636457204819,
"memory(GiB)": 160.86,
"step": 710,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.042523
},
{
"epoch": 0.2620487447315375,
"grad_norm": 0.033406198024749756,
"learning_rate": 9.812848841489118e-06,
"loss": 0.0012617891654372216,
"memory(GiB)": 160.86,
"step": 715,
"token_acc": 0.9994950345059754,
"train_speed(iter/s)": 0.042542
},
{
"epoch": 0.2638812534359538,
"grad_norm": 0.09797952324151993,
"learning_rate": 9.810238931821139e-06,
"loss": 0.0005904140882194043,
"memory(GiB)": 160.86,
"step": 720,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.042558
},
{
"epoch": 0.2657137621403702,
"grad_norm": 0.004131863825023174,
"learning_rate": 9.807611301691656e-06,
"loss": 0.0003168722614645958,
"memory(GiB)": 160.86,
"step": 725,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.042576
},
{
"epoch": 0.2675462708447865,
"grad_norm": 0.0872625857591629,
"learning_rate": 9.804965960780603e-06,
"loss": 0.0018875803798437119,
"memory(GiB)": 160.86,
"step": 730,
"token_acc": 0.9993262023077571,
"train_speed(iter/s)": 0.042595
},
{
"epoch": 0.26937877954920286,
"grad_norm": 0.03825852647423744,
"learning_rate": 9.80230291883317e-06,
"loss": 0.0008161487989127636,
"memory(GiB)": 160.86,
"step": 735,
"token_acc": 0.9996634129922585,
"train_speed(iter/s)": 0.042614
},
{
"epoch": 0.27121128825361923,
"grad_norm": 0.09421674907207489,
"learning_rate": 9.799622185659748e-06,
"loss": 0.0013505241833627224,
"memory(GiB)": 160.86,
"step": 740,
"token_acc": 0.9995793016407236,
"train_speed(iter/s)": 0.0426
},
{
"epoch": 0.27304379695803555,
"grad_norm": 0.008914557285606861,
"learning_rate": 9.7969237711359e-06,
"loss": 0.0008496672846376896,
"memory(GiB)": 160.86,
"step": 745,
"token_acc": 0.9995792662403231,
"train_speed(iter/s)": 0.042614
},
{
"epoch": 0.2748763056624519,
"grad_norm": 0.05403187870979309,
"learning_rate": 9.79420768520233e-06,
"loss": 0.00033216315787285564,
"memory(GiB)": 160.86,
"step": 750,
"token_acc": 0.9998315221969506,
"train_speed(iter/s)": 0.042627
},
{
"epoch": 0.2767088143668682,
"grad_norm": 0.07824942469596863,
"learning_rate": 9.791473937864838e-06,
"loss": 0.0009146830998361111,
"memory(GiB)": 160.86,
"step": 755,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.042645
},
{
"epoch": 0.2785413230712846,
"grad_norm": 0.059788450598716736,
"learning_rate": 9.788722539194291e-06,
"loss": 0.0014368345960974692,
"memory(GiB)": 160.86,
"step": 760,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.042663
},
{
"epoch": 0.2803738317757009,
"grad_norm": 0.03711073473095894,
"learning_rate": 9.785953499326575e-06,
"loss": 0.0013325980864465237,
"memory(GiB)": 160.86,
"step": 765,
"token_acc": 0.9994953742640875,
"train_speed(iter/s)": 0.042681
},
{
"epoch": 0.2822063404801173,
"grad_norm": 0.024719931185245514,
"learning_rate": 9.783166828462573e-06,
"loss": 0.002364422380924225,
"memory(GiB)": 160.86,
"step": 770,
"token_acc": 0.9992422328870927,
"train_speed(iter/s)": 0.0427
},
{
"epoch": 0.28403884918453365,
"grad_norm": 0.03786981478333473,
"learning_rate": 9.780362536868113e-06,
"loss": 0.0009791357442736626,
"memory(GiB)": 160.86,
"step": 775,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.042719
},
{
"epoch": 0.28587135788894996,
"grad_norm": 0.1868947595357895,
"learning_rate": 9.777540634873939e-06,
"loss": 0.0009650942869484424,
"memory(GiB)": 160.86,
"step": 780,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.042737
},
{
"epoch": 0.28770386659336633,
"grad_norm": 0.015713131055235863,
"learning_rate": 9.774701132875665e-06,
"loss": 0.0007482931017875671,
"memory(GiB)": 160.86,
"step": 785,
"token_acc": 0.9997473045822103,
"train_speed(iter/s)": 0.042755
},
{
"epoch": 0.28953637529778264,
"grad_norm": 0.0045456611551344395,
"learning_rate": 9.771844041333751e-06,
"loss": 0.0009433764033019542,
"memory(GiB)": 160.86,
"step": 790,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.042771
},
{
"epoch": 0.291368884002199,
"grad_norm": 0.01577194780111313,
"learning_rate": 9.768969370773446e-06,
"loss": 0.0004402685910463333,
"memory(GiB)": 160.86,
"step": 795,
"token_acc": 0.9999158390843292,
"train_speed(iter/s)": 0.042788
},
{
"epoch": 0.2932013927066154,
"grad_norm": 0.022222327068448067,
"learning_rate": 9.766077131784764e-06,
"loss": 0.0012076054699718952,
"memory(GiB)": 160.86,
"step": 800,
"token_acc": 0.999663129526697,
"train_speed(iter/s)": 0.042807
},
{
"epoch": 0.2950339014110317,
"grad_norm": 0.1063130721449852,
"learning_rate": 9.763167335022437e-06,
"loss": 0.0008463741280138493,
"memory(GiB)": 160.86,
"step": 805,
"token_acc": 0.9997475597441938,
"train_speed(iter/s)": 0.042824
},
{
"epoch": 0.29686641011544807,
"grad_norm": 0.018112968653440475,
"learning_rate": 9.760239991205878e-06,
"loss": 0.0014921230264008044,
"memory(GiB)": 160.86,
"step": 810,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.042841
},
{
"epoch": 0.2986989188198644,
"grad_norm": 0.17134827375411987,
"learning_rate": 9.757295111119142e-06,
"loss": 0.0017302492633461952,
"memory(GiB)": 160.86,
"step": 815,
"token_acc": 0.9994105263157895,
"train_speed(iter/s)": 0.042859
},
{
"epoch": 0.30053142752428075,
"grad_norm": 0.1881178468465805,
"learning_rate": 9.75433270561089e-06,
"loss": 0.0018818458542227746,
"memory(GiB)": 160.86,
"step": 820,
"token_acc": 0.9994947368421052,
"train_speed(iter/s)": 0.042876
},
{
"epoch": 0.30236393622869706,
"grad_norm": 0.0701608955860138,
"learning_rate": 9.751352785594337e-06,
"loss": 0.0015649979934096337,
"memory(GiB)": 160.86,
"step": 825,
"token_acc": 0.9994106255788499,
"train_speed(iter/s)": 0.042892
},
{
"epoch": 0.30419644493311343,
"grad_norm": 0.11719143390655518,
"learning_rate": 9.748355362047228e-06,
"loss": 0.0022079024463891985,
"memory(GiB)": 160.86,
"step": 830,
"token_acc": 0.9993266560053867,
"train_speed(iter/s)": 0.042901
},
{
"epoch": 0.3060289536375298,
"grad_norm": 0.052010610699653625,
"learning_rate": 9.745340446011782e-06,
"loss": 0.0014782694168388843,
"memory(GiB)": 160.86,
"step": 835,
"token_acc": 0.9994952044422009,
"train_speed(iter/s)": 0.042913
},
{
"epoch": 0.3078614623419461,
"grad_norm": 0.04955873638391495,
"learning_rate": 9.742308048594665e-06,
"loss": 0.0016095375642180443,
"memory(GiB)": 160.86,
"step": 840,
"token_acc": 0.9994949069787019,
"train_speed(iter/s)": 0.04293
},
{
"epoch": 0.3096939710463625,
"grad_norm": 0.03515881672501564,
"learning_rate": 9.73925818096694e-06,
"loss": 0.0010076938197016716,
"memory(GiB)": 160.86,
"step": 845,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.042945
},
{
"epoch": 0.3115264797507788,
"grad_norm": 0.05620809271931648,
"learning_rate": 9.736190854364025e-06,
"loss": 0.0021063588559627534,
"memory(GiB)": 160.86,
"step": 850,
"token_acc": 0.999326485940394,
"train_speed(iter/s)": 0.042961
},
{
"epoch": 0.31335898845519516,
"grad_norm": 0.03683305159211159,
"learning_rate": 9.733106080085662e-06,
"loss": 0.0005148151423782111,
"memory(GiB)": 160.86,
"step": 855,
"token_acc": 0.9997475385003787,
"train_speed(iter/s)": 0.042977
},
{
"epoch": 0.31519149715961153,
"grad_norm": 0.029852213338017464,
"learning_rate": 9.730003869495863e-06,
"loss": 0.0004310948308557272,
"memory(GiB)": 160.86,
"step": 860,
"token_acc": 0.9998316073082428,
"train_speed(iter/s)": 0.042993
},
{
"epoch": 0.31702400586402785,
"grad_norm": 0.0037861524615436792,
"learning_rate": 9.726884234022877e-06,
"loss": 0.0005989938508719206,
"memory(GiB)": 160.86,
"step": 865,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.042978
},
{
"epoch": 0.3188565145684442,
"grad_norm": 0.04197857156395912,
"learning_rate": 9.723747185159146e-06,
"loss": 0.0018272759392857552,
"memory(GiB)": 160.86,
"step": 870,
"token_acc": 0.9996634413125789,
"train_speed(iter/s)": 0.042973
},
{
"epoch": 0.32068902327286053,
"grad_norm": 0.04336322471499443,
"learning_rate": 9.720592734461257e-06,
"loss": 0.0018274670466780663,
"memory(GiB)": 160.86,
"step": 875,
"token_acc": 0.999578947368421,
"train_speed(iter/s)": 0.042986
},
{
"epoch": 0.3225215319772769,
"grad_norm": 0.007882770150899887,
"learning_rate": 9.717420893549902e-06,
"loss": 0.0010360433720052243,
"memory(GiB)": 160.86,
"step": 880,
"token_acc": 0.9994951619688683,
"train_speed(iter/s)": 0.042996
},
{
"epoch": 0.3243540406816932,
"grad_norm": 0.03858296945691109,
"learning_rate": 9.714231674109845e-06,
"loss": 0.0016417885199189186,
"memory(GiB)": 160.86,
"step": 885,
"token_acc": 0.9991580365412142,
"train_speed(iter/s)": 0.042993
},
{
"epoch": 0.3261865493861096,
"grad_norm": 0.016526591032743454,
"learning_rate": 9.711025087889866e-06,
"loss": 0.0008385243825614452,
"memory(GiB)": 160.86,
"step": 890,
"token_acc": 0.9999158461667929,
"train_speed(iter/s)": 0.042974
},
{
"epoch": 0.32801905809052595,
"grad_norm": 0.011745758354663849,
"learning_rate": 9.70780114670272e-06,
"loss": 0.0007513574324548245,
"memory(GiB)": 160.86,
"step": 895,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.042982
},
{
"epoch": 0.32985156679494226,
"grad_norm": 0.032515864819288254,
"learning_rate": 9.704559862425101e-06,
"loss": 0.000879857875406742,
"memory(GiB)": 160.86,
"step": 900,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.042993
},
{
"epoch": 0.33168407549935863,
"grad_norm": 0.11071360856294632,
"learning_rate": 9.701301246997592e-06,
"loss": 0.0013037783093750477,
"memory(GiB)": 160.86,
"step": 905,
"token_acc": 0.9994106751978448,
"train_speed(iter/s)": 0.043
},
{
"epoch": 0.33351658420377495,
"grad_norm": 0.03765702247619629,
"learning_rate": 9.698025312424619e-06,
"loss": 0.0015159587375819684,
"memory(GiB)": 160.86,
"step": 910,
"token_acc": 0.999579443182774,
"train_speed(iter/s)": 0.043008
},
{
"epoch": 0.3353490929081913,
"grad_norm": 0.008713570423424244,
"learning_rate": 9.694732070774415e-06,
"loss": 0.00026825035456568,
"memory(GiB)": 160.86,
"step": 915,
"token_acc": 1.0,
"train_speed(iter/s)": 0.04298
},
{
"epoch": 0.3371816016126077,
"grad_norm": 0.07823354005813599,
"learning_rate": 9.691421534178966e-06,
"loss": 0.001245938241481781,
"memory(GiB)": 160.86,
"step": 920,
"token_acc": 0.9994108240047134,
"train_speed(iter/s)": 0.042955
},
{
"epoch": 0.339014110317024,
"grad_norm": 0.04400285705924034,
"learning_rate": 9.688093714833975e-06,
"loss": 0.000505279190838337,
"memory(GiB)": 160.86,
"step": 925,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.042963
},
{
"epoch": 0.34084661902144037,
"grad_norm": 0.05997716262936592,
"learning_rate": 9.68474862499881e-06,
"loss": 0.001019585132598877,
"memory(GiB)": 160.86,
"step": 930,
"token_acc": 0.9996631578947368,
"train_speed(iter/s)": 0.042968
},
{
"epoch": 0.3426791277258567,
"grad_norm": 0.17811425030231476,
"learning_rate": 9.681386276996462e-06,
"loss": 0.0005352488718926906,
"memory(GiB)": 160.86,
"step": 935,
"token_acc": 0.999831734814067,
"train_speed(iter/s)": 0.042975
},
{
"epoch": 0.34451163643027305,
"grad_norm": 0.2344316691160202,
"learning_rate": 9.678006683213503e-06,
"loss": 0.0009379078634083271,
"memory(GiB)": 160.86,
"step": 940,
"token_acc": 0.9997475385003787,
"train_speed(iter/s)": 0.04297
},
{
"epoch": 0.34634414513468936,
"grad_norm": 0.06496769934892654,
"learning_rate": 9.674609856100032e-06,
"loss": 0.0008637402206659317,
"memory(GiB)": 160.86,
"step": 945,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.04298
},
{
"epoch": 0.34817665383910573,
"grad_norm": 0.0862952470779419,
"learning_rate": 9.671195808169639e-06,
"loss": 0.0011458213441073895,
"memory(GiB)": 160.86,
"step": 950,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.042994
},
{
"epoch": 0.3500091625435221,
"grad_norm": 0.016611328348517418,
"learning_rate": 9.667764551999346e-06,
"loss": 0.0010181719437241555,
"memory(GiB)": 160.86,
"step": 955,
"token_acc": 0.999663242970197,
"train_speed(iter/s)": 0.04297
},
{
"epoch": 0.3518416712479384,
"grad_norm": 0.08347468078136444,
"learning_rate": 9.664316100229578e-06,
"loss": 0.0007937697693705559,
"memory(GiB)": 160.86,
"step": 960,
"token_acc": 0.9995793724236561,
"train_speed(iter/s)": 0.042953
},
{
"epoch": 0.3536741799523548,
"grad_norm": 0.07462402433156967,
"learning_rate": 9.660850465564101e-06,
"loss": 0.0014566186815500259,
"memory(GiB)": 160.86,
"step": 965,
"token_acc": 0.9995790537127462,
"train_speed(iter/s)": 0.042967
},
{
"epoch": 0.3555066886567711,
"grad_norm": 0.031168634071946144,
"learning_rate": 9.657367660769984e-06,
"loss": 0.0008765817619860172,
"memory(GiB)": 160.86,
"step": 970,
"token_acc": 0.9996635828427249,
"train_speed(iter/s)": 0.04298
},
{
"epoch": 0.35733919736118747,
"grad_norm": 0.10647280514240265,
"learning_rate": 9.653867698677543e-06,
"loss": 0.0011190660297870636,
"memory(GiB)": 160.86,
"step": 975,
"token_acc": 0.9996634979389248,
"train_speed(iter/s)": 0.042993
},
{
"epoch": 0.35917170606560384,
"grad_norm": 0.041436877101659775,
"learning_rate": 9.650350592180312e-06,
"loss": 0.0012339851818978786,
"memory(GiB)": 160.86,
"step": 980,
"token_acc": 0.9994108735903047,
"train_speed(iter/s)": 0.043006
},
{
"epoch": 0.36100421477002015,
"grad_norm": 0.056029047816991806,
"learning_rate": 9.646816354234968e-06,
"loss": 0.0012508154846727847,
"memory(GiB)": 160.86,
"step": 985,
"token_acc": 0.9996634129922585,
"train_speed(iter/s)": 0.043019
},
{
"epoch": 0.3628367234744365,
"grad_norm": 0.016829386353492737,
"learning_rate": 9.643264997861312e-06,
"loss": 0.0006543456576764584,
"memory(GiB)": 160.86,
"step": 990,
"token_acc": 0.9995789119083712,
"train_speed(iter/s)": 0.043032
},
{
"epoch": 0.36466923217885283,
"grad_norm": 0.035343799740076065,
"learning_rate": 9.6396965361422e-06,
"loss": 0.0010605846531689168,
"memory(GiB)": 160.86,
"step": 995,
"token_acc": 0.9996632146164857,
"train_speed(iter/s)": 0.043044
},
{
"epoch": 0.3665017408832692,
"grad_norm": 0.1007576435804367,
"learning_rate": 9.636110982223505e-06,
"loss": 0.0017275510355830193,
"memory(GiB)": 160.86,
"step": 1000,
"token_acc": 0.9993263157894737,
"train_speed(iter/s)": 0.043052
},
{
"epoch": 0.3665017408832692,
"eval_loss": 0.0009223763481713831,
"eval_runtime": 173.3991,
"eval_samples_per_second": 2.537,
"eval_steps_per_second": 2.537,
"eval_token_acc": 0.9996633151217422,
"step": 1000
},
{
"epoch": 0.3683342495876855,
"grad_norm": 0.005491136573255062,
"learning_rate": 9.632508349314066e-06,
"loss": 0.0003129460848867893,
"memory(GiB)": 160.86,
"step": 1005,
"token_acc": 0.9997021844125912,
"train_speed(iter/s)": 0.041292
},
{
"epoch": 0.3701667582921019,
"grad_norm": 0.052943065762519836,
"learning_rate": 9.628888650685642e-06,
"loss": 0.0011203167960047722,
"memory(GiB)": 160.86,
"step": 1010,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.041243
},
{
"epoch": 0.37199926699651825,
"grad_norm": 0.03638750687241554,
"learning_rate": 9.625251899672852e-06,
"loss": 0.0004535942804068327,
"memory(GiB)": 160.86,
"step": 1015,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.041263
},
{
"epoch": 0.37383177570093457,
"grad_norm": 0.010707657784223557,
"learning_rate": 9.621598109673142e-06,
"loss": 0.00024845553562045095,
"memory(GiB)": 160.86,
"step": 1020,
"token_acc": 1.0,
"train_speed(iter/s)": 0.041283
},
{
"epoch": 0.37566428440535093,
"grad_norm": 0.003029848216101527,
"learning_rate": 9.617927294146726e-06,
"loss": 0.000255924928933382,
"memory(GiB)": 160.86,
"step": 1025,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.041303
},
{
"epoch": 0.37749679310976725,
"grad_norm": 0.002139889169484377,
"learning_rate": 9.614239466616541e-06,
"loss": 0.001936671696603298,
"memory(GiB)": 160.86,
"step": 1030,
"token_acc": 0.9996633846671716,
"train_speed(iter/s)": 0.041322
},
{
"epoch": 0.3793293018141836,
"grad_norm": 0.033104073256254196,
"learning_rate": 9.61053464066819e-06,
"loss": 0.0009706121869385243,
"memory(GiB)": 160.86,
"step": 1035,
"token_acc": 0.9996632146164857,
"train_speed(iter/s)": 0.041341
},
{
"epoch": 0.3811618105186,
"grad_norm": 0.02874094434082508,
"learning_rate": 9.606812829949896e-06,
"loss": 0.0007171142846345901,
"memory(GiB)": 160.86,
"step": 1040,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.041359
},
{
"epoch": 0.3829943192230163,
"grad_norm": 0.13675667345523834,
"learning_rate": 9.603074048172458e-06,
"loss": 0.0008686968125402927,
"memory(GiB)": 160.86,
"step": 1045,
"token_acc": 0.9998317489694625,
"train_speed(iter/s)": 0.041378
},
{
"epoch": 0.38482682792743267,
"grad_norm": 0.325898677110672,
"learning_rate": 9.599318309109191e-06,
"loss": 0.001396147720515728,
"memory(GiB)": 160.86,
"step": 1050,
"token_acc": 0.9995791245791246,
"train_speed(iter/s)": 0.041398
},
{
"epoch": 0.386659336631849,
"grad_norm": 0.06272176653146744,
"learning_rate": 9.595545626595878e-06,
"loss": 0.002794544957578182,
"memory(GiB)": 160.86,
"step": 1055,
"token_acc": 0.9992422966829433,
"train_speed(iter/s)": 0.041416
},
{
"epoch": 0.38849184533626535,
"grad_norm": 0.019762301817536354,
"learning_rate": 9.591756014530723e-06,
"loss": 0.0009371510706841946,
"memory(GiB)": 160.86,
"step": 1060,
"token_acc": 0.9996630727762803,
"train_speed(iter/s)": 0.041434
},
{
"epoch": 0.3903243540406817,
"grad_norm": 0.09259835630655289,
"learning_rate": 9.587949486874295e-06,
"loss": 0.0013479561544954776,
"memory(GiB)": 160.86,
"step": 1065,
"token_acc": 0.9995791245791246,
"train_speed(iter/s)": 0.041453
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.05826210230588913,
"learning_rate": 9.58412605764948e-06,
"loss": 0.00075059924274683,
"memory(GiB)": 160.86,
"step": 1070,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.041452
},
{
"epoch": 0.3939893714495144,
"grad_norm": 0.02435746043920517,
"learning_rate": 9.580285740941425e-06,
"loss": 0.0010668656788766385,
"memory(GiB)": 160.86,
"step": 1075,
"token_acc": 0.9994948644552955,
"train_speed(iter/s)": 0.04147
},
{
"epoch": 0.3958218801539307,
"grad_norm": 0.06046979874372482,
"learning_rate": 9.57642855089749e-06,
"loss": 0.0006216964218765497,
"memory(GiB)": 160.86,
"step": 1080,
"token_acc": 0.9995790891489182,
"train_speed(iter/s)": 0.041489
},
{
"epoch": 0.3976543888583471,
"grad_norm": 0.02380959317088127,
"learning_rate": 9.572554501727198e-06,
"loss": 0.000693302508443594,
"memory(GiB)": 160.86,
"step": 1085,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.041506
},
{
"epoch": 0.3994868975627634,
"grad_norm": 0.015010896138846874,
"learning_rate": 9.568663607702174e-06,
"loss": 0.0005827041808515787,
"memory(GiB)": 160.86,
"step": 1090,
"token_acc": 0.9997476446837147,
"train_speed(iter/s)": 0.041523
},
{
"epoch": 0.40131940626717977,
"grad_norm": 0.17055855691432953,
"learning_rate": 9.564755883156103e-06,
"loss": 0.0010279595851898193,
"memory(GiB)": 160.86,
"step": 1095,
"token_acc": 0.9995791600033668,
"train_speed(iter/s)": 0.041535
},
{
"epoch": 0.40315191497159614,
"grad_norm": 0.0005144431488588452,
"learning_rate": 9.560831342484668e-06,
"loss": 0.00026263915933668616,
"memory(GiB)": 160.86,
"step": 1100,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.041545
},
{
"epoch": 0.40498442367601245,
"grad_norm": 0.019269630312919617,
"learning_rate": 9.556890000145503e-06,
"loss": 0.0010970150120556354,
"memory(GiB)": 160.86,
"step": 1105,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.041546
},
{
"epoch": 0.4068169323804288,
"grad_norm": 0.037301257252693176,
"learning_rate": 9.552931870658136e-06,
"loss": 0.001028469391167164,
"memory(GiB)": 160.86,
"step": 1110,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.041562
},
{
"epoch": 0.40864944108484513,
"grad_norm": 0.006164327263832092,
"learning_rate": 9.54895696860394e-06,
"loss": 0.0005135733168572188,
"memory(GiB)": 160.86,
"step": 1115,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.041578
},
{
"epoch": 0.4104819497892615,
"grad_norm": 0.1576082557439804,
"learning_rate": 9.544965308626075e-06,
"loss": 0.001076418813318014,
"memory(GiB)": 160.86,
"step": 1120,
"token_acc": 0.9996634413125789,
"train_speed(iter/s)": 0.041593
},
{
"epoch": 0.41231445849367787,
"grad_norm": 0.014838850125670433,
"learning_rate": 9.540956905429435e-06,
"loss": 0.000989390444010496,
"memory(GiB)": 160.86,
"step": 1125,
"token_acc": 0.9994946091644205,
"train_speed(iter/s)": 0.041608
},
{
"epoch": 0.4141469671980942,
"grad_norm": 0.014855766668915749,
"learning_rate": 9.536931773780598e-06,
"loss": 0.0015475031919777392,
"memory(GiB)": 160.86,
"step": 1130,
"token_acc": 0.9994103276893269,
"train_speed(iter/s)": 0.041623
},
{
"epoch": 0.41597947590251055,
"grad_norm": 0.019349105656147003,
"learning_rate": 9.53288992850776e-06,
"loss": 0.0005111652426421642,
"memory(GiB)": 160.86,
"step": 1135,
"token_acc": 1.0,
"train_speed(iter/s)": 0.041628
},
{
"epoch": 0.41781198460692687,
"grad_norm": 0.03461524471640587,
"learning_rate": 9.528831384500699e-06,
"loss": 0.0004519184119999409,
"memory(GiB)": 160.86,
"step": 1140,
"token_acc": 0.9999158036541214,
"train_speed(iter/s)": 0.041643
},
{
"epoch": 0.41964449331134324,
"grad_norm": 0.15801462531089783,
"learning_rate": 9.5247561567107e-06,
"loss": 0.00042958445847034453,
"memory(GiB)": 160.86,
"step": 1145,
"token_acc": 0.9997474322276477,
"train_speed(iter/s)": 0.041646
},
{
"epoch": 0.42147700201575955,
"grad_norm": 0.04607151448726654,
"learning_rate": 9.520664260150513e-06,
"loss": 0.0018787598237395287,
"memory(GiB)": 160.86,
"step": 1150,
"token_acc": 0.9995792662403231,
"train_speed(iter/s)": 0.04166
},
{
"epoch": 0.4233095107201759,
"grad_norm": 0.0973573699593544,
"learning_rate": 9.5165557098943e-06,
"loss": 0.0009789202362298966,
"memory(GiB)": 160.86,
"step": 1155,
"token_acc": 0.9997473684210526,
"train_speed(iter/s)": 0.041675
},
{
"epoch": 0.4251420194245923,
"grad_norm": 0.038962222635746,
"learning_rate": 9.512430521077565e-06,
"loss": 0.0009090069681406022,
"memory(GiB)": 160.86,
"step": 1160,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.041686
},
{
"epoch": 0.4269745281290086,
"grad_norm": 0.010646538808941841,
"learning_rate": 9.508288708897109e-06,
"loss": 0.00033488136250525713,
"memory(GiB)": 160.86,
"step": 1165,
"token_acc": 1.0,
"train_speed(iter/s)": 0.041701
},
{
"epoch": 0.42880703683342497,
"grad_norm": 0.0063909804448485374,
"learning_rate": 9.504130288610972e-06,
"loss": 0.0002777322195470333,
"memory(GiB)": 160.86,
"step": 1170,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.041717
},
{
"epoch": 0.4306395455378413,
"grad_norm": 0.0029652463272213936,
"learning_rate": 9.499955275538384e-06,
"loss": 0.0006769481580704451,
"memory(GiB)": 160.86,
"step": 1175,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.041732
},
{
"epoch": 0.43247205424225765,
"grad_norm": 0.03148781880736351,
"learning_rate": 9.495763685059689e-06,
"loss": 0.0021369663998484613,
"memory(GiB)": 160.86,
"step": 1180,
"token_acc": 0.9996631862579993,
"train_speed(iter/s)": 0.041747
},
{
"epoch": 0.434304562946674,
"grad_norm": 0.0476820208132267,
"learning_rate": 9.49155553261631e-06,
"loss": 0.0006943107582628727,
"memory(GiB)": 160.86,
"step": 1185,
"token_acc": 0.9999157610984752,
"train_speed(iter/s)": 0.041763
},
{
"epoch": 0.43613707165109034,
"grad_norm": 0.006549006327986717,
"learning_rate": 9.487330833710678e-06,
"loss": 0.00024927293416112664,
"memory(GiB)": 160.86,
"step": 1190,
"token_acc": 0.9999158532480646,
"train_speed(iter/s)": 0.041649
},
{
"epoch": 0.4379695803555067,
"grad_norm": 0.030179157853126526,
"learning_rate": 9.48308960390618e-06,
"loss": 0.0010321117006242275,
"memory(GiB)": 160.86,
"step": 1195,
"token_acc": 0.9997475597441938,
"train_speed(iter/s)": 0.041664
},
{
"epoch": 0.439802089059923,
"grad_norm": 0.0033925846219062805,
"learning_rate": 9.478831858827105e-06,
"loss": 0.00027046091854572297,
"memory(GiB)": 160.86,
"step": 1200,
"token_acc": 1.0,
"train_speed(iter/s)": 0.041679
},
{
"epoch": 0.4416345977643394,
"grad_norm": 0.07267381250858307,
"learning_rate": 9.474557614158575e-06,
"loss": 0.0008655142039060593,
"memory(GiB)": 160.86,
"step": 1205,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.041695
},
{
"epoch": 0.4434671064687557,
"grad_norm": 0.006699859630316496,
"learning_rate": 9.470266885646504e-06,
"loss": 0.0006839127279818058,
"memory(GiB)": 160.86,
"step": 1210,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.04171
},
{
"epoch": 0.44529961517317207,
"grad_norm": 0.01745425909757614,
"learning_rate": 9.465959689097525e-06,
"loss": 0.0009552924893796444,
"memory(GiB)": 160.86,
"step": 1215,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.041723
},
{
"epoch": 0.44713212387758844,
"grad_norm": 0.018873147666454315,
"learning_rate": 9.461636040378941e-06,
"loss": 0.0004271782469004393,
"memory(GiB)": 160.86,
"step": 1220,
"token_acc": 0.9998315789473684,
"train_speed(iter/s)": 0.04171
},
{
"epoch": 0.44896463258200475,
"grad_norm": 0.030013209208846092,
"learning_rate": 9.45729595541866e-06,
"loss": 0.0011812681332230568,
"memory(GiB)": 160.86,
"step": 1225,
"token_acc": 0.9996633280026934,
"train_speed(iter/s)": 0.041724
},
{
"epoch": 0.4507971412864211,
"grad_norm": 0.0008936990634538233,
"learning_rate": 9.452939450205139e-06,
"loss": 0.0004920902196317911,
"memory(GiB)": 160.86,
"step": 1230,
"token_acc": 0.9996634129922585,
"train_speed(iter/s)": 0.041738
},
{
"epoch": 0.45262964999083743,
"grad_norm": 0.06023690477013588,
"learning_rate": 9.448566540787331e-06,
"loss": 0.0010696605779230595,
"memory(GiB)": 160.86,
"step": 1235,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.041753
},
{
"epoch": 0.4544621586952538,
"grad_norm": 0.05453835055232048,
"learning_rate": 9.444177243274619e-06,
"loss": 0.0011446685530245304,
"memory(GiB)": 160.86,
"step": 1240,
"token_acc": 0.9994107248084856,
"train_speed(iter/s)": 0.041767
},
{
"epoch": 0.4562946673996702,
"grad_norm": 0.06793410331010818,
"learning_rate": 9.43977157383675e-06,
"loss": 0.0017616702243685722,
"memory(GiB)": 160.86,
"step": 1245,
"token_acc": 0.9994109231675503,
"train_speed(iter/s)": 0.04178
},
{
"epoch": 0.4581271761040865,
"grad_norm": 0.03625203296542168,
"learning_rate": 9.435349548703796e-06,
"loss": 0.000555843859910965,
"memory(GiB)": 160.86,
"step": 1250,
"token_acc": 0.9998317489694625,
"train_speed(iter/s)": 0.041794
},
{
"epoch": 0.45995968480850286,
"grad_norm": 0.08264432102441788,
"learning_rate": 9.430911184166074e-06,
"loss": 0.0007446614094078541,
"memory(GiB)": 160.86,
"step": 1255,
"token_acc": 0.9996634696281339,
"train_speed(iter/s)": 0.041808
},
{
"epoch": 0.46179219351291917,
"grad_norm": 0.03210179880261421,
"learning_rate": 9.426456496574095e-06,
"loss": 0.0009373857639729977,
"memory(GiB)": 160.86,
"step": 1260,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.041821
},
{
"epoch": 0.46362470221733554,
"grad_norm": 0.047844789922237396,
"learning_rate": 9.421985502338505e-06,
"loss": 0.0005674117710441351,
"memory(GiB)": 160.86,
"step": 1265,
"token_acc": 0.9997473258654089,
"train_speed(iter/s)": 0.041818
},
{
"epoch": 0.46545721092175185,
"grad_norm": 0.10654474049806595,
"learning_rate": 9.417498217930017e-06,
"loss": 0.0010964240878820418,
"memory(GiB)": 160.86,
"step": 1270,
"token_acc": 0.9994948644552955,
"train_speed(iter/s)": 0.04183
},
{
"epoch": 0.4672897196261682,
"grad_norm": 0.09114305675029755,
"learning_rate": 9.412994659879362e-06,
"loss": 0.0010675345547497272,
"memory(GiB)": 160.86,
"step": 1275,
"token_acc": 0.9997476022211005,
"train_speed(iter/s)": 0.041843
},
{
"epoch": 0.4691222283305846,
"grad_norm": 0.01834912970662117,
"learning_rate": 9.408474844777218e-06,
"loss": 0.0008592868223786354,
"memory(GiB)": 160.86,
"step": 1280,
"token_acc": 0.9996632146164857,
"train_speed(iter/s)": 0.041856
},
{
"epoch": 0.4709547370350009,
"grad_norm": 0.057866550981998444,
"learning_rate": 9.403938789274152e-06,
"loss": 0.0005749462172389031,
"memory(GiB)": 160.86,
"step": 1285,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.041858
},
{
"epoch": 0.4727872457394173,
"grad_norm": 0.06462471187114716,
"learning_rate": 9.39938651008056e-06,
"loss": 0.00032207604963332417,
"memory(GiB)": 160.86,
"step": 1290,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.041871
},
{
"epoch": 0.4746197544438336,
"grad_norm": 0.13423164188861847,
"learning_rate": 9.394818023966604e-06,
"loss": 0.0010271795094013215,
"memory(GiB)": 160.86,
"step": 1295,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.041884
},
{
"epoch": 0.47645226314824995,
"grad_norm": 0.08763778209686279,
"learning_rate": 9.39023334776215e-06,
"loss": 0.0028607085347175597,
"memory(GiB)": 160.86,
"step": 1300,
"token_acc": 0.9993261455525606,
"train_speed(iter/s)": 0.041897
},
{
"epoch": 0.4782847718526663,
"grad_norm": 0.002933151787146926,
"learning_rate": 9.385632498356713e-06,
"loss": 0.00027030634228140114,
"memory(GiB)": 160.86,
"step": 1305,
"token_acc": 1.0,
"train_speed(iter/s)": 0.041909
},
{
"epoch": 0.48011728055708264,
"grad_norm": 0.04423481225967407,
"learning_rate": 9.381015492699379e-06,
"loss": 0.00081101693212986,
"memory(GiB)": 160.86,
"step": 1310,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.041916
},
{
"epoch": 0.481949789261499,
"grad_norm": 0.02344198152422905,
"learning_rate": 9.376382347798756e-06,
"loss": 0.0003832927206531167,
"memory(GiB)": 160.86,
"step": 1315,
"token_acc": 0.9998316073082428,
"train_speed(iter/s)": 0.041926
},
{
"epoch": 0.4837822979659153,
"grad_norm": 0.016795309260487556,
"learning_rate": 9.371733080722911e-06,
"loss": 0.00048357550986111164,
"memory(GiB)": 160.86,
"step": 1320,
"token_acc": 0.9998315789473684,
"train_speed(iter/s)": 0.041939
},
{
"epoch": 0.4856148066703317,
"grad_norm": 0.09421277046203613,
"learning_rate": 9.3670677085993e-06,
"loss": 0.0011711867526173591,
"memory(GiB)": 160.86,
"step": 1325,
"token_acc": 0.9997474322276477,
"train_speed(iter/s)": 0.04195
},
{
"epoch": 0.487447315374748,
"grad_norm": 0.18248307704925537,
"learning_rate": 9.362386248614706e-06,
"loss": 0.0005028956104069949,
"memory(GiB)": 160.86,
"step": 1330,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.041963
},
{
"epoch": 0.48927982407916437,
"grad_norm": 0.04889710247516632,
"learning_rate": 9.357688718015185e-06,
"loss": 0.0029960500076413156,
"memory(GiB)": 160.86,
"step": 1335,
"token_acc": 0.9992425517589631,
"train_speed(iter/s)": 0.041975
},
{
"epoch": 0.49111233278358074,
"grad_norm": 0.01644892431795597,
"learning_rate": 9.35297513410599e-06,
"loss": 0.001054964866489172,
"memory(GiB)": 160.86,
"step": 1340,
"token_acc": 0.999663242970197,
"train_speed(iter/s)": 0.041987
},
{
"epoch": 0.49294484148799705,
"grad_norm": 0.06923960894346237,
"learning_rate": 9.348245514251515e-06,
"loss": 0.0015572577714920044,
"memory(GiB)": 160.86,
"step": 1345,
"token_acc": 0.99949499200404,
"train_speed(iter/s)": 0.041999
},
{
"epoch": 0.4947773501924134,
"grad_norm": 0.4345010817050934,
"learning_rate": 9.343499875875226e-06,
"loss": 0.0008648891933262348,
"memory(GiB)": 160.86,
"step": 1350,
"token_acc": 0.9998317914213625,
"train_speed(iter/s)": 0.042012
},
{
"epoch": 0.49660985889682974,
"grad_norm": 0.12544922530651093,
"learning_rate": 9.338738236459606e-06,
"loss": 0.0008970722556114197,
"memory(GiB)": 160.86,
"step": 1355,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.042024
},
{
"epoch": 0.4984423676012461,
"grad_norm": 0.04251859337091446,
"learning_rate": 9.333960613546079e-06,
"loss": 0.0008619870990514755,
"memory(GiB)": 160.86,
"step": 1360,
"token_acc": 0.9996632146164857,
"train_speed(iter/s)": 0.042036
},
{
"epoch": 0.5002748763056625,
"grad_norm": 0.05376381427049637,
"learning_rate": 9.329167024734951e-06,
"loss": 0.0009831368923187255,
"memory(GiB)": 160.86,
"step": 1365,
"token_acc": 0.9996631862579993,
"train_speed(iter/s)": 0.042046
},
{
"epoch": 0.5021073850100788,
"grad_norm": 0.03389672935009003,
"learning_rate": 9.32435748768535e-06,
"loss": 0.001122223772108555,
"memory(GiB)": 160.86,
"step": 1370,
"token_acc": 0.9995790891489182,
"train_speed(iter/s)": 0.042057
},
{
"epoch": 0.5039398937144951,
"grad_norm": 0.07879503071308136,
"learning_rate": 9.319532020115147e-06,
"loss": 0.0011348828673362731,
"memory(GiB)": 160.86,
"step": 1375,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.042069
},
{
"epoch": 0.5057724024189115,
"grad_norm": 0.004050049465149641,
"learning_rate": 9.314690639800906e-06,
"loss": 0.0002213560277596116,
"memory(GiB)": 160.86,
"step": 1380,
"token_acc": 1.0,
"train_speed(iter/s)": 0.04208
},
{
"epoch": 0.5076049111233278,
"grad_norm": 0.028278427198529243,
"learning_rate": 9.30983336457781e-06,
"loss": 0.0009013951756060123,
"memory(GiB)": 160.86,
"step": 1385,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.042093
},
{
"epoch": 0.5094374198277442,
"grad_norm": 0.020806804299354553,
"learning_rate": 9.304960212339602e-06,
"loss": 0.001097150705754757,
"memory(GiB)": 160.86,
"step": 1390,
"token_acc": 0.9995791954216462,
"train_speed(iter/s)": 0.042093
},
{
"epoch": 0.5112699285321606,
"grad_norm": 0.05375039204955101,
"learning_rate": 9.300071201038503e-06,
"loss": 0.0004816567990928888,
"memory(GiB)": 160.86,
"step": 1395,
"token_acc": 0.9998316073082428,
"train_speed(iter/s)": 0.042105
},
{
"epoch": 0.5131024372365769,
"grad_norm": 0.005027708597481251,
"learning_rate": 9.295166348685169e-06,
"loss": 0.0004785487428307533,
"memory(GiB)": 160.86,
"step": 1400,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.042115
},
{
"epoch": 0.5149349459409932,
"grad_norm": 0.007288212422281504,
"learning_rate": 9.290245673348609e-06,
"loss": 0.00039666993543505666,
"memory(GiB)": 160.86,
"step": 1405,
"token_acc": 0.9998316781686585,
"train_speed(iter/s)": 0.042118
},
{
"epoch": 0.5167674546454095,
"grad_norm": 0.0003485670604277402,
"learning_rate": 9.285309193156118e-06,
"loss": 0.0002419668948277831,
"memory(GiB)": 160.86,
"step": 1410,
"token_acc": 0.9999158461667929,
"train_speed(iter/s)": 0.042128
},
{
"epoch": 0.5185999633498259,
"grad_norm": 0.05836885794997215,
"learning_rate": 9.280356926293222e-06,
"loss": 0.0011019782163202763,
"memory(GiB)": 160.86,
"step": 1415,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.04214
},
{
"epoch": 0.5204324720542423,
"grad_norm": 0.030392736196517944,
"learning_rate": 9.275388891003596e-06,
"loss": 0.0003588124178349972,
"memory(GiB)": 160.86,
"step": 1420,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.042152
},
{
"epoch": 0.5222649807586586,
"grad_norm": 0.10738146305084229,
"learning_rate": 9.270405105589012e-06,
"loss": 0.0022922657430171967,
"memory(GiB)": 160.86,
"step": 1425,
"token_acc": 0.9995792308339645,
"train_speed(iter/s)": 0.042164
},
{
"epoch": 0.524097489463075,
"grad_norm": 0.024856839329004288,
"learning_rate": 9.265405588409258e-06,
"loss": 0.000432960782200098,
"memory(GiB)": 160.86,
"step": 1430,
"token_acc": 0.9999158390843292,
"train_speed(iter/s)": 0.042176
},
{
"epoch": 0.5259299981674913,
"grad_norm": 0.023576080799102783,
"learning_rate": 9.26039035788208e-06,
"loss": 0.0014881092123687268,
"memory(GiB)": 160.86,
"step": 1435,
"token_acc": 0.9995794078061911,
"train_speed(iter/s)": 0.042185
},
{
"epoch": 0.5277625068719076,
"grad_norm": 0.025212427601218224,
"learning_rate": 9.255359432483106e-06,
"loss": 0.0006445163395255804,
"memory(GiB)": 160.86,
"step": 1440,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.042195
},
{
"epoch": 0.5295950155763239,
"grad_norm": 0.05869888886809349,
"learning_rate": 9.25031283074579e-06,
"loss": 0.0012847738340497016,
"memory(GiB)": 160.86,
"step": 1445,
"token_acc": 0.9995791245791246,
"train_speed(iter/s)": 0.042206
},
{
"epoch": 0.5314275242807404,
"grad_norm": 0.02733391709625721,
"learning_rate": 9.245250571261328e-06,
"loss": 0.0012956521473824977,
"memory(GiB)": 160.86,
"step": 1450,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.042217
},
{
"epoch": 0.5332600329851567,
"grad_norm": 0.01605917513370514,
"learning_rate": 9.240172672678603e-06,
"loss": 0.0010051255114376545,
"memory(GiB)": 160.86,
"step": 1455,
"token_acc": 0.9997476234541937,
"train_speed(iter/s)": 0.042217
},
{
"epoch": 0.535092541689573,
"grad_norm": 0.07777733355760574,
"learning_rate": 9.235079153704108e-06,
"loss": 0.001209939643740654,
"memory(GiB)": 160.86,
"step": 1460,
"token_acc": 0.9994948644552955,
"train_speed(iter/s)": 0.042228
},
{
"epoch": 0.5369250503939894,
"grad_norm": 0.024418100714683533,
"learning_rate": 9.229970033101881e-06,
"loss": 0.0006480346899479627,
"memory(GiB)": 160.86,
"step": 1465,
"token_acc": 0.9998315789473684,
"train_speed(iter/s)": 0.042239
},
{
"epoch": 0.5387575590984057,
"grad_norm": 0.051130812615156174,
"learning_rate": 9.224845329693434e-06,
"loss": 0.0005965878255665303,
"memory(GiB)": 160.86,
"step": 1470,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.04225
},
{
"epoch": 0.540590067802822,
"grad_norm": 0.03825452923774719,
"learning_rate": 9.21970506235769e-06,
"loss": 0.0003675919026136398,
"memory(GiB)": 160.86,
"step": 1475,
"token_acc": 0.9998315221969506,
"train_speed(iter/s)": 0.042259
},
{
"epoch": 0.5424225765072385,
"grad_norm": 0.05280032381415367,
"learning_rate": 9.214549250030899e-06,
"loss": 0.00044973762705922125,
"memory(GiB)": 160.86,
"step": 1480,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.042271
},
{
"epoch": 0.5442550852116548,
"grad_norm": 0.13924196362495422,
"learning_rate": 9.209377911706585e-06,
"loss": 0.0010926604270935058,
"memory(GiB)": 160.86,
"step": 1485,
"token_acc": 0.9996634979389248,
"train_speed(iter/s)": 0.042282
},
{
"epoch": 0.5460875939160711,
"grad_norm": 0.0010057148756459355,
"learning_rate": 9.204191066435463e-06,
"loss": 7.150891469791532e-05,
"memory(GiB)": 160.86,
"step": 1490,
"token_acc": 1.0,
"train_speed(iter/s)": 0.042286
},
{
"epoch": 0.5479201026204874,
"grad_norm": 0.0028190938755869865,
"learning_rate": 9.198988733325381e-06,
"loss": 0.00018844833830371498,
"memory(GiB)": 160.86,
"step": 1495,
"token_acc": 1.0,
"train_speed(iter/s)": 0.042297
},
{
"epoch": 0.5497526113249038,
"grad_norm": 0.2529807388782501,
"learning_rate": 9.19377093154123e-06,
"loss": 0.0006476116366684436,
"memory(GiB)": 160.86,
"step": 1500,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.034291
},
{
"epoch": 0.5497526113249038,
"eval_loss": 0.0008861870155669749,
"eval_runtime": 172.4847,
"eval_samples_per_second": 2.551,
"eval_steps_per_second": 2.551,
"eval_token_acc": 0.999755138270358,
"step": 1500
},
{
"epoch": 0.5515851200293201,
"grad_norm": 0.07361137121915817,
"learning_rate": 9.188537680304901e-06,
"loss": 0.001575019396841526,
"memory(GiB)": 160.86,
"step": 1505,
"token_acc": 0.9997150923359839,
"train_speed(iter/s)": 0.033682
},
{
"epoch": 0.5534176287337365,
"grad_norm": 0.1123221218585968,
"learning_rate": 9.18328899889519e-06,
"loss": 0.0008759641088545323,
"memory(GiB)": 160.86,
"step": 1510,
"token_acc": 0.9997474960020201,
"train_speed(iter/s)": 0.033712
},
{
"epoch": 0.5552501374381529,
"grad_norm": 0.031373172998428345,
"learning_rate": 9.17802490664774e-06,
"loss": 0.0005370716098695993,
"memory(GiB)": 160.86,
"step": 1515,
"token_acc": 0.9997475385003787,
"train_speed(iter/s)": 0.033741
},
{
"epoch": 0.5570826461425692,
"grad_norm": 0.00548228295519948,
"learning_rate": 9.172745422954961e-06,
"loss": 0.0006150617729872466,
"memory(GiB)": 160.86,
"step": 1520,
"token_acc": 0.9997476234541937,
"train_speed(iter/s)": 0.033771
},
{
"epoch": 0.5589151548469855,
"grad_norm": 0.09783894568681717,
"learning_rate": 9.167450567265972e-06,
"loss": 0.0003677058033645153,
"memory(GiB)": 160.86,
"step": 1525,
"token_acc": 0.9999158036541214,
"train_speed(iter/s)": 0.033793
},
{
"epoch": 0.5607476635514018,
"grad_norm": 0.02310693822801113,
"learning_rate": 9.162140359086515e-06,
"loss": 0.0013180834241211415,
"memory(GiB)": 160.86,
"step": 1530,
"token_acc": 0.9994106751978448,
"train_speed(iter/s)": 0.033822
},
{
"epoch": 0.5625801722558182,
"grad_norm": 0.07956714183092117,
"learning_rate": 9.156814817978889e-06,
"loss": 0.0014457314275205136,
"memory(GiB)": 160.86,
"step": 1535,
"token_acc": 0.9994950345059754,
"train_speed(iter/s)": 0.033851
},
{
"epoch": 0.5644126809602346,
"grad_norm": 0.007547269109636545,
"learning_rate": 9.151473963561884e-06,
"loss": 0.0004539607558399439,
"memory(GiB)": 160.86,
"step": 1540,
"token_acc": 0.9998316356595673,
"train_speed(iter/s)": 0.033879
},
{
"epoch": 0.5662451896646509,
"grad_norm": 0.016255052760243416,
"learning_rate": 9.146117815510691e-06,
"loss": 0.0003765122266486287,
"memory(GiB)": 160.86,
"step": 1545,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.033907
},
{
"epoch": 0.5680776983690673,
"grad_norm": 0.06404280662536621,
"learning_rate": 9.140746393556853e-06,
"loss": 0.0009273691102862358,
"memory(GiB)": 160.86,
"step": 1550,
"token_acc": 0.9994106751978448,
"train_speed(iter/s)": 0.033936
},
{
"epoch": 0.5699102070734836,
"grad_norm": 0.030146759003400803,
"learning_rate": 9.135359717488179e-06,
"loss": 0.0006903111469000577,
"memory(GiB)": 160.86,
"step": 1555,
"token_acc": 0.9997473471450228,
"train_speed(iter/s)": 0.033965
},
{
"epoch": 0.5717427157778999,
"grad_norm": 0.017701471224427223,
"learning_rate": 9.129957807148666e-06,
"loss": 0.0014588728547096253,
"memory(GiB)": 160.86,
"step": 1560,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.033993
},
{
"epoch": 0.5735752244823162,
"grad_norm": 0.02424156479537487,
"learning_rate": 9.124540682438438e-06,
"loss": 0.00092041976749897,
"memory(GiB)": 160.86,
"step": 1565,
"token_acc": 0.9997475809844342,
"train_speed(iter/s)": 0.034021
},
{
"epoch": 0.5754077331867327,
"grad_norm": 0.06382456421852112,
"learning_rate": 9.119108363313665e-06,
"loss": 0.0009634297341108323,
"memory(GiB)": 160.86,
"step": 1570,
"token_acc": 0.9996634413125789,
"train_speed(iter/s)": 0.034047
},
{
"epoch": 0.577240241891149,
"grad_norm": 0.011778367683291435,
"learning_rate": 9.113660869786491e-06,
"loss": 0.0007347457576543093,
"memory(GiB)": 160.86,
"step": 1575,
"token_acc": 0.999663242970197,
"train_speed(iter/s)": 0.034075
},
{
"epoch": 0.5790727505955653,
"grad_norm": 0.01488505955785513,
"learning_rate": 9.108198221924966e-06,
"loss": 0.0007065658923238516,
"memory(GiB)": 160.86,
"step": 1580,
"token_acc": 0.9996636677036912,
"train_speed(iter/s)": 0.034103
},
{
"epoch": 0.5809052592999817,
"grad_norm": 0.016339842230081558,
"learning_rate": 9.102720439852964e-06,
"loss": 0.0004196997731924057,
"memory(GiB)": 160.86,
"step": 1585,
"token_acc": 0.9999158674070335,
"train_speed(iter/s)": 0.034131
},
{
"epoch": 0.582737768004398,
"grad_norm": 0.03133771941065788,
"learning_rate": 9.097227543750109e-06,
"loss": 0.0003929842729121447,
"memory(GiB)": 160.86,
"step": 1590,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.034104
},
{
"epoch": 0.5845702767088143,
"grad_norm": 0.10911545157432556,
"learning_rate": 9.091719553851707e-06,
"loss": 0.00033823368139564993,
"memory(GiB)": 160.86,
"step": 1595,
"token_acc": 0.9998316781686585,
"train_speed(iter/s)": 0.034131
},
{
"epoch": 0.5864027854132308,
"grad_norm": 0.06253647804260254,
"learning_rate": 9.086196490448668e-06,
"loss": 0.0004926771856844425,
"memory(GiB)": 160.86,
"step": 1600,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.034154
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.01017008163034916,
"learning_rate": 9.080658373887432e-06,
"loss": 0.0021519148722290993,
"memory(GiB)": 160.86,
"step": 1605,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.034177
},
{
"epoch": 0.5900678028220634,
"grad_norm": 0.027529926970601082,
"learning_rate": 9.07510522456989e-06,
"loss": 0.000728294812142849,
"memory(GiB)": 160.86,
"step": 1610,
"token_acc": 0.9996633280026934,
"train_speed(iter/s)": 0.034203
},
{
"epoch": 0.5919003115264797,
"grad_norm": 0.14524707198143005,
"learning_rate": 9.069537062953318e-06,
"loss": 0.0007321128156036139,
"memory(GiB)": 160.86,
"step": 1615,
"token_acc": 0.9996633846671716,
"train_speed(iter/s)": 0.03423
},
{
"epoch": 0.5937328202308961,
"grad_norm": 0.010788935236632824,
"learning_rate": 9.063953909550289e-06,
"loss": 0.0007929414510726929,
"memory(GiB)": 160.86,
"step": 1620,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.034256
},
{
"epoch": 0.5955653289353124,
"grad_norm": 0.04031025990843773,
"learning_rate": 9.05835578492861e-06,
"loss": 0.00044157886877655984,
"memory(GiB)": 160.86,
"step": 1625,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.034282
},
{
"epoch": 0.5973978376397288,
"grad_norm": 0.005226753186434507,
"learning_rate": 9.052742709711234e-06,
"loss": 0.0007471313234418631,
"memory(GiB)": 160.86,
"step": 1630,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.034307
},
{
"epoch": 0.5992303463441452,
"grad_norm": 0.006849437486380339,
"learning_rate": 9.0471147045762e-06,
"loss": 0.00016981502994894981,
"memory(GiB)": 160.86,
"step": 1635,
"token_acc": 1.0,
"train_speed(iter/s)": 0.034314
},
{
"epoch": 0.6010628550485615,
"grad_norm": 0.0021249176934361458,
"learning_rate": 9.041471790256543e-06,
"loss": 0.0004975998308509588,
"memory(GiB)": 160.86,
"step": 1640,
"token_acc": 0.9999157965644998,
"train_speed(iter/s)": 0.034341
},
{
"epoch": 0.6028953637529778,
"grad_norm": 0.03091166540980339,
"learning_rate": 9.035813987540216e-06,
"loss": 0.001137539092451334,
"memory(GiB)": 160.86,
"step": 1645,
"token_acc": 0.999579018270607,
"train_speed(iter/s)": 0.034367
},
{
"epoch": 0.6047278724573941,
"grad_norm": 0.020048417150974274,
"learning_rate": 9.030141317270026e-06,
"loss": 0.0009108279831707477,
"memory(GiB)": 160.86,
"step": 1650,
"token_acc": 0.9997473471450228,
"train_speed(iter/s)": 0.034393
},
{
"epoch": 0.6065603811618105,
"grad_norm": 0.0024872045032680035,
"learning_rate": 9.02445380034355e-06,
"loss": 0.00014628460630774497,
"memory(GiB)": 160.86,
"step": 1655,
"token_acc": 1.0,
"train_speed(iter/s)": 0.034418
},
{
"epoch": 0.6083928898662269,
"grad_norm": 0.1102481409907341,
"learning_rate": 9.018751457713062e-06,
"loss": 0.002010086178779602,
"memory(GiB)": 160.86,
"step": 1660,
"token_acc": 0.9996634413125789,
"train_speed(iter/s)": 0.034443
},
{
"epoch": 0.6102253985706432,
"grad_norm": 0.0067368666641414165,
"learning_rate": 9.013034310385442e-06,
"loss": 0.0004647184628993273,
"memory(GiB)": 160.86,
"step": 1665,
"token_acc": 0.9997474322276477,
"train_speed(iter/s)": 0.034469
},
{
"epoch": 0.6120579072750596,
"grad_norm": 0.0039915889501571655,
"learning_rate": 9.007302379422118e-06,
"loss": 0.0008955980651080608,
"memory(GiB)": 160.86,
"step": 1670,
"token_acc": 0.999663129526697,
"train_speed(iter/s)": 0.03449
},
{
"epoch": 0.6138904159794759,
"grad_norm": 0.04223395511507988,
"learning_rate": 9.00155568593898e-06,
"loss": 0.0006724436767399311,
"memory(GiB)": 160.86,
"step": 1675,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.034516
},
{
"epoch": 0.6157229246838922,
"grad_norm": 0.013977458700537682,
"learning_rate": 8.995794251106295e-06,
"loss": 0.0012857289984822273,
"memory(GiB)": 160.86,
"step": 1680,
"token_acc": 0.9995791245791246,
"train_speed(iter/s)": 0.034534
},
{
"epoch": 0.6175554333883086,
"grad_norm": 0.02960984595119953,
"learning_rate": 8.99001809614864e-06,
"loss": 0.0006384906824678183,
"memory(GiB)": 160.86,
"step": 1685,
"token_acc": 0.9997474109623642,
"train_speed(iter/s)": 0.034559
},
{
"epoch": 0.619387942092725,
"grad_norm": 0.14135026931762695,
"learning_rate": 8.98422724234482e-06,
"loss": 0.0018129302188754082,
"memory(GiB)": 160.86,
"step": 1690,
"token_acc": 0.9994108735903047,
"train_speed(iter/s)": 0.034584
},
{
"epoch": 0.6212204507971413,
"grad_norm": 0.011938896030187607,
"learning_rate": 8.978421711027789e-06,
"loss": 0.0010257656686007977,
"memory(GiB)": 160.86,
"step": 1695,
"token_acc": 0.999579018270607,
"train_speed(iter/s)": 0.034609
},
{
"epoch": 0.6230529595015576,
"grad_norm": 0.02054041065275669,
"learning_rate": 8.97260152358457e-06,
"loss": 0.0010426132939755917,
"memory(GiB)": 160.86,
"step": 1700,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.034627
},
{
"epoch": 0.624885468205974,
"grad_norm": 0.057805608958005905,
"learning_rate": 8.966766701456177e-06,
"loss": 0.0011805295012891292,
"memory(GiB)": 160.86,
"step": 1705,
"token_acc": 0.9994950345059754,
"train_speed(iter/s)": 0.03465
},
{
"epoch": 0.6267179769103903,
"grad_norm": 0.01560523733496666,
"learning_rate": 8.96091726613754e-06,
"loss": 0.0006526369601488113,
"memory(GiB)": 160.86,
"step": 1710,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.034675
},
{
"epoch": 0.6285504856148066,
"grad_norm": 0.02277560532093048,
"learning_rate": 8.95505323917742e-06,
"loss": 0.0003244250314310193,
"memory(GiB)": 160.86,
"step": 1715,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.034697
},
{
"epoch": 0.6303829943192231,
"grad_norm": 0.03905067220330238,
"learning_rate": 8.949174642178333e-06,
"loss": 0.0006646113935858012,
"memory(GiB)": 160.86,
"step": 1720,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.034715
},
{
"epoch": 0.6322155030236394,
"grad_norm": 0.004376774653792381,
"learning_rate": 8.94328149679647e-06,
"loss": 0.0006781556177884341,
"memory(GiB)": 160.86,
"step": 1725,
"token_acc": 0.9996631011538786,
"train_speed(iter/s)": 0.034739
},
{
"epoch": 0.6340480117280557,
"grad_norm": 0.08241453766822815,
"learning_rate": 8.937373824741618e-06,
"loss": 0.0007374928332865238,
"memory(GiB)": 160.86,
"step": 1730,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.034764
},
{
"epoch": 0.635880520432472,
"grad_norm": 0.02215947024524212,
"learning_rate": 8.931451647777076e-06,
"loss": 0.001058538444340229,
"memory(GiB)": 160.86,
"step": 1735,
"token_acc": 0.9994950770007573,
"train_speed(iter/s)": 0.034781
},
{
"epoch": 0.6377130291368884,
"grad_norm": 0.05471364036202431,
"learning_rate": 8.92551498771958e-06,
"loss": 0.0005416409578174353,
"memory(GiB)": 160.86,
"step": 1740,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.034805
},
{
"epoch": 0.6395455378413047,
"grad_norm": 0.0009198402985930443,
"learning_rate": 8.919563866439218e-06,
"loss": 0.0011710536666214467,
"memory(GiB)": 160.86,
"step": 1745,
"token_acc": 0.9995790537127462,
"train_speed(iter/s)": 0.034822
},
{
"epoch": 0.6413780465457211,
"grad_norm": 0.02374288998544216,
"learning_rate": 8.913598305859354e-06,
"loss": 0.0002880813553929329,
"memory(GiB)": 160.86,
"step": 1750,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.034839
},
{
"epoch": 0.6432105552501375,
"grad_norm": 0.03671794757246971,
"learning_rate": 8.907618327956546e-06,
"loss": 0.0009451866149902344,
"memory(GiB)": 160.86,
"step": 1755,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.034863
},
{
"epoch": 0.6450430639545538,
"grad_norm": 0.02204386703670025,
"learning_rate": 8.90162395476046e-06,
"loss": 0.00012790242908522487,
"memory(GiB)": 160.86,
"step": 1760,
"token_acc": 1.0,
"train_speed(iter/s)": 0.034887
},
{
"epoch": 0.6468755726589701,
"grad_norm": 0.006437621079385281,
"learning_rate": 8.895615208353796e-06,
"loss": 0.0011966807767748832,
"memory(GiB)": 160.86,
"step": 1765,
"token_acc": 0.9996632146164857,
"train_speed(iter/s)": 0.034911
},
{
"epoch": 0.6487080813633864,
"grad_norm": 0.06638949364423752,
"learning_rate": 8.889592110872203e-06,
"loss": 0.0013600192032754421,
"memory(GiB)": 160.86,
"step": 1770,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.034934
},
{
"epoch": 0.6505405900678028,
"grad_norm": 0.029982449486851692,
"learning_rate": 8.883554684504198e-06,
"loss": 0.00047690006904304026,
"memory(GiB)": 160.86,
"step": 1775,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.034958
},
{
"epoch": 0.6523730987722192,
"grad_norm": 0.0004446969833225012,
"learning_rate": 8.877502951491083e-06,
"loss": 0.0002472808351740241,
"memory(GiB)": 160.86,
"step": 1780,
"token_acc": 0.9999158674070335,
"train_speed(iter/s)": 0.034982
},
{
"epoch": 0.6542056074766355,
"grad_norm": 0.045220986008644104,
"learning_rate": 8.871436934126865e-06,
"loss": 0.00016599131049588323,
"memory(GiB)": 160.86,
"step": 1785,
"token_acc": 0.9999158107425492,
"train_speed(iter/s)": 0.035005
},
{
"epoch": 0.6560381161810519,
"grad_norm": 0.08464392274618149,
"learning_rate": 8.865356654758175e-06,
"loss": 0.0011138648726046086,
"memory(GiB)": 160.86,
"step": 1790,
"token_acc": 0.9997474534893509,
"train_speed(iter/s)": 0.035029
},
{
"epoch": 0.6578706248854682,
"grad_norm": 0.018666911870241165,
"learning_rate": 8.859262135784184e-06,
"loss": 0.0008051570504903794,
"memory(GiB)": 160.86,
"step": 1795,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.035052
},
{
"epoch": 0.6597031335898845,
"grad_norm": 0.03633316978812218,
"learning_rate": 8.853153399656513e-06,
"loss": 0.0012314721010625363,
"memory(GiB)": 160.86,
"step": 1800,
"token_acc": 0.9997476022211005,
"train_speed(iter/s)": 0.035075
},
{
"epoch": 0.661535642294301,
"grad_norm": 0.07466746866703033,
"learning_rate": 8.84703046887917e-06,
"loss": 0.0005056848283857107,
"memory(GiB)": 160.86,
"step": 1805,
"token_acc": 0.9998315789473684,
"train_speed(iter/s)": 0.035098
},
{
"epoch": 0.6633681509987173,
"grad_norm": 0.058270856738090515,
"learning_rate": 8.840893366008443e-06,
"loss": 0.0027731884270906447,
"memory(GiB)": 160.86,
"step": 1810,
"token_acc": 0.9989051709617652,
"train_speed(iter/s)": 0.03512
},
{
"epoch": 0.6652006597031336,
"grad_norm": 0.053415171802043915,
"learning_rate": 8.834742113652835e-06,
"loss": 0.0012996003031730651,
"memory(GiB)": 160.86,
"step": 1815,
"token_acc": 0.9996633846671716,
"train_speed(iter/s)": 0.035143
},
{
"epoch": 0.6670331684075499,
"grad_norm": 0.17921970784664154,
"learning_rate": 8.828576734472975e-06,
"loss": 0.002054636925458908,
"memory(GiB)": 160.86,
"step": 1820,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.035166
},
{
"epoch": 0.6688656771119663,
"grad_norm": 0.2059200257062912,
"learning_rate": 8.82239725118153e-06,
"loss": 0.000544156739488244,
"memory(GiB)": 160.86,
"step": 1825,
"token_acc": 0.9998317914213625,
"train_speed(iter/s)": 0.035188
},
{
"epoch": 0.6706981858163826,
"grad_norm": 0.0659668818116188,
"learning_rate": 8.816203686543128e-06,
"loss": 0.0011439280584454536,
"memory(GiB)": 160.86,
"step": 1830,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.03521
},
{
"epoch": 0.6725306945207989,
"grad_norm": 0.027126120403409004,
"learning_rate": 8.80999606337427e-06,
"loss": 0.0006697001401335001,
"memory(GiB)": 160.86,
"step": 1835,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.035233
},
{
"epoch": 0.6743632032252154,
"grad_norm": 0.04717881977558136,
"learning_rate": 8.803774404543246e-06,
"loss": 0.0008460984565317631,
"memory(GiB)": 160.86,
"step": 1840,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.035255
},
{
"epoch": 0.6761957119296317,
"grad_norm": 0.03212764859199524,
"learning_rate": 8.79753873297006e-06,
"loss": 0.0013919253833591938,
"memory(GiB)": 160.86,
"step": 1845,
"token_acc": 0.9995793370351674,
"train_speed(iter/s)": 0.035277
},
{
"epoch": 0.678028220634048,
"grad_norm": 0.004734094720333815,
"learning_rate": 8.791289071626324e-06,
"loss": 0.0017154796048998832,
"memory(GiB)": 160.86,
"step": 1850,
"token_acc": 0.9994106751978448,
"train_speed(iter/s)": 0.035298
},
{
"epoch": 0.6798607293384643,
"grad_norm": 0.002792911371216178,
"learning_rate": 8.7850254435352e-06,
"loss": 0.00024983214680105446,
"memory(GiB)": 160.86,
"step": 1855,
"token_acc": 0.9999158532480646,
"train_speed(iter/s)": 0.035304
},
{
"epoch": 0.6816932380428807,
"grad_norm": 0.069346122443676,
"learning_rate": 8.778747871771293e-06,
"loss": 0.0004865613766014576,
"memory(GiB)": 160.86,
"step": 1860,
"token_acc": 0.9999158603281447,
"train_speed(iter/s)": 0.035326
},
{
"epoch": 0.683525746747297,
"grad_norm": 0.0010090708965435624,
"learning_rate": 8.772456379460578e-06,
"loss": 0.0005619535222649574,
"memory(GiB)": 160.86,
"step": 1865,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.035348
},
{
"epoch": 0.6853582554517134,
"grad_norm": 0.00402231328189373,
"learning_rate": 8.766150989780317e-06,
"loss": 0.00032461092341691257,
"memory(GiB)": 160.86,
"step": 1870,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.03537
},
{
"epoch": 0.6871907641561298,
"grad_norm": 0.016630422323942184,
"learning_rate": 8.759831725958963e-06,
"loss": 0.0007076055742800235,
"memory(GiB)": 160.86,
"step": 1875,
"token_acc": 0.9999158461667929,
"train_speed(iter/s)": 0.035386
},
{
"epoch": 0.6890232728605461,
"grad_norm": 0.13864953815937042,
"learning_rate": 8.75349861127608e-06,
"loss": 0.0009592998772859574,
"memory(GiB)": 160.86,
"step": 1880,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.035408
},
{
"epoch": 0.6908557815649624,
"grad_norm": 0.12857644259929657,
"learning_rate": 8.747151669062256e-06,
"loss": 0.0003430765587836504,
"memory(GiB)": 160.86,
"step": 1885,
"token_acc": 0.9998316781686585,
"train_speed(iter/s)": 0.035429
},
{
"epoch": 0.6926882902693787,
"grad_norm": 0.007042865734547377,
"learning_rate": 8.740790922699024e-06,
"loss": 0.0002988249296322465,
"memory(GiB)": 160.86,
"step": 1890,
"token_acc": 0.9999157823816742,
"train_speed(iter/s)": 0.035451
},
{
"epoch": 0.6945207989737952,
"grad_norm": 0.004211138002574444,
"learning_rate": 8.73441639561877e-06,
"loss": 0.000298920925706625,
"memory(GiB)": 160.86,
"step": 1895,
"token_acc": 0.9998316781686585,
"train_speed(iter/s)": 0.035464
},
{
"epoch": 0.6963533076782115,
"grad_norm": 0.10895411670207977,
"learning_rate": 8.728028111304639e-06,
"loss": 0.0018308842554688454,
"memory(GiB)": 160.86,
"step": 1900,
"token_acc": 0.9995788054923764,
"train_speed(iter/s)": 0.035485
},
{
"epoch": 0.6981858163826278,
"grad_norm": 0.05376400053501129,
"learning_rate": 8.721626093290461e-06,
"loss": 0.0004374215379357338,
"memory(GiB)": 160.86,
"step": 1905,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.035506
},
{
"epoch": 0.7000183250870442,
"grad_norm": 0.007238362450152636,
"learning_rate": 8.715210365160662e-06,
"loss": 6.630108109675347e-05,
"memory(GiB)": 160.86,
"step": 1910,
"token_acc": 1.0,
"train_speed(iter/s)": 0.035527
},
{
"epoch": 0.7018508337914605,
"grad_norm": 0.00040705734863877296,
"learning_rate": 8.708780950550173e-06,
"loss": 0.0006973243784159422,
"memory(GiB)": 160.86,
"step": 1915,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.035548
},
{
"epoch": 0.7036833424958768,
"grad_norm": 0.0899810642004013,
"learning_rate": 8.702337873144343e-06,
"loss": 0.0013748856261372566,
"memory(GiB)": 160.86,
"step": 1920,
"token_acc": 0.9994948219247285,
"train_speed(iter/s)": 0.035569
},
{
"epoch": 0.7055158512002933,
"grad_norm": 0.08953223377466202,
"learning_rate": 8.695881156678856e-06,
"loss": 0.0006622021552175284,
"memory(GiB)": 160.86,
"step": 1925,
"token_acc": 0.9997475597441938,
"train_speed(iter/s)": 0.035589
},
{
"epoch": 0.7073483599047096,
"grad_norm": 0.015041066333651543,
"learning_rate": 8.689410824939639e-06,
"loss": 0.0003675042651593685,
"memory(GiB)": 160.86,
"step": 1930,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.03561
},
{
"epoch": 0.7091808686091259,
"grad_norm": 0.015323134139180183,
"learning_rate": 8.682926901762776e-06,
"loss": 0.0009645667858421802,
"memory(GiB)": 160.86,
"step": 1935,
"token_acc": 0.999663242970197,
"train_speed(iter/s)": 0.03563
},
{
"epoch": 0.7110133773135422,
"grad_norm": 0.05264544486999512,
"learning_rate": 8.676429411034423e-06,
"loss": 0.0006276907399296761,
"memory(GiB)": 160.86,
"step": 1940,
"token_acc": 0.9996633280026934,
"train_speed(iter/s)": 0.035648
},
{
"epoch": 0.7128458860179586,
"grad_norm": 0.0028159820940345526,
"learning_rate": 8.669918376690716e-06,
"loss": 0.00036051685456186535,
"memory(GiB)": 160.86,
"step": 1945,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.035668
},
{
"epoch": 0.7146783947223749,
"grad_norm": 0.0341511145234108,
"learning_rate": 8.663393822717686e-06,
"loss": 0.0003709573531523347,
"memory(GiB)": 160.86,
"step": 1950,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.035688
},
{
"epoch": 0.7165109034267912,
"grad_norm": 0.0006480200099758804,
"learning_rate": 8.656855773151163e-06,
"loss": 0.0003987106028944254,
"memory(GiB)": 160.86,
"step": 1955,
"token_acc": 0.9998315789473684,
"train_speed(iter/s)": 0.035709
},
{
"epoch": 0.7183434121312077,
"grad_norm": 0.0002706103550735861,
"learning_rate": 8.650304252076704e-06,
"loss": 0.0003762753214687109,
"memory(GiB)": 160.86,
"step": 1960,
"token_acc": 0.9998316356595673,
"train_speed(iter/s)": 0.035729
},
{
"epoch": 0.720175920835624,
"grad_norm": 0.00926526915282011,
"learning_rate": 8.643739283629484e-06,
"loss": 0.00021247351542115213,
"memory(GiB)": 160.86,
"step": 1965,
"token_acc": 0.9999158603281447,
"train_speed(iter/s)": 0.035749
},
{
"epoch": 0.7220084295400403,
"grad_norm": 0.11203871667385101,
"learning_rate": 8.63716089199422e-06,
"loss": 0.0012671677395701408,
"memory(GiB)": 160.86,
"step": 1970,
"token_acc": 0.9995792662403231,
"train_speed(iter/s)": 0.03577
},
{
"epoch": 0.7238409382444566,
"grad_norm": 0.027508899569511414,
"learning_rate": 8.630569101405084e-06,
"loss": 0.0016218043863773346,
"memory(GiB)": 160.86,
"step": 1975,
"token_acc": 0.99949499200404,
"train_speed(iter/s)": 0.03579
},
{
"epoch": 0.725673446948873,
"grad_norm": 0.03338692709803581,
"learning_rate": 8.6239639361456e-06,
"loss": 0.0007595627568662167,
"memory(GiB)": 160.86,
"step": 1980,
"token_acc": 0.9997473045822103,
"train_speed(iter/s)": 0.03581
},
{
"epoch": 0.7275059556532893,
"grad_norm": 0.01979021355509758,
"learning_rate": 8.617345420548568e-06,
"loss": 0.00039132642559707164,
"memory(GiB)": 160.86,
"step": 1985,
"token_acc": 0.9998317631224765,
"train_speed(iter/s)": 0.035829
},
{
"epoch": 0.7293384643577057,
"grad_norm": 0.0021872930228710175,
"learning_rate": 8.610713578995969e-06,
"loss": 0.0002923472551628947,
"memory(GiB)": 160.86,
"step": 1990,
"token_acc": 0.9999158603281447,
"train_speed(iter/s)": 0.035848
},
{
"epoch": 0.7311709730621221,
"grad_norm": 0.007450213190168142,
"learning_rate": 8.604068435918876e-06,
"loss": 0.0004648041445761919,
"memory(GiB)": 160.86,
"step": 1995,
"token_acc": 0.9998316356595673,
"train_speed(iter/s)": 0.035868
},
{
"epoch": 0.7330034817665384,
"grad_norm": 0.018950950354337692,
"learning_rate": 8.597410015797358e-06,
"loss": 0.0011166405864059925,
"memory(GiB)": 160.86,
"step": 2000,
"token_acc": 0.9996636394214599,
"train_speed(iter/s)": 0.035879
},
{
"epoch": 0.7330034817665384,
"eval_loss": 0.0007337583811022341,
"eval_runtime": 199.2224,
"eval_samples_per_second": 2.209,
"eval_steps_per_second": 2.209,
"eval_token_acc": 0.9997704421284606,
"step": 2000
},
{
"epoch": 0.7348359904709547,
"grad_norm": 0.0039305477403104305,
"learning_rate": 8.590738343160402e-06,
"loss": 0.00037078014574944975,
"memory(GiB)": 160.86,
"step": 2005,
"token_acc": 0.9997927917427509,
"train_speed(iter/s)": 0.035487
},
{
"epoch": 0.736668499175371,
"grad_norm": 0.013306787237524986,
"learning_rate": 8.584053442585816e-06,
"loss": 0.0020991813391447066,
"memory(GiB)": 160.86,
"step": 2010,
"token_acc": 0.9996633846671716,
"train_speed(iter/s)": 0.035507
},
{
"epoch": 0.7385010078797875,
"grad_norm": 0.006368038710206747,
"learning_rate": 8.577355338700133e-06,
"loss": 0.000787766557186842,
"memory(GiB)": 160.86,
"step": 2015,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.035525
},
{
"epoch": 0.7403335165842038,
"grad_norm": 0.010385467670857906,
"learning_rate": 8.570644056178533e-06,
"loss": 0.0008328554220497608,
"memory(GiB)": 160.86,
"step": 2020,
"token_acc": 0.9997476871320438,
"train_speed(iter/s)": 0.035538
},
{
"epoch": 0.7421660252886201,
"grad_norm": 0.01632188819348812,
"learning_rate": 8.563919619744735e-06,
"loss": 0.0005637739785015583,
"memory(GiB)": 160.86,
"step": 2025,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.035559
},
{
"epoch": 0.7439985339930365,
"grad_norm": 0.011626377701759338,
"learning_rate": 8.557182054170926e-06,
"loss": 0.0005918642971664667,
"memory(GiB)": 160.86,
"step": 2030,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.035578
},
{
"epoch": 0.7458310426974528,
"grad_norm": 0.0031517872121185064,
"learning_rate": 8.550431384277654e-06,
"loss": 0.00141130480915308,
"memory(GiB)": 160.86,
"step": 2035,
"token_acc": 0.9995790891489182,
"train_speed(iter/s)": 0.035597
},
{
"epoch": 0.7476635514018691,
"grad_norm": 0.05396876111626625,
"learning_rate": 8.543667634933743e-06,
"loss": 0.0004124412313103676,
"memory(GiB)": 160.86,
"step": 2040,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.035616
},
{
"epoch": 0.7494960601062856,
"grad_norm": 0.0036719287745654583,
"learning_rate": 8.536890831056199e-06,
"loss": 0.0014296333305537702,
"memory(GiB)": 160.86,
"step": 2045,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.035636
},
{
"epoch": 0.7513285688107019,
"grad_norm": 0.01854000613093376,
"learning_rate": 8.530100997610125e-06,
"loss": 0.00037872311659157274,
"memory(GiB)": 160.86,
"step": 2050,
"token_acc": 0.9999158886365548,
"train_speed(iter/s)": 0.035656
},
{
"epoch": 0.7531610775151182,
"grad_norm": 0.022685358300805092,
"learning_rate": 8.523298159608615e-06,
"loss": 0.0005078110843896866,
"memory(GiB)": 160.86,
"step": 2055,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.035675
},
{
"epoch": 0.7549935862195345,
"grad_norm": 0.0069847991690039635,
"learning_rate": 8.51648234211268e-06,
"loss": 0.0006114406045526266,
"memory(GiB)": 160.86,
"step": 2060,
"token_acc": 0.9999158036541214,
"train_speed(iter/s)": 0.035694
},
{
"epoch": 0.7568260949239509,
"grad_norm": 0.005377015098929405,
"learning_rate": 8.509653570231139e-06,
"loss": 0.000488346815109253,
"memory(GiB)": 160.86,
"step": 2065,
"token_acc": 0.9998316356595673,
"train_speed(iter/s)": 0.035714
},
{
"epoch": 0.7586586036283672,
"grad_norm": 0.13766171038150787,
"learning_rate": 8.502811869120537e-06,
"loss": 0.0007873100228607654,
"memory(GiB)": 160.86,
"step": 2070,
"token_acc": 0.9997473471450228,
"train_speed(iter/s)": 0.035733
},
{
"epoch": 0.7604911123327835,
"grad_norm": 0.08824609220027924,
"learning_rate": 8.495957263985049e-06,
"loss": 0.0008373255841434002,
"memory(GiB)": 160.86,
"step": 2075,
"token_acc": 0.9995790537127462,
"train_speed(iter/s)": 0.035751
},
{
"epoch": 0.7623236210372,
"grad_norm": 0.006550587713718414,
"learning_rate": 8.489089780076387e-06,
"loss": 0.00012923479080200194,
"memory(GiB)": 160.86,
"step": 2080,
"token_acc": 1.0,
"train_speed(iter/s)": 0.03577
},
{
"epoch": 0.7641561297416163,
"grad_norm": 0.06086429953575134,
"learning_rate": 8.482209442693706e-06,
"loss": 0.002163195610046387,
"memory(GiB)": 160.86,
"step": 2085,
"token_acc": 0.9990743078347218,
"train_speed(iter/s)": 0.03579
},
{
"epoch": 0.7659886384460326,
"grad_norm": 0.045746754854917526,
"learning_rate": 8.47531627718351e-06,
"loss": 0.00045907222665846347,
"memory(GiB)": 160.86,
"step": 2090,
"token_acc": 0.9998315080033698,
"train_speed(iter/s)": 0.035808
},
{
"epoch": 0.7678211471504489,
"grad_norm": 0.01716403290629387,
"learning_rate": 8.46841030893957e-06,
"loss": 0.0005397152155637742,
"memory(GiB)": 160.86,
"step": 2095,
"token_acc": 0.9997475597441938,
"train_speed(iter/s)": 0.035827
},
{
"epoch": 0.7696536558548653,
"grad_norm": 0.0022040277253836393,
"learning_rate": 8.461491563402807e-06,
"loss": 0.0012433138675987721,
"memory(GiB)": 160.86,
"step": 2100,
"token_acc": 0.9997475385003787,
"train_speed(iter/s)": 0.035846
},
{
"epoch": 0.7714861645592817,
"grad_norm": 0.028352022171020508,
"learning_rate": 8.454560066061225e-06,
"loss": 0.0011054543778300286,
"memory(GiB)": 160.86,
"step": 2105,
"token_acc": 0.9995790891489182,
"train_speed(iter/s)": 0.035865
},
{
"epoch": 0.773318673263698,
"grad_norm": 0.017512010410428047,
"learning_rate": 8.447615842449799e-06,
"loss": 0.00045901937410235404,
"memory(GiB)": 160.86,
"step": 2110,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.035883
},
{
"epoch": 0.7751511819681144,
"grad_norm": 0.014501676894724369,
"learning_rate": 8.440658918150383e-06,
"loss": 0.0004790318664163351,
"memory(GiB)": 160.86,
"step": 2115,
"token_acc": 0.9997476446837147,
"train_speed(iter/s)": 0.035901
},
{
"epoch": 0.7769836906725307,
"grad_norm": 0.06630018353462219,
"learning_rate": 8.433689318791628e-06,
"loss": 0.0008208448067307472,
"memory(GiB)": 160.86,
"step": 2120,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.03592
},
{
"epoch": 0.778816199376947,
"grad_norm": 0.029544832184910774,
"learning_rate": 8.426707070048867e-06,
"loss": 0.00034202171955257656,
"memory(GiB)": 160.86,
"step": 2125,
"token_acc": 0.9999158036541214,
"train_speed(iter/s)": 0.035938
},
{
"epoch": 0.7806487080813634,
"grad_norm": 0.020295366644859314,
"learning_rate": 8.419712197644042e-06,
"loss": 0.00047438177280128,
"memory(GiB)": 160.86,
"step": 2130,
"token_acc": 0.9998316356595673,
"train_speed(iter/s)": 0.035956
},
{
"epoch": 0.7824812167857798,
"grad_norm": 0.021269747987389565,
"learning_rate": 8.412704727345597e-06,
"loss": 0.0006256222724914551,
"memory(GiB)": 160.86,
"step": 2135,
"token_acc": 0.9999158390843292,
"train_speed(iter/s)": 0.035974
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.035125475376844406,
"learning_rate": 8.405684684968383e-06,
"loss": 0.0005730021744966507,
"memory(GiB)": 160.86,
"step": 2140,
"token_acc": 0.9998315647633484,
"train_speed(iter/s)": 0.035992
},
{
"epoch": 0.7861462341946124,
"grad_norm": 0.06994622200727463,
"learning_rate": 8.398652096373566e-06,
"loss": 0.0003744778921827674,
"memory(GiB)": 160.86,
"step": 2145,
"token_acc": 0.9999157894736842,
"train_speed(iter/s)": 0.03601
},
{
"epoch": 0.7879787428990288,
"grad_norm": 0.006813399959355593,
"learning_rate": 8.39160698746853e-06,
"loss": 0.0007882724516093731,
"memory(GiB)": 160.86,
"step": 2150,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.036027
},
{
"epoch": 0.7898112516034451,
"grad_norm": 0.20248223841190338,
"learning_rate": 8.38454938420679e-06,
"loss": 0.00029504401609301565,
"memory(GiB)": 160.86,
"step": 2155,
"token_acc": 0.9999157823816742,
"train_speed(iter/s)": 0.036045
},
{
"epoch": 0.7916437603078614,
"grad_norm": 0.10259495675563812,
"learning_rate": 8.37747931258788e-06,
"loss": 0.0013766267336905002,
"memory(GiB)": 160.86,
"step": 2160,
"token_acc": 0.9995792662403231,
"train_speed(iter/s)": 0.036063
},
{
"epoch": 0.7934762690122779,
"grad_norm": 0.022682547569274902,
"learning_rate": 8.370396798657269e-06,
"loss": 0.0003458364633843303,
"memory(GiB)": 160.86,
"step": 2165,
"token_acc": 0.9999158532480646,
"train_speed(iter/s)": 0.036081
},
{
"epoch": 0.7953087777166942,
"grad_norm": 0.05654159560799599,
"learning_rate": 8.363301868506264e-06,
"loss": 0.0008417519740760327,
"memory(GiB)": 160.86,
"step": 2170,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.036099
},
{
"epoch": 0.7971412864211105,
"grad_norm": 0.010379817336797714,
"learning_rate": 8.35619454827191e-06,
"loss": 0.00014047393342480062,
"memory(GiB)": 160.86,
"step": 2175,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036117
},
{
"epoch": 0.7989737951255268,
"grad_norm": 0.002908756723627448,
"learning_rate": 8.349074864136897e-06,
"loss": 0.0010122337378561496,
"memory(GiB)": 160.86,
"step": 2180,
"token_acc": 0.9995790891489182,
"train_speed(iter/s)": 0.036134
},
{
"epoch": 0.8008063038299432,
"grad_norm": 0.015968699008226395,
"learning_rate": 8.341942842329465e-06,
"loss": 0.0010151905938982964,
"memory(GiB)": 160.86,
"step": 2185,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.036152
},
{
"epoch": 0.8026388125343595,
"grad_norm": 0.02950908988714218,
"learning_rate": 8.3347985091233e-06,
"loss": 0.0006167484447360039,
"memory(GiB)": 160.86,
"step": 2190,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.036169
},
{
"epoch": 0.8044713212387758,
"grad_norm": 0.004527771379798651,
"learning_rate": 8.327641890837443e-06,
"loss": 0.0001240343088284135,
"memory(GiB)": 160.86,
"step": 2195,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036187
},
{
"epoch": 0.8063038299431923,
"grad_norm": 0.09493066370487213,
"learning_rate": 8.320473013836197e-06,
"loss": 0.0003447512863203883,
"memory(GiB)": 160.86,
"step": 2200,
"token_acc": 0.9999158603281447,
"train_speed(iter/s)": 0.036205
},
{
"epoch": 0.8081363386476086,
"grad_norm": 0.016084903851151466,
"learning_rate": 8.313291904529018e-06,
"loss": 0.0009649941697716713,
"memory(GiB)": 160.86,
"step": 2205,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.036222
},
{
"epoch": 0.8099688473520249,
"grad_norm": 0.05419844388961792,
"learning_rate": 8.306098589370427e-06,
"loss": 0.0005068023223429918,
"memory(GiB)": 160.86,
"step": 2210,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.036239
},
{
"epoch": 0.8118013560564412,
"grad_norm": 0.12476948648691177,
"learning_rate": 8.298893094859916e-06,
"loss": 0.0009864597581326962,
"memory(GiB)": 160.86,
"step": 2215,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.036257
},
{
"epoch": 0.8136338647608576,
"grad_norm": 0.06563253700733185,
"learning_rate": 8.291675447541834e-06,
"loss": 0.000346578611060977,
"memory(GiB)": 160.86,
"step": 2220,
"token_acc": 0.999831734814067,
"train_speed(iter/s)": 0.036274
},
{
"epoch": 0.815466373465274,
"grad_norm": 0.0007064275559969246,
"learning_rate": 8.28444567400531e-06,
"loss": 0.0002860090462490916,
"memory(GiB)": 160.86,
"step": 2225,
"token_acc": 0.9998316073082428,
"train_speed(iter/s)": 0.03629
},
{
"epoch": 0.8172988821696903,
"grad_norm": 0.06441126018762589,
"learning_rate": 8.277203800884137e-06,
"loss": 0.0004928476177155971,
"memory(GiB)": 160.86,
"step": 2230,
"token_acc": 0.9999158036541214,
"train_speed(iter/s)": 0.036307
},
{
"epoch": 0.8191313908741067,
"grad_norm": 0.07549826800823212,
"learning_rate": 8.269949854856687e-06,
"loss": 0.0014977409504354,
"memory(GiB)": 160.86,
"step": 2235,
"token_acc": 0.9997476022211005,
"train_speed(iter/s)": 0.036324
},
{
"epoch": 0.820963899578523,
"grad_norm": 0.02339329943060875,
"learning_rate": 8.262683862645804e-06,
"loss": 0.00037619960494339466,
"memory(GiB)": 160.86,
"step": 2240,
"token_acc": 0.9998315221969506,
"train_speed(iter/s)": 0.036341
},
{
"epoch": 0.8227964082829393,
"grad_norm": 0.013340925797820091,
"learning_rate": 8.255405851018713e-06,
"loss": 0.0004039745777845383,
"memory(GiB)": 160.86,
"step": 2245,
"token_acc": 0.9999158249158249,
"train_speed(iter/s)": 0.036358
},
{
"epoch": 0.8246289169873557,
"grad_norm": 0.1738908737897873,
"learning_rate": 8.24811584678691e-06,
"loss": 0.0009243869222700596,
"memory(GiB)": 160.86,
"step": 2250,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.036375
},
{
"epoch": 0.8264614256917721,
"grad_norm": 0.1292845755815506,
"learning_rate": 8.24081387680608e-06,
"loss": 0.0004229114390909672,
"memory(GiB)": 160.86,
"step": 2255,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.036392
},
{
"epoch": 0.8282939343961884,
"grad_norm": 0.03298277407884598,
"learning_rate": 8.233499967975981e-06,
"loss": 0.0003614515298977494,
"memory(GiB)": 160.86,
"step": 2260,
"token_acc": 0.9999158674070335,
"train_speed(iter/s)": 0.036406
},
{
"epoch": 0.8301264431006047,
"grad_norm": 0.0037736741360276937,
"learning_rate": 8.226174147240359e-06,
"loss": 0.0006478279829025269,
"memory(GiB)": 160.86,
"step": 2265,
"token_acc": 0.9998315363881402,
"train_speed(iter/s)": 0.036422
},
{
"epoch": 0.8319589518050211,
"grad_norm": 0.010557832196354866,
"learning_rate": 8.218836441586834e-06,
"loss": 0.0005696366541087627,
"memory(GiB)": 160.86,
"step": 2270,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.036439
},
{
"epoch": 0.8337914605094374,
"grad_norm": 0.003406501142308116,
"learning_rate": 8.211486878046819e-06,
"loss": 0.0006424786522984504,
"memory(GiB)": 160.86,
"step": 2275,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.036454
},
{
"epoch": 0.8356239692138537,
"grad_norm": 0.0992351546883583,
"learning_rate": 8.204125483695403e-06,
"loss": 0.0005788296461105346,
"memory(GiB)": 160.86,
"step": 2280,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.036471
},
{
"epoch": 0.8374564779182702,
"grad_norm": 0.010372207500040531,
"learning_rate": 8.196752285651261e-06,
"loss": 0.00029938730876892804,
"memory(GiB)": 160.86,
"step": 2285,
"token_acc": 0.9999157469036987,
"train_speed(iter/s)": 0.036487
},
{
"epoch": 0.8392889866226865,
"grad_norm": 0.0683954581618309,
"learning_rate": 8.189367311076551e-06,
"loss": 0.0007511110045015812,
"memory(GiB)": 160.86,
"step": 2290,
"token_acc": 0.9998317489694625,
"train_speed(iter/s)": 0.036504
},
{
"epoch": 0.8411214953271028,
"grad_norm": 0.006293443962931633,
"learning_rate": 8.181970587176814e-06,
"loss": 0.0003692630911245942,
"memory(GiB)": 160.86,
"step": 2295,
"token_acc": 0.9997475809844342,
"train_speed(iter/s)": 0.03652
},
{
"epoch": 0.8429540040315191,
"grad_norm": 0.006763943005353212,
"learning_rate": 8.174562141200878e-06,
"loss": 0.0002094252035021782,
"memory(GiB)": 160.86,
"step": 2300,
"token_acc": 0.9999158036541214,
"train_speed(iter/s)": 0.036535
},
{
"epoch": 0.8447865127359355,
"grad_norm": 0.04695817828178406,
"learning_rate": 8.167142000440749e-06,
"loss": 0.0005172740202397108,
"memory(GiB)": 160.86,
"step": 2305,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.036551
},
{
"epoch": 0.8466190214403518,
"grad_norm": 0.026909319683909416,
"learning_rate": 8.15971019223152e-06,
"loss": 0.00024677792098373177,
"memory(GiB)": 160.86,
"step": 2310,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036567
},
{
"epoch": 0.8484515301447682,
"grad_norm": 0.0009972673142328858,
"learning_rate": 8.152266743951264e-06,
"loss": 0.00048431595787405967,
"memory(GiB)": 160.86,
"step": 2315,
"token_acc": 0.9999157752884696,
"train_speed(iter/s)": 0.036583
},
{
"epoch": 0.8502840388491846,
"grad_norm": 0.1550913155078888,
"learning_rate": 8.144811683020932e-06,
"loss": 0.00014740382321178913,
"memory(GiB)": 160.86,
"step": 2320,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.036599
},
{
"epoch": 0.8521165475536009,
"grad_norm": 0.04358501732349396,
"learning_rate": 8.13734503690426e-06,
"loss": 0.0010699840262532235,
"memory(GiB)": 160.86,
"step": 2325,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.036612
},
{
"epoch": 0.8539490562580172,
"grad_norm": 0.002750721760094166,
"learning_rate": 8.12986683310766e-06,
"loss": 0.0002569463336840272,
"memory(GiB)": 160.86,
"step": 2330,
"token_acc": 0.9999158390843292,
"train_speed(iter/s)": 0.036627
},
{
"epoch": 0.8557815649624335,
"grad_norm": 0.010151500813663006,
"learning_rate": 8.12237709918012e-06,
"loss": 0.00014050663448870183,
"memory(GiB)": 160.86,
"step": 2335,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036644
},
{
"epoch": 0.8576140736668499,
"grad_norm": 0.004389213863760233,
"learning_rate": 8.114875862713107e-06,
"loss": 5.258661694824695e-05,
"memory(GiB)": 160.86,
"step": 2340,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036659
},
{
"epoch": 0.8594465823712663,
"grad_norm": 0.004478363320231438,
"learning_rate": 8.10736315134046e-06,
"loss": 0.0017528504133224488,
"memory(GiB)": 160.86,
"step": 2345,
"token_acc": 0.9996633846671716,
"train_speed(iter/s)": 0.036675
},
{
"epoch": 0.8612790910756826,
"grad_norm": 0.004733589943498373,
"learning_rate": 8.099838992738292e-06,
"loss": 0.0013998121954500674,
"memory(GiB)": 160.86,
"step": 2350,
"token_acc": 0.9994953318193288,
"train_speed(iter/s)": 0.03669
},
{
"epoch": 0.863111599780099,
"grad_norm": 0.00977323018014431,
"learning_rate": 8.092303414624884e-06,
"loss": 0.00046326019801199434,
"memory(GiB)": 160.86,
"step": 2355,
"token_acc": 0.9998316781686585,
"train_speed(iter/s)": 0.036705
},
{
"epoch": 0.8649441084845153,
"grad_norm": 0.04947784170508385,
"learning_rate": 8.08475644476059e-06,
"loss": 0.0001862859120592475,
"memory(GiB)": 160.86,
"step": 2360,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036721
},
{
"epoch": 0.8667766171889316,
"grad_norm": 0.21693383157253265,
"learning_rate": 8.077198110947725e-06,
"loss": 0.0009612908586859703,
"memory(GiB)": 160.86,
"step": 2365,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.036737
},
{
"epoch": 0.868609125893348,
"grad_norm": 0.023295719176530838,
"learning_rate": 8.069628441030472e-06,
"loss": 0.0004069589078426361,
"memory(GiB)": 160.86,
"step": 2370,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.036697
},
{
"epoch": 0.8704416345977644,
"grad_norm": 0.06745916604995728,
"learning_rate": 8.062047462894771e-06,
"loss": 0.0006006782408803701,
"memory(GiB)": 160.86,
"step": 2375,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.036712
},
{
"epoch": 0.8722741433021807,
"grad_norm": 0.05341252312064171,
"learning_rate": 8.054455204468225e-06,
"loss": 0.000835646316409111,
"memory(GiB)": 160.86,
"step": 2380,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.036728
},
{
"epoch": 0.874106652006597,
"grad_norm": 0.01815791241824627,
"learning_rate": 8.046851693719986e-06,
"loss": 0.00021557288710027933,
"memory(GiB)": 160.86,
"step": 2385,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036743
},
{
"epoch": 0.8759391607110134,
"grad_norm": 0.0018982563633471727,
"learning_rate": 8.039236958660666e-06,
"loss": 0.00010541609954088927,
"memory(GiB)": 160.86,
"step": 2390,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036759
},
{
"epoch": 0.8777716694154297,
"grad_norm": 0.0008025880670174956,
"learning_rate": 8.031611027342221e-06,
"loss": 0.00029539645183831455,
"memory(GiB)": 160.86,
"step": 2395,
"token_acc": 0.9998317631224765,
"train_speed(iter/s)": 0.036774
},
{
"epoch": 0.879604178119846,
"grad_norm": 0.02493736520409584,
"learning_rate": 8.023973927857857e-06,
"loss": 0.0010729983448982238,
"memory(GiB)": 160.86,
"step": 2400,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.036789
},
{
"epoch": 0.8814366868242625,
"grad_norm": 0.23594622313976288,
"learning_rate": 8.016325688341919e-06,
"loss": 0.0005186852067708969,
"memory(GiB)": 160.86,
"step": 2405,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.036805
},
{
"epoch": 0.8832691955286788,
"grad_norm": 0.014162681996822357,
"learning_rate": 8.00866633696979e-06,
"loss": 0.00019059464102610946,
"memory(GiB)": 160.86,
"step": 2410,
"token_acc": 0.9999158390843292,
"train_speed(iter/s)": 0.03682
},
{
"epoch": 0.8851017042330951,
"grad_norm": 0.04650455340743065,
"learning_rate": 8.000995901957792e-06,
"loss": 0.0004015204031020403,
"memory(GiB)": 160.86,
"step": 2415,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.036835
},
{
"epoch": 0.8869342129375114,
"grad_norm": 0.04503090679645538,
"learning_rate": 7.993314411563075e-06,
"loss": 0.0006881221663206816,
"memory(GiB)": 160.86,
"step": 2420,
"token_acc": 0.9997475172529877,
"train_speed(iter/s)": 0.03685
},
{
"epoch": 0.8887667216419278,
"grad_norm": 0.008592194877564907,
"learning_rate": 7.98562189408352e-06,
"loss": 0.0002544657327234745,
"memory(GiB)": 160.86,
"step": 2425,
"token_acc": 0.9999158107425492,
"train_speed(iter/s)": 0.036865
},
{
"epoch": 0.8905992303463441,
"grad_norm": 0.04029720276594162,
"learning_rate": 7.977918377857625e-06,
"loss": 0.0004797634668648243,
"memory(GiB)": 160.86,
"step": 2430,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.03688
},
{
"epoch": 0.8924317390507605,
"grad_norm": 0.012428953312337399,
"learning_rate": 7.970203891264408e-06,
"loss": 0.00046463338658213614,
"memory(GiB)": 160.86,
"step": 2435,
"token_acc": 0.9998317631224765,
"train_speed(iter/s)": 0.036895
},
{
"epoch": 0.8942642477551769,
"grad_norm": 0.1128624677658081,
"learning_rate": 7.962478462723306e-06,
"loss": 0.000577373243868351,
"memory(GiB)": 160.86,
"step": 2440,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.03691
},
{
"epoch": 0.8960967564595932,
"grad_norm": 0.005943561438471079,
"learning_rate": 7.954742120694059e-06,
"loss": 0.0005296251736581325,
"memory(GiB)": 160.86,
"step": 2445,
"token_acc": 0.9998317489694625,
"train_speed(iter/s)": 0.036925
},
{
"epoch": 0.8979292651640095,
"grad_norm": 0.014219972304999828,
"learning_rate": 7.946994893676611e-06,
"loss": 5.174783291295171e-05,
"memory(GiB)": 160.86,
"step": 2450,
"token_acc": 1.0,
"train_speed(iter/s)": 0.03694
},
{
"epoch": 0.8997617738684259,
"grad_norm": 0.01472583319991827,
"learning_rate": 7.93923681021101e-06,
"loss": 0.0009220579639077186,
"memory(GiB)": 160.86,
"step": 2455,
"token_acc": 0.9996634413125789,
"train_speed(iter/s)": 0.036954
},
{
"epoch": 0.9015942825728422,
"grad_norm": 0.0020888156723231077,
"learning_rate": 7.931467898877298e-06,
"loss": 0.0004309060052037239,
"memory(GiB)": 160.86,
"step": 2460,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.036969
},
{
"epoch": 0.9034267912772586,
"grad_norm": 0.054128147661685944,
"learning_rate": 7.9236881882954e-06,
"loss": 0.00036832981277257204,
"memory(GiB)": 160.86,
"step": 2465,
"token_acc": 0.9999157752884696,
"train_speed(iter/s)": 0.036983
},
{
"epoch": 0.9052592999816749,
"grad_norm": 0.009187346324324608,
"learning_rate": 7.915897707125027e-06,
"loss": 0.0009874864481389523,
"memory(GiB)": 160.86,
"step": 2470,
"token_acc": 0.9996633280026934,
"train_speed(iter/s)": 0.036998
},
{
"epoch": 0.9070918086860913,
"grad_norm": 0.015212767757475376,
"learning_rate": 7.908096484065569e-06,
"loss": 0.00035822123754769564,
"memory(GiB)": 160.86,
"step": 2475,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.037012
},
{
"epoch": 0.9089243173905076,
"grad_norm": 0.028434082865715027,
"learning_rate": 7.900284547855992e-06,
"loss": 0.00033626847434788945,
"memory(GiB)": 160.86,
"step": 2480,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.037027
},
{
"epoch": 0.9107568260949239,
"grad_norm": 0.003858706448227167,
"learning_rate": 7.892461927274719e-06,
"loss": 0.00038427968975156545,
"memory(GiB)": 160.86,
"step": 2485,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.037041
},
{
"epoch": 0.9125893347993403,
"grad_norm": 0.028237823396921158,
"learning_rate": 7.884628651139543e-06,
"loss": 0.0008647294715046882,
"memory(GiB)": 160.86,
"step": 2490,
"token_acc": 0.9995789119083712,
"train_speed(iter/s)": 0.037056
},
{
"epoch": 0.9144218435037567,
"grad_norm": 0.014561748132109642,
"learning_rate": 7.876784748307502e-06,
"loss": 8.994525414891541e-05,
"memory(GiB)": 160.86,
"step": 2495,
"token_acc": 1.0,
"train_speed(iter/s)": 0.03707
},
{
"epoch": 0.916254352208173,
"grad_norm": 0.011074830777943134,
"learning_rate": 7.868930247674787e-06,
"loss": 0.0002087874570861459,
"memory(GiB)": 160.86,
"step": 2500,
"token_acc": 0.9999158107425492,
"train_speed(iter/s)": 0.037084
},
{
"epoch": 0.916254352208173,
"eval_loss": 0.0007594987982884049,
"eval_runtime": 172.1874,
"eval_samples_per_second": 2.555,
"eval_steps_per_second": 2.555,
"eval_token_acc": 0.9997704421284606,
"step": 2500
},
{
"epoch": 0.9180868609125893,
"grad_norm": 0.04182349890470505,
"learning_rate": 7.86106517817663e-06,
"loss": 0.00022406417410820724,
"memory(GiB)": 160.86,
"step": 2505,
"token_acc": 0.9997928078422231,
"train_speed(iter/s)": 0.036773
},
{
"epoch": 0.9199193696170057,
"grad_norm": 0.010813858360052109,
"learning_rate": 7.8531895687872e-06,
"loss": 0.0001518705626949668,
"memory(GiB)": 160.86,
"step": 2510,
"token_acc": 0.9999158603281447,
"train_speed(iter/s)": 0.036788
},
{
"epoch": 0.921751878321422,
"grad_norm": 4.607898881658912e-05,
"learning_rate": 7.845303448519486e-06,
"loss": 0.0005594564136117697,
"memory(GiB)": 160.86,
"step": 2515,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.036802
},
{
"epoch": 0.9235843870258383,
"grad_norm": 0.059696584939956665,
"learning_rate": 7.837406846425205e-06,
"loss": 0.0005560083314776421,
"memory(GiB)": 160.86,
"step": 2520,
"token_acc": 0.9994947368421052,
"train_speed(iter/s)": 0.036817
},
{
"epoch": 0.9254168957302548,
"grad_norm": 0.1952117681503296,
"learning_rate": 7.829499791594684e-06,
"loss": 0.0007309889886528253,
"memory(GiB)": 160.86,
"step": 2525,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.036831
},
{
"epoch": 0.9272494044346711,
"grad_norm": 0.005678711924701929,
"learning_rate": 7.821582313156763e-06,
"loss": 0.00012894930550828577,
"memory(GiB)": 160.86,
"step": 2530,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036845
},
{
"epoch": 0.9290819131390874,
"grad_norm": 0.0016558946808800101,
"learning_rate": 7.813654440278677e-06,
"loss": 0.0004136775154620409,
"memory(GiB)": 160.86,
"step": 2535,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.036859
},
{
"epoch": 0.9309144218435037,
"grad_norm": 0.0007809648523107171,
"learning_rate": 7.805716202165949e-06,
"loss": 4.669466288760304e-05,
"memory(GiB)": 160.86,
"step": 2540,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036873
},
{
"epoch": 0.9327469305479201,
"grad_norm": 0.0005511490162461996,
"learning_rate": 7.797767628062296e-06,
"loss": 2.539183187764138e-05,
"memory(GiB)": 160.86,
"step": 2545,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036887
},
{
"epoch": 0.9345794392523364,
"grad_norm": 0.008907792158424854,
"learning_rate": 7.789808747249505e-06,
"loss": 8.047035662457347e-05,
"memory(GiB)": 160.86,
"step": 2550,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036901
},
{
"epoch": 0.9364119479567528,
"grad_norm": 0.16766001284122467,
"learning_rate": 7.781839589047336e-06,
"loss": 0.001341984234750271,
"memory(GiB)": 160.86,
"step": 2555,
"token_acc": 0.9997474960020201,
"train_speed(iter/s)": 0.036915
},
{
"epoch": 0.9382444566611692,
"grad_norm": 0.0007593165501020849,
"learning_rate": 7.773860182813404e-06,
"loss": 6.514263805001974e-05,
"memory(GiB)": 160.86,
"step": 2560,
"token_acc": 1.0,
"train_speed(iter/s)": 0.036929
},
{
"epoch": 0.9400769653655855,
"grad_norm": 0.02255651168525219,
"learning_rate": 7.765870557943083e-06,
"loss": 0.0009576915763318539,
"memory(GiB)": 160.86,
"step": 2565,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.036943
},
{
"epoch": 0.9419094740700018,
"grad_norm": 0.04713983088731766,
"learning_rate": 7.75787074386939e-06,
"loss": 0.0006936299148947,
"memory(GiB)": 160.86,
"step": 2570,
"token_acc": 0.9997474322276477,
"train_speed(iter/s)": 0.036957
},
{
"epoch": 0.9437419827744182,
"grad_norm": 0.038788143545389175,
"learning_rate": 7.749860770062874e-06,
"loss": 0.0007801173254847526,
"memory(GiB)": 160.86,
"step": 2575,
"token_acc": 0.9998316073082428,
"train_speed(iter/s)": 0.036971
},
{
"epoch": 0.9455744914788345,
"grad_norm": 0.026828216388821602,
"learning_rate": 7.741840666031517e-06,
"loss": 0.0009264941327273846,
"memory(GiB)": 160.86,
"step": 2580,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.036984
},
{
"epoch": 0.9474070001832509,
"grad_norm": 0.03660447522997856,
"learning_rate": 7.733810461320619e-06,
"loss": 0.0004160061478614807,
"memory(GiB)": 160.86,
"step": 2585,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.036998
},
{
"epoch": 0.9492395088876672,
"grad_norm": 0.004005759488791227,
"learning_rate": 7.725770185512685e-06,
"loss": 0.00036098186392337085,
"memory(GiB)": 160.86,
"step": 2590,
"token_acc": 0.9999157752884696,
"train_speed(iter/s)": 0.037012
},
{
"epoch": 0.9510720175920836,
"grad_norm": 0.0006123992498032749,
"learning_rate": 7.717719868227327e-06,
"loss": 0.0003307197941467166,
"memory(GiB)": 160.86,
"step": 2595,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037025
},
{
"epoch": 0.9529045262964999,
"grad_norm": 0.029207419604063034,
"learning_rate": 7.709659539121144e-06,
"loss": 7.62599753215909e-05,
"memory(GiB)": 160.86,
"step": 2600,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037039
},
{
"epoch": 0.9547370350009162,
"grad_norm": 0.03443612530827522,
"learning_rate": 7.70158922788762e-06,
"loss": 0.00035016366746276617,
"memory(GiB)": 160.86,
"step": 2605,
"token_acc": 0.9999158461667929,
"train_speed(iter/s)": 0.037052
},
{
"epoch": 0.9565695437053326,
"grad_norm": 0.020582979544997215,
"learning_rate": 7.693508964257015e-06,
"loss": 0.0006867663934826851,
"memory(GiB)": 160.86,
"step": 2610,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037066
},
{
"epoch": 0.958402052409749,
"grad_norm": 0.010320069268345833,
"learning_rate": 7.685418777996245e-06,
"loss": 0.0002992436056956649,
"memory(GiB)": 160.86,
"step": 2615,
"token_acc": 0.9998317064961293,
"train_speed(iter/s)": 0.037079
},
{
"epoch": 0.9602345611141653,
"grad_norm": 0.06350167840719223,
"learning_rate": 7.677318698908788e-06,
"loss": 0.0014985553920269013,
"memory(GiB)": 160.86,
"step": 2620,
"token_acc": 0.9995792662403231,
"train_speed(iter/s)": 0.037092
},
{
"epoch": 0.9620670698185816,
"grad_norm": 0.0018099630251526833,
"learning_rate": 7.669208756834563e-06,
"loss": 0.0006455457769334316,
"memory(GiB)": 160.86,
"step": 2625,
"token_acc": 0.9997474109623642,
"train_speed(iter/s)": 0.037106
},
{
"epoch": 0.963899578522998,
"grad_norm": 0.02232094667851925,
"learning_rate": 7.66108898164982e-06,
"loss": 0.0005441450979560613,
"memory(GiB)": 160.86,
"step": 2630,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.037119
},
{
"epoch": 0.9657320872274143,
"grad_norm": 0.08803337812423706,
"learning_rate": 7.65295940326704e-06,
"loss": 0.00035574983339756725,
"memory(GiB)": 160.86,
"step": 2635,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.037132
},
{
"epoch": 0.9675645959318306,
"grad_norm": 0.003819872625172138,
"learning_rate": 7.644820051634813e-06,
"loss": 0.0005564328283071518,
"memory(GiB)": 160.86,
"step": 2640,
"token_acc": 0.9998315363881402,
"train_speed(iter/s)": 0.037146
},
{
"epoch": 0.9693971046362471,
"grad_norm": 0.012264705263078213,
"learning_rate": 7.636670956737735e-06,
"loss": 0.0008389626629650593,
"memory(GiB)": 160.86,
"step": 2645,
"token_acc": 0.9995793016407236,
"train_speed(iter/s)": 0.037159
},
{
"epoch": 0.9712296133406634,
"grad_norm": 0.012444542720913887,
"learning_rate": 7.628512148596292e-06,
"loss": 0.0002988637425005436,
"memory(GiB)": 160.86,
"step": 2650,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.037172
},
{
"epoch": 0.9730621220450797,
"grad_norm": 0.04613952711224556,
"learning_rate": 7.620343657266758e-06,
"loss": 0.0006712310016155243,
"memory(GiB)": 160.86,
"step": 2655,
"token_acc": 0.9997473896934995,
"train_speed(iter/s)": 0.037185
},
{
"epoch": 0.974894630749496,
"grad_norm": 0.009678124450147152,
"learning_rate": 7.612165512841076e-06,
"loss": 0.0002654188079759479,
"memory(GiB)": 160.86,
"step": 2660,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037198
},
{
"epoch": 0.9767271394539124,
"grad_norm": 0.10645924508571625,
"learning_rate": 7.603977745446749e-06,
"loss": 0.0006820098031312227,
"memory(GiB)": 160.86,
"step": 2665,
"token_acc": 0.999578947368421,
"train_speed(iter/s)": 0.037212
},
{
"epoch": 0.9785596481583287,
"grad_norm": 0.052510544657707214,
"learning_rate": 7.595780385246729e-06,
"loss": 0.000298806675709784,
"memory(GiB)": 160.86,
"step": 2670,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.037225
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.010894379578530788,
"learning_rate": 7.587573462439315e-06,
"loss": 0.0006402578670531512,
"memory(GiB)": 160.86,
"step": 2675,
"token_acc": 0.9996632996632997,
"train_speed(iter/s)": 0.037237
},
{
"epoch": 0.9822246655671615,
"grad_norm": 0.04109283536672592,
"learning_rate": 7.579357007258022e-06,
"loss": 0.0008437959477305412,
"memory(GiB)": 160.86,
"step": 2680,
"token_acc": 0.9997474534893509,
"train_speed(iter/s)": 0.03725
},
{
"epoch": 0.9840571742715778,
"grad_norm": 0.005569992121309042,
"learning_rate": 7.571131049971492e-06,
"loss": 0.00014509292086586356,
"memory(GiB)": 160.86,
"step": 2685,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037263
},
{
"epoch": 0.9858896829759941,
"grad_norm": 0.03271030634641647,
"learning_rate": 7.562895620883364e-06,
"loss": 0.0003884633770212531,
"memory(GiB)": 160.86,
"step": 2690,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037276
},
{
"epoch": 0.9877221916804105,
"grad_norm": 0.01711997203528881,
"learning_rate": 7.554650750332175e-06,
"loss": 0.0009255507960915565,
"memory(GiB)": 160.86,
"step": 2695,
"token_acc": 0.9998315647633484,
"train_speed(iter/s)": 0.037289
},
{
"epoch": 0.9895547003848268,
"grad_norm": 0.02630673162639141,
"learning_rate": 7.546396468691241e-06,
"loss": 0.0005463588051497937,
"memory(GiB)": 160.86,
"step": 2700,
"token_acc": 0.9998316214850985,
"train_speed(iter/s)": 0.037302
},
{
"epoch": 0.9913872090892432,
"grad_norm": 0.005354244727641344,
"learning_rate": 7.53813280636855e-06,
"loss": 0.000519955437630415,
"memory(GiB)": 160.86,
"step": 2705,
"token_acc": 0.9999157823816742,
"train_speed(iter/s)": 0.037314
},
{
"epoch": 0.9932197177936595,
"grad_norm": 0.028666380792856216,
"learning_rate": 7.5298597938066446e-06,
"loss": 0.0007598635274916887,
"memory(GiB)": 160.86,
"step": 2710,
"token_acc": 0.9997474747474747,
"train_speed(iter/s)": 0.037327
},
{
"epoch": 0.9950522264980759,
"grad_norm": 0.027820078656077385,
"learning_rate": 7.5215774614825144e-06,
"loss": 0.00038032070733606815,
"memory(GiB)": 160.86,
"step": 2715,
"token_acc": 0.9998315931289997,
"train_speed(iter/s)": 0.037339
},
{
"epoch": 0.9968847352024922,
"grad_norm": 0.03211966156959534,
"learning_rate": 7.51328583990748e-06,
"loss": 0.0006773354019969702,
"memory(GiB)": 160.86,
"step": 2720,
"token_acc": 0.9996630443939011,
"train_speed(iter/s)": 0.037349
},
{
"epoch": 0.9987172439069085,
"grad_norm": 0.008736282587051392,
"learning_rate": 7.504984959627089e-06,
"loss": 0.0001820398378185928,
"memory(GiB)": 160.86,
"step": 2725,
"token_acc": 0.9999157894736842,
"train_speed(iter/s)": 0.037362
},
{
"epoch": 1.0003665017408834,
"grad_norm": 0.04173569008708,
"learning_rate": 7.4966748512209884e-06,
"loss": 0.00037901154719293116,
"memory(GiB)": 160.86,
"step": 2730,
"token_acc": 0.9998129267608269,
"train_speed(iter/s)": 0.037379
},
{
"epoch": 1.0021990104452996,
"grad_norm": 0.002946143504232168,
"learning_rate": 7.488355545302829e-06,
"loss": 0.00021834177896380426,
"memory(GiB)": 160.86,
"step": 2735,
"token_acc": 0.9999157965644998,
"train_speed(iter/s)": 0.037391
},
{
"epoch": 1.004031519149716,
"grad_norm": 0.020436054095625877,
"learning_rate": 7.480027072520137e-06,
"loss": 0.0004638895858079195,
"memory(GiB)": 160.86,
"step": 2740,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.037403
},
{
"epoch": 1.0058640278541322,
"grad_norm": 0.00012372307537589222,
"learning_rate": 7.471689463554212e-06,
"loss": 0.00014013800537213684,
"memory(GiB)": 160.86,
"step": 2745,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037415
},
{
"epoch": 1.0076965365585486,
"grad_norm": 0.10363256931304932,
"learning_rate": 7.463342749120014e-06,
"loss": 0.0012814832851290702,
"memory(GiB)": 160.86,
"step": 2750,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.037427
},
{
"epoch": 1.009529045262965,
"grad_norm": 0.0360257662832737,
"learning_rate": 7.454986959966038e-06,
"loss": 0.0002859779866412282,
"memory(GiB)": 160.86,
"step": 2755,
"token_acc": 0.9998315363881402,
"train_speed(iter/s)": 0.037439
},
{
"epoch": 1.0113615539673813,
"grad_norm": 0.0018664754461497068,
"learning_rate": 7.446622126874219e-06,
"loss": 0.0011785308830440044,
"memory(GiB)": 160.86,
"step": 2760,
"token_acc": 0.999663129526697,
"train_speed(iter/s)": 0.037451
},
{
"epoch": 1.0131940626717977,
"grad_norm": 0.03385569900274277,
"learning_rate": 7.438248280659801e-06,
"loss": 0.00015975049464032054,
"memory(GiB)": 160.86,
"step": 2765,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037463
},
{
"epoch": 1.015026571376214,
"grad_norm": 0.017654770985245705,
"learning_rate": 7.4298654521712364e-06,
"loss": 0.0003454319899901748,
"memory(GiB)": 160.86,
"step": 2770,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.037475
},
{
"epoch": 1.0168590800806303,
"grad_norm": 0.05392535775899887,
"learning_rate": 7.4214736722900675e-06,
"loss": 0.0005449390038847924,
"memory(GiB)": 160.86,
"step": 2775,
"token_acc": 0.9997476022211005,
"train_speed(iter/s)": 0.037487
},
{
"epoch": 1.0186915887850467,
"grad_norm": 0.004342063330113888,
"learning_rate": 7.413072971930807e-06,
"loss": 0.0007950126193463803,
"memory(GiB)": 160.86,
"step": 2780,
"token_acc": 0.9998315647633484,
"train_speed(iter/s)": 0.037499
},
{
"epoch": 1.0205240974894632,
"grad_norm": 0.00310046155937016,
"learning_rate": 7.404663382040838e-06,
"loss": 0.0002729130210354924,
"memory(GiB)": 160.86,
"step": 2785,
"token_acc": 0.9999158532480646,
"train_speed(iter/s)": 0.03751
},
{
"epoch": 1.0223566061938794,
"grad_norm": 0.0021550292149186134,
"learning_rate": 7.396244933600285e-06,
"loss": 0.00016694137593731284,
"memory(GiB)": 160.86,
"step": 2790,
"token_acc": 0.9999158603281447,
"train_speed(iter/s)": 0.037522
},
{
"epoch": 1.0241891148982958,
"grad_norm": 0.000986380036920309,
"learning_rate": 7.387817657621911e-06,
"loss": 0.00015597309684380888,
"memory(GiB)": 160.86,
"step": 2795,
"token_acc": 0.9999158744847312,
"train_speed(iter/s)": 0.037533
},
{
"epoch": 1.0260216236027122,
"grad_norm": 0.001334765343926847,
"learning_rate": 7.379381585150997e-06,
"loss": 2.5839175214059652e-05,
"memory(GiB)": 160.86,
"step": 2800,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037545
},
{
"epoch": 1.0278541323071284,
"grad_norm": 0.0036596362479031086,
"learning_rate": 7.370936747265226e-06,
"loss": 0.00017838862258940936,
"memory(GiB)": 160.86,
"step": 2805,
"token_acc": 0.9999157752884696,
"train_speed(iter/s)": 0.037557
},
{
"epoch": 1.0296866410115448,
"grad_norm": 0.04679948464035988,
"learning_rate": 7.36248317507458e-06,
"loss": 9.25394706428051e-05,
"memory(GiB)": 160.86,
"step": 2810,
"token_acc": 0.9999157256025619,
"train_speed(iter/s)": 0.037568
},
{
"epoch": 1.0315191497159613,
"grad_norm": 0.014712713658809662,
"learning_rate": 7.35402089972121e-06,
"loss": 0.00011562753934413195,
"memory(GiB)": 160.86,
"step": 2815,
"token_acc": 1.0,
"train_speed(iter/s)": 0.03758
},
{
"epoch": 1.0333516584203775,
"grad_norm": 2.521344504202716e-05,
"learning_rate": 7.345549952379334e-06,
"loss": 3.463000466581434e-05,
"memory(GiB)": 160.86,
"step": 2820,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037592
},
{
"epoch": 1.0351841671247939,
"grad_norm": 0.24957123398780823,
"learning_rate": 7.337070364255112e-06,
"loss": 0.0008360546082258225,
"memory(GiB)": 160.86,
"step": 2825,
"token_acc": 0.9996632713191346,
"train_speed(iter/s)": 0.037604
},
{
"epoch": 1.03701667582921,
"grad_norm": 0.21494735777378082,
"learning_rate": 7.32858216658654e-06,
"loss": 0.0008594411425292492,
"memory(GiB)": 160.86,
"step": 2830,
"token_acc": 0.9999158532480646,
"train_speed(iter/s)": 0.037615
},
{
"epoch": 1.0388491845336265,
"grad_norm": 0.008956658653914928,
"learning_rate": 7.320085390643326e-06,
"loss": 0.00030957753770053385,
"memory(GiB)": 160.86,
"step": 2835,
"token_acc": 0.9999158532480646,
"train_speed(iter/s)": 0.037627
},
{
"epoch": 1.040681693238043,
"grad_norm": 0.002504108939319849,
"learning_rate": 7.311580067726783e-06,
"loss": 0.000167914351914078,
"memory(GiB)": 160.86,
"step": 2840,
"token_acc": 0.9999158886365548,
"train_speed(iter/s)": 0.037638
},
{
"epoch": 1.0425142019424591,
"grad_norm": 0.0135150495916605,
"learning_rate": 7.3030662291697105e-06,
"loss": 4.5498591498471795e-05,
"memory(GiB)": 160.86,
"step": 2845,
"token_acc": 1.0,
"train_speed(iter/s)": 0.03765
},
{
"epoch": 1.0443467106468756,
"grad_norm": 0.002792476676404476,
"learning_rate": 7.294543906336279e-06,
"loss": 0.000167688459623605,
"memory(GiB)": 160.86,
"step": 2850,
"token_acc": 0.9999157823816742,
"train_speed(iter/s)": 0.037661
},
{
"epoch": 1.046179219351292,
"grad_norm": 0.04909972473978996,
"learning_rate": 7.28601313062191e-06,
"loss": 0.000728160934522748,
"memory(GiB)": 160.86,
"step": 2855,
"token_acc": 0.9996635545462192,
"train_speed(iter/s)": 0.037672
},
{
"epoch": 1.0480117280557082,
"grad_norm": 0.002446983242407441,
"learning_rate": 7.27747393345317e-06,
"loss": 0.0003103788709267974,
"memory(GiB)": 160.86,
"step": 2860,
"token_acc": 0.9998317206562894,
"train_speed(iter/s)": 0.037684
},
{
"epoch": 1.0498442367601246,
"grad_norm": 0.005002601537853479,
"learning_rate": 7.268926346287647e-06,
"loss": 0.000590520678088069,
"memory(GiB)": 160.86,
"step": 2865,
"token_acc": 0.9998316781686585,
"train_speed(iter/s)": 0.037695
},
{
"epoch": 1.051676745464541,
"grad_norm": 0.0063280281610786915,
"learning_rate": 7.2603704006138365e-06,
"loss": 0.0006456949282437563,
"memory(GiB)": 160.86,
"step": 2870,
"token_acc": 0.9997474109623642,
"train_speed(iter/s)": 0.037707
},
{
"epoch": 1.0535092541689572,
"grad_norm": 0.005347462370991707,
"learning_rate": 7.251806127951025e-06,
"loss": 0.00015139146707952023,
"memory(GiB)": 160.86,
"step": 2875,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037718
},
{
"epoch": 1.0553417628733737,
"grad_norm": 0.005681968294084072,
"learning_rate": 7.243233559849179e-06,
"loss": 0.00019556223414838315,
"memory(GiB)": 160.86,
"step": 2880,
"token_acc": 0.9999158320006734,
"train_speed(iter/s)": 0.037729
},
{
"epoch": 1.05717427157779,
"grad_norm": 0.0017381316283717752,
"learning_rate": 7.234652727888819e-06,
"loss": 0.0006761848460882902,
"memory(GiB)": 160.86,
"step": 2885,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.03774
},
{
"epoch": 1.0590067802822063,
"grad_norm": 0.012453123927116394,
"learning_rate": 7.226063663680915e-06,
"loss": 0.0005378074944019318,
"memory(GiB)": 160.86,
"step": 2890,
"token_acc": 0.999663356337317,
"train_speed(iter/s)": 0.037751
},
{
"epoch": 1.0608392889866227,
"grad_norm": 0.026770737022161484,
"learning_rate": 7.217466398866757e-06,
"loss": 0.0007396583911031485,
"memory(GiB)": 160.86,
"step": 2895,
"token_acc": 0.9997474534893509,
"train_speed(iter/s)": 0.037762
},
{
"epoch": 1.062671797691039,
"grad_norm": 0.13343772292137146,
"learning_rate": 7.2088609651178505e-06,
"loss": 0.0006303425878286361,
"memory(GiB)": 160.86,
"step": 2900,
"token_acc": 0.9997473045822103,
"train_speed(iter/s)": 0.037773
},
{
"epoch": 1.0645043063954553,
"grad_norm": 0.04957849159836769,
"learning_rate": 7.200247394135793e-06,
"loss": 0.0002914240350946784,
"memory(GiB)": 160.86,
"step": 2905,
"token_acc": 0.9999157894736842,
"train_speed(iter/s)": 0.037784
},
{
"epoch": 1.0663368150998718,
"grad_norm": 0.0030663548968732357,
"learning_rate": 7.191625717652158e-06,
"loss": 0.0006854488048702479,
"memory(GiB)": 160.86,
"step": 2910,
"token_acc": 0.9997475597441938,
"train_speed(iter/s)": 0.037795
},
{
"epoch": 1.068169323804288,
"grad_norm": 0.044960979372262955,
"learning_rate": 7.18299596742838e-06,
"loss": 0.0005464905872941018,
"memory(GiB)": 160.86,
"step": 2915,
"token_acc": 0.9998315789473684,
"train_speed(iter/s)": 0.037806
},
{
"epoch": 1.0700018325087044,
"grad_norm": 0.05764192342758179,
"learning_rate": 7.174358175255636e-06,
"loss": 0.0005072502885013819,
"memory(GiB)": 160.86,
"step": 2920,
"token_acc": 0.9998316640013467,
"train_speed(iter/s)": 0.037816
},
{
"epoch": 1.0718343412131208,
"grad_norm": 0.010302331298589706,
"learning_rate": 7.1657123729547275e-06,
"loss": 0.0011625357903540135,
"memory(GiB)": 160.86,
"step": 2925,
"token_acc": 0.9999158178297837,
"train_speed(iter/s)": 0.037827
},
{
"epoch": 1.073666849917537,
"grad_norm": 0.04408176988363266,
"learning_rate": 7.157058592375966e-06,
"loss": 0.0004973907489329576,
"memory(GiB)": 160.86,
"step": 2930,
"token_acc": 0.9998316498316498,
"train_speed(iter/s)": 0.037838
},
{
"epoch": 1.0754993586219534,
"grad_norm": 0.0012950595701113343,
"learning_rate": 7.148396865399054e-06,
"loss": 0.00015295968623831868,
"memory(GiB)": 160.86,
"step": 2935,
"token_acc": 0.9999158886365548,
"train_speed(iter/s)": 0.037849
},
{
"epoch": 1.0773318673263699,
"grad_norm": 0.032750971615314484,
"learning_rate": 7.1397272239329684e-06,
"loss": 0.0010722282342612744,
"memory(GiB)": 160.86,
"step": 2940,
"token_acc": 0.999663242970197,
"train_speed(iter/s)": 0.03786
},
{
"epoch": 1.079164376030786,
"grad_norm": 0.0168730691075325,
"learning_rate": 7.131049699915842e-06,
"loss": 7.366950740106404e-05,
"memory(GiB)": 160.86,
"step": 2945,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037871
},
{
"epoch": 1.0809968847352025,
"grad_norm": 0.007587254513055086,
"learning_rate": 7.122364325314844e-06,
"loss": 0.0006255113985389471,
"memory(GiB)": 160.86,
"step": 2950,
"token_acc": 0.9999158107425492,
"train_speed(iter/s)": 0.037881
},
{
"epoch": 1.082829393439619,
"grad_norm": 0.0203808955848217,
"learning_rate": 7.113671132126067e-06,
"loss": 0.00010994931217283011,
"memory(GiB)": 160.86,
"step": 2955,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037892
},
{
"epoch": 1.0846619021440351,
"grad_norm": 0.00795274693518877,
"learning_rate": 7.104970152374405e-06,
"loss": 0.00014865098055452108,
"memory(GiB)": 160.86,
"step": 2960,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037902
},
{
"epoch": 1.0864944108484516,
"grad_norm": 0.005757440812885761,
"learning_rate": 7.09626141811344e-06,
"loss": 0.000553938839584589,
"memory(GiB)": 160.86,
"step": 2965,
"token_acc": 0.9998316923335858,
"train_speed(iter/s)": 0.037913
},
{
"epoch": 1.088326919552868,
"grad_norm": 0.010678775608539581,
"learning_rate": 7.087544961425317e-06,
"loss": 0.0004192313179373741,
"memory(GiB)": 160.86,
"step": 2970,
"token_acc": 0.9999157752884696,
"train_speed(iter/s)": 0.037924
},
{
"epoch": 1.0901594282572842,
"grad_norm": 0.0032097063958644867,
"learning_rate": 7.078820814420629e-06,
"loss": 0.0006281842943280935,
"memory(GiB)": 160.86,
"step": 2975,
"token_acc": 0.9997473045822103,
"train_speed(iter/s)": 0.037935
},
{
"epoch": 1.0919919369617006,
"grad_norm": 0.012336465530097485,
"learning_rate": 7.070089009238306e-06,
"loss": 0.000180811935570091,
"memory(GiB)": 160.86,
"step": 2980,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037945
},
{
"epoch": 1.093824445666117,
"grad_norm": 0.0761614739894867,
"learning_rate": 7.061349578045481e-06,
"loss": 0.0011349070817232132,
"memory(GiB)": 160.86,
"step": 2985,
"token_acc": 0.999578947368421,
"train_speed(iter/s)": 0.037956
},
{
"epoch": 1.0956569543705332,
"grad_norm": 0.0008425001287832856,
"learning_rate": 7.05260255303739e-06,
"loss": 0.000435651745647192,
"memory(GiB)": 160.86,
"step": 2990,
"token_acc": 0.9999158461667929,
"train_speed(iter/s)": 0.037967
},
{
"epoch": 1.0974894630749497,
"grad_norm": 0.0662672221660614,
"learning_rate": 7.043847966437235e-06,
"loss": 0.0007866304367780685,
"memory(GiB)": 160.86,
"step": 2995,
"token_acc": 0.9996635262449529,
"train_speed(iter/s)": 0.037978
},
{
"epoch": 1.0993219717793659,
"grad_norm": 0.02012745290994644,
"learning_rate": 7.035085850496079e-06,
"loss": 6.958455196581781e-05,
"memory(GiB)": 160.86,
"step": 3000,
"token_acc": 1.0,
"train_speed(iter/s)": 0.037988
},
{
"epoch": 1.0993219717793659,
"eval_loss": 0.0006502080941572785,
"eval_runtime": 172.5767,
"eval_samples_per_second": 2.55,
"eval_steps_per_second": 2.55,
"eval_token_acc": 0.9997857459865632,
"step": 3000
}
],
"logging_steps": 5,
"max_steps": 8184,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.74067294651731e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}