{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 2494,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020048115477145148,
"grad_norm": 0.3128751516342163,
"learning_rate": 9.6e-05,
"loss": 0.6108,
"step": 25
},
{
"epoch": 0.040096230954290296,
"grad_norm": 0.2898954153060913,
"learning_rate": 0.00019600000000000002,
"loss": 0.3986,
"step": 50
},
{
"epoch": 0.060144346431435444,
"grad_norm": 0.2638753652572632,
"learning_rate": 0.000296,
"loss": 0.3553,
"step": 75
},
{
"epoch": 0.08019246190858059,
"grad_norm": 0.2679823935031891,
"learning_rate": 0.00039600000000000003,
"loss": 0.3276,
"step": 100
},
{
"epoch": 0.08019246190858059,
"eval_loss": 0.3061896860599518,
"eval_runtime": 2.2347,
"eval_samples_per_second": 46.538,
"eval_steps_per_second": 0.895,
"step": 100
},
{
"epoch": 0.10024057738572574,
"grad_norm": 0.3121950924396515,
"learning_rate": 0.000496,
"loss": 0.3272,
"step": 125
},
{
"epoch": 0.12028869286287089,
"grad_norm": 0.2655491828918457,
"learning_rate": 0.000596,
"loss": 0.3186,
"step": 150
},
{
"epoch": 0.14033680834001605,
"grad_norm": 0.30342063307762146,
"learning_rate": 0.000696,
"loss": 0.3094,
"step": 175
},
{
"epoch": 0.16038492381716118,
"grad_norm": 0.3187066614627838,
"learning_rate": 0.000796,
"loss": 0.3092,
"step": 200
},
{
"epoch": 0.16038492381716118,
"eval_loss": 0.29791951179504395,
"eval_runtime": 2.0135,
"eval_samples_per_second": 51.651,
"eval_steps_per_second": 0.993,
"step": 200
},
{
"epoch": 0.18043303929430635,
"grad_norm": 0.29701462388038635,
"learning_rate": 0.000896,
"loss": 0.303,
"step": 225
},
{
"epoch": 0.20048115477145148,
"grad_norm": 0.3302502930164337,
"learning_rate": 0.000996,
"loss": 0.302,
"step": 250
},
{
"epoch": 0.22052927024859664,
"grad_norm": 0.2812274694442749,
"learning_rate": 0.0009997177878718869,
"loss": 0.6194,
"step": 275
},
{
"epoch": 0.24057738572574178,
"grad_norm": 0.2586809992790222,
"learning_rate": 0.0009988239768018291,
"loss": 0.3014,
"step": 300
},
{
"epoch": 0.24057738572574178,
"eval_loss": 0.2832469344139099,
"eval_runtime": 2.0065,
"eval_samples_per_second": 51.832,
"eval_steps_per_second": 0.997,
"step": 300
},
{
"epoch": 0.2606255012028869,
"grad_norm": 0.2703372538089752,
"learning_rate": 0.0009973191715938715,
"loss": 0.2981,
"step": 325
},
{
"epoch": 0.2806736166800321,
"grad_norm": 0.2600429356098175,
"learning_rate": 0.0009952052154376025,
"loss": 0.2955,
"step": 350
},
{
"epoch": 0.30072173215717724,
"grad_norm": 0.21045513451099396,
"learning_rate": 0.0009924846976528616,
"loss": 0.2865,
"step": 375
},
{
"epoch": 0.32076984763432237,
"grad_norm": 0.21870078146457672,
"learning_rate": 0.0009891609505181592,
"loss": 0.2754,
"step": 400
},
{
"epoch": 0.32076984763432237,
"eval_loss": 0.2693285048007965,
"eval_runtime": 2.008,
"eval_samples_per_second": 51.792,
"eval_steps_per_second": 0.996,
"step": 400
},
{
"epoch": 0.3408179631114675,
"grad_norm": 0.23548808693885803,
"learning_rate": 0.0009852380451890721,
"loss": 0.2812,
"step": 425
},
{
"epoch": 0.3608660785886127,
"grad_norm": 0.22311964631080627,
"learning_rate": 0.0009807207867116115,
"loss": 0.2868,
"step": 450
},
{
"epoch": 0.3809141940657578,
"grad_norm": 0.21461476385593414,
"learning_rate": 0.0009756147081366672,
"loss": 0.2765,
"step": 475
},
{
"epoch": 0.40096230954290296,
"grad_norm": 0.19619832932949066,
"learning_rate": 0.0009699260637427467,
"loss": 0.2759,
"step": 500
},
{
"epoch": 0.40096230954290296,
"eval_loss": 0.25922319293022156,
"eval_runtime": 2.0086,
"eval_samples_per_second": 51.777,
"eval_steps_per_second": 0.996,
"step": 500
},
{
"epoch": 0.4210104250200481,
"grad_norm": 0.16149669885635376,
"learning_rate": 0.0009636618213753006,
"loss": 0.2731,
"step": 525
},
{
"epoch": 0.4410585404971933,
"grad_norm": 0.20750294625759125,
"learning_rate": 0.0009568296539120225,
"loss": 0.2759,
"step": 550
},
{
"epoch": 0.4611066559743384,
"grad_norm": 0.18264305591583252,
"learning_rate": 0.0009494379298645788,
"loss": 0.2625,
"step": 575
},
{
"epoch": 0.48115477145148355,
"grad_norm": 0.17461912333965302,
"learning_rate": 0.0009414957031282751,
"loss": 0.2649,
"step": 600
},
{
"epoch": 0.48115477145148355,
"eval_loss": 0.24944312870502472,
"eval_runtime": 2.0116,
"eval_samples_per_second": 51.701,
"eval_steps_per_second": 0.994,
"step": 600
},
{
"epoch": 0.5012028869286287,
"grad_norm": 0.17447619140148163,
"learning_rate": 0.0009330127018922195,
"loss": 0.2597,
"step": 625
},
{
"epoch": 0.5212510024057738,
"grad_norm": 0.1970607340335846,
"learning_rate": 0.0009239993167235614,
"loss": 0.2577,
"step": 650
},
{
"epoch": 0.541299117882919,
"grad_norm": 0.17549267411231995,
"learning_rate": 0.0009144665878404079,
"loss": 0.2564,
"step": 675
},
{
"epoch": 0.5613472333600642,
"grad_norm": 0.17851398885250092,
"learning_rate": 0.0009044261915889984,
"loss": 0.2604,
"step": 700
},
{
"epoch": 0.5613472333600642,
"eval_loss": 0.24154677987098694,
"eval_runtime": 2.0084,
"eval_samples_per_second": 51.784,
"eval_steps_per_second": 0.996,
"step": 700
},
{
"epoch": 0.5813953488372093,
"grad_norm": 0.20294925570487976,
"learning_rate": 0.0008938904261417087,
"loss": 0.271,
"step": 725
},
{
"epoch": 0.6014434643143545,
"grad_norm": 0.16158199310302734,
"learning_rate": 0.0008828721964333975,
"loss": 0.2506,
"step": 750
},
{
"epoch": 0.6214915797914996,
"grad_norm": 0.2007351964712143,
"learning_rate": 0.000871384998354549,
"loss": 0.2454,
"step": 775
},
{
"epoch": 0.6415396952686447,
"grad_norm": 0.17736631631851196,
"learning_rate": 0.0008594429022205719,
"loss": 0.2514,
"step": 800
},
{
"epoch": 0.6415396952686447,
"eval_loss": 0.23755024373531342,
"eval_runtime": 4.2146,
"eval_samples_per_second": 24.676,
"eval_steps_per_second": 0.475,
"step": 800
},
{
"epoch": 0.6615878107457899,
"grad_norm": 0.15997523069381714,
"learning_rate": 0.0008470605355375032,
"loss": 0.2565,
"step": 825
},
{
"epoch": 0.681635926222935,
"grad_norm": 0.15349100530147552,
"learning_rate": 0.0008342530650852265,
"loss": 0.2598,
"step": 850
},
{
"epoch": 0.7016840417000801,
"grad_norm": 0.14758522808551788,
"learning_rate": 0.0008210361783401491,
"loss": 0.2426,
"step": 875
},
{
"epoch": 0.7217321571772254,
"grad_norm": 0.1686255782842636,
"learning_rate": 0.0008074260642600964,
"loss": 0.2462,
"step": 900
},
{
"epoch": 0.7217321571772254,
"eval_loss": 0.23332656919956207,
"eval_runtime": 4.2091,
"eval_samples_per_second": 24.708,
"eval_steps_per_second": 0.475,
"step": 900
},
{
"epoch": 0.7417802726543705,
"grad_norm": 0.1921042650938034,
"learning_rate": 0.0007934393934549542,
"loss": 0.249,
"step": 925
},
{
"epoch": 0.7618283881315157,
"grad_norm": 0.1523015797138214,
"learning_rate": 0.0007790932977673523,
"loss": 0.2399,
"step": 950
},
{
"epoch": 0.7818765036086608,
"grad_norm": 0.16055895388126373,
"learning_rate": 0.0007644053492883989,
"loss": 0.2363,
"step": 975
},
{
"epoch": 0.8019246190858059,
"grad_norm": 0.17422834038734436,
"learning_rate": 0.000749393538834164,
"loss": 0.2384,
"step": 1000
},
{
"epoch": 0.8019246190858059,
"eval_loss": 0.23113039135932922,
"eval_runtime": 4.2589,
"eval_samples_per_second": 24.419,
"eval_steps_per_second": 0.47,
"step": 1000
},
{
"epoch": 0.8219727345629511,
"grad_norm": 0.1599196046590805,
"learning_rate": 0.0007340762539092858,
"loss": 0.2485,
"step": 1025
},
{
"epoch": 0.8420208500400962,
"grad_norm": 0.15405167639255524,
"learning_rate": 0.0007184722561846798,
"loss": 0.2464,
"step": 1050
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.18259042501449585,
"learning_rate": 0.0007026006585169466,
"loss": 0.2421,
"step": 1075
},
{
"epoch": 0.8821170809943866,
"grad_norm": 0.14938652515411377,
"learning_rate": 0.0006864809015376217,
"loss": 0.2425,
"step": 1100
},
{
"epoch": 0.8821170809943866,
"eval_loss": 0.22659502923488617,
"eval_runtime": 4.2121,
"eval_samples_per_second": 24.691,
"eval_steps_per_second": 0.475,
"step": 1100
},
{
"epoch": 0.9021651964715317,
"grad_norm": 0.16447846591472626,
"learning_rate": 0.0006701327298409448,
"loss": 0.2414,
"step": 1125
},
{
"epoch": 0.9222133119486768,
"grad_norm": 0.1590721160173416,
"learning_rate": 0.000653576167799312,
"loss": 0.2287,
"step": 1150
},
{
"epoch": 0.942261427425822,
"grad_norm": 0.1653919219970703,
"learning_rate": 0.0006368314950360416,
"loss": 0.2351,
"step": 1175
},
{
"epoch": 0.9623095429029671,
"grad_norm": 0.1875888705253601,
"learning_rate": 0.000619919221585484,
"loss": 0.2374,
"step": 1200
},
{
"epoch": 0.9623095429029671,
"eval_loss": 0.22220070660114288,
"eval_runtime": 4.2412,
"eval_samples_per_second": 24.521,
"eval_steps_per_second": 0.472,
"step": 1200
},
{
"epoch": 0.9823576583801122,
"grad_norm": 0.15826693177223206,
"learning_rate": 0.0006028600627709151,
"loss": 0.2314,
"step": 1225
},
{
"epoch": 1.0024057738572574,
"grad_norm": 0.15599651634693146,
"learning_rate": 0.0005856749138309716,
"loss": 0.2246,
"step": 1250
},
{
"epoch": 1.0224538893344026,
"grad_norm": 0.15733949840068817,
"learning_rate": 0.000568384824325718,
"loss": 0.2024,
"step": 1275
},
{
"epoch": 1.0425020048115476,
"grad_norm": 0.14656169712543488,
"learning_rate": 0.0005510109723536876,
"loss": 0.2109,
"step": 1300
},
{
"epoch": 1.0425020048115476,
"eval_loss": 0.22098909318447113,
"eval_runtime": 4.2171,
"eval_samples_per_second": 24.661,
"eval_steps_per_second": 0.474,
"step": 1300
},
{
"epoch": 1.062550120288693,
"grad_norm": 0.15239104628562927,
"learning_rate": 0.0005335746386114814,
"loss": 0.1941,
"step": 1325
},
{
"epoch": 1.082598235765838,
"grad_norm": 0.1525331288576126,
"learning_rate": 0.0005160971803276981,
"loss": 0.2074,
"step": 1350
},
{
"epoch": 1.1026463512429832,
"grad_norm": 0.18646268546581268,
"learning_rate": 0.0004986000051031212,
"loss": 0.2008,
"step": 1375
},
{
"epoch": 1.1226944667201284,
"grad_norm": 0.1695125252008438,
"learning_rate": 0.00048110454468920866,
"loss": 0.2019,
"step": 1400
},
{
"epoch": 1.1226944667201284,
"eval_loss": 0.21638630330562592,
"eval_runtime": 4.2187,
"eval_samples_per_second": 24.652,
"eval_steps_per_second": 0.474,
"step": 1400
},
{
"epoch": 1.1427425821972734,
"grad_norm": 0.15290401875972748,
"learning_rate": 0.0004636322287369997,
"loss": 0.2021,
"step": 1425
},
{
"epoch": 1.1627906976744187,
"grad_norm": 0.16536127030849457,
"learning_rate": 0.0004462044585485944,
"loss": 0.1972,
"step": 1450
},
{
"epoch": 1.1828388131515637,
"grad_norm": 0.14967386424541473,
"learning_rate": 0.0004288425808633575,
"loss": 0.2033,
"step": 1475
},
{
"epoch": 1.202886928628709,
"grad_norm": 0.15446773171424866,
"learning_rate": 0.00041156786171095476,
"loss": 0.1957,
"step": 1500
},
{
"epoch": 1.202886928628709,
"eval_loss": 0.21511909365653992,
"eval_runtime": 4.2185,
"eval_samples_per_second": 24.654,
"eval_steps_per_second": 0.474,
"step": 1500
},
{
"epoch": 1.222935044105854,
"grad_norm": 0.1389647275209427,
"learning_rate": 0.00039440146036324753,
"loss": 0.1964,
"step": 1525
},
{
"epoch": 1.2429831595829992,
"grad_norm": 0.13631069660186768,
"learning_rate": 0.00037736440341695125,
"loss": 0.2033,
"step": 1550
},
{
"epoch": 1.2630312750601442,
"grad_norm": 0.1525258868932724,
"learning_rate": 0.0003604775590388047,
"loss": 0.2013,
"step": 1575
},
{
"epoch": 1.2830793905372895,
"grad_norm": 0.14098823070526123,
"learning_rate": 0.00034376161140479495,
"loss": 0.1976,
"step": 1600
},
{
"epoch": 1.2830793905372895,
"eval_loss": 0.21296119689941406,
"eval_runtime": 4.0201,
"eval_samples_per_second": 25.87,
"eval_steps_per_second": 0.497,
"step": 1600
},
{
"epoch": 1.3031275060144347,
"grad_norm": 0.1649613082408905,
"learning_rate": 0.0003272370353647465,
"loss": 0.2001,
"step": 1625
},
{
"epoch": 1.3231756214915797,
"grad_norm": 0.17128996551036835,
"learning_rate": 0.00031092407136330754,
"loss": 0.2015,
"step": 1650
},
{
"epoch": 1.343223736968725,
"grad_norm": 0.15200765430927277,
"learning_rate": 0.0002948427006480528,
"loss": 0.2056,
"step": 1675
},
{
"epoch": 1.36327185244587,
"grad_norm": 0.15264691412448883,
"learning_rate": 0.00027901262079506784,
"loss": 0.2032,
"step": 1700
},
{
"epoch": 1.36327185244587,
"eval_loss": 0.21197493374347687,
"eval_runtime": 4.256,
"eval_samples_per_second": 24.436,
"eval_steps_per_second": 0.47,
"step": 1700
},
{
"epoch": 1.3833199679230153,
"grad_norm": 0.16756217181682587,
"learning_rate": 0.000263453221581995,
"loss": 0.1987,
"step": 1725
},
{
"epoch": 1.4033680834001605,
"grad_norm": 0.13861249387264252,
"learning_rate": 0.00024818356123809036,
"loss": 0.1998,
"step": 1750
},
{
"epoch": 1.4234161988773055,
"grad_norm": 0.16581584513187408,
"learning_rate": 0.00023322234310038588,
"loss": 0.1875,
"step": 1775
},
{
"epoch": 1.4434643143544506,
"grad_norm": 0.1450669765472412,
"learning_rate": 0.00021858789270454783,
"loss": 0.1953,
"step": 1800
},
{
"epoch": 1.4434643143544506,
"eval_loss": 0.2067786008119583,
"eval_runtime": 4.2236,
"eval_samples_per_second": 24.624,
"eval_steps_per_second": 0.474,
"step": 1800
},
{
"epoch": 1.4635124298315958,
"grad_norm": 0.1343117356300354,
"learning_rate": 0.00020429813533849174,
"loss": 0.2026,
"step": 1825
},
{
"epoch": 1.483560545308741,
"grad_norm": 0.14949767291545868,
"learning_rate": 0.00019037057408624846,
"loss": 0.1929,
"step": 1850
},
{
"epoch": 1.5036086607858863,
"grad_norm": 0.12897883355617523,
"learning_rate": 0.00017682226838897568,
"loss": 0.1907,
"step": 1875
},
{
"epoch": 1.5236567762630313,
"grad_norm": 0.15060247480869293,
"learning_rate": 0.00016366981314937373,
"loss": 0.2003,
"step": 1900
},
{
"epoch": 1.5236567762630313,
"eval_loss": 0.2053409218788147,
"eval_runtime": 4.2538,
"eval_samples_per_second": 24.449,
"eval_steps_per_second": 0.47,
"step": 1900
},
{
"epoch": 1.5437048917401763,
"grad_norm": 0.1608167141675949,
"learning_rate": 0.0001509293184050995,
"loss": 0.1984,
"step": 1925
},
{
"epoch": 1.5637530072173216,
"grad_norm": 0.15426403284072876,
"learning_rate": 0.000138616389596077,
"loss": 0.1955,
"step": 1950
},
{
"epoch": 1.5838011226944668,
"grad_norm": 0.1431884467601776,
"learning_rate": 0.0001267461084498744,
"loss": 0.1955,
"step": 1975
},
{
"epoch": 1.6038492381716118,
"grad_norm": 0.14500346779823303,
"learning_rate": 0.00011533301450856055,
"loss": 0.1898,
"step": 2000
},
{
"epoch": 1.6038492381716118,
"eval_loss": 0.20465601980686188,
"eval_runtime": 4.2165,
"eval_samples_per_second": 24.665,
"eval_steps_per_second": 0.474,
"step": 2000
},
{
"epoch": 1.6238973536487569,
"grad_norm": 0.13658447563648224,
"learning_rate": 0.0001043910873196668,
"loss": 0.1882,
"step": 2025
},
{
"epoch": 1.6439454691259021,
"grad_norm": 0.14735296368598938,
"learning_rate": 9.393372931306943e-05,
"loss": 0.194,
"step": 2050
},
{
"epoch": 1.6639935846030474,
"grad_norm": 0.13270524144172668,
"learning_rate": 8.397374938476593e-05,
"loss": 0.191,
"step": 2075
},
{
"epoch": 1.6840417000801926,
"grad_norm": 0.13621263206005096,
"learning_rate": 7.452334720765258e-05,
"loss": 0.191,
"step": 2100
},
{
"epoch": 1.6840417000801926,
"eval_loss": 0.2029379904270172,
"eval_runtime": 4.2107,
"eval_samples_per_second": 24.699,
"eval_steps_per_second": 0.475,
"step": 2100
},
{
"epoch": 1.7040898155573376,
"grad_norm": 0.1418534368276596,
"learning_rate": 6.55940982885207e-05,
"loss": 0.1982,
"step": 2125
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.15418624877929688,
"learning_rate": 5.71969397895738e-05,
"loss": 0.1957,
"step": 2150
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.15846756100654602,
"learning_rate": 4.934215713183526e-05,
"loss": 0.1904,
"step": 2175
},
{
"epoch": 1.7642341619887731,
"grad_norm": 0.15319091081619263,
"learning_rate": 4.203937139685188e-05,
"loss": 0.1835,
"step": 2200
},
{
"epoch": 1.7642341619887731,
"eval_loss": 0.20121867954730988,
"eval_runtime": 4.2209,
"eval_samples_per_second": 24.64,
"eval_steps_per_second": 0.474,
"step": 2200
},
{
"epoch": 1.7842822774659182,
"grad_norm": 0.14038674533367157,
"learning_rate": 3.529752754212767e-05,
"loss": 0.1886,
"step": 2225
},
{
"epoch": 1.8043303929430632,
"grad_norm": 0.13600395619869232,
"learning_rate": 2.9124883444720253e-05,
"loss": 0.186,
"step": 2250
},
{
"epoch": 1.8243785084202084,
"grad_norm": 0.1356409341096878,
"learning_rate": 2.3528999786421755e-05,
"loss": 0.183,
"step": 2275
},
{
"epoch": 1.8444266238973537,
"grad_norm": 0.14992156624794006,
"learning_rate": 1.851673079291216e-05,
"loss": 0.1738,
"step": 2300
},
{
"epoch": 1.8444266238973537,
"eval_loss": 0.20082467794418335,
"eval_runtime": 4.2146,
"eval_samples_per_second": 24.676,
"eval_steps_per_second": 0.475,
"step": 2300
},
{
"epoch": 1.864474739374499,
"grad_norm": 0.1463785320520401,
"learning_rate": 1.4094215838229174e-05,
"loss": 0.1982,
"step": 2325
},
{
"epoch": 1.884522854851644,
"grad_norm": 0.15394070744514465,
"learning_rate": 1.0266871924838216e-05,
"loss": 0.1887,
"step": 2350
},
{
"epoch": 1.904570970328789,
"grad_norm": 0.14152726531028748,
"learning_rate": 7.03938704851248e-06,
"loss": 0.1783,
"step": 2375
},
{
"epoch": 1.9246190858059342,
"grad_norm": 0.1539337933063507,
"learning_rate": 4.415714456151243e-06,
"loss": 0.182,
"step": 2400
},
{
"epoch": 1.9246190858059342,
"eval_loss": 0.20044730603694916,
"eval_runtime": 2.0109,
"eval_samples_per_second": 51.717,
"eval_steps_per_second": 0.995,
"step": 2400
},
{
"epoch": 1.9446672012830795,
"grad_norm": 0.13150149583816528,
"learning_rate": 2.3990678035694656e-06,
"loss": 0.1947,
"step": 2425
},
{
"epoch": 1.9647153167602245,
"grad_norm": 0.14985321462154388,
"learning_rate": 9.919172191896753e-07,
"loss": 0.1947,
"step": 2450
},
{
"epoch": 1.9847634322373697,
"grad_norm": 0.1531253457069397,
"learning_rate": 1.9598627845779372e-07,
"loss": 0.1918,
"step": 2475
},
{
"epoch": 2.0,
"step": 2494,
"total_flos": 1.6216560612723917e+18,
"train_loss": 0.23998703379198943,
"train_runtime": 3791.0396,
"train_samples_per_second": 21.048,
"train_steps_per_second": 0.658
}
],
"logging_steps": 25,
"max_steps": 2494,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6216560612723917e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}