OminiControlRotation / nl_tasks /exps /run_ex09 /trainer_state.json
nvan15's picture
Batch upload part 4
3a244f5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 2438,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020508613617719443,
"grad_norm": 0.1543785035610199,
"learning_rate": 0.00019672131147540983,
"loss": 0.6266,
"step": 25
},
{
"epoch": 0.04101722723543889,
"grad_norm": 0.2001742422580719,
"learning_rate": 0.00040163934426229507,
"loss": 0.3858,
"step": 50
},
{
"epoch": 0.06152584085315833,
"grad_norm": 0.14139799773693085,
"learning_rate": 0.0006065573770491804,
"loss": 0.3604,
"step": 75
},
{
"epoch": 0.08203445447087777,
"grad_norm": 0.20014838874340057,
"learning_rate": 0.0008114754098360656,
"loss": 0.3285,
"step": 100
},
{
"epoch": 0.08203445447087777,
"eval_loss": 0.3202356696128845,
"eval_runtime": 22.7105,
"eval_samples_per_second": 44.033,
"eval_steps_per_second": 0.705,
"step": 100
},
{
"epoch": 0.10254306808859721,
"grad_norm": 0.2310003638267517,
"learning_rate": 0.0010163934426229509,
"loss": 0.341,
"step": 125
},
{
"epoch": 0.12305168170631665,
"grad_norm": 0.3138357102870941,
"learning_rate": 0.001221311475409836,
"loss": 0.3325,
"step": 150
},
{
"epoch": 0.1435602953240361,
"grad_norm": 1.425217628479004,
"learning_rate": 0.0014262295081967215,
"loss": 1.0626,
"step": 175
},
{
"epoch": 0.16406890894175555,
"grad_norm": 0.6279019713401794,
"learning_rate": 0.0016311475409836065,
"loss": 0.4087,
"step": 200
},
{
"epoch": 0.16406890894175555,
"eval_loss": 0.38264134526252747,
"eval_runtime": 22.5381,
"eval_samples_per_second": 44.369,
"eval_steps_per_second": 0.71,
"step": 200
},
{
"epoch": 0.184577522559475,
"grad_norm": 0.38462212681770325,
"learning_rate": 0.0018360655737704918,
"loss": 0.4034,
"step": 225
},
{
"epoch": 0.20508613617719443,
"grad_norm": 0.3672288954257965,
"learning_rate": 0.0019999743708232127,
"loss": 0.3633,
"step": 250
},
{
"epoch": 0.22559474979491387,
"grad_norm": 0.3607560694217682,
"learning_rate": 0.0019990774875676054,
"loss": 0.3487,
"step": 275
},
{
"epoch": 0.2461033634126333,
"grad_norm": 0.3346173167228699,
"learning_rate": 0.001996900458879386,
"loss": 0.3371,
"step": 300
},
{
"epoch": 0.2461033634126333,
"eval_loss": 0.3209039270877838,
"eval_runtime": 22.5783,
"eval_samples_per_second": 44.29,
"eval_steps_per_second": 0.709,
"step": 300
},
{
"epoch": 0.2666119770303528,
"grad_norm": 0.22805160284042358,
"learning_rate": 0.001993446074245224,
"loss": 0.3296,
"step": 325
},
{
"epoch": 0.2871205906480722,
"grad_norm": 0.18590985238552094,
"learning_rate": 0.0019887187598630527,
"loss": 0.3134,
"step": 350
},
{
"epoch": 0.30762920426579166,
"grad_norm": 0.21088963747024536,
"learning_rate": 0.0019827245729706648,
"loss": 0.3199,
"step": 375
},
{
"epoch": 0.3281378178835111,
"grad_norm": 0.18156805634498596,
"learning_rate": 0.0019754711940844047,
"loss": 0.2996,
"step": 400
},
{
"epoch": 0.3281378178835111,
"eval_loss": 0.28689703345298767,
"eval_runtime": 22.5553,
"eval_samples_per_second": 44.335,
"eval_steps_per_second": 0.709,
"step": 400
},
{
"epoch": 0.34864643150123054,
"grad_norm": 0.15950609743595123,
"learning_rate": 0.0019669679171579117,
"loss": 0.3044,
"step": 425
},
{
"epoch": 0.36915504511895,
"grad_norm": 0.16445936262607574,
"learning_rate": 0.001957225637673524,
"loss": 0.3019,
"step": 450
},
{
"epoch": 0.3896636587366694,
"grad_norm": 0.16957086324691772,
"learning_rate": 0.0019462568386815961,
"loss": 0.2863,
"step": 475
},
{
"epoch": 0.41017227235438886,
"grad_norm": 0.12954926490783691,
"learning_rate": 0.0019340755748056234,
"loss": 0.2701,
"step": 500
},
{
"epoch": 0.41017227235438886,
"eval_loss": 0.2733325660228729,
"eval_runtime": 22.5518,
"eval_samples_per_second": 44.342,
"eval_steps_per_second": 0.709,
"step": 500
},
{
"epoch": 0.4306808859721083,
"grad_norm": 0.1369732916355133,
"learning_rate": 0.0019206974542336672,
"loss": 0.271,
"step": 525
},
{
"epoch": 0.45118949958982774,
"grad_norm": 0.15917326509952545,
"learning_rate": 0.0019061396187191563,
"loss": 0.2802,
"step": 550
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.16746191680431366,
"learning_rate": 0.0018904207216166836,
"loss": 0.2691,
"step": 575
},
{
"epoch": 0.4922067268252666,
"grad_norm": 0.1554066687822342,
"learning_rate": 0.001873560903980955,
"loss": 0.286,
"step": 600
},
{
"epoch": 0.4922067268252666,
"eval_loss": 0.26212847232818604,
"eval_runtime": 22.5566,
"eval_samples_per_second": 44.333,
"eval_steps_per_second": 0.709,
"step": 600
},
{
"epoch": 0.5127153404429861,
"grad_norm": 0.13422970473766327,
"learning_rate": 0.0018555817687594984,
"loss": 0.2655,
"step": 625
},
{
"epoch": 0.5332239540607056,
"grad_norm": 0.14770525693893433,
"learning_rate": 0.0018365063531122169,
"loss": 0.26,
"step": 650
},
{
"epoch": 0.5537325676784249,
"grad_norm": 0.12729965150356293,
"learning_rate": 0.0018163590988932402,
"loss": 0.2694,
"step": 675
},
{
"epoch": 0.5742411812961444,
"grad_norm": 0.1334213763475418,
"learning_rate": 0.0017951658213329078,
"loss": 0.268,
"step": 700
},
{
"epoch": 0.5742411812961444,
"eval_loss": 0.25402218103408813,
"eval_runtime": 22.5274,
"eval_samples_per_second": 44.39,
"eval_steps_per_second": 0.71,
"step": 700
},
{
"epoch": 0.5947497949138638,
"grad_norm": 0.12552621960639954,
"learning_rate": 0.0017729536759600033,
"loss": 0.266,
"step": 725
},
{
"epoch": 0.6152584085315833,
"grad_norm": 0.13384173810482025,
"learning_rate": 0.0017497511238066307,
"loss": 0.2631,
"step": 750
},
{
"epoch": 0.6357670221493027,
"grad_norm": 0.12846872210502625,
"learning_rate": 0.00172558789494031,
"loss": 0.2588,
"step": 775
},
{
"epoch": 0.6562756357670222,
"grad_norm": 0.16066329181194305,
"learning_rate": 0.0017004949503700284,
"loss": 0.2636,
"step": 800
},
{
"epoch": 0.6562756357670222,
"eval_loss": 0.24892009794712067,
"eval_runtime": 22.5749,
"eval_samples_per_second": 44.297,
"eval_steps_per_second": 0.709,
"step": 800
},
{
"epoch": 0.6767842493847416,
"grad_norm": 0.11756884306669235,
"learning_rate": 0.0016745044423750449,
"loss": 0.2563,
"step": 825
},
{
"epoch": 0.6972928630024611,
"grad_norm": 0.1094069853425026,
"learning_rate": 0.0016476496733072946,
"loss": 0.2581,
"step": 850
},
{
"epoch": 0.7178014766201805,
"grad_norm": 0.10113517194986343,
"learning_rate": 0.0016199650529201684,
"loss": 0.2466,
"step": 875
},
{
"epoch": 0.7383100902379,
"grad_norm": 0.12762148678302765,
"learning_rate": 0.0015914860542783522,
"loss": 0.2511,
"step": 900
},
{
"epoch": 0.7383100902379,
"eval_loss": 0.24198263883590698,
"eval_runtime": 22.5893,
"eval_samples_per_second": 44.269,
"eval_steps_per_second": 0.708,
"step": 900
},
{
"epoch": 0.7588187038556193,
"grad_norm": 0.1412491798400879,
"learning_rate": 0.0015622491683052124,
"loss": 0.2538,
"step": 925
},
{
"epoch": 0.7793273174733388,
"grad_norm": 0.1309656947851181,
"learning_rate": 0.0015322918570259759,
"loss": 0.2417,
"step": 950
},
{
"epoch": 0.7998359310910582,
"grad_norm": 0.12559030950069427,
"learning_rate": 0.0015016525055666057,
"loss": 0.2498,
"step": 975
},
{
"epoch": 0.8203445447087777,
"grad_norm": 0.12614794075489044,
"learning_rate": 0.001470370372969886,
"loss": 0.2417,
"step": 1000
},
{
"epoch": 0.8203445447087777,
"eval_loss": 0.2378261834383011,
"eval_runtime": 22.5548,
"eval_samples_per_second": 44.336,
"eval_steps_per_second": 0.709,
"step": 1000
},
{
"epoch": 0.8408531583264971,
"grad_norm": 0.134114608168602,
"learning_rate": 0.0014384855418917311,
"loss": 0.2452,
"step": 1025
},
{
"epoch": 0.8613617719442166,
"grad_norm": 0.11434811353683472,
"learning_rate": 0.0014060388672421775,
"loss": 0.2412,
"step": 1050
},
{
"epoch": 0.881870385561936,
"grad_norm": 0.11284555494785309,
"learning_rate": 0.0013730719238368662,
"loss": 0.245,
"step": 1075
},
{
"epoch": 0.9023789991796555,
"grad_norm": 0.13569487631320953,
"learning_rate": 0.0013396269531260867,
"loss": 0.246,
"step": 1100
},
{
"epoch": 0.9023789991796555,
"eval_loss": 0.2345089465379715,
"eval_runtime": 22.5349,
"eval_samples_per_second": 44.376,
"eval_steps_per_second": 0.71,
"step": 1100
},
{
"epoch": 0.9228876127973749,
"grad_norm": 0.09805800765752792,
"learning_rate": 0.0013057468090696496,
"loss": 0.2414,
"step": 1125
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.0931050032377243,
"learning_rate": 0.0012714749032269287,
"loss": 0.2404,
"step": 1150
},
{
"epoch": 0.9639048400328137,
"grad_norm": 0.10308840870857239,
"learning_rate": 0.0012368551491324358,
"loss": 0.245,
"step": 1175
},
{
"epoch": 0.9844134536505332,
"grad_norm": 0.10258302837610245,
"learning_rate": 0.0012019319060282063,
"loss": 0.2509,
"step": 1200
},
{
"epoch": 0.9844134536505332,
"eval_loss": 0.22989174723625183,
"eval_runtime": 22.5404,
"eval_samples_per_second": 44.365,
"eval_steps_per_second": 0.71,
"step": 1200
},
{
"epoch": 1.0049220672682526,
"grad_norm": 0.11619790643453598,
"learning_rate": 0.0011667499220250803,
"loss": 0.2302,
"step": 1225
},
{
"epoch": 1.0254306808859721,
"grad_norm": 0.11248350143432617,
"learning_rate": 0.0011313542767657204,
"loss": 0.2105,
"step": 1250
},
{
"epoch": 1.0459392945036916,
"grad_norm": 0.12309166043996811,
"learning_rate": 0.0010957903236628267,
"loss": 0.2114,
"step": 1275
},
{
"epoch": 1.066447908121411,
"grad_norm": 0.10280752182006836,
"learning_rate": 0.001060103631786563,
"loss": 0.2138,
"step": 1300
},
{
"epoch": 1.066447908121411,
"eval_loss": 0.2270548790693283,
"eval_runtime": 22.5165,
"eval_samples_per_second": 44.412,
"eval_steps_per_second": 0.711,
"step": 1300
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.12517733871936798,
"learning_rate": 0.0010243399274756564,
"loss": 0.2111,
"step": 1325
},
{
"epoch": 1.1074651353568499,
"grad_norm": 0.09303736686706543,
"learning_rate": 0.0009885450357469806,
"loss": 0.2043,
"step": 1350
},
{
"epoch": 1.1279737489745694,
"grad_norm": 0.11305887997150421,
"learning_rate": 0.0009527648215787065,
"loss": 0.2057,
"step": 1375
},
{
"epoch": 1.1484823625922886,
"grad_norm": 0.1038450226187706,
"learning_rate": 0.000917045131142242,
"loss": 0.1984,
"step": 1400
},
{
"epoch": 1.1484823625922886,
"eval_loss": 0.22505907714366913,
"eval_runtime": 22.5688,
"eval_samples_per_second": 44.309,
"eval_steps_per_second": 0.709,
"step": 1400
},
{
"epoch": 1.1689909762100081,
"grad_norm": 0.10388393700122833,
"learning_rate": 0.0008814317330582753,
"loss": 0.2092,
"step": 1425
},
{
"epoch": 1.1894995898277276,
"grad_norm": 0.1294604390859604,
"learning_rate": 0.000845970259752183,
"loss": 0.2107,
"step": 1450
},
{
"epoch": 1.2100082034454471,
"grad_norm": 0.1135207936167717,
"learning_rate": 0.0008107061489839498,
"loss": 0.2069,
"step": 1475
},
{
"epoch": 1.2305168170631666,
"grad_norm": 0.11284071952104568,
"learning_rate": 0.0007756845856275194,
"loss": 0.2169,
"step": 1500
},
{
"epoch": 1.2305168170631666,
"eval_loss": 0.2215997278690338,
"eval_runtime": 22.56,
"eval_samples_per_second": 44.326,
"eval_steps_per_second": 0.709,
"step": 1500
},
{
"epoch": 1.251025430680886,
"grad_norm": 0.1196022480726242,
"learning_rate": 0.0007409504437741722,
"loss": 0.21,
"step": 1525
},
{
"epoch": 1.2715340442986054,
"grad_norm": 0.10071329027414322,
"learning_rate": 0.0007065482292341205,
"loss": 0.2027,
"step": 1550
},
{
"epoch": 1.2920426579163249,
"grad_norm": 0.09619199484586716,
"learning_rate": 0.0006725220225099911,
"loss": 0.2052,
"step": 1575
},
{
"epoch": 1.3125512715340442,
"grad_norm": 0.09156788140535355,
"learning_rate": 0.0006389154223152666,
"loss": 0.1987,
"step": 1600
},
{
"epoch": 1.3125512715340442,
"eval_loss": 0.21827217936515808,
"eval_runtime": 22.5451,
"eval_samples_per_second": 44.356,
"eval_steps_per_second": 0.71,
"step": 1600
},
{
"epoch": 1.3330598851517639,
"grad_norm": 0.09059920907020569,
"learning_rate": 0.0006057714897100551,
"loss": 0.201,
"step": 1625
},
{
"epoch": 1.3535684987694832,
"grad_norm": 0.10633113235235214,
"learning_rate": 0.0005731326929257713,
"loss": 0.2022,
"step": 1650
},
{
"epoch": 1.3740771123872026,
"grad_norm": 0.10689054429531097,
"learning_rate": 0.0005410408529494251,
"loss": 0.2001,
"step": 1675
},
{
"epoch": 1.3945857260049221,
"grad_norm": 0.10712600499391556,
"learning_rate": 0.0005095370899372412,
"loss": 0.2002,
"step": 1700
},
{
"epoch": 1.3945857260049221,
"eval_loss": 0.2159736305475235,
"eval_runtime": 22.541,
"eval_samples_per_second": 44.364,
"eval_steps_per_second": 0.71,
"step": 1700
},
{
"epoch": 1.4150943396226414,
"grad_norm": 0.0958191528916359,
"learning_rate": 0.0004786617705262746,
"loss": 0.1979,
"step": 1725
},
{
"epoch": 1.435602953240361,
"grad_norm": 0.097678542137146,
"learning_rate": 0.000448454456111529,
"loss": 0.1957,
"step": 1750
},
{
"epoch": 1.4561115668580804,
"grad_norm": 0.09663794934749603,
"learning_rate": 0.0004189538521548524,
"loss": 0.2034,
"step": 1775
},
{
"epoch": 1.4766201804758,
"grad_norm": 0.09802096337080002,
"learning_rate": 0.00039019775859056916,
"loss": 0.1927,
"step": 1800
},
{
"epoch": 1.4766201804758,
"eval_loss": 0.2144840955734253,
"eval_runtime": 22.5527,
"eval_samples_per_second": 44.341,
"eval_steps_per_second": 0.709,
"step": 1800
},
{
"epoch": 1.4971287940935194,
"grad_norm": 0.09784867614507675,
"learning_rate": 0.0003622230213913836,
"loss": 0.1917,
"step": 1825
},
{
"epoch": 1.5176374077112387,
"grad_norm": 0.09651490300893784,
"learning_rate": 0.0003350654853566223,
"loss": 0.1944,
"step": 1850
},
{
"epoch": 1.5381460213289582,
"grad_norm": 0.10981660336256027,
"learning_rate": 0.00030875994818330957,
"loss": 0.1958,
"step": 1875
},
{
"epoch": 1.5586546349466777,
"grad_norm": 0.11364647001028061,
"learning_rate": 0.0002833401158789207,
"loss": 0.1985,
"step": 1900
},
{
"epoch": 1.5586546349466777,
"eval_loss": 0.21162408590316772,
"eval_runtime": 22.5558,
"eval_samples_per_second": 44.334,
"eval_steps_per_second": 0.709,
"step": 1900
},
{
"epoch": 1.579163248564397,
"grad_norm": 0.10284125059843063,
"learning_rate": 0.00025883855957295053,
"loss": 0.1977,
"step": 1925
},
{
"epoch": 1.5996718621821167,
"grad_norm": 0.1027180403470993,
"learning_rate": 0.0002352866737826277,
"loss": 0.1977,
"step": 1950
},
{
"epoch": 1.620180475799836,
"grad_norm": 0.0985800251364708,
"learning_rate": 0.00021271463618625986,
"loss": 0.1926,
"step": 1975
},
{
"epoch": 1.6406890894175554,
"grad_norm": 0.10515035688877106,
"learning_rate": 0.00019115136895574402,
"loss": 0.1991,
"step": 2000
},
{
"epoch": 1.6406890894175554,
"eval_loss": 0.20991793274879456,
"eval_runtime": 22.5479,
"eval_samples_per_second": 44.35,
"eval_steps_per_second": 0.71,
"step": 2000
},
{
"epoch": 1.661197703035275,
"grad_norm": 0.10674113035202026,
"learning_rate": 0.0001706245016977931,
"loss": 0.1886,
"step": 2025
},
{
"epoch": 1.6817063166529942,
"grad_norm": 0.11014382541179657,
"learning_rate": 0.00015116033605136182,
"loss": 0.191,
"step": 2050
},
{
"epoch": 1.7022149302707137,
"grad_norm": 0.11587057262659073,
"learning_rate": 0.00013278381198663492,
"loss": 0.1971,
"step": 2075
},
{
"epoch": 1.7227235438884332,
"grad_norm": 0.11261642724275589,
"learning_rate": 0.0001155184758487573,
"loss": 0.1868,
"step": 2100
},
{
"epoch": 1.7227235438884332,
"eval_loss": 0.20869949460029602,
"eval_runtime": 22.5509,
"eval_samples_per_second": 44.344,
"eval_steps_per_second": 0.71,
"step": 2100
},
{
"epoch": 1.7432321575061525,
"grad_norm": 0.0879945233464241,
"learning_rate": 9.938645018725523e-05,
"loss": 0.1903,
"step": 2125
},
{
"epoch": 1.7637407711238722,
"grad_norm": 0.08777210116386414,
"learning_rate": 8.440840540980587e-05,
"loss": 0.1882,
"step": 2150
},
{
"epoch": 1.7842493847415914,
"grad_norm": 0.09200013428926468,
"learning_rate": 7.060353329667668e-05,
"loss": 0.197,
"step": 2175
},
{
"epoch": 1.804757998359311,
"grad_norm": 0.09770681709051132,
"learning_rate": 5.798952240976951e-05,
"loss": 0.1905,
"step": 2200
},
{
"epoch": 1.804757998359311,
"eval_loss": 0.2074345052242279,
"eval_runtime": 22.5502,
"eval_samples_per_second": 44.345,
"eval_steps_per_second": 0.71,
"step": 2200
},
{
"epoch": 1.8252666119770304,
"grad_norm": 0.10603570193052292,
"learning_rate": 4.65825354277799e-05,
"loss": 0.191,
"step": 2225
},
{
"epoch": 1.8457752255947497,
"grad_norm": 0.09987975656986237,
"learning_rate": 3.639718843651363e-05,
"loss": 0.1925,
"step": 2250
},
{
"epoch": 1.8662838392124692,
"grad_norm": 0.09468022733926773,
"learning_rate": 2.7446532200894104e-05,
"loss": 0.1975,
"step": 2275
},
{
"epoch": 1.8867924528301887,
"grad_norm": 0.09474539756774902,
"learning_rate": 1.9742035442658403e-05,
"loss": 0.1902,
"step": 2300
},
{
"epoch": 1.8867924528301887,
"eval_loss": 0.20703136920928955,
"eval_runtime": 22.5531,
"eval_samples_per_second": 44.34,
"eval_steps_per_second": 0.709,
"step": 2300
},
{
"epoch": 1.907301066447908,
"grad_norm": 0.11737816035747528,
"learning_rate": 1.3293570145169742e-05,
"loss": 0.1983,
"step": 2325
},
{
"epoch": 1.9278096800656277,
"grad_norm": 0.09696778655052185,
"learning_rate": 8.109398904173282e-06,
"loss": 0.1836,
"step": 2350
},
{
"epoch": 1.948318293683347,
"grad_norm": 0.09188945591449738,
"learning_rate": 4.196164340705577e-06,
"loss": 0.1888,
"step": 2375
},
{
"epoch": 1.9688269073010665,
"grad_norm": 0.0918751060962677,
"learning_rate": 1.5588805897215342e-06,
"loss": 0.19,
"step": 2400
},
{
"epoch": 1.9688269073010665,
"eval_loss": 0.20687735080718994,
"eval_runtime": 22.5488,
"eval_samples_per_second": 44.348,
"eval_steps_per_second": 0.71,
"step": 2400
},
{
"epoch": 1.989335520918786,
"grad_norm": 0.09456487745046616,
"learning_rate": 2.0092687534589705e-07,
"loss": 0.1866,
"step": 2425
},
{
"epoch": 2.0,
"step": 2438,
"total_flos": 1.58523627405312e+18,
"train_loss": 0.25376511950097774,
"train_runtime": 3753.3089,
"train_samples_per_second": 20.782,
"train_steps_per_second": 0.65
}
],
"logging_steps": 25,
"max_steps": 2438,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.58523627405312e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}