| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 100, |
| "global_step": 2438, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.020508613617719443, |
| "grad_norm": 0.1543785035610199, |
| "learning_rate": 0.00019672131147540983, |
| "loss": 0.6266, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.04101722723543889, |
| "grad_norm": 0.2001742422580719, |
| "learning_rate": 0.00040163934426229507, |
| "loss": 0.3858, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06152584085315833, |
| "grad_norm": 0.14139799773693085, |
| "learning_rate": 0.0006065573770491804, |
| "loss": 0.3604, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08203445447087777, |
| "grad_norm": 0.20014838874340057, |
| "learning_rate": 0.0008114754098360656, |
| "loss": 0.3285, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08203445447087777, |
| "eval_loss": 0.3202356696128845, |
| "eval_runtime": 22.7105, |
| "eval_samples_per_second": 44.033, |
| "eval_steps_per_second": 0.705, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.10254306808859721, |
| "grad_norm": 0.2310003638267517, |
| "learning_rate": 0.0010163934426229509, |
| "loss": 0.341, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.12305168170631665, |
| "grad_norm": 0.3138357102870941, |
| "learning_rate": 0.001221311475409836, |
| "loss": 0.3325, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.1435602953240361, |
| "grad_norm": 1.425217628479004, |
| "learning_rate": 0.0014262295081967215, |
| "loss": 1.0626, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.16406890894175555, |
| "grad_norm": 0.6279019713401794, |
| "learning_rate": 0.0016311475409836065, |
| "loss": 0.4087, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16406890894175555, |
| "eval_loss": 0.38264134526252747, |
| "eval_runtime": 22.5381, |
| "eval_samples_per_second": 44.369, |
| "eval_steps_per_second": 0.71, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.184577522559475, |
| "grad_norm": 0.38462212681770325, |
| "learning_rate": 0.0018360655737704918, |
| "loss": 0.4034, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.20508613617719443, |
| "grad_norm": 0.3672288954257965, |
| "learning_rate": 0.0019999743708232127, |
| "loss": 0.3633, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.22559474979491387, |
| "grad_norm": 0.3607560694217682, |
| "learning_rate": 0.0019990774875676054, |
| "loss": 0.3487, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.2461033634126333, |
| "grad_norm": 0.3346173167228699, |
| "learning_rate": 0.001996900458879386, |
| "loss": 0.3371, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2461033634126333, |
| "eval_loss": 0.3209039270877838, |
| "eval_runtime": 22.5783, |
| "eval_samples_per_second": 44.29, |
| "eval_steps_per_second": 0.709, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2666119770303528, |
| "grad_norm": 0.22805160284042358, |
| "learning_rate": 0.001993446074245224, |
| "loss": 0.3296, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.2871205906480722, |
| "grad_norm": 0.18590985238552094, |
| "learning_rate": 0.0019887187598630527, |
| "loss": 0.3134, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.30762920426579166, |
| "grad_norm": 0.21088963747024536, |
| "learning_rate": 0.0019827245729706648, |
| "loss": 0.3199, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.3281378178835111, |
| "grad_norm": 0.18156805634498596, |
| "learning_rate": 0.0019754711940844047, |
| "loss": 0.2996, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.3281378178835111, |
| "eval_loss": 0.28689703345298767, |
| "eval_runtime": 22.5553, |
| "eval_samples_per_second": 44.335, |
| "eval_steps_per_second": 0.709, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.34864643150123054, |
| "grad_norm": 0.15950609743595123, |
| "learning_rate": 0.0019669679171579117, |
| "loss": 0.3044, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.36915504511895, |
| "grad_norm": 0.16445936262607574, |
| "learning_rate": 0.001957225637673524, |
| "loss": 0.3019, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.3896636587366694, |
| "grad_norm": 0.16957086324691772, |
| "learning_rate": 0.0019462568386815961, |
| "loss": 0.2863, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.41017227235438886, |
| "grad_norm": 0.12954926490783691, |
| "learning_rate": 0.0019340755748056234, |
| "loss": 0.2701, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.41017227235438886, |
| "eval_loss": 0.2733325660228729, |
| "eval_runtime": 22.5518, |
| "eval_samples_per_second": 44.342, |
| "eval_steps_per_second": 0.709, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4306808859721083, |
| "grad_norm": 0.1369732916355133, |
| "learning_rate": 0.0019206974542336672, |
| "loss": 0.271, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.45118949958982774, |
| "grad_norm": 0.15917326509952545, |
| "learning_rate": 0.0019061396187191563, |
| "loss": 0.2802, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.4716981132075472, |
| "grad_norm": 0.16746191680431366, |
| "learning_rate": 0.0018904207216166836, |
| "loss": 0.2691, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.4922067268252666, |
| "grad_norm": 0.1554066687822342, |
| "learning_rate": 0.001873560903980955, |
| "loss": 0.286, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4922067268252666, |
| "eval_loss": 0.26212847232818604, |
| "eval_runtime": 22.5566, |
| "eval_samples_per_second": 44.333, |
| "eval_steps_per_second": 0.709, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.5127153404429861, |
| "grad_norm": 0.13422970473766327, |
| "learning_rate": 0.0018555817687594984, |
| "loss": 0.2655, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.5332239540607056, |
| "grad_norm": 0.14770525693893433, |
| "learning_rate": 0.0018365063531122169, |
| "loss": 0.26, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.5537325676784249, |
| "grad_norm": 0.12729965150356293, |
| "learning_rate": 0.0018163590988932402, |
| "loss": 0.2694, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.5742411812961444, |
| "grad_norm": 0.1334213763475418, |
| "learning_rate": 0.0017951658213329078, |
| "loss": 0.268, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5742411812961444, |
| "eval_loss": 0.25402218103408813, |
| "eval_runtime": 22.5274, |
| "eval_samples_per_second": 44.39, |
| "eval_steps_per_second": 0.71, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5947497949138638, |
| "grad_norm": 0.12552621960639954, |
| "learning_rate": 0.0017729536759600033, |
| "loss": 0.266, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.6152584085315833, |
| "grad_norm": 0.13384173810482025, |
| "learning_rate": 0.0017497511238066307, |
| "loss": 0.2631, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.6357670221493027, |
| "grad_norm": 0.12846872210502625, |
| "learning_rate": 0.00172558789494031, |
| "loss": 0.2588, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.6562756357670222, |
| "grad_norm": 0.16066329181194305, |
| "learning_rate": 0.0017004949503700284, |
| "loss": 0.2636, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.6562756357670222, |
| "eval_loss": 0.24892009794712067, |
| "eval_runtime": 22.5749, |
| "eval_samples_per_second": 44.297, |
| "eval_steps_per_second": 0.709, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.6767842493847416, |
| "grad_norm": 0.11756884306669235, |
| "learning_rate": 0.0016745044423750449, |
| "loss": 0.2563, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.6972928630024611, |
| "grad_norm": 0.1094069853425026, |
| "learning_rate": 0.0016476496733072946, |
| "loss": 0.2581, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.7178014766201805, |
| "grad_norm": 0.10113517194986343, |
| "learning_rate": 0.0016199650529201684, |
| "loss": 0.2466, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.7383100902379, |
| "grad_norm": 0.12762148678302765, |
| "learning_rate": 0.0015914860542783522, |
| "loss": 0.2511, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.7383100902379, |
| "eval_loss": 0.24198263883590698, |
| "eval_runtime": 22.5893, |
| "eval_samples_per_second": 44.269, |
| "eval_steps_per_second": 0.708, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.7588187038556193, |
| "grad_norm": 0.1412491798400879, |
| "learning_rate": 0.0015622491683052124, |
| "loss": 0.2538, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.7793273174733388, |
| "grad_norm": 0.1309656947851181, |
| "learning_rate": 0.0015322918570259759, |
| "loss": 0.2417, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.7998359310910582, |
| "grad_norm": 0.12559030950069427, |
| "learning_rate": 0.0015016525055666057, |
| "loss": 0.2498, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.8203445447087777, |
| "grad_norm": 0.12614794075489044, |
| "learning_rate": 0.001470370372969886, |
| "loss": 0.2417, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8203445447087777, |
| "eval_loss": 0.2378261834383011, |
| "eval_runtime": 22.5548, |
| "eval_samples_per_second": 44.336, |
| "eval_steps_per_second": 0.709, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8408531583264971, |
| "grad_norm": 0.134114608168602, |
| "learning_rate": 0.0014384855418917311, |
| "loss": 0.2452, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.8613617719442166, |
| "grad_norm": 0.11434811353683472, |
| "learning_rate": 0.0014060388672421775, |
| "loss": 0.2412, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.881870385561936, |
| "grad_norm": 0.11284555494785309, |
| "learning_rate": 0.0013730719238368662, |
| "loss": 0.245, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.9023789991796555, |
| "grad_norm": 0.13569487631320953, |
| "learning_rate": 0.0013396269531260867, |
| "loss": 0.246, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.9023789991796555, |
| "eval_loss": 0.2345089465379715, |
| "eval_runtime": 22.5349, |
| "eval_samples_per_second": 44.376, |
| "eval_steps_per_second": 0.71, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.9228876127973749, |
| "grad_norm": 0.09805800765752792, |
| "learning_rate": 0.0013057468090696496, |
| "loss": 0.2414, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.9433962264150944, |
| "grad_norm": 0.0931050032377243, |
| "learning_rate": 0.0012714749032269287, |
| "loss": 0.2404, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.9639048400328137, |
| "grad_norm": 0.10308840870857239, |
| "learning_rate": 0.0012368551491324358, |
| "loss": 0.245, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.9844134536505332, |
| "grad_norm": 0.10258302837610245, |
| "learning_rate": 0.0012019319060282063, |
| "loss": 0.2509, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.9844134536505332, |
| "eval_loss": 0.22989174723625183, |
| "eval_runtime": 22.5404, |
| "eval_samples_per_second": 44.365, |
| "eval_steps_per_second": 0.71, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.0049220672682526, |
| "grad_norm": 0.11619790643453598, |
| "learning_rate": 0.0011667499220250803, |
| "loss": 0.2302, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.0254306808859721, |
| "grad_norm": 0.11248350143432617, |
| "learning_rate": 0.0011313542767657204, |
| "loss": 0.2105, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.0459392945036916, |
| "grad_norm": 0.12309166043996811, |
| "learning_rate": 0.0010957903236628267, |
| "loss": 0.2114, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.066447908121411, |
| "grad_norm": 0.10280752182006836, |
| "learning_rate": 0.001060103631786563, |
| "loss": 0.2138, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.066447908121411, |
| "eval_loss": 0.2270548790693283, |
| "eval_runtime": 22.5165, |
| "eval_samples_per_second": 44.412, |
| "eval_steps_per_second": 0.711, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.0869565217391304, |
| "grad_norm": 0.12517733871936798, |
| "learning_rate": 0.0010243399274756564, |
| "loss": 0.2111, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.1074651353568499, |
| "grad_norm": 0.09303736686706543, |
| "learning_rate": 0.0009885450357469806, |
| "loss": 0.2043, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.1279737489745694, |
| "grad_norm": 0.11305887997150421, |
| "learning_rate": 0.0009527648215787065, |
| "loss": 0.2057, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.1484823625922886, |
| "grad_norm": 0.1038450226187706, |
| "learning_rate": 0.000917045131142242, |
| "loss": 0.1984, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.1484823625922886, |
| "eval_loss": 0.22505907714366913, |
| "eval_runtime": 22.5688, |
| "eval_samples_per_second": 44.309, |
| "eval_steps_per_second": 0.709, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.1689909762100081, |
| "grad_norm": 0.10388393700122833, |
| "learning_rate": 0.0008814317330582753, |
| "loss": 0.2092, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.1894995898277276, |
| "grad_norm": 0.1294604390859604, |
| "learning_rate": 0.000845970259752183, |
| "loss": 0.2107, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.2100082034454471, |
| "grad_norm": 0.1135207936167717, |
| "learning_rate": 0.0008107061489839498, |
| "loss": 0.2069, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.2305168170631666, |
| "grad_norm": 0.11284071952104568, |
| "learning_rate": 0.0007756845856275194, |
| "loss": 0.2169, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2305168170631666, |
| "eval_loss": 0.2215997278690338, |
| "eval_runtime": 22.56, |
| "eval_samples_per_second": 44.326, |
| "eval_steps_per_second": 0.709, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.251025430680886, |
| "grad_norm": 0.1196022480726242, |
| "learning_rate": 0.0007409504437741722, |
| "loss": 0.21, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.2715340442986054, |
| "grad_norm": 0.10071329027414322, |
| "learning_rate": 0.0007065482292341205, |
| "loss": 0.2027, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.2920426579163249, |
| "grad_norm": 0.09619199484586716, |
| "learning_rate": 0.0006725220225099911, |
| "loss": 0.2052, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.3125512715340442, |
| "grad_norm": 0.09156788140535355, |
| "learning_rate": 0.0006389154223152666, |
| "loss": 0.1987, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.3125512715340442, |
| "eval_loss": 0.21827217936515808, |
| "eval_runtime": 22.5451, |
| "eval_samples_per_second": 44.356, |
| "eval_steps_per_second": 0.71, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.3330598851517639, |
| "grad_norm": 0.09059920907020569, |
| "learning_rate": 0.0006057714897100551, |
| "loss": 0.201, |
| "step": 1625 |
| }, |
| { |
| "epoch": 1.3535684987694832, |
| "grad_norm": 0.10633113235235214, |
| "learning_rate": 0.0005731326929257713, |
| "loss": 0.2022, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.3740771123872026, |
| "grad_norm": 0.10689054429531097, |
| "learning_rate": 0.0005410408529494251, |
| "loss": 0.2001, |
| "step": 1675 |
| }, |
| { |
| "epoch": 1.3945857260049221, |
| "grad_norm": 0.10712600499391556, |
| "learning_rate": 0.0005095370899372412, |
| "loss": 0.2002, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3945857260049221, |
| "eval_loss": 0.2159736305475235, |
| "eval_runtime": 22.541, |
| "eval_samples_per_second": 44.364, |
| "eval_steps_per_second": 0.71, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.4150943396226414, |
| "grad_norm": 0.0958191528916359, |
| "learning_rate": 0.0004786617705262746, |
| "loss": 0.1979, |
| "step": 1725 |
| }, |
| { |
| "epoch": 1.435602953240361, |
| "grad_norm": 0.097678542137146, |
| "learning_rate": 0.000448454456111529, |
| "loss": 0.1957, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.4561115668580804, |
| "grad_norm": 0.09663794934749603, |
| "learning_rate": 0.0004189538521548524, |
| "loss": 0.2034, |
| "step": 1775 |
| }, |
| { |
| "epoch": 1.4766201804758, |
| "grad_norm": 0.09802096337080002, |
| "learning_rate": 0.00039019775859056916, |
| "loss": 0.1927, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.4766201804758, |
| "eval_loss": 0.2144840955734253, |
| "eval_runtime": 22.5527, |
| "eval_samples_per_second": 44.341, |
| "eval_steps_per_second": 0.709, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.4971287940935194, |
| "grad_norm": 0.09784867614507675, |
| "learning_rate": 0.0003622230213913836, |
| "loss": 0.1917, |
| "step": 1825 |
| }, |
| { |
| "epoch": 1.5176374077112387, |
| "grad_norm": 0.09651490300893784, |
| "learning_rate": 0.0003350654853566223, |
| "loss": 0.1944, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.5381460213289582, |
| "grad_norm": 0.10981660336256027, |
| "learning_rate": 0.00030875994818330957, |
| "loss": 0.1958, |
| "step": 1875 |
| }, |
| { |
| "epoch": 1.5586546349466777, |
| "grad_norm": 0.11364647001028061, |
| "learning_rate": 0.0002833401158789207, |
| "loss": 0.1985, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.5586546349466777, |
| "eval_loss": 0.21162408590316772, |
| "eval_runtime": 22.5558, |
| "eval_samples_per_second": 44.334, |
| "eval_steps_per_second": 0.709, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.579163248564397, |
| "grad_norm": 0.10284125059843063, |
| "learning_rate": 0.00025883855957295053, |
| "loss": 0.1977, |
| "step": 1925 |
| }, |
| { |
| "epoch": 1.5996718621821167, |
| "grad_norm": 0.1027180403470993, |
| "learning_rate": 0.0002352866737826277, |
| "loss": 0.1977, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.620180475799836, |
| "grad_norm": 0.0985800251364708, |
| "learning_rate": 0.00021271463618625986, |
| "loss": 0.1926, |
| "step": 1975 |
| }, |
| { |
| "epoch": 1.6406890894175554, |
| "grad_norm": 0.10515035688877106, |
| "learning_rate": 0.00019115136895574402, |
| "loss": 0.1991, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6406890894175554, |
| "eval_loss": 0.20991793274879456, |
| "eval_runtime": 22.5479, |
| "eval_samples_per_second": 44.35, |
| "eval_steps_per_second": 0.71, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.661197703035275, |
| "grad_norm": 0.10674113035202026, |
| "learning_rate": 0.0001706245016977931, |
| "loss": 0.1886, |
| "step": 2025 |
| }, |
| { |
| "epoch": 1.6817063166529942, |
| "grad_norm": 0.11014382541179657, |
| "learning_rate": 0.00015116033605136182, |
| "loss": 0.191, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.7022149302707137, |
| "grad_norm": 0.11587057262659073, |
| "learning_rate": 0.00013278381198663492, |
| "loss": 0.1971, |
| "step": 2075 |
| }, |
| { |
| "epoch": 1.7227235438884332, |
| "grad_norm": 0.11261642724275589, |
| "learning_rate": 0.0001155184758487573, |
| "loss": 0.1868, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.7227235438884332, |
| "eval_loss": 0.20869949460029602, |
| "eval_runtime": 22.5509, |
| "eval_samples_per_second": 44.344, |
| "eval_steps_per_second": 0.71, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.7432321575061525, |
| "grad_norm": 0.0879945233464241, |
| "learning_rate": 9.938645018725523e-05, |
| "loss": 0.1903, |
| "step": 2125 |
| }, |
| { |
| "epoch": 1.7637407711238722, |
| "grad_norm": 0.08777210116386414, |
| "learning_rate": 8.440840540980587e-05, |
| "loss": 0.1882, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.7842493847415914, |
| "grad_norm": 0.09200013428926468, |
| "learning_rate": 7.060353329667668e-05, |
| "loss": 0.197, |
| "step": 2175 |
| }, |
| { |
| "epoch": 1.804757998359311, |
| "grad_norm": 0.09770681709051132, |
| "learning_rate": 5.798952240976951e-05, |
| "loss": 0.1905, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.804757998359311, |
| "eval_loss": 0.2074345052242279, |
| "eval_runtime": 22.5502, |
| "eval_samples_per_second": 44.345, |
| "eval_steps_per_second": 0.71, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.8252666119770304, |
| "grad_norm": 0.10603570193052292, |
| "learning_rate": 4.65825354277799e-05, |
| "loss": 0.191, |
| "step": 2225 |
| }, |
| { |
| "epoch": 1.8457752255947497, |
| "grad_norm": 0.09987975656986237, |
| "learning_rate": 3.639718843651363e-05, |
| "loss": 0.1925, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.8662838392124692, |
| "grad_norm": 0.09468022733926773, |
| "learning_rate": 2.7446532200894104e-05, |
| "loss": 0.1975, |
| "step": 2275 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "grad_norm": 0.09474539756774902, |
| "learning_rate": 1.9742035442658403e-05, |
| "loss": 0.1902, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "eval_loss": 0.20703136920928955, |
| "eval_runtime": 22.5531, |
| "eval_samples_per_second": 44.34, |
| "eval_steps_per_second": 0.709, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.907301066447908, |
| "grad_norm": 0.11737816035747528, |
| "learning_rate": 1.3293570145169742e-05, |
| "loss": 0.1983, |
| "step": 2325 |
| }, |
| { |
| "epoch": 1.9278096800656277, |
| "grad_norm": 0.09696778655052185, |
| "learning_rate": 8.109398904173282e-06, |
| "loss": 0.1836, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.948318293683347, |
| "grad_norm": 0.09188945591449738, |
| "learning_rate": 4.196164340705577e-06, |
| "loss": 0.1888, |
| "step": 2375 |
| }, |
| { |
| "epoch": 1.9688269073010665, |
| "grad_norm": 0.0918751060962677, |
| "learning_rate": 1.5588805897215342e-06, |
| "loss": 0.19, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.9688269073010665, |
| "eval_loss": 0.20687735080718994, |
| "eval_runtime": 22.5488, |
| "eval_samples_per_second": 44.348, |
| "eval_steps_per_second": 0.71, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.989335520918786, |
| "grad_norm": 0.09456487745046616, |
| "learning_rate": 2.0092687534589705e-07, |
| "loss": 0.1866, |
| "step": 2425 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 2438, |
| "total_flos": 1.58523627405312e+18, |
| "train_loss": 0.25376511950097774, |
| "train_runtime": 3753.3089, |
| "train_samples_per_second": 20.782, |
| "train_steps_per_second": 0.65 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 2438, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.58523627405312e+18, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|