| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9212295869356388, |
| "eval_steps": 30, |
| "global_step": 4000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01, |
| "grad_norm": 2.113042176528739, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 1.7509, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.01, |
| "eval_loss": 1.228292465209961, |
| "eval_runtime": 0.6159, |
| "eval_samples_per_second": 6.495, |
| "eval_steps_per_second": 1.624, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 1.661469579110867, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 1.1478, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.03, |
| "eval_loss": 1.0729522705078125, |
| "eval_runtime": 0.5568, |
| "eval_samples_per_second": 7.183, |
| "eval_steps_per_second": 1.796, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 1.4443362785996876, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 1.0223, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_loss": 1.0873539447784424, |
| "eval_runtime": 0.5571, |
| "eval_samples_per_second": 7.18, |
| "eval_steps_per_second": 1.795, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.4587251002054071, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 1.0156, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.06, |
| "eval_loss": 1.1211045980453491, |
| "eval_runtime": 0.5582, |
| "eval_samples_per_second": 7.166, |
| "eval_steps_per_second": 1.792, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 1.2847131983132847, |
| "learning_rate": 2.4e-05, |
| "loss": 1.0016, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.07, |
| "eval_loss": 1.1070358753204346, |
| "eval_runtime": 0.5598, |
| "eval_samples_per_second": 7.146, |
| "eval_steps_per_second": 1.786, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 1.3382431357490099, |
| "learning_rate": 2.8800000000000002e-05, |
| "loss": 0.9996, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.09, |
| "eval_loss": 1.0517526865005493, |
| "eval_runtime": 0.5568, |
| "eval_samples_per_second": 7.184, |
| "eval_steps_per_second": 1.796, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 1.2345853900999129, |
| "learning_rate": 3.3600000000000004e-05, |
| "loss": 1.0139, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.1, |
| "eval_loss": 1.0749964714050293, |
| "eval_runtime": 0.5592, |
| "eval_samples_per_second": 7.153, |
| "eval_steps_per_second": 1.788, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 1.2244044739268343, |
| "learning_rate": 3.8400000000000005e-05, |
| "loss": 1.0034, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_loss": 1.0613545179367065, |
| "eval_runtime": 0.5589, |
| "eval_samples_per_second": 7.157, |
| "eval_steps_per_second": 1.789, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.0431475291904868, |
| "learning_rate": 3.999939500769685e-05, |
| "loss": 1.0113, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.13, |
| "eval_loss": 1.1293017864227295, |
| "eval_runtime": 0.5624, |
| "eval_samples_per_second": 7.113, |
| "eval_steps_per_second": 1.778, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 1.1355194938997377, |
| "learning_rate": 3.9996218898187263e-05, |
| "loss": 0.9819, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.14, |
| "eval_loss": 1.151803970336914, |
| "eval_runtime": 0.5544, |
| "eval_samples_per_second": 7.214, |
| "eval_steps_per_second": 1.804, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 1.002096318645778, |
| "learning_rate": 3.999032085516322e-05, |
| "loss": 0.9966, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_loss": 1.1503264904022217, |
| "eval_runtime": 0.5581, |
| "eval_samples_per_second": 7.168, |
| "eval_steps_per_second": 1.792, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 1.0441830545655453, |
| "learning_rate": 3.998170168148057e-05, |
| "loss": 0.993, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.17, |
| "eval_loss": 1.1149314641952515, |
| "eval_runtime": 0.5612, |
| "eval_samples_per_second": 7.128, |
| "eval_steps_per_second": 1.782, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.9894273776215953, |
| "learning_rate": 3.9970362550402e-05, |
| "loss": 1.0013, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.19, |
| "eval_loss": 1.137695550918579, |
| "eval_runtime": 0.5586, |
| "eval_samples_per_second": 7.161, |
| "eval_steps_per_second": 1.79, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.9860155816281625, |
| "learning_rate": 3.9956305005437365e-05, |
| "loss": 0.9879, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_loss": 1.1234393119812012, |
| "eval_runtime": 0.5579, |
| "eval_samples_per_second": 7.169, |
| "eval_steps_per_second": 1.792, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.9832922200849178, |
| "learning_rate": 3.993953096013357e-05, |
| "loss": 0.9752, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.22, |
| "eval_loss": 1.1383097171783447, |
| "eval_runtime": 0.5607, |
| "eval_samples_per_second": 7.134, |
| "eval_steps_per_second": 1.783, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.8913823988583212, |
| "learning_rate": 3.992004269781409e-05, |
| "loss": 0.9868, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.23, |
| "eval_loss": 1.1235936880111694, |
| "eval_runtime": 0.5602, |
| "eval_samples_per_second": 7.14, |
| "eval_steps_per_second": 1.785, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.8982861978725096, |
| "learning_rate": 3.989784287126818e-05, |
| "loss": 0.9803, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_loss": 1.1284420490264893, |
| "eval_runtime": 0.5589, |
| "eval_samples_per_second": 7.157, |
| "eval_steps_per_second": 1.789, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.9454352238454017, |
| "learning_rate": 3.987293450238972e-05, |
| "loss": 0.9898, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.26, |
| "eval_loss": 1.120229959487915, |
| "eval_runtime": 0.5547, |
| "eval_samples_per_second": 7.211, |
| "eval_steps_per_second": 1.803, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.9889512773502193, |
| "learning_rate": 3.984532098176592e-05, |
| "loss": 0.9699, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.27, |
| "eval_loss": 1.1161502599716187, |
| "eval_runtime": 0.5579, |
| "eval_samples_per_second": 7.17, |
| "eval_steps_per_second": 1.793, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.8664429324644183, |
| "learning_rate": 3.9815006068215785e-05, |
| "loss": 0.9648, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.29, |
| "eval_loss": 1.1602427959442139, |
| "eval_runtime": 0.5584, |
| "eval_samples_per_second": 7.163, |
| "eval_steps_per_second": 1.791, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.9808847012968992, |
| "learning_rate": 3.9781993888278404e-05, |
| "loss": 0.9846, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.3, |
| "eval_loss": 1.1806962490081787, |
| "eval_runtime": 0.5597, |
| "eval_samples_per_second": 7.147, |
| "eval_steps_per_second": 1.787, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.9314218284659608, |
| "learning_rate": 3.974628893565128e-05, |
| "loss": 0.9738, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_loss": 1.135533094406128, |
| "eval_runtime": 0.5539, |
| "eval_samples_per_second": 7.221, |
| "eval_steps_per_second": 1.805, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.9227397654305332, |
| "learning_rate": 3.970789607057863e-05, |
| "loss": 0.9585, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.33, |
| "eval_loss": 1.1579846143722534, |
| "eval_runtime": 0.5616, |
| "eval_samples_per_second": 7.122, |
| "eval_steps_per_second": 1.781, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.9125734661246961, |
| "learning_rate": 3.9666820519189786e-05, |
| "loss": 0.9495, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.35, |
| "eval_loss": 1.1361984014511108, |
| "eval_runtime": 0.5527, |
| "eval_samples_per_second": 7.237, |
| "eval_steps_per_second": 1.809, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.8758759988091239, |
| "learning_rate": 3.962306787278781e-05, |
| "loss": 0.9618, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.36, |
| "eval_loss": 1.1645328998565674, |
| "eval_runtime": 0.5589, |
| "eval_samples_per_second": 7.157, |
| "eval_steps_per_second": 1.789, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.8846060065129696, |
| "learning_rate": 3.957664408708839e-05, |
| "loss": 0.9677, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.37, |
| "eval_loss": 1.1109530925750732, |
| "eval_runtime": 0.5585, |
| "eval_samples_per_second": 7.162, |
| "eval_steps_per_second": 1.791, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.8483263644995762, |
| "learning_rate": 3.9527555481409144e-05, |
| "loss": 0.9498, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.39, |
| "eval_loss": 1.140505075454712, |
| "eval_runtime": 0.5565, |
| "eval_samples_per_second": 7.188, |
| "eval_steps_per_second": 1.797, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.9019276325326514, |
| "learning_rate": 3.947580873780938e-05, |
| "loss": 0.963, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_loss": 1.140986442565918, |
| "eval_runtime": 0.5552, |
| "eval_samples_per_second": 7.205, |
| "eval_steps_per_second": 1.801, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.8514201087836062, |
| "learning_rate": 3.942141090018057e-05, |
| "loss": 0.9384, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.42, |
| "eval_loss": 1.1474781036376953, |
| "eval_runtime": 0.5558, |
| "eval_samples_per_second": 7.197, |
| "eval_steps_per_second": 1.799, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.9146315304499618, |
| "learning_rate": 3.9364369373287495e-05, |
| "loss": 0.9631, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.43, |
| "eval_loss": 1.177463173866272, |
| "eval_runtime": 0.561, |
| "eval_samples_per_second": 7.13, |
| "eval_steps_per_second": 1.783, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.802994086627822, |
| "learning_rate": 3.930469192176029e-05, |
| "loss": 0.9515, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.45, |
| "eval_loss": 1.1355152130126953, |
| "eval_runtime": 0.5569, |
| "eval_samples_per_second": 7.182, |
| "eval_steps_per_second": 1.796, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.8990934982164889, |
| "learning_rate": 3.9242386669037495e-05, |
| "loss": 0.9578, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.46, |
| "eval_loss": 1.1421607732772827, |
| "eval_runtime": 0.5609, |
| "eval_samples_per_second": 7.131, |
| "eval_steps_per_second": 1.783, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.8585673645933002, |
| "learning_rate": 3.9177462096260285e-05, |
| "loss": 0.9415, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_loss": 1.1663556098937988, |
| "eval_runtime": 0.558, |
| "eval_samples_per_second": 7.169, |
| "eval_steps_per_second": 1.792, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.7615696788440944, |
| "learning_rate": 3.9109927041118005e-05, |
| "loss": 0.9387, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.49, |
| "eval_loss": 1.107798457145691, |
| "eval_runtime": 0.5589, |
| "eval_samples_per_second": 7.157, |
| "eval_steps_per_second": 1.789, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.8343416810061398, |
| "learning_rate": 3.903979069664514e-05, |
| "loss": 0.9338, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 1.1265822649002075, |
| "eval_runtime": 0.5573, |
| "eval_samples_per_second": 7.177, |
| "eval_steps_per_second": 1.794, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.8787917026949466, |
| "learning_rate": 3.896706260996995e-05, |
| "loss": 0.9499, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.52, |
| "eval_loss": 1.1402231454849243, |
| "eval_runtime": 0.5556, |
| "eval_samples_per_second": 7.199, |
| "eval_steps_per_second": 1.8, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.8146261574653946, |
| "learning_rate": 3.8891752681014904e-05, |
| "loss": 0.9511, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.53, |
| "eval_loss": 1.1247859001159668, |
| "eval_runtime": 0.5614, |
| "eval_samples_per_second": 7.125, |
| "eval_steps_per_second": 1.781, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.8346263053254463, |
| "learning_rate": 3.8813871161149053e-05, |
| "loss": 0.9459, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.55, |
| "eval_loss": 1.155929446220398, |
| "eval_runtime": 0.5621, |
| "eval_samples_per_second": 7.117, |
| "eval_steps_per_second": 1.779, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.8766772464417112, |
| "learning_rate": 3.873342865179259e-05, |
| "loss": 0.9293, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_loss": 1.056406855583191, |
| "eval_runtime": 0.5534, |
| "eval_samples_per_second": 7.228, |
| "eval_steps_per_second": 1.807, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.7763657825441121, |
| "learning_rate": 3.86504361029738e-05, |
| "loss": 0.9322, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.58, |
| "eval_loss": 1.0825164318084717, |
| "eval_runtime": 0.5572, |
| "eval_samples_per_second": 7.179, |
| "eval_steps_per_second": 1.795, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 0.8387649732582555, |
| "learning_rate": 3.856490481183847e-05, |
| "loss": 0.924, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.59, |
| "eval_loss": 1.1075164079666138, |
| "eval_runtime": 0.5568, |
| "eval_samples_per_second": 7.184, |
| "eval_steps_per_second": 1.796, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 0.7896889685040874, |
| "learning_rate": 3.8476846421112096e-05, |
| "loss": 0.9277, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.61, |
| "eval_loss": 1.1195158958435059, |
| "eval_runtime": 0.5591, |
| "eval_samples_per_second": 7.154, |
| "eval_steps_per_second": 1.789, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 0.844041645637629, |
| "learning_rate": 3.8386272917515086e-05, |
| "loss": 0.9311, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.62, |
| "eval_loss": 1.1050487756729126, |
| "eval_runtime": 0.5585, |
| "eval_samples_per_second": 7.162, |
| "eval_steps_per_second": 1.79, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 0.7681851654349889, |
| "learning_rate": 3.829319663013106e-05, |
| "loss": 0.9192, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.63, |
| "eval_loss": 1.0749808549880981, |
| "eval_runtime": 0.5609, |
| "eval_samples_per_second": 7.132, |
| "eval_steps_per_second": 1.783, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.8605223155980966, |
| "learning_rate": 3.819763022872861e-05, |
| "loss": 0.9442, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.65, |
| "eval_loss": 1.117114782333374, |
| "eval_runtime": 0.5601, |
| "eval_samples_per_second": 7.142, |
| "eval_steps_per_second": 1.785, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 0.7918110282272083, |
| "learning_rate": 3.809958672203663e-05, |
| "loss": 0.9285, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.66, |
| "eval_loss": 1.0994510650634766, |
| "eval_runtime": 0.5614, |
| "eval_samples_per_second": 7.125, |
| "eval_steps_per_second": 1.781, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.9217033677356604, |
| "learning_rate": 3.799907945597359e-05, |
| "loss": 0.9209, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.68, |
| "eval_loss": 1.0954455137252808, |
| "eval_runtime": 0.5629, |
| "eval_samples_per_second": 7.106, |
| "eval_steps_per_second": 1.777, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 0.8034429414026776, |
| "learning_rate": 3.789612211183079e-05, |
| "loss": 0.9118, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.69, |
| "eval_loss": 1.0785255432128906, |
| "eval_runtime": 0.5567, |
| "eval_samples_per_second": 7.185, |
| "eval_steps_per_second": 1.796, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 0.7842181753550963, |
| "learning_rate": 3.779072870441009e-05, |
| "loss": 0.9255, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.71, |
| "eval_loss": 1.058296799659729, |
| "eval_runtime": 0.555, |
| "eval_samples_per_second": 7.207, |
| "eval_steps_per_second": 1.802, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.7996464468442931, |
| "learning_rate": 3.768291358011613e-05, |
| "loss": 0.9351, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_loss": 1.0723786354064941, |
| "eval_runtime": 0.5664, |
| "eval_samples_per_second": 7.063, |
| "eval_steps_per_second": 1.766, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 0.7695046197488054, |
| "learning_rate": 3.757269141500352e-05, |
| "loss": 0.9284, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.73, |
| "eval_loss": 1.0894200801849365, |
| "eval_runtime": 0.5634, |
| "eval_samples_per_second": 7.1, |
| "eval_steps_per_second": 1.775, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.7721868573404526, |
| "learning_rate": 3.7460077212779035e-05, |
| "loss": 0.9164, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.75, |
| "eval_loss": 1.0501906871795654, |
| "eval_runtime": 0.5647, |
| "eval_samples_per_second": 7.084, |
| "eval_steps_per_second": 1.771, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.7866789547512432, |
| "learning_rate": 3.734508630275932e-05, |
| "loss": 0.9159, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.76, |
| "eval_loss": 1.064911127090454, |
| "eval_runtime": 0.5614, |
| "eval_samples_per_second": 7.125, |
| "eval_steps_per_second": 1.781, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 0.9455983803978201, |
| "learning_rate": 3.722773433778423e-05, |
| "loss": 0.9137, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.78, |
| "eval_loss": 1.0576999187469482, |
| "eval_runtime": 0.5617, |
| "eval_samples_per_second": 7.122, |
| "eval_steps_per_second": 1.78, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 0.813819105428533, |
| "learning_rate": 3.710803729208609e-05, |
| "loss": 0.9284, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.79, |
| "eval_loss": 1.0807015895843506, |
| "eval_runtime": 0.5652, |
| "eval_samples_per_second": 7.077, |
| "eval_steps_per_second": 1.769, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 0.8764908329115464, |
| "learning_rate": 3.6986011459115274e-05, |
| "loss": 0.9194, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.81, |
| "eval_loss": 1.1170496940612793, |
| "eval_runtime": 0.5626, |
| "eval_samples_per_second": 7.11, |
| "eval_steps_per_second": 1.778, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 0.7939056270057047, |
| "learning_rate": 3.6861673449322294e-05, |
| "loss": 0.9138, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.82, |
| "eval_loss": 1.043514609336853, |
| "eval_runtime": 0.5671, |
| "eval_samples_per_second": 7.053, |
| "eval_steps_per_second": 1.763, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.8041456979314675, |
| "learning_rate": 3.673504018789674e-05, |
| "loss": 0.9171, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.84, |
| "eval_loss": 1.056423544883728, |
| "eval_runtime": 0.5639, |
| "eval_samples_per_second": 7.093, |
| "eval_steps_per_second": 1.773, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.7753386719779899, |
| "learning_rate": 3.660612891246338e-05, |
| "loss": 0.9152, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.85, |
| "eval_loss": 1.0508358478546143, |
| "eval_runtime": 0.5642, |
| "eval_samples_per_second": 7.089, |
| "eval_steps_per_second": 1.772, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 0.7914888664595032, |
| "learning_rate": 3.6474957170735755e-05, |
| "loss": 0.9204, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.86, |
| "eval_loss": 1.0375233888626099, |
| "eval_runtime": 0.5611, |
| "eval_samples_per_second": 7.129, |
| "eval_steps_per_second": 1.782, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 0.7788736799431811, |
| "learning_rate": 3.634154281812753e-05, |
| "loss": 0.9156, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.88, |
| "eval_loss": 1.063939094543457, |
| "eval_runtime": 0.5619, |
| "eval_samples_per_second": 7.119, |
| "eval_steps_per_second": 1.78, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.7336792812810264, |
| "learning_rate": 3.620590401532194e-05, |
| "loss": 0.9014, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.89, |
| "eval_loss": 1.0614831447601318, |
| "eval_runtime": 0.5626, |
| "eval_samples_per_second": 7.109, |
| "eval_steps_per_second": 1.777, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 0.7967758593948572, |
| "learning_rate": 3.6068059225799775e-05, |
| "loss": 0.9137, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.91, |
| "eval_loss": 1.1221044063568115, |
| "eval_runtime": 0.5601, |
| "eval_samples_per_second": 7.142, |
| "eval_steps_per_second": 1.785, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 0.8091540055356147, |
| "learning_rate": 3.592802721332603e-05, |
| "loss": 0.9047, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.92, |
| "eval_loss": 1.113644003868103, |
| "eval_runtime": 0.564, |
| "eval_samples_per_second": 7.092, |
| "eval_steps_per_second": 1.773, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 0.7336591291121157, |
| "learning_rate": 3.5785827039395786e-05, |
| "loss": 0.9096, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.94, |
| "eval_loss": 1.1049702167510986, |
| "eval_runtime": 0.5518, |
| "eval_samples_per_second": 7.249, |
| "eval_steps_per_second": 1.812, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.9010245369928321, |
| "learning_rate": 3.5641478060639455e-05, |
| "loss": 0.9169, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.95, |
| "eval_loss": 1.0915415287017822, |
| "eval_runtime": 0.5617, |
| "eval_samples_per_second": 7.121, |
| "eval_steps_per_second": 1.78, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 0.7853376659918098, |
| "learning_rate": 3.549499992618796e-05, |
| "loss": 0.9154, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.97, |
| "eval_loss": 1.0784239768981934, |
| "eval_runtime": 0.5553, |
| "eval_samples_per_second": 7.203, |
| "eval_steps_per_second": 1.801, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 0.7556103808313751, |
| "learning_rate": 3.5346412574998034e-05, |
| "loss": 0.9125, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.98, |
| "eval_loss": 1.0577061176300049, |
| "eval_runtime": 0.559, |
| "eval_samples_per_second": 7.156, |
| "eval_steps_per_second": 1.789, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 0.7274181467358091, |
| "learning_rate": 3.519573623313807e-05, |
| "loss": 0.8991, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.99, |
| "eval_loss": 1.0430834293365479, |
| "eval_runtime": 0.5618, |
| "eval_samples_per_second": 7.12, |
| "eval_steps_per_second": 1.78, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 0.8444109296811467, |
| "learning_rate": 3.50429914110349e-05, |
| "loss": 0.7581, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.01, |
| "eval_loss": 1.156642198562622, |
| "eval_runtime": 0.5616, |
| "eval_samples_per_second": 7.123, |
| "eval_steps_per_second": 1.781, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.8177345871017364, |
| "learning_rate": 3.4888198900681877e-05, |
| "loss": 0.6328, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.02, |
| "eval_loss": 1.141777753829956, |
| "eval_runtime": 0.564, |
| "eval_samples_per_second": 7.092, |
| "eval_steps_per_second": 1.773, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 0.8337632078158784, |
| "learning_rate": 3.4731379772808616e-05, |
| "loss": 0.6323, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.04, |
| "eval_loss": 1.1533713340759277, |
| "eval_runtime": 0.5616, |
| "eval_samples_per_second": 7.123, |
| "eval_steps_per_second": 1.781, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.9013587928760545, |
| "learning_rate": 3.45725553740128e-05, |
| "loss": 0.6355, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.05, |
| "eval_loss": 1.1193175315856934, |
| "eval_runtime": 0.5622, |
| "eval_samples_per_second": 7.115, |
| "eval_steps_per_second": 1.779, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 0.9233940860823125, |
| "learning_rate": 3.441174732385441e-05, |
| "loss": 0.6318, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.07, |
| "eval_loss": 1.1091649532318115, |
| "eval_runtime": 0.5603, |
| "eval_samples_per_second": 7.139, |
| "eval_steps_per_second": 1.785, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.9317253505947287, |
| "learning_rate": 3.424897751191284e-05, |
| "loss": 0.6394, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.08, |
| "eval_loss": 1.117910623550415, |
| "eval_runtime": 0.557, |
| "eval_samples_per_second": 7.181, |
| "eval_steps_per_second": 1.795, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.8336539692567165, |
| "learning_rate": 3.4084268094807226e-05, |
| "loss": 0.635, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.1, |
| "eval_loss": 1.0903522968292236, |
| "eval_runtime": 0.5665, |
| "eval_samples_per_second": 7.061, |
| "eval_steps_per_second": 1.765, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 0.8077068468195864, |
| "learning_rate": 3.3917641493180435e-05, |
| "loss": 0.6439, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.11, |
| "eval_loss": 1.122206211090088, |
| "eval_runtime": 0.562, |
| "eval_samples_per_second": 7.118, |
| "eval_steps_per_second": 1.78, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 0.9314234159417707, |
| "learning_rate": 3.37491203886471e-05, |
| "loss": 0.6426, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.12, |
| "eval_loss": 1.1049784421920776, |
| "eval_runtime": 0.5556, |
| "eval_samples_per_second": 7.2, |
| "eval_steps_per_second": 1.8, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.8825401355042614, |
| "learning_rate": 3.357872772070618e-05, |
| "loss": 0.6433, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.14, |
| "eval_loss": 1.106252670288086, |
| "eval_runtime": 0.5551, |
| "eval_samples_per_second": 7.206, |
| "eval_steps_per_second": 1.801, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.9680535038973261, |
| "learning_rate": 3.340648668361834e-05, |
| "loss": 0.6335, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.15, |
| "eval_loss": 1.1095067262649536, |
| "eval_runtime": 0.5583, |
| "eval_samples_per_second": 7.165, |
| "eval_steps_per_second": 1.791, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 0.8476424452250282, |
| "learning_rate": 3.32324207232487e-05, |
| "loss": 0.6304, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.17, |
| "eval_loss": 1.0738136768341064, |
| "eval_runtime": 0.56, |
| "eval_samples_per_second": 7.143, |
| "eval_steps_per_second": 1.786, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 0.8761272684066781, |
| "learning_rate": 3.3056553533875356e-05, |
| "loss": 0.6356, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.18, |
| "eval_loss": 1.0922038555145264, |
| "eval_runtime": 0.5562, |
| "eval_samples_per_second": 7.192, |
| "eval_steps_per_second": 1.798, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.9176083769422828, |
| "learning_rate": 3.287890905496403e-05, |
| "loss": 0.6355, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_loss": 1.0953761339187622, |
| "eval_runtime": 0.5615, |
| "eval_samples_per_second": 7.124, |
| "eval_steps_per_second": 1.781, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.9429629467429161, |
| "learning_rate": 3.2699511467909366e-05, |
| "loss": 0.6443, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.21, |
| "eval_loss": 1.119952917098999, |
| "eval_runtime": 0.5641, |
| "eval_samples_per_second": 7.091, |
| "eval_steps_per_second": 1.773, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 0.8519677080597838, |
| "learning_rate": 3.251838519274333e-05, |
| "loss": 0.6383, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.22, |
| "eval_loss": 1.087357997894287, |
| "eval_runtime": 0.5587, |
| "eval_samples_per_second": 7.16, |
| "eval_steps_per_second": 1.79, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 0.8317415997019414, |
| "learning_rate": 3.2335554884811094e-05, |
| "loss": 0.6369, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.24, |
| "eval_loss": 1.0768706798553467, |
| "eval_runtime": 0.5612, |
| "eval_samples_per_second": 7.127, |
| "eval_steps_per_second": 1.782, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.9262932278505193, |
| "learning_rate": 3.215104543141483e-05, |
| "loss": 0.6411, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.25, |
| "eval_loss": 1.0815571546554565, |
| "eval_runtime": 0.5655, |
| "eval_samples_per_second": 7.073, |
| "eval_steps_per_second": 1.768, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.8965871561409847, |
| "learning_rate": 3.1964881948426095e-05, |
| "loss": 0.6427, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.27, |
| "eval_loss": 1.0796209573745728, |
| "eval_runtime": 0.5604, |
| "eval_samples_per_second": 7.137, |
| "eval_steps_per_second": 1.784, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.9342624097169002, |
| "learning_rate": 3.177708977686691e-05, |
| "loss": 0.6401, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.28, |
| "eval_loss": 1.0979222059249878, |
| "eval_runtime": 0.5628, |
| "eval_samples_per_second": 7.107, |
| "eval_steps_per_second": 1.777, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.8060621195627613, |
| "learning_rate": 3.1587694479460295e-05, |
| "loss": 0.6398, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.3, |
| "eval_loss": 1.090583086013794, |
| "eval_runtime": 0.5581, |
| "eval_samples_per_second": 7.167, |
| "eval_steps_per_second": 1.792, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 1.0120855676595921, |
| "learning_rate": 3.139672183715065e-05, |
| "loss": 0.6455, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.31, |
| "eval_loss": 1.076340913772583, |
| "eval_runtime": 0.5506, |
| "eval_samples_per_second": 7.265, |
| "eval_steps_per_second": 1.816, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 0.8279633860223412, |
| "learning_rate": 3.1204197845594374e-05, |
| "loss": 0.6404, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.33, |
| "eval_loss": 1.1073503494262695, |
| "eval_runtime": 0.5602, |
| "eval_samples_per_second": 7.14, |
| "eval_steps_per_second": 1.785, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.8557038781436789, |
| "learning_rate": 3.101014871162124e-05, |
| "loss": 0.6447, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.34, |
| "eval_loss": 1.1301687955856323, |
| "eval_runtime": 0.5575, |
| "eval_samples_per_second": 7.175, |
| "eval_steps_per_second": 1.794, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.8848283530229999, |
| "learning_rate": 3.0814600849667086e-05, |
| "loss": 0.6376, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.35, |
| "eval_loss": 1.0906646251678467, |
| "eval_runtime": 0.5606, |
| "eval_samples_per_second": 7.135, |
| "eval_steps_per_second": 1.784, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 0.820872374997884, |
| "learning_rate": 3.061758087817822e-05, |
| "loss": 0.6497, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.37, |
| "eval_loss": 1.0533320903778076, |
| "eval_runtime": 0.5607, |
| "eval_samples_per_second": 7.135, |
| "eval_steps_per_second": 1.784, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 0.8755792073523442, |
| "learning_rate": 3.041911561598807e-05, |
| "loss": 0.642, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.38, |
| "eval_loss": 1.0812216997146606, |
| "eval_runtime": 0.5593, |
| "eval_samples_per_second": 7.152, |
| "eval_steps_per_second": 1.788, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.8983195258860408, |
| "learning_rate": 3.021923207866648e-05, |
| "loss": 0.6408, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.4, |
| "eval_loss": 1.0946580171585083, |
| "eval_runtime": 0.5591, |
| "eval_samples_per_second": 7.154, |
| "eval_steps_per_second": 1.789, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 0.8244241913008498, |
| "learning_rate": 3.0017957474842335e-05, |
| "loss": 0.6331, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.41, |
| "eval_loss": 1.1103713512420654, |
| "eval_runtime": 0.5574, |
| "eval_samples_per_second": 7.176, |
| "eval_steps_per_second": 1.794, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 0.8791946563571723, |
| "learning_rate": 2.981531920249987e-05, |
| "loss": 0.6386, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.43, |
| "eval_loss": 1.0734190940856934, |
| "eval_runtime": 0.5621, |
| "eval_samples_per_second": 7.116, |
| "eval_steps_per_second": 1.779, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 0.8289043628753012, |
| "learning_rate": 2.9611344845249132e-05, |
| "loss": 0.6405, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.44, |
| "eval_loss": 1.0503255128860474, |
| "eval_runtime": 0.5557, |
| "eval_samples_per_second": 7.198, |
| "eval_steps_per_second": 1.799, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 0.8395749224486603, |
| "learning_rate": 2.940606216857129e-05, |
| "loss": 0.6402, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.46, |
| "eval_loss": 1.0524193048477173, |
| "eval_runtime": 0.558, |
| "eval_samples_per_second": 7.168, |
| "eval_steps_per_second": 1.792, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 0.8865250863433722, |
| "learning_rate": 2.9199499116039087e-05, |
| "loss": 0.6401, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.47, |
| "eval_loss": 1.0809640884399414, |
| "eval_runtime": 0.561, |
| "eval_samples_per_second": 7.13, |
| "eval_steps_per_second": 1.783, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 0.8913390931470981, |
| "learning_rate": 2.8991683805513136e-05, |
| "loss": 0.6447, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.48, |
| "eval_loss": 1.093366265296936, |
| "eval_runtime": 0.5594, |
| "eval_samples_per_second": 7.151, |
| "eval_steps_per_second": 1.788, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.8844075355973444, |
| "learning_rate": 2.8782644525314413e-05, |
| "loss": 0.6365, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.5, |
| "eval_loss": 1.0947942733764648, |
| "eval_runtime": 0.5548, |
| "eval_samples_per_second": 7.21, |
| "eval_steps_per_second": 1.803, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 0.8642897764721639, |
| "learning_rate": 2.857240973037359e-05, |
| "loss": 0.6376, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.51, |
| "eval_loss": 1.0931771993637085, |
| "eval_runtime": 0.5593, |
| "eval_samples_per_second": 7.152, |
| "eval_steps_per_second": 1.788, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 0.8141009218386974, |
| "learning_rate": 2.8361008038357708e-05, |
| "loss": 0.6285, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.53, |
| "eval_loss": 1.0984861850738525, |
| "eval_runtime": 0.5635, |
| "eval_samples_per_second": 7.099, |
| "eval_steps_per_second": 1.775, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 0.8151950623224837, |
| "learning_rate": 2.8148468225774625e-05, |
| "loss": 0.6341, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.54, |
| "eval_loss": 1.0807080268859863, |
| "eval_runtime": 0.562, |
| "eval_samples_per_second": 7.118, |
| "eval_steps_per_second": 1.779, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 0.8803614424792732, |
| "learning_rate": 2.793481922405593e-05, |
| "loss": 0.641, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.56, |
| "eval_loss": 1.0697028636932373, |
| "eval_runtime": 0.5591, |
| "eval_samples_per_second": 7.154, |
| "eval_steps_per_second": 1.789, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.57, |
| "grad_norm": 0.848561772550623, |
| "learning_rate": 2.7720090115618715e-05, |
| "loss": 0.6287, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.57, |
| "eval_loss": 1.1105608940124512, |
| "eval_runtime": 0.5603, |
| "eval_samples_per_second": 7.139, |
| "eval_steps_per_second": 1.785, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 0.8542270599907826, |
| "learning_rate": 2.7504310129906788e-05, |
| "loss": 0.6308, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.59, |
| "eval_loss": 1.1107990741729736, |
| "eval_runtime": 0.5584, |
| "eval_samples_per_second": 7.163, |
| "eval_steps_per_second": 1.791, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.8871952343064018, |
| "learning_rate": 2.7287508639411912e-05, |
| "loss": 0.6406, |
| "step": 3330 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_loss": 1.1208276748657227, |
| "eval_runtime": 0.5595, |
| "eval_samples_per_second": 7.149, |
| "eval_steps_per_second": 1.787, |
| "step": 3330 |
| }, |
| { |
| "epoch": 1.61, |
| "grad_norm": 0.8878981133253421, |
| "learning_rate": 2.7069715155675535e-05, |
| "loss": 0.6272, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.61, |
| "eval_loss": 1.1005425453186035, |
| "eval_runtime": 0.5591, |
| "eval_samples_per_second": 7.155, |
| "eval_steps_per_second": 1.789, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 0.9471314635259924, |
| "learning_rate": 2.6850959325271638e-05, |
| "loss": 0.629, |
| "step": 3390 |
| }, |
| { |
| "epoch": 1.63, |
| "eval_loss": 1.110257863998413, |
| "eval_runtime": 0.5602, |
| "eval_samples_per_second": 7.14, |
| "eval_steps_per_second": 1.785, |
| "step": 3390 |
| }, |
| { |
| "epoch": 1.64, |
| "grad_norm": 0.9208242310434547, |
| "learning_rate": 2.6631270925771122e-05, |
| "loss": 0.6405, |
| "step": 3420 |
| }, |
| { |
| "epoch": 1.64, |
| "eval_loss": 1.1000797748565674, |
| "eval_runtime": 0.5627, |
| "eval_samples_per_second": 7.109, |
| "eval_steps_per_second": 1.777, |
| "step": 3420 |
| }, |
| { |
| "epoch": 1.66, |
| "grad_norm": 0.9420955962817003, |
| "learning_rate": 2.641067986168846e-05, |
| "loss": 0.6364, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.66, |
| "eval_loss": 1.109868049621582, |
| "eval_runtime": 0.5577, |
| "eval_samples_per_second": 7.172, |
| "eval_steps_per_second": 1.793, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 0.8871860851228487, |
| "learning_rate": 2.6189216160411005e-05, |
| "loss": 0.6296, |
| "step": 3480 |
| }, |
| { |
| "epoch": 1.67, |
| "eval_loss": 1.1052864789962769, |
| "eval_runtime": 0.5634, |
| "eval_samples_per_second": 7.099, |
| "eval_steps_per_second": 1.775, |
| "step": 3480 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 0.8365048627589012, |
| "learning_rate": 2.5966909968111562e-05, |
| "loss": 0.6317, |
| "step": 3510 |
| }, |
| { |
| "epoch": 1.69, |
| "eval_loss": 1.1079487800598145, |
| "eval_runtime": 0.5609, |
| "eval_samples_per_second": 7.131, |
| "eval_steps_per_second": 1.783, |
| "step": 3510 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.8841333301772851, |
| "learning_rate": 2.5743791545644883e-05, |
| "loss": 0.621, |
| "step": 3540 |
| }, |
| { |
| "epoch": 1.7, |
| "eval_loss": 1.088465690612793, |
| "eval_runtime": 0.5579, |
| "eval_samples_per_second": 7.17, |
| "eval_steps_per_second": 1.793, |
| "step": 3540 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 0.9230289862932771, |
| "learning_rate": 2.551989126442842e-05, |
| "loss": 0.6381, |
| "step": 3570 |
| }, |
| { |
| "epoch": 1.71, |
| "eval_loss": 1.1277779340744019, |
| "eval_runtime": 0.5615, |
| "eval_samples_per_second": 7.123, |
| "eval_steps_per_second": 1.781, |
| "step": 3570 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 0.8940425169953033, |
| "learning_rate": 2.529523960230814e-05, |
| "loss": 0.6264, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.73, |
| "eval_loss": 1.1413404941558838, |
| "eval_runtime": 0.5588, |
| "eval_samples_per_second": 7.159, |
| "eval_steps_per_second": 1.79, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 0.9445347194998787, |
| "learning_rate": 2.506986713940977e-05, |
| "loss": 0.6397, |
| "step": 3630 |
| }, |
| { |
| "epoch": 1.74, |
| "eval_loss": 1.1199023723602295, |
| "eval_runtime": 0.5588, |
| "eval_samples_per_second": 7.158, |
| "eval_steps_per_second": 1.789, |
| "step": 3630 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 0.8863129784510786, |
| "learning_rate": 2.4843804553976195e-05, |
| "loss": 0.6292, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.76, |
| "eval_loss": 1.1415538787841797, |
| "eval_runtime": 0.5599, |
| "eval_samples_per_second": 7.145, |
| "eval_steps_per_second": 1.786, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 0.8478522423707957, |
| "learning_rate": 2.461708261819144e-05, |
| "loss": 0.6284, |
| "step": 3690 |
| }, |
| { |
| "epoch": 1.77, |
| "eval_loss": 1.1508762836456299, |
| "eval_runtime": 0.5604, |
| "eval_samples_per_second": 7.138, |
| "eval_steps_per_second": 1.785, |
| "step": 3690 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 0.8687359660601236, |
| "learning_rate": 2.4389732193991872e-05, |
| "loss": 0.6305, |
| "step": 3720 |
| }, |
| { |
| "epoch": 1.79, |
| "eval_loss": 1.126154899597168, |
| "eval_runtime": 0.5594, |
| "eval_samples_per_second": 7.15, |
| "eval_steps_per_second": 1.788, |
| "step": 3720 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.9047811544306036, |
| "learning_rate": 2.416178422886524e-05, |
| "loss": 0.6314, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.8, |
| "eval_loss": 1.128692865371704, |
| "eval_runtime": 0.5597, |
| "eval_samples_per_second": 7.147, |
| "eval_steps_per_second": 1.787, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.82, |
| "grad_norm": 0.8540678672495516, |
| "learning_rate": 2.3933269751637984e-05, |
| "loss": 0.6335, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.82, |
| "eval_loss": 1.0934233665466309, |
| "eval_runtime": 0.555, |
| "eval_samples_per_second": 7.208, |
| "eval_steps_per_second": 1.802, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 0.8870274218489785, |
| "learning_rate": 2.3704219868251557e-05, |
| "loss": 0.625, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.83, |
| "eval_loss": 1.1019607782363892, |
| "eval_runtime": 0.5546, |
| "eval_samples_per_second": 7.213, |
| "eval_steps_per_second": 1.803, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 0.8857266513953586, |
| "learning_rate": 2.347466575752817e-05, |
| "loss": 0.6271, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.84, |
| "eval_loss": 1.0795825719833374, |
| "eval_runtime": 0.5614, |
| "eval_samples_per_second": 7.124, |
| "eval_steps_per_second": 1.781, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 0.8778258114022474, |
| "learning_rate": 2.3244638666926683e-05, |
| "loss": 0.6182, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.86, |
| "eval_loss": 1.0906893014907837, |
| "eval_runtime": 0.5634, |
| "eval_samples_per_second": 7.099, |
| "eval_steps_per_second": 1.775, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 0.8734833851049317, |
| "learning_rate": 2.3014169908289116e-05, |
| "loss": 0.6286, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.87, |
| "eval_loss": 1.0897102355957031, |
| "eval_runtime": 0.5569, |
| "eval_samples_per_second": 7.182, |
| "eval_steps_per_second": 1.796, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.89, |
| "grad_norm": 0.9574674758882417, |
| "learning_rate": 2.2783290853578403e-05, |
| "loss": 0.6283, |
| "step": 3930 |
| }, |
| { |
| "epoch": 1.89, |
| "eval_loss": 1.072981595993042, |
| "eval_runtime": 0.5604, |
| "eval_samples_per_second": 7.137, |
| "eval_steps_per_second": 1.784, |
| "step": 3930 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.979549501221503, |
| "learning_rate": 2.2552032930607978e-05, |
| "loss": 0.6339, |
| "step": 3960 |
| }, |
| { |
| "epoch": 1.9, |
| "eval_loss": 1.0705056190490723, |
| "eval_runtime": 0.5646, |
| "eval_samples_per_second": 7.084, |
| "eval_steps_per_second": 1.771, |
| "step": 3960 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.9781562033531854, |
| "learning_rate": 2.2320427618763732e-05, |
| "loss": 0.6296, |
| "step": 3990 |
| }, |
| { |
| "epoch": 1.92, |
| "eval_loss": 1.080111026763916, |
| "eval_runtime": 0.5631, |
| "eval_samples_per_second": 7.104, |
| "eval_steps_per_second": 1.776, |
| "step": 3990 |
| } |
| ], |
| "logging_steps": 30, |
| "max_steps": 8328, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "total_flos": 1083775729532928.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|