| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 3750, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.008, |
| "grad_norm": 4.96875, |
| "learning_rate": 7.964601769911505e-06, |
| "loss": 1.4112, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 5.3125, |
| "learning_rate": 1.6814159292035402e-05, |
| "loss": 1.4365, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 2.265625, |
| "learning_rate": 2.5663716814159294e-05, |
| "loss": 1.2118, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 2.78125, |
| "learning_rate": 3.451327433628319e-05, |
| "loss": 1.1222, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 1.953125, |
| "learning_rate": 4.3362831858407084e-05, |
| "loss": 1.0566, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 2.75, |
| "learning_rate": 5.221238938053098e-05, |
| "loss": 1.0392, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 2.578125, |
| "learning_rate": 6.106194690265487e-05, |
| "loss": 1.1753, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 1.421875, |
| "learning_rate": 6.991150442477876e-05, |
| "loss": 1.0422, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 3.765625, |
| "learning_rate": 7.876106194690266e-05, |
| "loss": 1.026, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 2.609375, |
| "learning_rate": 8.761061946902655e-05, |
| "loss": 1.0633, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 3.171875, |
| "learning_rate": 9.646017699115044e-05, |
| "loss": 1.1138, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 3.609375, |
| "learning_rate": 9.999932848660433e-05, |
| "loss": 1.1079, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 2.765625, |
| "learning_rate": 9.99952248589506e-05, |
| "loss": 1.0538, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 3.5625, |
| "learning_rate": 9.998739097245067e-05, |
| "loss": 0.9637, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 5.6875, |
| "learning_rate": 9.997582741160886e-05, |
| "loss": 0.8554, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 7.40625, |
| "learning_rate": 9.99605350392091e-05, |
| "loss": 0.6704, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 7.03125, |
| "learning_rate": 9.994151499625049e-05, |
| "loss": 0.8075, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 3.828125, |
| "learning_rate": 9.991876870186222e-05, |
| "loss": 0.7331, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 3.890625, |
| "learning_rate": 9.98922978531977e-05, |
| "loss": 0.7264, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 4.15625, |
| "learning_rate": 9.986210442530788e-05, |
| "loss": 0.5792, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 5.53125, |
| "learning_rate": 9.982819067099396e-05, |
| "loss": 0.6228, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 5.875, |
| "learning_rate": 9.979055912063925e-05, |
| "loss": 0.7417, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 3.71875, |
| "learning_rate": 9.974921258202036e-05, |
| "loss": 0.472, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 4.15625, |
| "learning_rate": 9.970415414009773e-05, |
| "loss": 0.6284, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.965538715678548e-05, |
| "loss": 0.5349, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 4.59375, |
| "learning_rate": 9.960291527070051e-05, |
| "loss": 0.5165, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 4.0625, |
| "learning_rate": 9.954674239689109e-05, |
| "loss": 0.5656, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 3.875, |
| "learning_rate": 9.948687272654464e-05, |
| "loss": 0.6713, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 5.375, |
| "learning_rate": 9.942331072667517e-05, |
| "loss": 0.4347, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 5.875, |
| "learning_rate": 9.935606113978981e-05, |
| "loss": 0.4404, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 4.28125, |
| "learning_rate": 9.92851289835351e-05, |
| "loss": 0.4865, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 6.46875, |
| "learning_rate": 9.921051955032253e-05, |
| "loss": 0.5393, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.264, |
| "grad_norm": 9.0625, |
| "learning_rate": 9.913223840693375e-05, |
| "loss": 0.5358, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 5.0625, |
| "learning_rate": 9.905029139410508e-05, |
| "loss": 0.5756, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 10.625, |
| "learning_rate": 9.896468462609186e-05, |
| "loss": 0.4554, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 3.6875, |
| "learning_rate": 9.887542449021214e-05, |
| "loss": 0.3889, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.296, |
| "grad_norm": 6.25, |
| "learning_rate": 9.878251764637023e-05, |
| "loss": 0.4468, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 5.5, |
| "learning_rate": 9.868597102655968e-05, |
| "loss": 0.4612, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.312, |
| "grad_norm": 5.4375, |
| "learning_rate": 9.858579183434605e-05, |
| "loss": 0.6069, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 5.84375, |
| "learning_rate": 9.848198754432959e-05, |
| "loss": 0.4464, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.328, |
| "grad_norm": 5.125, |
| "learning_rate": 9.837456590158738e-05, |
| "loss": 0.5598, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 3.671875, |
| "learning_rate": 9.826353492109555e-05, |
| "loss": 0.4534, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.344, |
| "grad_norm": 7.34375, |
| "learning_rate": 9.814890288713121e-05, |
| "loss": 0.5155, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 6.875, |
| "learning_rate": 9.803067835265436e-05, |
| "loss": 0.484, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 3.875, |
| "learning_rate": 9.790887013866973e-05, |
| "loss": 0.4077, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 5.4375, |
| "learning_rate": 9.778348733356868e-05, |
| "loss": 0.3779, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.376, |
| "grad_norm": 5.34375, |
| "learning_rate": 9.765453929245096e-05, |
| "loss": 0.521, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 4.8125, |
| "learning_rate": 9.752203563642688e-05, |
| "loss": 0.4114, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.392, |
| "grad_norm": 8.25, |
| "learning_rate": 9.73859862518993e-05, |
| "loss": 0.4361, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 7.40625, |
| "learning_rate": 9.724640128982605e-05, |
| "loss": 0.518, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.408, |
| "grad_norm": 5.5625, |
| "learning_rate": 9.710329116496259e-05, |
| "loss": 0.4413, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 5.46875, |
| "learning_rate": 9.695666655508483e-05, |
| "loss": 0.4276, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.424, |
| "grad_norm": 5.375, |
| "learning_rate": 9.680653840019259e-05, |
| "loss": 0.4476, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 5.875, |
| "learning_rate": 9.665291790169311e-05, |
| "loss": 0.3562, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 6.4375, |
| "learning_rate": 9.649581652156559e-05, |
| "loss": 0.4511, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 5.1875, |
| "learning_rate": 9.633524598150568e-05, |
| "loss": 0.3985, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.456, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.617121826205116e-05, |
| "loss": 0.5117, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 5.5, |
| "learning_rate": 9.600374560168783e-05, |
| "loss": 0.4569, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.472, |
| "grad_norm": 4.5, |
| "learning_rate": 9.583284049593652e-05, |
| "loss": 0.449, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 4.25, |
| "learning_rate": 9.56585156964207e-05, |
| "loss": 0.4769, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.488, |
| "grad_norm": 5.9375, |
| "learning_rate": 9.548078420991506e-05, |
| "loss": 0.5081, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 7.4375, |
| "learning_rate": 9.529965929737506e-05, |
| "loss": 0.4803, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.504, |
| "grad_norm": 4.59375, |
| "learning_rate": 9.511515447294748e-05, |
| "loss": 0.5015, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 6.71875, |
| "learning_rate": 9.49272835029621e-05, |
| "loss": 0.5174, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 4.875, |
| "learning_rate": 9.47360604049046e-05, |
| "loss": 0.4957, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.528, |
| "grad_norm": 4.71875, |
| "learning_rate": 9.454149944637064e-05, |
| "loss": 0.379, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.536, |
| "grad_norm": 4.96875, |
| "learning_rate": 9.434361514400132e-05, |
| "loss": 0.4857, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.544, |
| "grad_norm": 5.71875, |
| "learning_rate": 9.414242226240012e-05, |
| "loss": 0.4595, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.552, |
| "grad_norm": 6.8125, |
| "learning_rate": 9.393793581303116e-05, |
| "loss": 0.4157, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 5.03125, |
| "learning_rate": 9.37301710530993e-05, |
| "loss": 0.4432, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.568, |
| "grad_norm": 6.75, |
| "learning_rate": 9.351914348441169e-05, |
| "loss": 0.4695, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.576, |
| "grad_norm": 2.890625, |
| "learning_rate": 9.330486885222114e-05, |
| "loss": 0.3493, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.584, |
| "grad_norm": 3.4375, |
| "learning_rate": 9.308736314405134e-05, |
| "loss": 0.4304, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.592, |
| "grad_norm": 3.1875, |
| "learning_rate": 9.286664258850402e-05, |
| "loss": 0.5057, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 4.375, |
| "learning_rate": 9.264272365404805e-05, |
| "loss": 0.4159, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.608, |
| "grad_norm": 4.3125, |
| "learning_rate": 9.241562304779072e-05, |
| "loss": 0.3647, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.616, |
| "grad_norm": 10.3125, |
| "learning_rate": 9.21853577142312e-05, |
| "loss": 0.4851, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.624, |
| "grad_norm": 6.3125, |
| "learning_rate": 9.195194483399625e-05, |
| "loss": 0.5071, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.632, |
| "grad_norm": 4.34375, |
| "learning_rate": 9.17154018225583e-05, |
| "loss": 0.3939, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 4.03125, |
| "learning_rate": 9.147574632893611e-05, |
| "loss": 0.3762, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.648, |
| "grad_norm": 3.1875, |
| "learning_rate": 9.12329962343779e-05, |
| "loss": 0.4427, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.656, |
| "grad_norm": 3.890625, |
| "learning_rate": 9.098716965102716e-05, |
| "loss": 0.357, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.664, |
| "grad_norm": 8.25, |
| "learning_rate": 9.073828492057133e-05, |
| "loss": 0.4071, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.672, |
| "grad_norm": 5.78125, |
| "learning_rate": 9.048636061287325e-05, |
| "loss": 0.4037, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 3.84375, |
| "learning_rate": 9.023141552458559e-05, |
| "loss": 0.3884, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.688, |
| "grad_norm": 3.453125, |
| "learning_rate": 8.997346867774839e-05, |
| "loss": 0.3641, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.696, |
| "grad_norm": 7.875, |
| "learning_rate": 8.97125393183699e-05, |
| "loss": 0.5387, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.704, |
| "grad_norm": 4.6875, |
| "learning_rate": 8.94486469149904e-05, |
| "loss": 0.4085, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.712, |
| "grad_norm": 6.03125, |
| "learning_rate": 8.918181115722976e-05, |
| "loss": 0.4055, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 7.375, |
| "learning_rate": 8.891205195431831e-05, |
| "loss": 0.42, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.728, |
| "grad_norm": 3.21875, |
| "learning_rate": 8.863938943361128e-05, |
| "loss": 0.3372, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.736, |
| "grad_norm": 3.65625, |
| "learning_rate": 8.836384393908721e-05, |
| "loss": 0.4544, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.744, |
| "grad_norm": 3.703125, |
| "learning_rate": 8.808543602982993e-05, |
| "loss": 0.4979, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.752, |
| "grad_norm": 2.90625, |
| "learning_rate": 8.780418647849458e-05, |
| "loss": 0.3366, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 4.53125, |
| "learning_rate": 8.752011626975781e-05, |
| "loss": 0.3778, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.768, |
| "grad_norm": 3.4375, |
| "learning_rate": 8.723324659875201e-05, |
| "loss": 0.4498, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.776, |
| "grad_norm": 4.90625, |
| "learning_rate": 8.694359886948384e-05, |
| "loss": 0.4232, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.784, |
| "grad_norm": 4.15625, |
| "learning_rate": 8.665119469323737e-05, |
| "loss": 0.3602, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.792, |
| "grad_norm": 5.375, |
| "learning_rate": 8.635605588696148e-05, |
| "loss": 0.4095, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 3.5625, |
| "learning_rate": 8.605820447164206e-05, |
| "loss": 0.405, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.808, |
| "grad_norm": 4.40625, |
| "learning_rate": 8.575766267065905e-05, |
| "loss": 0.3137, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.816, |
| "grad_norm": 5.65625, |
| "learning_rate": 8.54544529081283e-05, |
| "loss": 0.3701, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.824, |
| "grad_norm": 5.0, |
| "learning_rate": 8.514859780722833e-05, |
| "loss": 0.3759, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.832, |
| "grad_norm": 4.9375, |
| "learning_rate": 8.484012018851246e-05, |
| "loss": 0.3032, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 4.375, |
| "learning_rate": 8.452904306820618e-05, |
| "loss": 0.4171, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.848, |
| "grad_norm": 4.53125, |
| "learning_rate": 8.421538965648966e-05, |
| "loss": 0.374, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.856, |
| "grad_norm": 4.3125, |
| "learning_rate": 8.389918335576623e-05, |
| "loss": 0.3358, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.864, |
| "grad_norm": 4.21875, |
| "learning_rate": 8.358044775891605e-05, |
| "loss": 0.3586, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.872, |
| "grad_norm": 4.15625, |
| "learning_rate": 8.325920664753595e-05, |
| "loss": 0.4036, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 4.625, |
| "learning_rate": 8.293548399016491e-05, |
| "loss": 0.3673, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.888, |
| "grad_norm": 2.625, |
| "learning_rate": 8.260930394049583e-05, |
| "loss": 0.3388, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.896, |
| "grad_norm": 4.78125, |
| "learning_rate": 8.228069083557328e-05, |
| "loss": 0.427, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.904, |
| "grad_norm": 4.21875, |
| "learning_rate": 8.194966919397767e-05, |
| "loss": 0.3926, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.912, |
| "grad_norm": 5.84375, |
| "learning_rate": 8.161626371399591e-05, |
| "loss": 0.3654, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 3.71875, |
| "learning_rate": 8.128049927177854e-05, |
| "loss": 0.4047, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.928, |
| "grad_norm": 5.0, |
| "learning_rate": 8.094240091948375e-05, |
| "loss": 0.4114, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.936, |
| "grad_norm": 6.625, |
| "learning_rate": 8.06019938834081e-05, |
| "loss": 0.4485, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.944, |
| "grad_norm": 4.4375, |
| "learning_rate": 8.025930356210439e-05, |
| "loss": 0.3833, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.952, |
| "grad_norm": 3.171875, |
| "learning_rate": 7.991435552448657e-05, |
| "loss": 0.3742, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 4.65625, |
| "learning_rate": 7.956717550792199e-05, |
| "loss": 0.3284, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.968, |
| "grad_norm": 4.4375, |
| "learning_rate": 7.921778941631113e-05, |
| "loss": 0.3862, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.976, |
| "grad_norm": 8.9375, |
| "learning_rate": 7.886622331815477e-05, |
| "loss": 0.4144, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.984, |
| "grad_norm": 3.875, |
| "learning_rate": 7.851250344460902e-05, |
| "loss": 0.3654, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.992, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.815665618752812e-05, |
| "loss": 0.3808, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 5.0625, |
| "learning_rate": 7.77987080974953e-05, |
| "loss": 0.3482, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.008, |
| "grad_norm": 3.28125, |
| "learning_rate": 7.743868588184176e-05, |
| "loss": 0.3312, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.016, |
| "grad_norm": 4.375, |
| "learning_rate": 7.707661640265401e-05, |
| "loss": 0.37, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.024, |
| "grad_norm": 4.40625, |
| "learning_rate": 7.67125266747696e-05, |
| "loss": 0.3253, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.032, |
| "grad_norm": 4.0625, |
| "learning_rate": 7.634644386376149e-05, |
| "loss": 0.4361, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 3.953125, |
| "learning_rate": 7.597839528391114e-05, |
| "loss": 0.3981, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.048, |
| "grad_norm": 7.6875, |
| "learning_rate": 7.560840839617056e-05, |
| "loss": 0.3634, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.056, |
| "grad_norm": 4.03125, |
| "learning_rate": 7.523651080611341e-05, |
| "loss": 0.3653, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.064, |
| "grad_norm": 3.09375, |
| "learning_rate": 7.48627302618752e-05, |
| "loss": 0.3433, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.072, |
| "grad_norm": 4.21875, |
| "learning_rate": 7.448709465208299e-05, |
| "loss": 0.3587, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 5.375, |
| "learning_rate": 7.410963200377458e-05, |
| "loss": 0.346, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.088, |
| "grad_norm": 4.0625, |
| "learning_rate": 7.373037048030731e-05, |
| "loss": 0.4562, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.096, |
| "grad_norm": 3.75, |
| "learning_rate": 7.334933837925675e-05, |
| "loss": 0.4333, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.104, |
| "grad_norm": 3.46875, |
| "learning_rate": 7.296656413030531e-05, |
| "loss": 0.306, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.112, |
| "grad_norm": 5.25, |
| "learning_rate": 7.25820762931211e-05, |
| "loss": 0.4095, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 3.578125, |
| "learning_rate": 7.219590355522697e-05, |
| "loss": 0.369, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.1280000000000001, |
| "grad_norm": 4.96875, |
| "learning_rate": 7.180807472986009e-05, |
| "loss": 0.3763, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.1360000000000001, |
| "grad_norm": 5.25, |
| "learning_rate": 7.141861875382215e-05, |
| "loss": 0.4269, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.144, |
| "grad_norm": 3.53125, |
| "learning_rate": 7.102756468532027e-05, |
| "loss": 0.4017, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.152, |
| "grad_norm": 4.6875, |
| "learning_rate": 7.063494170179898e-05, |
| "loss": 0.3601, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 3.8125, |
| "learning_rate": 7.024077909776309e-05, |
| "loss": 0.3678, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.168, |
| "grad_norm": 5.25, |
| "learning_rate": 6.984510628259212e-05, |
| "loss": 0.3732, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.176, |
| "grad_norm": 2.140625, |
| "learning_rate": 6.94479527783459e-05, |
| "loss": 0.3332, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.184, |
| "grad_norm": 4.5, |
| "learning_rate": 6.904934821756184e-05, |
| "loss": 0.3887, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.192, |
| "grad_norm": 2.296875, |
| "learning_rate": 6.864932234104409e-05, |
| "loss": 0.3196, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 4.15625, |
| "learning_rate": 6.824790499564435e-05, |
| "loss": 0.3256, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.208, |
| "grad_norm": 3.6875, |
| "learning_rate": 6.784512613203511e-05, |
| "loss": 0.3074, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.216, |
| "grad_norm": 3.0, |
| "learning_rate": 6.744101580247481e-05, |
| "loss": 0.35, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.224, |
| "grad_norm": 3.828125, |
| "learning_rate": 6.703560415856565e-05, |
| "loss": 0.3731, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.232, |
| "grad_norm": 5.625, |
| "learning_rate": 6.662892144900388e-05, |
| "loss": 0.3769, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 8.1875, |
| "learning_rate": 6.62209980173229e-05, |
| "loss": 0.4385, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.248, |
| "grad_norm": 5.9375, |
| "learning_rate": 6.581186429962922e-05, |
| "loss": 0.3563, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.256, |
| "grad_norm": 3.53125, |
| "learning_rate": 6.54015508223316e-05, |
| "loss": 0.3225, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.264, |
| "grad_norm": 4.125, |
| "learning_rate": 6.499008819986339e-05, |
| "loss": 0.3246, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.272, |
| "grad_norm": 4.34375, |
| "learning_rate": 6.457750713239828e-05, |
| "loss": 0.304, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 4.75, |
| "learning_rate": 6.41638384035597e-05, |
| "loss": 0.383, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.288, |
| "grad_norm": 4.46875, |
| "learning_rate": 6.374911287812406e-05, |
| "loss": 0.331, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.296, |
| "grad_norm": 3.984375, |
| "learning_rate": 6.333336149971776e-05, |
| "loss": 0.3022, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.304, |
| "grad_norm": 5.53125, |
| "learning_rate": 6.291661528850844e-05, |
| "loss": 0.3257, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.312, |
| "grad_norm": 4.03125, |
| "learning_rate": 6.249890533889054e-05, |
| "loss": 0.3071, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 4.25, |
| "learning_rate": 6.208026281716521e-05, |
| "loss": 0.3833, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.328, |
| "grad_norm": 3.109375, |
| "learning_rate": 6.166071895921496e-05, |
| "loss": 0.3378, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.336, |
| "grad_norm": 3.5, |
| "learning_rate": 6.124030506817309e-05, |
| "loss": 0.31, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.3439999999999999, |
| "grad_norm": 2.65625, |
| "learning_rate": 6.0819052512088057e-05, |
| "loss": 0.3139, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.3519999999999999, |
| "grad_norm": 4.90625, |
| "learning_rate": 6.039699272158305e-05, |
| "loss": 0.4388, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 5.59375, |
| "learning_rate": 5.997415718751086e-05, |
| "loss": 0.3989, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3679999999999999, |
| "grad_norm": 5.5, |
| "learning_rate": 5.955057745860435e-05, |
| "loss": 0.3977, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.376, |
| "grad_norm": 6.25, |
| "learning_rate": 5.9126285139122406e-05, |
| "loss": 0.3527, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.384, |
| "grad_norm": 3.28125, |
| "learning_rate": 5.8701311886491947e-05, |
| "loss": 0.4044, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.392, |
| "grad_norm": 5.5625, |
| "learning_rate": 5.827568940894593e-05, |
| "loss": 0.374, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 3.625, |
| "learning_rate": 5.7849449463157435e-05, |
| "loss": 0.3479, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.408, |
| "grad_norm": 4.84375, |
| "learning_rate": 5.742262385187028e-05, |
| "loss": 0.3666, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.416, |
| "grad_norm": 5.4375, |
| "learning_rate": 5.699524442152613e-05, |
| "loss": 0.3707, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.424, |
| "grad_norm": 4.4375, |
| "learning_rate": 5.656734305988839e-05, |
| "loss": 0.3847, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.432, |
| "grad_norm": 3.96875, |
| "learning_rate": 5.613895169366292e-05, |
| "loss": 0.3515, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 3.734375, |
| "learning_rate": 5.571010228611597e-05, |
| "loss": 0.3763, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.448, |
| "grad_norm": 5.71875, |
| "learning_rate": 5.528082683468934e-05, |
| "loss": 0.3548, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.456, |
| "grad_norm": 5.03125, |
| "learning_rate": 5.485115736861288e-05, |
| "loss": 0.3903, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.464, |
| "grad_norm": 4.375, |
| "learning_rate": 5.442112594651484e-05, |
| "loss": 0.257, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.472, |
| "grad_norm": 5.46875, |
| "learning_rate": 5.399076465402979e-05, |
| "loss": 0.3424, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 3.40625, |
| "learning_rate": 5.356010560140475e-05, |
| "loss": 0.3317, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.488, |
| "grad_norm": 5.0, |
| "learning_rate": 5.312918092110325e-05, |
| "loss": 0.2753, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.496, |
| "grad_norm": 3.84375, |
| "learning_rate": 5.269802276540795e-05, |
| "loss": 0.3318, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.504, |
| "grad_norm": 5.71875, |
| "learning_rate": 5.226666330402164e-05, |
| "loss": 0.3836, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.512, |
| "grad_norm": 3.671875, |
| "learning_rate": 5.1835134721666956e-05, |
| "loss": 0.3498, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 2.90625, |
| "learning_rate": 5.1403469215685094e-05, |
| "loss": 0.4228, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.528, |
| "grad_norm": 4.4375, |
| "learning_rate": 5.097169899363342e-05, |
| "loss": 0.3703, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.536, |
| "grad_norm": 3.90625, |
| "learning_rate": 5.053985627088238e-05, |
| "loss": 0.3816, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.544, |
| "grad_norm": 2.71875, |
| "learning_rate": 5.010797326821189e-05, |
| "loss": 0.3842, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.552, |
| "grad_norm": 3.875, |
| "learning_rate": 4.9676082209407254e-05, |
| "loss": 0.3848, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 3.484375, |
| "learning_rate": 4.924421531885481e-05, |
| "loss": 0.3416, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.568, |
| "grad_norm": 4.1875, |
| "learning_rate": 4.881240481913773e-05, |
| "loss": 0.3407, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.576, |
| "grad_norm": 3.921875, |
| "learning_rate": 4.838068292863164e-05, |
| "loss": 0.3319, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.584, |
| "grad_norm": 4.4375, |
| "learning_rate": 4.7949081859100896e-05, |
| "loss": 0.3979, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.592, |
| "grad_norm": 4.84375, |
| "learning_rate": 4.7517633813295114e-05, |
| "loss": 0.4184, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 3.890625, |
| "learning_rate": 4.708637098254644e-05, |
| "loss": 0.3959, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.608, |
| "grad_norm": 4.8125, |
| "learning_rate": 4.6655325544367715e-05, |
| "loss": 0.313, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.616, |
| "grad_norm": 4.21875, |
| "learning_rate": 4.6224529660051593e-05, |
| "loss": 0.3012, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.624, |
| "grad_norm": 3.734375, |
| "learning_rate": 4.579401547227096e-05, |
| "loss": 0.2532, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.6320000000000001, |
| "grad_norm": 2.953125, |
| "learning_rate": 4.53638151026807e-05, |
| "loss": 0.2714, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.6400000000000001, |
| "grad_norm": 4.1875, |
| "learning_rate": 4.493396064952093e-05, |
| "loss": 0.3468, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.6480000000000001, |
| "grad_norm": 5.125, |
| "learning_rate": 4.450448418522221e-05, |
| "loss": 0.4547, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.6560000000000001, |
| "grad_norm": 3.5, |
| "learning_rate": 4.4075417754012475e-05, |
| "loss": 0.2839, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.6640000000000001, |
| "grad_norm": 5.84375, |
| "learning_rate": 4.364679336952609e-05, |
| "loss": 0.3426, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.6720000000000002, |
| "grad_norm": 4.84375, |
| "learning_rate": 4.321864301241535e-05, |
| "loss": 0.3325, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 4.46875, |
| "learning_rate": 4.279099862796427e-05, |
| "loss": 0.314, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.688, |
| "grad_norm": 2.8125, |
| "learning_rate": 4.23638921237051e-05, |
| "loss": 0.4189, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.696, |
| "grad_norm": 3.59375, |
| "learning_rate": 4.1937355367037516e-05, |
| "loss": 0.3436, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.704, |
| "grad_norm": 6.21875, |
| "learning_rate": 4.151142018285112e-05, |
| "loss": 0.3681, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.712, |
| "grad_norm": 4.0625, |
| "learning_rate": 4.1086118351150785e-05, |
| "loss": 0.3716, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 2.703125, |
| "learning_rate": 4.066148160468543e-05, |
| "loss": 0.2761, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.728, |
| "grad_norm": 3.765625, |
| "learning_rate": 4.023754162658051e-05, |
| "loss": 0.2904, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.736, |
| "grad_norm": 5.4375, |
| "learning_rate": 3.981433004797395e-05, |
| "loss": 0.3563, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.744, |
| "grad_norm": 4.125, |
| "learning_rate": 3.939187844565616e-05, |
| "loss": 0.3248, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.752, |
| "grad_norm": 5.71875, |
| "learning_rate": 3.897021833971386e-05, |
| "loss": 0.3246, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 4.21875, |
| "learning_rate": 3.8549381191178516e-05, |
| "loss": 0.4073, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.768, |
| "grad_norm": 4.4375, |
| "learning_rate": 3.8129398399678814e-05, |
| "loss": 0.4147, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.776, |
| "grad_norm": 4.4375, |
| "learning_rate": 3.771030130109785e-05, |
| "loss": 0.2378, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.784, |
| "grad_norm": 3.65625, |
| "learning_rate": 3.729212116523518e-05, |
| "loss": 0.3305, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.792, |
| "grad_norm": 3.09375, |
| "learning_rate": 3.6874889193473646e-05, |
| "loss": 0.3865, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 4.5, |
| "learning_rate": 3.64586365164514e-05, |
| "loss": 0.3443, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.808, |
| "grad_norm": 2.71875, |
| "learning_rate": 3.604339419173912e-05, |
| "loss": 0.2762, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.8159999999999998, |
| "grad_norm": 5.15625, |
| "learning_rate": 3.5629193201522794e-05, |
| "loss": 0.3787, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.8239999999999998, |
| "grad_norm": 5.53125, |
| "learning_rate": 3.521606445029208e-05, |
| "loss": 0.4157, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.8319999999999999, |
| "grad_norm": 5.34375, |
| "learning_rate": 3.480403876253432e-05, |
| "loss": 0.3345, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 3.3125, |
| "learning_rate": 3.4393146880434845e-05, |
| "loss": 0.3111, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.8479999999999999, |
| "grad_norm": 4.03125, |
| "learning_rate": 3.398341946158311e-05, |
| "loss": 0.3763, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.8559999999999999, |
| "grad_norm": 3.703125, |
| "learning_rate": 3.357488707668529e-05, |
| "loss": 0.3246, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.8639999999999999, |
| "grad_norm": 4.65625, |
| "learning_rate": 3.316758020728327e-05, |
| "loss": 0.3852, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.8719999999999999, |
| "grad_norm": 3.859375, |
| "learning_rate": 3.276152924348046e-05, |
| "loss": 0.3295, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 5.15625, |
| "learning_rate": 3.2356764481674254e-05, |
| "loss": 0.3567, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.888, |
| "grad_norm": 3.421875, |
| "learning_rate": 3.1953316122295554e-05, |
| "loss": 0.3091, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.896, |
| "grad_norm": 3.265625, |
| "learning_rate": 3.1551214267555416e-05, |
| "loss": 0.3847, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.904, |
| "grad_norm": 6.09375, |
| "learning_rate": 3.1150488919199124e-05, |
| "loss": 0.3958, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.912, |
| "grad_norm": 4.03125, |
| "learning_rate": 3.075116997626764e-05, |
| "loss": 0.384, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 4.0625, |
| "learning_rate": 3.0353287232866736e-05, |
| "loss": 0.3349, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.928, |
| "grad_norm": 4.375, |
| "learning_rate": 2.995687037594408e-05, |
| "loss": 0.3801, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.936, |
| "grad_norm": 5.09375, |
| "learning_rate": 2.9561948983074174e-05, |
| "loss": 0.3281, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.944, |
| "grad_norm": 6.8125, |
| "learning_rate": 2.916855252025149e-05, |
| "loss": 0.3549, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.952, |
| "grad_norm": 4.125, |
| "learning_rate": 2.877671033969193e-05, |
| "loss": 0.4092, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 4.21875, |
| "learning_rate": 2.8386451677642878e-05, |
| "loss": 0.3866, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.968, |
| "grad_norm": 3.078125, |
| "learning_rate": 2.7997805652201714e-05, |
| "loss": 0.3484, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.976, |
| "grad_norm": 4.09375, |
| "learning_rate": 2.7610801261143283e-05, |
| "loss": 0.3496, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.984, |
| "grad_norm": 4.6875, |
| "learning_rate": 2.7225467379756314e-05, |
| "loss": 0.3691, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.992, |
| "grad_norm": 5.5, |
| "learning_rate": 2.6841832758689002e-05, |
| "loss": 0.3698, |
| "step": 2490 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 3.875, |
| "learning_rate": 2.645992602180377e-05, |
| "loss": 0.3577, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.008, |
| "grad_norm": 4.40625, |
| "learning_rate": 2.607977566404164e-05, |
| "loss": 0.2871, |
| "step": 2510 |
| }, |
| { |
| "epoch": 2.016, |
| "grad_norm": 4.0625, |
| "learning_rate": 2.570141004929612e-05, |
| "loss": 0.3426, |
| "step": 2520 |
| }, |
| { |
| "epoch": 2.024, |
| "grad_norm": 3.5, |
| "learning_rate": 2.5324857408296994e-05, |
| "loss": 0.2656, |
| "step": 2530 |
| }, |
| { |
| "epoch": 2.032, |
| "grad_norm": 3.578125, |
| "learning_rate": 2.4950145836503836e-05, |
| "loss": 0.3473, |
| "step": 2540 |
| }, |
| { |
| "epoch": 2.04, |
| "grad_norm": 3.953125, |
| "learning_rate": 2.4577303292009822e-05, |
| "loss": 0.3588, |
| "step": 2550 |
| }, |
| { |
| "epoch": 2.048, |
| "grad_norm": 4.75, |
| "learning_rate": 2.4206357593455743e-05, |
| "loss": 0.3953, |
| "step": 2560 |
| }, |
| { |
| "epoch": 2.056, |
| "grad_norm": 3.96875, |
| "learning_rate": 2.383733641795428e-05, |
| "loss": 0.3209, |
| "step": 2570 |
| }, |
| { |
| "epoch": 2.064, |
| "grad_norm": 2.515625, |
| "learning_rate": 2.3470267299025068e-05, |
| "loss": 0.3299, |
| "step": 2580 |
| }, |
| { |
| "epoch": 2.072, |
| "grad_norm": 3.375, |
| "learning_rate": 2.3105177624540252e-05, |
| "loss": 0.2311, |
| "step": 2590 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 4.46875, |
| "learning_rate": 2.274209463468117e-05, |
| "loss": 0.3035, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.088, |
| "grad_norm": 2.578125, |
| "learning_rate": 2.2381045419905655e-05, |
| "loss": 0.3344, |
| "step": 2610 |
| }, |
| { |
| "epoch": 2.096, |
| "grad_norm": 5.28125, |
| "learning_rate": 2.2022056918927037e-05, |
| "loss": 0.2794, |
| "step": 2620 |
| }, |
| { |
| "epoch": 2.104, |
| "grad_norm": 5.25, |
| "learning_rate": 2.166515591670394e-05, |
| "loss": 0.3416, |
| "step": 2630 |
| }, |
| { |
| "epoch": 2.112, |
| "grad_norm": 3.96875, |
| "learning_rate": 2.1310369042441985e-05, |
| "loss": 0.3152, |
| "step": 2640 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 3.203125, |
| "learning_rate": 2.0957722767606774e-05, |
| "loss": 0.3015, |
| "step": 2650 |
| }, |
| { |
| "epoch": 2.128, |
| "grad_norm": 3.78125, |
| "learning_rate": 2.0607243403948863e-05, |
| "loss": 0.3843, |
| "step": 2660 |
| }, |
| { |
| "epoch": 2.136, |
| "grad_norm": 6.34375, |
| "learning_rate": 2.0258957101540625e-05, |
| "loss": 0.3299, |
| "step": 2670 |
| }, |
| { |
| "epoch": 2.144, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.9912889846825038e-05, |
| "loss": 0.3636, |
| "step": 2680 |
| }, |
| { |
| "epoch": 2.152, |
| "grad_norm": 3.6875, |
| "learning_rate": 1.956906746067683e-05, |
| "loss": 0.3596, |
| "step": 2690 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.922751559647591e-05, |
| "loss": 0.3796, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.168, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.888825973819336e-05, |
| "loss": 0.3175, |
| "step": 2710 |
| }, |
| { |
| "epoch": 2.176, |
| "grad_norm": 4.03125, |
| "learning_rate": 1.8551325198489887e-05, |
| "loss": 0.2928, |
| "step": 2720 |
| }, |
| { |
| "epoch": 2.184, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.8216737116827378e-05, |
| "loss": 0.2791, |
| "step": 2730 |
| }, |
| { |
| "epoch": 2.192, |
| "grad_norm": 5.71875, |
| "learning_rate": 1.7884520457592984e-05, |
| "loss": 0.3925, |
| "step": 2740 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 3.28125, |
| "learning_rate": 1.755470000823667e-05, |
| "loss": 0.2967, |
| "step": 2750 |
| }, |
| { |
| "epoch": 2.208, |
| "grad_norm": 2.203125, |
| "learning_rate": 1.7227300377421574e-05, |
| "loss": 0.2475, |
| "step": 2760 |
| }, |
| { |
| "epoch": 2.216, |
| "grad_norm": 3.515625, |
| "learning_rate": 1.6902345993188017e-05, |
| "loss": 0.34, |
| "step": 2770 |
| }, |
| { |
| "epoch": 2.224, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.6579861101130896e-05, |
| "loss": 0.3418, |
| "step": 2780 |
| }, |
| { |
| "epoch": 2.232, |
| "grad_norm": 6.5, |
| "learning_rate": 1.6259869762590503e-05, |
| "loss": 0.4639, |
| "step": 2790 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 5.0625, |
| "learning_rate": 1.5942395852857466e-05, |
| "loss": 0.4252, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.248, |
| "grad_norm": 4.71875, |
| "learning_rate": 1.5627463059391173e-05, |
| "loss": 0.3562, |
| "step": 2810 |
| }, |
| { |
| "epoch": 2.2560000000000002, |
| "grad_norm": 3.78125, |
| "learning_rate": 1.531509488005257e-05, |
| "loss": 0.2792, |
| "step": 2820 |
| }, |
| { |
| "epoch": 2.2640000000000002, |
| "grad_norm": 5.25, |
| "learning_rate": 1.5005314621350709e-05, |
| "loss": 0.2659, |
| "step": 2830 |
| }, |
| { |
| "epoch": 2.2720000000000002, |
| "grad_norm": 4.03125, |
| "learning_rate": 1.4698145396704044e-05, |
| "loss": 0.2647, |
| "step": 2840 |
| }, |
| { |
| "epoch": 2.2800000000000002, |
| "grad_norm": 5.3125, |
| "learning_rate": 1.4393610124715696e-05, |
| "loss": 0.2826, |
| "step": 2850 |
| }, |
| { |
| "epoch": 2.288, |
| "grad_norm": 4.59375, |
| "learning_rate": 1.4091731527463526e-05, |
| "loss": 0.2643, |
| "step": 2860 |
| }, |
| { |
| "epoch": 2.296, |
| "grad_norm": 5.75, |
| "learning_rate": 1.3792532128804803e-05, |
| "loss": 0.3758, |
| "step": 2870 |
| }, |
| { |
| "epoch": 2.304, |
| "grad_norm": 6.25, |
| "learning_rate": 1.3496034252695599e-05, |
| "loss": 0.3, |
| "step": 2880 |
| }, |
| { |
| "epoch": 2.312, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.3202260021525158e-05, |
| "loss": 0.3376, |
| "step": 2890 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.2911231354465303e-05, |
| "loss": 0.3686, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.328, |
| "grad_norm": 3.859375, |
| "learning_rate": 1.262296996583504e-05, |
| "loss": 0.3372, |
| "step": 2910 |
| }, |
| { |
| "epoch": 2.336, |
| "grad_norm": 3.421875, |
| "learning_rate": 1.2337497363480317e-05, |
| "loss": 0.3071, |
| "step": 2920 |
| }, |
| { |
| "epoch": 2.344, |
| "grad_norm": 4.15625, |
| "learning_rate": 1.2054834847169316e-05, |
| "loss": 0.3724, |
| "step": 2930 |
| }, |
| { |
| "epoch": 2.352, |
| "grad_norm": 4.21875, |
| "learning_rate": 1.1775003507003236e-05, |
| "loss": 0.2919, |
| "step": 2940 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.1498024221842735e-05, |
| "loss": 0.2496, |
| "step": 2950 |
| }, |
| { |
| "epoch": 2.368, |
| "grad_norm": 5.28125, |
| "learning_rate": 1.1223917657750033e-05, |
| "loss": 0.265, |
| "step": 2960 |
| }, |
| { |
| "epoch": 2.376, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.095270426644705e-05, |
| "loss": 0.3083, |
| "step": 2970 |
| }, |
| { |
| "epoch": 2.384, |
| "grad_norm": 4.25, |
| "learning_rate": 1.0684404283789385e-05, |
| "loss": 0.3392, |
| "step": 2980 |
| }, |
| { |
| "epoch": 2.392, |
| "grad_norm": 3.765625, |
| "learning_rate": 1.0419037728256564e-05, |
| "loss": 0.3743, |
| "step": 2990 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 4.5625, |
| "learning_rate": 1.015662439945832e-05, |
| "loss": 0.3846, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.408, |
| "grad_norm": 4.28125, |
| "learning_rate": 9.89718387665734e-06, |
| "loss": 0.2818, |
| "step": 3010 |
| }, |
| { |
| "epoch": 2.416, |
| "grad_norm": 4.1875, |
| "learning_rate": 9.640735517308435e-06, |
| "loss": 0.3442, |
| "step": 3020 |
| }, |
| { |
| "epoch": 2.424, |
| "grad_norm": 5.28125, |
| "learning_rate": 9.387298455614191e-06, |
| "loss": 0.2982, |
| "step": 3030 |
| }, |
| { |
| "epoch": 2.432, |
| "grad_norm": 7.5, |
| "learning_rate": 9.136891601097347e-06, |
| "loss": 0.3924, |
| "step": 3040 |
| }, |
| { |
| "epoch": 2.44, |
| "grad_norm": 3.15625, |
| "learning_rate": 8.889533637189895e-06, |
| "loss": 0.2838, |
| "step": 3050 |
| }, |
| { |
| "epoch": 2.448, |
| "grad_norm": 6.125, |
| "learning_rate": 8.645243019839112e-06, |
| "loss": 0.3035, |
| "step": 3060 |
| }, |
| { |
| "epoch": 2.456, |
| "grad_norm": 3.609375, |
| "learning_rate": 8.404037976130458e-06, |
| "loss": 0.3713, |
| "step": 3070 |
| }, |
| { |
| "epoch": 2.464, |
| "grad_norm": 4.84375, |
| "learning_rate": 8.16593650292764e-06, |
| "loss": 0.3242, |
| "step": 3080 |
| }, |
| { |
| "epoch": 2.472, |
| "grad_norm": 3.21875, |
| "learning_rate": 7.930956365529818e-06, |
| "loss": 0.3214, |
| "step": 3090 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 2.953125, |
| "learning_rate": 7.699115096346139e-06, |
| "loss": 0.3072, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.488, |
| "grad_norm": 4.5, |
| "learning_rate": 7.4704299935875185e-06, |
| "loss": 0.2528, |
| "step": 3110 |
| }, |
| { |
| "epoch": 2.496, |
| "grad_norm": 4.28125, |
| "learning_rate": 7.244918119976035e-06, |
| "loss": 0.3366, |
| "step": 3120 |
| }, |
| { |
| "epoch": 2.504, |
| "grad_norm": 5.53125, |
| "learning_rate": 7.022596301471868e-06, |
| "loss": 0.3603, |
| "step": 3130 |
| }, |
| { |
| "epoch": 2.512, |
| "grad_norm": 4.28125, |
| "learning_rate": 6.803481126017808e-06, |
| "loss": 0.2996, |
| "step": 3140 |
| }, |
| { |
| "epoch": 2.52, |
| "grad_norm": 3.90625, |
| "learning_rate": 6.587588942301626e-06, |
| "loss": 0.3519, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.528, |
| "grad_norm": 5.3125, |
| "learning_rate": 6.374935858536257e-06, |
| "loss": 0.2668, |
| "step": 3160 |
| }, |
| { |
| "epoch": 2.536, |
| "grad_norm": 5.25, |
| "learning_rate": 6.165537741257971e-06, |
| "loss": 0.3093, |
| "step": 3170 |
| }, |
| { |
| "epoch": 2.544, |
| "grad_norm": 5.0, |
| "learning_rate": 5.959410214142419e-06, |
| "loss": 0.3223, |
| "step": 3180 |
| }, |
| { |
| "epoch": 2.552, |
| "grad_norm": 5.15625, |
| "learning_rate": 5.756568656839056e-06, |
| "loss": 0.4137, |
| "step": 3190 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 5.78125, |
| "learning_rate": 5.557028203823522e-06, |
| "loss": 0.3785, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.568, |
| "grad_norm": 3.59375, |
| "learning_rate": 5.360803743268494e-06, |
| "loss": 0.3343, |
| "step": 3210 |
| }, |
| { |
| "epoch": 2.576, |
| "grad_norm": 4.96875, |
| "learning_rate": 5.167909915932801e-06, |
| "loss": 0.3217, |
| "step": 3220 |
| }, |
| { |
| "epoch": 2.584, |
| "grad_norm": 3.6875, |
| "learning_rate": 4.9783611140690415e-06, |
| "loss": 0.3157, |
| "step": 3230 |
| }, |
| { |
| "epoch": 2.592, |
| "grad_norm": 3.84375, |
| "learning_rate": 4.7921714803498165e-06, |
| "loss": 0.2983, |
| "step": 3240 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 5.3125, |
| "learning_rate": 4.609354906812374e-06, |
| "loss": 0.3362, |
| "step": 3250 |
| }, |
| { |
| "epoch": 2.608, |
| "grad_norm": 3.953125, |
| "learning_rate": 4.429925033822252e-06, |
| "loss": 0.3844, |
| "step": 3260 |
| }, |
| { |
| "epoch": 2.616, |
| "grad_norm": 4.28125, |
| "learning_rate": 4.253895249055412e-06, |
| "loss": 0.2974, |
| "step": 3270 |
| }, |
| { |
| "epoch": 2.624, |
| "grad_norm": 3.734375, |
| "learning_rate": 4.0812786864994566e-06, |
| "loss": 0.3442, |
| "step": 3280 |
| }, |
| { |
| "epoch": 2.632, |
| "grad_norm": 3.171875, |
| "learning_rate": 3.912088225473537e-06, |
| "loss": 0.3572, |
| "step": 3290 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 4.59375, |
| "learning_rate": 3.7463364896675735e-06, |
| "loss": 0.3092, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.648, |
| "grad_norm": 4.03125, |
| "learning_rate": 3.584035846200201e-06, |
| "loss": 0.3093, |
| "step": 3310 |
| }, |
| { |
| "epoch": 2.656, |
| "grad_norm": 3.8125, |
| "learning_rate": 3.425198404696178e-06, |
| "loss": 0.3035, |
| "step": 3320 |
| }, |
| { |
| "epoch": 2.664, |
| "grad_norm": 4.90625, |
| "learning_rate": 3.2698360163827325e-06, |
| "loss": 0.3166, |
| "step": 3330 |
| }, |
| { |
| "epoch": 2.672, |
| "grad_norm": 4.03125, |
| "learning_rate": 3.1179602732053947e-06, |
| "loss": 0.2739, |
| "step": 3340 |
| }, |
| { |
| "epoch": 2.68, |
| "grad_norm": 6.4375, |
| "learning_rate": 2.969582506963098e-06, |
| "loss": 0.3551, |
| "step": 3350 |
| }, |
| { |
| "epoch": 2.6879999999999997, |
| "grad_norm": 4.625, |
| "learning_rate": 2.824713788462602e-06, |
| "loss": 0.3293, |
| "step": 3360 |
| }, |
| { |
| "epoch": 2.6959999999999997, |
| "grad_norm": 3.71875, |
| "learning_rate": 2.6833649266925943e-06, |
| "loss": 0.3278, |
| "step": 3370 |
| }, |
| { |
| "epoch": 2.7039999999999997, |
| "grad_norm": 3.984375, |
| "learning_rate": 2.5455464680171126e-06, |
| "loss": 0.2763, |
| "step": 3380 |
| }, |
| { |
| "epoch": 2.7119999999999997, |
| "grad_norm": 6.125, |
| "learning_rate": 2.411268695388719e-06, |
| "loss": 0.3378, |
| "step": 3390 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "grad_norm": 5.09375, |
| "learning_rate": 2.28054162758119e-06, |
| "loss": 0.2644, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.7279999999999998, |
| "grad_norm": 4.09375, |
| "learning_rate": 2.1533750184420832e-06, |
| "loss": 0.3154, |
| "step": 3410 |
| }, |
| { |
| "epoch": 2.7359999999999998, |
| "grad_norm": 5.28125, |
| "learning_rate": 2.0297783561649244e-06, |
| "loss": 0.2217, |
| "step": 3420 |
| }, |
| { |
| "epoch": 2.7439999999999998, |
| "grad_norm": 5.59375, |
| "learning_rate": 1.9097608625812726e-06, |
| "loss": 0.3446, |
| "step": 3430 |
| }, |
| { |
| "epoch": 2.752, |
| "grad_norm": 5.25, |
| "learning_rate": 1.7933314924726886e-06, |
| "loss": 0.387, |
| "step": 3440 |
| }, |
| { |
| "epoch": 2.76, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.6804989329025521e-06, |
| "loss": 0.3531, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.768, |
| "grad_norm": 5.9375, |
| "learning_rate": 1.5712716025679587e-06, |
| "loss": 0.2906, |
| "step": 3460 |
| }, |
| { |
| "epoch": 2.776, |
| "grad_norm": 5.28125, |
| "learning_rate": 1.4656576511715204e-06, |
| "loss": 0.2759, |
| "step": 3470 |
| }, |
| { |
| "epoch": 2.784, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.3636649588133432e-06, |
| "loss": 0.3646, |
| "step": 3480 |
| }, |
| { |
| "epoch": 2.792, |
| "grad_norm": 4.4375, |
| "learning_rate": 1.265301135403052e-06, |
| "loss": 0.3467, |
| "step": 3490 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.1705735200920053e-06, |
| "loss": 0.2817, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.808, |
| "grad_norm": 5.4375, |
| "learning_rate": 1.0794891807256956e-06, |
| "loss": 0.3304, |
| "step": 3510 |
| }, |
| { |
| "epoch": 2.816, |
| "grad_norm": 3.53125, |
| "learning_rate": 9.920549133164314e-07, |
| "loss": 0.3544, |
| "step": 3520 |
| }, |
| { |
| "epoch": 2.824, |
| "grad_norm": 6.21875, |
| "learning_rate": 9.08277241536215e-07, |
| "loss": 0.3082, |
| "step": 3530 |
| }, |
| { |
| "epoch": 2.832, |
| "grad_norm": 3.984375, |
| "learning_rate": 8.281624162300494e-07, |
| "loss": 0.2201, |
| "step": 3540 |
| }, |
| { |
| "epoch": 2.84, |
| "grad_norm": 6.9375, |
| "learning_rate": 7.517164149495326e-07, |
| "loss": 0.2885, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.848, |
| "grad_norm": 5.625, |
| "learning_rate": 6.789449415068316e-07, |
| "loss": 0.2716, |
| "step": 3560 |
| }, |
| { |
| "epoch": 2.856, |
| "grad_norm": 5.46875, |
| "learning_rate": 6.098534255491561e-07, |
| "loss": 0.2723, |
| "step": 3570 |
| }, |
| { |
| "epoch": 2.864, |
| "grad_norm": 2.765625, |
| "learning_rate": 5.44447022153588e-07, |
| "loss": 0.3245, |
| "step": 3580 |
| }, |
| { |
| "epoch": 2.872, |
| "grad_norm": 5.34375, |
| "learning_rate": 4.827306114425056e-07, |
| "loss": 0.2905, |
| "step": 3590 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 5.59375, |
| "learning_rate": 4.2470879821941423e-07, |
| "loss": 0.3986, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.888, |
| "grad_norm": 3.203125, |
| "learning_rate": 3.703859116254038e-07, |
| "loss": 0.3328, |
| "step": 3610 |
| }, |
| { |
| "epoch": 2.896, |
| "grad_norm": 5.875, |
| "learning_rate": 3.197660048161133e-07, |
| "loss": 0.2893, |
| "step": 3620 |
| }, |
| { |
| "epoch": 2.904, |
| "grad_norm": 4.875, |
| "learning_rate": 2.728528546593667e-07, |
| "loss": 0.3573, |
| "step": 3630 |
| }, |
| { |
| "epoch": 2.912, |
| "grad_norm": 3.921875, |
| "learning_rate": 2.2964996145330986e-07, |
| "loss": 0.2721, |
| "step": 3640 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 3.296875, |
| "learning_rate": 1.9016054866528576e-07, |
| "loss": 0.2943, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.928, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.5438756269130495e-07, |
| "loss": 0.3179, |
| "step": 3660 |
| }, |
| { |
| "epoch": 2.936, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.223336726362323e-07, |
| "loss": 0.3203, |
| "step": 3670 |
| }, |
| { |
| "epoch": 2.944, |
| "grad_norm": 3.34375, |
| "learning_rate": 9.400127011461312e-08, |
| "loss": 0.3184, |
| "step": 3680 |
| }, |
| { |
| "epoch": 2.952, |
| "grad_norm": 4.625, |
| "learning_rate": 6.939246907222696e-08, |
| "loss": 0.3581, |
| "step": 3690 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 4.5, |
| "learning_rate": 4.850910562839151e-08, |
| "loss": 0.3222, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.968, |
| "grad_norm": 6.59375, |
| "learning_rate": 3.135273793893889e-08, |
| "loss": 0.2907, |
| "step": 3710 |
| }, |
| { |
| "epoch": 2.976, |
| "grad_norm": 3.5625, |
| "learning_rate": 1.7924646079964248e-08, |
| "loss": 0.3589, |
| "step": 3720 |
| }, |
| { |
| "epoch": 2.984, |
| "grad_norm": 4.625, |
| "learning_rate": 8.225831952324292e-09, |
| "loss": 0.3172, |
| "step": 3730 |
| }, |
| { |
| "epoch": 2.992, |
| "grad_norm": 5.75, |
| "learning_rate": 2.257019206874933e-09, |
| "loss": 0.29, |
| "step": 3740 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 3.109375, |
| "learning_rate": 1.8653190470008242e-11, |
| "loss": 0.2897, |
| "step": 3750 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 3750, |
| "total_flos": 6.386198055566157e+17, |
| "train_loss": 0.40305233942667645, |
| "train_runtime": 8900.4049, |
| "train_samples_per_second": 6.741, |
| "train_steps_per_second": 0.421 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.386198055566157e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|