diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,72027 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016666666666666666, + "grad_norm": 9.4375, + "learning_rate": 8.640000000000002e-06, + "loss": 5.2912, + "loss/crossentropy": 2.164160817861557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059289701282978, + "step": 2 + }, + { + "epoch": 0.0003333333333333333, + "grad_norm": 7.90625, + "learning_rate": 9.280000000000001e-06, + "loss": 4.7345, + "loss/crossentropy": 1.9222038090229034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22200099378824234, + "step": 4 + }, + { + "epoch": 0.0005, + "grad_norm": 6.625, + "learning_rate": 9.920000000000002e-06, + "loss": 5.159, + "loss/crossentropy": 2.4256778359413147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173849642276764, + "step": 6 + }, + { + "epoch": 0.0006666666666666666, + "grad_norm": 6.6875, + "learning_rate": 1.056e-05, + "loss": 4.2586, + "loss/crossentropy": 1.0981817543506622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15661142766475677, + "step": 8 + }, + { + "epoch": 0.0008333333333333334, + "grad_norm": 5.3125, + "learning_rate": 1.1200000000000001e-05, + "loss": 4.9503, + "loss/crossentropy": 2.302097499370575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299620732665062, + "step": 10 + }, + { + "epoch": 0.001, + "grad_norm": 5.34375, + "learning_rate": 1.1840000000000002e-05, + "loss": 5.5701, + "loss/crossentropy": 1.7413269132375717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19975881464779377, + "step": 12 + }, + { + "epoch": 0.0011666666666666668, + "grad_norm": 5.4375, + "learning_rate": 1.2480000000000002e-05, + "loss": 5.0073, + "loss/crossentropy": 1.2278007790446281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15528732910752296, + "step": 14 + }, + { + "epoch": 0.0013333333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 2.45025634765625, + "learning_rate": 1.3120000000000001e-05, + "loss": 5.1973, + "loss/crossentropy": 2.5199625492095947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22803819924592972, + "step": 16 + }, + { + "epoch": 0.0015, + "grad_norm": 5.0, + "grad_norm_var": 0.64693603515625, + "learning_rate": 1.376e-05, + "loss": 4.7183, + "loss/crossentropy": 2.4793767035007477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310861274600029, + "step": 18 + }, + { + "epoch": 0.0016666666666666668, + "grad_norm": 5.34375, + "grad_norm_var": 0.34068603515625, + "learning_rate": 1.4400000000000003e-05, + "loss": 4.5878, + "loss/crossentropy": 1.9572802186012268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875070109963417, + "step": 20 + }, + { + "epoch": 0.0018333333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.20558268229166668, + "learning_rate": 1.5040000000000002e-05, + "loss": 5.5266, + "loss/crossentropy": 1.6191904172301292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16165770776569843, + "step": 22 + }, + { + "epoch": 0.002, + "grad_norm": 5.875, + "grad_norm_var": 0.10188395182291667, + "learning_rate": 1.5680000000000002e-05, + "loss": 5.32, + "loss/crossentropy": 2.563029944896698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23509466275572777, + "step": 24 + }, + { + "epoch": 0.0021666666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.10467122395833334, + "learning_rate": 1.6320000000000003e-05, + "loss": 4.6407, + "loss/crossentropy": 1.9466444551944733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470014959573746, + "step": 26 + }, + { + "epoch": 0.0023333333333333335, + "grad_norm": 5.25, + "grad_norm_var": 0.03609619140625, + "learning_rate": 1.6960000000000004e-05, + "loss": 4.7149, + "loss/crossentropy": 1.9283565133810043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20196671783924103, + "step": 28 + }, + { + "epoch": 0.0025, + "grad_norm": 5.34375, + "grad_norm_var": 0.03609619140625, + "learning_rate": 1.76e-05, + "loss": 5.4057, + "loss/crossentropy": 1.890766903758049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21231163665652275, + "step": 30 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 6.0, + "grad_norm_var": 0.0775390625, + "learning_rate": 1.824e-05, + "loss": 4.5287, + "loss/crossentropy": 2.2417571544647217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24475030601024628, + "step": 32 + }, + { + "epoch": 0.0028333333333333335, + "grad_norm": 5.25, + "grad_norm_var": 0.07095947265625, + "learning_rate": 1.8880000000000002e-05, + "loss": 5.5623, + "loss/crossentropy": 1.8421208187937737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875022854655981, + "step": 34 + }, + { + "epoch": 0.003, + "grad_norm": 5.1875, + "grad_norm_var": 0.07320556640625, + "learning_rate": 1.9520000000000003e-05, + "loss": 5.5626, + "loss/crossentropy": 2.560234487056732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24960926175117493, + "step": 36 + }, + { + "epoch": 0.0031666666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.06907552083333333, + "learning_rate": 2.016e-05, + "loss": 5.2685, + "loss/crossentropy": 2.3100323379039764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332722283899784, + "step": 38 + }, + { + "epoch": 0.0033333333333333335, + "grad_norm": 5.28125, + "grad_norm_var": 0.0484375, + "learning_rate": 2.08e-05, + "loss": 5.0457, + "loss/crossentropy": 1.8883708715438843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20036867633461952, + "step": 40 + }, + { + "epoch": 0.0035, + "grad_norm": 5.5625, + "grad_norm_var": 0.052994791666666666, + "learning_rate": 2.144e-05, + "loss": 5.085, + "loss/crossentropy": 1.181441307067871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15967545099556446, + "step": 42 + }, + { + "epoch": 0.0036666666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.052587890625, + "learning_rate": 2.2080000000000002e-05, + "loss": 4.9071, + "loss/crossentropy": 2.132170617580414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24000748619437218, + "step": 44 + }, + { + "epoch": 0.003833333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.06109619140625, + "learning_rate": 2.2720000000000003e-05, + "loss": 5.0678, + "loss/crossentropy": 2.0978946685791016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20172996446490288, + "step": 46 + }, + { + "epoch": 0.004, + "grad_norm": 6.59375, + "grad_norm_var": 0.12342122395833334, + "learning_rate": 2.3360000000000003e-05, + "loss": 5.212, + "loss/crossentropy": 2.2711612582206726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26810943335294724, + "step": 48 + }, + { + "epoch": 0.004166666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.12473958333333333, + "learning_rate": 2.4000000000000004e-05, + "loss": 4.8046, + "loss/crossentropy": 1.5270142555236816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906202748417854, + "step": 50 + }, + { + "epoch": 0.004333333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.15995686848958332, + "learning_rate": 2.4640000000000005e-05, + "loss": 4.8139, + "loss/crossentropy": 2.6848429441452026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2273760698735714, + "step": 52 + }, + { + "epoch": 0.0045, + "grad_norm": 5.40625, + "grad_norm_var": 0.16112874348958334, + "learning_rate": 2.5280000000000005e-05, + "loss": 5.3508, + "loss/crossentropy": 2.5355905294418335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24536684900522232, + "step": 54 + }, + { + "epoch": 0.004666666666666667, + "grad_norm": 5.75, + "grad_norm_var": 0.163671875, + "learning_rate": 2.5920000000000006e-05, + "loss": 5.5182, + "loss/crossentropy": 2.480812221765518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21574737504124641, + "step": 56 + }, + { + "epoch": 0.004833333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.17935791015625, + "learning_rate": 2.656e-05, + "loss": 4.919, + "loss/crossentropy": 1.4815584272146225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1718614138662815, + "step": 58 + }, + { + "epoch": 0.005, + "grad_norm": 5.3125, + "grad_norm_var": 0.194921875, + "learning_rate": 2.72e-05, + "loss": 4.1787, + "loss/crossentropy": 0.5379917472600937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11368012242019176, + "step": 60 + }, + { + "epoch": 0.005166666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.18131103515625, + "learning_rate": 2.784e-05, + "loss": 4.7688, + "loss/crossentropy": 2.2010596245527267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20431457087397575, + "step": 62 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 6.0, + "grad_norm_var": 0.135009765625, + "learning_rate": 2.8480000000000002e-05, + "loss": 5.4329, + "loss/crossentropy": 2.047866404056549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21845614910125732, + "step": 64 + }, + { + "epoch": 0.0055, + "grad_norm": 6.28125, + "grad_norm_var": 0.17209879557291666, + "learning_rate": 2.9120000000000002e-05, + "loss": 4.8769, + "loss/crossentropy": 2.302406132221222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2274726703763008, + "step": 66 + }, + { + "epoch": 0.005666666666666667, + "grad_norm": 6.0625, + "grad_norm_var": 0.15818684895833332, + "learning_rate": 2.9760000000000003e-05, + "loss": 5.4183, + "loss/crossentropy": 2.040872871875763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20815075933933258, + "step": 68 + }, + { + "epoch": 0.005833333333333334, + "grad_norm": 5.71875, + "grad_norm_var": 0.15071614583333334, + "learning_rate": 3.0400000000000004e-05, + "loss": 5.3612, + "loss/crossentropy": 1.6699720919132233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23643635585904121, + "step": 70 + }, + { + "epoch": 0.006, + "grad_norm": 5.46875, + "grad_norm_var": 0.14836832682291667, + "learning_rate": 3.104e-05, + "loss": 4.6081, + "loss/crossentropy": 2.111388862133026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21453475579619408, + "step": 72 + }, + { + "epoch": 0.006166666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.13775634765625, + "learning_rate": 3.168e-05, + "loss": 5.28, + "loss/crossentropy": 2.7730491161346436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24475040659308434, + "step": 74 + }, + { + "epoch": 0.006333333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.14843343098958334, + "learning_rate": 3.232e-05, + "loss": 4.9434, + "loss/crossentropy": 1.8236006125807762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18496105633676052, + "step": 76 + }, + { + "epoch": 0.0065, + "grad_norm": 5.46875, + "grad_norm_var": 0.14843343098958334, + "learning_rate": 3.296e-05, + "loss": 4.8855, + "loss/crossentropy": 2.105473317205906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216831523925066, + "step": 78 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.12511393229166667, + "learning_rate": 3.3600000000000004e-05, + "loss": 4.8352, + "loss/crossentropy": 2.1026684939861298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20776227116584778, + "step": 80 + }, + { + "epoch": 0.006833333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.10536702473958333, + "learning_rate": 3.4240000000000004e-05, + "loss": 4.8734, + "loss/crossentropy": 1.8813765197992325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019767053425312, + "step": 82 + }, + { + "epoch": 0.007, + "grad_norm": 5.21875, + "grad_norm_var": 0.08444010416666667, + "learning_rate": 3.4880000000000005e-05, + "loss": 4.9091, + "loss/crossentropy": 1.3020039498806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16160101629793644, + "step": 84 + }, + { + "epoch": 0.007166666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.07952067057291666, + "learning_rate": 3.5520000000000006e-05, + "loss": 5.2581, + "loss/crossentropy": 1.912569299340248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463180169463158, + "step": 86 + }, + { + "epoch": 0.007333333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.07932535807291667, + "learning_rate": 3.6160000000000006e-05, + "loss": 5.2699, + "loss/crossentropy": 2.6311103105545044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24578388035297394, + "step": 88 + }, + { + "epoch": 0.0075, + "grad_norm": 6.21875, + "grad_norm_var": 0.11334228515625, + "learning_rate": 3.680000000000001e-05, + "loss": 4.6852, + "loss/crossentropy": 2.1940360069274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21145440265536308, + "step": 90 + }, + { + "epoch": 0.007666666666666666, + "grad_norm": 6.03125, + "grad_norm_var": 0.0927734375, + "learning_rate": 3.744000000000001e-05, + "loss": 5.1184, + "loss/crossentropy": 1.5804511904716492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18777066841721535, + "step": 92 + }, + { + "epoch": 0.007833333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.11366780598958333, + "learning_rate": 3.808e-05, + "loss": 4.5045, + "loss/crossentropy": 1.8188975527882576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19868333637714386, + "step": 94 + }, + { + "epoch": 0.008, + "grad_norm": 5.59375, + "grad_norm_var": 0.10859375, + "learning_rate": 3.872e-05, + "loss": 5.5254, + "loss/crossentropy": 2.3987780809402466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2095976211130619, + "step": 96 + }, + { + "epoch": 0.008166666666666666, + "grad_norm": 6.21875, + "grad_norm_var": 0.10833333333333334, + "learning_rate": 3.936e-05, + "loss": 5.129, + "loss/crossentropy": 2.186008095741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24592407420277596, + "step": 98 + }, + { + "epoch": 0.008333333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.09739583333333333, + "learning_rate": 4e-05, + "loss": 4.9801, + "loss/crossentropy": 2.0446798354387283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20239055342972279, + "step": 100 + }, + { + "epoch": 0.0085, + "grad_norm": 5.40625, + "grad_norm_var": 0.09557291666666666, + "learning_rate": 4e-05, + "loss": 5.3834, + "loss/crossentropy": 2.29119148850441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22458446770906448, + "step": 102 + }, + { + "epoch": 0.008666666666666666, + "grad_norm": 5.8125, + "grad_norm_var": 0.08826497395833334, + "learning_rate": 4e-05, + "loss": 4.7358, + "loss/crossentropy": 2.1947161257267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2436535656452179, + "step": 104 + }, + { + "epoch": 0.008833333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.08043212890625, + "learning_rate": 4e-05, + "loss": 5.1568, + "loss/crossentropy": 2.04066064953804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21565158292651176, + "step": 106 + }, + { + "epoch": 0.009, + "grad_norm": 5.84375, + "grad_norm_var": 0.06982014973958334, + "learning_rate": 4e-05, + "loss": 4.875, + "loss/crossentropy": 1.8622316792607307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19797790050506592, + "step": 108 + }, + { + "epoch": 0.009166666666666667, + "grad_norm": 5.9375, + "grad_norm_var": 0.05045166015625, + "learning_rate": 4e-05, + "loss": 5.05, + "loss/crossentropy": 1.489914320409298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17602229118347168, + "step": 110 + }, + { + "epoch": 0.009333333333333334, + "grad_norm": 5.71875, + "grad_norm_var": 0.16669514973958333, + "learning_rate": 4e-05, + "loss": 5.4823, + "loss/crossentropy": 1.7484403923153877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20575151592493057, + "step": 112 + }, + { + "epoch": 0.0095, + "grad_norm": 5.96875, + "grad_norm_var": 0.15703125, + "learning_rate": 4e-05, + "loss": 5.0073, + "loss/crossentropy": 1.7794604748487473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22764279693365097, + "step": 114 + }, + { + "epoch": 0.009666666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.14993082682291667, + "learning_rate": 4e-05, + "loss": 5.352, + "loss/crossentropy": 2.6334983110427856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23335204645991325, + "step": 116 + }, + { + "epoch": 0.009833333333333333, + "grad_norm": 5.6875, + "grad_norm_var": 0.14498291015625, + "learning_rate": 4e-05, + "loss": 5.3831, + "loss/crossentropy": 1.6918310597538948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19874560460448265, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 5.71875, + "grad_norm_var": 0.1791015625, + "learning_rate": 4e-05, + "loss": 5.0705, + "loss/crossentropy": 2.277990937232971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2351701594889164, + "step": 120 + }, + { + "epoch": 0.010166666666666666, + "grad_norm": 5.6875, + "grad_norm_var": 0.19599202473958333, + "learning_rate": 4e-05, + "loss": 4.8238, + "loss/crossentropy": 1.9308300465345383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19367161393165588, + "step": 122 + }, + { + "epoch": 0.010333333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.19933268229166667, + "learning_rate": 4e-05, + "loss": 5.1309, + "loss/crossentropy": 1.2643241733312607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15139093436300755, + "step": 124 + }, + { + "epoch": 0.0105, + "grad_norm": 5.8125, + "grad_norm_var": 0.19879150390625, + "learning_rate": 4e-05, + "loss": 5.0878, + "loss/crossentropy": 1.5644885823130608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17594983614981174, + "step": 126 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.08401285807291667, + "learning_rate": 4e-05, + "loss": 4.9866, + "loss/crossentropy": 2.0537383928894997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19487846456468105, + "step": 128 + }, + { + "epoch": 0.010833333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.07447509765625, + "learning_rate": 4e-05, + "loss": 5.0095, + "loss/crossentropy": 1.8626472651958466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333734817802906, + "step": 130 + }, + { + "epoch": 0.011, + "grad_norm": 5.625, + "grad_norm_var": 0.06751302083333334, + "learning_rate": 4e-05, + "loss": 5.305, + "loss/crossentropy": 1.4333342388272285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17128831893205643, + "step": 132 + }, + { + "epoch": 0.011166666666666667, + "grad_norm": 6.625, + "grad_norm_var": 0.14511311848958333, + "learning_rate": 4e-05, + "loss": 4.8194, + "loss/crossentropy": 1.4352454990148544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869527231901884, + "step": 134 + }, + { + "epoch": 0.011333333333333334, + "grad_norm": 5.46875, + "grad_norm_var": 0.12706705729166667, + "learning_rate": 4e-05, + "loss": 5.1539, + "loss/crossentropy": 2.2443730235099792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240537665784359, + "step": 136 + }, + { + "epoch": 0.0115, + "grad_norm": 5.21875, + "grad_norm_var": 0.12381184895833333, + "learning_rate": 4e-05, + "loss": 4.7903, + "loss/crossentropy": 2.265403002500534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24458804354071617, + "step": 138 + }, + { + "epoch": 0.011666666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.12317708333333334, + "learning_rate": 4e-05, + "loss": 5.3068, + "loss/crossentropy": 1.260722041130066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1643795594573021, + "step": 140 + }, + { + "epoch": 0.011833333333333333, + "grad_norm": 6.1875, + "grad_norm_var": 0.14358317057291667, + "learning_rate": 4e-05, + "loss": 5.394, + "loss/crossentropy": 2.1383322402834892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20691991224884987, + "step": 142 + }, + { + "epoch": 0.012, + "grad_norm": 5.75, + "grad_norm_var": 0.14034830729166667, + "learning_rate": 4e-05, + "loss": 5.1448, + "loss/crossentropy": 2.40448135137558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21756655722856522, + "step": 144 + }, + { + "epoch": 0.012166666666666666, + "grad_norm": 5.8125, + "grad_norm_var": 0.15500895182291666, + "learning_rate": 4e-05, + "loss": 5.3001, + "loss/crossentropy": 1.8169787228107452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20706401392817497, + "step": 146 + }, + { + "epoch": 0.012333333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.15266520182291668, + "learning_rate": 4e-05, + "loss": 5.2623, + "loss/crossentropy": 1.8481503129005432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19028427824378014, + "step": 148 + }, + { + "epoch": 0.0125, + "grad_norm": 5.53125, + "grad_norm_var": 0.08918863932291667, + "learning_rate": 4e-05, + "loss": 4.7276, + "loss/crossentropy": 1.4998832270503044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18570281565189362, + "step": 150 + }, + { + "epoch": 0.012666666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.09029947916666667, + "learning_rate": 4e-05, + "loss": 4.9088, + "loss/crossentropy": 2.361013948917389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21920835599303246, + "step": 152 + }, + { + "epoch": 0.012833333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.0796875, + "learning_rate": 4e-05, + "loss": 5.4666, + "loss/crossentropy": 2.4189918637275696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187294214963913, + "step": 154 + }, + { + "epoch": 0.013, + "grad_norm": 5.53125, + "grad_norm_var": 0.07991129557291667, + "learning_rate": 4e-05, + "loss": 4.9728, + "loss/crossentropy": 1.3893551230430603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17549285665154457, + "step": 156 + }, + { + "epoch": 0.013166666666666667, + "grad_norm": 5.78125, + "grad_norm_var": 0.05859375, + "learning_rate": 4e-05, + "loss": 4.6976, + "loss/crossentropy": 1.219208374619484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18519950285553932, + "step": 158 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 6.25, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 4e-05, + "loss": 5.2135, + "loss/crossentropy": 2.5400354266166687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2633819431066513, + "step": 160 + }, + { + "epoch": 0.0135, + "grad_norm": 5.8125, + "grad_norm_var": 0.08655192057291666, + "learning_rate": 4e-05, + "loss": 5.1126, + "loss/crossentropy": 2.380533277988434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545565590262413, + "step": 162 + }, + { + "epoch": 0.013666666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.09368082682291666, + "learning_rate": 4e-05, + "loss": 5.279, + "loss/crossentropy": 2.279165208339691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159963957965374, + "step": 164 + }, + { + "epoch": 0.013833333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.106103515625, + "learning_rate": 4e-05, + "loss": 4.4199, + "loss/crossentropy": 1.5300931632518768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16407620534300804, + "step": 166 + }, + { + "epoch": 0.014, + "grad_norm": 5.1875, + "grad_norm_var": 0.137744140625, + "learning_rate": 4e-05, + "loss": 4.6505, + "loss/crossentropy": 1.1790905147790909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16002923622727394, + "step": 168 + }, + { + "epoch": 0.014166666666666666, + "grad_norm": 5.6875, + "grad_norm_var": 0.14759114583333333, + "learning_rate": 4e-05, + "loss": 5.5463, + "loss/crossentropy": 2.186621367931366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2605951316654682, + "step": 170 + }, + { + "epoch": 0.014333333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.15513916015625, + "learning_rate": 4e-05, + "loss": 5.2762, + "loss/crossentropy": 2.4367510974407196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221878033131361, + "step": 172 + }, + { + "epoch": 0.0145, + "grad_norm": 5.8125, + "grad_norm_var": 0.16365559895833334, + "learning_rate": 4e-05, + "loss": 5.2094, + "loss/crossentropy": 1.7971658408641815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21865063533186913, + "step": 174 + }, + { + "epoch": 0.014666666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.15614827473958334, + "learning_rate": 4e-05, + "loss": 4.3297, + "loss/crossentropy": 2.00938368588686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059515416622162, + "step": 176 + }, + { + "epoch": 0.014833333333333334, + "grad_norm": 5.5, + "grad_norm_var": 0.12600504557291667, + "learning_rate": 4e-05, + "loss": 5.7069, + "loss/crossentropy": 2.2138592898845673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146594636142254, + "step": 178 + }, + { + "epoch": 0.015, + "grad_norm": 5.6875, + "grad_norm_var": 0.13190104166666666, + "learning_rate": 4e-05, + "loss": 5.2236, + "loss/crossentropy": 2.3264683187007904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24893302470445633, + "step": 180 + }, + { + "epoch": 0.015166666666666667, + "grad_norm": 7.65625, + "grad_norm_var": 0.41073811848958336, + "learning_rate": 4e-05, + "loss": 5.0168, + "loss/crossentropy": 2.421372711658478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2704160064458847, + "step": 182 + }, + { + "epoch": 0.015333333333333332, + "grad_norm": 5.4375, + "grad_norm_var": 0.370556640625, + "learning_rate": 4e-05, + "loss": 4.8129, + "loss/crossentropy": 2.6394213438034058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23327884078025818, + "step": 184 + }, + { + "epoch": 0.0155, + "grad_norm": 6.03125, + "grad_norm_var": 0.37688802083333334, + "learning_rate": 4e-05, + "loss": 4.9848, + "loss/crossentropy": 2.0492628812789917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2619275823235512, + "step": 186 + }, + { + "epoch": 0.015666666666666666, + "grad_norm": 5.75, + "grad_norm_var": 0.36477457682291664, + "learning_rate": 4e-05, + "loss": 5.2399, + "loss/crossentropy": 2.671754002571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431931532919407, + "step": 188 + }, + { + "epoch": 0.015833333333333335, + "grad_norm": 5.65625, + "grad_norm_var": 0.36471354166666664, + "learning_rate": 4e-05, + "loss": 5.4273, + "loss/crossentropy": 1.8667291477322578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19933542981743813, + "step": 190 + }, + { + "epoch": 0.016, + "grad_norm": 5.59375, + "grad_norm_var": 0.3186848958333333, + "learning_rate": 4e-05, + "loss": 5.2553, + "loss/crossentropy": 2.3034614622592926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405945621430874, + "step": 192 + }, + { + "epoch": 0.016166666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.39088134765625, + "learning_rate": 4e-05, + "loss": 4.7129, + "loss/crossentropy": 1.9020505920052528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275901511311531, + "step": 194 + }, + { + "epoch": 0.01633333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.4091796875, + "learning_rate": 4e-05, + "loss": 4.6671, + "loss/crossentropy": 1.9580153226852417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23201943933963776, + "step": 196 + }, + { + "epoch": 0.0165, + "grad_norm": 5.15625, + "grad_norm_var": 0.18052978515625, + "learning_rate": 4e-05, + "loss": 4.7744, + "loss/crossentropy": 2.0856711715459824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20316448248922825, + "step": 198 + }, + { + "epoch": 0.016666666666666666, + "grad_norm": 5.46875, + "grad_norm_var": 0.18019205729166668, + "learning_rate": 4e-05, + "loss": 5.5373, + "loss/crossentropy": 2.463364541530609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22138278931379318, + "step": 200 + }, + { + "epoch": 0.016833333333333332, + "grad_norm": 5.53125, + "grad_norm_var": 0.13873291015625, + "learning_rate": 4e-05, + "loss": 4.8611, + "loss/crossentropy": 2.0712440609931946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275029793381691, + "step": 202 + }, + { + "epoch": 0.017, + "grad_norm": 5.34375, + "grad_norm_var": 0.133837890625, + "learning_rate": 4e-05, + "loss": 5.2617, + "loss/crossentropy": 2.640321433544159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2553870268166065, + "step": 204 + }, + { + "epoch": 0.017166666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.14345296223958334, + "learning_rate": 4e-05, + "loss": 5.1737, + "loss/crossentropy": 2.339095562696457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2486417517066002, + "step": 206 + }, + { + "epoch": 0.017333333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.139306640625, + "learning_rate": 4e-05, + "loss": 5.2609, + "loss/crossentropy": 1.7947577238082886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953553967177868, + "step": 208 + }, + { + "epoch": 0.0175, + "grad_norm": 5.3125, + "grad_norm_var": 0.056538899739583336, + "learning_rate": 4e-05, + "loss": 4.9067, + "loss/crossentropy": 1.9185269623994827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18810669146478176, + "step": 210 + }, + { + "epoch": 0.017666666666666667, + "grad_norm": 5.59375, + "grad_norm_var": 0.05703125, + "learning_rate": 4e-05, + "loss": 5.0059, + "loss/crossentropy": 1.9670357257127762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22664643824100494, + "step": 212 + }, + { + "epoch": 0.017833333333333333, + "grad_norm": 5.6875, + "grad_norm_var": 0.04908447265625, + "learning_rate": 4e-05, + "loss": 5.2811, + "loss/crossentropy": 1.1792488172650337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16042216308414936, + "step": 214 + }, + { + "epoch": 0.018, + "grad_norm": 5.1875, + "grad_norm_var": 0.057535807291666664, + "learning_rate": 4e-05, + "loss": 4.6516, + "loss/crossentropy": 1.189962238073349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15354885905981064, + "step": 216 + }, + { + "epoch": 0.018166666666666668, + "grad_norm": 5.53125, + "grad_norm_var": 0.12376302083333333, + "learning_rate": 4e-05, + "loss": 5.5928, + "loss/crossentropy": 2.4891774654388428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23457545787096024, + "step": 218 + }, + { + "epoch": 0.018333333333333333, + "grad_norm": 5.59375, + "grad_norm_var": 0.11832275390625, + "learning_rate": 4e-05, + "loss": 5.0127, + "loss/crossentropy": 1.6934428215026855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18142644688487053, + "step": 220 + }, + { + "epoch": 0.0185, + "grad_norm": 5.5, + "grad_norm_var": 0.11252848307291667, + "learning_rate": 4e-05, + "loss": 5.3514, + "loss/crossentropy": 2.0071809887886047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19888557493686676, + "step": 222 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 5.40625, + "grad_norm_var": 0.11529947916666666, + "learning_rate": 4e-05, + "loss": 5.2169, + "loss/crossentropy": 2.1441567465662956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21664733067154884, + "step": 224 + }, + { + "epoch": 0.018833333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.10338541666666666, + "learning_rate": 4e-05, + "loss": 5.5327, + "loss/crossentropy": 2.479779541492462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23695838451385498, + "step": 226 + }, + { + "epoch": 0.019, + "grad_norm": 5.5625, + "grad_norm_var": 0.11315104166666666, + "learning_rate": 4e-05, + "loss": 5.1857, + "loss/crossentropy": 1.8668599054217339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20227666944265366, + "step": 228 + }, + { + "epoch": 0.019166666666666665, + "grad_norm": 5.71875, + "grad_norm_var": 0.11767171223958334, + "learning_rate": 4e-05, + "loss": 4.3191, + "loss/crossentropy": 1.2977168932557106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17029542475938797, + "step": 230 + }, + { + "epoch": 0.019333333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.12414957682291666, + "learning_rate": 4e-05, + "loss": 4.5462, + "loss/crossentropy": 1.5032763928174973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17685853876173496, + "step": 232 + }, + { + "epoch": 0.0195, + "grad_norm": 5.53125, + "grad_norm_var": 0.06717122395833333, + "learning_rate": 4e-05, + "loss": 4.9859, + "loss/crossentropy": 1.3485910668969154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15323390066623688, + "step": 234 + }, + { + "epoch": 0.019666666666666666, + "grad_norm": 6.1875, + "grad_norm_var": 0.09693603515625, + "learning_rate": 4e-05, + "loss": 5.187, + "loss/crossentropy": 2.1716194823384285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310977354645729, + "step": 236 + }, + { + "epoch": 0.019833333333333335, + "grad_norm": 5.96875, + "grad_norm_var": 0.10621337890625, + "learning_rate": 4e-05, + "loss": 4.9196, + "loss/crossentropy": 1.9589915871620178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23559781908988953, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 5.21875, + "grad_norm_var": 0.11256103515625, + "learning_rate": 4e-05, + "loss": 5.2263, + "loss/crossentropy": 2.0510232746601105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26163269206881523, + "step": 240 + }, + { + "epoch": 0.020166666666666666, + "grad_norm": 7.0625, + "grad_norm_var": 0.24869384765625, + "learning_rate": 4e-05, + "loss": 4.9284, + "loss/crossentropy": 1.565013274550438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3279726207256317, + "step": 242 + }, + { + "epoch": 0.02033333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.2613118489583333, + "learning_rate": 4e-05, + "loss": 4.9741, + "loss/crossentropy": 1.447442576289177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801074482500553, + "step": 244 + }, + { + "epoch": 0.0205, + "grad_norm": 5.34375, + "grad_norm_var": 0.258056640625, + "learning_rate": 4e-05, + "loss": 5.115, + "loss/crossentropy": 1.7351520657539368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19535227306187153, + "step": 246 + }, + { + "epoch": 0.020666666666666667, + "grad_norm": 5.46875, + "grad_norm_var": 0.22069905598958334, + "learning_rate": 4e-05, + "loss": 5.1927, + "loss/crossentropy": 1.2356021031737328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1455559842288494, + "step": 248 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 5.4375, + "grad_norm_var": 0.22005208333333334, + "learning_rate": 4e-05, + "loss": 5.1384, + "loss/crossentropy": 2.3360126316547394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24712038040161133, + "step": 250 + }, + { + "epoch": 0.021, + "grad_norm": 5.90625, + "grad_norm_var": 0.20193684895833333, + "learning_rate": 4e-05, + "loss": 4.8796, + "loss/crossentropy": 2.037757635116577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274589955806732, + "step": 252 + }, + { + "epoch": 0.021166666666666667, + "grad_norm": 5.75, + "grad_norm_var": 0.21686197916666666, + "learning_rate": 4e-05, + "loss": 5.4491, + "loss/crossentropy": 2.517900228500366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23455920815467834, + "step": 254 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 5.53125, + "grad_norm_var": 0.20636393229166666, + "learning_rate": 4e-05, + "loss": 5.5084, + "loss/crossentropy": 2.3689188957214355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23140762373805046, + "step": 256 + }, + { + "epoch": 0.0215, + "grad_norm": 5.375, + "grad_norm_var": 0.06953125, + "learning_rate": 4e-05, + "loss": 5.1052, + "loss/crossentropy": 2.45695823431015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22622350975871086, + "step": 258 + }, + { + "epoch": 0.021666666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 4e-05, + "loss": 5.1092, + "loss/crossentropy": 2.481264054775238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26096219196915627, + "step": 260 + }, + { + "epoch": 0.021833333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.05701497395833333, + "learning_rate": 4e-05, + "loss": 4.9228, + "loss/crossentropy": 2.266570031642914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023736946284771, + "step": 262 + }, + { + "epoch": 0.022, + "grad_norm": 5.6875, + "grad_norm_var": 0.07498372395833333, + "learning_rate": 4e-05, + "loss": 5.3689, + "loss/crossentropy": 2.311848521232605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21398617699742317, + "step": 264 + }, + { + "epoch": 0.022166666666666668, + "grad_norm": 5.1875, + "grad_norm_var": 0.08192952473958333, + "learning_rate": 4e-05, + "loss": 5.2604, + "loss/crossentropy": 1.5015419125556946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18505272828042507, + "step": 266 + }, + { + "epoch": 0.022333333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.07519124348958334, + "learning_rate": 4e-05, + "loss": 4.8222, + "loss/crossentropy": 1.9860661998391151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171510737389326, + "step": 268 + }, + { + "epoch": 0.0225, + "grad_norm": 5.4375, + "grad_norm_var": 0.05950520833333333, + "learning_rate": 4e-05, + "loss": 5.7417, + "loss/crossentropy": 2.4434638023376465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2453981228172779, + "step": 270 + }, + { + "epoch": 0.02266666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.06282145182291667, + "learning_rate": 4e-05, + "loss": 4.462, + "loss/crossentropy": 1.597841739654541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16011325269937515, + "step": 272 + }, + { + "epoch": 0.022833333333333334, + "grad_norm": 5.5625, + "grad_norm_var": 0.06526285807291667, + "learning_rate": 4e-05, + "loss": 4.8157, + "loss/crossentropy": 1.8127425089478493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005634494125843, + "step": 274 + }, + { + "epoch": 0.023, + "grad_norm": 5.875, + "grad_norm_var": 0.07187093098958333, + "learning_rate": 4e-05, + "loss": 5.1641, + "loss/crossentropy": 2.327672451734543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233283594250679, + "step": 276 + }, + { + "epoch": 0.023166666666666665, + "grad_norm": 5.46875, + "grad_norm_var": 0.06812744140625, + "learning_rate": 4e-05, + "loss": 5.0835, + "loss/crossentropy": 1.4483234286308289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945433549582958, + "step": 278 + }, + { + "epoch": 0.023333333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.07877197265625, + "learning_rate": 4e-05, + "loss": 4.9559, + "loss/crossentropy": 1.4878328368067741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17448855936527252, + "step": 280 + }, + { + "epoch": 0.0235, + "grad_norm": 5.71875, + "grad_norm_var": 0.07849934895833334, + "learning_rate": 4e-05, + "loss": 4.9091, + "loss/crossentropy": 1.396668791770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872088983654976, + "step": 282 + }, + { + "epoch": 0.023666666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.0837890625, + "learning_rate": 4e-05, + "loss": 4.6245, + "loss/crossentropy": 2.4077460169792175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21915850043296814, + "step": 284 + }, + { + "epoch": 0.023833333333333335, + "grad_norm": 5.125, + "grad_norm_var": 0.09215087890625, + "learning_rate": 4e-05, + "loss": 4.1122, + "loss/crossentropy": 1.5891410186886787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18681494891643524, + "step": 286 + }, + { + "epoch": 0.024, + "grad_norm": 5.59375, + "grad_norm_var": 0.087353515625, + "learning_rate": 4e-05, + "loss": 5.0425, + "loss/crossentropy": 2.184404134750366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1966254562139511, + "step": 288 + }, + { + "epoch": 0.024166666666666666, + "grad_norm": 5.8125, + "grad_norm_var": 0.11027018229166667, + "learning_rate": 4e-05, + "loss": 5.1775, + "loss/crossentropy": 2.3604514598846436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175722613930702, + "step": 290 + }, + { + "epoch": 0.024333333333333332, + "grad_norm": 5.5625, + "grad_norm_var": 0.10240478515625, + "learning_rate": 4e-05, + "loss": 5.0769, + "loss/crossentropy": 1.402433268725872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16730003245174885, + "step": 292 + }, + { + "epoch": 0.0245, + "grad_norm": 5.25, + "grad_norm_var": 0.09696858723958333, + "learning_rate": 4e-05, + "loss": 4.9796, + "loss/crossentropy": 1.5468462631106377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18110387213528156, + "step": 294 + }, + { + "epoch": 0.024666666666666667, + "grad_norm": 5.625, + "grad_norm_var": 0.07235921223958333, + "learning_rate": 4e-05, + "loss": 4.6309, + "loss/crossentropy": 2.3594585359096527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23140858113765717, + "step": 296 + }, + { + "epoch": 0.024833333333333332, + "grad_norm": 5.09375, + "grad_norm_var": 0.074072265625, + "learning_rate": 4e-05, + "loss": 5.1319, + "loss/crossentropy": 1.3274840712547302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16749969869852066, + "step": 298 + }, + { + "epoch": 0.025, + "grad_norm": 5.4375, + "grad_norm_var": 0.06261393229166666, + "learning_rate": 4e-05, + "loss": 4.7546, + "loss/crossentropy": 0.9479904547333717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1345222145318985, + "step": 300 + }, + { + "epoch": 0.025166666666666667, + "grad_norm": 5.78125, + "grad_norm_var": 5.926005045572917, + "learning_rate": 4e-05, + "loss": 4.1966, + "loss/crossentropy": 1.8578788191080093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18595384806394577, + "step": 302 + }, + { + "epoch": 0.025333333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 5.941337076822917, + "learning_rate": 4e-05, + "loss": 4.5656, + "loss/crossentropy": 1.192492350935936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15102743916213512, + "step": 304 + }, + { + "epoch": 0.0255, + "grad_norm": 5.21875, + "grad_norm_var": 6.0068359375, + "learning_rate": 4e-05, + "loss": 4.7877, + "loss/crossentropy": 2.530356705188751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265690192580223, + "step": 306 + }, + { + "epoch": 0.025666666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 5.976558430989583, + "learning_rate": 4e-05, + "loss": 5.002, + "loss/crossentropy": 1.917500764131546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19061551988124847, + "step": 308 + }, + { + "epoch": 0.025833333333333333, + "grad_norm": 5.71875, + "grad_norm_var": 5.966988118489583, + "learning_rate": 4e-05, + "loss": 5.3558, + "loss/crossentropy": 2.382882058620453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21255266666412354, + "step": 310 + }, + { + "epoch": 0.026, + "grad_norm": 5.15625, + "grad_norm_var": 6.049019368489583, + "learning_rate": 4e-05, + "loss": 5.0068, + "loss/crossentropy": 2.380540519952774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027641534805298, + "step": 312 + }, + { + "epoch": 0.026166666666666668, + "grad_norm": 5.6875, + "grad_norm_var": 6.014827473958333, + "learning_rate": 4e-05, + "loss": 4.8984, + "loss/crossentropy": 1.7228035554289818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18263637647032738, + "step": 314 + }, + { + "epoch": 0.026333333333333334, + "grad_norm": 5.5625, + "grad_norm_var": 5.969254557291666, + "learning_rate": 4e-05, + "loss": 5.3086, + "loss/crossentropy": 1.3391352742910385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15971971489489079, + "step": 316 + }, + { + "epoch": 0.0265, + "grad_norm": 5.4375, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 4e-05, + "loss": 4.879, + "loss/crossentropy": 2.5128698348999023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24337675794959068, + "step": 318 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.056441243489583334, + "learning_rate": 4e-05, + "loss": 4.6445, + "loss/crossentropy": 2.6085115671157837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2379378005862236, + "step": 320 + }, + { + "epoch": 0.026833333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.07005208333333333, + "learning_rate": 4e-05, + "loss": 5.0695, + "loss/crossentropy": 2.1165069714188576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19540811702609062, + "step": 322 + }, + { + "epoch": 0.027, + "grad_norm": 5.46875, + "grad_norm_var": 0.059228515625, + "learning_rate": 4e-05, + "loss": 5.5129, + "loss/crossentropy": 1.4290212765336037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1564047373831272, + "step": 324 + }, + { + "epoch": 0.027166666666666665, + "grad_norm": 5.78125, + "grad_norm_var": 0.0625, + "learning_rate": 4e-05, + "loss": 4.941, + "loss/crossentropy": 2.273834705352783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24747185036540031, + "step": 326 + }, + { + "epoch": 0.027333333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.05660400390625, + "learning_rate": 4e-05, + "loss": 4.486, + "loss/crossentropy": 2.2860072553157806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299199439585209, + "step": 328 + }, + { + "epoch": 0.0275, + "grad_norm": 5.46875, + "grad_norm_var": 0.057906087239583334, + "learning_rate": 4e-05, + "loss": 5.4171, + "loss/crossentropy": 2.138988643884659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22434020042419434, + "step": 330 + }, + { + "epoch": 0.027666666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.049072265625, + "learning_rate": 4e-05, + "loss": 5.4328, + "loss/crossentropy": 2.5559749603271484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22616703063249588, + "step": 332 + }, + { + "epoch": 0.027833333333333335, + "grad_norm": 5.34375, + "grad_norm_var": 0.04973551432291667, + "learning_rate": 4e-05, + "loss": 5.1085, + "loss/crossentropy": 2.2669193148612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23874261230230331, + "step": 334 + }, + { + "epoch": 0.028, + "grad_norm": 5.65625, + "grad_norm_var": 0.050374348958333336, + "learning_rate": 4e-05, + "loss": 5.1428, + "loss/crossentropy": 1.1794737800955772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15420645847916603, + "step": 336 + }, + { + "epoch": 0.028166666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.04049072265625, + "learning_rate": 4e-05, + "loss": 5.3906, + "loss/crossentropy": 2.6066287755966187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180948294699192, + "step": 338 + }, + { + "epoch": 0.028333333333333332, + "grad_norm": 5.28125, + "grad_norm_var": 0.044140625, + "learning_rate": 4e-05, + "loss": 5.0314, + "loss/crossentropy": 2.4064601063728333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222812995314598, + "step": 340 + }, + { + "epoch": 0.0285, + "grad_norm": 6.28125, + "grad_norm_var": 0.138671875, + "learning_rate": 4e-05, + "loss": 5.3436, + "loss/crossentropy": 1.8959501832723618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23883518390357494, + "step": 342 + }, + { + "epoch": 0.028666666666666667, + "grad_norm": 5.625, + "grad_norm_var": 0.13948160807291668, + "learning_rate": 4e-05, + "loss": 5.1752, + "loss/crossentropy": 1.8244957998394966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19024837389588356, + "step": 344 + }, + { + "epoch": 0.028833333333333332, + "grad_norm": 5.0625, + "grad_norm_var": 0.18053385416666667, + "learning_rate": 4e-05, + "loss": 4.8808, + "loss/crossentropy": 1.635428212583065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17805110290646553, + "step": 346 + }, + { + "epoch": 0.029, + "grad_norm": 5.75, + "grad_norm_var": 0.18841145833333334, + "learning_rate": 4e-05, + "loss": 5.7496, + "loss/crossentropy": 2.2286045253276825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23897172883152962, + "step": 348 + }, + { + "epoch": 0.029166666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.238916015625, + "learning_rate": 4e-05, + "loss": 4.0875, + "loss/crossentropy": 1.5485807359218597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19915905967354774, + "step": 350 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.230078125, + "learning_rate": 4e-05, + "loss": 5.3479, + "loss/crossentropy": 2.485140085220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23857445269823074, + "step": 352 + }, + { + "epoch": 0.0295, + "grad_norm": 5.53125, + "grad_norm_var": 0.225634765625, + "learning_rate": 4e-05, + "loss": 4.8827, + "loss/crossentropy": 2.4410774409770966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23277483880519867, + "step": 354 + }, + { + "epoch": 0.029666666666666668, + "grad_norm": 5.15625, + "grad_norm_var": 0.22734375, + "learning_rate": 4e-05, + "loss": 4.3879, + "loss/crossentropy": 1.7819544896483421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17286515049636364, + "step": 356 + }, + { + "epoch": 0.029833333333333333, + "grad_norm": 5.90625, + "grad_norm_var": 0.12259114583333333, + "learning_rate": 4e-05, + "loss": 5.1247, + "loss/crossentropy": 2.2199259996414185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22783676907420158, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 5.6875, + "grad_norm_var": 0.12567952473958333, + "learning_rate": 4e-05, + "loss": 4.9228, + "loss/crossentropy": 2.2036180198192596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20686358213424683, + "step": 360 + }, + { + "epoch": 0.030166666666666668, + "grad_norm": 5.6875, + "grad_norm_var": 0.11106770833333333, + "learning_rate": 4e-05, + "loss": 4.9717, + "loss/crossentropy": 1.7007370814681053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126607969403267, + "step": 362 + }, + { + "epoch": 0.030333333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.09641520182291667, + "learning_rate": 4e-05, + "loss": 5.1573, + "loss/crossentropy": 1.4196887761354446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16805725172162056, + "step": 364 + }, + { + "epoch": 0.0305, + "grad_norm": 6.625, + "grad_norm_var": 0.16404622395833332, + "learning_rate": 4e-05, + "loss": 5.3914, + "loss/crossentropy": 2.2769704461097717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909231901168823, + "step": 366 + }, + { + "epoch": 0.030666666666666665, + "grad_norm": 5.3125, + "grad_norm_var": 0.167578125, + "learning_rate": 4e-05, + "loss": 5.3227, + "loss/crossentropy": 2.4747599363327026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2311922200024128, + "step": 368 + }, + { + "epoch": 0.030833333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.18045247395833333, + "learning_rate": 4e-05, + "loss": 5.2762, + "loss/crossentropy": 1.7561465799808502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812431439757347, + "step": 370 + }, + { + "epoch": 0.031, + "grad_norm": 5.40625, + "grad_norm_var": 0.20201822916666667, + "learning_rate": 4e-05, + "loss": 4.6249, + "loss/crossentropy": 1.8143546804785728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970304287970066, + "step": 372 + }, + { + "epoch": 0.031166666666666665, + "grad_norm": 5.9375, + "grad_norm_var": 0.20995686848958334, + "learning_rate": 4e-05, + "loss": 4.9844, + "loss/crossentropy": 1.9378879070281982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20872123166918755, + "step": 374 + }, + { + "epoch": 0.03133333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.38843994140625, + "learning_rate": 4e-05, + "loss": 5.0347, + "loss/crossentropy": 1.7512712702155113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19000215269625187, + "step": 376 + }, + { + "epoch": 0.0315, + "grad_norm": 6.46875, + "grad_norm_var": 0.3963826497395833, + "learning_rate": 4e-05, + "loss": 5.8688, + "loss/crossentropy": 1.9144393801689148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19313014298677444, + "step": 378 + }, + { + "epoch": 0.03166666666666667, + "grad_norm": 6.0625, + "grad_norm_var": 0.39254150390625, + "learning_rate": 4e-05, + "loss": 4.7452, + "loss/crossentropy": 1.5328343883156776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17478107661008835, + "step": 380 + }, + { + "epoch": 0.03183333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.35426025390625, + "learning_rate": 4e-05, + "loss": 4.9958, + "loss/crossentropy": 2.0120982453227043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19707869738340378, + "step": 382 + }, + { + "epoch": 0.032, + "grad_norm": 6.125, + "grad_norm_var": 0.34947509765625, + "learning_rate": 4e-05, + "loss": 5.3694, + "loss/crossentropy": 2.1418115496635437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2317424789071083, + "step": 384 + }, + { + "epoch": 0.03216666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.3590494791666667, + "learning_rate": 4e-05, + "loss": 5.0396, + "loss/crossentropy": 2.2628641948103905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19536038115620613, + "step": 386 + }, + { + "epoch": 0.03233333333333333, + "grad_norm": 5.625, + "grad_norm_var": 0.32274983723958334, + "learning_rate": 4e-05, + "loss": 5.1977, + "loss/crossentropy": 2.708618402481079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24159733951091766, + "step": 388 + }, + { + "epoch": 0.0325, + "grad_norm": 5.0, + "grad_norm_var": 0.3513671875, + "learning_rate": 4e-05, + "loss": 5.1688, + "loss/crossentropy": 1.427762784063816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16233957186341286, + "step": 390 + }, + { + "epoch": 0.03266666666666666, + "grad_norm": 5.59375, + "grad_norm_var": 0.15084635416666667, + "learning_rate": 4e-05, + "loss": 5.0311, + "loss/crossentropy": 1.1995328813791275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16477875411510468, + "step": 392 + }, + { + "epoch": 0.03283333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.09377848307291667, + "learning_rate": 4e-05, + "loss": 5.1179, + "loss/crossentropy": 1.8344684839248657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19066274911165237, + "step": 394 + }, + { + "epoch": 0.033, + "grad_norm": 5.53125, + "grad_norm_var": 0.075634765625, + "learning_rate": 4e-05, + "loss": 5.0157, + "loss/crossentropy": 2.1210782676935196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18409163504838943, + "step": 396 + }, + { + "epoch": 0.033166666666666664, + "grad_norm": 5.90625, + "grad_norm_var": 0.08240559895833334, + "learning_rate": 4e-05, + "loss": 4.8918, + "loss/crossentropy": 2.5905413031578064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24663139507174492, + "step": 398 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 5.96875, + "grad_norm_var": 0.08899332682291666, + "learning_rate": 4e-05, + "loss": 4.8675, + "loss/crossentropy": 1.6443525850772858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19166382029652596, + "step": 400 + }, + { + "epoch": 0.0335, + "grad_norm": 5.875, + "grad_norm_var": 0.09504801432291667, + "learning_rate": 4e-05, + "loss": 5.0497, + "loss/crossentropy": 1.82669086009264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19115986675024033, + "step": 402 + }, + { + "epoch": 0.033666666666666664, + "grad_norm": 5.34375, + "grad_norm_var": 0.09576822916666666, + "learning_rate": 4e-05, + "loss": 5.2452, + "loss/crossentropy": 2.424153983592987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266378439962864, + "step": 404 + }, + { + "epoch": 0.03383333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.08782145182291666, + "learning_rate": 4e-05, + "loss": 5.556, + "loss/crossentropy": 1.9700486361980438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585943385958672, + "step": 406 + }, + { + "epoch": 0.034, + "grad_norm": 5.625, + "grad_norm_var": 0.095166015625, + "learning_rate": 4e-05, + "loss": 5.3139, + "loss/crossentropy": 2.2227725982666016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445167973637581, + "step": 408 + }, + { + "epoch": 0.034166666666666665, + "grad_norm": 5.0625, + "grad_norm_var": 0.11912434895833333, + "learning_rate": 4e-05, + "loss": 5.1151, + "loss/crossentropy": 2.4945799708366394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22168324142694473, + "step": 410 + }, + { + "epoch": 0.034333333333333334, + "grad_norm": 5.5625, + "grad_norm_var": 0.12263997395833333, + "learning_rate": 4e-05, + "loss": 4.6133, + "loss/crossentropy": 2.288731187582016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22380968183279037, + "step": 412 + }, + { + "epoch": 0.0345, + "grad_norm": 5.0625, + "grad_norm_var": 0.12537434895833333, + "learning_rate": 4e-05, + "loss": 5.1334, + "loss/crossentropy": 2.1317990124225616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19617579877376556, + "step": 414 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 13.3125, + "grad_norm_var": 3.95738525390625, + "learning_rate": 4e-05, + "loss": 4.456, + "loss/crossentropy": 1.048334889113903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13727323338389397, + "step": 416 + }, + { + "epoch": 0.034833333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 3.9932291666666666, + "learning_rate": 4e-05, + "loss": 5.0826, + "loss/crossentropy": 1.9634416326880455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20289554446935654, + "step": 418 + }, + { + "epoch": 0.035, + "grad_norm": 5.0625, + "grad_norm_var": 4.047330729166666, + "learning_rate": 4e-05, + "loss": 4.7622, + "loss/crossentropy": 1.7749098986387253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1768258735537529, + "step": 420 + }, + { + "epoch": 0.035166666666666666, + "grad_norm": 5.59375, + "grad_norm_var": 4.063570149739584, + "learning_rate": 4e-05, + "loss": 5.2613, + "loss/crossentropy": 2.7100062370300293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460428662598133, + "step": 422 + }, + { + "epoch": 0.035333333333333335, + "grad_norm": 5.375, + "grad_norm_var": 4.056966145833333, + "learning_rate": 4e-05, + "loss": 4.2625, + "loss/crossentropy": 2.2796683609485626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166573479771614, + "step": 424 + }, + { + "epoch": 0.0355, + "grad_norm": 41.0, + "grad_norm_var": 80.93147379557291, + "learning_rate": 4e-05, + "loss": 4.7386, + "loss/crossentropy": 2.4747623205184937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24762186035513878, + "step": 426 + }, + { + "epoch": 0.035666666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 81.11119384765625, + "learning_rate": 4e-05, + "loss": 4.984, + "loss/crossentropy": 2.4123693108558655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23126976937055588, + "step": 428 + }, + { + "epoch": 0.035833333333333335, + "grad_norm": 6.4375, + "grad_norm_var": 80.71106770833333, + "learning_rate": 4e-05, + "loss": 5.3447, + "loss/crossentropy": 2.5928608775138855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24646497890353203, + "step": 430 + }, + { + "epoch": 0.036, + "grad_norm": 5.40625, + "grad_norm_var": 79.07750244140625, + "learning_rate": 4e-05, + "loss": 4.6058, + "loss/crossentropy": 1.2497084438800812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15603481978178024, + "step": 432 + }, + { + "epoch": 0.036166666666666666, + "grad_norm": 5.375, + "grad_norm_var": 79.04388020833333, + "learning_rate": 4e-05, + "loss": 4.5617, + "loss/crossentropy": 2.0195882841944695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020249329507351, + "step": 434 + }, + { + "epoch": 0.036333333333333336, + "grad_norm": 5.4375, + "grad_norm_var": 78.90089518229166, + "learning_rate": 4e-05, + "loss": 5.3138, + "loss/crossentropy": 2.254369556903839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22006989270448685, + "step": 436 + }, + { + "epoch": 0.0365, + "grad_norm": 7.65625, + "grad_norm_var": 78.45159098307292, + "learning_rate": 4e-05, + "loss": 4.723, + "loss/crossentropy": 1.7706375047564507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2226849813014269, + "step": 438 + }, + { + "epoch": 0.03666666666666667, + "grad_norm": 6.8125, + "grad_norm_var": 77.962744140625, + "learning_rate": 4e-05, + "loss": 5.7765, + "loss/crossentropy": 2.3849419355392456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24738119542598724, + "step": 440 + }, + { + "epoch": 0.036833333333333336, + "grad_norm": 4.8125, + "grad_norm_var": 0.5287760416666667, + "learning_rate": 4e-05, + "loss": 4.8364, + "loss/crossentropy": 1.9102841913700104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137163415551186, + "step": 442 + }, + { + "epoch": 0.037, + "grad_norm": 5.75, + "grad_norm_var": 0.48355712890625, + "learning_rate": 4e-05, + "loss": 5.4136, + "loss/crossentropy": 1.9578236639499664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841808743774891, + "step": 444 + }, + { + "epoch": 0.03716666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.46092122395833335, + "learning_rate": 4e-05, + "loss": 4.6981, + "loss/crossentropy": 1.9191040992736816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19365688413381577, + "step": 446 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 5.15625, + "grad_norm_var": 0.46985270182291666, + "learning_rate": 4e-05, + "loss": 5.2009, + "loss/crossentropy": 2.8165441155433655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253844402730465, + "step": 448 + }, + { + "epoch": 0.0375, + "grad_norm": 5.15625, + "grad_norm_var": 0.4892578125, + "learning_rate": 4e-05, + "loss": 4.6916, + "loss/crossentropy": 1.6579081416130066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2267564721405506, + "step": 450 + }, + { + "epoch": 0.03766666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.505322265625, + "learning_rate": 4e-05, + "loss": 4.8117, + "loss/crossentropy": 1.8108457028865814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969534195959568, + "step": 452 + }, + { + "epoch": 0.03783333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.21812744140625, + "learning_rate": 4e-05, + "loss": 4.9635, + "loss/crossentropy": 1.8094572871923447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18736277520656586, + "step": 454 + }, + { + "epoch": 0.038, + "grad_norm": 5.875, + "grad_norm_var": 0.10299479166666667, + "learning_rate": 4e-05, + "loss": 5.2644, + "loss/crossentropy": 2.125039577484131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20303135737776756, + "step": 456 + }, + { + "epoch": 0.03816666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.07994791666666666, + "learning_rate": 4e-05, + "loss": 5.0687, + "loss/crossentropy": 1.938138335943222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19551260024309158, + "step": 458 + }, + { + "epoch": 0.03833333333333333, + "grad_norm": 5.65625, + "grad_norm_var": 0.07825113932291666, + "learning_rate": 4e-05, + "loss": 5.5359, + "loss/crossentropy": 1.8810575380921364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603547476232052, + "step": 460 + }, + { + "epoch": 0.0385, + "grad_norm": 6.1875, + "grad_norm_var": 0.11744791666666667, + "learning_rate": 4e-05, + "loss": 4.9456, + "loss/crossentropy": 2.3643300533294678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21903591975569725, + "step": 462 + }, + { + "epoch": 0.03866666666666667, + "grad_norm": 5.59375, + "grad_norm_var": 0.13967692057291667, + "learning_rate": 4e-05, + "loss": 4.5044, + "loss/crossentropy": 2.0795028433203697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20525045320391655, + "step": 464 + }, + { + "epoch": 0.03883333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.14654541015625, + "learning_rate": 4e-05, + "loss": 5.2285, + "loss/crossentropy": 2.670228064060211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22724847868084908, + "step": 466 + }, + { + "epoch": 0.039, + "grad_norm": 5.375, + "grad_norm_var": 0.13710530598958334, + "learning_rate": 4e-05, + "loss": 5.0264, + "loss/crossentropy": 1.9535819217562675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19420474022626877, + "step": 468 + }, + { + "epoch": 0.03916666666666667, + "grad_norm": 5.75, + "grad_norm_var": 0.12994791666666666, + "learning_rate": 4e-05, + "loss": 5.1026, + "loss/crossentropy": 2.0033904761075974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19617502018809319, + "step": 470 + }, + { + "epoch": 0.03933333333333333, + "grad_norm": 6.1875, + "grad_norm_var": 0.13538004557291666, + "learning_rate": 4e-05, + "loss": 4.9723, + "loss/crossentropy": 2.530248761177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24241825193166733, + "step": 472 + }, + { + "epoch": 0.0395, + "grad_norm": 5.125, + "grad_norm_var": 0.14451497395833332, + "learning_rate": 4e-05, + "loss": 4.6632, + "loss/crossentropy": 1.8372912853956223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19772254303097725, + "step": 474 + }, + { + "epoch": 0.03966666666666667, + "grad_norm": 5.46875, + "grad_norm_var": 0.16470947265625, + "learning_rate": 4e-05, + "loss": 5.2215, + "loss/crossentropy": 1.9594649076461792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21060075610876083, + "step": 476 + }, + { + "epoch": 0.03983333333333333, + "grad_norm": 5.625, + "grad_norm_var": 0.12909749348958333, + "learning_rate": 4e-05, + "loss": 5.2237, + "loss/crossentropy": 1.7402563989162445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18676879815757275, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 5.0625, + "grad_norm_var": 0.111962890625, + "learning_rate": 4e-05, + "loss": 4.852, + "loss/crossentropy": 2.213899254798889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23049471899867058, + "step": 480 + }, + { + "epoch": 0.04016666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.11487223307291666, + "learning_rate": 4e-05, + "loss": 4.6514, + "loss/crossentropy": 1.3909804075956345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1692242305725813, + "step": 482 + }, + { + "epoch": 0.04033333333333333, + "grad_norm": 5.84375, + "grad_norm_var": 0.12057291666666667, + "learning_rate": 4e-05, + "loss": 5.5403, + "loss/crossentropy": 2.5903589129447937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23884601891040802, + "step": 484 + }, + { + "epoch": 0.0405, + "grad_norm": 5.4375, + "grad_norm_var": 0.12375895182291667, + "learning_rate": 4e-05, + "loss": 5.015, + "loss/crossentropy": 2.107970714569092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21329589560627937, + "step": 486 + }, + { + "epoch": 0.04066666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.093359375, + "learning_rate": 4e-05, + "loss": 5.075, + "loss/crossentropy": 1.8551287949085236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2251221276819706, + "step": 488 + }, + { + "epoch": 0.04083333333333333, + "grad_norm": 5.53125, + "grad_norm_var": 0.08202718098958334, + "learning_rate": 4e-05, + "loss": 5.5261, + "loss/crossentropy": 2.0091424509882927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18690266273915768, + "step": 490 + }, + { + "epoch": 0.041, + "grad_norm": 4.9375, + "grad_norm_var": 0.11715087890625, + "learning_rate": 4e-05, + "loss": 5.1047, + "loss/crossentropy": 1.6703919917345047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19888342171907425, + "step": 492 + }, + { + "epoch": 0.041166666666666664, + "grad_norm": 5.625, + "grad_norm_var": 0.14237874348958332, + "learning_rate": 4e-05, + "loss": 5.674, + "loss/crossentropy": 2.249885469675064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21771755814552307, + "step": 494 + }, + { + "epoch": 0.04133333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.13059895833333332, + "learning_rate": 4e-05, + "loss": 5.2914, + "loss/crossentropy": 2.4916725754737854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22457458823919296, + "step": 496 + }, + { + "epoch": 0.0415, + "grad_norm": 5.03125, + "grad_norm_var": 0.14097900390625, + "learning_rate": 4e-05, + "loss": 4.8649, + "loss/crossentropy": 1.9688801318407059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109392024576664, + "step": 498 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 5.5625, + "grad_norm_var": 0.15689697265625, + "learning_rate": 4e-05, + "loss": 5.0342, + "loss/crossentropy": 2.300499051809311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113119661808014, + "step": 500 + }, + { + "epoch": 0.041833333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.200390625, + "learning_rate": 4e-05, + "loss": 5.4782, + "loss/crossentropy": 2.1189796030521393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2590511702001095, + "step": 502 + }, + { + "epoch": 0.042, + "grad_norm": 5.5625, + "grad_norm_var": 0.20634358723958332, + "learning_rate": 4e-05, + "loss": 5.5632, + "loss/crossentropy": 2.599315345287323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2375834807753563, + "step": 504 + }, + { + "epoch": 0.042166666666666665, + "grad_norm": 5.46875, + "grad_norm_var": 0.20716145833333333, + "learning_rate": 4e-05, + "loss": 5.3239, + "loss/crossentropy": 2.3774854242801666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033194676041603, + "step": 506 + }, + { + "epoch": 0.042333333333333334, + "grad_norm": 5.9375, + "grad_norm_var": 0.17721354166666667, + "learning_rate": 4e-05, + "loss": 5.4254, + "loss/crossentropy": 1.6486110389232635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900431402027607, + "step": 508 + }, + { + "epoch": 0.0425, + "grad_norm": 5.78125, + "grad_norm_var": 0.18352864583333334, + "learning_rate": 4e-05, + "loss": 4.6721, + "loss/crossentropy": 2.2632896304130554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22721751406788826, + "step": 510 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 4.96875, + "grad_norm_var": 0.20403238932291667, + "learning_rate": 4e-05, + "loss": 4.5325, + "loss/crossentropy": 1.248498149216175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.166516724973917, + "step": 512 + }, + { + "epoch": 0.042833333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.20436197916666668, + "learning_rate": 4e-05, + "loss": 4.6529, + "loss/crossentropy": 2.0403945446014404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20415923185646534, + "step": 514 + }, + { + "epoch": 0.043, + "grad_norm": 5.5, + "grad_norm_var": 0.18642171223958334, + "learning_rate": 4e-05, + "loss": 4.8357, + "loss/crossentropy": 1.902937613427639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22126169875264168, + "step": 516 + }, + { + "epoch": 0.043166666666666666, + "grad_norm": 6.1875, + "grad_norm_var": 0.16643473307291667, + "learning_rate": 4e-05, + "loss": 5.2757, + "loss/crossentropy": 2.0559470653533936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494493678212166, + "step": 518 + }, + { + "epoch": 0.043333333333333335, + "grad_norm": 5.5, + "grad_norm_var": 0.16109619140625, + "learning_rate": 4e-05, + "loss": 5.2423, + "loss/crossentropy": 2.4924589097499847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24120581522583961, + "step": 520 + }, + { + "epoch": 0.0435, + "grad_norm": 5.4375, + "grad_norm_var": 0.16646728515625, + "learning_rate": 4e-05, + "loss": 5.1833, + "loss/crossentropy": 1.681239552795887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20514622330665588, + "step": 522 + }, + { + "epoch": 0.043666666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.13847249348958332, + "learning_rate": 4e-05, + "loss": 5.142, + "loss/crossentropy": 2.186009407043457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21601705998182297, + "step": 524 + }, + { + "epoch": 0.043833333333333335, + "grad_norm": 5.5, + "grad_norm_var": 0.11901041666666666, + "learning_rate": 4e-05, + "loss": 5.941, + "loss/crossentropy": 2.4676918387413025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23410123214125633, + "step": 526 + }, + { + "epoch": 0.044, + "grad_norm": 5.84375, + "grad_norm_var": 0.13984375, + "learning_rate": 4e-05, + "loss": 5.0339, + "loss/crossentropy": 2.1201189160346985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19850488752126694, + "step": 528 + }, + { + "epoch": 0.04416666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.12274983723958334, + "learning_rate": 4e-05, + "loss": 5.2603, + "loss/crossentropy": 1.8619603216648102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19815063290297985, + "step": 530 + }, + { + "epoch": 0.044333333333333336, + "grad_norm": 5.46875, + "grad_norm_var": 0.11757405598958333, + "learning_rate": 4e-05, + "loss": 5.2116, + "loss/crossentropy": 2.5051605105400085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23532362654805183, + "step": 532 + }, + { + "epoch": 0.0445, + "grad_norm": 5.53125, + "grad_norm_var": 0.10702718098958333, + "learning_rate": 4e-05, + "loss": 5.0707, + "loss/crossentropy": 1.956557109951973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19309379532933235, + "step": 534 + }, + { + "epoch": 0.04466666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.111572265625, + "learning_rate": 4e-05, + "loss": 5.253, + "loss/crossentropy": 1.8768843710422516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20720825903117657, + "step": 536 + }, + { + "epoch": 0.044833333333333336, + "grad_norm": 5.09375, + "grad_norm_var": 0.11599934895833333, + "learning_rate": 4e-05, + "loss": 5.2099, + "loss/crossentropy": 2.205892413854599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255897894501686, + "step": 538 + }, + { + "epoch": 0.045, + "grad_norm": 5.0, + "grad_norm_var": 0.12721354166666668, + "learning_rate": 4e-05, + "loss": 4.6425, + "loss/crossentropy": 0.9755007773637772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13529013842344284, + "step": 540 + }, + { + "epoch": 0.04516666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.11718343098958334, + "learning_rate": 4e-05, + "loss": 5.0623, + "loss/crossentropy": 2.016813486814499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20038552209734917, + "step": 542 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 6.1875, + "grad_norm_var": 0.10767822265625, + "learning_rate": 4e-05, + "loss": 5.7204, + "loss/crossentropy": 2.6434133052825928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23127877712249756, + "step": 544 + }, + { + "epoch": 0.0455, + "grad_norm": 5.125, + "grad_norm_var": 0.11239827473958333, + "learning_rate": 4e-05, + "loss": 5.2036, + "loss/crossentropy": 1.9537419080734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113248005509377, + "step": 546 + }, + { + "epoch": 0.04566666666666667, + "grad_norm": 6.1875, + "grad_norm_var": 0.14345296223958334, + "learning_rate": 4e-05, + "loss": 5.5505, + "loss/crossentropy": 2.5237995982170105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211185209453106, + "step": 548 + }, + { + "epoch": 0.04583333333333333, + "grad_norm": 6.28125, + "grad_norm_var": 0.16265869140625, + "learning_rate": 4e-05, + "loss": 4.7336, + "loss/crossentropy": 1.9351204261183739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19682549498975277, + "step": 550 + }, + { + "epoch": 0.046, + "grad_norm": 5.15625, + "grad_norm_var": 0.16594645182291667, + "learning_rate": 4e-05, + "loss": 5.0406, + "loss/crossentropy": 2.4625622630119324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21886296197772026, + "step": 552 + }, + { + "epoch": 0.04616666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.16692301432291667, + "learning_rate": 4e-05, + "loss": 4.7676, + "loss/crossentropy": 1.4909594282507896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1696036048233509, + "step": 554 + }, + { + "epoch": 0.04633333333333333, + "grad_norm": 5.84375, + "grad_norm_var": 0.1544921875, + "learning_rate": 4e-05, + "loss": 5.1545, + "loss/crossentropy": 1.6347006186842918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19013791903853416, + "step": 556 + }, + { + "epoch": 0.0465, + "grad_norm": 5.96875, + "grad_norm_var": 0.16282145182291666, + "learning_rate": 4e-05, + "loss": 4.7549, + "loss/crossentropy": 2.16485732793808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2486794888973236, + "step": 558 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.17987874348958333, + "learning_rate": 4e-05, + "loss": 4.5371, + "loss/crossentropy": 2.0290512144565582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2468886598944664, + "step": 560 + }, + { + "epoch": 0.04683333333333333, + "grad_norm": 5.53125, + "grad_norm_var": 0.16985270182291667, + "learning_rate": 4e-05, + "loss": 5.4622, + "loss/crossentropy": 1.8024419024586678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2226398829370737, + "step": 562 + }, + { + "epoch": 0.047, + "grad_norm": 6.5625, + "grad_norm_var": 0.22628580729166667, + "learning_rate": 4e-05, + "loss": 5.7771, + "loss/crossentropy": 2.0841223895549774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2839187905192375, + "step": 564 + }, + { + "epoch": 0.04716666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.19843343098958333, + "learning_rate": 4e-05, + "loss": 5.2361, + "loss/crossentropy": 1.9585634768009186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198926955461502, + "step": 566 + }, + { + "epoch": 0.04733333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.20552978515625, + "learning_rate": 4e-05, + "loss": 5.2611, + "loss/crossentropy": 2.0971501171588898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288679540157318, + "step": 568 + }, + { + "epoch": 0.0475, + "grad_norm": 5.4375, + "grad_norm_var": 0.18889567057291667, + "learning_rate": 4e-05, + "loss": 5.1852, + "loss/crossentropy": 2.444584846496582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557501494884491, + "step": 570 + }, + { + "epoch": 0.04766666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.18349202473958334, + "learning_rate": 4e-05, + "loss": 5.0379, + "loss/crossentropy": 2.0450302958488464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205327320843935, + "step": 572 + }, + { + "epoch": 0.04783333333333333, + "grad_norm": 5.71875, + "grad_norm_var": 0.17245686848958333, + "learning_rate": 4e-05, + "loss": 5.8995, + "loss/crossentropy": 2.5409964323043823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22698857262730598, + "step": 574 + }, + { + "epoch": 0.048, + "grad_norm": 5.71875, + "grad_norm_var": 0.11966145833333333, + "learning_rate": 4e-05, + "loss": 5.0287, + "loss/crossentropy": 1.696646198630333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20716996863484383, + "step": 576 + }, + { + "epoch": 0.04816666666666667, + "grad_norm": 5.8125, + "grad_norm_var": 0.12636311848958334, + "learning_rate": 4e-05, + "loss": 5.3317, + "loss/crossentropy": 2.2942482829093933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25501084327697754, + "step": 578 + }, + { + "epoch": 0.04833333333333333, + "grad_norm": 5.59375, + "grad_norm_var": 0.05279541015625, + "learning_rate": 4e-05, + "loss": 5.248, + "loss/crossentropy": 2.4969963431358337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23663923889398575, + "step": 580 + }, + { + "epoch": 0.0485, + "grad_norm": 5.40625, + "grad_norm_var": 0.055322265625, + "learning_rate": 4e-05, + "loss": 5.0694, + "loss/crossentropy": 2.179100275039673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842677354812622, + "step": 582 + }, + { + "epoch": 0.048666666666666664, + "grad_norm": 5.71875, + "grad_norm_var": 0.05797119140625, + "learning_rate": 4e-05, + "loss": 5.0973, + "loss/crossentropy": 2.542878270149231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23901895433664322, + "step": 584 + }, + { + "epoch": 0.04883333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.104296875, + "learning_rate": 4e-05, + "loss": 4.4501, + "loss/crossentropy": 1.3374748602509499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15280469879508018, + "step": 586 + }, + { + "epoch": 0.049, + "grad_norm": 5.09375, + "grad_norm_var": 0.11021728515625, + "learning_rate": 4e-05, + "loss": 4.2479, + "loss/crossentropy": 2.1031662821769714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21286172419786453, + "step": 588 + }, + { + "epoch": 0.049166666666666664, + "grad_norm": 5.75, + "grad_norm_var": 0.11578369140625, + "learning_rate": 4e-05, + "loss": 4.4553, + "loss/crossentropy": 2.226746082305908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2602303549647331, + "step": 590 + }, + { + "epoch": 0.04933333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.10282796223958333, + "learning_rate": 4e-05, + "loss": 5.1448, + "loss/crossentropy": 1.9679524078965187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005962673574686, + "step": 592 + }, + { + "epoch": 0.0495, + "grad_norm": 5.125, + "grad_norm_var": 0.08931884765625, + "learning_rate": 4e-05, + "loss": 4.7614, + "loss/crossentropy": 1.159188948571682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14164014346897602, + "step": 594 + }, + { + "epoch": 0.049666666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.09078369140625, + "learning_rate": 4e-05, + "loss": 5.0281, + "loss/crossentropy": 1.8265404999256134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891588643193245, + "step": 596 + }, + { + "epoch": 0.049833333333333334, + "grad_norm": 5.4375, + "grad_norm_var": 0.12486979166666666, + "learning_rate": 4e-05, + "loss": 5.2755, + "loss/crossentropy": 2.3386247754096985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26255329325795174, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 5.09375, + "grad_norm_var": 0.11562093098958333, + "learning_rate": 4e-05, + "loss": 5.0808, + "loss/crossentropy": 1.9218714386224747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169225849211216, + "step": 600 + }, + { + "epoch": 0.050166666666666665, + "grad_norm": 5.21875, + "grad_norm_var": 0.11552327473958333, + "learning_rate": 4e-05, + "loss": 5.2712, + "loss/crossentropy": 1.5146638751029968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18410832434892654, + "step": 602 + }, + { + "epoch": 0.050333333333333334, + "grad_norm": 5.59375, + "grad_norm_var": 0.12724202473958332, + "learning_rate": 4e-05, + "loss": 4.7125, + "loss/crossentropy": 2.293552666902542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22562003880739212, + "step": 604 + }, + { + "epoch": 0.0505, + "grad_norm": 5.34375, + "grad_norm_var": 0.11326497395833333, + "learning_rate": 4e-05, + "loss": 5.1169, + "loss/crossentropy": 1.7014083191752434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18919607624411583, + "step": 606 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 5.5, + "grad_norm_var": 0.11340738932291666, + "learning_rate": 4e-05, + "loss": 5.322, + "loss/crossentropy": 1.4259334281086922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16573997400701046, + "step": 608 + }, + { + "epoch": 0.050833333333333335, + "grad_norm": 6.09375, + "grad_norm_var": 0.14000244140625, + "learning_rate": 4e-05, + "loss": 4.7041, + "loss/crossentropy": 1.3645060807466507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446519620716572, + "step": 610 + }, + { + "epoch": 0.051, + "grad_norm": 5.84375, + "grad_norm_var": 0.11724853515625, + "learning_rate": 4e-05, + "loss": 5.0354, + "loss/crossentropy": 1.4357607513666153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1666635200381279, + "step": 612 + }, + { + "epoch": 0.051166666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.09254150390625, + "learning_rate": 4e-05, + "loss": 4.8718, + "loss/crossentropy": 2.40578031539917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22677217051386833, + "step": 614 + }, + { + "epoch": 0.051333333333333335, + "grad_norm": 5.34375, + "grad_norm_var": 0.06404622395833333, + "learning_rate": 4e-05, + "loss": 5.0725, + "loss/crossentropy": 1.8017898797988892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18605670891702175, + "step": 616 + }, + { + "epoch": 0.0515, + "grad_norm": 6.8125, + "grad_norm_var": 0.16373697916666666, + "learning_rate": 4e-05, + "loss": 5.0028, + "loss/crossentropy": 1.283976010978222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15850860998034477, + "step": 618 + }, + { + "epoch": 0.051666666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.21100260416666666, + "learning_rate": 4e-05, + "loss": 4.1235, + "loss/crossentropy": 1.4461367800831795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16198919713497162, + "step": 620 + }, + { + "epoch": 0.051833333333333335, + "grad_norm": 5.25, + "grad_norm_var": 0.222509765625, + "learning_rate": 4e-05, + "loss": 5.0553, + "loss/crossentropy": 2.7310924530029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21842283383011818, + "step": 622 + }, + { + "epoch": 0.052, + "grad_norm": 5.34375, + "grad_norm_var": 0.23632405598958334, + "learning_rate": 4e-05, + "loss": 4.602, + "loss/crossentropy": 2.3664903938770294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23367810249328613, + "step": 624 + }, + { + "epoch": 0.05216666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.20592447916666667, + "learning_rate": 4e-05, + "loss": 4.9899, + "loss/crossentropy": 1.2108296155929565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14673249796032906, + "step": 626 + }, + { + "epoch": 0.052333333333333336, + "grad_norm": 5.03125, + "grad_norm_var": 0.19685872395833334, + "learning_rate": 4e-05, + "loss": 4.4967, + "loss/crossentropy": 2.283060073852539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23095425590872765, + "step": 628 + }, + { + "epoch": 0.0525, + "grad_norm": 5.28125, + "grad_norm_var": 0.19582926432291667, + "learning_rate": 4e-05, + "loss": 5.1891, + "loss/crossentropy": 2.3244327008724213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680969208478928, + "step": 630 + }, + { + "epoch": 0.05266666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.19827067057291667, + "learning_rate": 4e-05, + "loss": 4.7258, + "loss/crossentropy": 1.5194010734558105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2393566593527794, + "step": 632 + }, + { + "epoch": 0.052833333333333336, + "grad_norm": 5.53125, + "grad_norm_var": 0.04147135416666667, + "learning_rate": 4e-05, + "loss": 5.3842, + "loss/crossentropy": 2.275718003511429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21031928807497025, + "step": 634 + }, + { + "epoch": 0.053, + "grad_norm": 5.375, + "grad_norm_var": 0.04000244140625, + "learning_rate": 4e-05, + "loss": 4.8783, + "loss/crossentropy": 2.351560056209564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495262697339058, + "step": 636 + }, + { + "epoch": 0.05316666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.0392578125, + "learning_rate": 4e-05, + "loss": 4.9231, + "loss/crossentropy": 2.415817618370056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23617206886410713, + "step": 638 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 5.53125, + "grad_norm_var": 0.041259765625, + "learning_rate": 4e-05, + "loss": 5.1447, + "loss/crossentropy": 2.2912066876888275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22658982872962952, + "step": 640 + }, + { + "epoch": 0.0535, + "grad_norm": 4.8125, + "grad_norm_var": 0.060530598958333334, + "learning_rate": 4e-05, + "loss": 4.1259, + "loss/crossentropy": 1.3093429505825043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1687719877809286, + "step": 642 + }, + { + "epoch": 0.05366666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.06966145833333333, + "learning_rate": 4e-05, + "loss": 4.796, + "loss/crossentropy": 1.8905025273561478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039121687412262, + "step": 644 + }, + { + "epoch": 0.05383333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08878580729166667, + "learning_rate": 4e-05, + "loss": 4.0684, + "loss/crossentropy": 1.2859214022755623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1498262993991375, + "step": 646 + }, + { + "epoch": 0.054, + "grad_norm": 5.34375, + "grad_norm_var": 0.08448893229166667, + "learning_rate": 4e-05, + "loss": 5.1231, + "loss/crossentropy": 1.3642852455377579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15733711794018745, + "step": 648 + }, + { + "epoch": 0.05416666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.07732747395833334, + "learning_rate": 4e-05, + "loss": 5.221, + "loss/crossentropy": 1.8772388100624084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18816200830042362, + "step": 650 + }, + { + "epoch": 0.05433333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.08318684895833334, + "learning_rate": 4e-05, + "loss": 4.9661, + "loss/crossentropy": 2.054552912712097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23715369030833244, + "step": 652 + }, + { + "epoch": 0.0545, + "grad_norm": 5.28125, + "grad_norm_var": 0.08352457682291667, + "learning_rate": 4e-05, + "loss": 5.0736, + "loss/crossentropy": 2.033175766468048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20897972956299782, + "step": 654 + }, + { + "epoch": 0.05466666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.07459309895833334, + "learning_rate": 4e-05, + "loss": 5.2757, + "loss/crossentropy": 1.4109216630458832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16402552276849747, + "step": 656 + }, + { + "epoch": 0.05483333333333333, + "grad_norm": 5.53125, + "grad_norm_var": 0.063525390625, + "learning_rate": 4e-05, + "loss": 4.4992, + "loss/crossentropy": 1.7298256531357765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20376906543970108, + "step": 658 + }, + { + "epoch": 0.055, + "grad_norm": 5.6875, + "grad_norm_var": 0.07955322265625, + "learning_rate": 4e-05, + "loss": 5.2287, + "loss/crossentropy": 2.059411734342575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22227568551898003, + "step": 660 + }, + { + "epoch": 0.05516666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.05319010416666667, + "learning_rate": 4e-05, + "loss": 5.1509, + "loss/crossentropy": 2.0033040791749954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19995213858783245, + "step": 662 + }, + { + "epoch": 0.05533333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.059794108072916664, + "learning_rate": 4e-05, + "loss": 5.3328, + "loss/crossentropy": 2.360860764980316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22881463915109634, + "step": 664 + }, + { + "epoch": 0.0555, + "grad_norm": 4.9375, + "grad_norm_var": 0.07548421223958333, + "learning_rate": 4e-05, + "loss": 4.7419, + "loss/crossentropy": 1.968344509601593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20371564105153084, + "step": 666 + }, + { + "epoch": 0.05566666666666667, + "grad_norm": 5.5625, + "grad_norm_var": 0.07323811848958334, + "learning_rate": 4e-05, + "loss": 4.8958, + "loss/crossentropy": 2.8875539898872375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2515605129301548, + "step": 668 + }, + { + "epoch": 0.05583333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.15545247395833334, + "learning_rate": 4e-05, + "loss": 5.5544, + "loss/crossentropy": 1.9591965079307556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150409147143364, + "step": 670 + }, + { + "epoch": 0.056, + "grad_norm": 5.65625, + "grad_norm_var": 0.1416015625, + "learning_rate": 4e-05, + "loss": 5.1414, + "loss/crossentropy": 1.8480764627456665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19637847691774368, + "step": 672 + }, + { + "epoch": 0.05616666666666666, + "grad_norm": 5.46875, + "grad_norm_var": 0.1478515625, + "learning_rate": 4e-05, + "loss": 5.4557, + "loss/crossentropy": 1.8511382415890694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494801573455334, + "step": 674 + }, + { + "epoch": 0.05633333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.14837239583333334, + "learning_rate": 4e-05, + "loss": 5.1891, + "loss/crossentropy": 2.2563489973545074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20653247460722923, + "step": 676 + }, + { + "epoch": 0.0565, + "grad_norm": 4.75, + "grad_norm_var": 0.20735270182291668, + "learning_rate": 4e-05, + "loss": 5.0675, + "loss/crossentropy": 2.1194111332297325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18754742667078972, + "step": 678 + }, + { + "epoch": 0.056666666666666664, + "grad_norm": 5.03125, + "grad_norm_var": 0.20435791015625, + "learning_rate": 4e-05, + "loss": 4.8841, + "loss/crossentropy": 2.106343001127243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169276624917984, + "step": 680 + }, + { + "epoch": 0.05683333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.1974609375, + "learning_rate": 4e-05, + "loss": 4.692, + "loss/crossentropy": 2.522699236869812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24442313238978386, + "step": 682 + }, + { + "epoch": 0.057, + "grad_norm": 5.0625, + "grad_norm_var": 0.22665608723958333, + "learning_rate": 4e-05, + "loss": 4.0882, + "loss/crossentropy": 2.3042386770248413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26410669833421707, + "step": 684 + }, + { + "epoch": 0.057166666666666664, + "grad_norm": 5.34375, + "grad_norm_var": 0.13101806640625, + "learning_rate": 4e-05, + "loss": 4.6953, + "loss/crossentropy": 2.2301080226898193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25920237600803375, + "step": 686 + }, + { + "epoch": 0.05733333333333333, + "grad_norm": 5.6875, + "grad_norm_var": 0.14117431640625, + "learning_rate": 4e-05, + "loss": 4.9626, + "loss/crossentropy": 1.1905392110347748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1609304826706648, + "step": 688 + }, + { + "epoch": 0.0575, + "grad_norm": 5.0625, + "grad_norm_var": 0.14351806640625, + "learning_rate": 4e-05, + "loss": 4.9173, + "loss/crossentropy": 2.519528329372406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300243265926838, + "step": 690 + }, + { + "epoch": 0.057666666666666665, + "grad_norm": 5.25, + "grad_norm_var": 0.145947265625, + "learning_rate": 4e-05, + "loss": 4.8009, + "loss/crossentropy": 2.0772966742515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19900832697749138, + "step": 692 + }, + { + "epoch": 0.057833333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.08697509765625, + "learning_rate": 4e-05, + "loss": 5.1798, + "loss/crossentropy": 1.7827163264155388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675160013139248, + "step": 694 + }, + { + "epoch": 0.058, + "grad_norm": 5.1875, + "grad_norm_var": 0.08430989583333333, + "learning_rate": 4e-05, + "loss": 4.8942, + "loss/crossentropy": 1.373624011874199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1578064877539873, + "step": 696 + }, + { + "epoch": 0.058166666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.0779296875, + "learning_rate": 4e-05, + "loss": 5.0116, + "loss/crossentropy": 2.012943536043167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599104836583138, + "step": 698 + }, + { + "epoch": 0.058333333333333334, + "grad_norm": 5.5625, + "grad_norm_var": 0.051025390625, + "learning_rate": 4e-05, + "loss": 5.2032, + "loss/crossentropy": 2.2552223205566406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2347894161939621, + "step": 700 + }, + { + "epoch": 0.0585, + "grad_norm": 5.1875, + "grad_norm_var": 0.05080973307291667, + "learning_rate": 4e-05, + "loss": 4.7641, + "loss/crossentropy": 1.840671882033348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20984739437699318, + "step": 702 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 5.5, + "grad_norm_var": 0.04462483723958333, + "learning_rate": 4e-05, + "loss": 4.653, + "loss/crossentropy": 1.0027276128530502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13207154348492622, + "step": 704 + }, + { + "epoch": 0.058833333333333335, + "grad_norm": 5.84375, + "grad_norm_var": 0.06643473307291667, + "learning_rate": 4e-05, + "loss": 5.1619, + "loss/crossentropy": 2.2675763964653015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21700335666537285, + "step": 706 + }, + { + "epoch": 0.059, + "grad_norm": 4.9375, + "grad_norm_var": 0.07395426432291667, + "learning_rate": 4e-05, + "loss": 5.1794, + "loss/crossentropy": 2.1361162662506104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916163712739944, + "step": 708 + }, + { + "epoch": 0.059166666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.08739827473958334, + "learning_rate": 4e-05, + "loss": 4.8597, + "loss/crossentropy": 1.6593957543373108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18479419499635696, + "step": 710 + }, + { + "epoch": 0.059333333333333335, + "grad_norm": 4.8125, + "grad_norm_var": 0.10388997395833334, + "learning_rate": 4e-05, + "loss": 4.3248, + "loss/crossentropy": 1.333509661257267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22560935281217098, + "step": 712 + }, + { + "epoch": 0.0595, + "grad_norm": 5.125, + "grad_norm_var": 0.10705973307291666, + "learning_rate": 4e-05, + "loss": 4.9656, + "loss/crossentropy": 2.230701059103012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20880433917045593, + "step": 714 + }, + { + "epoch": 0.059666666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.09498291015625, + "learning_rate": 4e-05, + "loss": 5.2067, + "loss/crossentropy": 2.4413784742355347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24665574729442596, + "step": 716 + }, + { + "epoch": 0.059833333333333336, + "grad_norm": 5.71875, + "grad_norm_var": 0.111181640625, + "learning_rate": 4e-05, + "loss": 4.8928, + "loss/crossentropy": 2.3639025390148163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20420604944229126, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 5.625, + "grad_norm_var": 0.10507405598958333, + "learning_rate": 4e-05, + "loss": 4.6914, + "loss/crossentropy": 2.329402983188629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23727432638406754, + "step": 720 + }, + { + "epoch": 0.06016666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.08019205729166666, + "learning_rate": 4e-05, + "loss": 5.5923, + "loss/crossentropy": 2.584353506565094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22987722977995872, + "step": 722 + }, + { + "epoch": 0.060333333333333336, + "grad_norm": 5.375, + "grad_norm_var": 0.06521809895833333, + "learning_rate": 4e-05, + "loss": 4.9543, + "loss/crossentropy": 1.8400915935635567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17739013954997063, + "step": 724 + }, + { + "epoch": 0.0605, + "grad_norm": 5.375, + "grad_norm_var": 0.04706624348958333, + "learning_rate": 4e-05, + "loss": 5.5607, + "loss/crossentropy": 2.403924733400345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308141142129898, + "step": 726 + }, + { + "epoch": 0.06066666666666667, + "grad_norm": 5.5625, + "grad_norm_var": 0.029488118489583333, + "learning_rate": 4e-05, + "loss": 5.0417, + "loss/crossentropy": 1.4918632730841637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17629101499915123, + "step": 728 + }, + { + "epoch": 0.060833333333333336, + "grad_norm": 5.8125, + "grad_norm_var": 0.038916015625, + "learning_rate": 4e-05, + "loss": 5.2212, + "loss/crossentropy": 2.5051349997520447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24874291196465492, + "step": 730 + }, + { + "epoch": 0.061, + "grad_norm": 5.15625, + "grad_norm_var": 0.03756510416666667, + "learning_rate": 4e-05, + "loss": 4.7708, + "loss/crossentropy": 1.9273648858070374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22050853073596954, + "step": 732 + }, + { + "epoch": 0.06116666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.046223958333333336, + "learning_rate": 4e-05, + "loss": 4.8641, + "loss/crossentropy": 2.55005943775177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2286563366651535, + "step": 734 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.06383056640625, + "learning_rate": 4e-05, + "loss": 4.474, + "loss/crossentropy": 0.9446901753544807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12489008717238903, + "step": 736 + }, + { + "epoch": 0.0615, + "grad_norm": 4.90625, + "grad_norm_var": 0.08017171223958333, + "learning_rate": 4e-05, + "loss": 5.0295, + "loss/crossentropy": 2.087219849228859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19661586359143257, + "step": 738 + }, + { + "epoch": 0.06166666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.08253580729166667, + "learning_rate": 4e-05, + "loss": 5.4304, + "loss/crossentropy": 2.1698725819587708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202838696539402, + "step": 740 + }, + { + "epoch": 0.06183333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.08683268229166667, + "learning_rate": 4e-05, + "loss": 4.4203, + "loss/crossentropy": 1.530371643602848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16617339104413986, + "step": 742 + }, + { + "epoch": 0.062, + "grad_norm": 5.0, + "grad_norm_var": 0.08704020182291666, + "learning_rate": 4e-05, + "loss": 4.9379, + "loss/crossentropy": 2.3629955649375916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189258188009262, + "step": 744 + }, + { + "epoch": 0.06216666666666667, + "grad_norm": 5.9375, + "grad_norm_var": 0.10247395833333334, + "learning_rate": 4e-05, + "loss": 4.9963, + "loss/crossentropy": 2.0986749082803726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2822576127946377, + "step": 746 + }, + { + "epoch": 0.06233333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.11256510416666667, + "learning_rate": 4e-05, + "loss": 4.6594, + "loss/crossentropy": 2.3157600462436676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23773421347141266, + "step": 748 + }, + { + "epoch": 0.0625, + "grad_norm": 4.78125, + "grad_norm_var": 0.10819905598958333, + "learning_rate": 4e-05, + "loss": 4.7568, + "loss/crossentropy": 1.8990642204880714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18217475526034832, + "step": 750 + }, + { + "epoch": 0.06266666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.14478759765625, + "learning_rate": 4e-05, + "loss": 4.5526, + "loss/crossentropy": 2.3672779500484467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21740539371967316, + "step": 752 + }, + { + "epoch": 0.06283333333333334, + "grad_norm": 5.53125, + "grad_norm_var": 0.136181640625, + "learning_rate": 4e-05, + "loss": 5.4863, + "loss/crossentropy": 2.2541432678699493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22684243321418762, + "step": 754 + }, + { + "epoch": 0.063, + "grad_norm": 5.59375, + "grad_norm_var": 0.14283447265625, + "learning_rate": 4e-05, + "loss": 4.582, + "loss/crossentropy": 1.5108322128653526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16953840106725693, + "step": 756 + }, + { + "epoch": 0.06316666666666666, + "grad_norm": 6.21875, + "grad_norm_var": 0.19959309895833333, + "learning_rate": 4e-05, + "loss": 5.1294, + "loss/crossentropy": 1.2453164830803871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14559439942240715, + "step": 758 + }, + { + "epoch": 0.06333333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.19706624348958332, + "learning_rate": 4e-05, + "loss": 4.524, + "loss/crossentropy": 2.1696812510490417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20982523635029793, + "step": 760 + }, + { + "epoch": 0.0635, + "grad_norm": 4.9375, + "grad_norm_var": 0.172509765625, + "learning_rate": 4e-05, + "loss": 5.2126, + "loss/crossentropy": 2.0176029577851295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048779744654894, + "step": 762 + }, + { + "epoch": 0.06366666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.17336832682291667, + "learning_rate": 4e-05, + "loss": 5.1471, + "loss/crossentropy": 2.5985326170921326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24666956812143326, + "step": 764 + }, + { + "epoch": 0.06383333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.15237223307291667, + "learning_rate": 4e-05, + "loss": 4.9156, + "loss/crossentropy": 1.7296672835946083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16684475913643837, + "step": 766 + }, + { + "epoch": 0.064, + "grad_norm": 5.0625, + "grad_norm_var": 0.11751302083333333, + "learning_rate": 4e-05, + "loss": 5.1446, + "loss/crossentropy": 1.9281784817576408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19502447918057442, + "step": 768 + }, + { + "epoch": 0.06416666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.11482747395833333, + "learning_rate": 4e-05, + "loss": 4.8494, + "loss/crossentropy": 1.8453112244606018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19226614944636822, + "step": 770 + }, + { + "epoch": 0.06433333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.10813395182291667, + "learning_rate": 4e-05, + "loss": 4.4943, + "loss/crossentropy": 1.4456355720758438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1608578786253929, + "step": 772 + }, + { + "epoch": 0.0645, + "grad_norm": 5.125, + "grad_norm_var": 0.030171712239583332, + "learning_rate": 4e-05, + "loss": 4.7407, + "loss/crossentropy": 1.93793186545372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842864453792572, + "step": 774 + }, + { + "epoch": 0.06466666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.02496337890625, + "learning_rate": 4e-05, + "loss": 5.1067, + "loss/crossentropy": 2.04061222076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23408568277955055, + "step": 776 + }, + { + "epoch": 0.06483333333333334, + "grad_norm": 5.65625, + "grad_norm_var": 0.04267171223958333, + "learning_rate": 4e-05, + "loss": 5.3864, + "loss/crossentropy": 2.418355941772461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2686479911208153, + "step": 778 + }, + { + "epoch": 0.065, + "grad_norm": 5.8125, + "grad_norm_var": 0.05896809895833333, + "learning_rate": 4e-05, + "loss": 5.8333, + "loss/crossentropy": 1.8340658321976662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828531064093113, + "step": 780 + }, + { + "epoch": 0.06516666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.06099853515625, + "learning_rate": 4e-05, + "loss": 5.004, + "loss/crossentropy": 1.4804940819740295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16691730543971062, + "step": 782 + }, + { + "epoch": 0.06533333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.07131754557291667, + "learning_rate": 4e-05, + "loss": 5.1147, + "loss/crossentropy": 2.2358897924423218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22555414587259293, + "step": 784 + }, + { + "epoch": 0.0655, + "grad_norm": 5.15625, + "grad_norm_var": 0.06549479166666666, + "learning_rate": 4e-05, + "loss": 5.1507, + "loss/crossentropy": 2.0352462232112885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000010460615158, + "step": 786 + }, + { + "epoch": 0.06566666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.06360677083333334, + "learning_rate": 4e-05, + "loss": 5.1654, + "loss/crossentropy": 2.098189502954483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18786033987998962, + "step": 788 + }, + { + "epoch": 0.06583333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.09657796223958333, + "learning_rate": 4e-05, + "loss": 4.6436, + "loss/crossentropy": 2.23826864361763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015607375651598, + "step": 790 + }, + { + "epoch": 0.066, + "grad_norm": 5.71875, + "grad_norm_var": 0.1150390625, + "learning_rate": 4e-05, + "loss": 5.1651, + "loss/crossentropy": 1.690386563539505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726808026432991, + "step": 792 + }, + { + "epoch": 0.06616666666666667, + "grad_norm": 5.75, + "grad_norm_var": 0.12457275390625, + "learning_rate": 4e-05, + "loss": 5.9104, + "loss/crossentropy": 2.5857779383659363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23685766011476517, + "step": 794 + }, + { + "epoch": 0.06633333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.10857747395833334, + "learning_rate": 4e-05, + "loss": 5.2371, + "loss/crossentropy": 1.768482819199562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18570737540721893, + "step": 796 + }, + { + "epoch": 0.0665, + "grad_norm": 5.46875, + "grad_norm_var": 0.10833333333333334, + "learning_rate": 4e-05, + "loss": 5.4012, + "loss/crossentropy": 1.9267660677433014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040906585752964, + "step": 798 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 5.8125, + "grad_norm_var": 0.1189453125, + "learning_rate": 4e-05, + "loss": 4.9879, + "loss/crossentropy": 1.2167518213391304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16967863216996193, + "step": 800 + }, + { + "epoch": 0.06683333333333333, + "grad_norm": 6.59375, + "grad_norm_var": 0.22284749348958333, + "learning_rate": 4e-05, + "loss": 5.419, + "loss/crossentropy": 2.580377459526062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2212601639330387, + "step": 802 + }, + { + "epoch": 0.067, + "grad_norm": 5.28125, + "grad_norm_var": 0.21803385416666668, + "learning_rate": 4e-05, + "loss": 5.1168, + "loss/crossentropy": 1.6435775309801102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17711565271019936, + "step": 804 + }, + { + "epoch": 0.06716666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.177197265625, + "learning_rate": 4e-05, + "loss": 4.8107, + "loss/crossentropy": 1.8337387293577194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18238316662609577, + "step": 806 + }, + { + "epoch": 0.06733333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.16861979166666666, + "learning_rate": 4e-05, + "loss": 4.6552, + "loss/crossentropy": 0.9152474626898766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1301976777613163, + "step": 808 + }, + { + "epoch": 0.0675, + "grad_norm": 5.25, + "grad_norm_var": 0.15790608723958333, + "learning_rate": 4e-05, + "loss": 5.2066, + "loss/crossentropy": 2.6412184834480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262093760073185, + "step": 810 + }, + { + "epoch": 0.06766666666666667, + "grad_norm": 5.625, + "grad_norm_var": 0.16119384765625, + "learning_rate": 4e-05, + "loss": 5.1448, + "loss/crossentropy": 2.0506534948945045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1795985009521246, + "step": 812 + }, + { + "epoch": 0.06783333333333333, + "grad_norm": 5.78125, + "grad_norm_var": 0.17060139973958333, + "learning_rate": 4e-05, + "loss": 5.3269, + "loss/crossentropy": 1.978536695241928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082153595983982, + "step": 814 + }, + { + "epoch": 0.068, + "grad_norm": 5.125, + "grad_norm_var": 0.18036702473958333, + "learning_rate": 4e-05, + "loss": 4.9719, + "loss/crossentropy": 2.047212928533554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22651727497577667, + "step": 816 + }, + { + "epoch": 0.06816666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.07636311848958334, + "learning_rate": 4e-05, + "loss": 4.6999, + "loss/crossentropy": 2.19197478890419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21344346180558205, + "step": 818 + }, + { + "epoch": 0.06833333333333333, + "grad_norm": 5.6875, + "grad_norm_var": 0.08183186848958333, + "learning_rate": 4e-05, + "loss": 4.8976, + "loss/crossentropy": 1.737916611135006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20098767057061195, + "step": 820 + }, + { + "epoch": 0.0685, + "grad_norm": 5.3125, + "grad_norm_var": 0.08097330729166667, + "learning_rate": 4e-05, + "loss": 4.8823, + "loss/crossentropy": 2.3596703112125397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2249809466302395, + "step": 822 + }, + { + "epoch": 0.06866666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.07472330729166667, + "learning_rate": 4e-05, + "loss": 4.9886, + "loss/crossentropy": 1.005987472832203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1333060059696436, + "step": 824 + }, + { + "epoch": 0.06883333333333333, + "grad_norm": 5.75, + "grad_norm_var": 0.11842447916666667, + "learning_rate": 4e-05, + "loss": 4.2789, + "loss/crossentropy": 1.277719035744667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.184982817620039, + "step": 826 + }, + { + "epoch": 0.069, + "grad_norm": 5.46875, + "grad_norm_var": 0.12177327473958334, + "learning_rate": 4e-05, + "loss": 5.2915, + "loss/crossentropy": 2.51472669839859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24901026487350464, + "step": 828 + }, + { + "epoch": 0.06916666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.12773030598958332, + "learning_rate": 4e-05, + "loss": 4.5775, + "loss/crossentropy": 1.783848948776722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1819527931511402, + "step": 830 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.08843994140625, + "learning_rate": 4e-05, + "loss": 4.9177, + "loss/crossentropy": 1.785573087632656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884115468710661, + "step": 832 + }, + { + "epoch": 0.0695, + "grad_norm": 5.59375, + "grad_norm_var": 0.10284830729166666, + "learning_rate": 4e-05, + "loss": 5.365, + "loss/crossentropy": 2.576684892177582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24840709939599037, + "step": 834 + }, + { + "epoch": 0.06966666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.10690104166666667, + "learning_rate": 4e-05, + "loss": 4.9888, + "loss/crossentropy": 1.324866883456707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16928620636463165, + "step": 836 + }, + { + "epoch": 0.06983333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.10640869140625, + "learning_rate": 4e-05, + "loss": 4.7214, + "loss/crossentropy": 1.2735597863793373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14277022145688534, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 5.65625, + "grad_norm_var": 0.10634358723958333, + "learning_rate": 4e-05, + "loss": 5.2884, + "loss/crossentropy": 2.3253634870052338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503334864974022, + "step": 840 + }, + { + "epoch": 0.07016666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.06145833333333333, + "learning_rate": 4e-05, + "loss": 5.0755, + "loss/crossentropy": 2.1621678471565247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23412977531552315, + "step": 842 + }, + { + "epoch": 0.07033333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.05701497395833333, + "learning_rate": 4e-05, + "loss": 4.9175, + "loss/crossentropy": 1.3182961717247963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1545308232307434, + "step": 844 + }, + { + "epoch": 0.0705, + "grad_norm": 5.5625, + "grad_norm_var": 0.04241129557291667, + "learning_rate": 4e-05, + "loss": 4.7885, + "loss/crossentropy": 1.8972595036029816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1849633026868105, + "step": 846 + }, + { + "epoch": 0.07066666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.05201416015625, + "learning_rate": 4e-05, + "loss": 5.4305, + "loss/crossentropy": 1.8853968381881714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18923737294971943, + "step": 848 + }, + { + "epoch": 0.07083333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.03893229166666667, + "learning_rate": 4e-05, + "loss": 4.8156, + "loss/crossentropy": 2.07659313082695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22809230163693428, + "step": 850 + }, + { + "epoch": 0.071, + "grad_norm": 5.8125, + "grad_norm_var": 0.042252604166666666, + "learning_rate": 4e-05, + "loss": 4.8969, + "loss/crossentropy": 1.9546649530529976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20475487783551216, + "step": 852 + }, + { + "epoch": 0.07116666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.04049479166666667, + "learning_rate": 4e-05, + "loss": 4.9251, + "loss/crossentropy": 1.9904922246932983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2668594792485237, + "step": 854 + }, + { + "epoch": 0.07133333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.040755208333333334, + "learning_rate": 4e-05, + "loss": 4.869, + "loss/crossentropy": 1.5018320679664612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16269180364906788, + "step": 856 + }, + { + "epoch": 0.0715, + "grad_norm": 5.40625, + "grad_norm_var": 0.04816080729166667, + "learning_rate": 4e-05, + "loss": 5.0008, + "loss/crossentropy": 1.5820802375674248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965535432100296, + "step": 858 + }, + { + "epoch": 0.07166666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.049544270833333334, + "learning_rate": 4e-05, + "loss": 5.2759, + "loss/crossentropy": 2.0585487335920334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19966792315244675, + "step": 860 + }, + { + "epoch": 0.07183333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.10250244140625, + "learning_rate": 4e-05, + "loss": 5.383, + "loss/crossentropy": 1.9323284551501274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18302668258547783, + "step": 862 + }, + { + "epoch": 0.072, + "grad_norm": 5.28125, + "grad_norm_var": 0.09348958333333333, + "learning_rate": 4e-05, + "loss": 5.2908, + "loss/crossentropy": 2.5324109196662903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370966337621212, + "step": 864 + }, + { + "epoch": 0.07216666666666667, + "grad_norm": 5.75, + "grad_norm_var": 0.09452718098958333, + "learning_rate": 4e-05, + "loss": 4.9592, + "loss/crossentropy": 1.6841916590929031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897643618285656, + "step": 866 + }, + { + "epoch": 0.07233333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.09479166666666666, + "learning_rate": 4e-05, + "loss": 5.0178, + "loss/crossentropy": 2.266584038734436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22946883738040924, + "step": 868 + }, + { + "epoch": 0.0725, + "grad_norm": 5.65625, + "grad_norm_var": 0.10764567057291667, + "learning_rate": 4e-05, + "loss": 5.0285, + "loss/crossentropy": 2.6628386974334717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22233451902866364, + "step": 870 + }, + { + "epoch": 0.07266666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.15286051432291667, + "learning_rate": 4e-05, + "loss": 5.2623, + "loss/crossentropy": 2.4069382548332214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2290056124329567, + "step": 872 + }, + { + "epoch": 0.07283333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.14179280598958333, + "learning_rate": 4e-05, + "loss": 5.2452, + "loss/crossentropy": 1.9541796445846558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18380443193018436, + "step": 874 + }, + { + "epoch": 0.073, + "grad_norm": 5.28125, + "grad_norm_var": 0.139306640625, + "learning_rate": 4e-05, + "loss": 5.198, + "loss/crossentropy": 1.4988721013069153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17175763100385666, + "step": 876 + }, + { + "epoch": 0.07316666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.10328369140625, + "learning_rate": 4e-05, + "loss": 4.9121, + "loss/crossentropy": 2.0601812303066254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19267475232481956, + "step": 878 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.13046875, + "learning_rate": 4e-05, + "loss": 4.3653, + "loss/crossentropy": 1.9160602986812592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2581901587545872, + "step": 880 + }, + { + "epoch": 0.0735, + "grad_norm": 5.59375, + "grad_norm_var": 0.14569905598958333, + "learning_rate": 4e-05, + "loss": 4.7792, + "loss/crossentropy": 2.4729442596435547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2501170076429844, + "step": 882 + }, + { + "epoch": 0.07366666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.14511311848958333, + "learning_rate": 4e-05, + "loss": 5.2312, + "loss/crossentropy": 2.5770280361175537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22895906120538712, + "step": 884 + }, + { + "epoch": 0.07383333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.15028889973958334, + "learning_rate": 4e-05, + "loss": 4.6341, + "loss/crossentropy": 1.9654364585876465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19148295372724533, + "step": 886 + }, + { + "epoch": 0.074, + "grad_norm": 5.1875, + "grad_norm_var": 0.09972330729166666, + "learning_rate": 4e-05, + "loss": 4.9945, + "loss/crossentropy": 2.1374219059944153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22470850497484207, + "step": 888 + }, + { + "epoch": 0.07416666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.09351806640625, + "learning_rate": 4e-05, + "loss": 5.1861, + "loss/crossentropy": 2.381446748971939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19887309148907661, + "step": 890 + }, + { + "epoch": 0.07433333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.09345296223958334, + "learning_rate": 4e-05, + "loss": 4.8414, + "loss/crossentropy": 1.1452403292059898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17032541893422604, + "step": 892 + }, + { + "epoch": 0.0745, + "grad_norm": 5.78125, + "grad_norm_var": 0.17688802083333333, + "learning_rate": 4e-05, + "loss": 5.0405, + "loss/crossentropy": 2.2590576112270355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102867066860199, + "step": 894 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.13743082682291666, + "learning_rate": 4e-05, + "loss": 5.2191, + "loss/crossentropy": 1.7941122353076935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186736473813653, + "step": 896 + }, + { + "epoch": 0.07483333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.12220052083333334, + "learning_rate": 4e-05, + "loss": 5.1269, + "loss/crossentropy": 2.2678189873695374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22983458638191223, + "step": 898 + }, + { + "epoch": 0.075, + "grad_norm": 5.375, + "grad_norm_var": 0.13631184895833334, + "learning_rate": 4e-05, + "loss": 4.8294, + "loss/crossentropy": 2.2537818551063538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2183692753314972, + "step": 900 + }, + { + "epoch": 0.07516666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.12668863932291666, + "learning_rate": 4e-05, + "loss": 4.7788, + "loss/crossentropy": 2.1383658349514008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21728335320949554, + "step": 902 + }, + { + "epoch": 0.07533333333333334, + "grad_norm": 5.71875, + "grad_norm_var": 0.13644205729166667, + "learning_rate": 4e-05, + "loss": 5.0809, + "loss/crossentropy": 1.899217240512371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059489786624908, + "step": 904 + }, + { + "epoch": 0.0755, + "grad_norm": 5.125, + "grad_norm_var": 0.14010416666666667, + "learning_rate": 4e-05, + "loss": 5.072, + "loss/crossentropy": 2.597853124141693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22190867736935616, + "step": 906 + }, + { + "epoch": 0.07566666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.14192708333333334, + "learning_rate": 4e-05, + "loss": 5.0169, + "loss/crossentropy": 2.3640182316303253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234844408929348, + "step": 908 + }, + { + "epoch": 0.07583333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.04889322916666667, + "learning_rate": 4e-05, + "loss": 4.3463, + "loss/crossentropy": 1.7232627272605896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17844930291175842, + "step": 910 + }, + { + "epoch": 0.076, + "grad_norm": 5.40625, + "grad_norm_var": 0.07498372395833333, + "learning_rate": 4e-05, + "loss": 4.2796, + "loss/crossentropy": 1.3739222288131714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15678460523486137, + "step": 912 + }, + { + "epoch": 0.07616666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.096728515625, + "learning_rate": 4e-05, + "loss": 4.9171, + "loss/crossentropy": 2.0244703590869904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22794625163078308, + "step": 914 + }, + { + "epoch": 0.07633333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.10305989583333333, + "learning_rate": 4e-05, + "loss": 4.9391, + "loss/crossentropy": 2.0325954258441925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203802440315485, + "step": 916 + }, + { + "epoch": 0.0765, + "grad_norm": 5.125, + "grad_norm_var": 0.1103515625, + "learning_rate": 4e-05, + "loss": 4.8463, + "loss/crossentropy": 2.536973237991333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22789884731173515, + "step": 918 + }, + { + "epoch": 0.07666666666666666, + "grad_norm": 5.5, + "grad_norm_var": 0.10354410807291667, + "learning_rate": 4e-05, + "loss": 4.956, + "loss/crossentropy": 2.464049816131592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22606119513511658, + "step": 920 + }, + { + "epoch": 0.07683333333333334, + "grad_norm": 5.625, + "grad_norm_var": 0.10116780598958333, + "learning_rate": 4e-05, + "loss": 5.1667, + "loss/crossentropy": 1.8012469932436943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026966456323862, + "step": 922 + }, + { + "epoch": 0.077, + "grad_norm": 4.875, + "grad_norm_var": 0.12096354166666666, + "learning_rate": 4e-05, + "loss": 4.3867, + "loss/crossentropy": 0.7537075951695442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11622426472604275, + "step": 924 + }, + { + "epoch": 0.07716666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.14560139973958333, + "learning_rate": 4e-05, + "loss": 4.9043, + "loss/crossentropy": 2.0230683609843254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19815506786108017, + "step": 926 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 5.8125, + "grad_norm_var": 0.11731363932291666, + "learning_rate": 4e-05, + "loss": 5.4114, + "loss/crossentropy": 2.449110984802246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23653070628643036, + "step": 928 + }, + { + "epoch": 0.0775, + "grad_norm": 5.21875, + "grad_norm_var": 0.11243489583333334, + "learning_rate": 4e-05, + "loss": 4.9905, + "loss/crossentropy": 1.4106080010533333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20390398800373077, + "step": 930 + }, + { + "epoch": 0.07766666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.10862223307291667, + "learning_rate": 4e-05, + "loss": 5.1747, + "loss/crossentropy": 2.168779395520687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18856257386505604, + "step": 932 + }, + { + "epoch": 0.07783333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.09322916666666667, + "learning_rate": 4e-05, + "loss": 5.3753, + "loss/crossentropy": 1.9819502532482147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197083055973053, + "step": 934 + }, + { + "epoch": 0.078, + "grad_norm": 6.0, + "grad_norm_var": 0.12967122395833333, + "learning_rate": 4e-05, + "loss": 4.586, + "loss/crossentropy": 2.2913994789123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21365177258849144, + "step": 936 + }, + { + "epoch": 0.07816666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.1279296875, + "learning_rate": 4e-05, + "loss": 5.4079, + "loss/crossentropy": 2.3492658138275146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2506399601697922, + "step": 938 + }, + { + "epoch": 0.07833333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.10709228515625, + "learning_rate": 4e-05, + "loss": 5.1427, + "loss/crossentropy": 2.3426185250282288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21920472010970116, + "step": 940 + }, + { + "epoch": 0.0785, + "grad_norm": 5.5, + "grad_norm_var": 0.09029947916666667, + "learning_rate": 4e-05, + "loss": 5.7286, + "loss/crossentropy": 1.9868685603141785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850258819758892, + "step": 942 + }, + { + "epoch": 0.07866666666666666, + "grad_norm": 5.65625, + "grad_norm_var": 0.080859375, + "learning_rate": 4e-05, + "loss": 5.1068, + "loss/crossentropy": 2.3053890466690063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049488164484501, + "step": 944 + }, + { + "epoch": 0.07883333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.09524739583333333, + "learning_rate": 4e-05, + "loss": 5.1353, + "loss/crossentropy": 2.455892562866211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22708288580179214, + "step": 946 + }, + { + "epoch": 0.079, + "grad_norm": 5.625, + "grad_norm_var": 0.09212239583333333, + "learning_rate": 4e-05, + "loss": 5.5379, + "loss/crossentropy": 2.7193942070007324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23220528662204742, + "step": 948 + }, + { + "epoch": 0.07916666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.08515218098958334, + "learning_rate": 4e-05, + "loss": 5.3782, + "loss/crossentropy": 2.350933760404587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21525833755731583, + "step": 950 + }, + { + "epoch": 0.07933333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 4e-05, + "loss": 5.1386, + "loss/crossentropy": 1.689025953412056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20566146075725555, + "step": 952 + }, + { + "epoch": 0.0795, + "grad_norm": 5.40625, + "grad_norm_var": 0.04062093098958333, + "learning_rate": 4e-05, + "loss": 4.5698, + "loss/crossentropy": 1.6841574162244797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17244276031851768, + "step": 954 + }, + { + "epoch": 0.07966666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.040755208333333334, + "learning_rate": 4e-05, + "loss": 5.2786, + "loss/crossentropy": 1.983876220881939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17574166506528854, + "step": 956 + }, + { + "epoch": 0.07983333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.04439697265625, + "learning_rate": 4e-05, + "loss": 4.7437, + "loss/crossentropy": 1.4942948892712593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328424882143736, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 5.25, + "grad_norm_var": 0.03677978515625, + "learning_rate": 4e-05, + "loss": 5.1834, + "loss/crossentropy": 1.8319725766777992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919633485376835, + "step": 960 + }, + { + "epoch": 0.08016666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.021480305989583334, + "learning_rate": 4e-05, + "loss": 5.5035, + "loss/crossentropy": 1.7946438565850258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19422894716262817, + "step": 962 + }, + { + "epoch": 0.08033333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.040262858072916664, + "learning_rate": 4e-05, + "loss": 5.0136, + "loss/crossentropy": 1.359002597630024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14611583948135376, + "step": 964 + }, + { + "epoch": 0.0805, + "grad_norm": 5.34375, + "grad_norm_var": 0.040848795572916666, + "learning_rate": 4e-05, + "loss": 5.1417, + "loss/crossentropy": 1.8300999030470848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20316387340426445, + "step": 966 + }, + { + "epoch": 0.08066666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 4e-05, + "loss": 5.4677, + "loss/crossentropy": 2.5086329579353333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248428463935852, + "step": 968 + }, + { + "epoch": 0.08083333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.04659830729166667, + "learning_rate": 4e-05, + "loss": 4.9775, + "loss/crossentropy": 1.6595203876495361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1784625742584467, + "step": 970 + }, + { + "epoch": 0.081, + "grad_norm": 5.21875, + "grad_norm_var": 0.04006754557291667, + "learning_rate": 4e-05, + "loss": 4.5601, + "loss/crossentropy": 2.0515496730804443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251333761960268, + "step": 972 + }, + { + "epoch": 0.08116666666666666, + "grad_norm": 5.9375, + "grad_norm_var": 0.07532145182291666, + "learning_rate": 4e-05, + "loss": 4.9809, + "loss/crossentropy": 2.0578393265604973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044786848127842, + "step": 974 + }, + { + "epoch": 0.08133333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.07902018229166667, + "learning_rate": 4e-05, + "loss": 5.3295, + "loss/crossentropy": 2.3250069618225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22813353687524796, + "step": 976 + }, + { + "epoch": 0.0815, + "grad_norm": 5.46875, + "grad_norm_var": 0.10435791015625, + "learning_rate": 4e-05, + "loss": 4.2614, + "loss/crossentropy": 1.668112076818943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1645719800144434, + "step": 978 + }, + { + "epoch": 0.08166666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.09332275390625, + "learning_rate": 4e-05, + "loss": 4.5831, + "loss/crossentropy": 2.0174410790205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.194509482011199, + "step": 980 + }, + { + "epoch": 0.08183333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.10445556640625, + "learning_rate": 4e-05, + "loss": 4.487, + "loss/crossentropy": 1.5212369486689568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15324652940034866, + "step": 982 + }, + { + "epoch": 0.082, + "grad_norm": 5.625, + "grad_norm_var": 0.11470947265625, + "learning_rate": 4e-05, + "loss": 5.4141, + "loss/crossentropy": 2.1497460901737213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21780480071902275, + "step": 984 + }, + { + "epoch": 0.08216666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.12024332682291666, + "learning_rate": 4e-05, + "loss": 4.9362, + "loss/crossentropy": 2.4396926164627075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23286250606179237, + "step": 986 + }, + { + "epoch": 0.08233333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.12483317057291667, + "learning_rate": 4e-05, + "loss": 5.0999, + "loss/crossentropy": 1.889777421951294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20137444324791431, + "step": 988 + }, + { + "epoch": 0.0825, + "grad_norm": 5.4375, + "grad_norm_var": 0.09685872395833334, + "learning_rate": 4e-05, + "loss": 5.0512, + "loss/crossentropy": 1.6567106246948242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837901659309864, + "step": 990 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.12862955729166667, + "learning_rate": 4e-05, + "loss": 5.1472, + "loss/crossentropy": 2.333681643009186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2757877744734287, + "step": 992 + }, + { + "epoch": 0.08283333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.09720052083333333, + "learning_rate": 4e-05, + "loss": 4.9003, + "loss/crossentropy": 2.455785632133484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.235845647752285, + "step": 994 + }, + { + "epoch": 0.083, + "grad_norm": 5.59375, + "grad_norm_var": 0.10006103515625, + "learning_rate": 4e-05, + "loss": 4.7869, + "loss/crossentropy": 1.6923584789037704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18443119525909424, + "step": 996 + }, + { + "epoch": 0.08316666666666667, + "grad_norm": 5.59375, + "grad_norm_var": 0.12935791015625, + "learning_rate": 4e-05, + "loss": 5.2978, + "loss/crossentropy": 2.250712603330612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22323672845959663, + "step": 998 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.14816080729166667, + "learning_rate": 4e-05, + "loss": 4.8683, + "loss/crossentropy": 1.629243291914463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17515265941619873, + "step": 1000 + }, + { + "epoch": 0.0835, + "grad_norm": 4.90625, + "grad_norm_var": 0.163134765625, + "learning_rate": 4e-05, + "loss": 4.8394, + "loss/crossentropy": 1.5451477617025375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16557494178414345, + "step": 1002 + }, + { + "epoch": 0.08366666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.16617431640625, + "learning_rate": 4e-05, + "loss": 5.3454, + "loss/crossentropy": 2.027478814125061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930145062506199, + "step": 1004 + }, + { + "epoch": 0.08383333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.16291910807291668, + "learning_rate": 4e-05, + "loss": 5.4164, + "loss/crossentropy": 1.7522178888320923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17787319794297218, + "step": 1006 + }, + { + "epoch": 0.084, + "grad_norm": 5.375, + "grad_norm_var": 0.12831624348958334, + "learning_rate": 4e-05, + "loss": 5.1326, + "loss/crossentropy": 2.2606292963027954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22609979286789894, + "step": 1008 + }, + { + "epoch": 0.08416666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.12669270833333332, + "learning_rate": 4e-05, + "loss": 4.6972, + "loss/crossentropy": 1.3283646404743195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16521522216498852, + "step": 1010 + }, + { + "epoch": 0.08433333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.12975260416666667, + "learning_rate": 4e-05, + "loss": 4.4952, + "loss/crossentropy": 1.5593970566987991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17167419753968716, + "step": 1012 + }, + { + "epoch": 0.0845, + "grad_norm": 6.46875, + "grad_norm_var": 0.16975504557291668, + "learning_rate": 4e-05, + "loss": 4.9853, + "loss/crossentropy": 1.7582807093858719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954742781817913, + "step": 1014 + }, + { + "epoch": 0.08466666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.14810791015625, + "learning_rate": 4e-05, + "loss": 5.5712, + "loss/crossentropy": 1.4745187312364578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.175180334597826, + "step": 1016 + }, + { + "epoch": 0.08483333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.1140625, + "learning_rate": 4e-05, + "loss": 5.02, + "loss/crossentropy": 1.4914978370070457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.153540201485157, + "step": 1018 + }, + { + "epoch": 0.085, + "grad_norm": 5.21875, + "grad_norm_var": 0.11285400390625, + "learning_rate": 4e-05, + "loss": 5.0581, + "loss/crossentropy": 2.0033098682761192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19129659608006477, + "step": 1020 + }, + { + "epoch": 0.08516666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.11636962890625, + "learning_rate": 4e-05, + "loss": 4.9019, + "loss/crossentropy": 1.3103836476802826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14399930834770203, + "step": 1022 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.11998291015625, + "learning_rate": 4e-05, + "loss": 5.3554, + "loss/crossentropy": 2.316011965274811, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24070628732442856, + "step": 1024 + }, + { + "epoch": 0.0855, + "grad_norm": 5.3125, + "grad_norm_var": 0.135400390625, + "learning_rate": 4e-05, + "loss": 5.0308, + "loss/crossentropy": 1.4706613272428513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16036737896502018, + "step": 1026 + }, + { + "epoch": 0.08566666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.12779541015625, + "learning_rate": 4e-05, + "loss": 4.9992, + "loss/crossentropy": 2.273362785577774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22011291980743408, + "step": 1028 + }, + { + "epoch": 0.08583333333333333, + "grad_norm": 5.75, + "grad_norm_var": 0.05761311848958333, + "learning_rate": 4e-05, + "loss": 4.8171, + "loss/crossentropy": 1.5396167114377022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16673108749091625, + "step": 1030 + }, + { + "epoch": 0.086, + "grad_norm": 5.34375, + "grad_norm_var": 0.20703125, + "learning_rate": 4e-05, + "loss": 5.0549, + "loss/crossentropy": 2.5265402793884277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24674354121088982, + "step": 1032 + }, + { + "epoch": 0.08616666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.23644205729166667, + "learning_rate": 4e-05, + "loss": 4.7003, + "loss/crossentropy": 1.2078011631965637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17747630923986435, + "step": 1034 + }, + { + "epoch": 0.08633333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.23554280598958333, + "learning_rate": 4e-05, + "loss": 5.3133, + "loss/crossentropy": 2.0871264040470123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24350010231137276, + "step": 1036 + }, + { + "epoch": 0.0865, + "grad_norm": 5.40625, + "grad_norm_var": 0.22498372395833333, + "learning_rate": 4e-05, + "loss": 4.7851, + "loss/crossentropy": 1.9023667722940445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18097983859479427, + "step": 1038 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.25533854166666664, + "learning_rate": 4e-05, + "loss": 4.3374, + "loss/crossentropy": 1.3959245532751083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1558135487139225, + "step": 1040 + }, + { + "epoch": 0.08683333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.24257405598958334, + "learning_rate": 4e-05, + "loss": 5.2833, + "loss/crossentropy": 2.175060898065567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21963898465037346, + "step": 1042 + }, + { + "epoch": 0.087, + "grad_norm": 4.78125, + "grad_norm_var": 0.27828369140625, + "learning_rate": 4e-05, + "loss": 5.4538, + "loss/crossentropy": 2.4693975150585175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25492599606513977, + "step": 1044 + }, + { + "epoch": 0.08716666666666667, + "grad_norm": 7.6875, + "grad_norm_var": 0.59791259765625, + "learning_rate": 4e-05, + "loss": 4.7527, + "loss/crossentropy": 1.8814911097288132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21333661675453186, + "step": 1046 + }, + { + "epoch": 0.08733333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.4727701822916667, + "learning_rate": 4e-05, + "loss": 4.6145, + "loss/crossentropy": 1.8344381749629974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21359984576702118, + "step": 1048 + }, + { + "epoch": 0.0875, + "grad_norm": 4.78125, + "grad_norm_var": 0.48202718098958336, + "learning_rate": 4e-05, + "loss": 5.2823, + "loss/crossentropy": 2.050938367843628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791608016937971, + "step": 1050 + }, + { + "epoch": 0.08766666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.48342692057291664, + "learning_rate": 4e-05, + "loss": 5.0677, + "loss/crossentropy": 2.4595237970352173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22237907722592354, + "step": 1052 + }, + { + "epoch": 0.08783333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.4832967122395833, + "learning_rate": 4e-05, + "loss": 5.6448, + "loss/crossentropy": 2.6399565935134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271853476762772, + "step": 1054 + }, + { + "epoch": 0.088, + "grad_norm": 5.25, + "grad_norm_var": 0.4495930989583333, + "learning_rate": 4e-05, + "loss": 4.7361, + "loss/crossentropy": 1.2053988501429558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15466507151722908, + "step": 1056 + }, + { + "epoch": 0.08816666666666667, + "grad_norm": 5.46875, + "grad_norm_var": 0.461181640625, + "learning_rate": 4e-05, + "loss": 4.9132, + "loss/crossentropy": 1.488468736410141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14729905128479004, + "step": 1058 + }, + { + "epoch": 0.08833333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.4491170247395833, + "learning_rate": 4e-05, + "loss": 4.8853, + "loss/crossentropy": 1.5609579607844353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16526676714420319, + "step": 1060 + }, + { + "epoch": 0.0885, + "grad_norm": 5.46875, + "grad_norm_var": 0.07265218098958333, + "learning_rate": 4e-05, + "loss": 4.6105, + "loss/crossentropy": 1.6217800825834274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17694772593677044, + "step": 1062 + }, + { + "epoch": 0.08866666666666667, + "grad_norm": 8.0625, + "grad_norm_var": 0.61412353515625, + "learning_rate": 4e-05, + "loss": 5.2193, + "loss/crossentropy": 1.8461291044950485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21077187731862068, + "step": 1064 + }, + { + "epoch": 0.08883333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.60703125, + "learning_rate": 4e-05, + "loss": 5.1246, + "loss/crossentropy": 2.6400803327560425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22616837173700333, + "step": 1066 + }, + { + "epoch": 0.089, + "grad_norm": 5.46875, + "grad_norm_var": 0.6093587239583333, + "learning_rate": 4e-05, + "loss": 5.0405, + "loss/crossentropy": 2.375422090291977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20153706148266792, + "step": 1068 + }, + { + "epoch": 0.08916666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.6176432291666667, + "learning_rate": 4e-05, + "loss": 4.6603, + "loss/crossentropy": 2.2146050930023193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24322227016091347, + "step": 1070 + }, + { + "epoch": 0.08933333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.6146769205729167, + "learning_rate": 4e-05, + "loss": 4.8999, + "loss/crossentropy": 1.8436658903956413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19220684841275215, + "step": 1072 + }, + { + "epoch": 0.0895, + "grad_norm": 5.15625, + "grad_norm_var": 0.6200358072916666, + "learning_rate": 4e-05, + "loss": 5.1488, + "loss/crossentropy": 1.8162973299622536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20228558778762817, + "step": 1074 + }, + { + "epoch": 0.08966666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.6080729166666666, + "learning_rate": 4e-05, + "loss": 3.9845, + "loss/crossentropy": 2.2521041929721832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21630793064832687, + "step": 1076 + }, + { + "epoch": 0.08983333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.5907389322916666, + "learning_rate": 4e-05, + "loss": 4.899, + "loss/crossentropy": 2.8008521795272827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185939960181713, + "step": 1078 + }, + { + "epoch": 0.09, + "grad_norm": 5.25, + "grad_norm_var": 0.04781494140625, + "learning_rate": 4e-05, + "loss": 5.1706, + "loss/crossentropy": 2.006529211997986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23423625528812408, + "step": 1080 + }, + { + "epoch": 0.09016666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.043359375, + "learning_rate": 4e-05, + "loss": 4.5149, + "loss/crossentropy": 1.5252627283334732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16670545563101768, + "step": 1082 + }, + { + "epoch": 0.09033333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.029801432291666666, + "learning_rate": 4e-05, + "loss": 5.1486, + "loss/crossentropy": 1.7940563037991524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000715285539627, + "step": 1084 + }, + { + "epoch": 0.0905, + "grad_norm": 6.03125, + "grad_norm_var": 0.07154541015625, + "learning_rate": 4e-05, + "loss": 5.0886, + "loss/crossentropy": 2.1735753268003464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20067806914448738, + "step": 1086 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.07135009765625, + "learning_rate": 4e-05, + "loss": 4.8474, + "loss/crossentropy": 2.2438295483589172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555602967739105, + "step": 1088 + }, + { + "epoch": 0.09083333333333334, + "grad_norm": 5.75, + "grad_norm_var": 0.09073893229166667, + "learning_rate": 4e-05, + "loss": 5.2058, + "loss/crossentropy": 1.4616017490625381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21333470940589905, + "step": 1090 + }, + { + "epoch": 0.091, + "grad_norm": 5.90625, + "grad_norm_var": 0.11666259765625, + "learning_rate": 4e-05, + "loss": 4.6305, + "loss/crossentropy": 2.167073041200638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22877944260835648, + "step": 1092 + }, + { + "epoch": 0.09116666666666666, + "grad_norm": 6.0, + "grad_norm_var": 0.14568684895833334, + "learning_rate": 4e-05, + "loss": 5.1547, + "loss/crossentropy": 2.292715698480606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23625801876187325, + "step": 1094 + }, + { + "epoch": 0.09133333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.1615234375, + "learning_rate": 4e-05, + "loss": 4.6894, + "loss/crossentropy": 1.8984995782375336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20002290606498718, + "step": 1096 + }, + { + "epoch": 0.0915, + "grad_norm": 5.53125, + "grad_norm_var": 0.15767822265625, + "learning_rate": 4e-05, + "loss": 5.3258, + "loss/crossentropy": 2.3215838074684143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.239714615046978, + "step": 1098 + }, + { + "epoch": 0.09166666666666666, + "grad_norm": 5.28125, + "grad_norm_var": 0.153759765625, + "learning_rate": 4e-05, + "loss": 5.659, + "loss/crossentropy": 2.0826582312583923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21085572242736816, + "step": 1100 + }, + { + "epoch": 0.09183333333333334, + "grad_norm": 5.75, + "grad_norm_var": 0.14205322265625, + "learning_rate": 4e-05, + "loss": 5.2229, + "loss/crossentropy": 1.4214537590742111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1674542874097824, + "step": 1102 + }, + { + "epoch": 0.092, + "grad_norm": 5.125, + "grad_norm_var": 0.14312744140625, + "learning_rate": 4e-05, + "loss": 4.8583, + "loss/crossentropy": 1.8252490535378456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999596320092678, + "step": 1104 + }, + { + "epoch": 0.09216666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.9711873372395833, + "learning_rate": 4e-05, + "loss": 4.6134, + "loss/crossentropy": 1.295023687183857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15225711092352867, + "step": 1106 + }, + { + "epoch": 0.09233333333333334, + "grad_norm": 5.8125, + "grad_norm_var": 0.9461873372395834, + "learning_rate": 4e-05, + "loss": 4.8484, + "loss/crossentropy": 1.2159418240189552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1386381033807993, + "step": 1108 + }, + { + "epoch": 0.0925, + "grad_norm": 5.0625, + "grad_norm_var": 0.9702962239583334, + "learning_rate": 4e-05, + "loss": 5.0996, + "loss/crossentropy": 2.4133604764938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145095020532608, + "step": 1110 + }, + { + "epoch": 0.09266666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.9756510416666667, + "learning_rate": 4e-05, + "loss": 4.8344, + "loss/crossentropy": 1.7319612950086594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1731079574674368, + "step": 1112 + }, + { + "epoch": 0.09283333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.9765625, + "learning_rate": 4e-05, + "loss": 4.8517, + "loss/crossentropy": 1.6233867853879929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842523030936718, + "step": 1114 + }, + { + "epoch": 0.093, + "grad_norm": 5.40625, + "grad_norm_var": 0.9680826822916667, + "learning_rate": 4e-05, + "loss": 4.8547, + "loss/crossentropy": 2.48843851685524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22499863803386688, + "step": 1116 + }, + { + "epoch": 0.09316666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.98170166015625, + "learning_rate": 4e-05, + "loss": 5.0257, + "loss/crossentropy": 1.9700958281755447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18995188921689987, + "step": 1118 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 5.5, + "grad_norm_var": 0.9780232747395833, + "learning_rate": 4e-05, + "loss": 5.2743, + "loss/crossentropy": 2.393950343132019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21254169195890427, + "step": 1120 + }, + { + "epoch": 0.0935, + "grad_norm": 4.875, + "grad_norm_var": 0.07667643229166667, + "learning_rate": 4e-05, + "loss": 4.3726, + "loss/crossentropy": 2.0441563352942467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.176620876416564, + "step": 1122 + }, + { + "epoch": 0.09366666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.05546875, + "learning_rate": 4e-05, + "loss": 4.7267, + "loss/crossentropy": 1.8536287397146225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18368082493543625, + "step": 1124 + }, + { + "epoch": 0.09383333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.0544921875, + "learning_rate": 4e-05, + "loss": 5.0803, + "loss/crossentropy": 0.8758844807744026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12684419937431812, + "step": 1126 + }, + { + "epoch": 0.094, + "grad_norm": 5.5, + "grad_norm_var": 0.048726399739583336, + "learning_rate": 4e-05, + "loss": 5.1615, + "loss/crossentropy": 2.5835047364234924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21840446069836617, + "step": 1128 + }, + { + "epoch": 0.09416666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.04986979166666667, + "learning_rate": 4e-05, + "loss": 5.5816, + "loss/crossentropy": 2.6384198665618896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301637977361679, + "step": 1130 + }, + { + "epoch": 0.09433333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.056103515625, + "learning_rate": 4e-05, + "loss": 5.4229, + "loss/crossentropy": 1.1250766292214394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18534447066485882, + "step": 1132 + }, + { + "epoch": 0.0945, + "grad_norm": 5.125, + "grad_norm_var": 0.04959309895833333, + "learning_rate": 4e-05, + "loss": 5.3554, + "loss/crossentropy": 2.3953994810581207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2530173920094967, + "step": 1134 + }, + { + "epoch": 0.09466666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.048828125, + "learning_rate": 4e-05, + "loss": 4.88, + "loss/crossentropy": 1.7053054720163345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791569832712412, + "step": 1136 + }, + { + "epoch": 0.09483333333333334, + "grad_norm": 6.03125, + "grad_norm_var": 0.06383056640625, + "learning_rate": 4e-05, + "loss": 5.3785, + "loss/crossentropy": 2.3179805874824524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2322893813252449, + "step": 1138 + }, + { + "epoch": 0.095, + "grad_norm": 5.15625, + "grad_norm_var": 0.07327067057291667, + "learning_rate": 4e-05, + "loss": 4.8176, + "loss/crossentropy": 1.420469008386135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16073052026331425, + "step": 1140 + }, + { + "epoch": 0.09516666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.08088785807291667, + "learning_rate": 4e-05, + "loss": 4.3205, + "loss/crossentropy": 2.4501261115074158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22779357805848122, + "step": 1142 + }, + { + "epoch": 0.09533333333333334, + "grad_norm": 5.3125, + "grad_norm_var": 0.07838541666666667, + "learning_rate": 4e-05, + "loss": 5.4399, + "loss/crossentropy": 1.6310898885130882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19184290245175362, + "step": 1144 + }, + { + "epoch": 0.0955, + "grad_norm": 5.5, + "grad_norm_var": 0.09289957682291666, + "learning_rate": 4e-05, + "loss": 4.874, + "loss/crossentropy": 2.2742528915405273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111048549413681, + "step": 1146 + }, + { + "epoch": 0.09566666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.08596598307291667, + "learning_rate": 4e-05, + "loss": 4.5276, + "loss/crossentropy": 2.360960155725479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2365327812731266, + "step": 1148 + }, + { + "epoch": 0.09583333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.08917643229166666, + "learning_rate": 4e-05, + "loss": 5.2565, + "loss/crossentropy": 2.354514867067337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22909708321094513, + "step": 1150 + }, + { + "epoch": 0.096, + "grad_norm": 5.15625, + "grad_norm_var": 0.08723958333333333, + "learning_rate": 4e-05, + "loss": 4.9971, + "loss/crossentropy": 1.930092841386795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069223504513502, + "step": 1152 + }, + { + "epoch": 0.09616666666666666, + "grad_norm": 5.71875, + "grad_norm_var": 0.083056640625, + "learning_rate": 4e-05, + "loss": 4.5473, + "loss/crossentropy": 2.1483106315135956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045624665915966, + "step": 1154 + }, + { + "epoch": 0.09633333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.5146484375, + "learning_rate": 4e-05, + "loss": 5.0278, + "loss/crossentropy": 1.9653250426054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056643545627594, + "step": 1156 + }, + { + "epoch": 0.0965, + "grad_norm": 5.0, + "grad_norm_var": 0.53082275390625, + "learning_rate": 4e-05, + "loss": 4.2204, + "loss/crossentropy": 1.59404868632555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19967875257134438, + "step": 1158 + }, + { + "epoch": 0.09666666666666666, + "grad_norm": 5.65625, + "grad_norm_var": 0.5387003580729167, + "learning_rate": 4e-05, + "loss": 4.9826, + "loss/crossentropy": 2.1325821727514267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22309079766273499, + "step": 1160 + }, + { + "epoch": 0.09683333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.5381510416666667, + "learning_rate": 4e-05, + "loss": 4.3877, + "loss/crossentropy": 1.4681326597929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22608055919408798, + "step": 1162 + }, + { + "epoch": 0.097, + "grad_norm": 5.59375, + "grad_norm_var": 0.5563639322916667, + "learning_rate": 4e-05, + "loss": 4.3296, + "loss/crossentropy": 1.7761568650603294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17438078857958317, + "step": 1164 + }, + { + "epoch": 0.09716666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.5484659830729167, + "learning_rate": 4e-05, + "loss": 4.6415, + "loss/crossentropy": 2.402270257472992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308974713087082, + "step": 1166 + }, + { + "epoch": 0.09733333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.5458333333333333, + "learning_rate": 4e-05, + "loss": 5.0955, + "loss/crossentropy": 2.493393361568451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2291024811565876, + "step": 1168 + }, + { + "epoch": 0.0975, + "grad_norm": 5.3125, + "grad_norm_var": 0.5023396809895834, + "learning_rate": 4e-05, + "loss": 5.3826, + "loss/crossentropy": 2.0367672443389893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1950318068265915, + "step": 1170 + }, + { + "epoch": 0.09766666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.09302978515625, + "learning_rate": 4e-05, + "loss": 4.8491, + "loss/crossentropy": 1.9876883029937744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22394466400146484, + "step": 1172 + }, + { + "epoch": 0.09783333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.08215738932291666, + "learning_rate": 4e-05, + "loss": 5.3828, + "loss/crossentropy": 3.0178449749946594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20590073242783546, + "step": 1174 + }, + { + "epoch": 0.098, + "grad_norm": 5.25, + "grad_norm_var": 0.07343343098958334, + "learning_rate": 4e-05, + "loss": 4.7851, + "loss/crossentropy": 1.9217498302459717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339833036065102, + "step": 1176 + }, + { + "epoch": 0.09816666666666667, + "grad_norm": 6.375, + "grad_norm_var": 0.13677978515625, + "learning_rate": 4e-05, + "loss": 5.4871, + "loss/crossentropy": 2.4020891785621643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366964928805828, + "step": 1178 + }, + { + "epoch": 0.09833333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.12213134765625, + "learning_rate": 4e-05, + "loss": 4.8593, + "loss/crossentropy": 2.4006099104881287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22088930010795593, + "step": 1180 + }, + { + "epoch": 0.0985, + "grad_norm": 6.5, + "grad_norm_var": 0.21571858723958334, + "learning_rate": 4e-05, + "loss": 5.0819, + "loss/crossentropy": 2.0782680213451385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24928846955299377, + "step": 1182 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.21457926432291666, + "learning_rate": 4e-05, + "loss": 5.0726, + "loss/crossentropy": 2.0695590674877167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18602534383535385, + "step": 1184 + }, + { + "epoch": 0.09883333333333333, + "grad_norm": 5.78125, + "grad_norm_var": 0.22304280598958334, + "learning_rate": 4e-05, + "loss": 4.7877, + "loss/crossentropy": 1.6022805571556091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210852961987257, + "step": 1186 + }, + { + "epoch": 0.099, + "grad_norm": 5.375, + "grad_norm_var": 0.18235677083333332, + "learning_rate": 4e-05, + "loss": 5.1089, + "loss/crossentropy": 2.142100676894188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20843049511313438, + "step": 1188 + }, + { + "epoch": 0.09916666666666667, + "grad_norm": 5.59375, + "grad_norm_var": 0.23609619140625, + "learning_rate": 4e-05, + "loss": 4.787, + "loss/crossentropy": 1.4725098609924316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18635604158043861, + "step": 1190 + }, + { + "epoch": 0.09933333333333333, + "grad_norm": 5.625, + "grad_norm_var": 0.245166015625, + "learning_rate": 4e-05, + "loss": 5.169, + "loss/crossentropy": 2.237101376056671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352110929787159, + "step": 1192 + }, + { + "epoch": 0.0995, + "grad_norm": 5.46875, + "grad_norm_var": 0.18853759765625, + "learning_rate": 4e-05, + "loss": 5.5126, + "loss/crossentropy": 2.0248168110847473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19281497597694397, + "step": 1194 + }, + { + "epoch": 0.09966666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.19179280598958334, + "learning_rate": 4e-05, + "loss": 5.1818, + "loss/crossentropy": 1.6968555450439453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19379430450499058, + "step": 1196 + }, + { + "epoch": 0.09983333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.10631103515625, + "learning_rate": 4e-05, + "loss": 4.8209, + "loss/crossentropy": 1.5468868017196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16440916433930397, + "step": 1198 + }, + { + "epoch": 0.1, + "grad_norm": 5.125, + "grad_norm_var": 0.10435791015625, + "learning_rate": 4e-05, + "loss": 5.0316, + "loss/crossentropy": 1.8919531255960464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1950981542468071, + "step": 1200 + }, + { + "epoch": 0.10016666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.0984375, + "learning_rate": 4e-05, + "loss": 5.147, + "loss/crossentropy": 1.6899343207478523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17296983301639557, + "step": 1202 + }, + { + "epoch": 0.10033333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.10318603515625, + "learning_rate": 4e-05, + "loss": 4.8684, + "loss/crossentropy": 2.129749298095703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22954006493091583, + "step": 1204 + }, + { + "epoch": 0.1005, + "grad_norm": 6.0, + "grad_norm_var": 0.08917643229166666, + "learning_rate": 4e-05, + "loss": 5.0732, + "loss/crossentropy": 1.873815581202507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18716946989297867, + "step": 1206 + }, + { + "epoch": 0.10066666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.07224934895833333, + "learning_rate": 4e-05, + "loss": 5.2633, + "loss/crossentropy": 2.32140251994133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23857327923178673, + "step": 1208 + }, + { + "epoch": 0.10083333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.07857666015625, + "learning_rate": 4e-05, + "loss": 5.4185, + "loss/crossentropy": 2.4386764764785767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22701247781515121, + "step": 1210 + }, + { + "epoch": 0.101, + "grad_norm": 5.0, + "grad_norm_var": 0.06886393229166667, + "learning_rate": 4e-05, + "loss": 5.0207, + "loss/crossentropy": 1.033334881067276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1509057618677616, + "step": 1212 + }, + { + "epoch": 0.10116666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.07232666015625, + "learning_rate": 4e-05, + "loss": 5.3547, + "loss/crossentropy": 2.575928032398224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22841667383909225, + "step": 1214 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.08534749348958333, + "learning_rate": 4e-05, + "loss": 4.6306, + "loss/crossentropy": 1.8462003320455551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828253660351038, + "step": 1216 + }, + { + "epoch": 0.1015, + "grad_norm": 5.5625, + "grad_norm_var": 0.07909749348958334, + "learning_rate": 4e-05, + "loss": 4.7978, + "loss/crossentropy": 2.3569419384002686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24894802644848824, + "step": 1218 + }, + { + "epoch": 0.10166666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.08370768229166667, + "learning_rate": 4e-05, + "loss": 5.6661, + "loss/crossentropy": 2.0236599445343018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20601807162165642, + "step": 1220 + }, + { + "epoch": 0.10183333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.1169921875, + "learning_rate": 4e-05, + "loss": 4.9545, + "loss/crossentropy": 2.3474625945091248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21253760159015656, + "step": 1222 + }, + { + "epoch": 0.102, + "grad_norm": 5.09375, + "grad_norm_var": 0.12795817057291667, + "learning_rate": 4e-05, + "loss": 5.1116, + "loss/crossentropy": 2.0291855931282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004396729171276, + "step": 1224 + }, + { + "epoch": 0.10216666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.13644205729166667, + "learning_rate": 4e-05, + "loss": 4.839, + "loss/crossentropy": 1.786637932062149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793370395898819, + "step": 1226 + }, + { + "epoch": 0.10233333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.19013264973958333, + "learning_rate": 4e-05, + "loss": 4.4245, + "loss/crossentropy": 1.5033576264977455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1820121891796589, + "step": 1228 + }, + { + "epoch": 0.1025, + "grad_norm": 5.78125, + "grad_norm_var": 0.20201822916666667, + "learning_rate": 4e-05, + "loss": 5.0492, + "loss/crossentropy": 1.7019300237298012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1789207085967064, + "step": 1230 + }, + { + "epoch": 0.10266666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.21126302083333334, + "learning_rate": 4e-05, + "loss": 4.788, + "loss/crossentropy": 1.907809428870678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18188271671533585, + "step": 1232 + }, + { + "epoch": 0.10283333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.21638997395833334, + "learning_rate": 4e-05, + "loss": 5.1517, + "loss/crossentropy": 2.4552002549171448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21234703436493874, + "step": 1234 + }, + { + "epoch": 0.103, + "grad_norm": 4.96875, + "grad_norm_var": 0.20930582682291668, + "learning_rate": 4e-05, + "loss": 4.9628, + "loss/crossentropy": 2.644266128540039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23479052260518074, + "step": 1236 + }, + { + "epoch": 0.10316666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.12864176432291666, + "learning_rate": 4e-05, + "loss": 5.3084, + "loss/crossentropy": 2.67303067445755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22366882488131523, + "step": 1238 + }, + { + "epoch": 0.10333333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.111181640625, + "learning_rate": 4e-05, + "loss": 4.9053, + "loss/crossentropy": 1.5672541037201881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16596291214227676, + "step": 1240 + }, + { + "epoch": 0.1035, + "grad_norm": 4.625, + "grad_norm_var": 0.10232747395833333, + "learning_rate": 4e-05, + "loss": 4.5866, + "loss/crossentropy": 2.6629343032836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24874060973525047, + "step": 1242 + }, + { + "epoch": 0.10366666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.08292643229166667, + "learning_rate": 4e-05, + "loss": 4.4504, + "loss/crossentropy": 2.156973510980606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21799809858202934, + "step": 1244 + }, + { + "epoch": 0.10383333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.05780843098958333, + "learning_rate": 4e-05, + "loss": 5.2903, + "loss/crossentropy": 1.837260901927948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21180275082588196, + "step": 1246 + }, + { + "epoch": 0.104, + "grad_norm": 5.28125, + "grad_norm_var": 0.0615234375, + "learning_rate": 4e-05, + "loss": 4.5459, + "loss/crossentropy": 1.2645907923579216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14384145848453045, + "step": 1248 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.06304931640625, + "learning_rate": 4e-05, + "loss": 5.0964, + "loss/crossentropy": 2.430781602859497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22693831473588943, + "step": 1250 + }, + { + "epoch": 0.10433333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.3664021809895833, + "learning_rate": 4e-05, + "loss": 4.4491, + "loss/crossentropy": 1.7102079764008522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991841085255146, + "step": 1252 + }, + { + "epoch": 0.1045, + "grad_norm": 5.03125, + "grad_norm_var": 0.36282145182291664, + "learning_rate": 4e-05, + "loss": 4.9358, + "loss/crossentropy": 2.073123261332512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17315717414021492, + "step": 1254 + }, + { + "epoch": 0.10466666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.37418212890625, + "learning_rate": 4e-05, + "loss": 5.5336, + "loss/crossentropy": 2.1373045444488525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219538614153862, + "step": 1256 + }, + { + "epoch": 0.10483333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.37675374348958335, + "learning_rate": 4e-05, + "loss": 4.912, + "loss/crossentropy": 1.0041880533099174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13228822499513626, + "step": 1258 + }, + { + "epoch": 0.105, + "grad_norm": 4.875, + "grad_norm_var": 0.35985921223958334, + "learning_rate": 4e-05, + "loss": 5.418, + "loss/crossentropy": 2.5306063890457153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2324695996940136, + "step": 1260 + }, + { + "epoch": 0.10516666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.352197265625, + "learning_rate": 4e-05, + "loss": 4.8752, + "loss/crossentropy": 1.99101173132658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560059323906898, + "step": 1262 + }, + { + "epoch": 0.10533333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.3563639322916667, + "learning_rate": 4e-05, + "loss": 4.5795, + "loss/crossentropy": 1.2876827344298363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13764869794249535, + "step": 1264 + }, + { + "epoch": 0.1055, + "grad_norm": 5.5625, + "grad_norm_var": 0.3610310872395833, + "learning_rate": 4e-05, + "loss": 4.9981, + "loss/crossentropy": 2.373332917690277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24393631890416145, + "step": 1266 + }, + { + "epoch": 0.10566666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.06151936848958333, + "learning_rate": 4e-05, + "loss": 4.7999, + "loss/crossentropy": 2.0270435735583305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20645768009126186, + "step": 1268 + }, + { + "epoch": 0.10583333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.06073811848958333, + "learning_rate": 4e-05, + "loss": 4.6323, + "loss/crossentropy": 1.5579545721411705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1702574621886015, + "step": 1270 + }, + { + "epoch": 0.106, + "grad_norm": 5.15625, + "grad_norm_var": 0.047265625, + "learning_rate": 4e-05, + "loss": 4.4562, + "loss/crossentropy": 2.4982908964157104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260519601404667, + "step": 1272 + }, + { + "epoch": 0.10616666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.03284098307291667, + "learning_rate": 4e-05, + "loss": 4.9248, + "loss/crossentropy": 1.9325073957443237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19312690198421478, + "step": 1274 + }, + { + "epoch": 0.10633333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.031884765625, + "learning_rate": 4e-05, + "loss": 4.9711, + "loss/crossentropy": 1.769273281097412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776899322867393, + "step": 1276 + }, + { + "epoch": 0.1065, + "grad_norm": 5.28125, + "grad_norm_var": 0.028999837239583333, + "learning_rate": 4e-05, + "loss": 5.5337, + "loss/crossentropy": 2.284360885620117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22272326424717903, + "step": 1278 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.028499348958333334, + "learning_rate": 4e-05, + "loss": 5.2305, + "loss/crossentropy": 1.8736866936087608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111079953610897, + "step": 1280 + }, + { + "epoch": 0.10683333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.04016927083333333, + "learning_rate": 4e-05, + "loss": 4.5859, + "loss/crossentropy": 1.6894002929329872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19435017928481102, + "step": 1282 + }, + { + "epoch": 0.107, + "grad_norm": 5.0625, + "grad_norm_var": 0.03518473307291667, + "learning_rate": 4e-05, + "loss": 4.6698, + "loss/crossentropy": 1.5436028242111206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1630682311952114, + "step": 1284 + }, + { + "epoch": 0.10716666666666666, + "grad_norm": 5.84375, + "grad_norm_var": 0.06803385416666667, + "learning_rate": 4e-05, + "loss": 5.3232, + "loss/crossentropy": 2.6317964792251587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24785209447145462, + "step": 1286 + }, + { + "epoch": 0.10733333333333334, + "grad_norm": 5.53125, + "grad_norm_var": 0.10422770182291667, + "learning_rate": 4e-05, + "loss": 5.1804, + "loss/crossentropy": 1.5235177874565125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16595890559256077, + "step": 1288 + }, + { + "epoch": 0.1075, + "grad_norm": 5.0625, + "grad_norm_var": 0.11223958333333334, + "learning_rate": 4e-05, + "loss": 5.0182, + "loss/crossentropy": 2.0896430388092995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19103838317096233, + "step": 1290 + }, + { + "epoch": 0.10766666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.13593343098958333, + "learning_rate": 4e-05, + "loss": 4.7246, + "loss/crossentropy": 2.059629112482071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217556670308113, + "step": 1292 + }, + { + "epoch": 0.10783333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.15558268229166666, + "learning_rate": 4e-05, + "loss": 4.3552, + "loss/crossentropy": 1.9174365252256393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18914584256708622, + "step": 1294 + }, + { + "epoch": 0.108, + "grad_norm": 5.25, + "grad_norm_var": 0.15556233723958332, + "learning_rate": 4e-05, + "loss": 4.8855, + "loss/crossentropy": 2.258558452129364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21850135922431946, + "step": 1296 + }, + { + "epoch": 0.10816666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.14166666666666666, + "learning_rate": 4e-05, + "loss": 4.8919, + "loss/crossentropy": 2.0549103915691376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344163686037064, + "step": 1298 + }, + { + "epoch": 0.10833333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.14542643229166666, + "learning_rate": 4e-05, + "loss": 4.614, + "loss/crossentropy": 2.098690018057823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107415907084942, + "step": 1300 + }, + { + "epoch": 0.1085, + "grad_norm": 5.375, + "grad_norm_var": 0.11389567057291666, + "learning_rate": 4e-05, + "loss": 4.8096, + "loss/crossentropy": 1.3393612429499626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17516088113188744, + "step": 1302 + }, + { + "epoch": 0.10866666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 4e-05, + "loss": 5.2506, + "loss/crossentropy": 1.7870676293969154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19672731682658195, + "step": 1304 + }, + { + "epoch": 0.10883333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.04983317057291667, + "learning_rate": 4e-05, + "loss": 5.2597, + "loss/crossentropy": 2.245271325111389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325793489813805, + "step": 1306 + }, + { + "epoch": 0.109, + "grad_norm": 6.0, + "grad_norm_var": 0.09648030598958333, + "learning_rate": 4e-05, + "loss": 5.0179, + "loss/crossentropy": 2.556147426366806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22807640954852104, + "step": 1308 + }, + { + "epoch": 0.10916666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.078759765625, + "learning_rate": 4e-05, + "loss": 4.9811, + "loss/crossentropy": 2.244424045085907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22903723642230034, + "step": 1310 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.0791015625, + "learning_rate": 4e-05, + "loss": 4.3534, + "loss/crossentropy": 1.5312049500644207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16121600940823555, + "step": 1312 + }, + { + "epoch": 0.1095, + "grad_norm": 6.0, + "grad_norm_var": 0.10878499348958333, + "learning_rate": 4e-05, + "loss": 5.0244, + "loss/crossentropy": 1.7539609968662262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039298675954342, + "step": 1314 + }, + { + "epoch": 0.10966666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.10193684895833334, + "learning_rate": 4e-05, + "loss": 5.0756, + "loss/crossentropy": 2.2906831800937653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22330694645643234, + "step": 1316 + }, + { + "epoch": 0.10983333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.10165608723958333, + "learning_rate": 4e-05, + "loss": 5.4659, + "loss/crossentropy": 2.422152817249298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22418388351798058, + "step": 1318 + }, + { + "epoch": 0.11, + "grad_norm": 4.78125, + "grad_norm_var": 0.12274983723958334, + "learning_rate": 4e-05, + "loss": 5.0434, + "loss/crossentropy": 2.335483729839325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262851968407631, + "step": 1320 + }, + { + "epoch": 0.11016666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.11308186848958333, + "learning_rate": 4e-05, + "loss": 4.7446, + "loss/crossentropy": 1.9849571883678436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22970320284366608, + "step": 1322 + }, + { + "epoch": 0.11033333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.08787434895833333, + "learning_rate": 4e-05, + "loss": 4.9389, + "loss/crossentropy": 1.5644195303320885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21707364916801453, + "step": 1324 + }, + { + "epoch": 0.1105, + "grad_norm": 4.90625, + "grad_norm_var": 0.09641927083333333, + "learning_rate": 4e-05, + "loss": 5.1959, + "loss/crossentropy": 2.2511331140995026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20980435609817505, + "step": 1326 + }, + { + "epoch": 0.11066666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.20533854166666668, + "learning_rate": 4e-05, + "loss": 5.3261, + "loss/crossentropy": 2.4248663187026978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21466375887393951, + "step": 1328 + }, + { + "epoch": 0.11083333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.17899983723958332, + "learning_rate": 4e-05, + "loss": 4.4923, + "loss/crossentropy": 2.2259855568408966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21654033288359642, + "step": 1330 + }, + { + "epoch": 0.111, + "grad_norm": 5.78125, + "grad_norm_var": 0.19322916666666667, + "learning_rate": 4e-05, + "loss": 5.5515, + "loss/crossentropy": 2.337790846824646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23069706931710243, + "step": 1332 + }, + { + "epoch": 0.11116666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.21103108723958333, + "learning_rate": 4e-05, + "loss": 4.8015, + "loss/crossentropy": 1.8960464149713516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18768173828721046, + "step": 1334 + }, + { + "epoch": 0.11133333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.19947916666666668, + "learning_rate": 4e-05, + "loss": 4.9181, + "loss/crossentropy": 2.5201704502105713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23990615457296371, + "step": 1336 + }, + { + "epoch": 0.1115, + "grad_norm": 5.125, + "grad_norm_var": 0.20764567057291666, + "learning_rate": 4e-05, + "loss": 4.9746, + "loss/crossentropy": 2.4421244263648987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21953672170639038, + "step": 1338 + }, + { + "epoch": 0.11166666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.230712890625, + "learning_rate": 4e-05, + "loss": 4.5514, + "loss/crossentropy": 1.0926872938871384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1308863628655672, + "step": 1340 + }, + { + "epoch": 0.11183333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.23811442057291668, + "learning_rate": 4e-05, + "loss": 4.5832, + "loss/crossentropy": 1.838907465338707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402666971087456, + "step": 1342 + }, + { + "epoch": 0.112, + "grad_norm": 4.84375, + "grad_norm_var": 0.14998372395833334, + "learning_rate": 4e-05, + "loss": 4.5698, + "loss/crossentropy": 0.5912249013781548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10033663548529148, + "step": 1344 + }, + { + "epoch": 0.11216666666666666, + "grad_norm": 6.125, + "grad_norm_var": 0.30123291015625, + "learning_rate": 4e-05, + "loss": 5.5643, + "loss/crossentropy": 2.446168899536133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21937239170074463, + "step": 1346 + }, + { + "epoch": 0.11233333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.2791015625, + "learning_rate": 4e-05, + "loss": 5.3916, + "loss/crossentropy": 2.0473891273140907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195414362475276, + "step": 1348 + }, + { + "epoch": 0.1125, + "grad_norm": 5.09375, + "grad_norm_var": 0.26171468098958334, + "learning_rate": 4e-05, + "loss": 4.8752, + "loss/crossentropy": 2.4043519496917725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23139266297221184, + "step": 1350 + }, + { + "epoch": 0.11266666666666666, + "grad_norm": 6.28125, + "grad_norm_var": 0.3319295247395833, + "learning_rate": 4e-05, + "loss": 5.2582, + "loss/crossentropy": 2.082743376493454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21164586022496223, + "step": 1352 + }, + { + "epoch": 0.11283333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.324609375, + "learning_rate": 4e-05, + "loss": 4.8166, + "loss/crossentropy": 2.4302121698856354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206825278699398, + "step": 1354 + }, + { + "epoch": 0.113, + "grad_norm": 4.84375, + "grad_norm_var": 0.3073201497395833, + "learning_rate": 4e-05, + "loss": 4.5695, + "loss/crossentropy": 1.4399050921201706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16213641315698624, + "step": 1356 + }, + { + "epoch": 0.11316666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.2970052083333333, + "learning_rate": 4e-05, + "loss": 5.0332, + "loss/crossentropy": 1.3866655454039574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17396113276481628, + "step": 1358 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.24954020182291667, + "learning_rate": 4e-05, + "loss": 4.807, + "loss/crossentropy": 2.120422273874283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19081920012831688, + "step": 1360 + }, + { + "epoch": 0.1135, + "grad_norm": 5.46875, + "grad_norm_var": 0.1865234375, + "learning_rate": 4e-05, + "loss": 4.4229, + "loss/crossentropy": 2.40205454826355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23431218788027763, + "step": 1362 + }, + { + "epoch": 0.11366666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.18375244140625, + "learning_rate": 4e-05, + "loss": 5.2446, + "loss/crossentropy": 2.0221628546714783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2017949502915144, + "step": 1364 + }, + { + "epoch": 0.11383333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.18479410807291666, + "learning_rate": 4e-05, + "loss": 4.7709, + "loss/crossentropy": 1.6139603182673454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16068048775196075, + "step": 1366 + }, + { + "epoch": 0.114, + "grad_norm": 5.375, + "grad_norm_var": 0.10015869140625, + "learning_rate": 4e-05, + "loss": 5.212, + "loss/crossentropy": 2.3433795869350433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126586139202118, + "step": 1368 + }, + { + "epoch": 0.11416666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.09189046223958333, + "learning_rate": 4e-05, + "loss": 4.8335, + "loss/crossentropy": 1.349827267229557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15882672742009163, + "step": 1370 + }, + { + "epoch": 0.11433333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.09112955729166666, + "learning_rate": 4e-05, + "loss": 5.3731, + "loss/crossentropy": 2.245560199022293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21613704785704613, + "step": 1372 + }, + { + "epoch": 0.1145, + "grad_norm": 5.1875, + "grad_norm_var": 0.09959309895833333, + "learning_rate": 4e-05, + "loss": 5.1417, + "loss/crossentropy": 2.2078827619552612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2603098005056381, + "step": 1374 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.09361572265625, + "learning_rate": 4e-05, + "loss": 4.8827, + "loss/crossentropy": 1.0163473710417747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13514012470841408, + "step": 1376 + }, + { + "epoch": 0.11483333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.04693603515625, + "learning_rate": 4e-05, + "loss": 5.586, + "loss/crossentropy": 2.3435881435871124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20985861867666245, + "step": 1378 + }, + { + "epoch": 0.115, + "grad_norm": 5.53125, + "grad_norm_var": 0.060009765625, + "learning_rate": 4e-05, + "loss": 5.4013, + "loss/crossentropy": 2.3194149136543274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22041558474302292, + "step": 1380 + }, + { + "epoch": 0.11516666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05862223307291667, + "learning_rate": 4e-05, + "loss": 4.7929, + "loss/crossentropy": 1.5186148211359978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15534107387065887, + "step": 1382 + }, + { + "epoch": 0.11533333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.04973551432291667, + "learning_rate": 4e-05, + "loss": 5.4201, + "loss/crossentropy": 2.5172139406204224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23081282153725624, + "step": 1384 + }, + { + "epoch": 0.1155, + "grad_norm": 5.15625, + "grad_norm_var": 0.05025634765625, + "learning_rate": 4e-05, + "loss": 4.9595, + "loss/crossentropy": 1.8386990427970886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22093252837657928, + "step": 1386 + }, + { + "epoch": 0.11566666666666667, + "grad_norm": 5.8125, + "grad_norm_var": 0.07411702473958333, + "learning_rate": 4e-05, + "loss": 5.8714, + "loss/crossentropy": 2.4365760684013367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22801436483860016, + "step": 1388 + }, + { + "epoch": 0.11583333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 4e-05, + "loss": 5.5638, + "loss/crossentropy": 2.097976215183735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022643592208624, + "step": 1390 + }, + { + "epoch": 0.116, + "grad_norm": 5.3125, + "grad_norm_var": 0.07825113932291666, + "learning_rate": 4e-05, + "loss": 4.4936, + "loss/crossentropy": 1.5444388389587402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18695401214063168, + "step": 1392 + }, + { + "epoch": 0.11616666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.09191080729166666, + "learning_rate": 4e-05, + "loss": 5.3733, + "loss/crossentropy": 2.4442446529865265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22987202182412148, + "step": 1394 + }, + { + "epoch": 0.11633333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.08596598307291667, + "learning_rate": 4e-05, + "loss": 4.4895, + "loss/crossentropy": 2.2781380712985992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173830196261406, + "step": 1396 + }, + { + "epoch": 0.1165, + "grad_norm": 5.28125, + "grad_norm_var": 0.07771809895833333, + "learning_rate": 4e-05, + "loss": 5.594, + "loss/crossentropy": 1.9904277324676514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20960525795817375, + "step": 1398 + }, + { + "epoch": 0.11666666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.07701416015625, + "learning_rate": 4e-05, + "loss": 4.8685, + "loss/crossentropy": 1.894834741950035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187057975679636, + "step": 1400 + }, + { + "epoch": 0.11683333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.075244140625, + "learning_rate": 4e-05, + "loss": 4.952, + "loss/crossentropy": 2.408271312713623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21414168551564217, + "step": 1402 + }, + { + "epoch": 0.117, + "grad_norm": 5.1875, + "grad_norm_var": 0.046728515625, + "learning_rate": 4e-05, + "loss": 4.8514, + "loss/crossentropy": 1.8350339084863663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26108158379793167, + "step": 1404 + }, + { + "epoch": 0.11716666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.033056640625, + "learning_rate": 4e-05, + "loss": 4.9133, + "loss/crossentropy": 2.0125522017478943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20184185728430748, + "step": 1406 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.038525390625, + "learning_rate": 4e-05, + "loss": 4.7645, + "loss/crossentropy": 1.840851441025734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736996527761221, + "step": 1408 + }, + { + "epoch": 0.1175, + "grad_norm": 5.0625, + "grad_norm_var": 0.02086181640625, + "learning_rate": 4e-05, + "loss": 5.0723, + "loss/crossentropy": 1.8003590106964111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19614629819989204, + "step": 1410 + }, + { + "epoch": 0.11766666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.049544270833333334, + "learning_rate": 4e-05, + "loss": 4.4699, + "loss/crossentropy": 1.66546168923378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754789799451828, + "step": 1412 + }, + { + "epoch": 0.11783333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.05738525390625, + "learning_rate": 4e-05, + "loss": 5.0322, + "loss/crossentropy": 2.2698044180870056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20972862467169762, + "step": 1414 + }, + { + "epoch": 0.118, + "grad_norm": 4.875, + "grad_norm_var": 0.05716145833333333, + "learning_rate": 4e-05, + "loss": 5.4831, + "loss/crossentropy": 2.5912956595420837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23193010687828064, + "step": 1416 + }, + { + "epoch": 0.11816666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.05792643229166667, + "learning_rate": 4e-05, + "loss": 5.418, + "loss/crossentropy": 2.321280747652054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22956770285964012, + "step": 1418 + }, + { + "epoch": 0.11833333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.22893473307291667, + "learning_rate": 4e-05, + "loss": 6.2402, + "loss/crossentropy": 2.3734790682792664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287864312529564, + "step": 1420 + }, + { + "epoch": 0.1185, + "grad_norm": 5.21875, + "grad_norm_var": 0.23326822916666667, + "learning_rate": 4e-05, + "loss": 5.1958, + "loss/crossentropy": 2.5214961767196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23864174634218216, + "step": 1422 + }, + { + "epoch": 0.11866666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.22447916666666667, + "learning_rate": 4e-05, + "loss": 5.4997, + "loss/crossentropy": 2.4628164768218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057137593626976, + "step": 1424 + }, + { + "epoch": 0.11883333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.23722330729166666, + "learning_rate": 4e-05, + "loss": 4.7698, + "loss/crossentropy": 1.5074485763907433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1644449681043625, + "step": 1426 + }, + { + "epoch": 0.119, + "grad_norm": 4.96875, + "grad_norm_var": 0.229150390625, + "learning_rate": 4e-05, + "loss": 5.1593, + "loss/crossentropy": 1.9605501666665077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836913451552391, + "step": 1428 + }, + { + "epoch": 0.11916666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.22888997395833333, + "learning_rate": 4e-05, + "loss": 4.5879, + "loss/crossentropy": 0.9429236724972725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12995466589927673, + "step": 1430 + }, + { + "epoch": 0.11933333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.229541015625, + "learning_rate": 4e-05, + "loss": 4.85, + "loss/crossentropy": 2.3147625029087067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474900841712952, + "step": 1432 + }, + { + "epoch": 0.1195, + "grad_norm": 5.78125, + "grad_norm_var": 0.24944254557291667, + "learning_rate": 4e-05, + "loss": 5.2804, + "loss/crossentropy": 2.4392059445381165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091044746339321, + "step": 1434 + }, + { + "epoch": 0.11966666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.10818684895833333, + "learning_rate": 4e-05, + "loss": 4.7755, + "loss/crossentropy": 2.5641788244247437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216048277914524, + "step": 1436 + }, + { + "epoch": 0.11983333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.10390218098958333, + "learning_rate": 4e-05, + "loss": 4.5948, + "loss/crossentropy": 1.3560013100504875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16775010898709297, + "step": 1438 + }, + { + "epoch": 0.12, + "grad_norm": 5.1875, + "grad_norm_var": 0.10220947265625, + "learning_rate": 4e-05, + "loss": 5.0291, + "loss/crossentropy": 1.731564313173294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059439942240715, + "step": 1440 + }, + { + "epoch": 0.12016666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.09065348307291667, + "learning_rate": 4e-05, + "loss": 4.8764, + "loss/crossentropy": 1.6669957488775253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174536045640707, + "step": 1442 + }, + { + "epoch": 0.12033333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.05572509765625, + "learning_rate": 4e-05, + "loss": 4.1789, + "loss/crossentropy": 1.694481611251831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21173863485455513, + "step": 1444 + }, + { + "epoch": 0.1205, + "grad_norm": 5.90625, + "grad_norm_var": 0.09667561848958334, + "learning_rate": 4e-05, + "loss": 5.8553, + "loss/crossentropy": 2.4365376234054565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22777355462312698, + "step": 1446 + }, + { + "epoch": 0.12066666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.0845703125, + "learning_rate": 4e-05, + "loss": 4.9573, + "loss/crossentropy": 1.7493075802922249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954033151268959, + "step": 1448 + }, + { + "epoch": 0.12083333333333333, + "grad_norm": 5.75, + "grad_norm_var": 0.07862955729166667, + "learning_rate": 4e-05, + "loss": 5.2187, + "loss/crossentropy": 2.0391087383031845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20556922629475594, + "step": 1450 + }, + { + "epoch": 0.121, + "grad_norm": 5.15625, + "grad_norm_var": 0.079150390625, + "learning_rate": 4e-05, + "loss": 5.0911, + "loss/crossentropy": 2.0242467522621155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108009122312069, + "step": 1452 + }, + { + "epoch": 0.12116666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.07587483723958334, + "learning_rate": 4e-05, + "loss": 4.9383, + "loss/crossentropy": 2.3943995237350464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22050346434116364, + "step": 1454 + }, + { + "epoch": 0.12133333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.07675374348958333, + "learning_rate": 4e-05, + "loss": 4.6839, + "loss/crossentropy": 1.3471101224422455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15038909018039703, + "step": 1456 + }, + { + "epoch": 0.1215, + "grad_norm": 5.46875, + "grad_norm_var": 0.07496337890625, + "learning_rate": 4e-05, + "loss": 4.5277, + "loss/crossentropy": 1.43942092359066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15480425581336021, + "step": 1458 + }, + { + "epoch": 0.12166666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.08983968098958334, + "learning_rate": 4e-05, + "loss": 4.7697, + "loss/crossentropy": 1.4175751879811287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17789405956864357, + "step": 1460 + }, + { + "epoch": 0.12183333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.0681640625, + "learning_rate": 4e-05, + "loss": 4.8328, + "loss/crossentropy": 1.766202375292778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591039210557938, + "step": 1462 + }, + { + "epoch": 0.122, + "grad_norm": 5.125, + "grad_norm_var": 0.06751302083333334, + "learning_rate": 4e-05, + "loss": 5.0332, + "loss/crossentropy": 1.4572946727275848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17859814316034317, + "step": 1464 + }, + { + "epoch": 0.12216666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.052718098958333334, + "learning_rate": 4e-05, + "loss": 4.6043, + "loss/crossentropy": 1.657370686531067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17122356966137886, + "step": 1466 + }, + { + "epoch": 0.12233333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.055952962239583334, + "learning_rate": 4e-05, + "loss": 4.3136, + "loss/crossentropy": 1.8274584114551544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.194772370159626, + "step": 1468 + }, + { + "epoch": 0.1225, + "grad_norm": 4.875, + "grad_norm_var": 0.058056640625, + "learning_rate": 4e-05, + "loss": 4.4468, + "loss/crossentropy": 2.0565488040447235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20730895921587944, + "step": 1470 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.05943603515625, + "learning_rate": 4e-05, + "loss": 4.9933, + "loss/crossentropy": 2.241286873817444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22634489834308624, + "step": 1472 + }, + { + "epoch": 0.12283333333333334, + "grad_norm": 6.53125, + "grad_norm_var": 0.18683268229166666, + "learning_rate": 4e-05, + "loss": 4.3905, + "loss/crossentropy": 1.1198562234640121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1370545905083418, + "step": 1474 + }, + { + "epoch": 0.123, + "grad_norm": 4.84375, + "grad_norm_var": 0.17771809895833332, + "learning_rate": 4e-05, + "loss": 4.7302, + "loss/crossentropy": 1.3211162611842155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15940341539680958, + "step": 1476 + }, + { + "epoch": 0.12316666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.17636311848958333, + "learning_rate": 4e-05, + "loss": 4.942, + "loss/crossentropy": 1.9065639302134514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158641517162323, + "step": 1478 + }, + { + "epoch": 0.12333333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.19544270833333333, + "learning_rate": 4e-05, + "loss": 4.5968, + "loss/crossentropy": 1.284117877483368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15446274541318417, + "step": 1480 + }, + { + "epoch": 0.1235, + "grad_norm": 5.1875, + "grad_norm_var": 0.21178385416666667, + "learning_rate": 4e-05, + "loss": 5.2084, + "loss/crossentropy": 2.334298014640808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24484197422862053, + "step": 1482 + }, + { + "epoch": 0.12366666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.20715738932291666, + "learning_rate": 4e-05, + "loss": 4.9366, + "loss/crossentropy": 2.0920065343379974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23238061368465424, + "step": 1484 + }, + { + "epoch": 0.12383333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.216259765625, + "learning_rate": 4e-05, + "loss": 4.8202, + "loss/crossentropy": 1.1066526547074318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14913895167410374, + "step": 1486 + }, + { + "epoch": 0.124, + "grad_norm": 4.5625, + "grad_norm_var": 0.23922119140625, + "learning_rate": 4e-05, + "loss": 4.0508, + "loss/crossentropy": 1.4457052126526833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.148674588650465, + "step": 1488 + }, + { + "epoch": 0.12416666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.12756754557291666, + "learning_rate": 4e-05, + "loss": 4.7907, + "loss/crossentropy": 2.526742786169052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22900137305259705, + "step": 1490 + }, + { + "epoch": 0.12433333333333334, + "grad_norm": 5.6875, + "grad_norm_var": 0.13847249348958332, + "learning_rate": 4e-05, + "loss": 5.2301, + "loss/crossentropy": 3.177564024925232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22343944758176804, + "step": 1492 + }, + { + "epoch": 0.1245, + "grad_norm": 4.75, + "grad_norm_var": 0.14231770833333332, + "learning_rate": 4e-05, + "loss": 4.9742, + "loss/crossentropy": 2.5111494660377502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2281503677368164, + "step": 1494 + }, + { + "epoch": 0.12466666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.12511393229166667, + "learning_rate": 4e-05, + "loss": 5.148, + "loss/crossentropy": 1.6791961714625359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17439424619078636, + "step": 1496 + }, + { + "epoch": 0.12483333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.09153238932291667, + "learning_rate": 4e-05, + "loss": 5.0941, + "loss/crossentropy": 2.4359480142593384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23082533478736877, + "step": 1498 + }, + { + "epoch": 0.125, + "grad_norm": 5.46875, + "grad_norm_var": 0.098681640625, + "learning_rate": 4e-05, + "loss": 5.4316, + "loss/crossentropy": 2.4162683486938477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21909157186746597, + "step": 1500 + }, + { + "epoch": 0.12516666666666668, + "grad_norm": 5.53125, + "grad_norm_var": 0.09726155598958333, + "learning_rate": 4e-05, + "loss": 4.6494, + "loss/crossentropy": 1.3783812075853348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16549547761678696, + "step": 1502 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 5.3125, + "grad_norm_var": 0.06874593098958333, + "learning_rate": 4e-05, + "loss": 5.1572, + "loss/crossentropy": 2.214748978614807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22452621906995773, + "step": 1504 + }, + { + "epoch": 0.1255, + "grad_norm": 5.0625, + "grad_norm_var": 0.06366780598958334, + "learning_rate": 4e-05, + "loss": 4.6462, + "loss/crossentropy": 1.724791169166565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19539067894220352, + "step": 1506 + }, + { + "epoch": 0.12566666666666668, + "grad_norm": 5.125, + "grad_norm_var": 0.06627604166666666, + "learning_rate": 4e-05, + "loss": 4.7258, + "loss/crossentropy": 1.3971355706453323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1541696861386299, + "step": 1508 + }, + { + "epoch": 0.12583333333333332, + "grad_norm": 4.9375, + "grad_norm_var": 0.057275390625, + "learning_rate": 4e-05, + "loss": 4.8271, + "loss/crossentropy": 1.845518447458744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17741846106946468, + "step": 1510 + }, + { + "epoch": 0.126, + "grad_norm": 5.0, + "grad_norm_var": 0.07776285807291666, + "learning_rate": 4e-05, + "loss": 4.0287, + "loss/crossentropy": 2.070504516363144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981128826737404, + "step": 1512 + }, + { + "epoch": 0.12616666666666668, + "grad_norm": 6.3125, + "grad_norm_var": 0.17545166015625, + "learning_rate": 4e-05, + "loss": 4.5925, + "loss/crossentropy": 2.5709685683250427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24376041069626808, + "step": 1514 + }, + { + "epoch": 0.12633333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.17541910807291666, + "learning_rate": 4e-05, + "loss": 5.0738, + "loss/crossentropy": 2.435946464538574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22766336053609848, + "step": 1516 + }, + { + "epoch": 0.1265, + "grad_norm": 5.75, + "grad_norm_var": 0.18553059895833332, + "learning_rate": 4e-05, + "loss": 5.0032, + "loss/crossentropy": 1.9362272024154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20861081406474113, + "step": 1518 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 5.375, + "grad_norm_var": 0.18470052083333333, + "learning_rate": 4e-05, + "loss": 5.0846, + "loss/crossentropy": 2.1167075484991074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20843248441815376, + "step": 1520 + }, + { + "epoch": 0.12683333333333333, + "grad_norm": 5.90625, + "grad_norm_var": 0.3120930989583333, + "learning_rate": 4e-05, + "loss": 4.5747, + "loss/crossentropy": 1.9277404323220253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18708796799182892, + "step": 1522 + }, + { + "epoch": 0.127, + "grad_norm": 4.96875, + "grad_norm_var": 0.309375, + "learning_rate": 4e-05, + "loss": 4.5207, + "loss/crossentropy": 2.263314723968506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160516083240509, + "step": 1524 + }, + { + "epoch": 0.12716666666666668, + "grad_norm": 5.09375, + "grad_norm_var": 0.30276285807291664, + "learning_rate": 4e-05, + "loss": 4.5577, + "loss/crossentropy": 2.3493450582027435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21893595904111862, + "step": 1526 + }, + { + "epoch": 0.12733333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.4759724934895833, + "learning_rate": 4e-05, + "loss": 4.8924, + "loss/crossentropy": 1.4031277000904083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15571350418031216, + "step": 1528 + }, + { + "epoch": 0.1275, + "grad_norm": 5.46875, + "grad_norm_var": 0.4598592122395833, + "learning_rate": 4e-05, + "loss": 4.8127, + "loss/crossentropy": 1.026276372373104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14767338708043098, + "step": 1530 + }, + { + "epoch": 0.12766666666666668, + "grad_norm": 5.03125, + "grad_norm_var": 0.4837849934895833, + "learning_rate": 4e-05, + "loss": 4.8847, + "loss/crossentropy": 1.8962939083576202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18200470879673958, + "step": 1532 + }, + { + "epoch": 0.12783333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.4787068684895833, + "learning_rate": 4e-05, + "loss": 5.0413, + "loss/crossentropy": 2.3904325664043427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21139219775795937, + "step": 1534 + }, + { + "epoch": 0.128, + "grad_norm": 5.28125, + "grad_norm_var": 0.5261067708333333, + "learning_rate": 4e-05, + "loss": 4.6779, + "loss/crossentropy": 2.3674957752227783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22268419340252876, + "step": 1536 + }, + { + "epoch": 0.12816666666666668, + "grad_norm": 4.6875, + "grad_norm_var": 0.4163411458333333, + "learning_rate": 4e-05, + "loss": 4.7978, + "loss/crossentropy": 1.4360255599021912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14481048472225666, + "step": 1538 + }, + { + "epoch": 0.12833333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.41028238932291666, + "learning_rate": 4e-05, + "loss": 5.2889, + "loss/crossentropy": 1.9318012371659279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18672936409711838, + "step": 1540 + }, + { + "epoch": 0.1285, + "grad_norm": 4.96875, + "grad_norm_var": 0.4143880208333333, + "learning_rate": 4e-05, + "loss": 5.3868, + "loss/crossentropy": 2.4637942910194397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21543822437524796, + "step": 1542 + }, + { + "epoch": 0.12866666666666668, + "grad_norm": 5.78125, + "grad_norm_var": 0.12024739583333334, + "learning_rate": 4e-05, + "loss": 5.0045, + "loss/crossentropy": 1.7794182002544403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21857787482440472, + "step": 1544 + }, + { + "epoch": 0.12883333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.09724934895833333, + "learning_rate": 4e-05, + "loss": 5.0813, + "loss/crossentropy": 1.905199073255062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17909251898527145, + "step": 1546 + }, + { + "epoch": 0.129, + "grad_norm": 5.03125, + "grad_norm_var": 0.097900390625, + "learning_rate": 4e-05, + "loss": 4.7055, + "loss/crossentropy": 2.350240021944046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850903540849686, + "step": 1548 + }, + { + "epoch": 0.12916666666666668, + "grad_norm": 4.8125, + "grad_norm_var": 0.09957275390625, + "learning_rate": 4e-05, + "loss": 4.666, + "loss/crossentropy": 1.4298752844333649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15599924698472023, + "step": 1550 + }, + { + "epoch": 0.12933333333333333, + "grad_norm": 5.90625, + "grad_norm_var": 0.12213134765625, + "learning_rate": 4e-05, + "loss": 4.5669, + "loss/crossentropy": 1.468435674905777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16555116325616837, + "step": 1552 + }, + { + "epoch": 0.1295, + "grad_norm": 5.21875, + "grad_norm_var": 0.10188802083333333, + "learning_rate": 4e-05, + "loss": 5.1023, + "loss/crossentropy": 2.4195556640625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22506968677043915, + "step": 1554 + }, + { + "epoch": 0.12966666666666668, + "grad_norm": 5.125, + "grad_norm_var": 0.10370686848958334, + "learning_rate": 4e-05, + "loss": 5.1492, + "loss/crossentropy": 1.5128118842840195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15466869808733463, + "step": 1556 + }, + { + "epoch": 0.12983333333333333, + "grad_norm": 14.1875, + "grad_norm_var": 5.136393229166667, + "learning_rate": 4e-05, + "loss": 5.2507, + "loss/crossentropy": 1.1354478374123573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13475182093679905, + "step": 1558 + }, + { + "epoch": 0.13, + "grad_norm": 5.03125, + "grad_norm_var": 5.169254557291667, + "learning_rate": 4e-05, + "loss": 4.7625, + "loss/crossentropy": 1.5475751757621765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16386966034770012, + "step": 1560 + }, + { + "epoch": 0.13016666666666668, + "grad_norm": 4.9375, + "grad_norm_var": 5.164351399739584, + "learning_rate": 4e-05, + "loss": 5.0927, + "loss/crossentropy": 2.1593563556671143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24635591357946396, + "step": 1562 + }, + { + "epoch": 0.13033333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 5.16011962890625, + "learning_rate": 4e-05, + "loss": 5.1048, + "loss/crossentropy": 2.4482282400131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22428294271230698, + "step": 1564 + }, + { + "epoch": 0.1305, + "grad_norm": 4.96875, + "grad_norm_var": 5.107157389322917, + "learning_rate": 4e-05, + "loss": 4.5934, + "loss/crossentropy": 1.9403712749481201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931654028594494, + "step": 1566 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 5.34375, + "grad_norm_var": 5.100223795572917, + "learning_rate": 4e-05, + "loss": 5.2486, + "loss/crossentropy": 1.9268745481967926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18993456289172173, + "step": 1568 + }, + { + "epoch": 0.13083333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 5.137565104166667, + "learning_rate": 4e-05, + "loss": 4.8246, + "loss/crossentropy": 2.275236487388611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21554533019661903, + "step": 1570 + }, + { + "epoch": 0.131, + "grad_norm": 4.84375, + "grad_norm_var": 5.121484375, + "learning_rate": 4e-05, + "loss": 4.8314, + "loss/crossentropy": 2.044840008020401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845100037753582, + "step": 1572 + }, + { + "epoch": 0.13116666666666665, + "grad_norm": 5.25, + "grad_norm_var": 0.04846598307291667, + "learning_rate": 4e-05, + "loss": 4.5061, + "loss/crossentropy": 1.3665557280182838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1586722508072853, + "step": 1574 + }, + { + "epoch": 0.13133333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.051285807291666666, + "learning_rate": 4e-05, + "loss": 4.9826, + "loss/crossentropy": 1.770714707672596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17428990453481674, + "step": 1576 + }, + { + "epoch": 0.1315, + "grad_norm": 5.40625, + "grad_norm_var": 0.053385416666666664, + "learning_rate": 4e-05, + "loss": 5.4026, + "loss/crossentropy": 1.7916882634162903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171686075627804, + "step": 1578 + }, + { + "epoch": 0.13166666666666665, + "grad_norm": 5.1875, + "grad_norm_var": 0.059305826822916664, + "learning_rate": 4e-05, + "loss": 4.938, + "loss/crossentropy": 1.495934583246708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16308372281491756, + "step": 1580 + }, + { + "epoch": 0.13183333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.054488118489583334, + "learning_rate": 4e-05, + "loss": 4.8879, + "loss/crossentropy": 2.5814104676246643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21871698647737503, + "step": 1582 + }, + { + "epoch": 0.132, + "grad_norm": 5.25, + "grad_norm_var": 0.06337483723958333, + "learning_rate": 4e-05, + "loss": 4.9582, + "loss/crossentropy": 2.209455542266369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19592761620879173, + "step": 1584 + }, + { + "epoch": 0.13216666666666665, + "grad_norm": 5.25, + "grad_norm_var": 0.057145182291666666, + "learning_rate": 4e-05, + "loss": 4.7327, + "loss/crossentropy": 1.3624914586544037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1524915173649788, + "step": 1586 + }, + { + "epoch": 0.13233333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.057145182291666666, + "learning_rate": 4e-05, + "loss": 5.075, + "loss/crossentropy": 2.308533728122711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21155225485563278, + "step": 1588 + }, + { + "epoch": 0.1325, + "grad_norm": 5.03125, + "grad_norm_var": 0.06763916015625, + "learning_rate": 4e-05, + "loss": 5.7472, + "loss/crossentropy": 1.77063799649477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738946482539177, + "step": 1590 + }, + { + "epoch": 0.13266666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.06131184895833333, + "learning_rate": 4e-05, + "loss": 5.0464, + "loss/crossentropy": 1.8190487623214722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1730487048625946, + "step": 1592 + }, + { + "epoch": 0.13283333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.062093098958333336, + "learning_rate": 4e-05, + "loss": 4.6534, + "loss/crossentropy": 1.6317052841186523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1610642485320568, + "step": 1594 + }, + { + "epoch": 0.133, + "grad_norm": 5.15625, + "grad_norm_var": 0.05699462890625, + "learning_rate": 4e-05, + "loss": 5.2636, + "loss/crossentropy": 2.040685288608074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18768882751464844, + "step": 1596 + }, + { + "epoch": 0.13316666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.06627197265625, + "learning_rate": 4e-05, + "loss": 4.99, + "loss/crossentropy": 2.288883000612259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22606860101222992, + "step": 1598 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.07382405598958333, + "learning_rate": 4e-05, + "loss": 4.4231, + "loss/crossentropy": 1.997236281633377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19324356690049171, + "step": 1600 + }, + { + "epoch": 0.1335, + "grad_norm": 5.84375, + "grad_norm_var": 0.10868733723958333, + "learning_rate": 4e-05, + "loss": 4.8902, + "loss/crossentropy": 1.9118523299694061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18948077410459518, + "step": 1602 + }, + { + "epoch": 0.13366666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.13033447265625, + "learning_rate": 4e-05, + "loss": 4.5752, + "loss/crossentropy": 1.4698756337165833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17594150640070438, + "step": 1604 + }, + { + "epoch": 0.13383333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.11119384765625, + "learning_rate": 4e-05, + "loss": 5.0654, + "loss/crossentropy": 1.5521681532263756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.167510736733675, + "step": 1606 + }, + { + "epoch": 0.134, + "grad_norm": 5.40625, + "grad_norm_var": 0.12392171223958333, + "learning_rate": 4e-05, + "loss": 5.4244, + "loss/crossentropy": 2.3366805016994476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125181294977665, + "step": 1608 + }, + { + "epoch": 0.13416666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.12021077473958333, + "learning_rate": 4e-05, + "loss": 4.8163, + "loss/crossentropy": 2.4447622895240784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324832528829575, + "step": 1610 + }, + { + "epoch": 0.13433333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.11339518229166666, + "learning_rate": 4e-05, + "loss": 5.0853, + "loss/crossentropy": 1.8828533068299294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18855641968548298, + "step": 1612 + }, + { + "epoch": 0.1345, + "grad_norm": 8.5625, + "grad_norm_var": 0.7942545572916667, + "learning_rate": 4e-05, + "loss": 5.023, + "loss/crossentropy": 2.5013024508953094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21763851121068, + "step": 1614 + }, + { + "epoch": 0.13466666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.7565104166666666, + "learning_rate": 4e-05, + "loss": 4.9341, + "loss/crossentropy": 2.0742052495479584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010239139199257, + "step": 1616 + }, + { + "epoch": 0.13483333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.7486328125, + "learning_rate": 4e-05, + "loss": 5.2218, + "loss/crossentropy": 2.068577319383621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23265555873513222, + "step": 1618 + }, + { + "epoch": 0.135, + "grad_norm": 5.34375, + "grad_norm_var": 0.74644775390625, + "learning_rate": 4e-05, + "loss": 5.3211, + "loss/crossentropy": 2.0565109848976135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19935489632189274, + "step": 1620 + }, + { + "epoch": 0.13516666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.7264322916666667, + "learning_rate": 4e-05, + "loss": 4.5425, + "loss/crossentropy": 1.4980078116059303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15502066165208817, + "step": 1622 + }, + { + "epoch": 0.13533333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.7397745768229167, + "learning_rate": 4e-05, + "loss": 4.9548, + "loss/crossentropy": 2.127755284309387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20423029735684395, + "step": 1624 + }, + { + "epoch": 0.1355, + "grad_norm": 5.21875, + "grad_norm_var": 0.741650390625, + "learning_rate": 4e-05, + "loss": 4.9034, + "loss/crossentropy": 2.0613020807504654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056688815355301, + "step": 1626 + }, + { + "epoch": 0.13566666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.73834228515625, + "learning_rate": 4e-05, + "loss": 4.7378, + "loss/crossentropy": 2.2652209401130676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20464074611663818, + "step": 1628 + }, + { + "epoch": 0.13583333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.07753499348958333, + "learning_rate": 4e-05, + "loss": 4.7986, + "loss/crossentropy": 1.7409793213009834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20771316066384315, + "step": 1630 + }, + { + "epoch": 0.136, + "grad_norm": 5.125, + "grad_norm_var": 0.06633707682291666, + "learning_rate": 4e-05, + "loss": 5.4698, + "loss/crossentropy": 2.0719391107559204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20104419440031052, + "step": 1632 + }, + { + "epoch": 0.13616666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.05220947265625, + "learning_rate": 4e-05, + "loss": 4.8056, + "loss/crossentropy": 2.4364999532699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327173836529255, + "step": 1634 + }, + { + "epoch": 0.13633333333333333, + "grad_norm": 5.71875, + "grad_norm_var": 0.1138671875, + "learning_rate": 4e-05, + "loss": 5.0928, + "loss/crossentropy": 1.8130161613225937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989502925425768, + "step": 1636 + }, + { + "epoch": 0.1365, + "grad_norm": 5.53125, + "grad_norm_var": 0.12629801432291668, + "learning_rate": 4e-05, + "loss": 4.4983, + "loss/crossentropy": 1.8206287994980812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003721445798874, + "step": 1638 + }, + { + "epoch": 0.13666666666666666, + "grad_norm": 5.875, + "grad_norm_var": 0.15676676432291667, + "learning_rate": 4e-05, + "loss": 4.6903, + "loss/crossentropy": 1.9255887940526009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551080249249935, + "step": 1640 + }, + { + "epoch": 0.13683333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.164306640625, + "learning_rate": 4e-05, + "loss": 4.6729, + "loss/crossentropy": 1.737329825758934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21636349894106388, + "step": 1642 + }, + { + "epoch": 0.137, + "grad_norm": 5.0625, + "grad_norm_var": 0.16217041015625, + "learning_rate": 4e-05, + "loss": 5.31, + "loss/crossentropy": 2.59636914730072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2293633446097374, + "step": 1644 + }, + { + "epoch": 0.13716666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.14149983723958334, + "learning_rate": 4e-05, + "loss": 5.5565, + "loss/crossentropy": 2.477478504180908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.232582226395607, + "step": 1646 + }, + { + "epoch": 0.13733333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.13826497395833334, + "learning_rate": 4e-05, + "loss": 4.979, + "loss/crossentropy": 1.4540888145565987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14884299412369728, + "step": 1648 + }, + { + "epoch": 0.1375, + "grad_norm": 5.28125, + "grad_norm_var": 0.12708333333333333, + "learning_rate": 4e-05, + "loss": 4.9889, + "loss/crossentropy": 1.8568930253386497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040102779865265, + "step": 1650 + }, + { + "epoch": 0.13766666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.0666015625, + "learning_rate": 4e-05, + "loss": 4.7083, + "loss/crossentropy": 1.5726129412651062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16069914400577545, + "step": 1652 + }, + { + "epoch": 0.13783333333333334, + "grad_norm": 5.40625, + "grad_norm_var": 0.059098307291666666, + "learning_rate": 4e-05, + "loss": 5.2683, + "loss/crossentropy": 1.7672999277710915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18627581745386124, + "step": 1654 + }, + { + "epoch": 0.138, + "grad_norm": 5.53125, + "grad_norm_var": 0.03515218098958333, + "learning_rate": 4e-05, + "loss": 5.4057, + "loss/crossentropy": 2.0668781399726868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19123420864343643, + "step": 1656 + }, + { + "epoch": 0.13816666666666666, + "grad_norm": 5.78125, + "grad_norm_var": 0.046610514322916664, + "learning_rate": 4e-05, + "loss": 4.9816, + "loss/crossentropy": 1.6409248635172844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18773740530014038, + "step": 1658 + }, + { + "epoch": 0.13833333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.052632649739583336, + "learning_rate": 4e-05, + "loss": 5.1193, + "loss/crossentropy": 1.2817718982696533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17280958406627178, + "step": 1660 + }, + { + "epoch": 0.1385, + "grad_norm": 5.0625, + "grad_norm_var": 0.05597330729166667, + "learning_rate": 4e-05, + "loss": 4.7329, + "loss/crossentropy": 2.2702360451221466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21893694251775742, + "step": 1662 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.0603515625, + "learning_rate": 4e-05, + "loss": 5.0972, + "loss/crossentropy": 2.0147531405091286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18305204808712006, + "step": 1664 + }, + { + "epoch": 0.13883333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.06276041666666667, + "learning_rate": 4e-05, + "loss": 5.1082, + "loss/crossentropy": 2.198255777359009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22349874302744865, + "step": 1666 + }, + { + "epoch": 0.139, + "grad_norm": 5.0625, + "grad_norm_var": 0.06510416666666667, + "learning_rate": 4e-05, + "loss": 4.676, + "loss/crossentropy": 1.8043780699372292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21263182908296585, + "step": 1668 + }, + { + "epoch": 0.13916666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 4e-05, + "loss": 5.7267, + "loss/crossentropy": 2.2814477682113647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21907350048422813, + "step": 1670 + }, + { + "epoch": 0.13933333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.06222330729166667, + "learning_rate": 4e-05, + "loss": 5.3567, + "loss/crossentropy": 2.4507880806922913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21009447425603867, + "step": 1672 + }, + { + "epoch": 0.1395, + "grad_norm": 4.625, + "grad_norm_var": 0.05358072916666667, + "learning_rate": 4e-05, + "loss": 5.083, + "loss/crossentropy": 2.072799079120159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938009299337864, + "step": 1674 + }, + { + "epoch": 0.13966666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.07610270182291666, + "learning_rate": 4e-05, + "loss": 4.23, + "loss/crossentropy": 1.8492163196206093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20542608201503754, + "step": 1676 + }, + { + "epoch": 0.13983333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.06927083333333334, + "learning_rate": 4e-05, + "loss": 4.8043, + "loss/crossentropy": 1.972286880016327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20922426879405975, + "step": 1678 + }, + { + "epoch": 0.14, + "grad_norm": 5.4375, + "grad_norm_var": 0.07862955729166667, + "learning_rate": 4e-05, + "loss": 4.5887, + "loss/crossentropy": 1.6558887809515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17858467251062393, + "step": 1680 + }, + { + "epoch": 0.14016666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.087744140625, + "learning_rate": 4e-05, + "loss": 4.761, + "loss/crossentropy": 1.950936883687973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931697092950344, + "step": 1682 + }, + { + "epoch": 0.14033333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.102978515625, + "learning_rate": 4e-05, + "loss": 4.8183, + "loss/crossentropy": 1.3800019018817693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13202445802744478, + "step": 1684 + }, + { + "epoch": 0.1405, + "grad_norm": 5.0, + "grad_norm_var": 0.083447265625, + "learning_rate": 4e-05, + "loss": 4.4808, + "loss/crossentropy": 2.337316393852234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25428661331534386, + "step": 1686 + }, + { + "epoch": 0.14066666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.10367431640625, + "learning_rate": 4e-05, + "loss": 5.0679, + "loss/crossentropy": 2.343565195798874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149459458887577, + "step": 1688 + }, + { + "epoch": 0.14083333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.09651285807291667, + "learning_rate": 4e-05, + "loss": 5.114, + "loss/crossentropy": 2.6408793926239014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21381603553891182, + "step": 1690 + }, + { + "epoch": 0.141, + "grad_norm": 5.25, + "grad_norm_var": 0.07955322265625, + "learning_rate": 4e-05, + "loss": 5.163, + "loss/crossentropy": 2.383645087480545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23237445205450058, + "step": 1692 + }, + { + "epoch": 0.14116666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.08216145833333334, + "learning_rate": 4e-05, + "loss": 4.9111, + "loss/crossentropy": 1.9802673906087875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19009397737681866, + "step": 1694 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 5.3125, + "grad_norm_var": 0.07603759765625, + "learning_rate": 4e-05, + "loss": 5.0697, + "loss/crossentropy": 2.302330046892166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208226066082716, + "step": 1696 + }, + { + "epoch": 0.1415, + "grad_norm": 4.84375, + "grad_norm_var": 0.06842447916666666, + "learning_rate": 4e-05, + "loss": 5.205, + "loss/crossentropy": 1.977390617132187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193704962730408, + "step": 1698 + }, + { + "epoch": 0.14166666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.0548828125, + "learning_rate": 4e-05, + "loss": 5.1493, + "loss/crossentropy": 2.0484447479248047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997265562415123, + "step": 1700 + }, + { + "epoch": 0.14183333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.07564697265625, + "learning_rate": 4e-05, + "loss": 4.9987, + "loss/crossentropy": 1.799696996808052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18479579128324986, + "step": 1702 + }, + { + "epoch": 0.142, + "grad_norm": 5.21875, + "grad_norm_var": 0.049723307291666664, + "learning_rate": 4e-05, + "loss": 4.781, + "loss/crossentropy": 1.1656968668103218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15266522020101547, + "step": 1704 + }, + { + "epoch": 0.14216666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.058854166666666666, + "learning_rate": 4e-05, + "loss": 4.1525, + "loss/crossentropy": 1.4563089236617088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14857494831085205, + "step": 1706 + }, + { + "epoch": 0.14233333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.07823893229166666, + "learning_rate": 4e-05, + "loss": 4.1668, + "loss/crossentropy": 1.807879388332367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19141371175646782, + "step": 1708 + }, + { + "epoch": 0.1425, + "grad_norm": 5.8125, + "grad_norm_var": 0.10091145833333333, + "learning_rate": 4e-05, + "loss": 4.9732, + "loss/crossentropy": 2.9230883717536926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23849774524569511, + "step": 1710 + }, + { + "epoch": 0.14266666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.16031494140625, + "learning_rate": 4e-05, + "loss": 5.1238, + "loss/crossentropy": 1.3809229135513306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20320942997932434, + "step": 1712 + }, + { + "epoch": 0.14283333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.15139567057291667, + "learning_rate": 4e-05, + "loss": 4.9135, + "loss/crossentropy": 2.339945375919342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271268606185913, + "step": 1714 + }, + { + "epoch": 0.143, + "grad_norm": 6.09375, + "grad_norm_var": 0.17459309895833333, + "learning_rate": 4e-05, + "loss": 5.1534, + "loss/crossentropy": 1.5341841503977776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16835985332727432, + "step": 1716 + }, + { + "epoch": 0.14316666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.16959228515625, + "learning_rate": 4e-05, + "loss": 4.572, + "loss/crossentropy": 2.6712507009506226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24255206063389778, + "step": 1718 + }, + { + "epoch": 0.14333333333333334, + "grad_norm": 5.3125, + "grad_norm_var": 0.16822509765625, + "learning_rate": 4e-05, + "loss": 5.2017, + "loss/crossentropy": 2.1750669479370117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266225516796112, + "step": 1720 + }, + { + "epoch": 0.1435, + "grad_norm": 4.875, + "grad_norm_var": 0.15441080729166667, + "learning_rate": 4e-05, + "loss": 4.8313, + "loss/crossentropy": 1.5534632056951523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15007868967950344, + "step": 1722 + }, + { + "epoch": 0.14366666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.14334309895833333, + "learning_rate": 4e-05, + "loss": 5.5259, + "loss/crossentropy": 2.7301476895809174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24438628181815147, + "step": 1724 + }, + { + "epoch": 0.14383333333333334, + "grad_norm": 5.71875, + "grad_norm_var": 0.15627848307291667, + "learning_rate": 4e-05, + "loss": 4.7724, + "loss/crossentropy": 1.191504381597042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1475481502711773, + "step": 1726 + }, + { + "epoch": 0.144, + "grad_norm": 5.1875, + "grad_norm_var": 0.14524739583333332, + "learning_rate": 4e-05, + "loss": 4.8071, + "loss/crossentropy": 2.629876434803009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133907452225685, + "step": 1728 + }, + { + "epoch": 0.14416666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.17906494140625, + "learning_rate": 4e-05, + "loss": 4.3475, + "loss/crossentropy": 2.0846259891986847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23153432458639145, + "step": 1730 + }, + { + "epoch": 0.14433333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.111328125, + "learning_rate": 4e-05, + "loss": 5.3218, + "loss/crossentropy": 2.3203621357679367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2167310230433941, + "step": 1732 + }, + { + "epoch": 0.1445, + "grad_norm": 4.9375, + "grad_norm_var": 0.10002848307291666, + "learning_rate": 4e-05, + "loss": 5.3713, + "loss/crossentropy": 2.592851758003235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2294953651726246, + "step": 1734 + }, + { + "epoch": 0.14466666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.107666015625, + "learning_rate": 4e-05, + "loss": 4.4483, + "loss/crossentropy": 0.9307321533560753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13924349658191204, + "step": 1736 + }, + { + "epoch": 0.14483333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.10692952473958334, + "learning_rate": 4e-05, + "loss": 4.9398, + "loss/crossentropy": 2.3195015490055084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2094910740852356, + "step": 1738 + }, + { + "epoch": 0.145, + "grad_norm": 4.875, + "grad_norm_var": 0.10390625, + "learning_rate": 4e-05, + "loss": 4.3281, + "loss/crossentropy": 2.1040413677692413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23669259622693062, + "step": 1740 + }, + { + "epoch": 0.14516666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.07890625, + "learning_rate": 4e-05, + "loss": 4.7054, + "loss/crossentropy": 1.9919825196266174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1985725536942482, + "step": 1742 + }, + { + "epoch": 0.14533333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.07511393229166667, + "learning_rate": 4e-05, + "loss": 5.1666, + "loss/crossentropy": 2.397672086954117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976507499814034, + "step": 1744 + }, + { + "epoch": 0.1455, + "grad_norm": 4.84375, + "grad_norm_var": 0.07239176432291666, + "learning_rate": 4e-05, + "loss": 4.7493, + "loss/crossentropy": 1.2222031652927399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14994047954678535, + "step": 1746 + }, + { + "epoch": 0.14566666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.07967122395833333, + "learning_rate": 4e-05, + "loss": 4.8289, + "loss/crossentropy": 2.458436369895935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22444933280348778, + "step": 1748 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.07899983723958333, + "learning_rate": 4e-05, + "loss": 4.6862, + "loss/crossentropy": 1.9638415053486824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102539867162704, + "step": 1750 + }, + { + "epoch": 0.146, + "grad_norm": 4.875, + "grad_norm_var": 0.05963134765625, + "learning_rate": 4e-05, + "loss": 4.6494, + "loss/crossentropy": 2.514769494533539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001716122031212, + "step": 1752 + }, + { + "epoch": 0.14616666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.202734375, + "learning_rate": 4e-05, + "loss": 4.8056, + "loss/crossentropy": 1.6116134598851204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18631109967827797, + "step": 1754 + }, + { + "epoch": 0.14633333333333334, + "grad_norm": 5.46875, + "grad_norm_var": 0.18538004557291668, + "learning_rate": 4e-05, + "loss": 4.6867, + "loss/crossentropy": 1.6297817006707191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18314477056264877, + "step": 1756 + }, + { + "epoch": 0.1465, + "grad_norm": 5.0, + "grad_norm_var": 0.15367431640625, + "learning_rate": 4e-05, + "loss": 5.3406, + "loss/crossentropy": 2.525915801525116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21952301263809204, + "step": 1758 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.15338134765625, + "learning_rate": 4e-05, + "loss": 5.3172, + "loss/crossentropy": 2.1865801215171814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21111416071653366, + "step": 1760 + }, + { + "epoch": 0.14683333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.16013997395833332, + "learning_rate": 4e-05, + "loss": 4.4209, + "loss/crossentropy": 1.7157298550009727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20164168626070023, + "step": 1762 + }, + { + "epoch": 0.147, + "grad_norm": 5.0, + "grad_norm_var": 0.1662109375, + "learning_rate": 4e-05, + "loss": 4.7885, + "loss/crossentropy": 1.6044250950217247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19163594394922256, + "step": 1764 + }, + { + "epoch": 0.14716666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.17174479166666667, + "learning_rate": 4e-05, + "loss": 5.5379, + "loss/crossentropy": 2.4521120488643646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20806816592812538, + "step": 1766 + }, + { + "epoch": 0.14733333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.16829020182291668, + "learning_rate": 4e-05, + "loss": 4.4508, + "loss/crossentropy": 2.1604004204273224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1913926713168621, + "step": 1768 + }, + { + "epoch": 0.1475, + "grad_norm": 5.5, + "grad_norm_var": 0.05987955729166667, + "learning_rate": 4e-05, + "loss": 5.3149, + "loss/crossentropy": 2.428584039211273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24361254274845123, + "step": 1770 + }, + { + "epoch": 0.14766666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.05725504557291667, + "learning_rate": 4e-05, + "loss": 5.5396, + "loss/crossentropy": 2.1453306525945663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18525381945073605, + "step": 1772 + }, + { + "epoch": 0.14783333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.06269124348958334, + "learning_rate": 4e-05, + "loss": 4.9702, + "loss/crossentropy": 2.037685215473175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19330525025725365, + "step": 1774 + }, + { + "epoch": 0.148, + "grad_norm": 4.8125, + "grad_norm_var": 0.07906494140625, + "learning_rate": 4e-05, + "loss": 4.8811, + "loss/crossentropy": 1.9805008471012115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17861885949969292, + "step": 1776 + }, + { + "epoch": 0.14816666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.09000244140625, + "learning_rate": 4e-05, + "loss": 5.0579, + "loss/crossentropy": 2.3986242413520813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22519202157855034, + "step": 1778 + }, + { + "epoch": 0.14833333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.08487955729166667, + "learning_rate": 4e-05, + "loss": 4.3706, + "loss/crossentropy": 1.5709987133741379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20712516456842422, + "step": 1780 + }, + { + "epoch": 0.1485, + "grad_norm": 5.0625, + "grad_norm_var": 0.08948160807291666, + "learning_rate": 4e-05, + "loss": 5.2491, + "loss/crossentropy": 2.725129246711731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22343413904309273, + "step": 1782 + }, + { + "epoch": 0.14866666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.09230143229166667, + "learning_rate": 4e-05, + "loss": 5.1373, + "loss/crossentropy": 1.8716261237859726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893265787512064, + "step": 1784 + }, + { + "epoch": 0.14883333333333335, + "grad_norm": 5.34375, + "grad_norm_var": 0.08098958333333334, + "learning_rate": 4e-05, + "loss": 5.4504, + "loss/crossentropy": 2.106525592505932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19252919033169746, + "step": 1786 + }, + { + "epoch": 0.149, + "grad_norm": 5.21875, + "grad_norm_var": 0.08274739583333333, + "learning_rate": 4e-05, + "loss": 5.155, + "loss/crossentropy": 2.140032261610031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22491325810551643, + "step": 1788 + }, + { + "epoch": 0.14916666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.07890625, + "learning_rate": 4e-05, + "loss": 4.6521, + "loss/crossentropy": 1.9066472426056862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19813631661236286, + "step": 1790 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 5.375, + "grad_norm_var": 0.07017822265625, + "learning_rate": 4e-05, + "loss": 4.9176, + "loss/crossentropy": 1.9560877978801727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545845314860344, + "step": 1792 + }, + { + "epoch": 0.1495, + "grad_norm": 5.21875, + "grad_norm_var": 0.046187337239583334, + "learning_rate": 4e-05, + "loss": 5.0363, + "loss/crossentropy": 1.8670417666435242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26671649515628815, + "step": 1794 + }, + { + "epoch": 0.14966666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 4e-05, + "loss": 4.7842, + "loss/crossentropy": 1.4337139576673508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16944348998367786, + "step": 1796 + }, + { + "epoch": 0.14983333333333335, + "grad_norm": 4.9375, + "grad_norm_var": 0.04244384765625, + "learning_rate": 4e-05, + "loss": 4.8675, + "loss/crossentropy": 2.3128662705421448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2631648778915405, + "step": 1798 + }, + { + "epoch": 0.15, + "grad_norm": 5.5, + "grad_norm_var": 0.034098307291666664, + "learning_rate": 4e-05, + "loss": 5.4269, + "loss/crossentropy": 2.659923791885376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22622456029057503, + "step": 1800 + }, + { + "epoch": 0.15016666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.0412109375, + "learning_rate": 4e-05, + "loss": 4.7557, + "loss/crossentropy": 1.2567346766591072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16816434264183044, + "step": 1802 + }, + { + "epoch": 0.15033333333333335, + "grad_norm": 5.21875, + "grad_norm_var": 0.04107666015625, + "learning_rate": 4e-05, + "loss": 4.9006, + "loss/crossentropy": 1.7615478411316872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22866688668727875, + "step": 1804 + }, + { + "epoch": 0.1505, + "grad_norm": 5.0, + "grad_norm_var": 0.06204427083333333, + "learning_rate": 4e-05, + "loss": 4.8503, + "loss/crossentropy": 1.9991141185164452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18227214738726616, + "step": 1806 + }, + { + "epoch": 0.15066666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.05517171223958333, + "learning_rate": 4e-05, + "loss": 4.7189, + "loss/crossentropy": 2.0811602771282196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848256252706051, + "step": 1808 + }, + { + "epoch": 0.15083333333333335, + "grad_norm": 5.5, + "grad_norm_var": 0.10266927083333334, + "learning_rate": 4e-05, + "loss": 4.8733, + "loss/crossentropy": 2.1010901927948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20911147445440292, + "step": 1810 + }, + { + "epoch": 0.151, + "grad_norm": 5.03125, + "grad_norm_var": 0.09928385416666667, + "learning_rate": 4e-05, + "loss": 5.1623, + "loss/crossentropy": 1.4801330715417862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16037312522530556, + "step": 1812 + }, + { + "epoch": 0.15116666666666667, + "grad_norm": 5.84375, + "grad_norm_var": 0.13240559895833334, + "learning_rate": 4e-05, + "loss": 4.9778, + "loss/crossentropy": 1.7996556013822556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23594452254474163, + "step": 1814 + }, + { + "epoch": 0.15133333333333332, + "grad_norm": 5.09375, + "grad_norm_var": 0.13013916015625, + "learning_rate": 4e-05, + "loss": 5.1219, + "loss/crossentropy": 1.7359198927879333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1817542277276516, + "step": 1816 + }, + { + "epoch": 0.1515, + "grad_norm": 5.03125, + "grad_norm_var": 0.13631184895833334, + "learning_rate": 4e-05, + "loss": 4.7057, + "loss/crossentropy": 2.527916431427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22807636111974716, + "step": 1818 + }, + { + "epoch": 0.15166666666666667, + "grad_norm": 5.71875, + "grad_norm_var": 0.15128580729166666, + "learning_rate": 4e-05, + "loss": 4.7407, + "loss/crossentropy": 1.3911296725273132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24339628778398037, + "step": 1820 + }, + { + "epoch": 0.15183333333333332, + "grad_norm": 4.96875, + "grad_norm_var": 0.124462890625, + "learning_rate": 4e-05, + "loss": 5.0528, + "loss/crossentropy": 1.8240971639752388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18753194250166416, + "step": 1822 + }, + { + "epoch": 0.152, + "grad_norm": 4.96875, + "grad_norm_var": 0.1193359375, + "learning_rate": 4e-05, + "loss": 4.4, + "loss/crossentropy": 1.6177871525287628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020118124783039, + "step": 1824 + }, + { + "epoch": 0.15216666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.09401041666666667, + "learning_rate": 4e-05, + "loss": 4.8499, + "loss/crossentropy": 2.3816640377044678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21570777148008347, + "step": 1826 + }, + { + "epoch": 0.15233333333333332, + "grad_norm": 8.1875, + "grad_norm_var": 0.6413899739583333, + "learning_rate": 4e-05, + "loss": 5.6275, + "loss/crossentropy": 2.400721490383148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21228517964482307, + "step": 1828 + }, + { + "epoch": 0.1525, + "grad_norm": 5.4375, + "grad_norm_var": 0.6223917643229167, + "learning_rate": 4e-05, + "loss": 4.5715, + "loss/crossentropy": 1.3201181143522263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16993128694593906, + "step": 1830 + }, + { + "epoch": 0.15266666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.6361328125, + "learning_rate": 4e-05, + "loss": 5.4743, + "loss/crossentropy": 2.2901048958301544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23060648143291473, + "step": 1832 + }, + { + "epoch": 0.15283333333333332, + "grad_norm": 4.90625, + "grad_norm_var": 0.630322265625, + "learning_rate": 4e-05, + "loss": 4.8528, + "loss/crossentropy": 1.1291920691728592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13143914192914963, + "step": 1834 + }, + { + "epoch": 0.153, + "grad_norm": 4.6875, + "grad_norm_var": 0.6576171875, + "learning_rate": 4e-05, + "loss": 5.0234, + "loss/crossentropy": 1.9712878987193108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17850744351744652, + "step": 1836 + }, + { + "epoch": 0.15316666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.7058553059895833, + "learning_rate": 4e-05, + "loss": 4.779, + "loss/crossentropy": 2.4524222016334534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22154907137155533, + "step": 1838 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 4.90625, + "grad_norm_var": 0.70953369140625, + "learning_rate": 4e-05, + "loss": 5.1157, + "loss/crossentropy": 2.52553254365921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21638255938887596, + "step": 1840 + }, + { + "epoch": 0.1535, + "grad_norm": 5.28125, + "grad_norm_var": 0.6929036458333333, + "learning_rate": 4e-05, + "loss": 4.9487, + "loss/crossentropy": 2.4182560443878174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22422944754362106, + "step": 1842 + }, + { + "epoch": 0.15366666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.13957926432291667, + "learning_rate": 4e-05, + "loss": 4.9538, + "loss/crossentropy": 2.422487258911133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22509846463799477, + "step": 1844 + }, + { + "epoch": 0.15383333333333332, + "grad_norm": 4.9375, + "grad_norm_var": 0.13420817057291667, + "learning_rate": 4e-05, + "loss": 5.3952, + "loss/crossentropy": 2.584647834300995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22367503494024277, + "step": 1846 + }, + { + "epoch": 0.154, + "grad_norm": 5.21875, + "grad_norm_var": 0.10582275390625, + "learning_rate": 4e-05, + "loss": 5.0143, + "loss/crossentropy": 2.4110784828662872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831950172781944, + "step": 1848 + }, + { + "epoch": 0.15416666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.11378580729166667, + "learning_rate": 4e-05, + "loss": 5.1977, + "loss/crossentropy": 2.0590811669826508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20148218050599098, + "step": 1850 + }, + { + "epoch": 0.15433333333333332, + "grad_norm": 5.21875, + "grad_norm_var": 0.10273030598958334, + "learning_rate": 4e-05, + "loss": 4.6938, + "loss/crossentropy": 1.8851531371474266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17905624769628048, + "step": 1852 + }, + { + "epoch": 0.1545, + "grad_norm": 4.9375, + "grad_norm_var": 0.048177083333333336, + "learning_rate": 4e-05, + "loss": 4.8345, + "loss/crossentropy": 1.723634012043476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19837238639593124, + "step": 1854 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.03463541666666667, + "learning_rate": 4e-05, + "loss": 5.229, + "loss/crossentropy": 1.8247249498963356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20431892201304436, + "step": 1856 + }, + { + "epoch": 0.15483333333333332, + "grad_norm": 5.0, + "grad_norm_var": 0.042643229166666664, + "learning_rate": 4e-05, + "loss": 4.8978, + "loss/crossentropy": 2.5018930435180664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22337466478347778, + "step": 1858 + }, + { + "epoch": 0.155, + "grad_norm": 5.09375, + "grad_norm_var": 0.04010009765625, + "learning_rate": 4e-05, + "loss": 4.932, + "loss/crossentropy": 1.747809186577797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17998847924172878, + "step": 1860 + }, + { + "epoch": 0.15516666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.05349934895833333, + "learning_rate": 4e-05, + "loss": 4.4732, + "loss/crossentropy": 1.3985635191202164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15633606724441051, + "step": 1862 + }, + { + "epoch": 0.15533333333333332, + "grad_norm": 4.5625, + "grad_norm_var": 0.08147379557291666, + "learning_rate": 4e-05, + "loss": 4.3033, + "loss/crossentropy": 1.821037471294403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18188106641173363, + "step": 1864 + }, + { + "epoch": 0.1555, + "grad_norm": 5.3125, + "grad_norm_var": 0.075244140625, + "learning_rate": 4e-05, + "loss": 5.4331, + "loss/crossentropy": 2.3469000458717346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21943211928009987, + "step": 1866 + }, + { + "epoch": 0.15566666666666668, + "grad_norm": 7.59375, + "grad_norm_var": 0.48971354166666664, + "learning_rate": 4e-05, + "loss": 5.1285, + "loss/crossentropy": 2.018453985452652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18594501912593842, + "step": 1868 + }, + { + "epoch": 0.15583333333333332, + "grad_norm": 5.1875, + "grad_norm_var": 0.49491780598958335, + "learning_rate": 4e-05, + "loss": 4.6245, + "loss/crossentropy": 1.441122718155384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1634649857878685, + "step": 1870 + }, + { + "epoch": 0.156, + "grad_norm": 4.84375, + "grad_norm_var": 0.5040201822916667, + "learning_rate": 4e-05, + "loss": 4.7244, + "loss/crossentropy": 2.1576380729675293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19306758418679237, + "step": 1872 + }, + { + "epoch": 0.15616666666666668, + "grad_norm": 5.125, + "grad_norm_var": 0.47877197265625, + "learning_rate": 4e-05, + "loss": 5.286, + "loss/crossentropy": 2.008717902004719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18943626806139946, + "step": 1874 + }, + { + "epoch": 0.15633333333333332, + "grad_norm": 5.1875, + "grad_norm_var": 0.48489176432291664, + "learning_rate": 4e-05, + "loss": 4.9675, + "loss/crossentropy": 2.4755281805992126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329132817685604, + "step": 1876 + }, + { + "epoch": 0.1565, + "grad_norm": 4.78125, + "grad_norm_var": 0.4714152018229167, + "learning_rate": 4e-05, + "loss": 5.249, + "loss/crossentropy": 2.4515629410743713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22444384172558784, + "step": 1878 + }, + { + "epoch": 0.15666666666666668, + "grad_norm": 6.53125, + "grad_norm_var": 0.5702473958333333, + "learning_rate": 4e-05, + "loss": 4.9503, + "loss/crossentropy": 1.8355879187583923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18035876750946045, + "step": 1880 + }, + { + "epoch": 0.15683333333333332, + "grad_norm": 5.5, + "grad_norm_var": 0.5729166666666666, + "learning_rate": 4e-05, + "loss": 5.7065, + "loss/crossentropy": 1.975680448114872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19011396169662476, + "step": 1882 + }, + { + "epoch": 0.157, + "grad_norm": 4.78125, + "grad_norm_var": 0.25494791666666666, + "learning_rate": 4e-05, + "loss": 5.0065, + "loss/crossentropy": 2.348602294921875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2531758286058903, + "step": 1884 + }, + { + "epoch": 0.15716666666666668, + "grad_norm": 5.15625, + "grad_norm_var": 0.249853515625, + "learning_rate": 4e-05, + "loss": 4.9532, + "loss/crossentropy": 1.5291048362851143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16095010749995708, + "step": 1886 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.25266927083333335, + "learning_rate": 4e-05, + "loss": 4.4705, + "loss/crossentropy": 1.795276552438736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22460604459047318, + "step": 1888 + }, + { + "epoch": 0.1575, + "grad_norm": 4.4375, + "grad_norm_var": 0.29225260416666665, + "learning_rate": 4e-05, + "loss": 4.5639, + "loss/crossentropy": 1.6168242916464806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1597994789481163, + "step": 1890 + }, + { + "epoch": 0.15766666666666668, + "grad_norm": 5.625, + "grad_norm_var": 0.27740478515625, + "learning_rate": 4e-05, + "loss": 5.4135, + "loss/crossentropy": 2.2597386240959167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24958740919828415, + "step": 1892 + }, + { + "epoch": 0.15783333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.28808186848958334, + "learning_rate": 4e-05, + "loss": 4.6908, + "loss/crossentropy": 1.3280053436756134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1516177151352167, + "step": 1894 + }, + { + "epoch": 0.158, + "grad_norm": 4.96875, + "grad_norm_var": 0.13253580729166667, + "learning_rate": 4e-05, + "loss": 4.8624, + "loss/crossentropy": 1.9109614789485931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17931961454451084, + "step": 1896 + }, + { + "epoch": 0.15816666666666668, + "grad_norm": 4.875, + "grad_norm_var": 0.10818684895833333, + "learning_rate": 4e-05, + "loss": 4.9535, + "loss/crossentropy": 1.4697567075490952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16392095386981964, + "step": 1898 + }, + { + "epoch": 0.15833333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.10221354166666667, + "learning_rate": 4e-05, + "loss": 5.0563, + "loss/crossentropy": 1.7749396488070488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989159747958183, + "step": 1900 + }, + { + "epoch": 0.1585, + "grad_norm": 5.21875, + "grad_norm_var": 0.13990885416666668, + "learning_rate": 4e-05, + "loss": 5.369, + "loss/crossentropy": 2.3539693355560303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21434181556105614, + "step": 1902 + }, + { + "epoch": 0.15866666666666668, + "grad_norm": 5.78125, + "grad_norm_var": 0.15950520833333334, + "learning_rate": 4e-05, + "loss": 5.3091, + "loss/crossentropy": 2.474464476108551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21791671961545944, + "step": 1904 + }, + { + "epoch": 0.15883333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.18865559895833334, + "learning_rate": 4e-05, + "loss": 4.7584, + "loss/crossentropy": 1.7396632134914398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19973601773381233, + "step": 1906 + }, + { + "epoch": 0.159, + "grad_norm": 5.125, + "grad_norm_var": 0.18391520182291668, + "learning_rate": 4e-05, + "loss": 5.0961, + "loss/crossentropy": 2.441837340593338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207939263433218, + "step": 1908 + }, + { + "epoch": 0.15916666666666668, + "grad_norm": 5.46875, + "grad_norm_var": 0.15128580729166666, + "learning_rate": 4e-05, + "loss": 5.4184, + "loss/crossentropy": 2.0546700954437256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21630385518074036, + "step": 1910 + }, + { + "epoch": 0.15933333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.1484375, + "learning_rate": 4e-05, + "loss": 5.432, + "loss/crossentropy": 2.465700089931488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23406285047531128, + "step": 1912 + }, + { + "epoch": 0.1595, + "grad_norm": 5.15625, + "grad_norm_var": 0.17476806640625, + "learning_rate": 4e-05, + "loss": 4.7457, + "loss/crossentropy": 1.1225157380104065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15988787077367306, + "step": 1914 + }, + { + "epoch": 0.15966666666666668, + "grad_norm": 5.125, + "grad_norm_var": 0.20487874348958332, + "learning_rate": 4e-05, + "loss": 4.8988, + "loss/crossentropy": 1.849706619977951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1671137586236, + "step": 1916 + }, + { + "epoch": 0.15983333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.18251546223958334, + "learning_rate": 4e-05, + "loss": 5.6658, + "loss/crossentropy": 2.5173650979995728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206265851855278, + "step": 1918 + }, + { + "epoch": 0.16, + "grad_norm": 5.0, + "grad_norm_var": 0.15904541015625, + "learning_rate": 4e-05, + "loss": 5.4365, + "loss/crossentropy": 2.3284026384353638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2226024679839611, + "step": 1920 + }, + { + "epoch": 0.16016666666666668, + "grad_norm": 5.28125, + "grad_norm_var": 0.07131754557291667, + "learning_rate": 4e-05, + "loss": 5.0088, + "loss/crossentropy": 0.9736118018627167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12116567231714725, + "step": 1922 + }, + { + "epoch": 0.16033333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.09021809895833334, + "learning_rate": 4e-05, + "loss": 4.7493, + "loss/crossentropy": 2.010709524154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19896573573350906, + "step": 1924 + }, + { + "epoch": 0.1605, + "grad_norm": 5.15625, + "grad_norm_var": 0.10076497395833334, + "learning_rate": 4e-05, + "loss": 5.3031, + "loss/crossentropy": 1.8335441946983337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24130244553089142, + "step": 1926 + }, + { + "epoch": 0.16066666666666668, + "grad_norm": 4.78125, + "grad_norm_var": 0.13570556640625, + "learning_rate": 4e-05, + "loss": 5.1219, + "loss/crossentropy": 1.9984001368284225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19060589745640755, + "step": 1928 + }, + { + "epoch": 0.16083333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.11756184895833334, + "learning_rate": 4e-05, + "loss": 5.0371, + "loss/crossentropy": 1.7571651637554169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17572467029094696, + "step": 1930 + }, + { + "epoch": 0.161, + "grad_norm": 5.5, + "grad_norm_var": 0.10657145182291666, + "learning_rate": 4e-05, + "loss": 4.8262, + "loss/crossentropy": 2.319933772087097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20385072752833366, + "step": 1932 + }, + { + "epoch": 0.16116666666666668, + "grad_norm": 5.28125, + "grad_norm_var": 0.10193684895833334, + "learning_rate": 4e-05, + "loss": 4.7018, + "loss/crossentropy": 2.2420734465122223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276177629828453, + "step": 1934 + }, + { + "epoch": 0.16133333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.114697265625, + "learning_rate": 4e-05, + "loss": 4.8164, + "loss/crossentropy": 2.019700661301613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1761186197400093, + "step": 1936 + }, + { + "epoch": 0.1615, + "grad_norm": 5.0625, + "grad_norm_var": 0.11516927083333334, + "learning_rate": 4e-05, + "loss": 5.228, + "loss/crossentropy": 2.334056079387665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20008408278226852, + "step": 1938 + }, + { + "epoch": 0.16166666666666665, + "grad_norm": 5.28125, + "grad_norm_var": 0.08961181640625, + "learning_rate": 4e-05, + "loss": 4.815, + "loss/crossentropy": 1.8608058020472527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20045208930969238, + "step": 1940 + }, + { + "epoch": 0.16183333333333333, + "grad_norm": 5.75, + "grad_norm_var": 0.10299072265625, + "learning_rate": 4e-05, + "loss": 4.8539, + "loss/crossentropy": 1.859613299369812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18477841094136238, + "step": 1942 + }, + { + "epoch": 0.162, + "grad_norm": 4.84375, + "grad_norm_var": 0.0744140625, + "learning_rate": 4e-05, + "loss": 4.4145, + "loss/crossentropy": 2.1982096135616302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072415016591549, + "step": 1944 + }, + { + "epoch": 0.16216666666666665, + "grad_norm": 4.71875, + "grad_norm_var": 0.0837890625, + "learning_rate": 4e-05, + "loss": 5.1867, + "loss/crossentropy": 2.53378689289093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22257087007164955, + "step": 1946 + }, + { + "epoch": 0.16233333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.071875, + "learning_rate": 4e-05, + "loss": 4.7655, + "loss/crossentropy": 1.2198933511972427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14932805858552456, + "step": 1948 + }, + { + "epoch": 0.1625, + "grad_norm": 5.15625, + "grad_norm_var": 0.073681640625, + "learning_rate": 4e-05, + "loss": 5.2139, + "loss/crossentropy": 2.3506920337677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21482441201806068, + "step": 1950 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 5.28125, + "grad_norm_var": 0.06640625, + "learning_rate": 4e-05, + "loss": 4.2194, + "loss/crossentropy": 0.9886042326688766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13658013939857483, + "step": 1952 + }, + { + "epoch": 0.16283333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.06549072265625, + "learning_rate": 4e-05, + "loss": 4.6741, + "loss/crossentropy": 2.5076074600219727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21432289853692055, + "step": 1954 + }, + { + "epoch": 0.163, + "grad_norm": 5.5625, + "grad_norm_var": 0.07711181640625, + "learning_rate": 4e-05, + "loss": 5.4932, + "loss/crossentropy": 2.4833337664604187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21723904460668564, + "step": 1956 + }, + { + "epoch": 0.16316666666666665, + "grad_norm": 4.8125, + "grad_norm_var": 0.06767171223958333, + "learning_rate": 4e-05, + "loss": 4.6413, + "loss/crossentropy": 1.8598149567842484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18153854832053185, + "step": 1958 + }, + { + "epoch": 0.16333333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.07623291015625, + "learning_rate": 4e-05, + "loss": 4.8104, + "loss/crossentropy": 2.2267325818538666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370566502213478, + "step": 1960 + }, + { + "epoch": 0.1635, + "grad_norm": 4.90625, + "grad_norm_var": 0.08196614583333334, + "learning_rate": 4e-05, + "loss": 5.6272, + "loss/crossentropy": 2.7236159443855286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216436043381691, + "step": 1962 + }, + { + "epoch": 0.16366666666666665, + "grad_norm": 4.84375, + "grad_norm_var": 0.08528645833333333, + "learning_rate": 4e-05, + "loss": 4.5921, + "loss/crossentropy": 1.4360825419425964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15567945316433907, + "step": 1964 + }, + { + "epoch": 0.16383333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.08876546223958333, + "learning_rate": 4e-05, + "loss": 4.8939, + "loss/crossentropy": 1.9993894025683403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18757018819451332, + "step": 1966 + }, + { + "epoch": 0.164, + "grad_norm": 5.125, + "grad_norm_var": 0.09542643229166667, + "learning_rate": 4e-05, + "loss": 5.2536, + "loss/crossentropy": 2.448215901851654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187819555401802, + "step": 1968 + }, + { + "epoch": 0.16416666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.09426676432291667, + "learning_rate": 4e-05, + "loss": 5.3415, + "loss/crossentropy": 2.382662773132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23334766179323196, + "step": 1970 + }, + { + "epoch": 0.16433333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.08240559895833334, + "learning_rate": 4e-05, + "loss": 5.1642, + "loss/crossentropy": 1.7083993628621101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550025209784508, + "step": 1972 + }, + { + "epoch": 0.1645, + "grad_norm": 5.3125, + "grad_norm_var": 0.070556640625, + "learning_rate": 4e-05, + "loss": 4.8266, + "loss/crossentropy": 2.033088594675064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23395147919654846, + "step": 1974 + }, + { + "epoch": 0.16466666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.06925455729166667, + "learning_rate": 4e-05, + "loss": 5.4875, + "loss/crossentropy": 2.3617620170116425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22937371581792831, + "step": 1976 + }, + { + "epoch": 0.16483333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.05813395182291667, + "learning_rate": 4e-05, + "loss": 5.0183, + "loss/crossentropy": 2.381405919790268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20841734111309052, + "step": 1978 + }, + { + "epoch": 0.165, + "grad_norm": 4.9375, + "grad_norm_var": 0.056494140625, + "learning_rate": 4e-05, + "loss": 4.9318, + "loss/crossentropy": 1.984310194849968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19360090605914593, + "step": 1980 + }, + { + "epoch": 0.16516666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.09052327473958334, + "learning_rate": 4e-05, + "loss": 4.7867, + "loss/crossentropy": 1.1787148118019104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13031774573028088, + "step": 1982 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.08163655598958333, + "learning_rate": 4e-05, + "loss": 5.1374, + "loss/crossentropy": 2.660287320613861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815495565533638, + "step": 1984 + }, + { + "epoch": 0.1655, + "grad_norm": 5.40625, + "grad_norm_var": 0.092822265625, + "learning_rate": 4e-05, + "loss": 4.5718, + "loss/crossentropy": 1.7051584795117378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1718977987766266, + "step": 1986 + }, + { + "epoch": 0.16566666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.086572265625, + "learning_rate": 4e-05, + "loss": 5.3293, + "loss/crossentropy": 1.5497848987579346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15701960772275925, + "step": 1988 + }, + { + "epoch": 0.16583333333333333, + "grad_norm": 5.625, + "grad_norm_var": 0.09440104166666667, + "learning_rate": 4e-05, + "loss": 5.0158, + "loss/crossentropy": 2.073232203722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22496861219406128, + "step": 1990 + }, + { + "epoch": 0.166, + "grad_norm": 5.0, + "grad_norm_var": 0.11298421223958334, + "learning_rate": 4e-05, + "loss": 4.6335, + "loss/crossentropy": 1.3899268805980682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14914513006806374, + "step": 1992 + }, + { + "epoch": 0.16616666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.10818684895833333, + "learning_rate": 4e-05, + "loss": 5.4054, + "loss/crossentropy": 2.4187216758728027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222491316497326, + "step": 1994 + }, + { + "epoch": 0.16633333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.11230061848958334, + "learning_rate": 4e-05, + "loss": 4.887, + "loss/crossentropy": 1.6145347505807877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16213739477097988, + "step": 1996 + }, + { + "epoch": 0.1665, + "grad_norm": 4.90625, + "grad_norm_var": 0.07278238932291667, + "learning_rate": 4e-05, + "loss": 4.5128, + "loss/crossentropy": 2.2953919768333435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2423577792942524, + "step": 1998 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.08388264973958333, + "learning_rate": 4e-05, + "loss": 5.0227, + "loss/crossentropy": 2.7778520584106445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22993957251310349, + "step": 2000 + }, + { + "epoch": 0.16683333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.09149983723958334, + "learning_rate": 4e-05, + "loss": 4.5579, + "loss/crossentropy": 1.120415337383747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14762288890779018, + "step": 2002 + }, + { + "epoch": 0.167, + "grad_norm": 5.125, + "grad_norm_var": 0.08527018229166666, + "learning_rate": 4e-05, + "loss": 4.5543, + "loss/crossentropy": 1.6726857349276543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18800584971904755, + "step": 2004 + }, + { + "epoch": 0.16716666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.06734619140625, + "learning_rate": 4e-05, + "loss": 4.6986, + "loss/crossentropy": 1.9222635477781296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19062339887022972, + "step": 2006 + }, + { + "epoch": 0.16733333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.05347900390625, + "learning_rate": 4e-05, + "loss": 4.8448, + "loss/crossentropy": 2.118234932422638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22146522253751755, + "step": 2008 + }, + { + "epoch": 0.1675, + "grad_norm": 5.0, + "grad_norm_var": 0.05175374348958333, + "learning_rate": 4e-05, + "loss": 4.8215, + "loss/crossentropy": 2.252037912607193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157456949353218, + "step": 2010 + }, + { + "epoch": 0.16766666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.06926676432291666, + "learning_rate": 4e-05, + "loss": 4.3806, + "loss/crossentropy": 0.9033909440040588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1250423714518547, + "step": 2012 + }, + { + "epoch": 0.16783333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.07434488932291666, + "learning_rate": 4e-05, + "loss": 4.7516, + "loss/crossentropy": 1.9430923759937286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18675028160214424, + "step": 2014 + }, + { + "epoch": 0.168, + "grad_norm": 5.15625, + "grad_norm_var": 0.06448160807291667, + "learning_rate": 4e-05, + "loss": 4.7063, + "loss/crossentropy": 1.634926363825798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1670081913471222, + "step": 2016 + }, + { + "epoch": 0.16816666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.04830729166666667, + "learning_rate": 4e-05, + "loss": 4.857, + "loss/crossentropy": 1.694766104221344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19327403232455254, + "step": 2018 + }, + { + "epoch": 0.16833333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.05015869140625, + "learning_rate": 4e-05, + "loss": 5.5487, + "loss/crossentropy": 2.1367068588733673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21962807700037956, + "step": 2020 + }, + { + "epoch": 0.1685, + "grad_norm": 4.96875, + "grad_norm_var": 0.07320556640625, + "learning_rate": 4e-05, + "loss": 4.0799, + "loss/crossentropy": 1.671954207122326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1811060570180416, + "step": 2022 + }, + { + "epoch": 0.16866666666666666, + "grad_norm": 5.28125, + "grad_norm_var": 0.09479166666666666, + "learning_rate": 4e-05, + "loss": 5.2455, + "loss/crossentropy": 1.6374276280403137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16752460598945618, + "step": 2024 + }, + { + "epoch": 0.16883333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.09312744140625, + "learning_rate": 4e-05, + "loss": 5.3886, + "loss/crossentropy": 2.3457963168621063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288379706442356, + "step": 2026 + }, + { + "epoch": 0.169, + "grad_norm": 4.75, + "grad_norm_var": 0.06985270182291667, + "learning_rate": 4e-05, + "loss": 4.6938, + "loss/crossentropy": 2.563704550266266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21900682151317596, + "step": 2028 + }, + { + "epoch": 0.16916666666666666, + "grad_norm": 5.71875, + "grad_norm_var": 0.09034830729166667, + "learning_rate": 4e-05, + "loss": 4.6583, + "loss/crossentropy": 2.2988042533397675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2221379354596138, + "step": 2030 + }, + { + "epoch": 0.16933333333333334, + "grad_norm": 5.5, + "grad_norm_var": 0.10146077473958333, + "learning_rate": 4e-05, + "loss": 4.9018, + "loss/crossentropy": 2.246855854988098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2294277399778366, + "step": 2032 + }, + { + "epoch": 0.1695, + "grad_norm": 4.71875, + "grad_norm_var": 0.1115234375, + "learning_rate": 4e-05, + "loss": 4.2763, + "loss/crossentropy": 1.483270302414894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20540903508663177, + "step": 2034 + }, + { + "epoch": 0.16966666666666666, + "grad_norm": 6.25, + "grad_norm_var": 0.20623372395833334, + "learning_rate": 4e-05, + "loss": 4.6115, + "loss/crossentropy": 0.7337675020098686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12142804265022278, + "step": 2036 + }, + { + "epoch": 0.16983333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.1724609375, + "learning_rate": 4e-05, + "loss": 5.2868, + "loss/crossentropy": 2.4326335787773132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2261388711631298, + "step": 2038 + }, + { + "epoch": 0.17, + "grad_norm": 5.25, + "grad_norm_var": 0.16256510416666667, + "learning_rate": 4e-05, + "loss": 4.6802, + "loss/crossentropy": 1.8480440527200699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009208109229803, + "step": 2040 + }, + { + "epoch": 0.17016666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.17239176432291667, + "learning_rate": 4e-05, + "loss": 4.8374, + "loss/crossentropy": 1.7596202716231346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18629582971334457, + "step": 2042 + }, + { + "epoch": 0.17033333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.15969645182291667, + "learning_rate": 4e-05, + "loss": 5.0063, + "loss/crossentropy": 1.6326167657971382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21383372321724892, + "step": 2044 + }, + { + "epoch": 0.1705, + "grad_norm": 5.25, + "grad_norm_var": 0.14698893229166668, + "learning_rate": 4e-05, + "loss": 4.9583, + "loss/crossentropy": 1.3872774839401245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19810861721634865, + "step": 2046 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.14521077473958333, + "learning_rate": 4e-05, + "loss": 4.2737, + "loss/crossentropy": 1.2238230854272842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741249691694975, + "step": 2048 + }, + { + "epoch": 0.17083333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.14703369140625, + "learning_rate": 4e-05, + "loss": 3.9872, + "loss/crossentropy": 1.1708182319998741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17563006840646267, + "step": 2050 + }, + { + "epoch": 0.171, + "grad_norm": 4.8125, + "grad_norm_var": 0.04694010416666667, + "learning_rate": 4e-05, + "loss": 4.826, + "loss/crossentropy": 1.4472626447677612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18563862517476082, + "step": 2052 + }, + { + "epoch": 0.17116666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.04892171223958333, + "learning_rate": 4e-05, + "loss": 4.5356, + "loss/crossentropy": 1.3219031170010567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.136514600366354, + "step": 2054 + }, + { + "epoch": 0.17133333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.0365234375, + "learning_rate": 4e-05, + "loss": 4.9406, + "loss/crossentropy": 2.4237805008888245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22321195155382156, + "step": 2056 + }, + { + "epoch": 0.1715, + "grad_norm": 5.375, + "grad_norm_var": 0.05188802083333333, + "learning_rate": 4e-05, + "loss": 5.5955, + "loss/crossentropy": 1.8901968151330948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951073817908764, + "step": 2058 + }, + { + "epoch": 0.17166666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.04957275390625, + "learning_rate": 4e-05, + "loss": 5.2047, + "loss/crossentropy": 2.6344847083091736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257305197417736, + "step": 2060 + }, + { + "epoch": 0.17183333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.048140462239583334, + "learning_rate": 4e-05, + "loss": 5.1074, + "loss/crossentropy": 2.4700939655303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20783070847392082, + "step": 2062 + }, + { + "epoch": 0.172, + "grad_norm": 5.1875, + "grad_norm_var": 0.044384765625, + "learning_rate": 4e-05, + "loss": 4.9033, + "loss/crossentropy": 2.0393999814987183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21963367983698845, + "step": 2064 + }, + { + "epoch": 0.17216666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.046468098958333336, + "learning_rate": 4e-05, + "loss": 5.1977, + "loss/crossentropy": 2.3000362515449524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22355759143829346, + "step": 2066 + }, + { + "epoch": 0.17233333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.04269205729166667, + "learning_rate": 4e-05, + "loss": 5.0526, + "loss/crossentropy": 1.692564770579338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17172588407993317, + "step": 2068 + }, + { + "epoch": 0.1725, + "grad_norm": 4.65625, + "grad_norm_var": 0.05230712890625, + "learning_rate": 4e-05, + "loss": 4.6657, + "loss/crossentropy": 2.475119471549988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216409109532833, + "step": 2070 + }, + { + "epoch": 0.17266666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.05286051432291667, + "learning_rate": 4e-05, + "loss": 4.8245, + "loss/crossentropy": 2.1208333671092987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088773548603058, + "step": 2072 + }, + { + "epoch": 0.17283333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 4e-05, + "loss": 4.0344, + "loss/crossentropy": 1.5833616331219673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833811979740858, + "step": 2074 + }, + { + "epoch": 0.173, + "grad_norm": 5.6875, + "grad_norm_var": 0.09172770182291666, + "learning_rate": 4e-05, + "loss": 5.3868, + "loss/crossentropy": 2.053065747022629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20190668106079102, + "step": 2076 + }, + { + "epoch": 0.17316666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.10670572916666667, + "learning_rate": 4e-05, + "loss": 4.4207, + "loss/crossentropy": 1.4702235013246536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15409984439611435, + "step": 2078 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.10556233723958333, + "learning_rate": 4e-05, + "loss": 4.8621, + "loss/crossentropy": 1.6585796177387238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16805803030729294, + "step": 2080 + }, + { + "epoch": 0.1735, + "grad_norm": 5.84375, + "grad_norm_var": 0.1333984375, + "learning_rate": 4e-05, + "loss": 5.2264, + "loss/crossentropy": 1.9839501976966858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20378449745476246, + "step": 2082 + }, + { + "epoch": 0.17366666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.13909098307291667, + "learning_rate": 4e-05, + "loss": 4.7717, + "loss/crossentropy": 1.573902688920498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1820588782429695, + "step": 2084 + }, + { + "epoch": 0.17383333333333334, + "grad_norm": 5.625, + "grad_norm_var": 0.13918863932291667, + "learning_rate": 4e-05, + "loss": 5.1317, + "loss/crossentropy": 2.24695548415184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23131686076521873, + "step": 2086 + }, + { + "epoch": 0.174, + "grad_norm": 4.96875, + "grad_norm_var": 0.14078369140625, + "learning_rate": 4e-05, + "loss": 5.0558, + "loss/crossentropy": 2.0588990449905396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19663485884666443, + "step": 2088 + }, + { + "epoch": 0.17416666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 2.6382120768229167, + "learning_rate": 4e-05, + "loss": 5.2796, + "loss/crossentropy": 3.171107590198517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2164313718676567, + "step": 2090 + }, + { + "epoch": 0.17433333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 2.699609375, + "learning_rate": 4e-05, + "loss": 4.3192, + "loss/crossentropy": 1.8898755833506584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18126432970166206, + "step": 2092 + }, + { + "epoch": 0.1745, + "grad_norm": 5.1875, + "grad_norm_var": 2.6998697916666665, + "learning_rate": 4e-05, + "loss": 5.3395, + "loss/crossentropy": 2.3107918202877045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22003484517335892, + "step": 2094 + }, + { + "epoch": 0.17466666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 2.71519775390625, + "learning_rate": 4e-05, + "loss": 5.0895, + "loss/crossentropy": 2.438191533088684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299581542611122, + "step": 2096 + }, + { + "epoch": 0.17483333333333334, + "grad_norm": 4.625, + "grad_norm_var": 2.741471354166667, + "learning_rate": 4e-05, + "loss": 5.0573, + "loss/crossentropy": 1.8615416586399078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17831822112202644, + "step": 2098 + }, + { + "epoch": 0.175, + "grad_norm": 5.09375, + "grad_norm_var": 2.72359619140625, + "learning_rate": 4e-05, + "loss": 4.7074, + "loss/crossentropy": 1.6599205955863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18141921050846577, + "step": 2100 + }, + { + "epoch": 0.17516666666666666, + "grad_norm": 5.71875, + "grad_norm_var": 2.745556640625, + "learning_rate": 4e-05, + "loss": 4.9259, + "loss/crossentropy": 2.2989392578601837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21219024062156677, + "step": 2102 + }, + { + "epoch": 0.17533333333333334, + "grad_norm": 5.0, + "grad_norm_var": 2.73033447265625, + "learning_rate": 4e-05, + "loss": 4.7961, + "loss/crossentropy": 1.7167327478528023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22489573061466217, + "step": 2104 + }, + { + "epoch": 0.1755, + "grad_norm": 5.09375, + "grad_norm_var": 0.09425455729166667, + "learning_rate": 4e-05, + "loss": 5.0813, + "loss/crossentropy": 1.840710736811161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18269490636885166, + "step": 2106 + }, + { + "epoch": 0.17566666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.08704427083333334, + "learning_rate": 4e-05, + "loss": 5.1087, + "loss/crossentropy": 1.8294510319828987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20847078040242195, + "step": 2108 + }, + { + "epoch": 0.17583333333333334, + "grad_norm": 5.625, + "grad_norm_var": 0.11080322265625, + "learning_rate": 4e-05, + "loss": 5.5059, + "loss/crossentropy": 1.6975836902856827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17947252094745636, + "step": 2110 + }, + { + "epoch": 0.176, + "grad_norm": 5.25, + "grad_norm_var": 0.08876546223958333, + "learning_rate": 4e-05, + "loss": 5.5898, + "loss/crossentropy": 1.7362675666809082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171913504600525, + "step": 2112 + }, + { + "epoch": 0.17616666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.07919514973958333, + "learning_rate": 4e-05, + "loss": 5.3091, + "loss/crossentropy": 1.721466027200222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18343711271882057, + "step": 2114 + }, + { + "epoch": 0.17633333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.088134765625, + "learning_rate": 4e-05, + "loss": 5.4335, + "loss/crossentropy": 2.5684576630592346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20511827990412712, + "step": 2116 + }, + { + "epoch": 0.1765, + "grad_norm": 5.4375, + "grad_norm_var": 0.111962890625, + "learning_rate": 4e-05, + "loss": 5.4175, + "loss/crossentropy": 2.235967993736267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469640895724297, + "step": 2118 + }, + { + "epoch": 0.17666666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.10972900390625, + "learning_rate": 4e-05, + "loss": 4.8452, + "loss/crossentropy": 1.9292369186878204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18822060897946358, + "step": 2120 + }, + { + "epoch": 0.17683333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.12224934895833334, + "learning_rate": 4e-05, + "loss": 5.119, + "loss/crossentropy": 1.9760426580905914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19950895011425018, + "step": 2122 + }, + { + "epoch": 0.177, + "grad_norm": 5.25, + "grad_norm_var": 0.17069905598958332, + "learning_rate": 4e-05, + "loss": 5.0708, + "loss/crossentropy": 2.346391201019287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23865826427936554, + "step": 2124 + }, + { + "epoch": 0.17716666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.16614583333333333, + "learning_rate": 4e-05, + "loss": 4.943, + "loss/crossentropy": 2.2712226808071136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997787281870842, + "step": 2126 + }, + { + "epoch": 0.17733333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.17420247395833333, + "learning_rate": 4e-05, + "loss": 5.0841, + "loss/crossentropy": 2.0525820776820183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887969095259905, + "step": 2128 + }, + { + "epoch": 0.1775, + "grad_norm": 6.71875, + "grad_norm_var": 0.34625244140625, + "learning_rate": 4e-05, + "loss": 4.298, + "loss/crossentropy": 1.7505680918693542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16385265067219734, + "step": 2130 + }, + { + "epoch": 0.17766666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.33577067057291665, + "learning_rate": 4e-05, + "loss": 5.0106, + "loss/crossentropy": 2.127786874771118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24591680243611336, + "step": 2132 + }, + { + "epoch": 0.17783333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.2997233072916667, + "learning_rate": 4e-05, + "loss": 5.2174, + "loss/crossentropy": 1.796779453754425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18160532787442207, + "step": 2134 + }, + { + "epoch": 0.178, + "grad_norm": 5.09375, + "grad_norm_var": 0.29931233723958334, + "learning_rate": 4e-05, + "loss": 4.7535, + "loss/crossentropy": 0.8597075119614601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11564531922340393, + "step": 2136 + }, + { + "epoch": 0.17816666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.2877888997395833, + "learning_rate": 4e-05, + "loss": 5.0003, + "loss/crossentropy": 2.004398114979267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18805610574781895, + "step": 2138 + }, + { + "epoch": 0.17833333333333334, + "grad_norm": 5.3125, + "grad_norm_var": 0.23642171223958333, + "learning_rate": 4e-05, + "loss": 4.8822, + "loss/crossentropy": 2.124794065952301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644308045506477, + "step": 2140 + }, + { + "epoch": 0.1785, + "grad_norm": 5.28125, + "grad_norm_var": 0.22561442057291667, + "learning_rate": 4e-05, + "loss": 5.2293, + "loss/crossentropy": 1.2877550274133682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14798221364617348, + "step": 2142 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 5.65625, + "grad_norm_var": 0.22821858723958333, + "learning_rate": 4e-05, + "loss": 4.7824, + "loss/crossentropy": 1.6831488832831383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15735942497849464, + "step": 2144 + }, + { + "epoch": 0.17883333333333334, + "grad_norm": 5.59375, + "grad_norm_var": 0.05546875, + "learning_rate": 4e-05, + "loss": 5.2751, + "loss/crossentropy": 2.193062275648117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409276500344276, + "step": 2146 + }, + { + "epoch": 0.179, + "grad_norm": 4.78125, + "grad_norm_var": 0.06773681640625, + "learning_rate": 4e-05, + "loss": 4.6843, + "loss/crossentropy": 2.02949271351099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134292870759964, + "step": 2148 + }, + { + "epoch": 0.17916666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.0623046875, + "learning_rate": 4e-05, + "loss": 5.2284, + "loss/crossentropy": 2.0511502772569656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925040427595377, + "step": 2150 + }, + { + "epoch": 0.17933333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.05582275390625, + "learning_rate": 4e-05, + "loss": 4.8028, + "loss/crossentropy": 2.178094059228897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19704850018024445, + "step": 2152 + }, + { + "epoch": 0.1795, + "grad_norm": 5.78125, + "grad_norm_var": 0.090478515625, + "learning_rate": 4e-05, + "loss": 5.1375, + "loss/crossentropy": 2.071473777294159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016850858926773, + "step": 2154 + }, + { + "epoch": 0.17966666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.08433837890625, + "learning_rate": 4e-05, + "loss": 4.4847, + "loss/crossentropy": 1.7026320695877075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854725144803524, + "step": 2156 + }, + { + "epoch": 0.17983333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.08508707682291666, + "learning_rate": 4e-05, + "loss": 4.8903, + "loss/crossentropy": 1.9909127950668335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21856766939163208, + "step": 2158 + }, + { + "epoch": 0.18, + "grad_norm": 5.15625, + "grad_norm_var": 0.06851806640625, + "learning_rate": 4e-05, + "loss": 4.8267, + "loss/crossentropy": 1.6173651814460754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17569880932569504, + "step": 2160 + }, + { + "epoch": 0.18016666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.061747233072916664, + "learning_rate": 4e-05, + "loss": 4.6537, + "loss/crossentropy": 1.753201201558113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20511113107204437, + "step": 2162 + }, + { + "epoch": 0.18033333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.050764973958333334, + "learning_rate": 4e-05, + "loss": 5.3518, + "loss/crossentropy": 2.5091399550437927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.232530165463686, + "step": 2164 + }, + { + "epoch": 0.1805, + "grad_norm": 8.75, + "grad_norm_var": 0.8775349934895833, + "learning_rate": 4e-05, + "loss": 4.9893, + "loss/crossentropy": 2.5543057322502136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21447621285915375, + "step": 2166 + }, + { + "epoch": 0.18066666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.867041015625, + "learning_rate": 4e-05, + "loss": 5.153, + "loss/crossentropy": 1.5453489795327187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1768591459840536, + "step": 2168 + }, + { + "epoch": 0.18083333333333335, + "grad_norm": 5.5, + "grad_norm_var": 0.86197509765625, + "learning_rate": 4e-05, + "loss": 4.941, + "loss/crossentropy": 1.713011920452118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18514833971858025, + "step": 2170 + }, + { + "epoch": 0.181, + "grad_norm": 5.0625, + "grad_norm_var": 0.86636962890625, + "learning_rate": 4e-05, + "loss": 5.2454, + "loss/crossentropy": 2.531073272228241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22826341539621353, + "step": 2172 + }, + { + "epoch": 0.18116666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.8854817708333333, + "learning_rate": 4e-05, + "loss": 4.6659, + "loss/crossentropy": 1.2854736521840096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15183403715491295, + "step": 2174 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 5.1875, + "grad_norm_var": 0.8856119791666667, + "learning_rate": 4e-05, + "loss": 5.1651, + "loss/crossentropy": 1.829525165259838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17748862504959106, + "step": 2176 + }, + { + "epoch": 0.1815, + "grad_norm": 5.1875, + "grad_norm_var": 0.8767578125, + "learning_rate": 4e-05, + "loss": 5.3755, + "loss/crossentropy": 2.63100802898407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24154625460505486, + "step": 2178 + }, + { + "epoch": 0.18166666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.9402994791666667, + "learning_rate": 4e-05, + "loss": 4.9287, + "loss/crossentropy": 1.4673430994153023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15381817147135735, + "step": 2180 + }, + { + "epoch": 0.18183333333333335, + "grad_norm": 5.25, + "grad_norm_var": 0.12263997395833333, + "learning_rate": 4e-05, + "loss": 4.7876, + "loss/crossentropy": 2.181483656167984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17778964713215828, + "step": 2182 + }, + { + "epoch": 0.182, + "grad_norm": 5.09375, + "grad_norm_var": 0.12118733723958333, + "learning_rate": 4e-05, + "loss": 4.9908, + "loss/crossentropy": 2.0452851057052612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21750615909695625, + "step": 2184 + }, + { + "epoch": 0.18216666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.10130208333333333, + "learning_rate": 4e-05, + "loss": 5.2839, + "loss/crossentropy": 1.343794122338295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16215426102280617, + "step": 2186 + }, + { + "epoch": 0.18233333333333332, + "grad_norm": 5.34375, + "grad_norm_var": 0.10143229166666666, + "learning_rate": 4e-05, + "loss": 4.9467, + "loss/crossentropy": 1.4881090819835663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15420327335596085, + "step": 2188 + }, + { + "epoch": 0.1825, + "grad_norm": 4.9375, + "grad_norm_var": 0.09563395182291666, + "learning_rate": 4e-05, + "loss": 4.7357, + "loss/crossentropy": 2.0466759502887726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20744113996624947, + "step": 2190 + }, + { + "epoch": 0.18266666666666667, + "grad_norm": 6.0, + "grad_norm_var": 0.15123291015625, + "learning_rate": 4e-05, + "loss": 4.8545, + "loss/crossentropy": 2.399523586034775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22361257672309875, + "step": 2192 + }, + { + "epoch": 0.18283333333333332, + "grad_norm": 4.75, + "grad_norm_var": 0.13795572916666668, + "learning_rate": 4e-05, + "loss": 4.3254, + "loss/crossentropy": 2.5588160157203674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560463845729828, + "step": 2194 + }, + { + "epoch": 0.183, + "grad_norm": 5.0, + "grad_norm_var": 0.10146077473958333, + "learning_rate": 4e-05, + "loss": 4.3573, + "loss/crossentropy": 1.7524387538433075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18234056793153286, + "step": 2196 + }, + { + "epoch": 0.18316666666666667, + "grad_norm": 6.40625, + "grad_norm_var": 0.21132405598958334, + "learning_rate": 4e-05, + "loss": 4.23, + "loss/crossentropy": 1.385098822414875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15484962239861488, + "step": 2198 + }, + { + "epoch": 0.18333333333333332, + "grad_norm": 5.40625, + "grad_norm_var": 0.21073811848958332, + "learning_rate": 4e-05, + "loss": 5.2157, + "loss/crossentropy": 2.572742462158203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22486525401473045, + "step": 2200 + }, + { + "epoch": 0.1835, + "grad_norm": 4.625, + "grad_norm_var": 0.23424072265625, + "learning_rate": 4e-05, + "loss": 4.0647, + "loss/crossentropy": 1.7396223545074463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16994404792785645, + "step": 2202 + }, + { + "epoch": 0.18366666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.22974853515625, + "learning_rate": 4e-05, + "loss": 5.3349, + "loss/crossentropy": 2.492325782775879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23226865381002426, + "step": 2204 + }, + { + "epoch": 0.18383333333333332, + "grad_norm": 5.4375, + "grad_norm_var": 0.23821614583333334, + "learning_rate": 4e-05, + "loss": 4.6774, + "loss/crossentropy": 1.0675053745508194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1418643444776535, + "step": 2206 + }, + { + "epoch": 0.184, + "grad_norm": 4.875, + "grad_norm_var": 0.18370768229166667, + "learning_rate": 4e-05, + "loss": 4.6962, + "loss/crossentropy": 1.62255859375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18710698559880257, + "step": 2208 + }, + { + "epoch": 0.18416666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.19257405598958333, + "learning_rate": 4e-05, + "loss": 4.572, + "loss/crossentropy": 1.9560261443257332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18211353570222855, + "step": 2210 + }, + { + "epoch": 0.18433333333333332, + "grad_norm": 5.40625, + "grad_norm_var": 0.19798177083333332, + "learning_rate": 4e-05, + "loss": 4.9418, + "loss/crossentropy": 2.48140287399292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21697937697172165, + "step": 2212 + }, + { + "epoch": 0.1845, + "grad_norm": 5.21875, + "grad_norm_var": 0.079150390625, + "learning_rate": 4e-05, + "loss": 5.364, + "loss/crossentropy": 2.5937938690185547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2184484452009201, + "step": 2214 + }, + { + "epoch": 0.18466666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.07825520833333334, + "learning_rate": 4e-05, + "loss": 4.9239, + "loss/crossentropy": 1.6036360636353493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17884932085871696, + "step": 2216 + }, + { + "epoch": 0.18483333333333332, + "grad_norm": 4.65625, + "grad_norm_var": 0.077587890625, + "learning_rate": 4e-05, + "loss": 4.6433, + "loss/crossentropy": 1.7372171953320503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18615025654435158, + "step": 2218 + }, + { + "epoch": 0.185, + "grad_norm": 4.90625, + "grad_norm_var": 0.06953125, + "learning_rate": 4e-05, + "loss": 4.3449, + "loss/crossentropy": 1.685012899339199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18126941099762917, + "step": 2220 + }, + { + "epoch": 0.18516666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.07590738932291667, + "learning_rate": 4e-05, + "loss": 5.6169, + "loss/crossentropy": 1.9879830479621887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21872147917747498, + "step": 2222 + }, + { + "epoch": 0.18533333333333332, + "grad_norm": 4.65625, + "grad_norm_var": 0.09312744140625, + "learning_rate": 4e-05, + "loss": 4.1555, + "loss/crossentropy": 0.8916665241122246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11059846729040146, + "step": 2224 + }, + { + "epoch": 0.1855, + "grad_norm": 4.90625, + "grad_norm_var": 0.08787434895833333, + "learning_rate": 4e-05, + "loss": 5.4427, + "loss/crossentropy": 2.078547030687332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19260921701788902, + "step": 2226 + }, + { + "epoch": 0.18566666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.08349202473958334, + "learning_rate": 4e-05, + "loss": 4.473, + "loss/crossentropy": 1.3787953928112984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1644715555012226, + "step": 2228 + }, + { + "epoch": 0.18583333333333332, + "grad_norm": 5.3125, + "grad_norm_var": 0.08700764973958333, + "learning_rate": 4e-05, + "loss": 5.3315, + "loss/crossentropy": 2.1148226857185364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21458249166607857, + "step": 2230 + }, + { + "epoch": 0.186, + "grad_norm": 5.15625, + "grad_norm_var": 0.07706705729166667, + "learning_rate": 4e-05, + "loss": 5.247, + "loss/crossentropy": 2.0855464041233063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19903532043099403, + "step": 2232 + }, + { + "epoch": 0.18616666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.06721598307291667, + "learning_rate": 4e-05, + "loss": 4.5324, + "loss/crossentropy": 2.331184357404709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067573331296444, + "step": 2234 + }, + { + "epoch": 0.18633333333333332, + "grad_norm": 5.03125, + "grad_norm_var": 0.06634114583333334, + "learning_rate": 4e-05, + "loss": 4.5482, + "loss/crossentropy": 1.4255691543221474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15853366628289223, + "step": 2236 + }, + { + "epoch": 0.1865, + "grad_norm": 5.1875, + "grad_norm_var": 0.04989827473958333, + "learning_rate": 4e-05, + "loss": 4.9923, + "loss/crossentropy": 1.6706126257777214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20294161699712276, + "step": 2238 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 4.9375, + "grad_norm_var": 0.03319905598958333, + "learning_rate": 4e-05, + "loss": 5.0886, + "loss/crossentropy": 2.1218108534812927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483079716563225, + "step": 2240 + }, + { + "epoch": 0.18683333333333332, + "grad_norm": 4.96875, + "grad_norm_var": 0.02584228515625, + "learning_rate": 4e-05, + "loss": 4.8981, + "loss/crossentropy": 1.091003268957138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18091485276818275, + "step": 2242 + }, + { + "epoch": 0.187, + "grad_norm": 5.09375, + "grad_norm_var": 0.022977701822916665, + "learning_rate": 4e-05, + "loss": 4.9307, + "loss/crossentropy": 1.5533147603273392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16520674899220467, + "step": 2244 + }, + { + "epoch": 0.18716666666666668, + "grad_norm": 5.15625, + "grad_norm_var": 0.023177083333333334, + "learning_rate": 4e-05, + "loss": 5.1086, + "loss/crossentropy": 1.7440512776374817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17713807709515095, + "step": 2246 + }, + { + "epoch": 0.18733333333333332, + "grad_norm": 4.84375, + "grad_norm_var": 0.027958170572916666, + "learning_rate": 4e-05, + "loss": 4.9532, + "loss/crossentropy": 1.5148718804121017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19824461452662945, + "step": 2248 + }, + { + "epoch": 0.1875, + "grad_norm": 5.0625, + "grad_norm_var": 0.02974853515625, + "learning_rate": 4e-05, + "loss": 5.4161, + "loss/crossentropy": 2.427815794944763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23496362939476967, + "step": 2250 + }, + { + "epoch": 0.18766666666666668, + "grad_norm": 5.0, + "grad_norm_var": 0.03557535807291667, + "learning_rate": 4e-05, + "loss": 5.3724, + "loss/crossentropy": 2.0604121685028076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21071171015501022, + "step": 2252 + }, + { + "epoch": 0.18783333333333332, + "grad_norm": 4.78125, + "grad_norm_var": 0.06519775390625, + "learning_rate": 4e-05, + "loss": 4.9961, + "loss/crossentropy": 2.419283837080002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21685468032956123, + "step": 2254 + }, + { + "epoch": 0.188, + "grad_norm": 5.25, + "grad_norm_var": 0.05738525390625, + "learning_rate": 4e-05, + "loss": 5.7778, + "loss/crossentropy": 2.7884849309921265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676440745592117, + "step": 2256 + }, + { + "epoch": 0.18816666666666668, + "grad_norm": 5.5, + "grad_norm_var": 0.06968994140625, + "learning_rate": 4e-05, + "loss": 5.0168, + "loss/crossentropy": 1.3894076570868492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17267927527427673, + "step": 2258 + }, + { + "epoch": 0.18833333333333332, + "grad_norm": 4.71875, + "grad_norm_var": 0.079150390625, + "learning_rate": 4e-05, + "loss": 5.0644, + "loss/crossentropy": 1.7085940018296242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17589740082621574, + "step": 2260 + }, + { + "epoch": 0.1885, + "grad_norm": 5.125, + "grad_norm_var": 0.07649332682291667, + "learning_rate": 4e-05, + "loss": 4.6788, + "loss/crossentropy": 2.5399693846702576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22661524266004562, + "step": 2262 + }, + { + "epoch": 0.18866666666666668, + "grad_norm": 4.375, + "grad_norm_var": 0.10227457682291667, + "learning_rate": 4e-05, + "loss": 4.3746, + "loss/crossentropy": 1.6717079058289528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17274832166731358, + "step": 2264 + }, + { + "epoch": 0.18883333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.104296875, + "learning_rate": 4e-05, + "loss": 5.2109, + "loss/crossentropy": 2.150670550763607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020272258669138, + "step": 2266 + }, + { + "epoch": 0.189, + "grad_norm": 5.625, + "grad_norm_var": 0.1154296875, + "learning_rate": 4e-05, + "loss": 5.4183, + "loss/crossentropy": 1.5393069609999657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19283157959580421, + "step": 2268 + }, + { + "epoch": 0.18916666666666668, + "grad_norm": 5.28125, + "grad_norm_var": 0.103369140625, + "learning_rate": 4e-05, + "loss": 5.1102, + "loss/crossentropy": 1.8512317463755608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18179438635706902, + "step": 2270 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.11417643229166667, + "learning_rate": 4e-05, + "loss": 4.9907, + "loss/crossentropy": 1.8801306560635567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18935791589319706, + "step": 2272 + }, + { + "epoch": 0.1895, + "grad_norm": 5.0, + "grad_norm_var": 0.09908447265625, + "learning_rate": 4e-05, + "loss": 5.1938, + "loss/crossentropy": 1.6166588142514229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16408336907625198, + "step": 2274 + }, + { + "epoch": 0.18966666666666668, + "grad_norm": 4.875, + "grad_norm_var": 0.098291015625, + "learning_rate": 4e-05, + "loss": 4.4079, + "loss/crossentropy": 1.650967113673687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15877833776175976, + "step": 2276 + }, + { + "epoch": 0.18983333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.101953125, + "learning_rate": 4e-05, + "loss": 5.4296, + "loss/crossentropy": 2.6282625794410706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21001841127872467, + "step": 2278 + }, + { + "epoch": 0.19, + "grad_norm": 5.6875, + "grad_norm_var": 0.10572509765625, + "learning_rate": 4e-05, + "loss": 5.0226, + "loss/crossentropy": 1.8326758667826653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20088370144367218, + "step": 2280 + }, + { + "epoch": 0.19016666666666668, + "grad_norm": 5.4375, + "grad_norm_var": 0.10113525390625, + "learning_rate": 4e-05, + "loss": 5.1215, + "loss/crossentropy": 2.404378890991211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352002039551735, + "step": 2282 + }, + { + "epoch": 0.19033333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.09342447916666667, + "learning_rate": 4e-05, + "loss": 4.2729, + "loss/crossentropy": 1.60519190877676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19760267436504364, + "step": 2284 + }, + { + "epoch": 0.1905, + "grad_norm": 5.1875, + "grad_norm_var": 0.08205973307291667, + "learning_rate": 4e-05, + "loss": 4.9924, + "loss/crossentropy": 1.188548594713211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14038443379104137, + "step": 2286 + }, + { + "epoch": 0.19066666666666668, + "grad_norm": 5.46875, + "grad_norm_var": 0.14068603515625, + "learning_rate": 4e-05, + "loss": 5.083, + "loss/crossentropy": 2.4681698083877563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23486436530947685, + "step": 2288 + }, + { + "epoch": 0.19083333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.13886311848958333, + "learning_rate": 4e-05, + "loss": 5.2423, + "loss/crossentropy": 2.2314860820770264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190067425370216, + "step": 2290 + }, + { + "epoch": 0.191, + "grad_norm": 5.40625, + "grad_norm_var": 0.12511393229166667, + "learning_rate": 4e-05, + "loss": 5.0995, + "loss/crossentropy": 1.8339603021740913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18039636500179768, + "step": 2292 + }, + { + "epoch": 0.19116666666666668, + "grad_norm": 5.4375, + "grad_norm_var": 0.13111572265625, + "learning_rate": 4e-05, + "loss": 4.3892, + "loss/crossentropy": 1.6122345626354218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16903558000922203, + "step": 2294 + }, + { + "epoch": 0.19133333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.13097330729166667, + "learning_rate": 4e-05, + "loss": 5.1152, + "loss/crossentropy": 1.9905153512954712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18592159822583199, + "step": 2296 + }, + { + "epoch": 0.1915, + "grad_norm": 5.15625, + "grad_norm_var": 0.13268229166666667, + "learning_rate": 4e-05, + "loss": 5.1794, + "loss/crossentropy": 2.347501277923584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232624776661396, + "step": 2298 + }, + { + "epoch": 0.19166666666666668, + "grad_norm": 5.28125, + "grad_norm_var": 0.14368489583333333, + "learning_rate": 4e-05, + "loss": 4.485, + "loss/crossentropy": 1.5246716812252998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17598333582282066, + "step": 2300 + }, + { + "epoch": 0.19183333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.15271809895833333, + "learning_rate": 4e-05, + "loss": 4.8725, + "loss/crossentropy": 2.488103985786438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2659139633178711, + "step": 2302 + }, + { + "epoch": 0.192, + "grad_norm": 5.125, + "grad_norm_var": 0.09537760416666667, + "learning_rate": 4e-05, + "loss": 4.7235, + "loss/crossentropy": 2.192271262407303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22700363397598267, + "step": 2304 + }, + { + "epoch": 0.19216666666666668, + "grad_norm": 5.0625, + "grad_norm_var": 0.11910400390625, + "learning_rate": 4e-05, + "loss": 4.8076, + "loss/crossentropy": 2.3486633598804474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21487021818757057, + "step": 2306 + }, + { + "epoch": 0.19233333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.7286458333333333, + "learning_rate": 4e-05, + "loss": 4.8531, + "loss/crossentropy": 1.9830282926559448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20326359942555428, + "step": 2308 + }, + { + "epoch": 0.1925, + "grad_norm": 4.59375, + "grad_norm_var": 0.7384765625, + "learning_rate": 4e-05, + "loss": 4.583, + "loss/crossentropy": 1.994844913482666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19658523239195347, + "step": 2310 + }, + { + "epoch": 0.19266666666666668, + "grad_norm": 5.03125, + "grad_norm_var": 0.7403605143229167, + "learning_rate": 4e-05, + "loss": 4.6763, + "loss/crossentropy": 2.076766610145569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21167897433042526, + "step": 2312 + }, + { + "epoch": 0.19283333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.7406209309895834, + "learning_rate": 4e-05, + "loss": 5.2687, + "loss/crossentropy": 1.9793154150247574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1830280590802431, + "step": 2314 + }, + { + "epoch": 0.193, + "grad_norm": 5.125, + "grad_norm_var": 0.7391764322916666, + "learning_rate": 4e-05, + "loss": 5.642, + "loss/crossentropy": 2.6030354499816895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964925155043602, + "step": 2316 + }, + { + "epoch": 0.19316666666666665, + "grad_norm": 5.0, + "grad_norm_var": 0.7379191080729167, + "learning_rate": 4e-05, + "loss": 5.1509, + "loss/crossentropy": 1.6609367281198502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19058697298169136, + "step": 2318 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.7442545572916667, + "learning_rate": 4e-05, + "loss": 4.8977, + "loss/crossentropy": 1.2792168036103249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.160741800442338, + "step": 2320 + }, + { + "epoch": 0.1935, + "grad_norm": 4.78125, + "grad_norm_var": 0.7247355143229167, + "learning_rate": 4e-05, + "loss": 4.5242, + "loss/crossentropy": 1.762831062078476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17957177013158798, + "step": 2322 + }, + { + "epoch": 0.19366666666666665, + "grad_norm": 5.09375, + "grad_norm_var": 0.06565348307291667, + "learning_rate": 4e-05, + "loss": 5.2638, + "loss/crossentropy": 2.090299479663372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19620474614202976, + "step": 2324 + }, + { + "epoch": 0.19383333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.06474202473958333, + "learning_rate": 4e-05, + "loss": 5.1114, + "loss/crossentropy": 2.4732211232185364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23878782615065575, + "step": 2326 + }, + { + "epoch": 0.194, + "grad_norm": 5.03125, + "grad_norm_var": 0.06474202473958333, + "learning_rate": 4e-05, + "loss": 5.012, + "loss/crossentropy": 1.8278373926877975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19272247329354286, + "step": 2328 + }, + { + "epoch": 0.19416666666666665, + "grad_norm": 4.65625, + "grad_norm_var": 0.07177327473958334, + "learning_rate": 4e-05, + "loss": 5.0843, + "loss/crossentropy": 2.1059842854738235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18917707912623882, + "step": 2330 + }, + { + "epoch": 0.19433333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.05779622395833333, + "learning_rate": 4e-05, + "loss": 4.6124, + "loss/crossentropy": 2.4292266964912415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2285252884030342, + "step": 2332 + }, + { + "epoch": 0.1945, + "grad_norm": 4.96875, + "grad_norm_var": 0.05810139973958333, + "learning_rate": 4e-05, + "loss": 4.9482, + "loss/crossentropy": 2.253578156232834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206546410918236, + "step": 2334 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.04045817057291667, + "learning_rate": 4e-05, + "loss": 4.3528, + "loss/crossentropy": 2.476481080055237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20762532204389572, + "step": 2336 + }, + { + "epoch": 0.19483333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.24381510416666666, + "learning_rate": 4e-05, + "loss": 5.4346, + "loss/crossentropy": 2.1690665781497955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23577401787042618, + "step": 2338 + }, + { + "epoch": 0.195, + "grad_norm": 5.25, + "grad_norm_var": 0.24620768229166667, + "learning_rate": 4e-05, + "loss": 5.4965, + "loss/crossentropy": 2.1100385785102844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158173955976963, + "step": 2340 + }, + { + "epoch": 0.19516666666666665, + "grad_norm": 5.3125, + "grad_norm_var": 0.2534993489583333, + "learning_rate": 4e-05, + "loss": 4.904, + "loss/crossentropy": 1.9950718879699707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088406071066856, + "step": 2342 + }, + { + "epoch": 0.19533333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.25813395182291665, + "learning_rate": 4e-05, + "loss": 5.2952, + "loss/crossentropy": 2.032729558646679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18892317451536655, + "step": 2344 + }, + { + "epoch": 0.1955, + "grad_norm": 4.96875, + "grad_norm_var": 0.24455973307291667, + "learning_rate": 4e-05, + "loss": 4.843, + "loss/crossentropy": 2.043039858341217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19413789920508862, + "step": 2346 + }, + { + "epoch": 0.19566666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.21181233723958334, + "learning_rate": 4e-05, + "loss": 5.0166, + "loss/crossentropy": 2.136802703142166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046213485300541, + "step": 2348 + }, + { + "epoch": 0.19583333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.20038655598958333, + "learning_rate": 4e-05, + "loss": 5.1836, + "loss/crossentropy": 1.9726791083812714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19684670493006706, + "step": 2350 + }, + { + "epoch": 0.196, + "grad_norm": 5.0, + "grad_norm_var": 0.18697916666666667, + "learning_rate": 4e-05, + "loss": 4.5345, + "loss/crossentropy": 1.5553816556930542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23889531195163727, + "step": 2352 + }, + { + "epoch": 0.19616666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.09179280598958334, + "learning_rate": 4e-05, + "loss": 4.2414, + "loss/crossentropy": 1.5767273381352425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16629118844866753, + "step": 2354 + }, + { + "epoch": 0.19633333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.105859375, + "learning_rate": 4e-05, + "loss": 4.545, + "loss/crossentropy": 1.3365295231342316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1607996355742216, + "step": 2356 + }, + { + "epoch": 0.1965, + "grad_norm": 5.125, + "grad_norm_var": 0.08943684895833333, + "learning_rate": 4e-05, + "loss": 5.5573, + "loss/crossentropy": 1.968780405819416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766455788165331, + "step": 2358 + }, + { + "epoch": 0.19666666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.07890218098958333, + "learning_rate": 4e-05, + "loss": 5.5475, + "loss/crossentropy": 1.7819099575281143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18220307305455208, + "step": 2360 + }, + { + "epoch": 0.19683333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.09217122395833334, + "learning_rate": 4e-05, + "loss": 5.383, + "loss/crossentropy": 2.179343730211258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25340667366981506, + "step": 2362 + }, + { + "epoch": 0.197, + "grad_norm": 4.96875, + "grad_norm_var": 0.089697265625, + "learning_rate": 4e-05, + "loss": 5.4171, + "loss/crossentropy": 2.6181305050849915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063765563070774, + "step": 2364 + }, + { + "epoch": 0.19716666666666666, + "grad_norm": 5.5, + "grad_norm_var": 0.4177734375, + "learning_rate": 4e-05, + "loss": 5.3798, + "loss/crossentropy": 2.5250502824783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21604949235916138, + "step": 2366 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.413525390625, + "learning_rate": 4e-05, + "loss": 4.6161, + "loss/crossentropy": 1.679262101650238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18522610887885094, + "step": 2368 + }, + { + "epoch": 0.1975, + "grad_norm": 5.0, + "grad_norm_var": 0.35937093098958334, + "learning_rate": 4e-05, + "loss": 4.7427, + "loss/crossentropy": 1.8730647563934326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18919868022203445, + "step": 2370 + }, + { + "epoch": 0.19766666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.32896728515625, + "learning_rate": 4e-05, + "loss": 4.9464, + "loss/crossentropy": 1.8721271231770515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19726279936730862, + "step": 2372 + }, + { + "epoch": 0.19783333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.3389973958333333, + "learning_rate": 4e-05, + "loss": 4.9454, + "loss/crossentropy": 2.39280566573143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158326916396618, + "step": 2374 + }, + { + "epoch": 0.198, + "grad_norm": 5.0, + "grad_norm_var": 0.3389973958333333, + "learning_rate": 4e-05, + "loss": 5.2534, + "loss/crossentropy": 2.6808955669403076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230102077126503, + "step": 2376 + }, + { + "epoch": 0.19816666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.3664347330729167, + "learning_rate": 4e-05, + "loss": 4.9032, + "loss/crossentropy": 2.062427654862404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18818776682019234, + "step": 2378 + }, + { + "epoch": 0.19833333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.36503499348958335, + "learning_rate": 4e-05, + "loss": 4.4601, + "loss/crossentropy": 0.9793087244033813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11906297132372856, + "step": 2380 + }, + { + "epoch": 0.1985, + "grad_norm": 5.4375, + "grad_norm_var": 0.05455729166666667, + "learning_rate": 4e-05, + "loss": 4.8264, + "loss/crossentropy": 2.2848470509052277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223106287419796, + "step": 2382 + }, + { + "epoch": 0.19866666666666666, + "grad_norm": 5.65625, + "grad_norm_var": 0.07232666015625, + "learning_rate": 4e-05, + "loss": 4.9411, + "loss/crossentropy": 2.03727525472641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002503089606762, + "step": 2384 + }, + { + "epoch": 0.19883333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.06990559895833333, + "learning_rate": 4e-05, + "loss": 4.7408, + "loss/crossentropy": 2.0407300665974617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17866826429963112, + "step": 2386 + }, + { + "epoch": 0.199, + "grad_norm": 4.875, + "grad_norm_var": 0.06881510416666667, + "learning_rate": 4e-05, + "loss": 4.6838, + "loss/crossentropy": 1.9522857144474983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880640648305416, + "step": 2388 + }, + { + "epoch": 0.19916666666666666, + "grad_norm": 5.34375, + "grad_norm_var": 0.07489827473958334, + "learning_rate": 4e-05, + "loss": 4.1322, + "loss/crossentropy": 1.9232835546135902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843096949160099, + "step": 2390 + }, + { + "epoch": 0.19933333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.08131103515625, + "learning_rate": 4e-05, + "loss": 4.9668, + "loss/crossentropy": 2.4705487489700317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21942654252052307, + "step": 2392 + }, + { + "epoch": 0.1995, + "grad_norm": 5.78125, + "grad_norm_var": 0.11480712890625, + "learning_rate": 4e-05, + "loss": 4.046, + "loss/crossentropy": 1.8307190835475922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106981799006462, + "step": 2394 + }, + { + "epoch": 0.19966666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.116650390625, + "learning_rate": 4e-05, + "loss": 5.0327, + "loss/crossentropy": 1.923123762011528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18329189345240593, + "step": 2396 + }, + { + "epoch": 0.19983333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.11988525390625, + "learning_rate": 4e-05, + "loss": 4.2767, + "loss/crossentropy": 1.2369297593832016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16844778135418892, + "step": 2398 + }, + { + "epoch": 0.2, + "grad_norm": 5.1875, + "grad_norm_var": 0.094140625, + "learning_rate": 4e-05, + "loss": 5.1696, + "loss/crossentropy": 2.8806328773498535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21086610108613968, + "step": 2400 + }, + { + "epoch": 0.20016666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.10504150390625, + "learning_rate": 4e-05, + "loss": 5.0176, + "loss/crossentropy": 1.6765633448958397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1639060154557228, + "step": 2402 + }, + { + "epoch": 0.20033333333333334, + "grad_norm": 5.53125, + "grad_norm_var": 0.14058837890625, + "learning_rate": 4e-05, + "loss": 4.9152, + "loss/crossentropy": 1.8263401091098785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19616793654859066, + "step": 2404 + }, + { + "epoch": 0.2005, + "grad_norm": 5.34375, + "grad_norm_var": 0.142041015625, + "learning_rate": 4e-05, + "loss": 5.5261, + "loss/crossentropy": 2.289825439453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241990529000759, + "step": 2406 + }, + { + "epoch": 0.20066666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.14482014973958332, + "learning_rate": 4e-05, + "loss": 5.0313, + "loss/crossentropy": 2.223125606775284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748021453619003, + "step": 2408 + }, + { + "epoch": 0.20083333333333334, + "grad_norm": 5.625, + "grad_norm_var": 0.120166015625, + "learning_rate": 4e-05, + "loss": 4.7346, + "loss/crossentropy": 2.1327845007181168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936967596411705, + "step": 2410 + }, + { + "epoch": 0.201, + "grad_norm": 5.1875, + "grad_norm_var": 0.11864827473958334, + "learning_rate": 4e-05, + "loss": 5.4438, + "loss/crossentropy": 2.612125277519226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22732067853212357, + "step": 2412 + }, + { + "epoch": 0.20116666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.10035400390625, + "learning_rate": 4e-05, + "loss": 4.7912, + "loss/crossentropy": 1.7989770472049713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17794376984238625, + "step": 2414 + }, + { + "epoch": 0.20133333333333334, + "grad_norm": 5.3125, + "grad_norm_var": 0.10250244140625, + "learning_rate": 4e-05, + "loss": 5.2793, + "loss/crossentropy": 1.7428071647882462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1840539537370205, + "step": 2416 + }, + { + "epoch": 0.2015, + "grad_norm": 5.1875, + "grad_norm_var": 0.08040364583333333, + "learning_rate": 4e-05, + "loss": 5.137, + "loss/crossentropy": 2.491989552974701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21401550620794296, + "step": 2418 + }, + { + "epoch": 0.20166666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.07121988932291666, + "learning_rate": 4e-05, + "loss": 5.1646, + "loss/crossentropy": 1.4126268327236176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16175073012709618, + "step": 2420 + }, + { + "epoch": 0.20183333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.07980143229166667, + "learning_rate": 4e-05, + "loss": 4.6839, + "loss/crossentropy": 2.006320595741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155579999089241, + "step": 2422 + }, + { + "epoch": 0.202, + "grad_norm": 4.96875, + "grad_norm_var": 0.075634765625, + "learning_rate": 4e-05, + "loss": 5.0565, + "loss/crossentropy": 2.3960747718811035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939377263188362, + "step": 2424 + }, + { + "epoch": 0.20216666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.055924479166666666, + "learning_rate": 4e-05, + "loss": 4.5787, + "loss/crossentropy": 1.9523345828056335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18938801437616348, + "step": 2426 + }, + { + "epoch": 0.20233333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.05636393229166667, + "learning_rate": 4e-05, + "loss": 4.8534, + "loss/crossentropy": 2.0117806047201157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1813819408416748, + "step": 2428 + }, + { + "epoch": 0.2025, + "grad_norm": 5.1875, + "grad_norm_var": 0.07122395833333334, + "learning_rate": 4e-05, + "loss": 4.7015, + "loss/crossentropy": 1.265286423265934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14998789504170418, + "step": 2430 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.059403483072916666, + "learning_rate": 4e-05, + "loss": 5.1934, + "loss/crossentropy": 2.120694190263748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23458874225616455, + "step": 2432 + }, + { + "epoch": 0.20283333333333334, + "grad_norm": 5.625, + "grad_norm_var": 0.08866780598958333, + "learning_rate": 4e-05, + "loss": 4.2152, + "loss/crossentropy": 0.7457961067557335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11005957797169685, + "step": 2434 + }, + { + "epoch": 0.203, + "grad_norm": 5.03125, + "grad_norm_var": 0.074462890625, + "learning_rate": 4e-05, + "loss": 5.2149, + "loss/crossentropy": 1.8204586580395699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17762837558984756, + "step": 2436 + }, + { + "epoch": 0.20316666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.08463134765625, + "learning_rate": 4e-05, + "loss": 5.2604, + "loss/crossentropy": 2.4990166425704956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22241423279047012, + "step": 2438 + }, + { + "epoch": 0.20333333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.08203125, + "learning_rate": 4e-05, + "loss": 4.7264, + "loss/crossentropy": 1.5507011637091637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16992885246872902, + "step": 2440 + }, + { + "epoch": 0.2035, + "grad_norm": 5.0, + "grad_norm_var": 0.07978108723958334, + "learning_rate": 4e-05, + "loss": 4.5444, + "loss/crossentropy": 1.4435075148940086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15607544034719467, + "step": 2442 + }, + { + "epoch": 0.20366666666666666, + "grad_norm": 5.5, + "grad_norm_var": 0.08411458333333334, + "learning_rate": 4e-05, + "loss": 4.8557, + "loss/crossentropy": 2.127426564693451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19914470985531807, + "step": 2444 + }, + { + "epoch": 0.20383333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.06495768229166667, + "learning_rate": 4e-05, + "loss": 5.2658, + "loss/crossentropy": 2.2032350599765778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21544507518410683, + "step": 2446 + }, + { + "epoch": 0.204, + "grad_norm": 5.0625, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 4e-05, + "loss": 4.762, + "loss/crossentropy": 1.4311346858739853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1594764105975628, + "step": 2448 + }, + { + "epoch": 0.20416666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.037093098958333334, + "learning_rate": 4e-05, + "loss": 5.1773, + "loss/crossentropy": 2.3031201362609863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23431189358234406, + "step": 2450 + }, + { + "epoch": 0.20433333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.047163899739583334, + "learning_rate": 4e-05, + "loss": 4.7683, + "loss/crossentropy": 1.0590153932571411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14155251905322075, + "step": 2452 + }, + { + "epoch": 0.2045, + "grad_norm": 4.78125, + "grad_norm_var": 0.03553059895833333, + "learning_rate": 4e-05, + "loss": 4.7823, + "loss/crossentropy": 2.6489208340644836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22386395931243896, + "step": 2454 + }, + { + "epoch": 0.20466666666666666, + "grad_norm": 5.625, + "grad_norm_var": 0.056233723958333336, + "learning_rate": 4e-05, + "loss": 4.9762, + "loss/crossentropy": 1.5068995282053947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19015095010399818, + "step": 2456 + }, + { + "epoch": 0.20483333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.05579020182291667, + "learning_rate": 4e-05, + "loss": 4.8357, + "loss/crossentropy": 2.2568003833293915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21340489014983177, + "step": 2458 + }, + { + "epoch": 0.205, + "grad_norm": 4.90625, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 4e-05, + "loss": 4.8352, + "loss/crossentropy": 2.2082974314689636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341243252158165, + "step": 2460 + }, + { + "epoch": 0.20516666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 4e-05, + "loss": 5.3281, + "loss/crossentropy": 2.6525495648384094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21573406457901, + "step": 2462 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.049051920572916664, + "learning_rate": 4e-05, + "loss": 4.479, + "loss/crossentropy": 1.88506181538105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18165216967463493, + "step": 2464 + }, + { + "epoch": 0.2055, + "grad_norm": 4.96875, + "grad_norm_var": 0.05089518229166667, + "learning_rate": 4e-05, + "loss": 4.6123, + "loss/crossentropy": 0.9983108341693878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15218045935034752, + "step": 2466 + }, + { + "epoch": 0.20566666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.0470703125, + "learning_rate": 4e-05, + "loss": 5.4004, + "loss/crossentropy": 2.5370147228240967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2264598123729229, + "step": 2468 + }, + { + "epoch": 0.20583333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.092822265625, + "learning_rate": 4e-05, + "loss": 5.4403, + "loss/crossentropy": 2.3860780596733093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223365642130375, + "step": 2470 + }, + { + "epoch": 0.206, + "grad_norm": 5.0, + "grad_norm_var": 0.07654622395833334, + "learning_rate": 4e-05, + "loss": 5.4358, + "loss/crossentropy": 2.091231919825077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188719617202878, + "step": 2472 + }, + { + "epoch": 0.20616666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.08943684895833333, + "learning_rate": 4e-05, + "loss": 4.1961, + "loss/crossentropy": 2.3965645730495453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22580226883292198, + "step": 2474 + }, + { + "epoch": 0.20633333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.11350504557291667, + "learning_rate": 4e-05, + "loss": 4.7299, + "loss/crossentropy": 1.1824834942817688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1517321616411209, + "step": 2476 + }, + { + "epoch": 0.2065, + "grad_norm": 5.09375, + "grad_norm_var": 0.12706705729166667, + "learning_rate": 4e-05, + "loss": 4.9486, + "loss/crossentropy": 2.443815290927887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23438604921102524, + "step": 2478 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.14342041015625, + "learning_rate": 4e-05, + "loss": 4.6524, + "loss/crossentropy": 1.9140185862779617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21477340161800385, + "step": 2480 + }, + { + "epoch": 0.20683333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.14384358723958332, + "learning_rate": 4e-05, + "loss": 5.1093, + "loss/crossentropy": 2.3452938199043274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23434526473283768, + "step": 2482 + }, + { + "epoch": 0.207, + "grad_norm": 4.875, + "grad_norm_var": 0.14306233723958334, + "learning_rate": 4e-05, + "loss": 4.3748, + "loss/crossentropy": 1.414752148091793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15633751079440117, + "step": 2484 + }, + { + "epoch": 0.20716666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.07537434895833334, + "learning_rate": 4e-05, + "loss": 5.2316, + "loss/crossentropy": 2.1838470697402954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045600563287735, + "step": 2486 + }, + { + "epoch": 0.20733333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.07502848307291667, + "learning_rate": 4e-05, + "loss": 4.6684, + "loss/crossentropy": 1.1734877079725266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1591203808784485, + "step": 2488 + }, + { + "epoch": 0.2075, + "grad_norm": 5.0, + "grad_norm_var": 0.08381754557291667, + "learning_rate": 4e-05, + "loss": 4.6216, + "loss/crossentropy": 2.521151304244995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22322241961956024, + "step": 2490 + }, + { + "epoch": 0.20766666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.056494140625, + "learning_rate": 4e-05, + "loss": 5.1962, + "loss/crossentropy": 1.9911609292030334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19882928766310215, + "step": 2492 + }, + { + "epoch": 0.20783333333333334, + "grad_norm": 5.40625, + "grad_norm_var": 0.05728759765625, + "learning_rate": 4e-05, + "loss": 4.8796, + "loss/crossentropy": 1.5489679425954819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.168511301279068, + "step": 2494 + }, + { + "epoch": 0.208, + "grad_norm": 5.15625, + "grad_norm_var": 0.03863525390625, + "learning_rate": 4e-05, + "loss": 4.9423, + "loss/crossentropy": 2.0905182361602783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21954334527254105, + "step": 2496 + }, + { + "epoch": 0.20816666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.037430826822916666, + "learning_rate": 4e-05, + "loss": 4.897, + "loss/crossentropy": 2.207546591758728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22128864005208015, + "step": 2498 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 6.0625, + "grad_norm_var": 0.10032552083333333, + "learning_rate": 4e-05, + "loss": 5.2691, + "loss/crossentropy": 1.9401999711990356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22317113354802132, + "step": 2500 + }, + { + "epoch": 0.2085, + "grad_norm": 4.78125, + "grad_norm_var": 0.104150390625, + "learning_rate": 4e-05, + "loss": 4.6666, + "loss/crossentropy": 1.600251205265522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15894647873938084, + "step": 2502 + }, + { + "epoch": 0.20866666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.10896809895833333, + "learning_rate": 4e-05, + "loss": 4.9524, + "loss/crossentropy": 1.9892296642065048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17438082955777645, + "step": 2504 + }, + { + "epoch": 0.20883333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.09537760416666667, + "learning_rate": 4e-05, + "loss": 4.8822, + "loss/crossentropy": 1.401974692940712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15046437829732895, + "step": 2506 + }, + { + "epoch": 0.209, + "grad_norm": 5.15625, + "grad_norm_var": 0.102197265625, + "learning_rate": 4e-05, + "loss": 5.2628, + "loss/crossentropy": 1.6273729652166367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18275013752281666, + "step": 2508 + }, + { + "epoch": 0.20916666666666667, + "grad_norm": 5.59375, + "grad_norm_var": 0.11470947265625, + "learning_rate": 4e-05, + "loss": 5.2575, + "loss/crossentropy": 2.275018572807312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178800217807293, + "step": 2510 + }, + { + "epoch": 0.20933333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.12200113932291666, + "learning_rate": 4e-05, + "loss": 5.1247, + "loss/crossentropy": 1.5231431126594543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15373231656849384, + "step": 2512 + }, + { + "epoch": 0.2095, + "grad_norm": 5.46875, + "grad_norm_var": 0.14397379557291667, + "learning_rate": 4e-05, + "loss": 5.4726, + "loss/crossentropy": 2.2370805740356445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20047016441822052, + "step": 2514 + }, + { + "epoch": 0.20966666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.09107666015625, + "learning_rate": 4e-05, + "loss": 4.7663, + "loss/crossentropy": 1.8152910470962524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18139103800058365, + "step": 2516 + }, + { + "epoch": 0.20983333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.10071614583333334, + "learning_rate": 4e-05, + "loss": 4.5476, + "loss/crossentropy": 0.983028382062912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11749066784977913, + "step": 2518 + }, + { + "epoch": 0.21, + "grad_norm": 5.03125, + "grad_norm_var": 0.09244384765625, + "learning_rate": 4e-05, + "loss": 5.1911, + "loss/crossentropy": 2.3584609627723694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22418444603681564, + "step": 2520 + }, + { + "epoch": 0.21016666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.09595947265625, + "learning_rate": 4e-05, + "loss": 4.8606, + "loss/crossentropy": 2.3720744848251343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562008023262024, + "step": 2522 + }, + { + "epoch": 0.21033333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.08292643229166667, + "learning_rate": 4e-05, + "loss": 4.9864, + "loss/crossentropy": 0.9131257832050323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11408952996134758, + "step": 2524 + }, + { + "epoch": 0.2105, + "grad_norm": 5.28125, + "grad_norm_var": 0.06822916666666666, + "learning_rate": 4e-05, + "loss": 4.9971, + "loss/crossentropy": 1.5935637727379799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20797242410480976, + "step": 2526 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.06370035807291667, + "learning_rate": 4e-05, + "loss": 5.2704, + "loss/crossentropy": 2.1786339581012726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19719019532203674, + "step": 2528 + }, + { + "epoch": 0.21083333333333334, + "grad_norm": 5.4375, + "grad_norm_var": 0.0521484375, + "learning_rate": 4e-05, + "loss": 5.4812, + "loss/crossentropy": 2.0704859495162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22412089258432388, + "step": 2530 + }, + { + "epoch": 0.211, + "grad_norm": 5.09375, + "grad_norm_var": 0.04208577473958333, + "learning_rate": 4e-05, + "loss": 5.3409, + "loss/crossentropy": 2.5442384481430054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23552602529525757, + "step": 2532 + }, + { + "epoch": 0.21116666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.025386555989583334, + "learning_rate": 4e-05, + "loss": 5.168, + "loss/crossentropy": 2.3277163207530975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21133242174983025, + "step": 2534 + }, + { + "epoch": 0.21133333333333335, + "grad_norm": 5.09375, + "grad_norm_var": 0.026786295572916667, + "learning_rate": 4e-05, + "loss": 4.81, + "loss/crossentropy": 2.286676347255707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255946770310402, + "step": 2536 + }, + { + "epoch": 0.2115, + "grad_norm": 5.09375, + "grad_norm_var": 0.023111979166666668, + "learning_rate": 4e-05, + "loss": 4.725, + "loss/crossentropy": 1.7959834411740303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922011747956276, + "step": 2538 + }, + { + "epoch": 0.21166666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.024149576822916668, + "learning_rate": 4e-05, + "loss": 5.055, + "loss/crossentropy": 2.4185322523117065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21328793838620186, + "step": 2540 + }, + { + "epoch": 0.21183333333333335, + "grad_norm": 4.96875, + "grad_norm_var": 0.019254557291666665, + "learning_rate": 4e-05, + "loss": 5.0846, + "loss/crossentropy": 2.1555165350437164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327948994934559, + "step": 2542 + }, + { + "epoch": 0.212, + "grad_norm": 5.5, + "grad_norm_var": 0.04269205729166667, + "learning_rate": 4e-05, + "loss": 5.2851, + "loss/crossentropy": 1.4759873449802399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15248359367251396, + "step": 2544 + }, + { + "epoch": 0.21216666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.031884765625, + "learning_rate": 4e-05, + "loss": 4.7554, + "loss/crossentropy": 1.4834392219781876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17983658611774445, + "step": 2546 + }, + { + "epoch": 0.21233333333333335, + "grad_norm": 5.46875, + "grad_norm_var": 0.043603515625, + "learning_rate": 4e-05, + "loss": 5.007, + "loss/crossentropy": 1.708244226872921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18391894362866879, + "step": 2548 + }, + { + "epoch": 0.2125, + "grad_norm": 5.0625, + "grad_norm_var": 0.04400634765625, + "learning_rate": 4e-05, + "loss": 4.9281, + "loss/crossentropy": 1.6128144562244415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1744004562497139, + "step": 2550 + }, + { + "epoch": 0.21266666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.059098307291666666, + "learning_rate": 4e-05, + "loss": 4.9651, + "loss/crossentropy": 1.640898883342743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18997144512832165, + "step": 2552 + }, + { + "epoch": 0.21283333333333335, + "grad_norm": 5.4375, + "grad_norm_var": 0.08271077473958334, + "learning_rate": 4e-05, + "loss": 5.2758, + "loss/crossentropy": 1.6650393679738045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1907278336584568, + "step": 2554 + }, + { + "epoch": 0.213, + "grad_norm": 5.46875, + "grad_norm_var": 0.09267171223958333, + "learning_rate": 4e-05, + "loss": 5.3275, + "loss/crossentropy": 2.447467267513275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21921542286872864, + "step": 2556 + }, + { + "epoch": 0.21316666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.09659830729166667, + "learning_rate": 4e-05, + "loss": 5.2095, + "loss/crossentropy": 2.4506001472473145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22498662024736404, + "step": 2558 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 5.34375, + "grad_norm_var": 0.09029947916666667, + "learning_rate": 4e-05, + "loss": 5.4234, + "loss/crossentropy": 2.3685405254364014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21306321397423744, + "step": 2560 + }, + { + "epoch": 0.2135, + "grad_norm": 5.21875, + "grad_norm_var": 0.10598958333333333, + "learning_rate": 4e-05, + "loss": 4.6514, + "loss/crossentropy": 2.0201190412044525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936206892132759, + "step": 2562 + }, + { + "epoch": 0.21366666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.09833577473958334, + "learning_rate": 4e-05, + "loss": 5.7124, + "loss/crossentropy": 2.7159610986709595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22015909105539322, + "step": 2564 + }, + { + "epoch": 0.21383333333333332, + "grad_norm": 4.84375, + "grad_norm_var": 0.10123697916666667, + "learning_rate": 4e-05, + "loss": 4.8923, + "loss/crossentropy": 2.1644165217876434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22002530470490456, + "step": 2566 + }, + { + "epoch": 0.214, + "grad_norm": 5.34375, + "grad_norm_var": 0.0859375, + "learning_rate": 4e-05, + "loss": 4.6498, + "loss/crossentropy": 1.9540190249681473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15238827653229237, + "step": 2568 + }, + { + "epoch": 0.21416666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.22779947916666668, + "learning_rate": 4e-05, + "loss": 5.609, + "loss/crossentropy": 2.4001490473747253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24488025903701782, + "step": 2570 + }, + { + "epoch": 0.21433333333333332, + "grad_norm": 5.03125, + "grad_norm_var": 0.23202718098958333, + "learning_rate": 4e-05, + "loss": 5.2706, + "loss/crossentropy": 2.3217179775238037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21956229209899902, + "step": 2572 + }, + { + "epoch": 0.2145, + "grad_norm": 4.90625, + "grad_norm_var": 0.23212483723958333, + "learning_rate": 4e-05, + "loss": 4.9006, + "loss/crossentropy": 1.3002420365810394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17373565584421158, + "step": 2574 + }, + { + "epoch": 0.21466666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.24062093098958334, + "learning_rate": 4e-05, + "loss": 4.8351, + "loss/crossentropy": 2.2719730138778687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139144465327263, + "step": 2576 + }, + { + "epoch": 0.21483333333333332, + "grad_norm": 4.9375, + "grad_norm_var": 0.21861572265625, + "learning_rate": 4e-05, + "loss": 4.6618, + "loss/crossentropy": 1.901275411248207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19540333189070225, + "step": 2578 + }, + { + "epoch": 0.215, + "grad_norm": 5.3125, + "grad_norm_var": 0.23121337890625, + "learning_rate": 4e-05, + "loss": 5.1308, + "loss/crossentropy": 2.380274325609207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21984224021434784, + "step": 2580 + }, + { + "epoch": 0.21516666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.22320556640625, + "learning_rate": 4e-05, + "loss": 4.8584, + "loss/crossentropy": 1.8006494864821434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16921357065439224, + "step": 2582 + }, + { + "epoch": 0.21533333333333332, + "grad_norm": 5.3125, + "grad_norm_var": 0.22681884765625, + "learning_rate": 4e-05, + "loss": 4.9215, + "loss/crossentropy": 2.525661528110504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218607347458601, + "step": 2584 + }, + { + "epoch": 0.2155, + "grad_norm": 5.3125, + "grad_norm_var": 0.06099853515625, + "learning_rate": 4e-05, + "loss": 4.1835, + "loss/crossentropy": 2.3541648387908936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644888073205948, + "step": 2586 + }, + { + "epoch": 0.21566666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.05845947265625, + "learning_rate": 4e-05, + "loss": 5.0687, + "loss/crossentropy": 2.440452992916107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21727336198091507, + "step": 2588 + }, + { + "epoch": 0.21583333333333332, + "grad_norm": 5.0, + "grad_norm_var": 0.05611572265625, + "learning_rate": 4e-05, + "loss": 4.4566, + "loss/crossentropy": 1.6151638180017471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18537553399801254, + "step": 2590 + }, + { + "epoch": 0.216, + "grad_norm": 4.84375, + "grad_norm_var": 0.17750244140625, + "learning_rate": 4e-05, + "loss": 4.9577, + "loss/crossentropy": 2.416458487510681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24135924503207207, + "step": 2592 + }, + { + "epoch": 0.21616666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.18006184895833333, + "learning_rate": 4e-05, + "loss": 5.2044, + "loss/crossentropy": 2.1265391409397125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17638667300343513, + "step": 2594 + }, + { + "epoch": 0.21633333333333332, + "grad_norm": 5.03125, + "grad_norm_var": 0.17928059895833334, + "learning_rate": 4e-05, + "loss": 4.6906, + "loss/crossentropy": 2.0304845348000526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004259079694748, + "step": 2596 + }, + { + "epoch": 0.2165, + "grad_norm": 5.09375, + "grad_norm_var": 0.19010416666666666, + "learning_rate": 4e-05, + "loss": 4.858, + "loss/crossentropy": 1.6524630934000015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17721411027014256, + "step": 2598 + }, + { + "epoch": 0.21666666666666667, + "grad_norm": 5.5, + "grad_norm_var": 0.19853108723958332, + "learning_rate": 4e-05, + "loss": 5.0746, + "loss/crossentropy": 2.5842694640159607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22306865453720093, + "step": 2600 + }, + { + "epoch": 0.21683333333333332, + "grad_norm": 5.125, + "grad_norm_var": 0.17987874348958333, + "learning_rate": 4e-05, + "loss": 5.3043, + "loss/crossentropy": 2.285725235939026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159329690039158, + "step": 2602 + }, + { + "epoch": 0.217, + "grad_norm": 4.84375, + "grad_norm_var": 0.7171834309895834, + "learning_rate": 4e-05, + "loss": 4.7946, + "loss/crossentropy": 1.577958881855011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998040471225977, + "step": 2604 + }, + { + "epoch": 0.21716666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.6925130208333333, + "learning_rate": 4e-05, + "loss": 5.1966, + "loss/crossentropy": 1.9956328868865967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1921469122171402, + "step": 2606 + }, + { + "epoch": 0.21733333333333332, + "grad_norm": 5.0, + "grad_norm_var": 0.6096638997395833, + "learning_rate": 4e-05, + "loss": 4.9423, + "loss/crossentropy": 1.4313837885856628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15414538234472275, + "step": 2608 + }, + { + "epoch": 0.2175, + "grad_norm": 4.90625, + "grad_norm_var": 0.5982706705729167, + "learning_rate": 4e-05, + "loss": 4.4947, + "loss/crossentropy": 1.1389791369438171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13487368822097778, + "step": 2610 + }, + { + "epoch": 0.21766666666666667, + "grad_norm": 5.71875, + "grad_norm_var": 0.5936848958333333, + "learning_rate": 4e-05, + "loss": 5.0737, + "loss/crossentropy": 2.404743731021881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210671752691269, + "step": 2612 + }, + { + "epoch": 0.21783333333333332, + "grad_norm": 5.28125, + "grad_norm_var": 0.5944661458333333, + "learning_rate": 4e-05, + "loss": 5.5376, + "loss/crossentropy": 2.3369793593883514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20953010395169258, + "step": 2614 + }, + { + "epoch": 0.218, + "grad_norm": 5.0, + "grad_norm_var": 0.63160400390625, + "learning_rate": 4e-05, + "loss": 4.953, + "loss/crossentropy": 1.8187780529260635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17888355813920498, + "step": 2616 + }, + { + "epoch": 0.21816666666666668, + "grad_norm": 4.4375, + "grad_norm_var": 0.67525634765625, + "learning_rate": 4e-05, + "loss": 4.6163, + "loss/crossentropy": 2.0286522433161736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18695184588432312, + "step": 2618 + }, + { + "epoch": 0.21833333333333332, + "grad_norm": 4.96875, + "grad_norm_var": 0.13238525390625, + "learning_rate": 4e-05, + "loss": 4.3147, + "loss/crossentropy": 1.8871822357177734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17654790729284286, + "step": 2620 + }, + { + "epoch": 0.2185, + "grad_norm": 5.09375, + "grad_norm_var": 0.13385416666666666, + "learning_rate": 4e-05, + "loss": 4.956, + "loss/crossentropy": 2.6005072593688965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175550132989883, + "step": 2622 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 5.125, + "grad_norm_var": 0.138134765625, + "learning_rate": 4e-05, + "loss": 5.2147, + "loss/crossentropy": 2.5134531259536743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22386842593550682, + "step": 2624 + }, + { + "epoch": 0.21883333333333332, + "grad_norm": 5.3125, + "grad_norm_var": 0.14290364583333334, + "learning_rate": 4e-05, + "loss": 5.3094, + "loss/crossentropy": 1.8400039002299309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17169796116650105, + "step": 2626 + }, + { + "epoch": 0.219, + "grad_norm": 5.0625, + "grad_norm_var": 0.06952718098958334, + "learning_rate": 4e-05, + "loss": 5.0435, + "loss/crossentropy": 1.4271916523575783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1459579486399889, + "step": 2628 + }, + { + "epoch": 0.21916666666666668, + "grad_norm": 4.84375, + "grad_norm_var": 0.05868733723958333, + "learning_rate": 4e-05, + "loss": 4.6969, + "loss/crossentropy": 2.2238671481609344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19480058550834656, + "step": 2630 + }, + { + "epoch": 0.21933333333333332, + "grad_norm": 5.59375, + "grad_norm_var": 0.08567708333333333, + "learning_rate": 4e-05, + "loss": 5.0919, + "loss/crossentropy": 2.6155874729156494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124779336154461, + "step": 2632 + }, + { + "epoch": 0.2195, + "grad_norm": 5.15625, + "grad_norm_var": 0.08435872395833334, + "learning_rate": 4e-05, + "loss": 4.6738, + "loss/crossentropy": 2.1278350353240967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25172296166419983, + "step": 2634 + }, + { + "epoch": 0.21966666666666668, + "grad_norm": 5.1875, + "grad_norm_var": 0.0876953125, + "learning_rate": 4e-05, + "loss": 4.7663, + "loss/crossentropy": 2.2914819419384003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232300266623497, + "step": 2636 + }, + { + "epoch": 0.21983333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.092822265625, + "learning_rate": 4e-05, + "loss": 5.2515, + "loss/crossentropy": 1.9743507206439972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434590265154839, + "step": 2638 + }, + { + "epoch": 0.22, + "grad_norm": 5.3125, + "grad_norm_var": 0.090478515625, + "learning_rate": 4e-05, + "loss": 5.0155, + "loss/crossentropy": 2.3763028979301453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22711258754134178, + "step": 2640 + }, + { + "epoch": 0.22016666666666668, + "grad_norm": 5.15625, + "grad_norm_var": 0.09208577473958333, + "learning_rate": 4e-05, + "loss": 5.0988, + "loss/crossentropy": 1.600294180214405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16233623772859573, + "step": 2642 + }, + { + "epoch": 0.22033333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.09388020833333334, + "learning_rate": 4e-05, + "loss": 4.9415, + "loss/crossentropy": 2.3785403072834015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217193104326725, + "step": 2644 + }, + { + "epoch": 0.2205, + "grad_norm": 4.78125, + "grad_norm_var": 0.09466145833333334, + "learning_rate": 4e-05, + "loss": 4.4769, + "loss/crossentropy": 1.0602325424551964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12778138555586338, + "step": 2646 + }, + { + "epoch": 0.22066666666666668, + "grad_norm": 5.46875, + "grad_norm_var": 0.08785400390625, + "learning_rate": 4e-05, + "loss": 4.9776, + "loss/crossentropy": 2.049244850873947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20482390746474266, + "step": 2648 + }, + { + "epoch": 0.22083333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.08487955729166667, + "learning_rate": 4e-05, + "loss": 4.9487, + "loss/crossentropy": 2.3347797989845276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474630758166313, + "step": 2650 + }, + { + "epoch": 0.221, + "grad_norm": 5.28125, + "grad_norm_var": 0.06261393229166666, + "learning_rate": 4e-05, + "loss": 4.755, + "loss/crossentropy": 1.834708720445633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18110329657793045, + "step": 2652 + }, + { + "epoch": 0.22116666666666668, + "grad_norm": 4.96875, + "grad_norm_var": 0.06884358723958334, + "learning_rate": 4e-05, + "loss": 5.009, + "loss/crossentropy": 2.164376437664032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20554454252123833, + "step": 2654 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.08476155598958333, + "learning_rate": 4e-05, + "loss": 4.673, + "loss/crossentropy": 2.358445018529892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216878242790699, + "step": 2656 + }, + { + "epoch": 0.2215, + "grad_norm": 5.03125, + "grad_norm_var": 0.08046875, + "learning_rate": 4e-05, + "loss": 5.1396, + "loss/crossentropy": 1.6497721672058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405374325811863, + "step": 2658 + }, + { + "epoch": 0.22166666666666668, + "grad_norm": 5.15625, + "grad_norm_var": 0.0720703125, + "learning_rate": 4e-05, + "loss": 5.5726, + "loss/crossentropy": 2.38449564576149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24445926398038864, + "step": 2660 + }, + { + "epoch": 0.22183333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.062483723958333334, + "learning_rate": 4e-05, + "loss": 5.2137, + "loss/crossentropy": 1.9726266413927078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19294766709208488, + "step": 2662 + }, + { + "epoch": 0.222, + "grad_norm": 4.71875, + "grad_norm_var": 0.0697265625, + "learning_rate": 4e-05, + "loss": 4.8543, + "loss/crossentropy": 2.184689074754715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648836717009544, + "step": 2664 + }, + { + "epoch": 0.22216666666666668, + "grad_norm": 5.28125, + "grad_norm_var": 0.07967122395833333, + "learning_rate": 4e-05, + "loss": 4.6217, + "loss/crossentropy": 2.200216382741928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21623068675398827, + "step": 2666 + }, + { + "epoch": 0.22233333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.08019205729166666, + "learning_rate": 4e-05, + "loss": 5.5548, + "loss/crossentropy": 2.570397049188614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21973755210638046, + "step": 2668 + }, + { + "epoch": 0.2225, + "grad_norm": 4.625, + "grad_norm_var": 0.07486572265625, + "learning_rate": 4e-05, + "loss": 4.5188, + "loss/crossentropy": 1.840833805501461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19348925352096558, + "step": 2670 + }, + { + "epoch": 0.22266666666666668, + "grad_norm": 4.90625, + "grad_norm_var": 0.064306640625, + "learning_rate": 4e-05, + "loss": 4.9382, + "loss/crossentropy": 2.048027887940407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18247253075242043, + "step": 2672 + }, + { + "epoch": 0.22283333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.07825520833333334, + "learning_rate": 4e-05, + "loss": 5.2389, + "loss/crossentropy": 1.7717494443058968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19412222132086754, + "step": 2674 + }, + { + "epoch": 0.223, + "grad_norm": 5.0625, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 4e-05, + "loss": 5.3882, + "loss/crossentropy": 2.4813308119773865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152557373046875, + "step": 2676 + }, + { + "epoch": 0.22316666666666668, + "grad_norm": 4.875, + "grad_norm_var": 0.05845947265625, + "learning_rate": 4e-05, + "loss": 4.686, + "loss/crossentropy": 2.3142955899238586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20804548263549805, + "step": 2678 + }, + { + "epoch": 0.22333333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.05245768229166667, + "learning_rate": 4e-05, + "loss": 4.7857, + "loss/crossentropy": 2.190678149461746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21705422922968864, + "step": 2680 + }, + { + "epoch": 0.2235, + "grad_norm": 5.21875, + "grad_norm_var": 0.042643229166666664, + "learning_rate": 4e-05, + "loss": 5.3474, + "loss/crossentropy": 1.5699248164892197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278335839509964, + "step": 2682 + }, + { + "epoch": 0.22366666666666668, + "grad_norm": 4.84375, + "grad_norm_var": 0.038309733072916664, + "learning_rate": 4e-05, + "loss": 5.0322, + "loss/crossentropy": 1.916561797261238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19276293367147446, + "step": 2684 + }, + { + "epoch": 0.22383333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 4e-05, + "loss": 4.9872, + "loss/crossentropy": 2.2632661163806915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21685384586453438, + "step": 2686 + }, + { + "epoch": 0.224, + "grad_norm": 4.6875, + "grad_norm_var": 0.03487955729166667, + "learning_rate": 4e-05, + "loss": 4.5296, + "loss/crossentropy": 1.44465272128582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1623321734368801, + "step": 2688 + }, + { + "epoch": 0.22416666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.02545166015625, + "learning_rate": 4e-05, + "loss": 4.4794, + "loss/crossentropy": 2.4449245929718018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22241264954209328, + "step": 2690 + }, + { + "epoch": 0.22433333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.04412434895833333, + "learning_rate": 4e-05, + "loss": 5.2167, + "loss/crossentropy": 1.7128597050905228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19268352910876274, + "step": 2692 + }, + { + "epoch": 0.2245, + "grad_norm": 5.875, + "grad_norm_var": 0.10227864583333333, + "learning_rate": 4e-05, + "loss": 5.5377, + "loss/crossentropy": 2.5913642048835754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.265250276774168, + "step": 2694 + }, + { + "epoch": 0.22466666666666665, + "grad_norm": 4.84375, + "grad_norm_var": 0.10377197265625, + "learning_rate": 4e-05, + "loss": 4.514, + "loss/crossentropy": 2.124376595020294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20373161137104034, + "step": 2696 + }, + { + "epoch": 0.22483333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.101025390625, + "learning_rate": 4e-05, + "loss": 5.056, + "loss/crossentropy": 1.651014804840088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17066487483680248, + "step": 2698 + }, + { + "epoch": 0.225, + "grad_norm": 4.5625, + "grad_norm_var": 0.11314697265625, + "learning_rate": 4e-05, + "loss": 4.2127, + "loss/crossentropy": 2.065021328628063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880048643797636, + "step": 2700 + }, + { + "epoch": 0.22516666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.11829020182291666, + "learning_rate": 4e-05, + "loss": 5.2311, + "loss/crossentropy": 2.4089654088020325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25183702632784843, + "step": 2702 + }, + { + "epoch": 0.22533333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.10618082682291667, + "learning_rate": 4e-05, + "loss": 4.624, + "loss/crossentropy": 1.7160001248121262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726439669728279, + "step": 2704 + }, + { + "epoch": 0.2255, + "grad_norm": 4.65625, + "grad_norm_var": 0.10435791015625, + "learning_rate": 4e-05, + "loss": 4.7344, + "loss/crossentropy": 1.2806991934776306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15048817172646523, + "step": 2706 + }, + { + "epoch": 0.22566666666666665, + "grad_norm": 10.375, + "grad_norm_var": 1.8723307291666667, + "learning_rate": 4e-05, + "loss": 5.1457, + "loss/crossentropy": 1.30407252907753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14570009522140026, + "step": 2708 + }, + { + "epoch": 0.22583333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 1.8645182291666667, + "learning_rate": 4e-05, + "loss": 5.4753, + "loss/crossentropy": 2.5484912395477295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21789904311299324, + "step": 2710 + }, + { + "epoch": 0.226, + "grad_norm": 4.90625, + "grad_norm_var": 1.8605305989583334, + "learning_rate": 4e-05, + "loss": 5.1614, + "loss/crossentropy": 1.9052416235208511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19455315172672272, + "step": 2712 + }, + { + "epoch": 0.22616666666666665, + "grad_norm": 5.28125, + "grad_norm_var": 1.8538899739583334, + "learning_rate": 4e-05, + "loss": 4.8978, + "loss/crossentropy": 2.0532439947128296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112819291651249, + "step": 2714 + }, + { + "epoch": 0.22633333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 1.8289021809895833, + "learning_rate": 4e-05, + "loss": 4.7618, + "loss/crossentropy": 2.1942814588546753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2094101719558239, + "step": 2716 + }, + { + "epoch": 0.2265, + "grad_norm": 4.5625, + "grad_norm_var": 1.8766560872395834, + "learning_rate": 4e-05, + "loss": 4.2254, + "loss/crossentropy": 2.525265157222748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201676107943058, + "step": 2718 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 1.8827962239583333, + "learning_rate": 4e-05, + "loss": 4.8646, + "loss/crossentropy": 0.7136424034833908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1026750449091196, + "step": 2720 + }, + { + "epoch": 0.22683333333333333, + "grad_norm": 4.875, + "grad_norm_var": 1.8532389322916667, + "learning_rate": 4e-05, + "loss": 5.2678, + "loss/crossentropy": 2.462954103946686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22561665251851082, + "step": 2722 + }, + { + "epoch": 0.227, + "grad_norm": 5.15625, + "grad_norm_var": 0.05969645182291667, + "learning_rate": 4e-05, + "loss": 5.0547, + "loss/crossentropy": 2.5214640498161316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22827640548348427, + "step": 2724 + }, + { + "epoch": 0.22716666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 9.401981608072917, + "learning_rate": 4e-05, + "loss": 4.972, + "loss/crossentropy": 2.2954089641571045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23702961578965187, + "step": 2726 + }, + { + "epoch": 0.22733333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 9.377437337239583, + "learning_rate": 4e-05, + "loss": 5.5277, + "loss/crossentropy": 1.4282821118831635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785445362329483, + "step": 2728 + }, + { + "epoch": 0.2275, + "grad_norm": 5.3125, + "grad_norm_var": 9.397847493489584, + "learning_rate": 4e-05, + "loss": 5.0704, + "loss/crossentropy": 1.5387426540255547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17435546405613422, + "step": 2730 + }, + { + "epoch": 0.22766666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 9.3921875, + "learning_rate": 4e-05, + "loss": 4.5527, + "loss/crossentropy": 1.2322108745574951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1475350596010685, + "step": 2732 + }, + { + "epoch": 0.22783333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 9.334228515625, + "learning_rate": 4e-05, + "loss": 4.6543, + "loss/crossentropy": 1.1598545908927917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1438927836716175, + "step": 2734 + }, + { + "epoch": 0.228, + "grad_norm": 5.40625, + "grad_norm_var": 9.274283854166667, + "learning_rate": 4e-05, + "loss": 4.9029, + "loss/crossentropy": 1.6407746598124504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1901111900806427, + "step": 2736 + }, + { + "epoch": 0.22816666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 9.262223307291666, + "learning_rate": 4e-05, + "loss": 5.0487, + "loss/crossentropy": 1.8306042179465294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21500974148511887, + "step": 2738 + }, + { + "epoch": 0.22833333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 9.212333170572917, + "learning_rate": 4e-05, + "loss": 5.2418, + "loss/crossentropy": 2.3094605207443237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21840552985668182, + "step": 2740 + }, + { + "epoch": 0.2285, + "grad_norm": 5.3125, + "grad_norm_var": 0.0423828125, + "learning_rate": 4e-05, + "loss": 5.6929, + "loss/crossentropy": 1.8620459735393524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19147185236215591, + "step": 2742 + }, + { + "epoch": 0.22866666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.057938639322916666, + "learning_rate": 4e-05, + "loss": 4.5089, + "loss/crossentropy": 1.920153945684433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18584110215306282, + "step": 2744 + }, + { + "epoch": 0.22883333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.0654296875, + "learning_rate": 4e-05, + "loss": 4.6279, + "loss/crossentropy": 1.5992632433772087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17130916193127632, + "step": 2746 + }, + { + "epoch": 0.229, + "grad_norm": 5.0, + "grad_norm_var": 0.07082926432291667, + "learning_rate": 4e-05, + "loss": 4.4131, + "loss/crossentropy": 1.882856197655201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18213853426277637, + "step": 2748 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.07899983723958333, + "learning_rate": 4e-05, + "loss": 4.8627, + "loss/crossentropy": 2.1046335101127625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19893395900726318, + "step": 2750 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.07330322265625, + "learning_rate": 4e-05, + "loss": 4.9209, + "loss/crossentropy": 2.348418891429901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22180702164769173, + "step": 2752 + }, + { + "epoch": 0.2295, + "grad_norm": 5.09375, + "grad_norm_var": 0.073828125, + "learning_rate": 4e-05, + "loss": 5.0366, + "loss/crossentropy": 1.4402420744299889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1656176634132862, + "step": 2754 + }, + { + "epoch": 0.22966666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.06373697916666667, + "learning_rate": 4e-05, + "loss": 4.9903, + "loss/crossentropy": 1.663852408528328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19472996331751347, + "step": 2756 + }, + { + "epoch": 0.22983333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.05071614583333333, + "learning_rate": 4e-05, + "loss": 4.8286, + "loss/crossentropy": 2.364410251379013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22333412244915962, + "step": 2758 + }, + { + "epoch": 0.23, + "grad_norm": 4.71875, + "grad_norm_var": 0.059305826822916664, + "learning_rate": 4e-05, + "loss": 4.9322, + "loss/crossentropy": 2.570330262184143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22481412440538406, + "step": 2760 + }, + { + "epoch": 0.23016666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.05406494140625, + "learning_rate": 4e-05, + "loss": 5.1796, + "loss/crossentropy": 2.4776630997657776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199436090886593, + "step": 2762 + }, + { + "epoch": 0.23033333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.05169270833333333, + "learning_rate": 4e-05, + "loss": 5.3964, + "loss/crossentropy": 2.457890510559082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21888697147369385, + "step": 2764 + }, + { + "epoch": 0.2305, + "grad_norm": 4.71875, + "grad_norm_var": 0.059468587239583336, + "learning_rate": 4e-05, + "loss": 4.9103, + "loss/crossentropy": 2.312767207622528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433093816041946, + "step": 2766 + }, + { + "epoch": 0.23066666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.06252848307291667, + "learning_rate": 4e-05, + "loss": 4.5758, + "loss/crossentropy": 1.472007542848587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1721639148890972, + "step": 2768 + }, + { + "epoch": 0.23083333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.06825764973958333, + "learning_rate": 4e-05, + "loss": 5.0607, + "loss/crossentropy": 2.117499329149723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20281216129660606, + "step": 2770 + }, + { + "epoch": 0.231, + "grad_norm": 5.0625, + "grad_norm_var": 0.049637858072916666, + "learning_rate": 4e-05, + "loss": 4.5248, + "loss/crossentropy": 2.032151460647583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22201980277895927, + "step": 2772 + }, + { + "epoch": 0.23116666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.04595947265625, + "learning_rate": 4e-05, + "loss": 4.8957, + "loss/crossentropy": 2.0667436867952347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20032967068254948, + "step": 2774 + }, + { + "epoch": 0.23133333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 4e-05, + "loss": 4.8943, + "loss/crossentropy": 1.8684946075081825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16954820603132248, + "step": 2776 + }, + { + "epoch": 0.2315, + "grad_norm": 5.15625, + "grad_norm_var": 0.28596598307291665, + "learning_rate": 4e-05, + "loss": 5.2042, + "loss/crossentropy": 1.9076000452041626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20960036292672157, + "step": 2778 + }, + { + "epoch": 0.23166666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.29817301432291665, + "learning_rate": 4e-05, + "loss": 4.6055, + "loss/crossentropy": 2.335671216249466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20992399007081985, + "step": 2780 + }, + { + "epoch": 0.23183333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.2762858072916667, + "learning_rate": 4e-05, + "loss": 5.1989, + "loss/crossentropy": 2.675258159637451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23351893201470375, + "step": 2782 + }, + { + "epoch": 0.232, + "grad_norm": 4.78125, + "grad_norm_var": 0.2702433268229167, + "learning_rate": 4e-05, + "loss": 5.0575, + "loss/crossentropy": 1.3609646335244179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15972201712429523, + "step": 2784 + }, + { + "epoch": 0.23216666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.26500244140625, + "learning_rate": 4e-05, + "loss": 5.123, + "loss/crossentropy": 2.211142838001251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24086641520261765, + "step": 2786 + }, + { + "epoch": 0.23233333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.27975260416666664, + "learning_rate": 4e-05, + "loss": 5.0675, + "loss/crossentropy": 2.1681629419326782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20870641618967056, + "step": 2788 + }, + { + "epoch": 0.2325, + "grad_norm": 4.90625, + "grad_norm_var": 0.28085530598958336, + "learning_rate": 4e-05, + "loss": 5.1701, + "loss/crossentropy": 2.3621520698070526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21960894763469696, + "step": 2790 + }, + { + "epoch": 0.23266666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.3078084309895833, + "learning_rate": 4e-05, + "loss": 4.1998, + "loss/crossentropy": 1.8437300026416779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18039709888398647, + "step": 2792 + }, + { + "epoch": 0.23283333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.03883056640625, + "learning_rate": 4e-05, + "loss": 4.6421, + "loss/crossentropy": 1.9587142765522003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116855699568987, + "step": 2794 + }, + { + "epoch": 0.233, + "grad_norm": 5.0625, + "grad_norm_var": 0.17548421223958333, + "learning_rate": 4e-05, + "loss": 5.0527, + "loss/crossentropy": 2.1764910221099854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081896774470806, + "step": 2796 + }, + { + "epoch": 0.23316666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.190234375, + "learning_rate": 4e-05, + "loss": 4.5767, + "loss/crossentropy": 1.5604673027992249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18230070546269417, + "step": 2798 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.19625244140625, + "learning_rate": 4e-05, + "loss": 5.6337, + "loss/crossentropy": 2.379370391368866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22885718196630478, + "step": 2800 + }, + { + "epoch": 0.2335, + "grad_norm": 5.84375, + "grad_norm_var": 0.24107666015625, + "learning_rate": 4e-05, + "loss": 4.9887, + "loss/crossentropy": 1.4264894649386406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15079515427350998, + "step": 2802 + }, + { + "epoch": 0.23366666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.24075113932291667, + "learning_rate": 4e-05, + "loss": 5.1173, + "loss/crossentropy": 2.43440181016922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21988075599074364, + "step": 2804 + }, + { + "epoch": 0.23383333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.24049072265625, + "learning_rate": 4e-05, + "loss": 5.1611, + "loss/crossentropy": 1.7003138586878777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18909470550715923, + "step": 2806 + }, + { + "epoch": 0.234, + "grad_norm": 5.46875, + "grad_norm_var": 0.19302978515625, + "learning_rate": 4e-05, + "loss": 4.9761, + "loss/crossentropy": 1.9690175727009773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17825855500996113, + "step": 2808 + }, + { + "epoch": 0.23416666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.20549723307291667, + "learning_rate": 4e-05, + "loss": 5.146, + "loss/crossentropy": 2.0230807662010193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24265416339039803, + "step": 2810 + }, + { + "epoch": 0.23433333333333334, + "grad_norm": 5.65625, + "grad_norm_var": 0.13414306640625, + "learning_rate": 4e-05, + "loss": 4.7241, + "loss/crossentropy": 1.9726131781935692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17903245240449905, + "step": 2812 + }, + { + "epoch": 0.2345, + "grad_norm": 5.21875, + "grad_norm_var": 0.13177083333333334, + "learning_rate": 4e-05, + "loss": 4.8185, + "loss/crossentropy": 1.9392977207899094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18167594075202942, + "step": 2814 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.13352864583333332, + "learning_rate": 4e-05, + "loss": 5.2522, + "loss/crossentropy": 2.45490038394928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22192463278770447, + "step": 2816 + }, + { + "epoch": 0.23483333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.11972249348958333, + "learning_rate": 4e-05, + "loss": 4.731, + "loss/crossentropy": 2.515716075897217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21755141019821167, + "step": 2818 + }, + { + "epoch": 0.235, + "grad_norm": 5.28125, + "grad_norm_var": 0.10526936848958333, + "learning_rate": 4e-05, + "loss": 5.2301, + "loss/crossentropy": 2.4293786883354187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21814614161849022, + "step": 2820 + }, + { + "epoch": 0.23516666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.11220296223958333, + "learning_rate": 4e-05, + "loss": 4.5221, + "loss/crossentropy": 1.4309967905282974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15550539828836918, + "step": 2822 + }, + { + "epoch": 0.23533333333333334, + "grad_norm": 5.40625, + "grad_norm_var": 0.13733317057291666, + "learning_rate": 4e-05, + "loss": 4.8982, + "loss/crossentropy": 2.560435712337494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20439713820815086, + "step": 2824 + }, + { + "epoch": 0.2355, + "grad_norm": 5.0625, + "grad_norm_var": 0.114306640625, + "learning_rate": 4e-05, + "loss": 5.4189, + "loss/crossentropy": 2.490498185157776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24041643366217613, + "step": 2826 + }, + { + "epoch": 0.23566666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.10898030598958333, + "learning_rate": 4e-05, + "loss": 5.0196, + "loss/crossentropy": 2.0801108181476593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159007117152214, + "step": 2828 + }, + { + "epoch": 0.23583333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.10545247395833333, + "learning_rate": 4e-05, + "loss": 5.0843, + "loss/crossentropy": 2.540014386177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21403054893016815, + "step": 2830 + }, + { + "epoch": 0.236, + "grad_norm": 5.28125, + "grad_norm_var": 0.105712890625, + "learning_rate": 4e-05, + "loss": 5.3198, + "loss/crossentropy": 2.0823977291584015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19510109908878803, + "step": 2832 + }, + { + "epoch": 0.23616666666666666, + "grad_norm": 5.46875, + "grad_norm_var": 0.07984619140625, + "learning_rate": 4e-05, + "loss": 5.3449, + "loss/crossentropy": 2.335395246744156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23566577583551407, + "step": 2834 + }, + { + "epoch": 0.23633333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.08772379557291667, + "learning_rate": 4e-05, + "loss": 4.9142, + "loss/crossentropy": 2.067429706454277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775259472429752, + "step": 2836 + }, + { + "epoch": 0.2365, + "grad_norm": 5.375, + "grad_norm_var": 0.09342041015625, + "learning_rate": 4e-05, + "loss": 5.0618, + "loss/crossentropy": 1.6095528677105904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1610132586210966, + "step": 2838 + }, + { + "epoch": 0.23666666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.06728108723958333, + "learning_rate": 4e-05, + "loss": 4.9791, + "loss/crossentropy": 1.1937666982412338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16449680365622044, + "step": 2840 + }, + { + "epoch": 0.23683333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.09244791666666667, + "learning_rate": 4e-05, + "loss": 4.6665, + "loss/crossentropy": 2.3993532061576843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20165612176060677, + "step": 2842 + }, + { + "epoch": 0.237, + "grad_norm": 4.65625, + "grad_norm_var": 0.09495035807291667, + "learning_rate": 4e-05, + "loss": 5.0856, + "loss/crossentropy": 1.9005714282393456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18283692002296448, + "step": 2844 + }, + { + "epoch": 0.23716666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.10019124348958333, + "learning_rate": 4e-05, + "loss": 4.728, + "loss/crossentropy": 2.730278193950653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370842583477497, + "step": 2846 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.0990234375, + "learning_rate": 4e-05, + "loss": 5.1465, + "loss/crossentropy": 2.6805002689361572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24382244795560837, + "step": 2848 + }, + { + "epoch": 0.2375, + "grad_norm": 5.3125, + "grad_norm_var": 0.09869384765625, + "learning_rate": 4e-05, + "loss": 4.9877, + "loss/crossentropy": 2.367294877767563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22220474854111671, + "step": 2850 + }, + { + "epoch": 0.23766666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.097509765625, + "learning_rate": 4e-05, + "loss": 5.356, + "loss/crossentropy": 2.0009628012776375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20864171534776688, + "step": 2852 + }, + { + "epoch": 0.23783333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.09065348307291667, + "learning_rate": 4e-05, + "loss": 4.5579, + "loss/crossentropy": 1.845793679356575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19666944071650505, + "step": 2854 + }, + { + "epoch": 0.238, + "grad_norm": 4.6875, + "grad_norm_var": 0.09095052083333334, + "learning_rate": 4e-05, + "loss": 5.1909, + "loss/crossentropy": 2.2471812665462494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2083476521074772, + "step": 2856 + }, + { + "epoch": 0.23816666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.07459309895833334, + "learning_rate": 4e-05, + "loss": 5.174, + "loss/crossentropy": 2.458581805229187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21159670129418373, + "step": 2858 + }, + { + "epoch": 0.23833333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.07265625, + "learning_rate": 4e-05, + "loss": 4.5802, + "loss/crossentropy": 1.3199757784605026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1531830132007599, + "step": 2860 + }, + { + "epoch": 0.2385, + "grad_norm": 4.78125, + "grad_norm_var": 0.07107747395833333, + "learning_rate": 4e-05, + "loss": 4.7332, + "loss/crossentropy": 1.2995287701487541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14581317454576492, + "step": 2862 + }, + { + "epoch": 0.23866666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.06717122395833333, + "learning_rate": 4e-05, + "loss": 4.8991, + "loss/crossentropy": 1.44887076318264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18584540858864784, + "step": 2864 + }, + { + "epoch": 0.23883333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.05944010416666667, + "learning_rate": 4e-05, + "loss": 5.1643, + "loss/crossentropy": 2.570837378501892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20513088256120682, + "step": 2866 + }, + { + "epoch": 0.239, + "grad_norm": 4.875, + "grad_norm_var": 0.056929524739583334, + "learning_rate": 4e-05, + "loss": 4.6723, + "loss/crossentropy": 2.399744689464569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21387247368693352, + "step": 2868 + }, + { + "epoch": 0.23916666666666667, + "grad_norm": 6.375, + "grad_norm_var": 0.16711832682291666, + "learning_rate": 4e-05, + "loss": 5.237, + "loss/crossentropy": 1.9510470181703568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18807476945221424, + "step": 2870 + }, + { + "epoch": 0.23933333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.16842041015625, + "learning_rate": 4e-05, + "loss": 4.6487, + "loss/crossentropy": 1.1220924705266953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1417934186756611, + "step": 2872 + }, + { + "epoch": 0.2395, + "grad_norm": 5.125, + "grad_norm_var": 0.16691080729166666, + "learning_rate": 4e-05, + "loss": 5.2557, + "loss/crossentropy": 2.146589756011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22325074672698975, + "step": 2874 + }, + { + "epoch": 0.23966666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.15556233723958332, + "learning_rate": 4e-05, + "loss": 4.945, + "loss/crossentropy": 1.7336683943867683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16801801696419716, + "step": 2876 + }, + { + "epoch": 0.23983333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.16125895182291666, + "learning_rate": 4e-05, + "loss": 4.9316, + "loss/crossentropy": 1.9470653384923935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18395037576556206, + "step": 2878 + }, + { + "epoch": 0.24, + "grad_norm": 4.625, + "grad_norm_var": 0.16829427083333334, + "learning_rate": 4e-05, + "loss": 4.5489, + "loss/crossentropy": 1.708813153207302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18478217348456383, + "step": 2880 + }, + { + "epoch": 0.24016666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.16689046223958334, + "learning_rate": 4e-05, + "loss": 4.3889, + "loss/crossentropy": 1.398942418396473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16694530472159386, + "step": 2882 + }, + { + "epoch": 0.24033333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.17603759765625, + "learning_rate": 4e-05, + "loss": 4.7529, + "loss/crossentropy": 2.43044650554657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21148262172937393, + "step": 2884 + }, + { + "epoch": 0.2405, + "grad_norm": 4.875, + "grad_norm_var": 0.047900390625, + "learning_rate": 4e-05, + "loss": 4.5462, + "loss/crossentropy": 2.5115047097206116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20838917791843414, + "step": 2886 + }, + { + "epoch": 0.24066666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.051953125, + "learning_rate": 4e-05, + "loss": 5.2548, + "loss/crossentropy": 2.7271772623062134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21330803260207176, + "step": 2888 + }, + { + "epoch": 0.24083333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.05816650390625, + "learning_rate": 4e-05, + "loss": 5.792, + "loss/crossentropy": 2.412293791770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21458745002746582, + "step": 2890 + }, + { + "epoch": 0.241, + "grad_norm": 4.71875, + "grad_norm_var": 0.059619140625, + "learning_rate": 4e-05, + "loss": 4.9378, + "loss/crossentropy": 1.7731594443321228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1800384782254696, + "step": 2892 + }, + { + "epoch": 0.24116666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.0548828125, + "learning_rate": 4e-05, + "loss": 4.4171, + "loss/crossentropy": 2.4972161054611206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959226578474045, + "step": 2894 + }, + { + "epoch": 0.24133333333333334, + "grad_norm": 5.65625, + "grad_norm_var": 0.073046875, + "learning_rate": 4e-05, + "loss": 5.5958, + "loss/crossentropy": 2.3957661390304565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111573964357376, + "step": 2896 + }, + { + "epoch": 0.2415, + "grad_norm": 5.125, + "grad_norm_var": 0.08391927083333334, + "learning_rate": 4e-05, + "loss": 5.1817, + "loss/crossentropy": 1.2799173444509506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18157821521162987, + "step": 2898 + }, + { + "epoch": 0.24166666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.07545166015625, + "learning_rate": 4e-05, + "loss": 5.0338, + "loss/crossentropy": 2.3033843338489532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21752947941422462, + "step": 2900 + }, + { + "epoch": 0.24183333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.06261393229166666, + "learning_rate": 4e-05, + "loss": 5.3801, + "loss/crossentropy": 1.950482338666916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18952583149075508, + "step": 2902 + }, + { + "epoch": 0.242, + "grad_norm": 4.96875, + "grad_norm_var": 0.06597900390625, + "learning_rate": 4e-05, + "loss": 4.7877, + "loss/crossentropy": 1.822025142610073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20692705176770687, + "step": 2904 + }, + { + "epoch": 0.24216666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.060791015625, + "learning_rate": 4e-05, + "loss": 4.5748, + "loss/crossentropy": 2.2172627449035645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20654083043336868, + "step": 2906 + }, + { + "epoch": 0.24233333333333335, + "grad_norm": 4.5, + "grad_norm_var": 0.07350260416666667, + "learning_rate": 4e-05, + "loss": 4.5271, + "loss/crossentropy": 1.8431589156389236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18556870892643929, + "step": 2908 + }, + { + "epoch": 0.2425, + "grad_norm": 5.6875, + "grad_norm_var": 0.10172119140625, + "learning_rate": 4e-05, + "loss": 4.9521, + "loss/crossentropy": 1.194792091846466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20142144337296486, + "step": 2910 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.11009114583333333, + "learning_rate": 4e-05, + "loss": 4.9352, + "loss/crossentropy": 1.2394988313317299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14485261403024197, + "step": 2912 + }, + { + "epoch": 0.24283333333333335, + "grad_norm": 5.1875, + "grad_norm_var": 0.10601806640625, + "learning_rate": 4e-05, + "loss": 4.505, + "loss/crossentropy": 2.194884717464447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23719510436058044, + "step": 2914 + }, + { + "epoch": 0.243, + "grad_norm": 5.21875, + "grad_norm_var": 0.10779622395833334, + "learning_rate": 4e-05, + "loss": 4.5226, + "loss/crossentropy": 2.356331080198288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221795491874218, + "step": 2916 + }, + { + "epoch": 0.24316666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.108837890625, + "learning_rate": 4e-05, + "loss": 4.9338, + "loss/crossentropy": 2.3776062428951263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553601115942, + "step": 2918 + }, + { + "epoch": 0.24333333333333335, + "grad_norm": 5.03125, + "grad_norm_var": 0.10677083333333333, + "learning_rate": 4e-05, + "loss": 4.83, + "loss/crossentropy": 1.745754636824131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18756554648280144, + "step": 2920 + }, + { + "epoch": 0.2435, + "grad_norm": 5.15625, + "grad_norm_var": 0.11197916666666667, + "learning_rate": 4e-05, + "loss": 4.8526, + "loss/crossentropy": 2.37031289935112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172992266714573, + "step": 2922 + }, + { + "epoch": 0.24366666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.09511311848958333, + "learning_rate": 4e-05, + "loss": 4.6856, + "loss/crossentropy": 2.4921224117279053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21399492397904396, + "step": 2924 + }, + { + "epoch": 0.24383333333333335, + "grad_norm": 5.5, + "grad_norm_var": 0.07069905598958333, + "learning_rate": 4e-05, + "loss": 5.342, + "loss/crossentropy": 2.2873608469963074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288646660745144, + "step": 2926 + }, + { + "epoch": 0.244, + "grad_norm": 4.53125, + "grad_norm_var": 0.05777587890625, + "learning_rate": 4e-05, + "loss": 4.4681, + "loss/crossentropy": 0.7939189150929451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1262371763586998, + "step": 2928 + }, + { + "epoch": 0.24416666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 4e-05, + "loss": 4.5896, + "loss/crossentropy": 2.084025114774704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19710740074515343, + "step": 2930 + }, + { + "epoch": 0.24433333333333335, + "grad_norm": 4.78125, + "grad_norm_var": 0.04582926432291667, + "learning_rate": 4e-05, + "loss": 5.1147, + "loss/crossentropy": 1.9766810834407806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1635744720697403, + "step": 2932 + }, + { + "epoch": 0.2445, + "grad_norm": 4.78125, + "grad_norm_var": 0.054911295572916664, + "learning_rate": 4e-05, + "loss": 4.5908, + "loss/crossentropy": 2.331399142742157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101854346692562, + "step": 2934 + }, + { + "epoch": 0.24466666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05533447265625, + "learning_rate": 4e-05, + "loss": 5.0402, + "loss/crossentropy": 2.1609912514686584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20745176821947098, + "step": 2936 + }, + { + "epoch": 0.24483333333333332, + "grad_norm": 5.1875, + "grad_norm_var": 0.05715738932291667, + "learning_rate": 4e-05, + "loss": 4.7067, + "loss/crossentropy": 1.9947044774889946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18351775035262108, + "step": 2938 + }, + { + "epoch": 0.245, + "grad_norm": 4.9375, + "grad_norm_var": 0.053971354166666666, + "learning_rate": 4e-05, + "loss": 4.5758, + "loss/crossentropy": 1.7316635847091675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17024549469351768, + "step": 2940 + }, + { + "epoch": 0.24516666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.14542643229166666, + "learning_rate": 4e-05, + "loss": 5.457, + "loss/crossentropy": 2.462052643299103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24193332344293594, + "step": 2942 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 4.875, + "grad_norm_var": 0.13683268229166667, + "learning_rate": 4e-05, + "loss": 4.7638, + "loss/crossentropy": 2.3683615624904633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22581441700458527, + "step": 2944 + }, + { + "epoch": 0.2455, + "grad_norm": 4.59375, + "grad_norm_var": 0.14928385416666667, + "learning_rate": 4e-05, + "loss": 4.9376, + "loss/crossentropy": 2.416099488735199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069566398859024, + "step": 2946 + }, + { + "epoch": 0.24566666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.14763997395833334, + "learning_rate": 4e-05, + "loss": 4.9896, + "loss/crossentropy": 2.327180027961731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20854448154568672, + "step": 2948 + }, + { + "epoch": 0.24583333333333332, + "grad_norm": 4.90625, + "grad_norm_var": 0.13697916666666668, + "learning_rate": 4e-05, + "loss": 4.8112, + "loss/crossentropy": 2.242485076189041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354438677430153, + "step": 2950 + }, + { + "epoch": 0.246, + "grad_norm": 4.96875, + "grad_norm_var": 0.1345703125, + "learning_rate": 4e-05, + "loss": 4.7682, + "loss/crossentropy": 1.564257226884365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19489132426679134, + "step": 2952 + }, + { + "epoch": 0.24616666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.13258056640625, + "learning_rate": 4e-05, + "loss": 4.9509, + "loss/crossentropy": 2.032680094242096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20923538878560066, + "step": 2954 + }, + { + "epoch": 0.24633333333333332, + "grad_norm": 5.28125, + "grad_norm_var": 0.138916015625, + "learning_rate": 4e-05, + "loss": 5.0157, + "loss/crossentropy": 1.8767257183790207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18023189157247543, + "step": 2956 + }, + { + "epoch": 0.2465, + "grad_norm": 5.40625, + "grad_norm_var": 0.04752197265625, + "learning_rate": 4e-05, + "loss": 5.2726, + "loss/crossentropy": 2.728525757789612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22680773586034775, + "step": 2958 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.562744140625, + "learning_rate": 4e-05, + "loss": 5.1125, + "loss/crossentropy": 2.137243375182152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2695735916495323, + "step": 2960 + }, + { + "epoch": 0.24683333333333332, + "grad_norm": 4.6875, + "grad_norm_var": 0.563916015625, + "learning_rate": 4e-05, + "loss": 4.4757, + "loss/crossentropy": 2.077547214925289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17666416242718697, + "step": 2962 + }, + { + "epoch": 0.247, + "grad_norm": 4.78125, + "grad_norm_var": 0.5870402018229167, + "learning_rate": 4e-05, + "loss": 4.5436, + "loss/crossentropy": 2.0804325118660927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18956683948636055, + "step": 2964 + }, + { + "epoch": 0.24716666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.593603515625, + "learning_rate": 4e-05, + "loss": 5.3799, + "loss/crossentropy": 2.3817147612571716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155500017106533, + "step": 2966 + }, + { + "epoch": 0.24733333333333332, + "grad_norm": 5.25, + "grad_norm_var": 0.6034464518229167, + "learning_rate": 4e-05, + "loss": 4.6662, + "loss/crossentropy": 1.789809986948967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085736319422722, + "step": 2968 + }, + { + "epoch": 0.2475, + "grad_norm": 5.0625, + "grad_norm_var": 0.602734375, + "learning_rate": 4e-05, + "loss": 5.3509, + "loss/crossentropy": 1.5908312797546387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741067972034216, + "step": 2970 + }, + { + "epoch": 0.24766666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.5999348958333334, + "learning_rate": 4e-05, + "loss": 4.859, + "loss/crossentropy": 1.097387008368969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1571547295898199, + "step": 2972 + }, + { + "epoch": 0.24783333333333332, + "grad_norm": 5.1875, + "grad_norm_var": 0.6059733072916667, + "learning_rate": 4e-05, + "loss": 5.2508, + "loss/crossentropy": 1.552187517285347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1697844583541155, + "step": 2974 + }, + { + "epoch": 0.248, + "grad_norm": 5.59375, + "grad_norm_var": 0.09501546223958333, + "learning_rate": 4e-05, + "loss": 4.5019, + "loss/crossentropy": 2.224805660545826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20385221019387245, + "step": 2976 + }, + { + "epoch": 0.24816666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08841145833333333, + "learning_rate": 4e-05, + "loss": 5.1168, + "loss/crossentropy": 2.5963165760040283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21954534575343132, + "step": 2978 + }, + { + "epoch": 0.24833333333333332, + "grad_norm": 4.875, + "grad_norm_var": 0.076953125, + "learning_rate": 4e-05, + "loss": 4.5793, + "loss/crossentropy": 1.7659974992275238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1737888753414154, + "step": 2980 + }, + { + "epoch": 0.2485, + "grad_norm": 5.4375, + "grad_norm_var": 0.07746988932291667, + "learning_rate": 4e-05, + "loss": 5.2939, + "loss/crossentropy": 2.5869803428649902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22545062378048897, + "step": 2982 + }, + { + "epoch": 0.24866666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.08444010416666667, + "learning_rate": 4e-05, + "loss": 4.9712, + "loss/crossentropy": 2.442513942718506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22997939586639404, + "step": 2984 + }, + { + "epoch": 0.24883333333333332, + "grad_norm": 5.15625, + "grad_norm_var": 0.11669514973958334, + "learning_rate": 4e-05, + "loss": 4.8956, + "loss/crossentropy": 2.3541765213012695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21100665628910065, + "step": 2986 + }, + { + "epoch": 0.249, + "grad_norm": 5.15625, + "grad_norm_var": 0.11220296223958333, + "learning_rate": 4e-05, + "loss": 5.2216, + "loss/crossentropy": 1.5780949518084526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18446550890803337, + "step": 2988 + }, + { + "epoch": 0.24916666666666668, + "grad_norm": 5.75, + "grad_norm_var": 0.12652587890625, + "learning_rate": 4e-05, + "loss": 5.1027, + "loss/crossentropy": 1.2989030554890633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17299162782728672, + "step": 2990 + }, + { + "epoch": 0.24933333333333332, + "grad_norm": 4.625, + "grad_norm_var": 0.13007405598958333, + "learning_rate": 4e-05, + "loss": 4.0987, + "loss/crossentropy": 1.86490598320961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18420669436454773, + "step": 2992 + }, + { + "epoch": 0.2495, + "grad_norm": 4.75, + "grad_norm_var": 0.14172770182291666, + "learning_rate": 4e-05, + "loss": 4.6842, + "loss/crossentropy": 1.3324758186936378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16431885957717896, + "step": 2994 + }, + { + "epoch": 0.24966666666666668, + "grad_norm": 5.03125, + "grad_norm_var": 0.13136393229166668, + "learning_rate": 4e-05, + "loss": 4.9357, + "loss/crossentropy": 2.010372966527939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944860704243183, + "step": 2996 + }, + { + "epoch": 0.24983333333333332, + "grad_norm": 5.34375, + "grad_norm_var": 0.13075764973958334, + "learning_rate": 4e-05, + "loss": 5.4454, + "loss/crossentropy": 2.445209562778473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20995644852519035, + "step": 2998 + }, + { + "epoch": 0.25, + "grad_norm": 4.875, + "grad_norm_var": 0.13524983723958334, + "learning_rate": 4e-05, + "loss": 4.1599, + "loss/crossentropy": 2.438918113708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217733934521675, + "step": 3000 + }, + { + "epoch": 0.25016666666666665, + "grad_norm": 5.4375, + "grad_norm_var": 0.15467122395833333, + "learning_rate": 4e-05, + "loss": 4.6514, + "loss/crossentropy": 1.777455359697342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.164383664727211, + "step": 3002 + }, + { + "epoch": 0.25033333333333335, + "grad_norm": 5.28125, + "grad_norm_var": 0.15601806640625, + "learning_rate": 4e-05, + "loss": 4.8879, + "loss/crossentropy": 1.5648024901747704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1650273408740759, + "step": 3004 + }, + { + "epoch": 0.2505, + "grad_norm": 5.25, + "grad_norm_var": 0.1333984375, + "learning_rate": 4e-05, + "loss": 4.8656, + "loss/crossentropy": 1.9242472425103188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20352683775126934, + "step": 3006 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.11575520833333333, + "learning_rate": 4e-05, + "loss": 5.4478, + "loss/crossentropy": 2.404434084892273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24195265397429466, + "step": 3008 + }, + { + "epoch": 0.25083333333333335, + "grad_norm": 4.6875, + "grad_norm_var": 0.11451416015625, + "learning_rate": 4e-05, + "loss": 4.4537, + "loss/crossentropy": 1.8876915350556374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17346367239952087, + "step": 3010 + }, + { + "epoch": 0.251, + "grad_norm": 5.90625, + "grad_norm_var": 0.148291015625, + "learning_rate": 4e-05, + "loss": 4.7405, + "loss/crossentropy": 1.463827095925808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1612564455717802, + "step": 3012 + }, + { + "epoch": 0.25116666666666665, + "grad_norm": 4.71875, + "grad_norm_var": 0.16005452473958334, + "learning_rate": 4e-05, + "loss": 4.3048, + "loss/crossentropy": 1.8139414489269257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981374379247427, + "step": 3014 + }, + { + "epoch": 0.25133333333333335, + "grad_norm": 4.9375, + "grad_norm_var": 0.13645833333333332, + "learning_rate": 4e-05, + "loss": 5.2113, + "loss/crossentropy": 2.3456265330314636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157314494252205, + "step": 3016 + }, + { + "epoch": 0.2515, + "grad_norm": 5.28125, + "grad_norm_var": 0.13212483723958332, + "learning_rate": 4e-05, + "loss": 5.2592, + "loss/crossentropy": 1.983870379626751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20767552591860294, + "step": 3018 + }, + { + "epoch": 0.25166666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.15126546223958334, + "learning_rate": 4e-05, + "loss": 5.5097, + "loss/crossentropy": 1.9750956296920776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23682751134037971, + "step": 3020 + }, + { + "epoch": 0.25183333333333335, + "grad_norm": 4.90625, + "grad_norm_var": 0.16835530598958334, + "learning_rate": 4e-05, + "loss": 4.7177, + "loss/crossentropy": 1.952264316380024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494295865297318, + "step": 3022 + }, + { + "epoch": 0.252, + "grad_norm": 7.9375, + "grad_norm_var": 0.65435791015625, + "learning_rate": 4e-05, + "loss": 4.9452, + "loss/crossentropy": 1.7668914496898651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20950108766555786, + "step": 3024 + }, + { + "epoch": 0.25216666666666665, + "grad_norm": 5.375, + "grad_norm_var": 0.6429646809895834, + "learning_rate": 4e-05, + "loss": 5.0399, + "loss/crossentropy": 2.3525235652923584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22499406710267067, + "step": 3026 + }, + { + "epoch": 0.25233333333333335, + "grad_norm": 4.84375, + "grad_norm_var": 0.6343587239583334, + "learning_rate": 4e-05, + "loss": 4.7009, + "loss/crossentropy": 2.423838883638382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22652239352464676, + "step": 3028 + }, + { + "epoch": 0.2525, + "grad_norm": 5.6875, + "grad_norm_var": 0.6251139322916667, + "learning_rate": 4e-05, + "loss": 4.9118, + "loss/crossentropy": 2.315009117126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25304970145225525, + "step": 3030 + }, + { + "epoch": 0.25266666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.6044921875, + "learning_rate": 4e-05, + "loss": 5.3209, + "loss/crossentropy": 1.1788423582911491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18229475058615208, + "step": 3032 + }, + { + "epoch": 0.25283333333333335, + "grad_norm": 4.90625, + "grad_norm_var": 0.59713134765625, + "learning_rate": 4e-05, + "loss": 4.8294, + "loss/crossentropy": 2.1284771263599396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20650208741426468, + "step": 3034 + }, + { + "epoch": 0.253, + "grad_norm": 5.25, + "grad_norm_var": 0.946728515625, + "learning_rate": 4e-05, + "loss": 4.833, + "loss/crossentropy": 1.8748324885964394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19778919033706188, + "step": 3036 + }, + { + "epoch": 0.25316666666666665, + "grad_norm": 5.6875, + "grad_norm_var": 0.8805989583333333, + "learning_rate": 4e-05, + "loss": 5.103, + "loss/crossentropy": 1.374475210905075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24287692829966545, + "step": 3038 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 4.90625, + "grad_norm_var": 0.49957275390625, + "learning_rate": 4e-05, + "loss": 4.6131, + "loss/crossentropy": 2.303386151790619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943720206618309, + "step": 3040 + }, + { + "epoch": 0.2535, + "grad_norm": 4.84375, + "grad_norm_var": 0.5026652018229166, + "learning_rate": 4e-05, + "loss": 5.2035, + "loss/crossentropy": 2.6998149752616882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21584071591496468, + "step": 3042 + }, + { + "epoch": 0.25366666666666665, + "grad_norm": 4.5625, + "grad_norm_var": 0.5387003580729167, + "learning_rate": 4e-05, + "loss": 4.652, + "loss/crossentropy": 1.8677115961909294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18451419100165367, + "step": 3044 + }, + { + "epoch": 0.25383333333333336, + "grad_norm": 5.3125, + "grad_norm_var": 0.5251302083333333, + "learning_rate": 4e-05, + "loss": 5.0967, + "loss/crossentropy": 2.085889607667923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815444529056549, + "step": 3046 + }, + { + "epoch": 0.254, + "grad_norm": 4.78125, + "grad_norm_var": 0.536572265625, + "learning_rate": 4e-05, + "loss": 5.1203, + "loss/crossentropy": 1.9297250807285309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21572398394346237, + "step": 3048 + }, + { + "epoch": 0.25416666666666665, + "grad_norm": 4.59375, + "grad_norm_var": 0.5628255208333334, + "learning_rate": 4e-05, + "loss": 5.0028, + "loss/crossentropy": 2.18538436293602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21398789063096046, + "step": 3050 + }, + { + "epoch": 0.25433333333333336, + "grad_norm": 5.0, + "grad_norm_var": 0.08681233723958333, + "learning_rate": 4e-05, + "loss": 4.7436, + "loss/crossentropy": 1.881318211555481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17555838078260422, + "step": 3052 + }, + { + "epoch": 0.2545, + "grad_norm": 5.53125, + "grad_norm_var": 0.06534830729166667, + "learning_rate": 4e-05, + "loss": 5.5268, + "loss/crossentropy": 2.593364179134369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22329099476337433, + "step": 3054 + }, + { + "epoch": 0.25466666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.06262613932291666, + "learning_rate": 4e-05, + "loss": 5.3541, + "loss/crossentropy": 1.8421437069773674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1585595551878214, + "step": 3056 + }, + { + "epoch": 0.25483333333333336, + "grad_norm": 8.75, + "grad_norm_var": 0.9886678059895834, + "learning_rate": 4e-05, + "loss": 4.2504, + "loss/crossentropy": 1.5335690155625343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15620680898427963, + "step": 3058 + }, + { + "epoch": 0.255, + "grad_norm": 5.21875, + "grad_norm_var": 0.9575358072916667, + "learning_rate": 4e-05, + "loss": 5.0432, + "loss/crossentropy": 2.136391341686249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133907824754715, + "step": 3060 + }, + { + "epoch": 0.25516666666666665, + "grad_norm": 4.34375, + "grad_norm_var": 0.994775390625, + "learning_rate": 4e-05, + "loss": 4.5259, + "loss/crossentropy": 2.214922845363617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20154564455151558, + "step": 3062 + }, + { + "epoch": 0.25533333333333336, + "grad_norm": 4.9375, + "grad_norm_var": 0.9885701497395833, + "learning_rate": 4e-05, + "loss": 5.2165, + "loss/crossentropy": 1.6571815237402916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795507915318012, + "step": 3064 + }, + { + "epoch": 0.2555, + "grad_norm": 4.96875, + "grad_norm_var": 0.9559529622395834, + "learning_rate": 4e-05, + "loss": 4.9805, + "loss/crossentropy": 2.20686411857605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20064949244260788, + "step": 3066 + }, + { + "epoch": 0.25566666666666665, + "grad_norm": 5.25, + "grad_norm_var": 0.9485636393229167, + "learning_rate": 4e-05, + "loss": 5.049, + "loss/crossentropy": 1.8665556535124779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17407236993312836, + "step": 3068 + }, + { + "epoch": 0.25583333333333336, + "grad_norm": 5.1875, + "grad_norm_var": 0.9401652018229166, + "learning_rate": 4e-05, + "loss": 5.3568, + "loss/crossentropy": 2.4138555824756622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21526311710476875, + "step": 3070 + }, + { + "epoch": 0.256, + "grad_norm": 4.71875, + "grad_norm_var": 0.9445149739583333, + "learning_rate": 4e-05, + "loss": 5.3499, + "loss/crossentropy": 1.4635539650917053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14331906288862228, + "step": 3072 + }, + { + "epoch": 0.25616666666666665, + "grad_norm": 5.625, + "grad_norm_var": 0.07545572916666667, + "learning_rate": 4e-05, + "loss": 5.6253, + "loss/crossentropy": 2.168128550052643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2246512994170189, + "step": 3074 + }, + { + "epoch": 0.25633333333333336, + "grad_norm": 5.40625, + "grad_norm_var": 0.0783203125, + "learning_rate": 4e-05, + "loss": 5.3584, + "loss/crossentropy": 2.067095883190632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999614752829075, + "step": 3076 + }, + { + "epoch": 0.2565, + "grad_norm": 4.625, + "grad_norm_var": 0.05533447265625, + "learning_rate": 4e-05, + "loss": 4.4266, + "loss/crossentropy": 1.4931324049830437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15274662151932716, + "step": 3078 + }, + { + "epoch": 0.25666666666666665, + "grad_norm": 5.0625, + "grad_norm_var": 0.07102457682291667, + "learning_rate": 4e-05, + "loss": 4.7741, + "loss/crossentropy": 1.9350454285740852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.177174037322402, + "step": 3080 + }, + { + "epoch": 0.25683333333333336, + "grad_norm": 5.3125, + "grad_norm_var": 0.08879801432291666, + "learning_rate": 4e-05, + "loss": 5.2018, + "loss/crossentropy": 2.3658514618873596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21689199656248093, + "step": 3082 + }, + { + "epoch": 0.257, + "grad_norm": 5.5, + "grad_norm_var": 0.10442301432291666, + "learning_rate": 4e-05, + "loss": 5.0723, + "loss/crossentropy": 1.9613457173109055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923742052167654, + "step": 3084 + }, + { + "epoch": 0.25716666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.10349934895833333, + "learning_rate": 4e-05, + "loss": 5.1772, + "loss/crossentropy": 2.0767830312252045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23773017153143883, + "step": 3086 + }, + { + "epoch": 0.25733333333333336, + "grad_norm": 5.375, + "grad_norm_var": 0.10396728515625, + "learning_rate": 4e-05, + "loss": 4.8215, + "loss/crossentropy": 1.7117633819580078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1797526590526104, + "step": 3088 + }, + { + "epoch": 0.2575, + "grad_norm": 5.28125, + "grad_norm_var": 0.114306640625, + "learning_rate": 4e-05, + "loss": 5.2554, + "loss/crossentropy": 1.9202795922756195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22795704752206802, + "step": 3090 + }, + { + "epoch": 0.25766666666666665, + "grad_norm": 4.5625, + "grad_norm_var": 0.12177327473958334, + "learning_rate": 4e-05, + "loss": 4.9953, + "loss/crossentropy": 1.3964777737855911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15975043550133705, + "step": 3092 + }, + { + "epoch": 0.25783333333333336, + "grad_norm": 5.125, + "grad_norm_var": 0.117822265625, + "learning_rate": 4e-05, + "loss": 4.9921, + "loss/crossentropy": 2.2678469121456146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159191109240055, + "step": 3094 + }, + { + "epoch": 0.258, + "grad_norm": 5.0625, + "grad_norm_var": 0.10702718098958333, + "learning_rate": 4e-05, + "loss": 5.1833, + "loss/crossentropy": 1.8077596053481102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17754389718174934, + "step": 3096 + }, + { + "epoch": 0.25816666666666666, + "grad_norm": 5.46875, + "grad_norm_var": 0.12102457682291666, + "learning_rate": 4e-05, + "loss": 5.5425, + "loss/crossentropy": 2.327247679233551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090022973716259, + "step": 3098 + }, + { + "epoch": 0.25833333333333336, + "grad_norm": 5.125, + "grad_norm_var": 0.11256510416666667, + "learning_rate": 4e-05, + "loss": 4.4231, + "loss/crossentropy": 1.0582982525229454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14690488018095493, + "step": 3100 + }, + { + "epoch": 0.2585, + "grad_norm": 4.78125, + "grad_norm_var": 0.12447509765625, + "learning_rate": 4e-05, + "loss": 5.1684, + "loss/crossentropy": 2.24881511926651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19497083500027657, + "step": 3102 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.12945556640625, + "learning_rate": 4e-05, + "loss": 4.5281, + "loss/crossentropy": 2.5441195368766785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22488819062709808, + "step": 3104 + }, + { + "epoch": 0.25883333333333336, + "grad_norm": 5.0, + "grad_norm_var": 0.09163004557291667, + "learning_rate": 4e-05, + "loss": 4.4234, + "loss/crossentropy": 1.352687880396843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15268324688076973, + "step": 3106 + }, + { + "epoch": 0.259, + "grad_norm": 4.625, + "grad_norm_var": 0.10302327473958334, + "learning_rate": 4e-05, + "loss": 5.2155, + "loss/crossentropy": 2.357410877943039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21430985257029533, + "step": 3108 + }, + { + "epoch": 0.25916666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.10701497395833333, + "learning_rate": 4e-05, + "loss": 4.9418, + "loss/crossentropy": 1.0604775324463844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14432695135474205, + "step": 3110 + }, + { + "epoch": 0.25933333333333336, + "grad_norm": 5.28125, + "grad_norm_var": 0.1150390625, + "learning_rate": 4e-05, + "loss": 5.6923, + "loss/crossentropy": 2.180578827857971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20271052047610283, + "step": 3112 + }, + { + "epoch": 0.2595, + "grad_norm": 6.125, + "grad_norm_var": 0.15416259765625, + "learning_rate": 4e-05, + "loss": 5.4177, + "loss/crossentropy": 2.2036134004592896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19966942071914673, + "step": 3114 + }, + { + "epoch": 0.25966666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.15191650390625, + "learning_rate": 4e-05, + "loss": 4.8192, + "loss/crossentropy": 2.486625075340271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21220136806368828, + "step": 3116 + }, + { + "epoch": 0.25983333333333336, + "grad_norm": 5.0, + "grad_norm_var": 0.14612223307291666, + "learning_rate": 4e-05, + "loss": 5.0516, + "loss/crossentropy": 2.187554508447647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20102859288454056, + "step": 3118 + }, + { + "epoch": 0.26, + "grad_norm": 4.375, + "grad_norm_var": 0.16428629557291666, + "learning_rate": 4e-05, + "loss": 4.7012, + "loss/crossentropy": 2.2847339808940887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20380695909261703, + "step": 3120 + }, + { + "epoch": 0.26016666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.17362874348958332, + "learning_rate": 4e-05, + "loss": 4.6752, + "loss/crossentropy": 2.6803387999534607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214877150952816, + "step": 3122 + }, + { + "epoch": 0.26033333333333336, + "grad_norm": 5.09375, + "grad_norm_var": 0.15247395833333333, + "learning_rate": 4e-05, + "loss": 5.0271, + "loss/crossentropy": 1.861951231956482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054112609475851, + "step": 3124 + }, + { + "epoch": 0.2605, + "grad_norm": 4.84375, + "grad_norm_var": 0.17760416666666667, + "learning_rate": 4e-05, + "loss": 5.052, + "loss/crossentropy": 1.845504753291607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22759747132658958, + "step": 3126 + }, + { + "epoch": 0.26066666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.168603515625, + "learning_rate": 4e-05, + "loss": 4.966, + "loss/crossentropy": 2.169025242328644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220594372600317, + "step": 3128 + }, + { + "epoch": 0.2608333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.0830078125, + "learning_rate": 4e-05, + "loss": 4.4052, + "loss/crossentropy": 1.50277678668499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1798977069556713, + "step": 3130 + }, + { + "epoch": 0.261, + "grad_norm": 4.875, + "grad_norm_var": 0.09996337890625, + "learning_rate": 4e-05, + "loss": 5.3784, + "loss/crossentropy": 2.4498740434646606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22047830745577812, + "step": 3132 + }, + { + "epoch": 0.26116666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.10013020833333333, + "learning_rate": 4e-05, + "loss": 4.8054, + "loss/crossentropy": 1.9772669970989227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18800954148173332, + "step": 3134 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.09488525390625, + "learning_rate": 4e-05, + "loss": 5.2146, + "loss/crossentropy": 1.8985195010900497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984998844563961, + "step": 3136 + }, + { + "epoch": 0.2615, + "grad_norm": 5.125, + "grad_norm_var": 0.086572265625, + "learning_rate": 4e-05, + "loss": 4.9154, + "loss/crossentropy": 1.8307873159646988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17886247113347054, + "step": 3138 + }, + { + "epoch": 0.26166666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.08778889973958333, + "learning_rate": 4e-05, + "loss": 4.2008, + "loss/crossentropy": 1.0259275138378143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13531352765858173, + "step": 3140 + }, + { + "epoch": 0.2618333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.07936197916666667, + "learning_rate": 4e-05, + "loss": 4.8041, + "loss/crossentropy": 1.9614720344543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22988587990403175, + "step": 3142 + }, + { + "epoch": 0.262, + "grad_norm": 4.96875, + "grad_norm_var": 0.07545572916666667, + "learning_rate": 4e-05, + "loss": 4.651, + "loss/crossentropy": 2.0952285528182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139127030968666, + "step": 3144 + }, + { + "epoch": 0.26216666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.07194010416666667, + "learning_rate": 4e-05, + "loss": 4.8872, + "loss/crossentropy": 2.3387314677238464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2338441088795662, + "step": 3146 + }, + { + "epoch": 0.2623333333333333, + "grad_norm": 7.0625, + "grad_norm_var": 0.3492838541666667, + "learning_rate": 4e-05, + "loss": 4.8195, + "loss/crossentropy": 1.864521287381649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1690395548939705, + "step": 3148 + }, + { + "epoch": 0.2625, + "grad_norm": 4.96875, + "grad_norm_var": 0.34215087890625, + "learning_rate": 4e-05, + "loss": 4.9518, + "loss/crossentropy": 2.1711268723011017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20523667708039284, + "step": 3150 + }, + { + "epoch": 0.26266666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.33498942057291664, + "learning_rate": 4e-05, + "loss": 5.4427, + "loss/crossentropy": 2.0541456565260887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1990213841199875, + "step": 3152 + }, + { + "epoch": 0.2628333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.33255208333333336, + "learning_rate": 4e-05, + "loss": 4.4446, + "loss/crossentropy": 2.2430761456489563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24454378709197044, + "step": 3154 + }, + { + "epoch": 0.263, + "grad_norm": 5.5, + "grad_norm_var": 0.3441365559895833, + "learning_rate": 4e-05, + "loss": 4.6601, + "loss/crossentropy": 1.9410057738423347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1790495365858078, + "step": 3156 + }, + { + "epoch": 0.26316666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.34894205729166666, + "learning_rate": 4e-05, + "loss": 4.716, + "loss/crossentropy": 2.0159209072589874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19673524983227253, + "step": 3158 + }, + { + "epoch": 0.2633333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.3651041666666667, + "learning_rate": 4e-05, + "loss": 5.6148, + "loss/crossentropy": 2.481500566005707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217854592949152, + "step": 3160 + }, + { + "epoch": 0.2635, + "grad_norm": 5.875, + "grad_norm_var": 0.3628743489583333, + "learning_rate": 4e-05, + "loss": 4.9497, + "loss/crossentropy": 2.3142440021038055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22278093919157982, + "step": 3162 + }, + { + "epoch": 0.26366666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.11575520833333333, + "learning_rate": 4e-05, + "loss": 5.2122, + "loss/crossentropy": 1.4542016088962555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14941613376140594, + "step": 3164 + }, + { + "epoch": 0.2638333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.13199462890625, + "learning_rate": 4e-05, + "loss": 4.6475, + "loss/crossentropy": 2.320178806781769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20817406103014946, + "step": 3166 + }, + { + "epoch": 0.264, + "grad_norm": 5.09375, + "grad_norm_var": 0.13214518229166666, + "learning_rate": 4e-05, + "loss": 5.1067, + "loss/crossentropy": 1.9453425705432892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127118781208992, + "step": 3168 + }, + { + "epoch": 0.26416666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.15015869140625, + "learning_rate": 4e-05, + "loss": 4.8359, + "loss/crossentropy": 1.9130103662610054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991387903690338, + "step": 3170 + }, + { + "epoch": 0.2643333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.14706624348958333, + "learning_rate": 4e-05, + "loss": 5.1114, + "loss/crossentropy": 1.981778234243393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902402602136135, + "step": 3172 + }, + { + "epoch": 0.2645, + "grad_norm": 5.4375, + "grad_norm_var": 0.13782145182291666, + "learning_rate": 4e-05, + "loss": 5.281, + "loss/crossentropy": 2.231064334511757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19765946455299854, + "step": 3174 + }, + { + "epoch": 0.26466666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.119384765625, + "learning_rate": 4e-05, + "loss": 4.3434, + "loss/crossentropy": 1.753200277686119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18743771128356457, + "step": 3176 + }, + { + "epoch": 0.2648333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.10230712890625, + "learning_rate": 4e-05, + "loss": 5.3008, + "loss/crossentropy": 2.396784156560898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2302083782851696, + "step": 3178 + }, + { + "epoch": 0.265, + "grad_norm": 5.0, + "grad_norm_var": 0.097509765625, + "learning_rate": 4e-05, + "loss": 5.6565, + "loss/crossentropy": 2.6294925808906555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2336473949253559, + "step": 3180 + }, + { + "epoch": 0.26516666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.09308268229166666, + "learning_rate": 4e-05, + "loss": 4.9291, + "loss/crossentropy": 1.6210493966937065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16944964230060577, + "step": 3182 + }, + { + "epoch": 0.2653333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.09791259765625, + "learning_rate": 4e-05, + "loss": 4.6843, + "loss/crossentropy": 1.8240682110190392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20642348006367683, + "step": 3184 + }, + { + "epoch": 0.2655, + "grad_norm": 5.125, + "grad_norm_var": 0.08756510416666667, + "learning_rate": 4e-05, + "loss": 4.7397, + "loss/crossentropy": 2.4440919160842896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144545502960682, + "step": 3186 + }, + { + "epoch": 0.26566666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.07662760416666667, + "learning_rate": 4e-05, + "loss": 5.3081, + "loss/crossentropy": 2.5918545722961426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22979921475052834, + "step": 3188 + }, + { + "epoch": 0.2658333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.07615559895833333, + "learning_rate": 4e-05, + "loss": 4.3005, + "loss/crossentropy": 1.7671714574098587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187911469489336, + "step": 3190 + }, + { + "epoch": 0.266, + "grad_norm": 5.375, + "grad_norm_var": 0.084375, + "learning_rate": 4e-05, + "loss": 5.0428, + "loss/crossentropy": 1.6388864442706108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955713890492916, + "step": 3192 + }, + { + "epoch": 0.26616666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.04895426432291667, + "learning_rate": 4e-05, + "loss": 5.1676, + "loss/crossentropy": 1.9798070192337036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20462529733777046, + "step": 3194 + }, + { + "epoch": 0.2663333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.04947509765625, + "learning_rate": 4e-05, + "loss": 4.5824, + "loss/crossentropy": 2.0957940965890884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1770927645266056, + "step": 3196 + }, + { + "epoch": 0.2665, + "grad_norm": 4.90625, + "grad_norm_var": 0.04244384765625, + "learning_rate": 4e-05, + "loss": 5.1135, + "loss/crossentropy": 1.977191299200058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755628753453493, + "step": 3198 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 5.3125, + "grad_norm_var": 0.043863932291666664, + "learning_rate": 4e-05, + "loss": 5.1938, + "loss/crossentropy": 1.5719657689332962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15234808064997196, + "step": 3200 + }, + { + "epoch": 0.2668333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.09797770182291667, + "learning_rate": 4e-05, + "loss": 5.1316, + "loss/crossentropy": 1.9265673011541367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19334514439105988, + "step": 3202 + }, + { + "epoch": 0.267, + "grad_norm": 5.4375, + "grad_norm_var": 0.11991780598958333, + "learning_rate": 4e-05, + "loss": 5.4066, + "loss/crossentropy": 1.7487748563289642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16883439384400845, + "step": 3204 + }, + { + "epoch": 0.26716666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.10393473307291666, + "learning_rate": 4e-05, + "loss": 5.3232, + "loss/crossentropy": 1.8918979242444038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1814283076673746, + "step": 3206 + }, + { + "epoch": 0.2673333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.10279947916666667, + "learning_rate": 4e-05, + "loss": 4.5995, + "loss/crossentropy": 2.1503345668315887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536822408437729, + "step": 3208 + }, + { + "epoch": 0.2675, + "grad_norm": 4.90625, + "grad_norm_var": 0.10256754557291667, + "learning_rate": 4e-05, + "loss": 4.8599, + "loss/crossentropy": 1.4438120797276497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15272528119385242, + "step": 3210 + }, + { + "epoch": 0.26766666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.10813395182291667, + "learning_rate": 4e-05, + "loss": 4.9177, + "loss/crossentropy": 1.8605887293815613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19962546601891518, + "step": 3212 + }, + { + "epoch": 0.2678333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.11399332682291667, + "learning_rate": 4e-05, + "loss": 5.6567, + "loss/crossentropy": 2.586755871772766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24593067169189453, + "step": 3214 + }, + { + "epoch": 0.268, + "grad_norm": 10.3125, + "grad_norm_var": 1.7810506184895833, + "learning_rate": 4e-05, + "loss": 5.4459, + "loss/crossentropy": 1.9296107813715935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18788691982626915, + "step": 3216 + }, + { + "epoch": 0.26816666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 1.7888631184895833, + "learning_rate": 4e-05, + "loss": 4.6947, + "loss/crossentropy": 2.048318862915039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871740221977234, + "step": 3218 + }, + { + "epoch": 0.2683333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 1.8235514322916666, + "learning_rate": 4e-05, + "loss": 4.4589, + "loss/crossentropy": 1.732799842953682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16609930619597435, + "step": 3220 + }, + { + "epoch": 0.2685, + "grad_norm": 5.34375, + "grad_norm_var": 1.8171712239583333, + "learning_rate": 4e-05, + "loss": 5.099, + "loss/crossentropy": 2.048336148262024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951584815979004, + "step": 3222 + }, + { + "epoch": 0.26866666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 1.83570556640625, + "learning_rate": 4e-05, + "loss": 4.5305, + "loss/crossentropy": 2.4394567012786865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21930208802223206, + "step": 3224 + }, + { + "epoch": 0.2688333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 1.851416015625, + "learning_rate": 4e-05, + "loss": 4.3228, + "loss/crossentropy": 1.6342605128884315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1711755134165287, + "step": 3226 + }, + { + "epoch": 0.269, + "grad_norm": 4.9375, + "grad_norm_var": 1.8655558268229167, + "learning_rate": 4e-05, + "loss": 4.9384, + "loss/crossentropy": 2.0174030661582947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106718048453331, + "step": 3228 + }, + { + "epoch": 0.26916666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 1.8641886393229166, + "learning_rate": 4e-05, + "loss": 5.324, + "loss/crossentropy": 1.9064742401242256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16172141209244728, + "step": 3230 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.06669514973958333, + "learning_rate": 4e-05, + "loss": 4.881, + "loss/crossentropy": 2.1121154129505157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355704799294472, + "step": 3232 + }, + { + "epoch": 0.2695, + "grad_norm": 5.1875, + "grad_norm_var": 0.34280192057291664, + "learning_rate": 4e-05, + "loss": 5.3211, + "loss/crossentropy": 1.989701747894287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762289609760046, + "step": 3234 + }, + { + "epoch": 0.26966666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.33136393229166666, + "learning_rate": 4e-05, + "loss": 4.7003, + "loss/crossentropy": 2.457001119852066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2369070202112198, + "step": 3236 + }, + { + "epoch": 0.2698333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.32483317057291666, + "learning_rate": 4e-05, + "loss": 4.9221, + "loss/crossentropy": 0.9388425797224045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12862342968583107, + "step": 3238 + }, + { + "epoch": 0.27, + "grad_norm": 5.03125, + "grad_norm_var": 0.318994140625, + "learning_rate": 4e-05, + "loss": 4.5479, + "loss/crossentropy": 0.9154981449246407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.126858439296484, + "step": 3240 + }, + { + "epoch": 0.27016666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.32135009765625, + "learning_rate": 4e-05, + "loss": 4.9938, + "loss/crossentropy": 1.5279822647571564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15195055678486824, + "step": 3242 + }, + { + "epoch": 0.2703333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.30831705729166664, + "learning_rate": 4e-05, + "loss": 5.1522, + "loss/crossentropy": 2.0843097865581512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19562271982431412, + "step": 3244 + }, + { + "epoch": 0.2705, + "grad_norm": 5.09375, + "grad_norm_var": 0.3346964518229167, + "learning_rate": 4e-05, + "loss": 5.0055, + "loss/crossentropy": 1.8548680245876312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1670110784471035, + "step": 3246 + }, + { + "epoch": 0.27066666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.32146809895833334, + "learning_rate": 4e-05, + "loss": 4.913, + "loss/crossentropy": 2.6331475973129272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23913651704788208, + "step": 3248 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.05701497395833333, + "learning_rate": 4e-05, + "loss": 4.9626, + "loss/crossentropy": 2.254679262638092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20621953532099724, + "step": 3250 + }, + { + "epoch": 0.271, + "grad_norm": 5.0, + "grad_norm_var": 0.048291015625, + "learning_rate": 4e-05, + "loss": 4.4947, + "loss/crossentropy": 2.1386323794722557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046901538968086, + "step": 3252 + }, + { + "epoch": 0.27116666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.04820556640625, + "learning_rate": 4e-05, + "loss": 4.8927, + "loss/crossentropy": 1.6399564519524574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18125244602560997, + "step": 3254 + }, + { + "epoch": 0.2713333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.05284830729166667, + "learning_rate": 4e-05, + "loss": 5.1166, + "loss/crossentropy": 1.8724690079689026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21212750673294067, + "step": 3256 + }, + { + "epoch": 0.2715, + "grad_norm": 4.875, + "grad_norm_var": 0.03268229166666667, + "learning_rate": 4e-05, + "loss": 4.6229, + "loss/crossentropy": 1.315964438021183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17530952394008636, + "step": 3258 + }, + { + "epoch": 0.27166666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.03658854166666667, + "learning_rate": 4e-05, + "loss": 5.214, + "loss/crossentropy": 1.7634995728731155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21435541100800037, + "step": 3260 + }, + { + "epoch": 0.2718333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.15556233723958332, + "learning_rate": 4e-05, + "loss": 4.5934, + "loss/crossentropy": 2.2641907036304474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568279296159744, + "step": 3262 + }, + { + "epoch": 0.272, + "grad_norm": 4.40625, + "grad_norm_var": 0.18863525390625, + "learning_rate": 4e-05, + "loss": 4.6097, + "loss/crossentropy": 1.5739614740014076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1697534527629614, + "step": 3264 + }, + { + "epoch": 0.27216666666666667, + "grad_norm": 5.65625, + "grad_norm_var": 0.21256510416666666, + "learning_rate": 4e-05, + "loss": 4.8451, + "loss/crossentropy": 2.052751898765564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18519877456128597, + "step": 3266 + }, + { + "epoch": 0.2723333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.23088785807291667, + "learning_rate": 4e-05, + "loss": 4.6083, + "loss/crossentropy": 1.1079089492559433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11845254711806774, + "step": 3268 + }, + { + "epoch": 0.2725, + "grad_norm": 4.9375, + "grad_norm_var": 0.2380859375, + "learning_rate": 4e-05, + "loss": 4.9116, + "loss/crossentropy": 1.9221205562353134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930643692612648, + "step": 3270 + }, + { + "epoch": 0.27266666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.2255859375, + "learning_rate": 4e-05, + "loss": 5.3047, + "loss/crossentropy": 2.6353172063827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324591875076294, + "step": 3272 + }, + { + "epoch": 0.2728333333333333, + "grad_norm": 6.5625, + "grad_norm_var": 0.3466796875, + "learning_rate": 4e-05, + "loss": 4.5763, + "loss/crossentropy": 2.372876226902008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210596241056919, + "step": 3274 + }, + { + "epoch": 0.273, + "grad_norm": 4.5, + "grad_norm_var": 0.37942708333333336, + "learning_rate": 4e-05, + "loss": 4.5375, + "loss/crossentropy": 1.4885797277092934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15314335376024246, + "step": 3276 + }, + { + "epoch": 0.27316666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.27278238932291665, + "learning_rate": 4e-05, + "loss": 4.698, + "loss/crossentropy": 1.7347316294908524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17671633325517178, + "step": 3278 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.239697265625, + "learning_rate": 4e-05, + "loss": 5.1095, + "loss/crossentropy": 1.9635898768901825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21393844485282898, + "step": 3280 + }, + { + "epoch": 0.2735, + "grad_norm": 4.78125, + "grad_norm_var": 0.23287760416666667, + "learning_rate": 4e-05, + "loss": 4.4866, + "loss/crossentropy": 1.3102534040808678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16433557122945786, + "step": 3282 + }, + { + "epoch": 0.27366666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.21649983723958333, + "learning_rate": 4e-05, + "loss": 4.8568, + "loss/crossentropy": 2.0422130823135376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17257808148860931, + "step": 3284 + }, + { + "epoch": 0.2738333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.207275390625, + "learning_rate": 4e-05, + "loss": 5.3481, + "loss/crossentropy": 1.767970271408558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1866742093116045, + "step": 3286 + }, + { + "epoch": 0.274, + "grad_norm": 5.34375, + "grad_norm_var": 0.21874593098958334, + "learning_rate": 4e-05, + "loss": 4.8972, + "loss/crossentropy": 2.234264552593231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20818063244223595, + "step": 3288 + }, + { + "epoch": 0.27416666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.07056884765625, + "learning_rate": 4e-05, + "loss": 4.5813, + "loss/crossentropy": 1.398232415318489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15509275533258915, + "step": 3290 + }, + { + "epoch": 0.2743333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.054541015625, + "learning_rate": 4e-05, + "loss": 4.8906, + "loss/crossentropy": 1.7504291385412216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19099892303347588, + "step": 3292 + }, + { + "epoch": 0.2745, + "grad_norm": 5.28125, + "grad_norm_var": 0.04859619140625, + "learning_rate": 4e-05, + "loss": 4.9809, + "loss/crossentropy": 2.1250991821289062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20637407153844833, + "step": 3294 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 4e-05, + "loss": 5.0804, + "loss/crossentropy": 2.4322333335876465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22350069507956505, + "step": 3296 + }, + { + "epoch": 0.2748333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.03292643229166667, + "learning_rate": 4e-05, + "loss": 5.1094, + "loss/crossentropy": 2.3606340885162354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20735639706254005, + "step": 3298 + }, + { + "epoch": 0.275, + "grad_norm": 4.875, + "grad_norm_var": 0.030582682291666666, + "learning_rate": 4e-05, + "loss": 4.8605, + "loss/crossentropy": 1.2059366628527641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15578687004745007, + "step": 3300 + }, + { + "epoch": 0.27516666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.04980061848958333, + "learning_rate": 4e-05, + "loss": 4.461, + "loss/crossentropy": 1.335912600159645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15207264013588428, + "step": 3302 + }, + { + "epoch": 0.2753333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.042867024739583336, + "learning_rate": 4e-05, + "loss": 4.2409, + "loss/crossentropy": 0.9111597612500191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1322095450013876, + "step": 3304 + }, + { + "epoch": 0.2755, + "grad_norm": 4.59375, + "grad_norm_var": 0.05423177083333333, + "learning_rate": 4e-05, + "loss": 5.0508, + "loss/crossentropy": 2.276846766471863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194971933960915, + "step": 3306 + }, + { + "epoch": 0.27566666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.05865885416666667, + "learning_rate": 4e-05, + "loss": 3.8198, + "loss/crossentropy": 1.5635306984186172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16944929771125317, + "step": 3308 + }, + { + "epoch": 0.2758333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.060282389322916664, + "learning_rate": 4e-05, + "loss": 4.7673, + "loss/crossentropy": 2.0001417845487595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25854466669261456, + "step": 3310 + }, + { + "epoch": 0.276, + "grad_norm": 5.28125, + "grad_norm_var": 0.06731770833333334, + "learning_rate": 4e-05, + "loss": 5.1252, + "loss/crossentropy": 2.004520893096924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436689630150795, + "step": 3312 + }, + { + "epoch": 0.27616666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.0599609375, + "learning_rate": 4e-05, + "loss": 4.1743, + "loss/crossentropy": 1.7104368656873703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18997495248913765, + "step": 3314 + }, + { + "epoch": 0.2763333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.067431640625, + "learning_rate": 4e-05, + "loss": 4.3987, + "loss/crossentropy": 1.6688388288021088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20947271212935448, + "step": 3316 + }, + { + "epoch": 0.2765, + "grad_norm": 4.59375, + "grad_norm_var": 0.04918212890625, + "learning_rate": 4e-05, + "loss": 4.6297, + "loss/crossentropy": 1.7906979843974113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828641202300787, + "step": 3318 + }, + { + "epoch": 0.27666666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.13917643229166668, + "learning_rate": 4e-05, + "loss": 5.2282, + "loss/crossentropy": 1.8596151024103165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19812500476837158, + "step": 3320 + }, + { + "epoch": 0.2768333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.13097330729166667, + "learning_rate": 4e-05, + "loss": 4.6779, + "loss/crossentropy": 1.829872913658619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17703045904636383, + "step": 3322 + }, + { + "epoch": 0.277, + "grad_norm": 5.5625, + "grad_norm_var": 0.14312744140625, + "learning_rate": 4e-05, + "loss": 5.2029, + "loss/crossentropy": 2.1334268152713776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22605855762958527, + "step": 3324 + }, + { + "epoch": 0.2771666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.13534749348958333, + "learning_rate": 4e-05, + "loss": 5.2196, + "loss/crossentropy": 2.1470797285437584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1935490034520626, + "step": 3326 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.14254150390625, + "learning_rate": 4e-05, + "loss": 4.7942, + "loss/crossentropy": 1.2008096277713776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13764559477567673, + "step": 3328 + }, + { + "epoch": 0.2775, + "grad_norm": 5.15625, + "grad_norm_var": 0.14039306640625, + "learning_rate": 4e-05, + "loss": 4.7285, + "loss/crossentropy": 0.860782727599144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11173893883824348, + "step": 3330 + }, + { + "epoch": 0.2776666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.12434488932291667, + "learning_rate": 4e-05, + "loss": 5.1798, + "loss/crossentropy": 1.5026598796248436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16885693185031414, + "step": 3332 + }, + { + "epoch": 0.2778333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.11148681640625, + "learning_rate": 4e-05, + "loss": 5.1526, + "loss/crossentropy": 1.838165283203125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18797642178833485, + "step": 3334 + }, + { + "epoch": 0.278, + "grad_norm": 5.25, + "grad_norm_var": 0.06652018229166666, + "learning_rate": 4e-05, + "loss": 4.8834, + "loss/crossentropy": 1.5069977939128876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18563106283545494, + "step": 3336 + }, + { + "epoch": 0.2781666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.07502848307291667, + "learning_rate": 4e-05, + "loss": 4.7134, + "loss/crossentropy": 2.5508508384227753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22817543521523476, + "step": 3338 + }, + { + "epoch": 0.2783333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.04722900390625, + "learning_rate": 4e-05, + "loss": 4.7194, + "loss/crossentropy": 2.5258346498012543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2168513759970665, + "step": 3340 + }, + { + "epoch": 0.2785, + "grad_norm": 5.125, + "grad_norm_var": 0.04166259765625, + "learning_rate": 4e-05, + "loss": 4.8206, + "loss/crossentropy": 1.079502247273922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16153132170438766, + "step": 3342 + }, + { + "epoch": 0.2786666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.04251302083333333, + "learning_rate": 4e-05, + "loss": 5.8333, + "loss/crossentropy": 1.930338904261589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19220566004514694, + "step": 3344 + }, + { + "epoch": 0.2788333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.05429280598958333, + "learning_rate": 4e-05, + "loss": 4.6864, + "loss/crossentropy": 2.3099615573883057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20308882370591164, + "step": 3346 + }, + { + "epoch": 0.279, + "grad_norm": 4.78125, + "grad_norm_var": 0.05310872395833333, + "learning_rate": 4e-05, + "loss": 4.0611, + "loss/crossentropy": 1.3752945065498352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14378135465085506, + "step": 3348 + }, + { + "epoch": 0.2791666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.053450520833333334, + "learning_rate": 4e-05, + "loss": 5.0266, + "loss/crossentropy": 2.265912115573883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20121736079454422, + "step": 3350 + }, + { + "epoch": 0.2793333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.036909993489583334, + "learning_rate": 4e-05, + "loss": 4.7789, + "loss/crossentropy": 2.411749243736267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24077193066477776, + "step": 3352 + }, + { + "epoch": 0.2795, + "grad_norm": 5.1875, + "grad_norm_var": 0.049214680989583336, + "learning_rate": 4e-05, + "loss": 5.0433, + "loss/crossentropy": 1.2377412021160126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1717513632029295, + "step": 3354 + }, + { + "epoch": 0.2796666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.05455729166666667, + "learning_rate": 4e-05, + "loss": 5.0701, + "loss/crossentropy": 1.9626172259449959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19007350504398346, + "step": 3356 + }, + { + "epoch": 0.2798333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.06783447265625, + "learning_rate": 4e-05, + "loss": 4.8856, + "loss/crossentropy": 1.9040052741765976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902400143444538, + "step": 3358 + }, + { + "epoch": 0.28, + "grad_norm": 4.96875, + "grad_norm_var": 0.061909993489583336, + "learning_rate": 4e-05, + "loss": 4.5763, + "loss/crossentropy": 2.0596228316426277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19010531157255173, + "step": 3360 + }, + { + "epoch": 0.2801666666666667, + "grad_norm": 5.875, + "grad_norm_var": 0.10143229166666666, + "learning_rate": 4e-05, + "loss": 4.9529, + "loss/crossentropy": 1.5693950355052948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19930481910705566, + "step": 3362 + }, + { + "epoch": 0.2803333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.09498697916666667, + "learning_rate": 4e-05, + "loss": 5.1627, + "loss/crossentropy": 1.9942995011806488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22873876243829727, + "step": 3364 + }, + { + "epoch": 0.2805, + "grad_norm": 5.28125, + "grad_norm_var": 0.09830322265625, + "learning_rate": 4e-05, + "loss": 4.52, + "loss/crossentropy": 0.7917919382452965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12067169696092606, + "step": 3366 + }, + { + "epoch": 0.2806666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.10076497395833334, + "learning_rate": 4e-05, + "loss": 4.3528, + "loss/crossentropy": 1.2874325066804886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14191594906151295, + "step": 3368 + }, + { + "epoch": 0.2808333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.07919514973958333, + "learning_rate": 4e-05, + "loss": 4.8554, + "loss/crossentropy": 1.3948650658130646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20387698709964752, + "step": 3370 + }, + { + "epoch": 0.281, + "grad_norm": 4.9375, + "grad_norm_var": 0.07200113932291667, + "learning_rate": 4e-05, + "loss": 5.2899, + "loss/crossentropy": 1.7417053952813148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18501684069633484, + "step": 3372 + }, + { + "epoch": 0.2811666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.08860270182291667, + "learning_rate": 4e-05, + "loss": 3.8689, + "loss/crossentropy": 1.5205266997218132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15771334990859032, + "step": 3374 + }, + { + "epoch": 0.2813333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.11339518229166666, + "learning_rate": 4e-05, + "loss": 5.5822, + "loss/crossentropy": 2.372869312763214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21234075352549553, + "step": 3376 + }, + { + "epoch": 0.2815, + "grad_norm": 5.46875, + "grad_norm_var": 0.08644205729166667, + "learning_rate": 4e-05, + "loss": 4.5008, + "loss/crossentropy": 1.7059477791190147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1649102047085762, + "step": 3378 + }, + { + "epoch": 0.2816666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.0859375, + "learning_rate": 4e-05, + "loss": 5.1233, + "loss/crossentropy": 2.4804338812828064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21589474752545357, + "step": 3380 + }, + { + "epoch": 0.2818333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.08381754557291667, + "learning_rate": 4e-05, + "loss": 4.868, + "loss/crossentropy": 1.9849779903888702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23055313155055046, + "step": 3382 + }, + { + "epoch": 0.282, + "grad_norm": 4.3125, + "grad_norm_var": 0.11340738932291666, + "learning_rate": 4e-05, + "loss": 4.6629, + "loss/crossentropy": 2.002195544540882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18036412820219994, + "step": 3384 + }, + { + "epoch": 0.2821666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.11770426432291667, + "learning_rate": 4e-05, + "loss": 5.5229, + "loss/crossentropy": 2.409064471721649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20758257806301117, + "step": 3386 + }, + { + "epoch": 0.2823333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.12473551432291667, + "learning_rate": 4e-05, + "loss": 4.3955, + "loss/crossentropy": 1.6170982271432877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150630559772253, + "step": 3388 + }, + { + "epoch": 0.2825, + "grad_norm": 5.0, + "grad_norm_var": 0.208203125, + "learning_rate": 4e-05, + "loss": 4.9565, + "loss/crossentropy": 2.2994788140058517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032534722238779, + "step": 3390 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.18736572265625, + "learning_rate": 4e-05, + "loss": 4.9765, + "loss/crossentropy": 1.881778173148632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19996779970824718, + "step": 3392 + }, + { + "epoch": 0.2828333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.18381754557291666, + "learning_rate": 4e-05, + "loss": 5.0977, + "loss/crossentropy": 1.3992729112505913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14480482786893845, + "step": 3394 + }, + { + "epoch": 0.283, + "grad_norm": 4.96875, + "grad_norm_var": 0.18495686848958334, + "learning_rate": 4e-05, + "loss": 4.6631, + "loss/crossentropy": 2.6132925748825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21732110902667046, + "step": 3396 + }, + { + "epoch": 0.2831666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.19269205729166666, + "learning_rate": 4e-05, + "loss": 4.7744, + "loss/crossentropy": 1.9939734041690826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19406055100262165, + "step": 3398 + }, + { + "epoch": 0.2833333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.16276041666666666, + "learning_rate": 4e-05, + "loss": 4.9806, + "loss/crossentropy": 2.3022571206092834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050349861383438, + "step": 3400 + }, + { + "epoch": 0.2835, + "grad_norm": 4.75, + "grad_norm_var": 0.15948893229166666, + "learning_rate": 4e-05, + "loss": 4.7427, + "loss/crossentropy": 1.8530340567231178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16540405713021755, + "step": 3402 + }, + { + "epoch": 0.2836666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.15429280598958334, + "learning_rate": 4e-05, + "loss": 4.7811, + "loss/crossentropy": 2.265649139881134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2381511926651001, + "step": 3404 + }, + { + "epoch": 0.2838333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.04602864583333333, + "learning_rate": 4e-05, + "loss": 5.1025, + "loss/crossentropy": 2.4468571543693542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114931084215641, + "step": 3406 + }, + { + "epoch": 0.284, + "grad_norm": 4.59375, + "grad_norm_var": 0.05552978515625, + "learning_rate": 4e-05, + "loss": 4.8916, + "loss/crossentropy": 1.9908486604690552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20576748996973038, + "step": 3408 + }, + { + "epoch": 0.2841666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 4e-05, + "loss": 4.3453, + "loss/crossentropy": 1.8514483124017715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18919625878334045, + "step": 3410 + }, + { + "epoch": 0.2843333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.056494140625, + "learning_rate": 4e-05, + "loss": 5.3242, + "loss/crossentropy": 1.6181135475635529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1568435113877058, + "step": 3412 + }, + { + "epoch": 0.2845, + "grad_norm": 4.78125, + "grad_norm_var": 0.06448160807291667, + "learning_rate": 4e-05, + "loss": 4.917, + "loss/crossentropy": 1.7242164313793182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557476624846458, + "step": 3414 + }, + { + "epoch": 0.2846666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.06378580729166666, + "learning_rate": 4e-05, + "loss": 4.9142, + "loss/crossentropy": 2.7115097641944885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2387315072119236, + "step": 3416 + }, + { + "epoch": 0.2848333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.06672770182291667, + "learning_rate": 4e-05, + "loss": 4.5766, + "loss/crossentropy": 1.780228778719902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914806980639696, + "step": 3418 + }, + { + "epoch": 0.285, + "grad_norm": 4.78125, + "grad_norm_var": 0.06864827473958333, + "learning_rate": 4e-05, + "loss": 4.7589, + "loss/crossentropy": 1.842021107673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18138272687792778, + "step": 3420 + }, + { + "epoch": 0.2851666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.05806884765625, + "learning_rate": 4e-05, + "loss": 5.3465, + "loss/crossentropy": 2.4019596874713898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22154908254742622, + "step": 3422 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.7114420572916667, + "learning_rate": 4e-05, + "loss": 4.7384, + "loss/crossentropy": 2.0549621507525444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19369005970656872, + "step": 3424 + }, + { + "epoch": 0.2855, + "grad_norm": 5.65625, + "grad_norm_var": 0.6972615559895833, + "learning_rate": 4e-05, + "loss": 4.7996, + "loss/crossentropy": 2.2193926870822906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2191501259803772, + "step": 3426 + }, + { + "epoch": 0.2856666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.7040974934895833, + "learning_rate": 4e-05, + "loss": 5.373, + "loss/crossentropy": 2.4476476907730103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305923536419868, + "step": 3428 + }, + { + "epoch": 0.28583333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.73140869140625, + "learning_rate": 4e-05, + "loss": 4.2742, + "loss/crossentropy": 2.048733487725258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17609414085745811, + "step": 3430 + }, + { + "epoch": 0.286, + "grad_norm": 4.78125, + "grad_norm_var": 0.733203125, + "learning_rate": 4e-05, + "loss": 4.9869, + "loss/crossentropy": 2.569149136543274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2256152704358101, + "step": 3432 + }, + { + "epoch": 0.2861666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.72750244140625, + "learning_rate": 4e-05, + "loss": 5.0844, + "loss/crossentropy": 1.9300435483455658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18370277993381023, + "step": 3434 + }, + { + "epoch": 0.28633333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.7176920572916666, + "learning_rate": 4e-05, + "loss": 5.0577, + "loss/crossentropy": 2.043847441673279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19139155372977257, + "step": 3436 + }, + { + "epoch": 0.2865, + "grad_norm": 5.15625, + "grad_norm_var": 0.7329264322916667, + "learning_rate": 4e-05, + "loss": 4.9035, + "loss/crossentropy": 1.8298492655158043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17233120650053024, + "step": 3438 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.11510416666666666, + "learning_rate": 4e-05, + "loss": 4.9975, + "loss/crossentropy": 2.417716324329376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20786355063319206, + "step": 3440 + }, + { + "epoch": 0.28683333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.08084309895833333, + "learning_rate": 4e-05, + "loss": 4.853, + "loss/crossentropy": 1.0041667819023132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14175090938806534, + "step": 3442 + }, + { + "epoch": 0.287, + "grad_norm": 4.9375, + "grad_norm_var": 0.09130452473958334, + "learning_rate": 4e-05, + "loss": 4.6887, + "loss/crossentropy": 2.447651743888855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20354222133755684, + "step": 3444 + }, + { + "epoch": 0.2871666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.09529622395833333, + "learning_rate": 4e-05, + "loss": 4.4465, + "loss/crossentropy": 1.1983712315559387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18250321969389915, + "step": 3446 + }, + { + "epoch": 0.28733333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.12408854166666666, + "learning_rate": 4e-05, + "loss": 5.1078, + "loss/crossentropy": 2.3835307359695435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20711689442396164, + "step": 3448 + }, + { + "epoch": 0.2875, + "grad_norm": 4.6875, + "grad_norm_var": 0.13255208333333332, + "learning_rate": 4e-05, + "loss": 4.579, + "loss/crossentropy": 1.925173059105873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21384599804878235, + "step": 3450 + }, + { + "epoch": 0.2876666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.15198160807291666, + "learning_rate": 4e-05, + "loss": 4.6881, + "loss/crossentropy": 2.016170620918274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19685513153672218, + "step": 3452 + }, + { + "epoch": 0.28783333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.10729166666666666, + "learning_rate": 4e-05, + "loss": 4.8844, + "loss/crossentropy": 1.8985230028629303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102028012275696, + "step": 3454 + }, + { + "epoch": 0.288, + "grad_norm": 5.15625, + "grad_norm_var": 0.10963134765625, + "learning_rate": 4e-05, + "loss": 5.2529, + "loss/crossentropy": 2.064757615327835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18660160526633263, + "step": 3456 + }, + { + "epoch": 0.2881666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.11500244140625, + "learning_rate": 4e-05, + "loss": 4.5073, + "loss/crossentropy": 2.4254125356674194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18952980637550354, + "step": 3458 + }, + { + "epoch": 0.28833333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.10279947916666667, + "learning_rate": 4e-05, + "loss": 4.7031, + "loss/crossentropy": 2.1310142278671265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081185169517994, + "step": 3460 + }, + { + "epoch": 0.2885, + "grad_norm": 4.875, + "grad_norm_var": 0.09498291015625, + "learning_rate": 4e-05, + "loss": 5.0103, + "loss/crossentropy": 1.6102216243743896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1663418561220169, + "step": 3462 + }, + { + "epoch": 0.2886666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.21265869140625, + "learning_rate": 4e-05, + "loss": 5.3356, + "loss/crossentropy": 2.331478774547577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20860225334763527, + "step": 3464 + }, + { + "epoch": 0.28883333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.20702718098958334, + "learning_rate": 4e-05, + "loss": 4.9703, + "loss/crossentropy": 1.7024380043148994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17736496776342392, + "step": 3466 + }, + { + "epoch": 0.289, + "grad_norm": 4.75, + "grad_norm_var": 0.1900390625, + "learning_rate": 4e-05, + "loss": 4.9114, + "loss/crossentropy": 2.388719826936722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955118253827095, + "step": 3468 + }, + { + "epoch": 0.2891666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.20167643229166668, + "learning_rate": 4e-05, + "loss": 4.9545, + "loss/crossentropy": 2.093321107327938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18176774308085442, + "step": 3470 + }, + { + "epoch": 0.28933333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.20956624348958333, + "learning_rate": 4e-05, + "loss": 4.7973, + "loss/crossentropy": 2.4865227341651917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278795726597309, + "step": 3472 + }, + { + "epoch": 0.2895, + "grad_norm": 4.78125, + "grad_norm_var": 0.21008707682291666, + "learning_rate": 4e-05, + "loss": 5.1973, + "loss/crossentropy": 1.8062629103660583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775445736944675, + "step": 3474 + }, + { + "epoch": 0.2896666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.213525390625, + "learning_rate": 4e-05, + "loss": 4.5915, + "loss/crossentropy": 2.489815413951874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21216319501399994, + "step": 3476 + }, + { + "epoch": 0.28983333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.20618082682291666, + "learning_rate": 4e-05, + "loss": 4.7876, + "loss/crossentropy": 2.5675423741340637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23500743508338928, + "step": 3478 + }, + { + "epoch": 0.29, + "grad_norm": 5.1875, + "grad_norm_var": 0.04607747395833333, + "learning_rate": 4e-05, + "loss": 5.3603, + "loss/crossentropy": 2.3670946955680847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199428491294384, + "step": 3480 + }, + { + "epoch": 0.2901666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.03524983723958333, + "learning_rate": 4e-05, + "loss": 4.9564, + "loss/crossentropy": 1.8418959975242615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22825615853071213, + "step": 3482 + }, + { + "epoch": 0.29033333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.036051432291666664, + "learning_rate": 4e-05, + "loss": 4.4244, + "loss/crossentropy": 1.8724389523267746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19478942267596722, + "step": 3484 + }, + { + "epoch": 0.2905, + "grad_norm": 5.09375, + "grad_norm_var": 0.04407145182291667, + "learning_rate": 4e-05, + "loss": 4.4165, + "loss/crossentropy": 2.111702561378479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20953534543514252, + "step": 3486 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.711181640625, + "learning_rate": 4e-05, + "loss": 4.3107, + "loss/crossentropy": 2.3148096799850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190384529531002, + "step": 3488 + }, + { + "epoch": 0.29083333333333333, + "grad_norm": 7.6875, + "grad_norm_var": 1.09732666015625, + "learning_rate": 4e-05, + "loss": 4.9802, + "loss/crossentropy": 1.6215331330895424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20154114998877048, + "step": 3490 + }, + { + "epoch": 0.291, + "grad_norm": 5.0625, + "grad_norm_var": 1.06236572265625, + "learning_rate": 4e-05, + "loss": 4.7553, + "loss/crossentropy": 1.8760625272989273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19243442453444004, + "step": 3492 + }, + { + "epoch": 0.2911666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 1.0556925455729167, + "learning_rate": 4e-05, + "loss": 5.3884, + "loss/crossentropy": 2.2172908782958984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23113702610135078, + "step": 3494 + }, + { + "epoch": 0.29133333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 1.0721964518229166, + "learning_rate": 4e-05, + "loss": 5.1201, + "loss/crossentropy": 2.3262163400650024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330826565623283, + "step": 3496 + }, + { + "epoch": 0.2915, + "grad_norm": 4.53125, + "grad_norm_var": 1.1188639322916667, + "learning_rate": 4e-05, + "loss": 4.4723, + "loss/crossentropy": 2.01950091868639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16761692985892296, + "step": 3498 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 1.0830037434895834, + "learning_rate": 4e-05, + "loss": 4.668, + "loss/crossentropy": 1.7684204503893852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895194798707962, + "step": 3500 + }, + { + "epoch": 0.29183333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 1.0822224934895834, + "learning_rate": 4e-05, + "loss": 4.8008, + "loss/crossentropy": 2.3736203610897064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22592481598258018, + "step": 3502 + }, + { + "epoch": 0.292, + "grad_norm": 4.96875, + "grad_norm_var": 0.6147135416666667, + "learning_rate": 4e-05, + "loss": 5.3605, + "loss/crossentropy": 2.3942826986312866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2380472868680954, + "step": 3504 + }, + { + "epoch": 0.2921666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.19387613932291667, + "learning_rate": 4e-05, + "loss": 4.4185, + "loss/crossentropy": 2.3212435841560364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279332622885704, + "step": 3506 + }, + { + "epoch": 0.29233333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.20012613932291667, + "learning_rate": 4e-05, + "loss": 5.1964, + "loss/crossentropy": 2.09537735581398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19826416298747063, + "step": 3508 + }, + { + "epoch": 0.2925, + "grad_norm": 4.75, + "grad_norm_var": 0.211572265625, + "learning_rate": 4e-05, + "loss": 5.1824, + "loss/crossentropy": 2.1464912593364716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115538753569126, + "step": 3510 + }, + { + "epoch": 0.2926666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.22095947265625, + "learning_rate": 4e-05, + "loss": 4.8959, + "loss/crossentropy": 0.9554274380207062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12400326877832413, + "step": 3512 + }, + { + "epoch": 0.29283333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.20963541666666666, + "learning_rate": 4e-05, + "loss": 4.5966, + "loss/crossentropy": 1.952194757759571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851113010197878, + "step": 3514 + }, + { + "epoch": 0.293, + "grad_norm": 4.75, + "grad_norm_var": 0.20462239583333333, + "learning_rate": 4e-05, + "loss": 4.9804, + "loss/crossentropy": 2.3729577362537384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21630284935235977, + "step": 3516 + }, + { + "epoch": 0.2931666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.19927978515625, + "learning_rate": 4e-05, + "loss": 4.7558, + "loss/crossentropy": 2.4642003178596497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22600191831588745, + "step": 3518 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 6.4375, + "grad_norm_var": 0.20206705729166666, + "learning_rate": 4e-05, + "loss": 4.6977, + "loss/crossentropy": 1.8401424586772919, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26591188833117485, + "step": 3520 + }, + { + "epoch": 0.2935, + "grad_norm": 5.1875, + "grad_norm_var": 0.19804280598958332, + "learning_rate": 4e-05, + "loss": 4.8546, + "loss/crossentropy": 1.9076418355107307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19173485785722733, + "step": 3522 + }, + { + "epoch": 0.2936666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.20474853515625, + "learning_rate": 4e-05, + "loss": 5.6459, + "loss/crossentropy": 2.5764644145965576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692033484578133, + "step": 3524 + }, + { + "epoch": 0.29383333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.191015625, + "learning_rate": 4e-05, + "loss": 5.1832, + "loss/crossentropy": 2.5007553696632385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23094122856855392, + "step": 3526 + }, + { + "epoch": 0.294, + "grad_norm": 5.03125, + "grad_norm_var": 0.1990234375, + "learning_rate": 4e-05, + "loss": 4.7084, + "loss/crossentropy": 1.4607620611786842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1543761007487774, + "step": 3528 + }, + { + "epoch": 0.2941666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.20777587890625, + "learning_rate": 4e-05, + "loss": 4.6505, + "loss/crossentropy": 2.0909395068883896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16600767709314823, + "step": 3530 + }, + { + "epoch": 0.29433333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.20909830729166667, + "learning_rate": 4e-05, + "loss": 4.6819, + "loss/crossentropy": 0.9055970907211304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1309139858931303, + "step": 3532 + }, + { + "epoch": 0.2945, + "grad_norm": 4.59375, + "grad_norm_var": 0.23097330729166668, + "learning_rate": 4e-05, + "loss": 4.5222, + "loss/crossentropy": 1.9780186116695404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18702267482876778, + "step": 3534 + }, + { + "epoch": 0.2946666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.08114827473958333, + "learning_rate": 4e-05, + "loss": 4.4649, + "loss/crossentropy": 0.9073627293109894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10845192894339561, + "step": 3536 + }, + { + "epoch": 0.29483333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.10050455729166667, + "learning_rate": 4e-05, + "loss": 4.7735, + "loss/crossentropy": 1.6618280410766602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16298267990350723, + "step": 3538 + }, + { + "epoch": 0.295, + "grad_norm": 5.15625, + "grad_norm_var": 0.0806640625, + "learning_rate": 4e-05, + "loss": 5.0086, + "loss/crossentropy": 1.9391166269779205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18532704934477806, + "step": 3540 + }, + { + "epoch": 0.2951666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.09185791015625, + "learning_rate": 4e-05, + "loss": 5.1276, + "loss/crossentropy": 2.1296051144599915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987944282591343, + "step": 3542 + }, + { + "epoch": 0.29533333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.09579671223958333, + "learning_rate": 4e-05, + "loss": 4.5496, + "loss/crossentropy": 1.3152644261717796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15342802554368973, + "step": 3544 + }, + { + "epoch": 0.2955, + "grad_norm": 4.71875, + "grad_norm_var": 0.092041015625, + "learning_rate": 4e-05, + "loss": 4.6793, + "loss/crossentropy": 2.123941093683243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20897839963436127, + "step": 3546 + }, + { + "epoch": 0.2956666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.08857014973958334, + "learning_rate": 4e-05, + "loss": 4.8814, + "loss/crossentropy": 2.487266719341278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22740302234888077, + "step": 3548 + }, + { + "epoch": 0.29583333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.07704671223958333, + "learning_rate": 4e-05, + "loss": 5.4725, + "loss/crossentropy": 2.515300452709198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22232673317193985, + "step": 3550 + }, + { + "epoch": 0.296, + "grad_norm": 5.0, + "grad_norm_var": 0.07159830729166666, + "learning_rate": 4e-05, + "loss": 5.225, + "loss/crossentropy": 2.5012297928333282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24186847358942032, + "step": 3552 + }, + { + "epoch": 0.2961666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.055403645833333334, + "learning_rate": 4e-05, + "loss": 4.4848, + "loss/crossentropy": 1.829747200012207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132105603814125, + "step": 3554 + }, + { + "epoch": 0.29633333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.05221354166666667, + "learning_rate": 4e-05, + "loss": 4.7871, + "loss/crossentropy": 2.078303784132004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992538534104824, + "step": 3556 + }, + { + "epoch": 0.2965, + "grad_norm": 4.6875, + "grad_norm_var": 0.05115559895833333, + "learning_rate": 4e-05, + "loss": 4.1948, + "loss/crossentropy": 1.9181992933154106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17587333172559738, + "step": 3558 + }, + { + "epoch": 0.2966666666666667, + "grad_norm": 5.46875, + "grad_norm_var": 0.05836181640625, + "learning_rate": 4e-05, + "loss": 5.1335, + "loss/crossentropy": 2.469767451286316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22005610167980194, + "step": 3560 + }, + { + "epoch": 0.29683333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 4e-05, + "loss": 5.0746, + "loss/crossentropy": 1.754760041832924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1840778887271881, + "step": 3562 + }, + { + "epoch": 0.297, + "grad_norm": 4.875, + "grad_norm_var": 0.06282552083333333, + "learning_rate": 4e-05, + "loss": 4.6255, + "loss/crossentropy": 1.6421096697449684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16191892698407173, + "step": 3564 + }, + { + "epoch": 0.2971666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.05679931640625, + "learning_rate": 4e-05, + "loss": 4.8003, + "loss/crossentropy": 1.5874068588018417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16946525312960148, + "step": 3566 + }, + { + "epoch": 0.29733333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.06601155598958333, + "learning_rate": 4e-05, + "loss": 4.9971, + "loss/crossentropy": 2.3962987661361694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20658328384160995, + "step": 3568 + }, + { + "epoch": 0.2975, + "grad_norm": 4.875, + "grad_norm_var": 0.06623942057291667, + "learning_rate": 4e-05, + "loss": 5.5287, + "loss/crossentropy": 2.5732553601264954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334974780678749, + "step": 3570 + }, + { + "epoch": 0.2976666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.06647135416666666, + "learning_rate": 4e-05, + "loss": 5.0897, + "loss/crossentropy": 2.3147547245025635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20426007360219955, + "step": 3572 + }, + { + "epoch": 0.29783333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.07278645833333333, + "learning_rate": 4e-05, + "loss": 4.5379, + "loss/crossentropy": 1.3760528713464737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13978317752480507, + "step": 3574 + }, + { + "epoch": 0.298, + "grad_norm": 5.8125, + "grad_norm_var": 0.27421468098958335, + "learning_rate": 4e-05, + "loss": 5.0086, + "loss/crossentropy": 1.6505027040839195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21621382609009743, + "step": 3576 + }, + { + "epoch": 0.2981666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.27554931640625, + "learning_rate": 4e-05, + "loss": 5.0922, + "loss/crossentropy": 1.40341117978096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16107412800192833, + "step": 3578 + }, + { + "epoch": 0.29833333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.279931640625, + "learning_rate": 4e-05, + "loss": 4.7093, + "loss/crossentropy": 1.147512048482895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15361671149730682, + "step": 3580 + }, + { + "epoch": 0.2985, + "grad_norm": 4.75, + "grad_norm_var": 0.28746337890625, + "learning_rate": 4e-05, + "loss": 5.0159, + "loss/crossentropy": 1.4668299034237862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17344230972230434, + "step": 3582 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.2789713541666667, + "learning_rate": 4e-05, + "loss": 4.8091, + "loss/crossentropy": 1.2716411352157593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1398077104240656, + "step": 3584 + }, + { + "epoch": 0.29883333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.2873331705729167, + "learning_rate": 4e-05, + "loss": 4.7028, + "loss/crossentropy": 2.2946461737155914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183996111154556, + "step": 3586 + }, + { + "epoch": 0.299, + "grad_norm": 4.90625, + "grad_norm_var": 0.291650390625, + "learning_rate": 4e-05, + "loss": 5.2464, + "loss/crossentropy": 1.7846654728055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16545471921563148, + "step": 3588 + }, + { + "epoch": 0.2991666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.26832275390625, + "learning_rate": 4e-05, + "loss": 4.6774, + "loss/crossentropy": 1.7224418818950653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17625931091606617, + "step": 3590 + }, + { + "epoch": 0.29933333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.05462239583333333, + "learning_rate": 4e-05, + "loss": 5.2031, + "loss/crossentropy": 2.0435468032956123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544995225965977, + "step": 3592 + }, + { + "epoch": 0.2995, + "grad_norm": 4.78125, + "grad_norm_var": 0.055924479166666666, + "learning_rate": 4e-05, + "loss": 5.073, + "loss/crossentropy": 2.0027381628751755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18559293076395988, + "step": 3594 + }, + { + "epoch": 0.2996666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 1.8641560872395833, + "learning_rate": 4e-05, + "loss": 4.5997, + "loss/crossentropy": 1.6425791680812836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16363847069442272, + "step": 3596 + }, + { + "epoch": 0.29983333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 1.8575480143229166, + "learning_rate": 4e-05, + "loss": 4.984, + "loss/crossentropy": 2.020586669445038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21033752337098122, + "step": 3598 + }, + { + "epoch": 0.3, + "grad_norm": 5.34375, + "grad_norm_var": 1.85592041015625, + "learning_rate": 4e-05, + "loss": 5.0516, + "loss/crossentropy": 2.270563930273056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22760266438126564, + "step": 3600 + }, + { + "epoch": 0.3001666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 1.8462076822916667, + "learning_rate": 4e-05, + "loss": 5.1508, + "loss/crossentropy": 2.489174962043762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22328739613294601, + "step": 3602 + }, + { + "epoch": 0.30033333333333334, + "grad_norm": 5.0, + "grad_norm_var": 1.8499959309895833, + "learning_rate": 4e-05, + "loss": 4.8976, + "loss/crossentropy": 1.9335657581686974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18224405869841576, + "step": 3604 + }, + { + "epoch": 0.3005, + "grad_norm": 5.0, + "grad_norm_var": 1.8414021809895833, + "learning_rate": 4e-05, + "loss": 5.4268, + "loss/crossentropy": 1.7507117837667465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16100936010479927, + "step": 3606 + }, + { + "epoch": 0.3006666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 1.9059244791666667, + "learning_rate": 4e-05, + "loss": 4.8097, + "loss/crossentropy": 2.112537205219269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001986764371395, + "step": 3608 + }, + { + "epoch": 0.30083333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 1.8837076822916667, + "learning_rate": 4e-05, + "loss": 5.5574, + "loss/crossentropy": 2.74730384349823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495119497179985, + "step": 3610 + }, + { + "epoch": 0.301, + "grad_norm": 4.96875, + "grad_norm_var": 0.12821858723958332, + "learning_rate": 4e-05, + "loss": 5.0715, + "loss/crossentropy": 2.563708484172821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22458581998944283, + "step": 3612 + }, + { + "epoch": 0.3011666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.0716796875, + "learning_rate": 4e-05, + "loss": 4.237, + "loss/crossentropy": 2.0116556510329247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19622116163372993, + "step": 3614 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.05546875, + "learning_rate": 4e-05, + "loss": 4.3814, + "loss/crossentropy": 2.005406975746155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17633453384041786, + "step": 3616 + }, + { + "epoch": 0.3015, + "grad_norm": 4.65625, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 4e-05, + "loss": 4.6612, + "loss/crossentropy": 1.9902734458446503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19928394444286823, + "step": 3618 + }, + { + "epoch": 0.3016666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.04230143229166667, + "learning_rate": 4e-05, + "loss": 4.388, + "loss/crossentropy": 1.580359660089016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15102678537368774, + "step": 3620 + }, + { + "epoch": 0.30183333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.04609375, + "learning_rate": 4e-05, + "loss": 4.5381, + "loss/crossentropy": 0.9725519716739655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12142913416028023, + "step": 3622 + }, + { + "epoch": 0.302, + "grad_norm": 5.5, + "grad_norm_var": 0.054671223958333334, + "learning_rate": 4e-05, + "loss": 5.0478, + "loss/crossentropy": 2.1247295141220093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23397767543792725, + "step": 3624 + }, + { + "epoch": 0.30216666666666664, + "grad_norm": 4.53125, + "grad_norm_var": 0.06005452473958333, + "learning_rate": 4e-05, + "loss": 4.3377, + "loss/crossentropy": 0.9640841111540794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19951742701232433, + "step": 3626 + }, + { + "epoch": 0.30233333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.06534830729166667, + "learning_rate": 4e-05, + "loss": 4.5658, + "loss/crossentropy": 2.2950040102005005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21112338826060295, + "step": 3628 + }, + { + "epoch": 0.3025, + "grad_norm": 5.125, + "grad_norm_var": 0.08375244140625, + "learning_rate": 4e-05, + "loss": 5.3449, + "loss/crossentropy": 2.4528151154518127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23017141222953796, + "step": 3630 + }, + { + "epoch": 0.30266666666666664, + "grad_norm": 5.0625, + "grad_norm_var": 0.08409830729166666, + "learning_rate": 4e-05, + "loss": 5.032, + "loss/crossentropy": 1.951070874929428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18713463097810745, + "step": 3632 + }, + { + "epoch": 0.30283333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.07734375, + "learning_rate": 4e-05, + "loss": 4.9745, + "loss/crossentropy": 1.9134965389966965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16628370434045792, + "step": 3634 + }, + { + "epoch": 0.303, + "grad_norm": 4.90625, + "grad_norm_var": 0.07965087890625, + "learning_rate": 4e-05, + "loss": 4.6723, + "loss/crossentropy": 2.3537526428699493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268533818423748, + "step": 3636 + }, + { + "epoch": 0.30316666666666664, + "grad_norm": 4.96875, + "grad_norm_var": 0.06760660807291667, + "learning_rate": 4e-05, + "loss": 5.6514, + "loss/crossentropy": 2.3131695985794067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19909925386309624, + "step": 3638 + }, + { + "epoch": 0.30333333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.058329264322916664, + "learning_rate": 4e-05, + "loss": 4.8647, + "loss/crossentropy": 2.0087155923247337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793438270688057, + "step": 3640 + }, + { + "epoch": 0.3035, + "grad_norm": 5.375, + "grad_norm_var": 0.05432535807291667, + "learning_rate": 4e-05, + "loss": 4.9107, + "loss/crossentropy": 2.076196014881134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22751999273896217, + "step": 3642 + }, + { + "epoch": 0.30366666666666664, + "grad_norm": 5.0, + "grad_norm_var": 0.047526041666666664, + "learning_rate": 4e-05, + "loss": 5.6599, + "loss/crossentropy": 2.399946093559265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21956488490104675, + "step": 3644 + }, + { + "epoch": 0.30383333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.03313395182291667, + "learning_rate": 4e-05, + "loss": 4.3474, + "loss/crossentropy": 1.9796275794506073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108654350042343, + "step": 3646 + }, + { + "epoch": 0.304, + "grad_norm": 4.6875, + "grad_norm_var": 0.03508707682291667, + "learning_rate": 4e-05, + "loss": 4.3212, + "loss/crossentropy": 1.328979179263115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1541975736618042, + "step": 3648 + }, + { + "epoch": 0.30416666666666664, + "grad_norm": 4.34375, + "grad_norm_var": 0.07428385416666666, + "learning_rate": 4e-05, + "loss": 4.2876, + "loss/crossentropy": 2.024112194776535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18114975281059742, + "step": 3650 + }, + { + "epoch": 0.30433333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.07897135416666666, + "learning_rate": 4e-05, + "loss": 4.8026, + "loss/crossentropy": 2.082690417766571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21567688882350922, + "step": 3652 + }, + { + "epoch": 0.3045, + "grad_norm": 4.625, + "grad_norm_var": 0.09465738932291666, + "learning_rate": 4e-05, + "loss": 5.3316, + "loss/crossentropy": 2.600900650024414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172529399394989, + "step": 3654 + }, + { + "epoch": 0.30466666666666664, + "grad_norm": 4.71875, + "grad_norm_var": 0.09465738932291666, + "learning_rate": 4e-05, + "loss": 5.3408, + "loss/crossentropy": 2.4749475717544556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20883005484938622, + "step": 3656 + }, + { + "epoch": 0.30483333333333335, + "grad_norm": 5.65625, + "grad_norm_var": 0.11627197265625, + "learning_rate": 4e-05, + "loss": 4.8844, + "loss/crossentropy": 2.0045883879065514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1844545528292656, + "step": 3658 + }, + { + "epoch": 0.305, + "grad_norm": 5.375, + "grad_norm_var": 0.128369140625, + "learning_rate": 4e-05, + "loss": 5.303, + "loss/crossentropy": 2.2967261970043182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20461099222302437, + "step": 3660 + }, + { + "epoch": 0.30516666666666664, + "grad_norm": 5.25, + "grad_norm_var": 0.13121337890625, + "learning_rate": 4e-05, + "loss": 5.1984, + "loss/crossentropy": 2.034844785928726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299644947052002, + "step": 3662 + }, + { + "epoch": 0.30533333333333335, + "grad_norm": 4.90625, + "grad_norm_var": 0.127197265625, + "learning_rate": 4e-05, + "loss": 4.2718, + "loss/crossentropy": 2.4984880089759827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178654558956623, + "step": 3664 + }, + { + "epoch": 0.3055, + "grad_norm": 4.90625, + "grad_norm_var": 0.08873697916666666, + "learning_rate": 4e-05, + "loss": 5.309, + "loss/crossentropy": 2.227113127708435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22134817764163017, + "step": 3666 + }, + { + "epoch": 0.30566666666666664, + "grad_norm": 5.5625, + "grad_norm_var": 0.09855143229166667, + "learning_rate": 4e-05, + "loss": 5.3596, + "loss/crossentropy": 2.1710298359394073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19393081590533257, + "step": 3668 + }, + { + "epoch": 0.30583333333333335, + "grad_norm": 5.25, + "grad_norm_var": 0.07805582682291666, + "learning_rate": 4e-05, + "loss": 4.8099, + "loss/crossentropy": 2.4346525073051453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2716682218015194, + "step": 3670 + }, + { + "epoch": 0.306, + "grad_norm": 4.90625, + "grad_norm_var": 0.07301025390625, + "learning_rate": 4e-05, + "loss": 4.8859, + "loss/crossentropy": 1.582870475947857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15237391367554665, + "step": 3672 + }, + { + "epoch": 0.30616666666666664, + "grad_norm": 4.59375, + "grad_norm_var": 0.06607666015625, + "learning_rate": 4e-05, + "loss": 5.2553, + "loss/crossentropy": 1.999775506556034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884305290877819, + "step": 3674 + }, + { + "epoch": 0.30633333333333335, + "grad_norm": 5.0, + "grad_norm_var": 0.05813802083333333, + "learning_rate": 4e-05, + "loss": 4.5563, + "loss/crossentropy": 1.2396316081285477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1338381376117468, + "step": 3676 + }, + { + "epoch": 0.3065, + "grad_norm": 5.0625, + "grad_norm_var": 0.052718098958333334, + "learning_rate": 4e-05, + "loss": 4.8746, + "loss/crossentropy": 2.200127214193344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212520282715559, + "step": 3678 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 4.53125, + "grad_norm_var": 0.07420247395833333, + "learning_rate": 4e-05, + "loss": 4.7778, + "loss/crossentropy": 2.107081711292267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995658278465271, + "step": 3680 + }, + { + "epoch": 0.30683333333333335, + "grad_norm": 5.6875, + "grad_norm_var": 0.11503499348958333, + "learning_rate": 4e-05, + "loss": 5.2662, + "loss/crossentropy": 2.5890790224075317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21765975654125214, + "step": 3682 + }, + { + "epoch": 0.307, + "grad_norm": 5.4375, + "grad_norm_var": 0.10989176432291667, + "learning_rate": 4e-05, + "loss": 4.6385, + "loss/crossentropy": 2.281311720609665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19055946916341782, + "step": 3684 + }, + { + "epoch": 0.30716666666666664, + "grad_norm": 4.96875, + "grad_norm_var": 0.10813802083333333, + "learning_rate": 4e-05, + "loss": 4.6168, + "loss/crossentropy": 2.4844754934310913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209065817296505, + "step": 3686 + }, + { + "epoch": 0.30733333333333335, + "grad_norm": 4.5, + "grad_norm_var": 0.12333577473958333, + "learning_rate": 4e-05, + "loss": 4.0221, + "loss/crossentropy": 1.6593739092350006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19019902870059013, + "step": 3688 + }, + { + "epoch": 0.3075, + "grad_norm": 4.90625, + "grad_norm_var": 0.13010660807291666, + "learning_rate": 4e-05, + "loss": 4.9422, + "loss/crossentropy": 2.1268528401851654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18465667217969894, + "step": 3690 + }, + { + "epoch": 0.30766666666666664, + "grad_norm": 4.84375, + "grad_norm_var": 0.131494140625, + "learning_rate": 4e-05, + "loss": 4.437, + "loss/crossentropy": 1.5557752773165703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.177669333294034, + "step": 3692 + }, + { + "epoch": 0.30783333333333335, + "grad_norm": 4.71875, + "grad_norm_var": 0.13357747395833333, + "learning_rate": 4e-05, + "loss": 4.3403, + "loss/crossentropy": 1.798197090625763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15706698969006538, + "step": 3694 + }, + { + "epoch": 0.308, + "grad_norm": 4.46875, + "grad_norm_var": 0.12081705729166667, + "learning_rate": 4e-05, + "loss": 4.6368, + "loss/crossentropy": 1.8836499452590942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17866197228431702, + "step": 3696 + }, + { + "epoch": 0.30816666666666664, + "grad_norm": 5.125, + "grad_norm_var": 0.083447265625, + "learning_rate": 4e-05, + "loss": 5.3486, + "loss/crossentropy": 2.1042481660842896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17958716675639153, + "step": 3698 + }, + { + "epoch": 0.30833333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.0623046875, + "learning_rate": 4e-05, + "loss": 5.0219, + "loss/crossentropy": 2.4937482476234436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2245134860277176, + "step": 3700 + }, + { + "epoch": 0.3085, + "grad_norm": 4.75, + "grad_norm_var": 0.06243082682291667, + "learning_rate": 4e-05, + "loss": 4.4926, + "loss/crossentropy": 1.953664094209671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048991098999977, + "step": 3702 + }, + { + "epoch": 0.30866666666666664, + "grad_norm": 5.125, + "grad_norm_var": 0.0521484375, + "learning_rate": 4e-05, + "loss": 5.2685, + "loss/crossentropy": 2.191207781434059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21109728887677193, + "step": 3704 + }, + { + "epoch": 0.30883333333333335, + "grad_norm": 4.90625, + "grad_norm_var": 0.03722330729166667, + "learning_rate": 4e-05, + "loss": 4.9706, + "loss/crossentropy": 2.088053673505783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20717396587133408, + "step": 3706 + }, + { + "epoch": 0.309, + "grad_norm": 4.90625, + "grad_norm_var": 0.038863118489583334, + "learning_rate": 4e-05, + "loss": 4.7178, + "loss/crossentropy": 1.7166788578033447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16185236908495426, + "step": 3708 + }, + { + "epoch": 0.30916666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.040999348958333334, + "learning_rate": 4e-05, + "loss": 5.0539, + "loss/crossentropy": 2.4472317695617676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22122575715184212, + "step": 3710 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 5.6875, + "grad_norm_var": 0.058333333333333334, + "learning_rate": 4e-05, + "loss": 5.3555, + "loss/crossentropy": 1.8674227595329285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979275792837143, + "step": 3712 + }, + { + "epoch": 0.3095, + "grad_norm": 4.78125, + "grad_norm_var": 0.062483723958333334, + "learning_rate": 4e-05, + "loss": 4.3914, + "loss/crossentropy": 2.116463601589203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22198039293289185, + "step": 3714 + }, + { + "epoch": 0.30966666666666665, + "grad_norm": 4.59375, + "grad_norm_var": 0.09490559895833334, + "learning_rate": 4e-05, + "loss": 4.6182, + "loss/crossentropy": 2.104882702231407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18206360936164856, + "step": 3716 + }, + { + "epoch": 0.30983333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.09440104166666667, + "learning_rate": 4e-05, + "loss": 4.3327, + "loss/crossentropy": 1.162502907216549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16418416053056717, + "step": 3718 + }, + { + "epoch": 0.31, + "grad_norm": 4.90625, + "grad_norm_var": 0.09283447265625, + "learning_rate": 4e-05, + "loss": 4.6641, + "loss/crossentropy": 1.5889663323760033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1913046780973673, + "step": 3720 + }, + { + "epoch": 0.31016666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.09950764973958333, + "learning_rate": 4e-05, + "loss": 4.6431, + "loss/crossentropy": 2.3483422100543976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22202644869685173, + "step": 3722 + }, + { + "epoch": 0.31033333333333335, + "grad_norm": 6.21875, + "grad_norm_var": 0.19620768229166666, + "learning_rate": 4e-05, + "loss": 5.4244, + "loss/crossentropy": 2.0444701313972473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22006652504205704, + "step": 3724 + }, + { + "epoch": 0.3105, + "grad_norm": 4.90625, + "grad_norm_var": 0.19073893229166666, + "learning_rate": 4e-05, + "loss": 4.9727, + "loss/crossentropy": 2.4517840147018433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100413180887699, + "step": 3726 + }, + { + "epoch": 0.31066666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.16324462890625, + "learning_rate": 4e-05, + "loss": 5.1654, + "loss/crossentropy": 1.9761425107717514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948082633316517, + "step": 3728 + }, + { + "epoch": 0.31083333333333335, + "grad_norm": 4.875, + "grad_norm_var": 0.153759765625, + "learning_rate": 4e-05, + "loss": 4.8851, + "loss/crossentropy": 1.5147030353546143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16606771387159824, + "step": 3730 + }, + { + "epoch": 0.311, + "grad_norm": 4.8125, + "grad_norm_var": 0.12649332682291667, + "learning_rate": 4e-05, + "loss": 4.8831, + "loss/crossentropy": 2.117761880159378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187788724899292, + "step": 3732 + }, + { + "epoch": 0.31116666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.12493082682291666, + "learning_rate": 4e-05, + "loss": 4.7365, + "loss/crossentropy": 2.5555994510650635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22206680104136467, + "step": 3734 + }, + { + "epoch": 0.31133333333333335, + "grad_norm": 4.71875, + "grad_norm_var": 0.14605712890625, + "learning_rate": 4e-05, + "loss": 5.1176, + "loss/crossentropy": 1.7837401628494263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18287081271409988, + "step": 3736 + }, + { + "epoch": 0.3115, + "grad_norm": 4.8125, + "grad_norm_var": 0.1443359375, + "learning_rate": 4e-05, + "loss": 5.3582, + "loss/crossentropy": 2.330405503511429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22329926490783691, + "step": 3738 + }, + { + "epoch": 0.31166666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.043212890625, + "learning_rate": 4e-05, + "loss": 4.8281, + "loss/crossentropy": 1.5541361793875694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18051035329699516, + "step": 3740 + }, + { + "epoch": 0.31183333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.057352701822916664, + "learning_rate": 4e-05, + "loss": 4.9007, + "loss/crossentropy": 2.17046582698822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19161805883049965, + "step": 3742 + }, + { + "epoch": 0.312, + "grad_norm": 5.21875, + "grad_norm_var": 0.0638671875, + "learning_rate": 4e-05, + "loss": 5.3251, + "loss/crossentropy": 2.254283905029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19508297741413116, + "step": 3744 + }, + { + "epoch": 0.31216666666666665, + "grad_norm": 4.71875, + "grad_norm_var": 0.06750895182291666, + "learning_rate": 4e-05, + "loss": 5.1207, + "loss/crossentropy": 2.116343930363655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21743535064160824, + "step": 3746 + }, + { + "epoch": 0.31233333333333335, + "grad_norm": 5.46875, + "grad_norm_var": 0.08072916666666667, + "learning_rate": 4e-05, + "loss": 4.6328, + "loss/crossentropy": 2.3838615715503693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325530484318733, + "step": 3748 + }, + { + "epoch": 0.3125, + "grad_norm": 5.03125, + "grad_norm_var": 0.08307291666666666, + "learning_rate": 4e-05, + "loss": 4.9696, + "loss/crossentropy": 1.8378008008003235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19234362617135048, + "step": 3750 + }, + { + "epoch": 0.31266666666666665, + "grad_norm": 5.0, + "grad_norm_var": 0.05933837890625, + "learning_rate": 4e-05, + "loss": 5.1705, + "loss/crossentropy": 2.1367595493793488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20700618252158165, + "step": 3752 + }, + { + "epoch": 0.31283333333333335, + "grad_norm": 5.15625, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 4e-05, + "loss": 4.9547, + "loss/crossentropy": 1.8602565303444862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17942636832594872, + "step": 3754 + }, + { + "epoch": 0.313, + "grad_norm": 4.90625, + "grad_norm_var": 0.06610921223958334, + "learning_rate": 4e-05, + "loss": 4.1205, + "loss/crossentropy": 1.9927891492843628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000282108783722, + "step": 3756 + }, + { + "epoch": 0.31316666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.048140462239583334, + "learning_rate": 4e-05, + "loss": 4.6056, + "loss/crossentropy": 2.0198487788438797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19490646198391914, + "step": 3758 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 4.78125, + "grad_norm_var": 0.050390625, + "learning_rate": 4e-05, + "loss": 5.2929, + "loss/crossentropy": 2.36793053150177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22241582348942757, + "step": 3760 + }, + { + "epoch": 0.3135, + "grad_norm": 4.71875, + "grad_norm_var": 0.06569010416666667, + "learning_rate": 4e-05, + "loss": 4.9795, + "loss/crossentropy": 2.4190186858177185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21430841460824013, + "step": 3762 + }, + { + "epoch": 0.31366666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.05250244140625, + "learning_rate": 4e-05, + "loss": 4.9606, + "loss/crossentropy": 1.79479618370533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916802916675806, + "step": 3764 + }, + { + "epoch": 0.31383333333333335, + "grad_norm": 5.21875, + "grad_norm_var": 0.07081705729166667, + "learning_rate": 4e-05, + "loss": 4.373, + "loss/crossentropy": 1.937475398182869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18721584975719452, + "step": 3766 + }, + { + "epoch": 0.314, + "grad_norm": 4.71875, + "grad_norm_var": 0.07121988932291666, + "learning_rate": 4e-05, + "loss": 4.286, + "loss/crossentropy": 1.4481577202677727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16050837188959122, + "step": 3768 + }, + { + "epoch": 0.31416666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.06432291666666666, + "learning_rate": 4e-05, + "loss": 4.9644, + "loss/crossentropy": 1.147791676223278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13392825238406658, + "step": 3770 + }, + { + "epoch": 0.31433333333333335, + "grad_norm": 4.9375, + "grad_norm_var": 0.06131184895833333, + "learning_rate": 4e-05, + "loss": 4.6804, + "loss/crossentropy": 1.4648746028542519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1747361645102501, + "step": 3772 + }, + { + "epoch": 0.3145, + "grad_norm": 4.78125, + "grad_norm_var": 0.0615234375, + "learning_rate": 4e-05, + "loss": 4.8572, + "loss/crossentropy": 2.5256667137145996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21758800372481346, + "step": 3774 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 4.96875, + "grad_norm_var": 0.05904541015625, + "learning_rate": 4e-05, + "loss": 5.6007, + "loss/crossentropy": 2.4053181409835815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21975573524832726, + "step": 3776 + }, + { + "epoch": 0.31483333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.03664957682291667, + "learning_rate": 4e-05, + "loss": 4.6974, + "loss/crossentropy": 1.6105652749538422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15871884860098362, + "step": 3778 + }, + { + "epoch": 0.315, + "grad_norm": 4.46875, + "grad_norm_var": 0.0544921875, + "learning_rate": 4e-05, + "loss": 4.3221, + "loss/crossentropy": 1.7338557913899422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18251017853617668, + "step": 3780 + }, + { + "epoch": 0.31516666666666665, + "grad_norm": 4.59375, + "grad_norm_var": 0.045947265625, + "learning_rate": 4e-05, + "loss": 4.4713, + "loss/crossentropy": 1.9897050261497498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19093642942607403, + "step": 3782 + }, + { + "epoch": 0.31533333333333335, + "grad_norm": 5.1875, + "grad_norm_var": 0.054427083333333334, + "learning_rate": 4e-05, + "loss": 4.9917, + "loss/crossentropy": 1.9837996065616608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19374994188547134, + "step": 3784 + }, + { + "epoch": 0.3155, + "grad_norm": 5.15625, + "grad_norm_var": 0.0572265625, + "learning_rate": 4e-05, + "loss": 4.8675, + "loss/crossentropy": 2.3804187774658203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300543636083603, + "step": 3786 + }, + { + "epoch": 0.31566666666666665, + "grad_norm": 5.0625, + "grad_norm_var": 0.05806884765625, + "learning_rate": 4e-05, + "loss": 4.8823, + "loss/crossentropy": 2.190940797328949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202349789440632, + "step": 3788 + }, + { + "epoch": 0.31583333333333335, + "grad_norm": 4.59375, + "grad_norm_var": 0.07258707682291667, + "learning_rate": 4e-05, + "loss": 4.7738, + "loss/crossentropy": 1.6915459632873535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17686060070991516, + "step": 3790 + }, + { + "epoch": 0.316, + "grad_norm": 5.53125, + "grad_norm_var": 0.10279947916666667, + "learning_rate": 4e-05, + "loss": 5.4494, + "loss/crossentropy": 1.9292872324585915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17759693413972855, + "step": 3792 + }, + { + "epoch": 0.31616666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.09698893229166666, + "learning_rate": 4e-05, + "loss": 5.0908, + "loss/crossentropy": 1.4473036751151085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1512235328555107, + "step": 3794 + }, + { + "epoch": 0.31633333333333336, + "grad_norm": 4.875, + "grad_norm_var": 0.07909749348958334, + "learning_rate": 4e-05, + "loss": 4.9736, + "loss/crossentropy": 2.1033048927783966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22066448256373405, + "step": 3796 + }, + { + "epoch": 0.3165, + "grad_norm": 4.9375, + "grad_norm_var": 0.08420817057291667, + "learning_rate": 4e-05, + "loss": 4.3264, + "loss/crossentropy": 0.7586923539638519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10199865326285362, + "step": 3798 + }, + { + "epoch": 0.31666666666666665, + "grad_norm": 4.4375, + "grad_norm_var": 0.10089518229166666, + "learning_rate": 4e-05, + "loss": 4.2479, + "loss/crossentropy": 1.5026521235704422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14725450798869133, + "step": 3800 + }, + { + "epoch": 0.31683333333333336, + "grad_norm": 4.90625, + "grad_norm_var": 0.10911458333333333, + "learning_rate": 4e-05, + "loss": 4.5031, + "loss/crossentropy": 1.4894457682967186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17442147992551327, + "step": 3802 + }, + { + "epoch": 0.317, + "grad_norm": 4.84375, + "grad_norm_var": 0.10983072916666667, + "learning_rate": 4e-05, + "loss": 4.185, + "loss/crossentropy": 1.578904926776886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1705641932785511, + "step": 3804 + }, + { + "epoch": 0.31716666666666665, + "grad_norm": 5.21875, + "grad_norm_var": 0.09895833333333333, + "learning_rate": 4e-05, + "loss": 4.8458, + "loss/crossentropy": 2.099468767642975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21197058260440826, + "step": 3806 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 4.6875, + "grad_norm_var": 0.97633056640625, + "learning_rate": 4e-05, + "loss": 4.7211, + "loss/crossentropy": 0.40370237082242966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.08282195776700974, + "step": 3808 + }, + { + "epoch": 0.3175, + "grad_norm": 5.03125, + "grad_norm_var": 0.9772745768229166, + "learning_rate": 4e-05, + "loss": 5.1344, + "loss/crossentropy": 1.8095313608646393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19387658312916756, + "step": 3810 + }, + { + "epoch": 0.31766666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.9825358072916667, + "learning_rate": 4e-05, + "loss": 4.6487, + "loss/crossentropy": 1.3583406507968903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14877759665250778, + "step": 3812 + }, + { + "epoch": 0.31783333333333336, + "grad_norm": 4.46875, + "grad_norm_var": 0.9834635416666667, + "learning_rate": 4e-05, + "loss": 4.9006, + "loss/crossentropy": 1.9160986170172691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18941473588347435, + "step": 3814 + }, + { + "epoch": 0.318, + "grad_norm": 4.75, + "grad_norm_var": 0.94478759765625, + "learning_rate": 4e-05, + "loss": 4.9874, + "loss/crossentropy": 2.3055627644062042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155849188566208, + "step": 3816 + }, + { + "epoch": 0.31816666666666665, + "grad_norm": 4.875, + "grad_norm_var": 0.9505045572916667, + "learning_rate": 4e-05, + "loss": 4.5018, + "loss/crossentropy": 1.952782303094864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18765784800052643, + "step": 3818 + }, + { + "epoch": 0.31833333333333336, + "grad_norm": 4.9375, + "grad_norm_var": 0.9403483072916666, + "learning_rate": 4e-05, + "loss": 5.0223, + "loss/crossentropy": 2.0353624671697617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889826748520136, + "step": 3820 + }, + { + "epoch": 0.3185, + "grad_norm": 4.90625, + "grad_norm_var": 0.9404581705729167, + "learning_rate": 4e-05, + "loss": 4.8417, + "loss/crossentropy": 2.015192322432995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17999350652098656, + "step": 3822 + }, + { + "epoch": 0.31866666666666665, + "grad_norm": 4.5625, + "grad_norm_var": 0.053999837239583334, + "learning_rate": 4e-05, + "loss": 4.6136, + "loss/crossentropy": 1.6971989944577217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18851319141685963, + "step": 3824 + }, + { + "epoch": 0.31883333333333336, + "grad_norm": 4.78125, + "grad_norm_var": 0.04108072916666667, + "learning_rate": 4e-05, + "loss": 5.0423, + "loss/crossentropy": 2.3695130348205566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2304910495877266, + "step": 3826 + }, + { + "epoch": 0.319, + "grad_norm": 4.84375, + "grad_norm_var": 0.043745930989583334, + "learning_rate": 4e-05, + "loss": 5.2882, + "loss/crossentropy": 2.449763298034668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22548651695251465, + "step": 3828 + }, + { + "epoch": 0.31916666666666665, + "grad_norm": 4.53125, + "grad_norm_var": 0.039286295572916664, + "learning_rate": 4e-05, + "loss": 4.4039, + "loss/crossentropy": 1.6869621872901917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16885101795196533, + "step": 3830 + }, + { + "epoch": 0.31933333333333336, + "grad_norm": 5.4375, + "grad_norm_var": 0.05597330729166667, + "learning_rate": 4e-05, + "loss": 5.409, + "loss/crossentropy": 2.49446177482605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949394211173058, + "step": 3832 + }, + { + "epoch": 0.3195, + "grad_norm": 4.96875, + "grad_norm_var": 0.05478108723958333, + "learning_rate": 4e-05, + "loss": 4.5315, + "loss/crossentropy": 2.66249018907547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23358352109789848, + "step": 3834 + }, + { + "epoch": 0.31966666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.05045166015625, + "learning_rate": 4e-05, + "loss": 4.7935, + "loss/crossentropy": 2.1069458723068237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21881438791751862, + "step": 3836 + }, + { + "epoch": 0.31983333333333336, + "grad_norm": 4.84375, + "grad_norm_var": 0.05243733723958333, + "learning_rate": 4e-05, + "loss": 5.1939, + "loss/crossentropy": 1.933380126953125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17562244832515717, + "step": 3838 + }, + { + "epoch": 0.32, + "grad_norm": 5.03125, + "grad_norm_var": 0.04023030598958333, + "learning_rate": 4e-05, + "loss": 5.0553, + "loss/crossentropy": 2.7020374536514282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23687465488910675, + "step": 3840 + }, + { + "epoch": 0.32016666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.264453125, + "learning_rate": 4e-05, + "loss": 4.9235, + "loss/crossentropy": 1.614741176366806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20109020546078682, + "step": 3842 + }, + { + "epoch": 0.32033333333333336, + "grad_norm": 5.0, + "grad_norm_var": 0.26038004557291666, + "learning_rate": 4e-05, + "loss": 5.7655, + "loss/crossentropy": 2.1124462485313416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2182532399892807, + "step": 3844 + }, + { + "epoch": 0.3205, + "grad_norm": 5.03125, + "grad_norm_var": 0.2306640625, + "learning_rate": 4e-05, + "loss": 5.5571, + "loss/crossentropy": 1.813891276717186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19078289158642292, + "step": 3846 + }, + { + "epoch": 0.32066666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.2259765625, + "learning_rate": 4e-05, + "loss": 5.1904, + "loss/crossentropy": 1.3256629407405853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13756003230810165, + "step": 3848 + }, + { + "epoch": 0.32083333333333336, + "grad_norm": 4.53125, + "grad_norm_var": 0.24933268229166666, + "learning_rate": 4e-05, + "loss": 4.5194, + "loss/crossentropy": 1.691509410738945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18102239537984133, + "step": 3850 + }, + { + "epoch": 0.321, + "grad_norm": 4.6875, + "grad_norm_var": 0.26067708333333334, + "learning_rate": 4e-05, + "loss": 4.5382, + "loss/crossentropy": 1.8079805970191956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17653886415064335, + "step": 3852 + }, + { + "epoch": 0.32116666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.278125, + "learning_rate": 4e-05, + "loss": 5.7217, + "loss/crossentropy": 1.882356882095337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1414974480867386, + "step": 3854 + }, + { + "epoch": 0.32133333333333336, + "grad_norm": 4.5, + "grad_norm_var": 0.31217041015625, + "learning_rate": 4e-05, + "loss": 4.301, + "loss/crossentropy": 1.9757508039474487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19711638614535332, + "step": 3856 + }, + { + "epoch": 0.3215, + "grad_norm": 5.0625, + "grad_norm_var": 0.10032552083333333, + "learning_rate": 4e-05, + "loss": 5.2709, + "loss/crossentropy": 1.9217640459537506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18658523634076118, + "step": 3858 + }, + { + "epoch": 0.32166666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.11614176432291666, + "learning_rate": 4e-05, + "loss": 4.824, + "loss/crossentropy": 1.439366839826107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1621258743107319, + "step": 3860 + }, + { + "epoch": 0.32183333333333336, + "grad_norm": 4.59375, + "grad_norm_var": 0.12642822265625, + "learning_rate": 4e-05, + "loss": 4.8631, + "loss/crossentropy": 1.9740833342075348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18762733228504658, + "step": 3862 + }, + { + "epoch": 0.322, + "grad_norm": 5.03125, + "grad_norm_var": 0.11793212890625, + "learning_rate": 4e-05, + "loss": 4.6297, + "loss/crossentropy": 1.8139654770493507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1765834055840969, + "step": 3864 + }, + { + "epoch": 0.32216666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.10741780598958334, + "learning_rate": 4e-05, + "loss": 4.9415, + "loss/crossentropy": 1.6795841604471207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075301818549633, + "step": 3866 + }, + { + "epoch": 0.32233333333333336, + "grad_norm": 5.15625, + "grad_norm_var": 0.10282796223958333, + "learning_rate": 4e-05, + "loss": 4.7924, + "loss/crossentropy": 0.9799469262361526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13794360868632793, + "step": 3868 + }, + { + "epoch": 0.3225, + "grad_norm": 5.53125, + "grad_norm_var": 0.08677978515625, + "learning_rate": 4e-05, + "loss": 5.0427, + "loss/crossentropy": 2.014233537018299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18055324628949165, + "step": 3870 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.09534098307291666, + "learning_rate": 4e-05, + "loss": 5.1675, + "loss/crossentropy": 2.118018291890621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19751184806227684, + "step": 3872 + }, + { + "epoch": 0.32283333333333336, + "grad_norm": 4.65625, + "grad_norm_var": 0.08857014973958334, + "learning_rate": 4e-05, + "loss": 4.8977, + "loss/crossentropy": 1.732959657907486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17621224001049995, + "step": 3874 + }, + { + "epoch": 0.323, + "grad_norm": 5.0625, + "grad_norm_var": 0.07437744140625, + "learning_rate": 4e-05, + "loss": 5.006, + "loss/crossentropy": 1.454321675002575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17282269150018692, + "step": 3876 + }, + { + "epoch": 0.32316666666666666, + "grad_norm": 5.5625, + "grad_norm_var": 0.0849609375, + "learning_rate": 4e-05, + "loss": 5.2047, + "loss/crossentropy": 2.027598097920418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19759362563490868, + "step": 3878 + }, + { + "epoch": 0.3233333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.0951171875, + "learning_rate": 4e-05, + "loss": 4.8963, + "loss/crossentropy": 1.6502454578876495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19640647992491722, + "step": 3880 + }, + { + "epoch": 0.3235, + "grad_norm": 4.34375, + "grad_norm_var": 0.12183837890625, + "learning_rate": 4e-05, + "loss": 4.6344, + "loss/crossentropy": 2.5610092878341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21330714598298073, + "step": 3882 + }, + { + "epoch": 0.32366666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.1318359375, + "learning_rate": 4e-05, + "loss": 4.9918, + "loss/crossentropy": 2.5567209720611572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21084987744688988, + "step": 3884 + }, + { + "epoch": 0.3238333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.12971598307291668, + "learning_rate": 4e-05, + "loss": 4.6148, + "loss/crossentropy": 2.139784097671509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20885953679680824, + "step": 3886 + }, + { + "epoch": 0.324, + "grad_norm": 5.625, + "grad_norm_var": 0.12698160807291667, + "learning_rate": 4e-05, + "loss": 5.1695, + "loss/crossentropy": 2.2444785833358765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216685228049755, + "step": 3888 + }, + { + "epoch": 0.32416666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.12121988932291666, + "learning_rate": 4e-05, + "loss": 5.1188, + "loss/crossentropy": 1.5884385108947754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17183389514684677, + "step": 3890 + }, + { + "epoch": 0.3243333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.12537434895833333, + "learning_rate": 4e-05, + "loss": 4.5442, + "loss/crossentropy": 1.5150625482201576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1651450339704752, + "step": 3892 + }, + { + "epoch": 0.3245, + "grad_norm": 4.8125, + "grad_norm_var": 0.105859375, + "learning_rate": 4e-05, + "loss": 4.973, + "loss/crossentropy": 2.0057149529457092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402217864990234, + "step": 3894 + }, + { + "epoch": 0.32466666666666666, + "grad_norm": 5.28125, + "grad_norm_var": 0.09908447265625, + "learning_rate": 4e-05, + "loss": 5.1988, + "loss/crossentropy": 1.953499749302864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18943873792886734, + "step": 3896 + }, + { + "epoch": 0.3248333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.070166015625, + "learning_rate": 4e-05, + "loss": 5.2872, + "loss/crossentropy": 1.3116377219557762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14449688233435154, + "step": 3898 + }, + { + "epoch": 0.325, + "grad_norm": 4.6875, + "grad_norm_var": 0.06925455729166667, + "learning_rate": 4e-05, + "loss": 5.0211, + "loss/crossentropy": 0.8303311765193939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13305531814694405, + "step": 3900 + }, + { + "epoch": 0.32516666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.05601806640625, + "learning_rate": 4e-05, + "loss": 4.5965, + "loss/crossentropy": 2.015589267015457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2238336279988289, + "step": 3902 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.04247639973958333, + "learning_rate": 4e-05, + "loss": 5.0168, + "loss/crossentropy": 2.3163425028324127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127598598599434, + "step": 3904 + }, + { + "epoch": 0.3255, + "grad_norm": 5.0625, + "grad_norm_var": 0.04058837890625, + "learning_rate": 4e-05, + "loss": 4.6785, + "loss/crossentropy": 2.3567277789115906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23480309918522835, + "step": 3906 + }, + { + "epoch": 0.32566666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.043863932291666664, + "learning_rate": 4e-05, + "loss": 4.682, + "loss/crossentropy": 2.10383278131485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18023706413805485, + "step": 3908 + }, + { + "epoch": 0.3258333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.046187337239583334, + "learning_rate": 4e-05, + "loss": 5.1307, + "loss/crossentropy": 2.568555533885956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189786396920681, + "step": 3910 + }, + { + "epoch": 0.326, + "grad_norm": 5.0, + "grad_norm_var": 0.03759358723958333, + "learning_rate": 4e-05, + "loss": 4.7643, + "loss/crossentropy": 1.2290566712617874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13838442414999008, + "step": 3912 + }, + { + "epoch": 0.32616666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.03323160807291667, + "learning_rate": 4e-05, + "loss": 4.6486, + "loss/crossentropy": 2.0212435722351074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013506405055523, + "step": 3914 + }, + { + "epoch": 0.3263333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.05370686848958333, + "learning_rate": 4e-05, + "loss": 4.7519, + "loss/crossentropy": 2.114032506942749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23080236837267876, + "step": 3916 + }, + { + "epoch": 0.3265, + "grad_norm": 4.84375, + "grad_norm_var": 0.057356770833333334, + "learning_rate": 4e-05, + "loss": 4.5704, + "loss/crossentropy": 1.7334122359752655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1655733287334442, + "step": 3918 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.09133707682291667, + "learning_rate": 4e-05, + "loss": 4.7757, + "loss/crossentropy": 1.6256769001483917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19204578548669815, + "step": 3920 + }, + { + "epoch": 0.3268333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.10172119140625, + "learning_rate": 4e-05, + "loss": 4.8596, + "loss/crossentropy": 1.5621510818600655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21445403434336185, + "step": 3922 + }, + { + "epoch": 0.327, + "grad_norm": 4.65625, + "grad_norm_var": 0.10689697265625, + "learning_rate": 4e-05, + "loss": 5.1977, + "loss/crossentropy": 2.47482892870903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19530130550265312, + "step": 3924 + }, + { + "epoch": 0.32716666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.10823160807291667, + "learning_rate": 4e-05, + "loss": 5.4857, + "loss/crossentropy": 2.1922404766082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200409695506096, + "step": 3926 + }, + { + "epoch": 0.3273333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.10944010416666666, + "learning_rate": 4e-05, + "loss": 4.9498, + "loss/crossentropy": 1.8819985389709473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18507347628474236, + "step": 3928 + }, + { + "epoch": 0.3275, + "grad_norm": 5.46875, + "grad_norm_var": 0.12170817057291666, + "learning_rate": 4e-05, + "loss": 5.1485, + "loss/crossentropy": 1.8625006452202797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19629210233688354, + "step": 3930 + }, + { + "epoch": 0.32766666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.10520833333333333, + "learning_rate": 4e-05, + "loss": 4.4317, + "loss/crossentropy": 1.701745517551899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17160597257316113, + "step": 3932 + }, + { + "epoch": 0.3278333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.106494140625, + "learning_rate": 4e-05, + "loss": 5.4612, + "loss/crossentropy": 2.235189765691757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071330025792122, + "step": 3934 + }, + { + "epoch": 0.328, + "grad_norm": 4.65625, + "grad_norm_var": 0.06888020833333333, + "learning_rate": 4e-05, + "loss": 4.5437, + "loss/crossentropy": 1.5824126675724983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1563483476638794, + "step": 3936 + }, + { + "epoch": 0.32816666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.07414957682291666, + "learning_rate": 4e-05, + "loss": 4.5375, + "loss/crossentropy": 1.6542961448431015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892082616686821, + "step": 3938 + }, + { + "epoch": 0.3283333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.06927083333333334, + "learning_rate": 4e-05, + "loss": 4.9733, + "loss/crossentropy": 1.3052943646907806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1394607052206993, + "step": 3940 + }, + { + "epoch": 0.3285, + "grad_norm": 5.03125, + "grad_norm_var": 0.06451822916666666, + "learning_rate": 4e-05, + "loss": 4.7605, + "loss/crossentropy": 2.0294989347457886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20356900244951248, + "step": 3942 + }, + { + "epoch": 0.32866666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 4e-05, + "loss": 4.9209, + "loss/crossentropy": 1.5529565215110779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1776176542043686, + "step": 3944 + }, + { + "epoch": 0.3288333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.03951416015625, + "learning_rate": 4e-05, + "loss": 4.7867, + "loss/crossentropy": 1.780011311173439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17883405461907387, + "step": 3946 + }, + { + "epoch": 0.329, + "grad_norm": 5.0625, + "grad_norm_var": 0.063134765625, + "learning_rate": 4e-05, + "loss": 4.7582, + "loss/crossentropy": 1.2589640021324158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15058333426713943, + "step": 3948 + }, + { + "epoch": 0.32916666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.17603759765625, + "learning_rate": 4e-05, + "loss": 5.0849, + "loss/crossentropy": 2.279158651828766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21800993382930756, + "step": 3950 + }, + { + "epoch": 0.3293333333333333, + "grad_norm": 5.625, + "grad_norm_var": 0.1841796875, + "learning_rate": 4e-05, + "loss": 5.1935, + "loss/crossentropy": 2.0449778214097023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19124972261488438, + "step": 3952 + }, + { + "epoch": 0.3295, + "grad_norm": 5.09375, + "grad_norm_var": 0.151416015625, + "learning_rate": 4e-05, + "loss": 4.8903, + "loss/crossentropy": 2.590156674385071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216012328863144, + "step": 3954 + }, + { + "epoch": 0.32966666666666666, + "grad_norm": 4.34375, + "grad_norm_var": 0.19921875, + "learning_rate": 4e-05, + "loss": 4.3514, + "loss/crossentropy": 2.0297087728977203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1800425611436367, + "step": 3956 + }, + { + "epoch": 0.3298333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.24986572265625, + "learning_rate": 4e-05, + "loss": 4.1529, + "loss/crossentropy": 1.5149021744728088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15365608409047127, + "step": 3958 + }, + { + "epoch": 0.33, + "grad_norm": 5.65625, + "grad_norm_var": 0.32239176432291666, + "learning_rate": 4e-05, + "loss": 4.6866, + "loss/crossentropy": 2.2006970942020416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23189735412597656, + "step": 3960 + }, + { + "epoch": 0.33016666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.3184895833333333, + "learning_rate": 4e-05, + "loss": 4.9903, + "loss/crossentropy": 1.732621654868126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17110507935285568, + "step": 3962 + }, + { + "epoch": 0.3303333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.33411051432291666, + "learning_rate": 4e-05, + "loss": 5.0462, + "loss/crossentropy": 2.0788157284259796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21352742239832878, + "step": 3964 + }, + { + "epoch": 0.3305, + "grad_norm": 5.15625, + "grad_norm_var": 0.22056884765625, + "learning_rate": 4e-05, + "loss": 5.4219, + "loss/crossentropy": 2.5594743490219116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218813955783844, + "step": 3966 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 5.5, + "grad_norm_var": 0.7591756184895834, + "learning_rate": 4e-05, + "loss": 5.2523, + "loss/crossentropy": 2.548967719078064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22973427176475525, + "step": 3968 + }, + { + "epoch": 0.3308333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.7623006184895833, + "learning_rate": 4e-05, + "loss": 4.8598, + "loss/crossentropy": 2.1485989689826965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088041864335537, + "step": 3970 + }, + { + "epoch": 0.331, + "grad_norm": 4.84375, + "grad_norm_var": 0.7180623372395833, + "learning_rate": 4e-05, + "loss": 4.9403, + "loss/crossentropy": 1.671858310699463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029484622180462, + "step": 3972 + }, + { + "epoch": 0.33116666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.6586588541666667, + "learning_rate": 4e-05, + "loss": 4.9027, + "loss/crossentropy": 2.0752905011177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24618571624159813, + "step": 3974 + }, + { + "epoch": 0.3313333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.6116495768229167, + "learning_rate": 4e-05, + "loss": 5.267, + "loss/crossentropy": 2.5116668939590454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215965948998928, + "step": 3976 + }, + { + "epoch": 0.3315, + "grad_norm": 4.96875, + "grad_norm_var": 0.61109619140625, + "learning_rate": 4e-05, + "loss": 5.0597, + "loss/crossentropy": 2.00965479016304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21152685582637787, + "step": 3978 + }, + { + "epoch": 0.33166666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.5948201497395833, + "learning_rate": 4e-05, + "loss": 5.0523, + "loss/crossentropy": 2.200817584991455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879693791270256, + "step": 3980 + }, + { + "epoch": 0.3318333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.59000244140625, + "learning_rate": 4e-05, + "loss": 5.1081, + "loss/crossentropy": 1.3755600899457932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17068074271082878, + "step": 3982 + }, + { + "epoch": 0.332, + "grad_norm": 4.5, + "grad_norm_var": 0.05705973307291667, + "learning_rate": 4e-05, + "loss": 5.1049, + "loss/crossentropy": 2.173791080713272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22593029215931892, + "step": 3984 + }, + { + "epoch": 0.33216666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.19778238932291667, + "learning_rate": 4e-05, + "loss": 5.1376, + "loss/crossentropy": 2.2958777248859406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23552871868014336, + "step": 3986 + }, + { + "epoch": 0.3323333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.21233317057291667, + "learning_rate": 4e-05, + "loss": 5.0748, + "loss/crossentropy": 1.2604089081287384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17020060494542122, + "step": 3988 + }, + { + "epoch": 0.3325, + "grad_norm": 4.9375, + "grad_norm_var": 0.20779622395833333, + "learning_rate": 4e-05, + "loss": 4.9403, + "loss/crossentropy": 1.6421017423272133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15172147378325462, + "step": 3990 + }, + { + "epoch": 0.33266666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.20526936848958333, + "learning_rate": 4e-05, + "loss": 5.3164, + "loss/crossentropy": 2.2981130182743073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21888618916273117, + "step": 3992 + }, + { + "epoch": 0.3328333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.20545247395833333, + "learning_rate": 4e-05, + "loss": 5.0443, + "loss/crossentropy": 2.0642926692962646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18958804570138454, + "step": 3994 + }, + { + "epoch": 0.333, + "grad_norm": 5.1875, + "grad_norm_var": 0.22216389973958334, + "learning_rate": 4e-05, + "loss": 5.0989, + "loss/crossentropy": 2.440661907196045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130712941288948, + "step": 3996 + }, + { + "epoch": 0.33316666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.23648681640625, + "learning_rate": 4e-05, + "loss": 4.2979, + "loss/crossentropy": 1.7171569466590881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16440174356102943, + "step": 3998 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.211572265625, + "learning_rate": 4e-05, + "loss": 5.0666, + "loss/crossentropy": 2.1179882287979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530518516898155, + "step": 4000 + }, + { + "epoch": 0.3335, + "grad_norm": 5.0, + "grad_norm_var": 0.07102864583333333, + "learning_rate": 4e-05, + "loss": 4.9151, + "loss/crossentropy": 2.2780506312847137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23622548207640648, + "step": 4002 + }, + { + "epoch": 0.33366666666666667, + "grad_norm": 5.59375, + "grad_norm_var": 0.08527018229166666, + "learning_rate": 4e-05, + "loss": 4.5568, + "loss/crossentropy": 1.0578216835856438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17180327884852886, + "step": 4004 + }, + { + "epoch": 0.3338333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08487955729166667, + "learning_rate": 4e-05, + "loss": 4.5557, + "loss/crossentropy": 2.4999157786369324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154349498450756, + "step": 4006 + }, + { + "epoch": 0.334, + "grad_norm": 4.8125, + "grad_norm_var": 0.0830078125, + "learning_rate": 4e-05, + "loss": 4.9014, + "loss/crossentropy": 2.443576067686081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23118840903043747, + "step": 4008 + }, + { + "epoch": 0.33416666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.07903645833333334, + "learning_rate": 4e-05, + "loss": 4.7797, + "loss/crossentropy": 2.619147837162018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22277311235666275, + "step": 4010 + }, + { + "epoch": 0.3343333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.07941080729166666, + "learning_rate": 4e-05, + "loss": 4.6468, + "loss/crossentropy": 1.4702613353729248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15802369080483913, + "step": 4012 + }, + { + "epoch": 0.3345, + "grad_norm": 4.96875, + "grad_norm_var": 0.06698811848958333, + "learning_rate": 4e-05, + "loss": 4.8489, + "loss/crossentropy": 2.362972617149353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217639461159706, + "step": 4014 + }, + { + "epoch": 0.33466666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.06425374348958333, + "learning_rate": 4e-05, + "loss": 5.0929, + "loss/crossentropy": 2.2614522576332092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22378670051693916, + "step": 4016 + }, + { + "epoch": 0.3348333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.06874593098958333, + "learning_rate": 4e-05, + "loss": 4.8375, + "loss/crossentropy": 2.6223338842391968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232932560145855, + "step": 4018 + }, + { + "epoch": 0.335, + "grad_norm": 4.90625, + "grad_norm_var": 0.028238932291666668, + "learning_rate": 4e-05, + "loss": 4.5218, + "loss/crossentropy": 1.5880136415362358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15379321202635765, + "step": 4020 + }, + { + "epoch": 0.33516666666666667, + "grad_norm": 5.5625, + "grad_norm_var": 0.05325113932291667, + "learning_rate": 4e-05, + "loss": 5.2639, + "loss/crossentropy": 1.8124565333127975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19380969554185867, + "step": 4022 + }, + { + "epoch": 0.3353333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.083203125, + "learning_rate": 4e-05, + "loss": 4.8314, + "loss/crossentropy": 1.6842049807310104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17753949016332626, + "step": 4024 + }, + { + "epoch": 0.3355, + "grad_norm": 4.96875, + "grad_norm_var": 0.08212483723958333, + "learning_rate": 4e-05, + "loss": 5.4194, + "loss/crossentropy": 2.30204838514328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22381476312875748, + "step": 4026 + }, + { + "epoch": 0.33566666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.06549479166666666, + "learning_rate": 4e-05, + "loss": 5.2147, + "loss/crossentropy": 2.446783661842346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124006450176239, + "step": 4028 + }, + { + "epoch": 0.3358333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.07688802083333333, + "learning_rate": 4e-05, + "loss": 4.4994, + "loss/crossentropy": 1.6328002288937569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17847843281924725, + "step": 4030 + }, + { + "epoch": 0.336, + "grad_norm": 5.0625, + "grad_norm_var": 0.07667643229166667, + "learning_rate": 4e-05, + "loss": 4.9838, + "loss/crossentropy": 1.2188917100429535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14789031259715557, + "step": 4032 + }, + { + "epoch": 0.33616666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.07649332682291667, + "learning_rate": 4e-05, + "loss": 4.4962, + "loss/crossentropy": 2.2478115260601044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19586298614740372, + "step": 4034 + }, + { + "epoch": 0.3363333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.08118082682291666, + "learning_rate": 4e-05, + "loss": 4.9507, + "loss/crossentropy": 2.4908804297447205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20837373286485672, + "step": 4036 + }, + { + "epoch": 0.3365, + "grad_norm": 5.375, + "grad_norm_var": 0.06300455729166667, + "learning_rate": 4e-05, + "loss": 5.0215, + "loss/crossentropy": 2.0036857947707176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18870280869305134, + "step": 4038 + }, + { + "epoch": 0.33666666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.046614583333333334, + "learning_rate": 4e-05, + "loss": 4.7382, + "loss/crossentropy": 1.7384353280067444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17724663019180298, + "step": 4040 + }, + { + "epoch": 0.3368333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.047770182291666664, + "learning_rate": 4e-05, + "loss": 4.969, + "loss/crossentropy": 1.8062372133135796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18678754568099976, + "step": 4042 + }, + { + "epoch": 0.337, + "grad_norm": 5.15625, + "grad_norm_var": 0.05071614583333333, + "learning_rate": 4e-05, + "loss": 5.4893, + "loss/crossentropy": 2.328328639268875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2098284848034382, + "step": 4044 + }, + { + "epoch": 0.33716666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.03817952473958333, + "learning_rate": 4e-05, + "loss": 4.7384, + "loss/crossentropy": 2.3635981678962708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23770976066589355, + "step": 4046 + }, + { + "epoch": 0.3373333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.03723958333333333, + "learning_rate": 4e-05, + "loss": 5.3287, + "loss/crossentropy": 2.3061963617801666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013118974864483, + "step": 4048 + }, + { + "epoch": 0.3375, + "grad_norm": 5.1875, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 4e-05, + "loss": 5.528, + "loss/crossentropy": 2.056824043393135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19357911497354507, + "step": 4050 + }, + { + "epoch": 0.33766666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.04153645833333333, + "learning_rate": 4e-05, + "loss": 4.2291, + "loss/crossentropy": 1.8092492744326591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18169726617634296, + "step": 4052 + }, + { + "epoch": 0.3378333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.03385009765625, + "learning_rate": 4e-05, + "loss": 4.9316, + "loss/crossentropy": 1.640300840139389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402212649583817, + "step": 4054 + }, + { + "epoch": 0.338, + "grad_norm": 5.28125, + "grad_norm_var": 0.0404296875, + "learning_rate": 4e-05, + "loss": 5.1329, + "loss/crossentropy": 2.0621906220912933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923184767365456, + "step": 4056 + }, + { + "epoch": 0.33816666666666667, + "grad_norm": 5.96875, + "grad_norm_var": 0.108056640625, + "learning_rate": 4e-05, + "loss": 5.268, + "loss/crossentropy": 1.8473474383354187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038598507642746, + "step": 4058 + }, + { + "epoch": 0.3383333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.10699462890625, + "learning_rate": 4e-05, + "loss": 4.5688, + "loss/crossentropy": 1.7940621376037598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18157127313315868, + "step": 4060 + }, + { + "epoch": 0.3385, + "grad_norm": 4.84375, + "grad_norm_var": 0.10950520833333334, + "learning_rate": 4e-05, + "loss": 5.1588, + "loss/crossentropy": 1.2232213392853737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12759443558752537, + "step": 4062 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.12467041015625, + "learning_rate": 4e-05, + "loss": 5.0865, + "loss/crossentropy": 1.5486139208078384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.165193947032094, + "step": 4064 + }, + { + "epoch": 0.3388333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.11685791015625, + "learning_rate": 4e-05, + "loss": 4.8044, + "loss/crossentropy": 1.621782824397087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1728647444397211, + "step": 4066 + }, + { + "epoch": 0.339, + "grad_norm": 4.875, + "grad_norm_var": 0.1095703125, + "learning_rate": 4e-05, + "loss": 4.9063, + "loss/crossentropy": 2.137328863143921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19338041357696056, + "step": 4068 + }, + { + "epoch": 0.33916666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.09488525390625, + "learning_rate": 4e-05, + "loss": 4.7485, + "loss/crossentropy": 2.2186881601810455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21209581568837166, + "step": 4070 + }, + { + "epoch": 0.3393333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.10592447916666667, + "learning_rate": 4e-05, + "loss": 5.1329, + "loss/crossentropy": 2.485084891319275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2395104244351387, + "step": 4072 + }, + { + "epoch": 0.3395, + "grad_norm": 4.84375, + "grad_norm_var": 0.04698893229166667, + "learning_rate": 4e-05, + "loss": 4.7952, + "loss/crossentropy": 2.354505956172943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22287846729159355, + "step": 4074 + }, + { + "epoch": 0.3396666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.07971598307291666, + "learning_rate": 4e-05, + "loss": 4.7303, + "loss/crossentropy": 2.4395925402641296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21506384015083313, + "step": 4076 + }, + { + "epoch": 0.3398333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.07437744140625, + "learning_rate": 4e-05, + "loss": 5.1776, + "loss/crossentropy": 2.2610137462615967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21337458491325378, + "step": 4078 + }, + { + "epoch": 0.34, + "grad_norm": 4.875, + "grad_norm_var": 0.06252848307291667, + "learning_rate": 4e-05, + "loss": 4.7574, + "loss/crossentropy": 1.0004555508494377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10723226889967918, + "step": 4080 + }, + { + "epoch": 0.3401666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.07395426432291667, + "learning_rate": 4e-05, + "loss": 4.4206, + "loss/crossentropy": 1.8156683892011642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18349895626306534, + "step": 4082 + }, + { + "epoch": 0.3403333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.09114583333333333, + "learning_rate": 4e-05, + "loss": 5.0601, + "loss/crossentropy": 1.5282996445894241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19007975049316883, + "step": 4084 + }, + { + "epoch": 0.3405, + "grad_norm": 4.875, + "grad_norm_var": 0.09127197265625, + "learning_rate": 4e-05, + "loss": 5.1873, + "loss/crossentropy": 1.8278708755970001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1809481494128704, + "step": 4086 + }, + { + "epoch": 0.3406666666666667, + "grad_norm": 5.875, + "grad_norm_var": 0.1298828125, + "learning_rate": 4e-05, + "loss": 5.6198, + "loss/crossentropy": 1.9336512684822083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21758461371064186, + "step": 4088 + }, + { + "epoch": 0.3408333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.14052327473958334, + "learning_rate": 4e-05, + "loss": 4.6314, + "loss/crossentropy": 1.5730094835162163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1618702970445156, + "step": 4090 + }, + { + "epoch": 0.341, + "grad_norm": 5.15625, + "grad_norm_var": 0.10575764973958333, + "learning_rate": 4e-05, + "loss": 5.2243, + "loss/crossentropy": 2.555690288543701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24880832433700562, + "step": 4092 + }, + { + "epoch": 0.3411666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.13854166666666667, + "learning_rate": 4e-05, + "loss": 4.7328, + "loss/crossentropy": 2.2953881919384003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20687290653586388, + "step": 4094 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.16490885416666667, + "learning_rate": 4e-05, + "loss": 4.4851, + "loss/crossentropy": 2.585106134414673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208958238363266, + "step": 4096 + }, + { + "epoch": 0.3415, + "grad_norm": 4.84375, + "grad_norm_var": 0.15780843098958333, + "learning_rate": 4e-05, + "loss": 5.0493, + "loss/crossentropy": 2.4826498925685883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24454660713672638, + "step": 4098 + }, + { + "epoch": 0.3416666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.13704427083333334, + "learning_rate": 4e-05, + "loss": 5.0745, + "loss/crossentropy": 2.297918736934662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883096918463707, + "step": 4100 + }, + { + "epoch": 0.3418333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.13837483723958333, + "learning_rate": 4e-05, + "loss": 4.8996, + "loss/crossentropy": 2.1561270356178284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22525209560990334, + "step": 4102 + }, + { + "epoch": 0.342, + "grad_norm": 4.9375, + "grad_norm_var": 0.05943603515625, + "learning_rate": 4e-05, + "loss": 4.8204, + "loss/crossentropy": 2.0578841269016266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22047049179673195, + "step": 4104 + }, + { + "epoch": 0.3421666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.058447265625, + "learning_rate": 4e-05, + "loss": 4.7867, + "loss/crossentropy": 2.4789949655532837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21268969774246216, + "step": 4106 + }, + { + "epoch": 0.3423333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.05172119140625, + "learning_rate": 4e-05, + "loss": 4.9054, + "loss/crossentropy": 2.394868552684784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20047050714492798, + "step": 4108 + }, + { + "epoch": 0.3425, + "grad_norm": 4.96875, + "grad_norm_var": 0.052083333333333336, + "learning_rate": 4e-05, + "loss": 5.0385, + "loss/crossentropy": 2.43982994556427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248744055628777, + "step": 4110 + }, + { + "epoch": 0.3426666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.03857014973958333, + "learning_rate": 4e-05, + "loss": 5.0087, + "loss/crossentropy": 1.919486790895462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2137829028069973, + "step": 4112 + }, + { + "epoch": 0.3428333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.05504150390625, + "learning_rate": 4e-05, + "loss": 4.3936, + "loss/crossentropy": 0.8350804150104523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11626596376299858, + "step": 4114 + }, + { + "epoch": 0.343, + "grad_norm": 5.25, + "grad_norm_var": 0.05771077473958333, + "learning_rate": 4e-05, + "loss": 4.821, + "loss/crossentropy": 1.480412408709526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16480615735054016, + "step": 4116 + }, + { + "epoch": 0.3431666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.13240559895833334, + "learning_rate": 4e-05, + "loss": 4.6536, + "loss/crossentropy": 1.9430910348892212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1870816107839346, + "step": 4118 + }, + { + "epoch": 0.3433333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.13743489583333332, + "learning_rate": 4e-05, + "loss": 5.11, + "loss/crossentropy": 2.1659523844718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21893510594964027, + "step": 4120 + }, + { + "epoch": 0.3435, + "grad_norm": 4.84375, + "grad_norm_var": 0.13619384765625, + "learning_rate": 4e-05, + "loss": 5.0083, + "loss/crossentropy": 2.2735475897789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21642907708883286, + "step": 4122 + }, + { + "epoch": 0.3436666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.13544514973958333, + "learning_rate": 4e-05, + "loss": 5.1141, + "loss/crossentropy": 2.3820077180862427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19136777892708778, + "step": 4124 + }, + { + "epoch": 0.3438333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.13720296223958334, + "learning_rate": 4e-05, + "loss": 4.7251, + "loss/crossentropy": 1.08474662899971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13359439186751842, + "step": 4126 + }, + { + "epoch": 0.344, + "grad_norm": 5.28125, + "grad_norm_var": 0.13561197916666667, + "learning_rate": 4e-05, + "loss": 4.8041, + "loss/crossentropy": 2.264466494321823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23006341233849525, + "step": 4128 + }, + { + "epoch": 0.3441666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.13995768229166666, + "learning_rate": 4e-05, + "loss": 5.1237, + "loss/crossentropy": 2.056764245033264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545086592435837, + "step": 4130 + }, + { + "epoch": 0.3443333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.13658854166666667, + "learning_rate": 4e-05, + "loss": 5.2477, + "loss/crossentropy": 2.574945330619812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190633974969387, + "step": 4132 + }, + { + "epoch": 0.3445, + "grad_norm": 5.15625, + "grad_norm_var": 0.07923177083333334, + "learning_rate": 4e-05, + "loss": 5.1239, + "loss/crossentropy": 2.2885874211788177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122734598815441, + "step": 4134 + }, + { + "epoch": 0.3446666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.07522379557291667, + "learning_rate": 4e-05, + "loss": 4.9945, + "loss/crossentropy": 1.8102921470999718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18051475286483765, + "step": 4136 + }, + { + "epoch": 0.3448333333333333, + "grad_norm": 19.875, + "grad_norm_var": 13.979130045572917, + "learning_rate": 4e-05, + "loss": 5.1695, + "loss/crossentropy": 2.2670177817344666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23726213350892067, + "step": 4138 + }, + { + "epoch": 0.345, + "grad_norm": 4.875, + "grad_norm_var": 13.991129557291666, + "learning_rate": 4e-05, + "loss": 5.1073, + "loss/crossentropy": 1.9467437043786049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18490321934223175, + "step": 4140 + }, + { + "epoch": 0.3451666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 13.946028645833334, + "learning_rate": 4e-05, + "loss": 4.7978, + "loss/crossentropy": 1.795416384935379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21431923657655716, + "step": 4142 + }, + { + "epoch": 0.3453333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 13.94879150390625, + "learning_rate": 4e-05, + "loss": 5.0306, + "loss/crossentropy": 1.867057092487812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19479823857545853, + "step": 4144 + }, + { + "epoch": 0.3455, + "grad_norm": 4.53125, + "grad_norm_var": 13.983056640625, + "learning_rate": 4e-05, + "loss": 4.3536, + "loss/crossentropy": 2.0579030513763428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20508618280291557, + "step": 4146 + }, + { + "epoch": 0.3456666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 13.959830729166667, + "learning_rate": 4e-05, + "loss": 4.9017, + "loss/crossentropy": 1.839685283601284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.169641749933362, + "step": 4148 + }, + { + "epoch": 0.3458333333333333, + "grad_norm": 5.65625, + "grad_norm_var": 13.884273274739583, + "learning_rate": 4e-05, + "loss": 5.2768, + "loss/crossentropy": 1.167643092572689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15179457888007164, + "step": 4150 + }, + { + "epoch": 0.346, + "grad_norm": 5.03125, + "grad_norm_var": 13.831624348958334, + "learning_rate": 4e-05, + "loss": 4.6659, + "loss/crossentropy": 2.5688222646713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23355836793780327, + "step": 4152 + }, + { + "epoch": 0.3461666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.09479166666666666, + "learning_rate": 4e-05, + "loss": 4.8113, + "loss/crossentropy": 2.2830787897109985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20207640901207924, + "step": 4154 + }, + { + "epoch": 0.3463333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.09582926432291666, + "learning_rate": 4e-05, + "loss": 5.0874, + "loss/crossentropy": 2.287237584590912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2221594713628292, + "step": 4156 + }, + { + "epoch": 0.3465, + "grad_norm": 4.75, + "grad_norm_var": 0.10351155598958334, + "learning_rate": 4e-05, + "loss": 4.7523, + "loss/crossentropy": 1.992442563176155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241856060922146, + "step": 4158 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.10006510416666667, + "learning_rate": 4e-05, + "loss": 5.429, + "loss/crossentropy": 2.3137503266334534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21104448288679123, + "step": 4160 + }, + { + "epoch": 0.3468333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.10113525390625, + "learning_rate": 4e-05, + "loss": 5.909, + "loss/crossentropy": 2.0655910670757294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179282046854496, + "step": 4162 + }, + { + "epoch": 0.347, + "grad_norm": 4.75, + "grad_norm_var": 0.10198160807291666, + "learning_rate": 4e-05, + "loss": 4.1742, + "loss/crossentropy": 1.2919567823410034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13722717948257923, + "step": 4164 + }, + { + "epoch": 0.3471666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.074853515625, + "learning_rate": 4e-05, + "loss": 5.3134, + "loss/crossentropy": 2.1499520242214203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992734745144844, + "step": 4166 + }, + { + "epoch": 0.3473333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.07672119140625, + "learning_rate": 4e-05, + "loss": 4.2854, + "loss/crossentropy": 1.5794583559036255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16880195401608944, + "step": 4168 + }, + { + "epoch": 0.3475, + "grad_norm": 4.78125, + "grad_norm_var": 0.08603108723958333, + "learning_rate": 4e-05, + "loss": 4.7931, + "loss/crossentropy": 1.4645969420671463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14471661485731602, + "step": 4170 + }, + { + "epoch": 0.3476666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.11170247395833334, + "learning_rate": 4e-05, + "loss": 4.9849, + "loss/crossentropy": 2.362588882446289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20443035662174225, + "step": 4172 + }, + { + "epoch": 0.3478333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.10764567057291667, + "learning_rate": 4e-05, + "loss": 5.4635, + "loss/crossentropy": 2.2797932624816895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22322962060570717, + "step": 4174 + }, + { + "epoch": 0.348, + "grad_norm": 4.40625, + "grad_norm_var": 0.12890625, + "learning_rate": 4e-05, + "loss": 4.8326, + "loss/crossentropy": 2.0911890268325806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915295533835888, + "step": 4176 + }, + { + "epoch": 0.3481666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.120947265625, + "learning_rate": 4e-05, + "loss": 5.0474, + "loss/crossentropy": 2.514350116252899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21622708812355995, + "step": 4178 + }, + { + "epoch": 0.34833333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.0876953125, + "learning_rate": 4e-05, + "loss": 5.1258, + "loss/crossentropy": 1.924629956483841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893673911690712, + "step": 4180 + }, + { + "epoch": 0.3485, + "grad_norm": 4.71875, + "grad_norm_var": 0.08136393229166666, + "learning_rate": 4e-05, + "loss": 5.2519, + "loss/crossentropy": 1.5494660213589668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15524601563811302, + "step": 4182 + }, + { + "epoch": 0.3486666666666667, + "grad_norm": 5.625, + "grad_norm_var": 0.09410400390625, + "learning_rate": 4e-05, + "loss": 4.6197, + "loss/crossentropy": 1.914233423769474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18002042546868324, + "step": 4184 + }, + { + "epoch": 0.34883333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.10506184895833333, + "learning_rate": 4e-05, + "loss": 4.6615, + "loss/crossentropy": 1.8490305989980698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15980881080031395, + "step": 4186 + }, + { + "epoch": 0.349, + "grad_norm": 5.0, + "grad_norm_var": 0.08748372395833333, + "learning_rate": 4e-05, + "loss": 5.1398, + "loss/crossentropy": 1.982210248708725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987055353820324, + "step": 4188 + }, + { + "epoch": 0.3491666666666667, + "grad_norm": 5.71875, + "grad_norm_var": 0.12515869140625, + "learning_rate": 4e-05, + "loss": 4.8987, + "loss/crossentropy": 1.8878967761993408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18452826887369156, + "step": 4190 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.12565104166666666, + "learning_rate": 4e-05, + "loss": 4.2882, + "loss/crossentropy": 1.7068097814917564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1642775908112526, + "step": 4192 + }, + { + "epoch": 0.3495, + "grad_norm": 5.0625, + "grad_norm_var": 0.11731770833333334, + "learning_rate": 4e-05, + "loss": 5.2158, + "loss/crossentropy": 1.33222147077322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15623358637094498, + "step": 4194 + }, + { + "epoch": 0.3496666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.12068684895833333, + "learning_rate": 4e-05, + "loss": 4.7641, + "loss/crossentropy": 1.6291079074144363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752131376415491, + "step": 4196 + }, + { + "epoch": 0.34983333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.11979166666666667, + "learning_rate": 4e-05, + "loss": 4.4284, + "loss/crossentropy": 1.9068839251995087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23426436632871628, + "step": 4198 + }, + { + "epoch": 0.35, + "grad_norm": 5.09375, + "grad_norm_var": 0.09062093098958333, + "learning_rate": 4e-05, + "loss": 5.1638, + "loss/crossentropy": 1.8091362193226814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16963034123182297, + "step": 4200 + }, + { + "epoch": 0.3501666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.08717447916666667, + "learning_rate": 4e-05, + "loss": 5.1473, + "loss/crossentropy": 2.4834869503974915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2258441038429737, + "step": 4202 + }, + { + "epoch": 0.35033333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.08977457682291666, + "learning_rate": 4e-05, + "loss": 5.0231, + "loss/crossentropy": 1.9468242824077606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19090471975505352, + "step": 4204 + }, + { + "epoch": 0.3505, + "grad_norm": 5.125, + "grad_norm_var": 0.05871988932291667, + "learning_rate": 4e-05, + "loss": 4.3702, + "loss/crossentropy": 2.568976879119873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22428393363952637, + "step": 4206 + }, + { + "epoch": 0.3506666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.03804931640625, + "learning_rate": 4e-05, + "loss": 5.1383, + "loss/crossentropy": 2.1792095601558685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2094913348555565, + "step": 4208 + }, + { + "epoch": 0.35083333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.036942545572916666, + "learning_rate": 4e-05, + "loss": 4.7552, + "loss/crossentropy": 1.8818095847964287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17569194920361042, + "step": 4210 + }, + { + "epoch": 0.351, + "grad_norm": 5.15625, + "grad_norm_var": 0.08564046223958334, + "learning_rate": 4e-05, + "loss": 5.2439, + "loss/crossentropy": 1.962063044309616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20449374616146088, + "step": 4212 + }, + { + "epoch": 0.3511666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.08448893229166667, + "learning_rate": 4e-05, + "loss": 5.1916, + "loss/crossentropy": 2.030742183327675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19212795421481133, + "step": 4214 + }, + { + "epoch": 0.35133333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08749593098958333, + "learning_rate": 4e-05, + "loss": 5.1428, + "loss/crossentropy": 2.2157254815101624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048327773809433, + "step": 4216 + }, + { + "epoch": 0.3515, + "grad_norm": 4.59375, + "grad_norm_var": 0.08971354166666666, + "learning_rate": 4e-05, + "loss": 4.7645, + "loss/crossentropy": 1.7414375841617584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17881914600729942, + "step": 4218 + }, + { + "epoch": 0.3516666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.09138997395833333, + "learning_rate": 4e-05, + "loss": 5.0248, + "loss/crossentropy": 1.7276012226939201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17646119743585587, + "step": 4220 + }, + { + "epoch": 0.35183333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.08346354166666667, + "learning_rate": 4e-05, + "loss": 4.4493, + "loss/crossentropy": 1.6815314665436745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1717877183109522, + "step": 4222 + }, + { + "epoch": 0.352, + "grad_norm": 4.5, + "grad_norm_var": 0.11477457682291667, + "learning_rate": 4e-05, + "loss": 5.1927, + "loss/crossentropy": 2.431355118751526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992778554558754, + "step": 4224 + }, + { + "epoch": 0.3521666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.12667643229166667, + "learning_rate": 4e-05, + "loss": 4.4977, + "loss/crossentropy": 2.3680657148361206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199774120002985, + "step": 4226 + }, + { + "epoch": 0.35233333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.07472330729166667, + "learning_rate": 4e-05, + "loss": 5.2512, + "loss/crossentropy": 2.244264602661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101249285042286, + "step": 4228 + }, + { + "epoch": 0.3525, + "grad_norm": 5.0625, + "grad_norm_var": 0.08424072265625, + "learning_rate": 4e-05, + "loss": 5.2336, + "loss/crossentropy": 1.5435407906770706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15030980855226517, + "step": 4230 + }, + { + "epoch": 0.3526666666666667, + "grad_norm": 5.5625, + "grad_norm_var": 0.108056640625, + "learning_rate": 4e-05, + "loss": 5.0424, + "loss/crossentropy": 1.6612791121006012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18562641739845276, + "step": 4232 + }, + { + "epoch": 0.35283333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.09986979166666667, + "learning_rate": 4e-05, + "loss": 5.0052, + "loss/crossentropy": 1.051307551562786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12871519662439823, + "step": 4234 + }, + { + "epoch": 0.353, + "grad_norm": 4.875, + "grad_norm_var": 0.10552978515625, + "learning_rate": 4e-05, + "loss": 4.8386, + "loss/crossentropy": 2.6155091524124146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23894379287958145, + "step": 4236 + }, + { + "epoch": 0.3531666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.102734375, + "learning_rate": 4e-05, + "loss": 5.061, + "loss/crossentropy": 1.767829805612564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993457730859518, + "step": 4238 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.09159749348958333, + "learning_rate": 4e-05, + "loss": 4.9406, + "loss/crossentropy": 1.423234261572361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1627085842192173, + "step": 4240 + }, + { + "epoch": 0.3535, + "grad_norm": 5.125, + "grad_norm_var": 0.08586832682291666, + "learning_rate": 4e-05, + "loss": 5.243, + "loss/crossentropy": 2.1509978771209717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19178292714059353, + "step": 4242 + }, + { + "epoch": 0.3536666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.085791015625, + "learning_rate": 4e-05, + "loss": 4.8738, + "loss/crossentropy": 2.137165904045105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19972681626677513, + "step": 4244 + }, + { + "epoch": 0.35383333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.07597249348958333, + "learning_rate": 4e-05, + "loss": 4.5654, + "loss/crossentropy": 1.1787279769778252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13839380256831646, + "step": 4246 + }, + { + "epoch": 0.354, + "grad_norm": 5.0, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 4e-05, + "loss": 4.8617, + "loss/crossentropy": 2.045122891664505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18550307303667068, + "step": 4248 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.10305582682291667, + "learning_rate": 4e-05, + "loss": 5.1557, + "loss/crossentropy": 1.8815576285123825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1797770895063877, + "step": 4250 + }, + { + "epoch": 0.35433333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.10064697265625, + "learning_rate": 4e-05, + "loss": 4.974, + "loss/crossentropy": 1.4666025638580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15171694196760654, + "step": 4252 + }, + { + "epoch": 0.3545, + "grad_norm": 5.0625, + "grad_norm_var": 0.15735270182291666, + "learning_rate": 4e-05, + "loss": 4.9926, + "loss/crossentropy": 2.0984753370285034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19612977281212807, + "step": 4254 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.13743082682291666, + "learning_rate": 4e-05, + "loss": 5.5811, + "loss/crossentropy": 2.1417490541934967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23930613696575165, + "step": 4256 + }, + { + "epoch": 0.35483333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.14933268229166666, + "learning_rate": 4e-05, + "loss": 4.403, + "loss/crossentropy": 1.3386527448892593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1588802933692932, + "step": 4258 + }, + { + "epoch": 0.355, + "grad_norm": 5.34375, + "grad_norm_var": 0.15167643229166666, + "learning_rate": 4e-05, + "loss": 5.0899, + "loss/crossentropy": 1.9312639832496643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182557824999094, + "step": 4260 + }, + { + "epoch": 0.3551666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.162353515625, + "learning_rate": 4e-05, + "loss": 4.8298, + "loss/crossentropy": 1.7975911796092987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19092736393213272, + "step": 4262 + }, + { + "epoch": 0.35533333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.16760660807291666, + "learning_rate": 4e-05, + "loss": 5.2307, + "loss/crossentropy": 2.0434736609458923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22150762006640434, + "step": 4264 + }, + { + "epoch": 0.3555, + "grad_norm": 4.75, + "grad_norm_var": 0.12682291666666667, + "learning_rate": 4e-05, + "loss": 4.8851, + "loss/crossentropy": 2.5525485277175903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2277713306248188, + "step": 4266 + }, + { + "epoch": 0.3556666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.11847330729166666, + "learning_rate": 4e-05, + "loss": 5.5178, + "loss/crossentropy": 2.019535183906555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19332855939865112, + "step": 4268 + }, + { + "epoch": 0.35583333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.04980061848958333, + "learning_rate": 4e-05, + "loss": 4.0606, + "loss/crossentropy": 1.5260907039046288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14622344821691513, + "step": 4270 + }, + { + "epoch": 0.356, + "grad_norm": 4.9375, + "grad_norm_var": 0.04390869140625, + "learning_rate": 4e-05, + "loss": 4.8001, + "loss/crossentropy": 1.830235406756401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18560582026839256, + "step": 4272 + }, + { + "epoch": 0.3561666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.04159749348958333, + "learning_rate": 4e-05, + "loss": 5.0943, + "loss/crossentropy": 2.435010313987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23003898561000824, + "step": 4274 + }, + { + "epoch": 0.35633333333333334, + "grad_norm": 6.40625, + "grad_norm_var": 0.17096354166666666, + "learning_rate": 4e-05, + "loss": 5.0332, + "loss/crossentropy": 2.1487464606761932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22359532490372658, + "step": 4276 + }, + { + "epoch": 0.3565, + "grad_norm": 4.90625, + "grad_norm_var": 0.16985270182291667, + "learning_rate": 4e-05, + "loss": 5.2108, + "loss/crossentropy": 3.206775486469269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20537611469626427, + "step": 4278 + }, + { + "epoch": 0.3566666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.16985677083333334, + "learning_rate": 4e-05, + "loss": 5.3511, + "loss/crossentropy": 2.3853268325328827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20269788056612015, + "step": 4280 + }, + { + "epoch": 0.35683333333333334, + "grad_norm": 5.4375, + "grad_norm_var": 0.17174479166666667, + "learning_rate": 4e-05, + "loss": 5.336, + "loss/crossentropy": 1.9683539271354675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21414640918374062, + "step": 4282 + }, + { + "epoch": 0.357, + "grad_norm": 5.03125, + "grad_norm_var": 0.18079427083333333, + "learning_rate": 4e-05, + "loss": 4.8088, + "loss/crossentropy": 1.9340718239545822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17748939990997314, + "step": 4284 + }, + { + "epoch": 0.3571666666666667, + "grad_norm": 5.8125, + "grad_norm_var": 0.20422770182291666, + "learning_rate": 4e-05, + "loss": 5.0841, + "loss/crossentropy": 2.4096251130104065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105432227253914, + "step": 4286 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.2512003580729167, + "learning_rate": 4e-05, + "loss": 5.1465, + "loss/crossentropy": 1.613951414823532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16857917606830597, + "step": 4288 + }, + { + "epoch": 0.3575, + "grad_norm": 5.125, + "grad_norm_var": 0.23918863932291667, + "learning_rate": 4e-05, + "loss": 5.3099, + "loss/crossentropy": 2.3350643515586853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21324753761291504, + "step": 4290 + }, + { + "epoch": 0.3576666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.14329427083333332, + "learning_rate": 4e-05, + "loss": 4.4897, + "loss/crossentropy": 1.8215420618653297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822014134377241, + "step": 4292 + }, + { + "epoch": 0.35783333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.12467041015625, + "learning_rate": 4e-05, + "loss": 5.1288, + "loss/crossentropy": 1.9511115998029709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198295496404171, + "step": 4294 + }, + { + "epoch": 0.358, + "grad_norm": 4.6875, + "grad_norm_var": 0.13853759765625, + "learning_rate": 4e-05, + "loss": 5.0161, + "loss/crossentropy": 2.398141384124756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993892453610897, + "step": 4296 + }, + { + "epoch": 0.3581666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.13880208333333333, + "learning_rate": 4e-05, + "loss": 5.2304, + "loss/crossentropy": 1.723719596862793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2035377323627472, + "step": 4298 + }, + { + "epoch": 0.35833333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.14390869140625, + "learning_rate": 4e-05, + "loss": 4.712, + "loss/crossentropy": 1.977920413017273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20899861678481102, + "step": 4300 + }, + { + "epoch": 0.3585, + "grad_norm": 4.96875, + "grad_norm_var": 0.12115885416666666, + "learning_rate": 4e-05, + "loss": 5.0954, + "loss/crossentropy": 2.031194046139717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107471190392971, + "step": 4302 + }, + { + "epoch": 0.3586666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.07550455729166666, + "learning_rate": 4e-05, + "loss": 5.6159, + "loss/crossentropy": 1.8637639060616493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20326609164476395, + "step": 4304 + }, + { + "epoch": 0.35883333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.07654622395833334, + "learning_rate": 4e-05, + "loss": 4.402, + "loss/crossentropy": 2.2037427127361298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23041347414255142, + "step": 4306 + }, + { + "epoch": 0.359, + "grad_norm": 4.6875, + "grad_norm_var": 0.07929280598958334, + "learning_rate": 4e-05, + "loss": 5.0445, + "loss/crossentropy": 2.564959466457367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22452329471707344, + "step": 4308 + }, + { + "epoch": 0.3591666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.08616129557291667, + "learning_rate": 4e-05, + "loss": 5.4649, + "loss/crossentropy": 2.3605875968933105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296753227710724, + "step": 4310 + }, + { + "epoch": 0.35933333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.07667643229166667, + "learning_rate": 4e-05, + "loss": 5.0013, + "loss/crossentropy": 2.170982927083969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2387605495750904, + "step": 4312 + }, + { + "epoch": 0.3595, + "grad_norm": 4.9375, + "grad_norm_var": 0.0662109375, + "learning_rate": 4e-05, + "loss": 4.6528, + "loss/crossentropy": 2.062200278043747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859907377511263, + "step": 4314 + }, + { + "epoch": 0.3596666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.061051432291666666, + "learning_rate": 4e-05, + "loss": 5.1288, + "loss/crossentropy": 2.2674206495285034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846696570515633, + "step": 4316 + }, + { + "epoch": 0.35983333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.06187744140625, + "learning_rate": 4e-05, + "loss": 5.3712, + "loss/crossentropy": 2.384114146232605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240867242217064, + "step": 4318 + }, + { + "epoch": 0.36, + "grad_norm": 5.15625, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 4e-05, + "loss": 5.3943, + "loss/crossentropy": 2.363667756319046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20440954342484474, + "step": 4320 + }, + { + "epoch": 0.3601666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.03592122395833333, + "learning_rate": 4e-05, + "loss": 5.0069, + "loss/crossentropy": 2.1359744668006897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151579588651657, + "step": 4322 + }, + { + "epoch": 0.36033333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.029292805989583334, + "learning_rate": 4e-05, + "loss": 5.3678, + "loss/crossentropy": 2.1146958768367767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19935699924826622, + "step": 4324 + }, + { + "epoch": 0.3605, + "grad_norm": 4.6875, + "grad_norm_var": 0.026949055989583335, + "learning_rate": 4e-05, + "loss": 5.0466, + "loss/crossentropy": 1.8558301255106926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1702909618616104, + "step": 4326 + }, + { + "epoch": 0.3606666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.024117024739583333, + "learning_rate": 4e-05, + "loss": 5.2054, + "loss/crossentropy": 2.2706758975982666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21369409933686256, + "step": 4328 + }, + { + "epoch": 0.36083333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.02916259765625, + "learning_rate": 4e-05, + "loss": 4.5506, + "loss/crossentropy": 2.637549102306366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644367650151253, + "step": 4330 + }, + { + "epoch": 0.361, + "grad_norm": 5.1875, + "grad_norm_var": 0.033003743489583334, + "learning_rate": 4e-05, + "loss": 5.4144, + "loss/crossentropy": 1.6384681463241577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1661134473979473, + "step": 4332 + }, + { + "epoch": 0.3611666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.021484375, + "learning_rate": 4e-05, + "loss": 5.3093, + "loss/crossentropy": 1.6732659563422203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18927017599344254, + "step": 4334 + }, + { + "epoch": 0.36133333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.018550618489583334, + "learning_rate": 4e-05, + "loss": 5.518, + "loss/crossentropy": 2.198245033621788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16945981606841087, + "step": 4336 + }, + { + "epoch": 0.3615, + "grad_norm": 4.75, + "grad_norm_var": 0.020442708333333334, + "learning_rate": 4e-05, + "loss": 4.5047, + "loss/crossentropy": 2.4632855653762817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22415439784526825, + "step": 4338 + }, + { + "epoch": 0.3616666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.059228515625, + "learning_rate": 4e-05, + "loss": 5.4676, + "loss/crossentropy": 2.4306774139404297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2453540340065956, + "step": 4340 + }, + { + "epoch": 0.36183333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.05826416015625, + "learning_rate": 4e-05, + "loss": 4.868, + "loss/crossentropy": 2.255247950553894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19311066716909409, + "step": 4342 + }, + { + "epoch": 0.362, + "grad_norm": 5.1875, + "grad_norm_var": 0.060282389322916664, + "learning_rate": 4e-05, + "loss": 4.9875, + "loss/crossentropy": 1.7551113069057465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17623233050107956, + "step": 4344 + }, + { + "epoch": 0.3621666666666667, + "grad_norm": 5.625, + "grad_norm_var": 0.07473551432291667, + "learning_rate": 4e-05, + "loss": 4.9721, + "loss/crossentropy": 2.160099893808365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21592870354652405, + "step": 4346 + }, + { + "epoch": 0.36233333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.07928059895833334, + "learning_rate": 4e-05, + "loss": 4.4798, + "loss/crossentropy": 2.1821701526641846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061518207192421, + "step": 4348 + }, + { + "epoch": 0.3625, + "grad_norm": 4.84375, + "grad_norm_var": 0.08125, + "learning_rate": 4e-05, + "loss": 4.5843, + "loss/crossentropy": 1.9987001791596413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1731714904308319, + "step": 4350 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.083447265625, + "learning_rate": 4e-05, + "loss": 4.9023, + "loss/crossentropy": 1.9951072037220001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2395828329026699, + "step": 4352 + }, + { + "epoch": 0.36283333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.10832926432291666, + "learning_rate": 4e-05, + "loss": 4.1862, + "loss/crossentropy": 1.293114811182022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1551448032259941, + "step": 4354 + }, + { + "epoch": 0.363, + "grad_norm": 5.125, + "grad_norm_var": 0.07701822916666666, + "learning_rate": 4e-05, + "loss": 5.3229, + "loss/crossentropy": 1.7663483619689941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1607537753880024, + "step": 4356 + }, + { + "epoch": 0.3631666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.07620035807291667, + "learning_rate": 4e-05, + "loss": 5.3652, + "loss/crossentropy": 2.1835967004299164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20054300501942635, + "step": 4358 + }, + { + "epoch": 0.36333333333333334, + "grad_norm": 5.40625, + "grad_norm_var": 0.08502604166666666, + "learning_rate": 4e-05, + "loss": 4.8332, + "loss/crossentropy": 1.9410650432109833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902483105659485, + "step": 4360 + }, + { + "epoch": 0.3635, + "grad_norm": 4.78125, + "grad_norm_var": 0.06321207682291667, + "learning_rate": 4e-05, + "loss": 4.0222, + "loss/crossentropy": 2.366856336593628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30182891711592674, + "step": 4362 + }, + { + "epoch": 0.3636666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.06246337890625, + "learning_rate": 4e-05, + "loss": 5.0653, + "loss/crossentropy": 2.190703809261322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2186608836054802, + "step": 4364 + }, + { + "epoch": 0.36383333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.0689453125, + "learning_rate": 4e-05, + "loss": 4.6313, + "loss/crossentropy": 1.8244432508945465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1971954982727766, + "step": 4366 + }, + { + "epoch": 0.364, + "grad_norm": 6.09375, + "grad_norm_var": 0.14373372395833334, + "learning_rate": 4e-05, + "loss": 5.0425, + "loss/crossentropy": 1.3379913195967674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181451365351677, + "step": 4368 + }, + { + "epoch": 0.3641666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.22604166666666667, + "learning_rate": 4e-05, + "loss": 4.6713, + "loss/crossentropy": 2.4003341794013977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20302216708660126, + "step": 4370 + }, + { + "epoch": 0.36433333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.22392171223958332, + "learning_rate": 4e-05, + "loss": 5.2939, + "loss/crossentropy": 1.87662872672081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22657470777630806, + "step": 4372 + }, + { + "epoch": 0.3645, + "grad_norm": 4.8125, + "grad_norm_var": 0.230712890625, + "learning_rate": 4e-05, + "loss": 5.0056, + "loss/crossentropy": 2.6067728400230408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22825849428772926, + "step": 4374 + }, + { + "epoch": 0.36466666666666664, + "grad_norm": 5.1875, + "grad_norm_var": 0.22245686848958332, + "learning_rate": 4e-05, + "loss": 5.048, + "loss/crossentropy": 2.219629019498825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24455127492547035, + "step": 4376 + }, + { + "epoch": 0.36483333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.22948811848958334, + "learning_rate": 4e-05, + "loss": 4.9465, + "loss/crossentropy": 1.511473461985588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1544812899082899, + "step": 4378 + }, + { + "epoch": 0.365, + "grad_norm": 5.09375, + "grad_norm_var": 0.22342122395833333, + "learning_rate": 4e-05, + "loss": 5.6686, + "loss/crossentropy": 2.0744031071662903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21673217043280602, + "step": 4380 + }, + { + "epoch": 0.36516666666666664, + "grad_norm": 5.3125, + "grad_norm_var": 0.21171468098958332, + "learning_rate": 4e-05, + "loss": 5.515, + "loss/crossentropy": 1.9527825713157654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20629049092531204, + "step": 4382 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.144921875, + "learning_rate": 4e-05, + "loss": 5.0875, + "loss/crossentropy": 1.833628848195076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18492337316274643, + "step": 4384 + }, + { + "epoch": 0.3655, + "grad_norm": 4.78125, + "grad_norm_var": 0.03218994140625, + "learning_rate": 4e-05, + "loss": 4.8835, + "loss/crossentropy": 1.2569792494177818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13009061105549335, + "step": 4386 + }, + { + "epoch": 0.36566666666666664, + "grad_norm": 5.15625, + "grad_norm_var": 0.039839680989583334, + "learning_rate": 4e-05, + "loss": 5.0789, + "loss/crossentropy": 1.895428627729416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112501822412014, + "step": 4388 + }, + { + "epoch": 0.36583333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.06669514973958333, + "learning_rate": 4e-05, + "loss": 4.4833, + "loss/crossentropy": 1.2277986034750938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14113224111497402, + "step": 4390 + }, + { + "epoch": 0.366, + "grad_norm": 4.8125, + "grad_norm_var": 0.06640218098958334, + "learning_rate": 4e-05, + "loss": 4.8386, + "loss/crossentropy": 2.208162397146225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132505401968956, + "step": 4392 + }, + { + "epoch": 0.36616666666666664, + "grad_norm": 5.125, + "grad_norm_var": 0.05885009765625, + "learning_rate": 4e-05, + "loss": 4.8422, + "loss/crossentropy": 2.383415102958679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22132331132888794, + "step": 4394 + }, + { + "epoch": 0.36633333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.087109375, + "learning_rate": 4e-05, + "loss": 4.6894, + "loss/crossentropy": 2.077396295964718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17519673332571983, + "step": 4396 + }, + { + "epoch": 0.3665, + "grad_norm": 5.5, + "grad_norm_var": 0.11412760416666666, + "learning_rate": 4e-05, + "loss": 5.3017, + "loss/crossentropy": 2.461028575897217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349405586719513, + "step": 4398 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 4.90625, + "grad_norm_var": 0.11868489583333333, + "learning_rate": 4e-05, + "loss": 5.7916, + "loss/crossentropy": 2.4471404552459717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2270291931927204, + "step": 4400 + }, + { + "epoch": 0.36683333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.11689046223958334, + "learning_rate": 4e-05, + "loss": 5.0363, + "loss/crossentropy": 1.5369725078344345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969006545841694, + "step": 4402 + }, + { + "epoch": 0.367, + "grad_norm": 5.0, + "grad_norm_var": 0.10426025390625, + "learning_rate": 4e-05, + "loss": 5.6669, + "loss/crossentropy": 1.8737575113773346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2562069408595562, + "step": 4404 + }, + { + "epoch": 0.36716666666666664, + "grad_norm": 4.78125, + "grad_norm_var": 0.08183186848958333, + "learning_rate": 4e-05, + "loss": 4.642, + "loss/crossentropy": 1.9986793920397758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230439141392708, + "step": 4406 + }, + { + "epoch": 0.36733333333333335, + "grad_norm": 5.09375, + "grad_norm_var": 0.07941080729166666, + "learning_rate": 4e-05, + "loss": 4.8895, + "loss/crossentropy": 1.9708809554576874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21388357877731323, + "step": 4408 + }, + { + "epoch": 0.3675, + "grad_norm": 5.21875, + "grad_norm_var": 0.08229166666666667, + "learning_rate": 4e-05, + "loss": 5.0833, + "loss/crossentropy": 1.5469930842518806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15567244589328766, + "step": 4410 + }, + { + "epoch": 0.36766666666666664, + "grad_norm": 5.15625, + "grad_norm_var": 0.05133056640625, + "learning_rate": 4e-05, + "loss": 5.7301, + "loss/crossentropy": 2.365154951810837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022877149283886, + "step": 4412 + }, + { + "epoch": 0.36783333333333335, + "grad_norm": 4.78125, + "grad_norm_var": 0.028629557291666666, + "learning_rate": 4e-05, + "loss": 4.6753, + "loss/crossentropy": 1.574171431362629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17130100168287754, + "step": 4414 + }, + { + "epoch": 0.368, + "grad_norm": 4.90625, + "grad_norm_var": 0.03131510416666667, + "learning_rate": 4e-05, + "loss": 5.2293, + "loss/crossentropy": 2.4405595660209656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22489827498793602, + "step": 4416 + }, + { + "epoch": 0.36816666666666664, + "grad_norm": 4.875, + "grad_norm_var": 0.03043212890625, + "learning_rate": 4e-05, + "loss": 4.8175, + "loss/crossentropy": 2.0399864241480827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18748826161026955, + "step": 4418 + }, + { + "epoch": 0.36833333333333335, + "grad_norm": 5.25, + "grad_norm_var": 0.039351399739583334, + "learning_rate": 4e-05, + "loss": 4.3589, + "loss/crossentropy": 1.993159256875515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920241042971611, + "step": 4420 + }, + { + "epoch": 0.3685, + "grad_norm": 4.96875, + "grad_norm_var": 0.04407552083333333, + "learning_rate": 4e-05, + "loss": 5.0311, + "loss/crossentropy": 1.9237814024090767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1813476886600256, + "step": 4422 + }, + { + "epoch": 0.36866666666666664, + "grad_norm": 5.09375, + "grad_norm_var": 0.04596354166666667, + "learning_rate": 4e-05, + "loss": 5.0933, + "loss/crossentropy": 1.994587004184723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19352146796882153, + "step": 4424 + }, + { + "epoch": 0.36883333333333335, + "grad_norm": 5.125, + "grad_norm_var": 0.05901285807291667, + "learning_rate": 4e-05, + "loss": 5.3789, + "loss/crossentropy": 2.0247163474559784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23301972076296806, + "step": 4426 + }, + { + "epoch": 0.369, + "grad_norm": 5.03125, + "grad_norm_var": 0.06249593098958333, + "learning_rate": 4e-05, + "loss": 5.0111, + "loss/crossentropy": 2.0897902846336365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841061543673277, + "step": 4428 + }, + { + "epoch": 0.36916666666666664, + "grad_norm": 4.40625, + "grad_norm_var": 0.08587239583333334, + "learning_rate": 4e-05, + "loss": 4.759, + "loss/crossentropy": 2.043010212481022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18399441055953503, + "step": 4430 + }, + { + "epoch": 0.36933333333333335, + "grad_norm": 5.03125, + "grad_norm_var": 0.07978108723958334, + "learning_rate": 4e-05, + "loss": 4.6126, + "loss/crossentropy": 1.329975888133049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1455624159425497, + "step": 4432 + }, + { + "epoch": 0.3695, + "grad_norm": 5.21875, + "grad_norm_var": 0.0798828125, + "learning_rate": 4e-05, + "loss": 4.7537, + "loss/crossentropy": 1.5519905239343643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16253823041915894, + "step": 4434 + }, + { + "epoch": 0.36966666666666664, + "grad_norm": 5.375, + "grad_norm_var": 0.11686197916666667, + "learning_rate": 4e-05, + "loss": 5.2753, + "loss/crossentropy": 2.5547273755073547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22284472361207008, + "step": 4436 + }, + { + "epoch": 0.36983333333333335, + "grad_norm": 4.5625, + "grad_norm_var": 0.13437093098958333, + "learning_rate": 4e-05, + "loss": 4.6019, + "loss/crossentropy": 1.815646231174469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18009398505091667, + "step": 4438 + }, + { + "epoch": 0.37, + "grad_norm": 5.40625, + "grad_norm_var": 0.14185791015625, + "learning_rate": 4e-05, + "loss": 5.086, + "loss/crossentropy": 1.72943264991045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15384770184755325, + "step": 4440 + }, + { + "epoch": 0.37016666666666664, + "grad_norm": 5.375, + "grad_norm_var": 0.13355712890625, + "learning_rate": 4e-05, + "loss": 4.7693, + "loss/crossentropy": 2.069035105407238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1780005767941475, + "step": 4442 + }, + { + "epoch": 0.37033333333333335, + "grad_norm": 4.875, + "grad_norm_var": 0.13001302083333333, + "learning_rate": 4e-05, + "loss": 4.988, + "loss/crossentropy": 1.8963488563895226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16615681909024715, + "step": 4444 + }, + { + "epoch": 0.3705, + "grad_norm": 4.5, + "grad_norm_var": 0.11767171223958334, + "learning_rate": 4e-05, + "loss": 4.5752, + "loss/crossentropy": 1.6237219274044037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18100574985146523, + "step": 4446 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 5.6875, + "grad_norm_var": 0.17493489583333333, + "learning_rate": 4e-05, + "loss": 5.2411, + "loss/crossentropy": 1.7035855576395988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641939014196396, + "step": 4448 + }, + { + "epoch": 0.37083333333333335, + "grad_norm": 4.4375, + "grad_norm_var": 0.21393229166666666, + "learning_rate": 4e-05, + "loss": 5.0545, + "loss/crossentropy": 2.348470985889435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21899165213108063, + "step": 4450 + }, + { + "epoch": 0.371, + "grad_norm": 5.0, + "grad_norm_var": 0.161181640625, + "learning_rate": 4e-05, + "loss": 4.6793, + "loss/crossentropy": 2.0913305208086967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19459903612732887, + "step": 4452 + }, + { + "epoch": 0.37116666666666664, + "grad_norm": 4.40625, + "grad_norm_var": 0.17157796223958333, + "learning_rate": 4e-05, + "loss": 4.7403, + "loss/crossentropy": 2.2749286592006683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18400948494672775, + "step": 4454 + }, + { + "epoch": 0.37133333333333335, + "grad_norm": 4.53125, + "grad_norm_var": 0.16903889973958333, + "learning_rate": 4e-05, + "loss": 4.4669, + "loss/crossentropy": 2.459641069173813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20260080322623253, + "step": 4456 + }, + { + "epoch": 0.3715, + "grad_norm": 5.0625, + "grad_norm_var": 0.15579427083333333, + "learning_rate": 4e-05, + "loss": 5.0665, + "loss/crossentropy": 2.2930372953414917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102019041776657, + "step": 4458 + }, + { + "epoch": 0.37166666666666665, + "grad_norm": 5.0625, + "grad_norm_var": 0.16861979166666666, + "learning_rate": 4e-05, + "loss": 4.9879, + "loss/crossentropy": 2.230555236339569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22333020716905594, + "step": 4460 + }, + { + "epoch": 0.37183333333333335, + "grad_norm": 5.96875, + "grad_norm_var": 0.22862955729166667, + "learning_rate": 4e-05, + "loss": 5.0287, + "loss/crossentropy": 2.245620846748352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319910265505314, + "step": 4462 + }, + { + "epoch": 0.372, + "grad_norm": 4.65625, + "grad_norm_var": 0.14010009765625, + "learning_rate": 4e-05, + "loss": 4.7234, + "loss/crossentropy": 1.5217574685811996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15399678982794285, + "step": 4464 + }, + { + "epoch": 0.37216666666666665, + "grad_norm": 4.8125, + "grad_norm_var": 0.13970947265625, + "learning_rate": 4e-05, + "loss": 5.1308, + "loss/crossentropy": 1.8298010528087616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942594312131405, + "step": 4466 + }, + { + "epoch": 0.37233333333333335, + "grad_norm": 4.5625, + "grad_norm_var": 0.149462890625, + "learning_rate": 4e-05, + "loss": 4.8408, + "loss/crossentropy": 1.8616338968276978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17482497915625572, + "step": 4468 + }, + { + "epoch": 0.3725, + "grad_norm": 5.34375, + "grad_norm_var": 0.1765625, + "learning_rate": 4e-05, + "loss": 5.4852, + "loss/crossentropy": 2.097053498029709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076597884297371, + "step": 4470 + }, + { + "epoch": 0.37266666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.15944010416666668, + "learning_rate": 4e-05, + "loss": 4.3597, + "loss/crossentropy": 2.012524388730526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1794872172176838, + "step": 4472 + }, + { + "epoch": 0.37283333333333335, + "grad_norm": 5.0, + "grad_norm_var": 0.16184895833333332, + "learning_rate": 4e-05, + "loss": 4.8864, + "loss/crossentropy": 1.5553888604044914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15588123723864555, + "step": 4474 + }, + { + "epoch": 0.373, + "grad_norm": 5.03125, + "grad_norm_var": 0.13857014973958334, + "learning_rate": 4e-05, + "loss": 4.9816, + "loss/crossentropy": 2.187106668949127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085089460015297, + "step": 4476 + }, + { + "epoch": 0.37316666666666665, + "grad_norm": 4.875, + "grad_norm_var": 0.08456624348958333, + "learning_rate": 4e-05, + "loss": 4.8089, + "loss/crossentropy": 1.8687955513596535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18639723025262356, + "step": 4478 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 4.9375, + "grad_norm_var": 0.07550455729166666, + "learning_rate": 4e-05, + "loss": 4.8763, + "loss/crossentropy": 1.53606728464365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15891759283840656, + "step": 4480 + }, + { + "epoch": 0.3735, + "grad_norm": 4.6875, + "grad_norm_var": 0.073681640625, + "learning_rate": 4e-05, + "loss": 4.8269, + "loss/crossentropy": 2.033777177333832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21377325057983398, + "step": 4482 + }, + { + "epoch": 0.37366666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.05810139973958333, + "learning_rate": 4e-05, + "loss": 5.0661, + "loss/crossentropy": 2.221651792526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20696890354156494, + "step": 4484 + }, + { + "epoch": 0.37383333333333335, + "grad_norm": 4.40625, + "grad_norm_var": 0.056966145833333336, + "learning_rate": 4e-05, + "loss": 4.9653, + "loss/crossentropy": 2.0857246443629265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875857077538967, + "step": 4486 + }, + { + "epoch": 0.374, + "grad_norm": 4.84375, + "grad_norm_var": 0.05701497395833333, + "learning_rate": 4e-05, + "loss": 4.6214, + "loss/crossentropy": 1.5107336938381195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18940279260277748, + "step": 4488 + }, + { + "epoch": 0.37416666666666665, + "grad_norm": 5.875, + "grad_norm_var": 0.10110677083333333, + "learning_rate": 4e-05, + "loss": 5.4512, + "loss/crossentropy": 2.483080804347992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103271558880806, + "step": 4490 + }, + { + "epoch": 0.37433333333333335, + "grad_norm": 4.6875, + "grad_norm_var": 0.11083577473958334, + "learning_rate": 4e-05, + "loss": 5.0859, + "loss/crossentropy": 2.398630738258362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2321801409125328, + "step": 4492 + }, + { + "epoch": 0.3745, + "grad_norm": 5.09375, + "grad_norm_var": 0.112109375, + "learning_rate": 4e-05, + "loss": 5.1353, + "loss/crossentropy": 2.4930431246757507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20315857604146004, + "step": 4494 + }, + { + "epoch": 0.37466666666666665, + "grad_norm": 4.6875, + "grad_norm_var": 0.12353108723958334, + "learning_rate": 4e-05, + "loss": 4.8157, + "loss/crossentropy": 2.4085164666175842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192452736198902, + "step": 4496 + }, + { + "epoch": 0.37483333333333335, + "grad_norm": 5.03125, + "grad_norm_var": 0.11612955729166667, + "learning_rate": 4e-05, + "loss": 5.4999, + "loss/crossentropy": 2.2982660233974457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146676689386368, + "step": 4498 + }, + { + "epoch": 0.375, + "grad_norm": 5.0, + "grad_norm_var": 0.11431884765625, + "learning_rate": 4e-05, + "loss": 4.5637, + "loss/crossentropy": 1.7181595116853714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1607910357415676, + "step": 4500 + }, + { + "epoch": 0.37516666666666665, + "grad_norm": 4.78125, + "grad_norm_var": 0.08179931640625, + "learning_rate": 4e-05, + "loss": 4.6966, + "loss/crossentropy": 1.7414831668138504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897329930216074, + "step": 4502 + }, + { + "epoch": 0.37533333333333335, + "grad_norm": 5.21875, + "grad_norm_var": 0.08476155598958333, + "learning_rate": 4e-05, + "loss": 4.6112, + "loss/crossentropy": 1.4592458382248878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842550728470087, + "step": 4504 + }, + { + "epoch": 0.3755, + "grad_norm": 4.8125, + "grad_norm_var": 0.029150390625, + "learning_rate": 4e-05, + "loss": 4.7776, + "loss/crossentropy": 2.0442886650562286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18794919177889824, + "step": 4506 + }, + { + "epoch": 0.37566666666666665, + "grad_norm": 4.6875, + "grad_norm_var": 0.022509765625, + "learning_rate": 4e-05, + "loss": 4.4669, + "loss/crossentropy": 2.0219354778528214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18397306650877, + "step": 4508 + }, + { + "epoch": 0.37583333333333335, + "grad_norm": 4.6875, + "grad_norm_var": 0.023177083333333334, + "learning_rate": 4e-05, + "loss": 4.807, + "loss/crossentropy": 1.6848445013165474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17057411931455135, + "step": 4510 + }, + { + "epoch": 0.376, + "grad_norm": 4.875, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 4e-05, + "loss": 4.8326, + "loss/crossentropy": 1.9318490028381348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17598963528871536, + "step": 4512 + }, + { + "epoch": 0.37616666666666665, + "grad_norm": 4.96875, + "grad_norm_var": 0.018994140625, + "learning_rate": 4e-05, + "loss": 4.5025, + "loss/crossentropy": 2.4948436617851257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19910050928592682, + "step": 4514 + }, + { + "epoch": 0.37633333333333335, + "grad_norm": 4.9375, + "grad_norm_var": 0.02105712890625, + "learning_rate": 4e-05, + "loss": 5.4404, + "loss/crossentropy": 1.8748653531074524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18252076767385006, + "step": 4516 + }, + { + "epoch": 0.3765, + "grad_norm": 5.09375, + "grad_norm_var": 0.024983723958333332, + "learning_rate": 4e-05, + "loss": 5.1306, + "loss/crossentropy": 2.293698728084564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19080045446753502, + "step": 4518 + }, + { + "epoch": 0.37666666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.022721354166666666, + "learning_rate": 4e-05, + "loss": 5.2861, + "loss/crossentropy": 2.323317229747772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21780076250433922, + "step": 4520 + }, + { + "epoch": 0.37683333333333335, + "grad_norm": 4.875, + "grad_norm_var": 0.022981770833333335, + "learning_rate": 4e-05, + "loss": 4.9991, + "loss/crossentropy": 1.8763496354222298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1941455528140068, + "step": 4522 + }, + { + "epoch": 0.377, + "grad_norm": 4.59375, + "grad_norm_var": 0.031233723958333334, + "learning_rate": 4e-05, + "loss": 4.7514, + "loss/crossentropy": 1.469950720667839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15184346586465836, + "step": 4524 + }, + { + "epoch": 0.37716666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.03345947265625, + "learning_rate": 4e-05, + "loss": 4.7452, + "loss/crossentropy": 1.4754580333828926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14591082371771336, + "step": 4526 + }, + { + "epoch": 0.37733333333333335, + "grad_norm": 4.84375, + "grad_norm_var": 0.036962890625, + "learning_rate": 4e-05, + "loss": 4.8999, + "loss/crossentropy": 2.164148509502411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979655660688877, + "step": 4528 + }, + { + "epoch": 0.3775, + "grad_norm": 4.875, + "grad_norm_var": 0.038407389322916666, + "learning_rate": 4e-05, + "loss": 4.9113, + "loss/crossentropy": 2.2905170917510986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23950210586190224, + "step": 4530 + }, + { + "epoch": 0.37766666666666665, + "grad_norm": 4.46875, + "grad_norm_var": 0.05904541015625, + "learning_rate": 4e-05, + "loss": 4.8391, + "loss/crossentropy": 1.711202435195446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18126659281551838, + "step": 4532 + }, + { + "epoch": 0.37783333333333335, + "grad_norm": 4.96875, + "grad_norm_var": 0.05487874348958333, + "learning_rate": 4e-05, + "loss": 5.2967, + "loss/crossentropy": 2.019508332014084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19983846321702003, + "step": 4534 + }, + { + "epoch": 0.378, + "grad_norm": 4.875, + "grad_norm_var": 0.04934895833333333, + "learning_rate": 4e-05, + "loss": 4.6832, + "loss/crossentropy": 1.4596013128757477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1501648034900427, + "step": 4536 + }, + { + "epoch": 0.37816666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.053369140625, + "learning_rate": 4e-05, + "loss": 4.688, + "loss/crossentropy": 1.7826803848147392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20163408666849136, + "step": 4538 + }, + { + "epoch": 0.37833333333333335, + "grad_norm": 5.03125, + "grad_norm_var": 0.06555582682291666, + "learning_rate": 4e-05, + "loss": 5.2718, + "loss/crossentropy": 2.1941796839237213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30321093276143074, + "step": 4540 + }, + { + "epoch": 0.3785, + "grad_norm": 4.625, + "grad_norm_var": 0.066015625, + "learning_rate": 4e-05, + "loss": 4.7823, + "loss/crossentropy": 1.5937781259417534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16691001877188683, + "step": 4542 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.067041015625, + "learning_rate": 4e-05, + "loss": 5.1226, + "loss/crossentropy": 1.317307323217392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17711128666996956, + "step": 4544 + }, + { + "epoch": 0.37883333333333336, + "grad_norm": 4.71875, + "grad_norm_var": 0.07118733723958333, + "learning_rate": 4e-05, + "loss": 5.1718, + "loss/crossentropy": 2.36332568526268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21637556329369545, + "step": 4546 + }, + { + "epoch": 0.379, + "grad_norm": 5.375, + "grad_norm_var": 0.060807291666666666, + "learning_rate": 4e-05, + "loss": 4.8774, + "loss/crossentropy": 1.8394339084625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21467747539281845, + "step": 4548 + }, + { + "epoch": 0.37916666666666665, + "grad_norm": 4.75, + "grad_norm_var": 0.06847330729166666, + "learning_rate": 4e-05, + "loss": 5.0993, + "loss/crossentropy": 1.8647487238049507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19115696288645267, + "step": 4550 + }, + { + "epoch": 0.37933333333333336, + "grad_norm": 5.0625, + "grad_norm_var": 0.09465738932291666, + "learning_rate": 4e-05, + "loss": 5.0467, + "loss/crossentropy": 2.344729393720627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2191801331937313, + "step": 4552 + }, + { + "epoch": 0.3795, + "grad_norm": 5.15625, + "grad_norm_var": 0.09576416015625, + "learning_rate": 4e-05, + "loss": 5.0544, + "loss/crossentropy": 0.9832122027873993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.126436535269022, + "step": 4554 + }, + { + "epoch": 0.37966666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.08508707682291666, + "learning_rate": 4e-05, + "loss": 4.9261, + "loss/crossentropy": 2.408473551273346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232109010219574, + "step": 4556 + }, + { + "epoch": 0.37983333333333336, + "grad_norm": 5.0625, + "grad_norm_var": 0.06808268229166667, + "learning_rate": 4e-05, + "loss": 4.6928, + "loss/crossentropy": 1.7633072063326836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19468731619417667, + "step": 4558 + }, + { + "epoch": 0.38, + "grad_norm": 5.03125, + "grad_norm_var": 0.059098307291666666, + "learning_rate": 4e-05, + "loss": 4.7743, + "loss/crossentropy": 1.9241390004754066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015049308538437, + "step": 4560 + }, + { + "epoch": 0.38016666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.050374348958333336, + "learning_rate": 4e-05, + "loss": 4.8489, + "loss/crossentropy": 1.9497752413153648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18487541005015373, + "step": 4562 + }, + { + "epoch": 0.38033333333333336, + "grad_norm": 5.25, + "grad_norm_var": 0.04163004557291667, + "learning_rate": 4e-05, + "loss": 5.0498, + "loss/crossentropy": 2.279202699661255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2351878061890602, + "step": 4564 + }, + { + "epoch": 0.3805, + "grad_norm": 4.8125, + "grad_norm_var": 0.038525390625, + "learning_rate": 4e-05, + "loss": 4.9243, + "loss/crossentropy": 2.036419540643692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20054011419415474, + "step": 4566 + }, + { + "epoch": 0.38066666666666665, + "grad_norm": 5.09375, + "grad_norm_var": 0.018880208333333332, + "learning_rate": 4e-05, + "loss": 5.4103, + "loss/crossentropy": 2.19651135802269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140839770436287, + "step": 4568 + }, + { + "epoch": 0.38083333333333336, + "grad_norm": 4.4375, + "grad_norm_var": 0.033980305989583334, + "learning_rate": 4e-05, + "loss": 4.4335, + "loss/crossentropy": 2.5283347964286804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21709014475345612, + "step": 4570 + }, + { + "epoch": 0.381, + "grad_norm": 4.65625, + "grad_norm_var": 0.04034830729166667, + "learning_rate": 4e-05, + "loss": 4.62, + "loss/crossentropy": 1.9986247941851616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19273697584867477, + "step": 4572 + }, + { + "epoch": 0.38116666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.04950764973958333, + "learning_rate": 4e-05, + "loss": 5.1786, + "loss/crossentropy": 1.9769628196954727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18022861890494823, + "step": 4574 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 5.5, + "grad_norm_var": 0.06734619140625, + "learning_rate": 4e-05, + "loss": 5.084, + "loss/crossentropy": 1.9706605598330498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18343539349734783, + "step": 4576 + }, + { + "epoch": 0.3815, + "grad_norm": 5.65625, + "grad_norm_var": 0.15592041015625, + "learning_rate": 4e-05, + "loss": 4.723, + "loss/crossentropy": 2.2917512953281403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216607965528965, + "step": 4578 + }, + { + "epoch": 0.38166666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.15623372395833332, + "learning_rate": 4e-05, + "loss": 4.8429, + "loss/crossentropy": 2.4264036417007446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21467092260718346, + "step": 4580 + }, + { + "epoch": 0.38183333333333336, + "grad_norm": 5.4375, + "grad_norm_var": 0.15740559895833334, + "learning_rate": 4e-05, + "loss": 5.2089, + "loss/crossentropy": 2.411838263273239, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21722927317023277, + "step": 4582 + }, + { + "epoch": 0.382, + "grad_norm": 4.75, + "grad_norm_var": 0.16604410807291667, + "learning_rate": 4e-05, + "loss": 4.9028, + "loss/crossentropy": 1.5032013952732086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.140468442812562, + "step": 4584 + }, + { + "epoch": 0.38216666666666665, + "grad_norm": 4.65625, + "grad_norm_var": 0.14618733723958333, + "learning_rate": 4e-05, + "loss": 4.6503, + "loss/crossentropy": 1.8413489237427711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861533373594284, + "step": 4586 + }, + { + "epoch": 0.38233333333333336, + "grad_norm": 5.375, + "grad_norm_var": 0.12428385416666667, + "learning_rate": 4e-05, + "loss": 5.4413, + "loss/crossentropy": 1.6172087043523788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1677580364048481, + "step": 4588 + }, + { + "epoch": 0.3825, + "grad_norm": 5.21875, + "grad_norm_var": 0.13123372395833333, + "learning_rate": 4e-05, + "loss": 4.3317, + "loss/crossentropy": 1.9612976610660553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23077058792114258, + "step": 4590 + }, + { + "epoch": 0.38266666666666665, + "grad_norm": 5.25, + "grad_norm_var": 0.14269205729166667, + "learning_rate": 4e-05, + "loss": 4.9979, + "loss/crossentropy": 1.6219684183597565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15546699054539204, + "step": 4592 + }, + { + "epoch": 0.38283333333333336, + "grad_norm": 4.875, + "grad_norm_var": 0.063525390625, + "learning_rate": 4e-05, + "loss": 4.5898, + "loss/crossentropy": 1.8427765145897865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1906105801463127, + "step": 4594 + }, + { + "epoch": 0.383, + "grad_norm": 4.5625, + "grad_norm_var": 0.08398030598958334, + "learning_rate": 4e-05, + "loss": 4.5646, + "loss/crossentropy": 2.483305275440216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23165292665362358, + "step": 4596 + }, + { + "epoch": 0.38316666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.07870686848958333, + "learning_rate": 4e-05, + "loss": 4.6242, + "loss/crossentropy": 1.164937436580658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.149330236017704, + "step": 4598 + }, + { + "epoch": 0.38333333333333336, + "grad_norm": 4.90625, + "grad_norm_var": 0.08125, + "learning_rate": 4e-05, + "loss": 4.7377, + "loss/crossentropy": 1.1906629279255867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14596578665077686, + "step": 4600 + }, + { + "epoch": 0.3835, + "grad_norm": 5.1875, + "grad_norm_var": 0.7831868489583333, + "learning_rate": 4e-05, + "loss": 4.8185, + "loss/crossentropy": 1.5672737285494804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17864598706364632, + "step": 4602 + }, + { + "epoch": 0.38366666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.78707275390625, + "learning_rate": 4e-05, + "loss": 5.3415, + "loss/crossentropy": 1.97494575381279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22634240612387657, + "step": 4604 + }, + { + "epoch": 0.38383333333333336, + "grad_norm": 4.25, + "grad_norm_var": 0.8302693684895833, + "learning_rate": 4e-05, + "loss": 4.8177, + "loss/crossentropy": 2.1882776021957397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004379890859127, + "step": 4606 + }, + { + "epoch": 0.384, + "grad_norm": 5.0, + "grad_norm_var": 0.8123697916666667, + "learning_rate": 4e-05, + "loss": 5.3293, + "loss/crossentropy": 2.0698306038975716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1867370456457138, + "step": 4608 + }, + { + "epoch": 0.38416666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.82799072265625, + "learning_rate": 4e-05, + "loss": 4.5079, + "loss/crossentropy": 2.5524789094924927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229669027030468, + "step": 4610 + }, + { + "epoch": 0.38433333333333336, + "grad_norm": 4.875, + "grad_norm_var": 0.80015869140625, + "learning_rate": 4e-05, + "loss": 4.7616, + "loss/crossentropy": 2.66834419965744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175808660686016, + "step": 4612 + }, + { + "epoch": 0.3845, + "grad_norm": 4.71875, + "grad_norm_var": 0.7801717122395834, + "learning_rate": 4e-05, + "loss": 5.358, + "loss/crossentropy": 1.8729632422327995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17812915332615376, + "step": 4614 + }, + { + "epoch": 0.38466666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.7616495768229167, + "learning_rate": 4e-05, + "loss": 4.9371, + "loss/crossentropy": 1.8376353681087494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18147564306855202, + "step": 4616 + }, + { + "epoch": 0.38483333333333336, + "grad_norm": 4.75, + "grad_norm_var": 0.12734375, + "learning_rate": 4e-05, + "loss": 4.8466, + "loss/crossentropy": 2.1042263209819794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988103687763214, + "step": 4618 + }, + { + "epoch": 0.385, + "grad_norm": 4.9375, + "grad_norm_var": 0.10914306640625, + "learning_rate": 4e-05, + "loss": 4.4813, + "loss/crossentropy": 1.6850282698869705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18490208312869072, + "step": 4620 + }, + { + "epoch": 0.38516666666666666, + "grad_norm": 5.59375, + "grad_norm_var": 0.08850504557291666, + "learning_rate": 4e-05, + "loss": 5.1136, + "loss/crossentropy": 2.3153931200504303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22859248518943787, + "step": 4622 + }, + { + "epoch": 0.38533333333333336, + "grad_norm": 5.15625, + "grad_norm_var": 0.09101155598958334, + "learning_rate": 4e-05, + "loss": 4.6581, + "loss/crossentropy": 2.230543076992035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22239073365926743, + "step": 4624 + }, + { + "epoch": 0.3855, + "grad_norm": 4.9375, + "grad_norm_var": 0.07851155598958333, + "learning_rate": 4e-05, + "loss": 5.0687, + "loss/crossentropy": 1.9034294560551643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17507881671190262, + "step": 4626 + }, + { + "epoch": 0.38566666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.08332926432291667, + "learning_rate": 4e-05, + "loss": 5.2866, + "loss/crossentropy": 1.9101981818675995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21490610018372536, + "step": 4628 + }, + { + "epoch": 0.3858333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.072119140625, + "learning_rate": 4e-05, + "loss": 5.136, + "loss/crossentropy": 2.074092000722885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18353088945150375, + "step": 4630 + }, + { + "epoch": 0.386, + "grad_norm": 5.3125, + "grad_norm_var": 0.06419270833333333, + "learning_rate": 4e-05, + "loss": 5.4292, + "loss/crossentropy": 2.1906376481056213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22475622966885567, + "step": 4632 + }, + { + "epoch": 0.38616666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.05526936848958333, + "learning_rate": 4e-05, + "loss": 4.9876, + "loss/crossentropy": 2.148811638355255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19886000081896782, + "step": 4634 + }, + { + "epoch": 0.3863333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 4e-05, + "loss": 4.5391, + "loss/crossentropy": 2.1843119859695435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001435048878193, + "step": 4636 + }, + { + "epoch": 0.3865, + "grad_norm": 4.46875, + "grad_norm_var": 0.0609375, + "learning_rate": 4e-05, + "loss": 4.2873, + "loss/crossentropy": 1.3234648406505585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17355255037546158, + "step": 4638 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.07340087890625, + "learning_rate": 4e-05, + "loss": 4.7845, + "loss/crossentropy": 2.5076652467250824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20514310523867607, + "step": 4640 + }, + { + "epoch": 0.3868333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.07643229166666667, + "learning_rate": 4e-05, + "loss": 5.0443, + "loss/crossentropy": 1.0731484815478325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1514381840825081, + "step": 4642 + }, + { + "epoch": 0.387, + "grad_norm": 5.125, + "grad_norm_var": 0.07844645182291667, + "learning_rate": 4e-05, + "loss": 4.8815, + "loss/crossentropy": 1.5910435616970062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16784923523664474, + "step": 4644 + }, + { + "epoch": 0.38716666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.08284098307291667, + "learning_rate": 4e-05, + "loss": 4.9735, + "loss/crossentropy": 2.524174451828003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103761024773121, + "step": 4646 + }, + { + "epoch": 0.3873333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.048563639322916664, + "learning_rate": 4e-05, + "loss": 5.0155, + "loss/crossentropy": 1.8493199050426483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17533520981669426, + "step": 4648 + }, + { + "epoch": 0.3875, + "grad_norm": 4.59375, + "grad_norm_var": 0.049605305989583334, + "learning_rate": 4e-05, + "loss": 5.0395, + "loss/crossentropy": 2.0180707573890686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20294680073857307, + "step": 4650 + }, + { + "epoch": 0.38766666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.049605305989583334, + "learning_rate": 4e-05, + "loss": 5.0705, + "loss/crossentropy": 2.2379818856716156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366400510072708, + "step": 4652 + }, + { + "epoch": 0.3878333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.038736979166666664, + "learning_rate": 4e-05, + "loss": 4.5548, + "loss/crossentropy": 2.384194016456604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23891232535243034, + "step": 4654 + }, + { + "epoch": 0.388, + "grad_norm": 5.375, + "grad_norm_var": 0.044775390625, + "learning_rate": 4e-05, + "loss": 4.8009, + "loss/crossentropy": 1.7017896994948387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889229454100132, + "step": 4656 + }, + { + "epoch": 0.38816666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.05585530598958333, + "learning_rate": 4e-05, + "loss": 4.4232, + "loss/crossentropy": 1.9795458614826202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17039254680275917, + "step": 4658 + }, + { + "epoch": 0.3883333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.07613525390625, + "learning_rate": 4e-05, + "loss": 4.7452, + "loss/crossentropy": 1.8883531391620636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24201414734125137, + "step": 4660 + }, + { + "epoch": 0.3885, + "grad_norm": 4.875, + "grad_norm_var": 0.074853515625, + "learning_rate": 4e-05, + "loss": 5.0235, + "loss/crossentropy": 1.8122221529483795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21144957840442657, + "step": 4662 + }, + { + "epoch": 0.38866666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.08553059895833333, + "learning_rate": 4e-05, + "loss": 4.4121, + "loss/crossentropy": 1.4162417724728584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14702197536826134, + "step": 4664 + }, + { + "epoch": 0.3888333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.11549479166666667, + "learning_rate": 4e-05, + "loss": 4.2224, + "loss/crossentropy": 1.2500296533107758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12859608978033066, + "step": 4666 + }, + { + "epoch": 0.389, + "grad_norm": 5.03125, + "grad_norm_var": 0.117822265625, + "learning_rate": 4e-05, + "loss": 4.4014, + "loss/crossentropy": 1.477287195622921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14969945698976517, + "step": 4668 + }, + { + "epoch": 0.38916666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.120947265625, + "learning_rate": 4e-05, + "loss": 4.7961, + "loss/crossentropy": 2.572759300470352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2284352257847786, + "step": 4670 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.11588541666666667, + "learning_rate": 4e-05, + "loss": 4.6865, + "loss/crossentropy": 2.2431783378124237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18131668493151665, + "step": 4672 + }, + { + "epoch": 0.3895, + "grad_norm": 4.96875, + "grad_norm_var": 0.13084309895833332, + "learning_rate": 4e-05, + "loss": 4.7876, + "loss/crossentropy": 2.0269206687808037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18316753953695297, + "step": 4674 + }, + { + "epoch": 0.38966666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.107421875, + "learning_rate": 4e-05, + "loss": 5.4435, + "loss/crossentropy": 2.206951141357422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20157692208886147, + "step": 4676 + }, + { + "epoch": 0.3898333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.11022135416666666, + "learning_rate": 4e-05, + "loss": 4.936, + "loss/crossentropy": 1.9816325455904007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19688685610890388, + "step": 4678 + }, + { + "epoch": 0.39, + "grad_norm": 5.15625, + "grad_norm_var": 0.11027018229166667, + "learning_rate": 4e-05, + "loss": 5.7048, + "loss/crossentropy": 2.4815438985824585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25391989201307297, + "step": 4680 + }, + { + "epoch": 0.39016666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.1109375, + "learning_rate": 4e-05, + "loss": 5.1386, + "loss/crossentropy": 2.527748703956604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069994568824768, + "step": 4682 + }, + { + "epoch": 0.3903333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.10948893229166666, + "learning_rate": 4e-05, + "loss": 4.8226, + "loss/crossentropy": 2.2085874676704407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19113614037632942, + "step": 4684 + }, + { + "epoch": 0.3905, + "grad_norm": 5.03125, + "grad_norm_var": 0.105322265625, + "learning_rate": 4e-05, + "loss": 5.4503, + "loss/crossentropy": 2.017712041735649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863031480461359, + "step": 4686 + }, + { + "epoch": 0.39066666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.10198160807291666, + "learning_rate": 4e-05, + "loss": 4.6944, + "loss/crossentropy": 1.8249566927552223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17401202395558357, + "step": 4688 + }, + { + "epoch": 0.3908333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.07532145182291666, + "learning_rate": 4e-05, + "loss": 4.4237, + "loss/crossentropy": 2.3632700443267822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909812018275261, + "step": 4690 + }, + { + "epoch": 0.391, + "grad_norm": 4.5625, + "grad_norm_var": 0.07483317057291666, + "learning_rate": 4e-05, + "loss": 4.3248, + "loss/crossentropy": 0.6874695122241974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.09018013998866081, + "step": 4692 + }, + { + "epoch": 0.39116666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.07688395182291667, + "learning_rate": 4e-05, + "loss": 5.2917, + "loss/crossentropy": 1.7731168419122696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825818531215191, + "step": 4694 + }, + { + "epoch": 0.3913333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.07511393229166667, + "learning_rate": 4e-05, + "loss": 5.2002, + "loss/crossentropy": 1.7397530004382133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17103871516883373, + "step": 4696 + }, + { + "epoch": 0.3915, + "grad_norm": 4.84375, + "grad_norm_var": 0.0484375, + "learning_rate": 4e-05, + "loss": 4.7215, + "loss/crossentropy": 1.8667742162942886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19179138913750648, + "step": 4698 + }, + { + "epoch": 0.39166666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.04937744140625, + "learning_rate": 4e-05, + "loss": 4.8689, + "loss/crossentropy": 2.058789312839508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24368786439299583, + "step": 4700 + }, + { + "epoch": 0.3918333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.048046875, + "learning_rate": 4e-05, + "loss": 4.7386, + "loss/crossentropy": 1.992527186870575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063431590795517, + "step": 4702 + }, + { + "epoch": 0.392, + "grad_norm": 4.6875, + "grad_norm_var": 0.05038655598958333, + "learning_rate": 4e-05, + "loss": 4.3603, + "loss/crossentropy": 1.689400039613247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18329921551048756, + "step": 4704 + }, + { + "epoch": 0.39216666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.048811848958333334, + "learning_rate": 4e-05, + "loss": 4.6414, + "loss/crossentropy": 2.0435714572668076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18891741707921028, + "step": 4706 + }, + { + "epoch": 0.3923333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.04748942057291667, + "learning_rate": 4e-05, + "loss": 4.8324, + "loss/crossentropy": 1.8614301830530167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17250847071409225, + "step": 4708 + }, + { + "epoch": 0.3925, + "grad_norm": 4.71875, + "grad_norm_var": 0.03553059895833333, + "learning_rate": 4e-05, + "loss": 5.0679, + "loss/crossentropy": 2.0357573106884956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1831020563840866, + "step": 4710 + }, + { + "epoch": 0.39266666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.036909993489583334, + "learning_rate": 4e-05, + "loss": 4.6938, + "loss/crossentropy": 1.6935219168663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851331926882267, + "step": 4712 + }, + { + "epoch": 0.3928333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.04680582682291667, + "learning_rate": 4e-05, + "loss": 4.9553, + "loss/crossentropy": 1.4612598046660423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15830958634614944, + "step": 4714 + }, + { + "epoch": 0.393, + "grad_norm": 5.15625, + "grad_norm_var": 0.056233723958333336, + "learning_rate": 4e-05, + "loss": 5.5202, + "loss/crossentropy": 2.0164549723267555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17883064597845078, + "step": 4716 + }, + { + "epoch": 0.39316666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.0845703125, + "learning_rate": 4e-05, + "loss": 5.1372, + "loss/crossentropy": 2.097052186727524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877942606806755, + "step": 4718 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.07454020182291667, + "learning_rate": 4e-05, + "loss": 4.4806, + "loss/crossentropy": 2.1500546038150787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19928832724690437, + "step": 4720 + }, + { + "epoch": 0.3935, + "grad_norm": 5.6875, + "grad_norm_var": 0.10998942057291666, + "learning_rate": 4e-05, + "loss": 5.2306, + "loss/crossentropy": 1.7852617651224136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16534100286662579, + "step": 4722 + }, + { + "epoch": 0.39366666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.11151936848958334, + "learning_rate": 4e-05, + "loss": 4.6223, + "loss/crossentropy": 1.8593629002571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19395272806286812, + "step": 4724 + }, + { + "epoch": 0.3938333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.10857747395833334, + "learning_rate": 4e-05, + "loss": 4.8391, + "loss/crossentropy": 2.176317922770977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18766920641064644, + "step": 4726 + }, + { + "epoch": 0.394, + "grad_norm": 4.9375, + "grad_norm_var": 0.10728759765625, + "learning_rate": 4e-05, + "loss": 4.7861, + "loss/crossentropy": 1.5290814563632011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16091121174395084, + "step": 4728 + }, + { + "epoch": 0.39416666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.07991129557291667, + "learning_rate": 4e-05, + "loss": 5.1133, + "loss/crossentropy": 2.491378366947174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20404277369379997, + "step": 4730 + }, + { + "epoch": 0.3943333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.101416015625, + "learning_rate": 4e-05, + "loss": 5.0591, + "loss/crossentropy": 2.1622492969036102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20248603075742722, + "step": 4732 + }, + { + "epoch": 0.3945, + "grad_norm": 4.75, + "grad_norm_var": 0.09254150390625, + "learning_rate": 4e-05, + "loss": 4.4277, + "loss/crossentropy": 2.1794531047344208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22922635823488235, + "step": 4734 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.08995768229166666, + "learning_rate": 4e-05, + "loss": 5.177, + "loss/crossentropy": 1.7416007369756699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.181478600949049, + "step": 4736 + }, + { + "epoch": 0.3948333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.07745768229166666, + "learning_rate": 4e-05, + "loss": 5.3699, + "loss/crossentropy": 1.9947757422924042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080913558602333, + "step": 4738 + }, + { + "epoch": 0.395, + "grad_norm": 4.5625, + "grad_norm_var": 0.07239583333333334, + "learning_rate": 4e-05, + "loss": 4.9546, + "loss/crossentropy": 1.2601947486400604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1411193422973156, + "step": 4740 + }, + { + "epoch": 0.39516666666666667, + "grad_norm": 5.9375, + "grad_norm_var": 0.11790364583333333, + "learning_rate": 4e-05, + "loss": 5.0002, + "loss/crossentropy": 1.9214930534362793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973959505558014, + "step": 4742 + }, + { + "epoch": 0.3953333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.13326416015625, + "learning_rate": 4e-05, + "loss": 4.72, + "loss/crossentropy": 2.6403688788414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059182822704315, + "step": 4744 + }, + { + "epoch": 0.3955, + "grad_norm": 4.9375, + "grad_norm_var": 0.148291015625, + "learning_rate": 4e-05, + "loss": 4.8914, + "loss/crossentropy": 2.0313423722982407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909470409154892, + "step": 4746 + }, + { + "epoch": 0.39566666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.14836832682291667, + "learning_rate": 4e-05, + "loss": 4.0028, + "loss/crossentropy": 0.390984907746315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.08585469983518124, + "step": 4748 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.147509765625, + "learning_rate": 4e-05, + "loss": 5.5398, + "loss/crossentropy": 1.977037712931633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18233729153871536, + "step": 4750 + }, + { + "epoch": 0.396, + "grad_norm": 5.40625, + "grad_norm_var": 0.25857747395833336, + "learning_rate": 4e-05, + "loss": 4.7413, + "loss/crossentropy": 2.0886579751968384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19855722039937973, + "step": 4752 + }, + { + "epoch": 0.39616666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.27277018229166666, + "learning_rate": 4e-05, + "loss": 4.6178, + "loss/crossentropy": 2.5692251324653625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22474710270762444, + "step": 4754 + }, + { + "epoch": 0.3963333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.258837890625, + "learning_rate": 4e-05, + "loss": 5.2294, + "loss/crossentropy": 2.397335708141327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20136011764407158, + "step": 4756 + }, + { + "epoch": 0.3965, + "grad_norm": 4.78125, + "grad_norm_var": 0.19625244140625, + "learning_rate": 4e-05, + "loss": 4.885, + "loss/crossentropy": 1.47114946693182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1387049611657858, + "step": 4758 + }, + { + "epoch": 0.39666666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.18866780598958333, + "learning_rate": 4e-05, + "loss": 4.9398, + "loss/crossentropy": 1.8775576800107956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18835894763469696, + "step": 4760 + }, + { + "epoch": 0.3968333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.18726806640625, + "learning_rate": 4e-05, + "loss": 5.1243, + "loss/crossentropy": 1.7907000631093979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16836080700159073, + "step": 4762 + }, + { + "epoch": 0.397, + "grad_norm": 4.84375, + "grad_norm_var": 0.17512613932291668, + "learning_rate": 4e-05, + "loss": 5.0837, + "loss/crossentropy": 2.0826119109988213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17814225889742374, + "step": 4764 + }, + { + "epoch": 0.39716666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.17003580729166667, + "learning_rate": 4e-05, + "loss": 5.1873, + "loss/crossentropy": 2.191085457801819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033637948334217, + "step": 4766 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.0205078125, + "learning_rate": 4e-05, + "loss": 4.4774, + "loss/crossentropy": 2.2374271750450134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20148679986596107, + "step": 4768 + }, + { + "epoch": 0.3975, + "grad_norm": 4.625, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 4e-05, + "loss": 4.9577, + "loss/crossentropy": 1.713077962398529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17870762944221497, + "step": 4770 + }, + { + "epoch": 0.39766666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.019205729166666668, + "learning_rate": 4e-05, + "loss": 4.7891, + "loss/crossentropy": 2.1501103043556213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18045742809772491, + "step": 4772 + }, + { + "epoch": 0.3978333333333333, + "grad_norm": 6.0, + "grad_norm_var": 0.10403238932291667, + "learning_rate": 4e-05, + "loss": 4.8548, + "loss/crossentropy": 1.9973932579159737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19423209875822067, + "step": 4774 + }, + { + "epoch": 0.398, + "grad_norm": 4.8125, + "grad_norm_var": 0.11031494140625, + "learning_rate": 4e-05, + "loss": 4.9899, + "loss/crossentropy": 1.975264847278595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1853540502488613, + "step": 4776 + }, + { + "epoch": 0.39816666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.111328125, + "learning_rate": 4e-05, + "loss": 4.9549, + "loss/crossentropy": 1.631221704185009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1606139950454235, + "step": 4778 + }, + { + "epoch": 0.3983333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.11077067057291666, + "learning_rate": 4e-05, + "loss": 5.0013, + "loss/crossentropy": 1.5004914924502373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1420083437114954, + "step": 4780 + }, + { + "epoch": 0.3985, + "grad_norm": 4.90625, + "grad_norm_var": 0.11002197265625, + "learning_rate": 4e-05, + "loss": 4.9629, + "loss/crossentropy": 2.5754368901252747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215489249676466, + "step": 4782 + }, + { + "epoch": 0.39866666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.11923421223958333, + "learning_rate": 4e-05, + "loss": 4.7496, + "loss/crossentropy": 1.8387674316763878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17710744589567184, + "step": 4784 + }, + { + "epoch": 0.3988333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.10998942057291666, + "learning_rate": 4e-05, + "loss": 5.1065, + "loss/crossentropy": 1.951240062713623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17940466478466988, + "step": 4786 + }, + { + "epoch": 0.399, + "grad_norm": 5.0, + "grad_norm_var": 0.10175374348958334, + "learning_rate": 4e-05, + "loss": 5.3268, + "loss/crossentropy": 2.650111675262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154219187796116, + "step": 4788 + }, + { + "epoch": 0.39916666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.036572265625, + "learning_rate": 4e-05, + "loss": 4.9731, + "loss/crossentropy": 1.7309883832931519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170034870505333, + "step": 4790 + }, + { + "epoch": 0.3993333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 4e-05, + "loss": 4.9897, + "loss/crossentropy": 1.7699553072452545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1885082796216011, + "step": 4792 + }, + { + "epoch": 0.3995, + "grad_norm": 4.96875, + "grad_norm_var": 0.03306884765625, + "learning_rate": 4e-05, + "loss": 5.3317, + "loss/crossentropy": 1.8295550793409348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17074094712734222, + "step": 4794 + }, + { + "epoch": 0.39966666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.04583333333333333, + "learning_rate": 4e-05, + "loss": 5.2288, + "loss/crossentropy": 2.3379410803318024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21407023817300797, + "step": 4796 + }, + { + "epoch": 0.3998333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.04892171223958333, + "learning_rate": 4e-05, + "loss": 4.4721, + "loss/crossentropy": 1.9539672955870628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17216168344020844, + "step": 4798 + }, + { + "epoch": 0.4, + "grad_norm": 4.90625, + "grad_norm_var": 0.20284830729166667, + "learning_rate": 4e-05, + "loss": 5.193, + "loss/crossentropy": 2.7041677832603455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110525727272034, + "step": 4800 + }, + { + "epoch": 0.40016666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.2056640625, + "learning_rate": 4e-05, + "loss": 4.7426, + "loss/crossentropy": 1.9566970467567444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17532207444310188, + "step": 4802 + }, + { + "epoch": 0.4003333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.209228515625, + "learning_rate": 4e-05, + "loss": 5.0816, + "loss/crossentropy": 1.6096001043915749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16063898243010044, + "step": 4804 + }, + { + "epoch": 0.4005, + "grad_norm": 4.78125, + "grad_norm_var": 0.212109375, + "learning_rate": 4e-05, + "loss": 4.8431, + "loss/crossentropy": 1.724914450198412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16081226454116404, + "step": 4806 + }, + { + "epoch": 0.40066666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.20445556640625, + "learning_rate": 4e-05, + "loss": 5.7713, + "loss/crossentropy": 1.7849983498454094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18764329701662064, + "step": 4808 + }, + { + "epoch": 0.4008333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.20894775390625, + "learning_rate": 4e-05, + "loss": 5.8588, + "loss/crossentropy": 2.3519081473350525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424896642565727, + "step": 4810 + }, + { + "epoch": 0.401, + "grad_norm": 5.34375, + "grad_norm_var": 0.19498697916666666, + "learning_rate": 4e-05, + "loss": 5.2778, + "loss/crossentropy": 1.5925240516662598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17957336083054543, + "step": 4812 + }, + { + "epoch": 0.40116666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.19231363932291667, + "learning_rate": 4e-05, + "loss": 5.3563, + "loss/crossentropy": 2.3263401687145233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23666761443018913, + "step": 4814 + }, + { + "epoch": 0.4013333333333333, + "grad_norm": 10.0, + "grad_norm_var": 1.5945963541666666, + "learning_rate": 4e-05, + "loss": 4.823, + "loss/crossentropy": 3.0336874127388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21647527068853378, + "step": 4816 + }, + { + "epoch": 0.4015, + "grad_norm": 5.09375, + "grad_norm_var": 1.5526692708333334, + "learning_rate": 4e-05, + "loss": 4.741, + "loss/crossentropy": 2.064057379961014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196297973394394, + "step": 4818 + }, + { + "epoch": 0.40166666666666667, + "grad_norm": 4.75, + "grad_norm_var": 1.57222900390625, + "learning_rate": 4e-05, + "loss": 4.8622, + "loss/crossentropy": 2.474276751279831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21422725915908813, + "step": 4820 + }, + { + "epoch": 0.4018333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 1.5520792643229167, + "learning_rate": 4e-05, + "loss": 5.3988, + "loss/crossentropy": 2.4372578859329224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20125412195920944, + "step": 4822 + }, + { + "epoch": 0.402, + "grad_norm": 5.09375, + "grad_norm_var": 1.5538899739583334, + "learning_rate": 4e-05, + "loss": 5.0631, + "loss/crossentropy": 2.34754741191864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127290964126587, + "step": 4824 + }, + { + "epoch": 0.4021666666666667, + "grad_norm": 5.0, + "grad_norm_var": 1.602978515625, + "learning_rate": 4e-05, + "loss": 4.8608, + "loss/crossentropy": 1.7592350095510483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19576997309923172, + "step": 4826 + }, + { + "epoch": 0.4023333333333333, + "grad_norm": 5.125, + "grad_norm_var": 1.5969685872395833, + "learning_rate": 4e-05, + "loss": 5.0661, + "loss/crossentropy": 2.473145544528961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341373898088932, + "step": 4828 + }, + { + "epoch": 0.4025, + "grad_norm": 5.03125, + "grad_norm_var": 1.6091796875, + "learning_rate": 4e-05, + "loss": 4.7009, + "loss/crossentropy": 0.9298921674489975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1077762320637703, + "step": 4830 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.052408854166666664, + "learning_rate": 4e-05, + "loss": 4.5144, + "loss/crossentropy": 1.3825726583600044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15989060886204243, + "step": 4832 + }, + { + "epoch": 0.4028333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.05169270833333333, + "learning_rate": 4e-05, + "loss": 4.9293, + "loss/crossentropy": 1.9143131226301193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857363097369671, + "step": 4834 + }, + { + "epoch": 0.403, + "grad_norm": 4.875, + "grad_norm_var": 0.0419921875, + "learning_rate": 4e-05, + "loss": 4.791, + "loss/crossentropy": 1.947355903685093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18314639292657375, + "step": 4836 + }, + { + "epoch": 0.4031666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.037353515625, + "learning_rate": 4e-05, + "loss": 4.9574, + "loss/crossentropy": 2.1336475014686584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22201551869511604, + "step": 4838 + }, + { + "epoch": 0.4033333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.042431640625, + "learning_rate": 4e-05, + "loss": 4.5785, + "loss/crossentropy": 2.036346584558487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21780623495578766, + "step": 4840 + }, + { + "epoch": 0.4035, + "grad_norm": 5.03125, + "grad_norm_var": 0.02633056640625, + "learning_rate": 4e-05, + "loss": 4.9651, + "loss/crossentropy": 2.0523361265659332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20310677587985992, + "step": 4842 + }, + { + "epoch": 0.4036666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.0302734375, + "learning_rate": 4e-05, + "loss": 5.1269, + "loss/crossentropy": 1.9599568769335747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1813591830432415, + "step": 4844 + }, + { + "epoch": 0.4038333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.03277587890625, + "learning_rate": 4e-05, + "loss": 4.9369, + "loss/crossentropy": 2.3991090655326843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21284585446119308, + "step": 4846 + }, + { + "epoch": 0.404, + "grad_norm": 4.875, + "grad_norm_var": 0.034749348958333336, + "learning_rate": 4e-05, + "loss": 4.5958, + "loss/crossentropy": 1.9806862249970436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810116320848465, + "step": 4848 + }, + { + "epoch": 0.4041666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.04934488932291667, + "learning_rate": 4e-05, + "loss": 4.6824, + "loss/crossentropy": 2.1363211572170258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16956374421715736, + "step": 4850 + }, + { + "epoch": 0.4043333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.05347900390625, + "learning_rate": 4e-05, + "loss": 4.6704, + "loss/crossentropy": 2.7161881923675537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152242660522461, + "step": 4852 + }, + { + "epoch": 0.4045, + "grad_norm": 4.71875, + "grad_norm_var": 0.052567545572916666, + "learning_rate": 4e-05, + "loss": 4.915, + "loss/crossentropy": 2.2441403567790985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20361649617552757, + "step": 4854 + }, + { + "epoch": 0.4046666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.058854166666666666, + "learning_rate": 4e-05, + "loss": 5.3845, + "loss/crossentropy": 2.0099719166755676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300902009010315, + "step": 4856 + }, + { + "epoch": 0.4048333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.05705973307291667, + "learning_rate": 4e-05, + "loss": 4.9307, + "loss/crossentropy": 2.7886196970939636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21208975464105606, + "step": 4858 + }, + { + "epoch": 0.405, + "grad_norm": 5.0, + "grad_norm_var": 0.05188802083333333, + "learning_rate": 4e-05, + "loss": 4.9334, + "loss/crossentropy": 1.954047828912735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21413618698716164, + "step": 4860 + }, + { + "epoch": 0.4051666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05286051432291667, + "learning_rate": 4e-05, + "loss": 5.0889, + "loss/crossentropy": 2.416406363248825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19541754946112633, + "step": 4862 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.05859375, + "learning_rate": 4e-05, + "loss": 4.8194, + "loss/crossentropy": 2.1043947488069534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232957020401955, + "step": 4864 + }, + { + "epoch": 0.4055, + "grad_norm": 4.625, + "grad_norm_var": 0.0431640625, + "learning_rate": 4e-05, + "loss": 5.0973, + "loss/crossentropy": 1.5513064786791801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18353967368602753, + "step": 4866 + }, + { + "epoch": 0.4056666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.045145670572916664, + "learning_rate": 4e-05, + "loss": 4.8616, + "loss/crossentropy": 2.0387043803930283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1888214349746704, + "step": 4868 + }, + { + "epoch": 0.4058333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.042822265625, + "learning_rate": 4e-05, + "loss": 5.3838, + "loss/crossentropy": 2.5968635082244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22734135761857033, + "step": 4870 + }, + { + "epoch": 0.406, + "grad_norm": 5.59375, + "grad_norm_var": 0.077197265625, + "learning_rate": 4e-05, + "loss": 5.6008, + "loss/crossentropy": 2.491960108280182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108830250799656, + "step": 4872 + }, + { + "epoch": 0.4061666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.07515869140625, + "learning_rate": 4e-05, + "loss": 4.5119, + "loss/crossentropy": 1.766836240887642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16757315024733543, + "step": 4874 + }, + { + "epoch": 0.4063333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.07476806640625, + "learning_rate": 4e-05, + "loss": 5.1455, + "loss/crossentropy": 2.424330711364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23356298729777336, + "step": 4876 + }, + { + "epoch": 0.4065, + "grad_norm": 4.9375, + "grad_norm_var": 0.07401936848958333, + "learning_rate": 4e-05, + "loss": 4.817, + "loss/crossentropy": 1.9086792171001434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21842263266444206, + "step": 4878 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.066015625, + "learning_rate": 4e-05, + "loss": 5.3566, + "loss/crossentropy": 2.3481759428977966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666499763727188, + "step": 4880 + }, + { + "epoch": 0.4068333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.07565104166666667, + "learning_rate": 4e-05, + "loss": 4.8577, + "loss/crossentropy": 1.8519111350178719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18224726244807243, + "step": 4882 + }, + { + "epoch": 0.407, + "grad_norm": 4.78125, + "grad_norm_var": 0.0791015625, + "learning_rate": 4e-05, + "loss": 5.2082, + "loss/crossentropy": 2.2147536873817444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171857625246048, + "step": 4884 + }, + { + "epoch": 0.4071666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.07926025390625, + "learning_rate": 4e-05, + "loss": 4.8338, + "loss/crossentropy": 2.537692904472351, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117961779236794, + "step": 4886 + }, + { + "epoch": 0.4073333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.03982747395833333, + "learning_rate": 4e-05, + "loss": 4.9522, + "loss/crossentropy": 2.4090050756931305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2254425659775734, + "step": 4888 + }, + { + "epoch": 0.4075, + "grad_norm": 4.9375, + "grad_norm_var": 0.03957926432291667, + "learning_rate": 4e-05, + "loss": 4.3431, + "loss/crossentropy": 1.156929299235344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14492111094295979, + "step": 4890 + }, + { + "epoch": 0.4076666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.044661458333333334, + "learning_rate": 4e-05, + "loss": 5.3081, + "loss/crossentropy": 2.1810811161994934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20051919296383858, + "step": 4892 + }, + { + "epoch": 0.4078333333333333, + "grad_norm": 5.96875, + "grad_norm_var": 0.12571207682291666, + "learning_rate": 4e-05, + "loss": 4.8635, + "loss/crossentropy": 2.264392375946045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23612874373793602, + "step": 4894 + }, + { + "epoch": 0.408, + "grad_norm": 5.25, + "grad_norm_var": 0.12161458333333333, + "learning_rate": 4e-05, + "loss": 5.0033, + "loss/crossentropy": 1.631929226219654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20521026104688644, + "step": 4896 + }, + { + "epoch": 0.4081666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.15950113932291668, + "learning_rate": 4e-05, + "loss": 5.1642, + "loss/crossentropy": 2.431085526943207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308473400771618, + "step": 4898 + }, + { + "epoch": 0.4083333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.20800374348958334, + "learning_rate": 4e-05, + "loss": 5.5606, + "loss/crossentropy": 2.380655586719513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22495479881763458, + "step": 4900 + }, + { + "epoch": 0.4085, + "grad_norm": 5.1875, + "grad_norm_var": 0.23761393229166666, + "learning_rate": 4e-05, + "loss": 4.318, + "loss/crossentropy": 2.0739801824092865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816551871597767, + "step": 4902 + }, + { + "epoch": 0.4086666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.24060872395833333, + "learning_rate": 4e-05, + "loss": 4.9015, + "loss/crossentropy": 1.5711579322814941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034294791519642, + "step": 4904 + }, + { + "epoch": 0.4088333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.25816650390625, + "learning_rate": 4e-05, + "loss": 4.7011, + "loss/crossentropy": 1.5462888479232788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15808816254138947, + "step": 4906 + }, + { + "epoch": 0.409, + "grad_norm": 4.59375, + "grad_norm_var": 0.2498046875, + "learning_rate": 4e-05, + "loss": 4.6972, + "loss/crossentropy": 1.5701627358794212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15472500771284103, + "step": 4908 + }, + { + "epoch": 0.4091666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.18088785807291666, + "learning_rate": 4e-05, + "loss": 4.4919, + "loss/crossentropy": 2.258408010005951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2214752584695816, + "step": 4910 + }, + { + "epoch": 0.4093333333333333, + "grad_norm": 6.125, + "grad_norm_var": 0.26545817057291665, + "learning_rate": 4e-05, + "loss": 5.6132, + "loss/crossentropy": 2.2918245792388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20702063292264938, + "step": 4912 + }, + { + "epoch": 0.4095, + "grad_norm": 4.46875, + "grad_norm_var": 0.23922119140625, + "learning_rate": 4e-05, + "loss": 4.9267, + "loss/crossentropy": 2.0654823556542397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17589706182479858, + "step": 4914 + }, + { + "epoch": 0.4096666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.18759358723958333, + "learning_rate": 4e-05, + "loss": 4.8291, + "loss/crossentropy": 1.9735463857650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892772577702999, + "step": 4916 + }, + { + "epoch": 0.4098333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.1677734375, + "learning_rate": 4e-05, + "loss": 5.1753, + "loss/crossentropy": 1.4942561835050583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14400891959667206, + "step": 4918 + }, + { + "epoch": 0.41, + "grad_norm": 5.34375, + "grad_norm_var": 0.185546875, + "learning_rate": 4e-05, + "loss": 5.0549, + "loss/crossentropy": 1.840515322983265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930188685655594, + "step": 4920 + }, + { + "epoch": 0.4101666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.17565104166666667, + "learning_rate": 4e-05, + "loss": 4.8569, + "loss/crossentropy": 1.399222806096077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20139185898005962, + "step": 4922 + }, + { + "epoch": 0.4103333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.16612955729166667, + "learning_rate": 4e-05, + "loss": 5.0999, + "loss/crossentropy": 1.889309674501419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1681222766637802, + "step": 4924 + }, + { + "epoch": 0.4105, + "grad_norm": 4.6875, + "grad_norm_var": 0.16809488932291666, + "learning_rate": 4e-05, + "loss": 4.6713, + "loss/crossentropy": 2.352425366640091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22923307865858078, + "step": 4926 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.06617431640625, + "learning_rate": 4e-05, + "loss": 5.1291, + "loss/crossentropy": 2.288997530937195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21416041254997253, + "step": 4928 + }, + { + "epoch": 0.41083333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.05806884765625, + "learning_rate": 4e-05, + "loss": 4.5875, + "loss/crossentropy": 2.1992684602737427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20887574926018715, + "step": 4930 + }, + { + "epoch": 0.411, + "grad_norm": 5.28125, + "grad_norm_var": 0.06334228515625, + "learning_rate": 4e-05, + "loss": 4.7654, + "loss/crossentropy": 2.058065950870514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20995530486106873, + "step": 4932 + }, + { + "epoch": 0.4111666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.059619140625, + "learning_rate": 4e-05, + "loss": 4.8428, + "loss/crossentropy": 1.7666442766785622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17294050380587578, + "step": 4934 + }, + { + "epoch": 0.41133333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.03240559895833333, + "learning_rate": 4e-05, + "loss": 4.9497, + "loss/crossentropy": 1.412701353430748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15248983725905418, + "step": 4936 + }, + { + "epoch": 0.4115, + "grad_norm": 4.71875, + "grad_norm_var": 0.029911295572916666, + "learning_rate": 4e-05, + "loss": 4.8646, + "loss/crossentropy": 1.9869374781847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339880347251892, + "step": 4938 + }, + { + "epoch": 0.4116666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.03248291015625, + "learning_rate": 4e-05, + "loss": 4.4866, + "loss/crossentropy": 1.2516977936029434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13892163336277008, + "step": 4940 + }, + { + "epoch": 0.41183333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.0404296875, + "learning_rate": 4e-05, + "loss": 5.4393, + "loss/crossentropy": 2.3766159415245056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21843568980693817, + "step": 4942 + }, + { + "epoch": 0.412, + "grad_norm": 4.78125, + "grad_norm_var": 0.04959309895833333, + "learning_rate": 4e-05, + "loss": 5.1146, + "loss/crossentropy": 1.8096503615379333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20148269459605217, + "step": 4944 + }, + { + "epoch": 0.4121666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.044661458333333334, + "learning_rate": 4e-05, + "loss": 4.7362, + "loss/crossentropy": 2.301755279302597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105144038796425, + "step": 4946 + }, + { + "epoch": 0.41233333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.03873291015625, + "learning_rate": 4e-05, + "loss": 5.2578, + "loss/crossentropy": 2.1293097138404846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22269337251782417, + "step": 4948 + }, + { + "epoch": 0.4125, + "grad_norm": 5.0, + "grad_norm_var": 0.036458333333333336, + "learning_rate": 4e-05, + "loss": 4.7433, + "loss/crossentropy": 0.9624597281217575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11805179342627525, + "step": 4950 + }, + { + "epoch": 0.4126666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.0375, + "learning_rate": 4e-05, + "loss": 5.0177, + "loss/crossentropy": 2.7045233845710754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20849882066249847, + "step": 4952 + }, + { + "epoch": 0.41283333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.040478515625, + "learning_rate": 4e-05, + "loss": 4.8001, + "loss/crossentropy": 2.376677691936493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22974925860762596, + "step": 4954 + }, + { + "epoch": 0.413, + "grad_norm": 4.875, + "grad_norm_var": 0.04052327473958333, + "learning_rate": 4e-05, + "loss": 5.1012, + "loss/crossentropy": 1.7594347819685936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18802068009972572, + "step": 4956 + }, + { + "epoch": 0.4131666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.03551025390625, + "learning_rate": 4e-05, + "loss": 5.3854, + "loss/crossentropy": 2.0615014731884003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038946896791458, + "step": 4958 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.03592122395833333, + "learning_rate": 4e-05, + "loss": 4.8311, + "loss/crossentropy": 1.5626792162656784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15392129682004452, + "step": 4960 + }, + { + "epoch": 0.4135, + "grad_norm": 4.90625, + "grad_norm_var": 0.03958333333333333, + "learning_rate": 4e-05, + "loss": 5.3433, + "loss/crossentropy": 2.549463391304016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21993155032396317, + "step": 4962 + }, + { + "epoch": 0.4136666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.053120930989583336, + "learning_rate": 4e-05, + "loss": 4.5375, + "loss/crossentropy": 1.9942337423563004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19395725801587105, + "step": 4964 + }, + { + "epoch": 0.41383333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.06834309895833333, + "learning_rate": 4e-05, + "loss": 4.7104, + "loss/crossentropy": 1.3048944622278214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14355522580444813, + "step": 4966 + }, + { + "epoch": 0.414, + "grad_norm": 4.75, + "grad_norm_var": 0.0759765625, + "learning_rate": 4e-05, + "loss": 4.9089, + "loss/crossentropy": 2.365107297897339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21021969243884087, + "step": 4968 + }, + { + "epoch": 0.4141666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.08472900390625, + "learning_rate": 4e-05, + "loss": 5.2194, + "loss/crossentropy": 1.9617774188518524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20810309797525406, + "step": 4970 + }, + { + "epoch": 0.41433333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.08136393229166666, + "learning_rate": 4e-05, + "loss": 4.9017, + "loss/crossentropy": 2.406768888235092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964864060282707, + "step": 4972 + }, + { + "epoch": 0.4145, + "grad_norm": 4.65625, + "grad_norm_var": 0.07849934895833334, + "learning_rate": 4e-05, + "loss": 5.0075, + "loss/crossentropy": 1.9754514545202255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20375796221196651, + "step": 4974 + }, + { + "epoch": 0.4146666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.07923177083333334, + "learning_rate": 4e-05, + "loss": 4.6253, + "loss/crossentropy": 0.8821175321936607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12686316668987274, + "step": 4976 + }, + { + "epoch": 0.41483333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.09016927083333333, + "learning_rate": 4e-05, + "loss": 4.4754, + "loss/crossentropy": 1.8768207728862762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18468411453068256, + "step": 4978 + }, + { + "epoch": 0.415, + "grad_norm": 5.15625, + "grad_norm_var": 0.09446207682291667, + "learning_rate": 4e-05, + "loss": 4.8406, + "loss/crossentropy": 1.8561868369579315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19827888533473015, + "step": 4980 + }, + { + "epoch": 0.4151666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.08046468098958333, + "learning_rate": 4e-05, + "loss": 5.1432, + "loss/crossentropy": 2.181670993566513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666886448860168, + "step": 4982 + }, + { + "epoch": 0.41533333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.07532552083333334, + "learning_rate": 4e-05, + "loss": 4.812, + "loss/crossentropy": 1.7170398011803627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950143948197365, + "step": 4984 + }, + { + "epoch": 0.4155, + "grad_norm": 5.1875, + "grad_norm_var": 0.07823893229166666, + "learning_rate": 4e-05, + "loss": 5.0194, + "loss/crossentropy": 1.7972271963953972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1704818457365036, + "step": 4986 + }, + { + "epoch": 0.4156666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.08424479166666667, + "learning_rate": 4e-05, + "loss": 5.1628, + "loss/crossentropy": 2.5009243488311768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20981622114777565, + "step": 4988 + }, + { + "epoch": 0.41583333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.08765869140625, + "learning_rate": 4e-05, + "loss": 4.6857, + "loss/crossentropy": 1.990450143814087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19208272732794285, + "step": 4990 + }, + { + "epoch": 0.416, + "grad_norm": 5.125, + "grad_norm_var": 0.087744140625, + "learning_rate": 4e-05, + "loss": 4.9785, + "loss/crossentropy": 1.6779858320951462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16968519985675812, + "step": 4992 + }, + { + "epoch": 0.4161666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.06444905598958334, + "learning_rate": 4e-05, + "loss": 4.9131, + "loss/crossentropy": 2.3368648886680603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20479774847626686, + "step": 4994 + }, + { + "epoch": 0.41633333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 4e-05, + "loss": 4.7284, + "loss/crossentropy": 2.203623980283737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22645379230380058, + "step": 4996 + }, + { + "epoch": 0.4165, + "grad_norm": 4.84375, + "grad_norm_var": 0.06614583333333333, + "learning_rate": 4e-05, + "loss": 4.5558, + "loss/crossentropy": 1.6457276046276093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14439713582396507, + "step": 4998 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.060221354166666664, + "learning_rate": 4e-05, + "loss": 5.3545, + "loss/crossentropy": 1.8733460828661919, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17062750086188316, + "step": 5000 + }, + { + "epoch": 0.41683333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.043192545572916664, + "learning_rate": 4e-05, + "loss": 4.5375, + "loss/crossentropy": 1.8087796047329903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17719511315226555, + "step": 5002 + }, + { + "epoch": 0.417, + "grad_norm": 4.84375, + "grad_norm_var": 0.02578125, + "learning_rate": 4e-05, + "loss": 5.0403, + "loss/crossentropy": 2.1366709172725677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200287990272045, + "step": 5004 + }, + { + "epoch": 0.4171666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 4e-05, + "loss": 4.5753, + "loss/crossentropy": 2.0192334055900574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032482735812664, + "step": 5006 + }, + { + "epoch": 0.41733333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.03951822916666667, + "learning_rate": 4e-05, + "loss": 5.5531, + "loss/crossentropy": 1.7714089825749397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18016327545046806, + "step": 5008 + }, + { + "epoch": 0.4175, + "grad_norm": 5.40625, + "grad_norm_var": 0.051981608072916664, + "learning_rate": 4e-05, + "loss": 5.0002, + "loss/crossentropy": 2.296230137348175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2626774534583092, + "step": 5010 + }, + { + "epoch": 0.4176666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.05383707682291667, + "learning_rate": 4e-05, + "loss": 5.2479, + "loss/crossentropy": 2.222260892391205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999911367893219, + "step": 5012 + }, + { + "epoch": 0.41783333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.064697265625, + "learning_rate": 4e-05, + "loss": 4.8299, + "loss/crossentropy": 1.8547895401716232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18344702012836933, + "step": 5014 + }, + { + "epoch": 0.418, + "grad_norm": 4.6875, + "grad_norm_var": 0.07916259765625, + "learning_rate": 4e-05, + "loss": 3.9861, + "loss/crossentropy": 1.537365846335888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15916940197348595, + "step": 5016 + }, + { + "epoch": 0.4181666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.115478515625, + "learning_rate": 4e-05, + "loss": 5.4498, + "loss/crossentropy": 2.237378031015396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22286979109048843, + "step": 5018 + }, + { + "epoch": 0.41833333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.12050374348958333, + "learning_rate": 4e-05, + "loss": 5.3501, + "loss/crossentropy": 2.154662251472473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220684964209795, + "step": 5020 + }, + { + "epoch": 0.4185, + "grad_norm": 5.09375, + "grad_norm_var": 0.11252848307291667, + "learning_rate": 4e-05, + "loss": 5.2871, + "loss/crossentropy": 1.9252085089683533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19202982634305954, + "step": 5022 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.11041259765625, + "learning_rate": 4e-05, + "loss": 4.6119, + "loss/crossentropy": 1.7248478308320045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18302454613149166, + "step": 5024 + }, + { + "epoch": 0.41883333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.09351806640625, + "learning_rate": 4e-05, + "loss": 4.9789, + "loss/crossentropy": 2.413954019546509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19942022860050201, + "step": 5026 + }, + { + "epoch": 0.419, + "grad_norm": 4.84375, + "grad_norm_var": 0.09270833333333334, + "learning_rate": 4e-05, + "loss": 4.5155, + "loss/crossentropy": 2.0690543353557587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17765720933675766, + "step": 5028 + }, + { + "epoch": 0.4191666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.08153889973958334, + "learning_rate": 4e-05, + "loss": 4.9016, + "loss/crossentropy": 1.9299027398228645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17881011590361595, + "step": 5030 + }, + { + "epoch": 0.41933333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.05956624348958333, + "learning_rate": 4e-05, + "loss": 4.945, + "loss/crossentropy": 1.9287557378411293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16480084136128426, + "step": 5032 + }, + { + "epoch": 0.4195, + "grad_norm": 4.9375, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 4e-05, + "loss": 4.9642, + "loss/crossentropy": 1.560925267636776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15121719613671303, + "step": 5034 + }, + { + "epoch": 0.4196666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.019645182291666667, + "learning_rate": 4e-05, + "loss": 4.846, + "loss/crossentropy": 1.8814395442605019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752840355038643, + "step": 5036 + }, + { + "epoch": 0.41983333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.020829264322916666, + "learning_rate": 4e-05, + "loss": 5.4615, + "loss/crossentropy": 2.11568945646286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19466029852628708, + "step": 5038 + }, + { + "epoch": 0.42, + "grad_norm": 5.125, + "grad_norm_var": 0.019820149739583334, + "learning_rate": 4e-05, + "loss": 4.7966, + "loss/crossentropy": 2.5833939909934998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22409406676888466, + "step": 5040 + }, + { + "epoch": 0.4201666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.022526041666666666, + "learning_rate": 4e-05, + "loss": 4.8328, + "loss/crossentropy": 1.853715144097805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956741362810135, + "step": 5042 + }, + { + "epoch": 0.42033333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.021207682291666665, + "learning_rate": 4e-05, + "loss": 5.0333, + "loss/crossentropy": 1.6487743258476257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1757977306842804, + "step": 5044 + }, + { + "epoch": 0.4205, + "grad_norm": 4.6875, + "grad_norm_var": 0.023942057291666666, + "learning_rate": 4e-05, + "loss": 4.7196, + "loss/crossentropy": 2.2509495317935944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21562692895531654, + "step": 5046 + }, + { + "epoch": 0.4206666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.03268229166666667, + "learning_rate": 4e-05, + "loss": 5.0596, + "loss/crossentropy": 2.125421464443207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19918570667505264, + "step": 5048 + }, + { + "epoch": 0.42083333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.03553059895833333, + "learning_rate": 4e-05, + "loss": 4.6523, + "loss/crossentropy": 1.7011590600013733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1660672463476658, + "step": 5050 + }, + { + "epoch": 0.421, + "grad_norm": 5.40625, + "grad_norm_var": 0.054036458333333336, + "learning_rate": 4e-05, + "loss": 5.1967, + "loss/crossentropy": 2.2367068231105804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934006493538618, + "step": 5052 + }, + { + "epoch": 0.4211666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.055562337239583336, + "learning_rate": 4e-05, + "loss": 5.5412, + "loss/crossentropy": 2.018110543489456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19330649077892303, + "step": 5054 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.062174479166666664, + "learning_rate": 4e-05, + "loss": 4.7395, + "loss/crossentropy": 1.915867231786251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698799423873425, + "step": 5056 + }, + { + "epoch": 0.4215, + "grad_norm": 5.0625, + "grad_norm_var": 0.07667643229166667, + "learning_rate": 4e-05, + "loss": 5.4262, + "loss/crossentropy": 2.3704627454280853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22046199068427086, + "step": 5058 + }, + { + "epoch": 0.4216666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.08043212890625, + "learning_rate": 4e-05, + "loss": 4.8319, + "loss/crossentropy": 2.6193134784698486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22060347348451614, + "step": 5060 + }, + { + "epoch": 0.42183333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.07978108723958334, + "learning_rate": 4e-05, + "loss": 4.5304, + "loss/crossentropy": 2.0035160332918167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18313829228281975, + "step": 5062 + }, + { + "epoch": 0.422, + "grad_norm": 4.6875, + "grad_norm_var": 0.08489583333333334, + "learning_rate": 4e-05, + "loss": 5.3497, + "loss/crossentropy": 2.353265404701233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23167630657553673, + "step": 5064 + }, + { + "epoch": 0.4221666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.07928059895833334, + "learning_rate": 4e-05, + "loss": 4.7515, + "loss/crossentropy": 2.228081852197647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21101577952504158, + "step": 5066 + }, + { + "epoch": 0.42233333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.069384765625, + "learning_rate": 4e-05, + "loss": 5.2329, + "loss/crossentropy": 2.2022290229797363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20957914367318153, + "step": 5068 + }, + { + "epoch": 0.4225, + "grad_norm": 4.71875, + "grad_norm_var": 0.07408854166666666, + "learning_rate": 4e-05, + "loss": 4.5227, + "loss/crossentropy": 1.3012079074978828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1404709778726101, + "step": 5070 + }, + { + "epoch": 0.4226666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.06925455729166667, + "learning_rate": 4e-05, + "loss": 4.2628, + "loss/crossentropy": 1.5303220078349113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17120682075619698, + "step": 5072 + }, + { + "epoch": 0.42283333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.05364176432291667, + "learning_rate": 4e-05, + "loss": 4.8328, + "loss/crossentropy": 2.4758930802345276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22224238142371178, + "step": 5074 + }, + { + "epoch": 0.423, + "grad_norm": 5.03125, + "grad_norm_var": 0.05194905598958333, + "learning_rate": 4e-05, + "loss": 5.3414, + "loss/crossentropy": 2.2223449051380157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18280233442783356, + "step": 5076 + }, + { + "epoch": 0.4231666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.054150390625, + "learning_rate": 4e-05, + "loss": 5.0858, + "loss/crossentropy": 2.2318738102912903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102554365992546, + "step": 5078 + }, + { + "epoch": 0.42333333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.04156494140625, + "learning_rate": 4e-05, + "loss": 4.8605, + "loss/crossentropy": 1.872434914112091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843944452702999, + "step": 5080 + }, + { + "epoch": 0.4235, + "grad_norm": 5.09375, + "grad_norm_var": 0.045817057291666664, + "learning_rate": 4e-05, + "loss": 4.7855, + "loss/crossentropy": 2.177910089492798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981346271932125, + "step": 5082 + }, + { + "epoch": 0.4236666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 4e-05, + "loss": 4.8335, + "loss/crossentropy": 1.09625893086195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17088285274803638, + "step": 5084 + }, + { + "epoch": 0.42383333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.03638916015625, + "learning_rate": 4e-05, + "loss": 5.8526, + "loss/crossentropy": 1.9430483132600784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1950901709496975, + "step": 5086 + }, + { + "epoch": 0.424, + "grad_norm": 4.53125, + "grad_norm_var": 0.06304931640625, + "learning_rate": 4e-05, + "loss": 4.7281, + "loss/crossentropy": 1.635485090315342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18363632634282112, + "step": 5088 + }, + { + "epoch": 0.4241666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.0609375, + "learning_rate": 4e-05, + "loss": 4.9117, + "loss/crossentropy": 1.2679156586527824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16498573496937752, + "step": 5090 + }, + { + "epoch": 0.42433333333333334, + "grad_norm": 5.84375, + "grad_norm_var": 8.680452473958333, + "learning_rate": 4e-05, + "loss": 5.3373, + "loss/crossentropy": 2.0983648747205734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19808345846831799, + "step": 5092 + }, + { + "epoch": 0.4245, + "grad_norm": 5.09375, + "grad_norm_var": 8.694124348958333, + "learning_rate": 4e-05, + "loss": 4.9142, + "loss/crossentropy": 2.0484844595193863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18929221853613853, + "step": 5094 + }, + { + "epoch": 0.4246666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 8.717041015625, + "learning_rate": 4e-05, + "loss": 5.1393, + "loss/crossentropy": 1.9642380774021149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21084356680512428, + "step": 5096 + }, + { + "epoch": 0.42483333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 22.81953125, + "learning_rate": 4e-05, + "loss": 4.9222, + "loss/crossentropy": 1.8336669728159904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18257286585867405, + "step": 5098 + }, + { + "epoch": 0.425, + "grad_norm": 5.15625, + "grad_norm_var": 22.73375244140625, + "learning_rate": 4e-05, + "loss": 4.986, + "loss/crossentropy": 2.136046200990677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19740736857056618, + "step": 5100 + }, + { + "epoch": 0.4251666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 22.855497233072917, + "learning_rate": 4e-05, + "loss": 5.3193, + "loss/crossentropy": 1.5806643292307854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1525644864886999, + "step": 5102 + }, + { + "epoch": 0.42533333333333334, + "grad_norm": 4.75, + "grad_norm_var": 22.573030598958333, + "learning_rate": 4e-05, + "loss": 4.8681, + "loss/crossentropy": 1.7995612248778343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969994120299816, + "step": 5104 + }, + { + "epoch": 0.4255, + "grad_norm": 5.09375, + "grad_norm_var": 22.454541015625, + "learning_rate": 4e-05, + "loss": 4.6807, + "loss/crossentropy": 1.8234473168849945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21108638122677803, + "step": 5106 + }, + { + "epoch": 0.4256666666666667, + "grad_norm": 5.125, + "grad_norm_var": 15.586962890625, + "learning_rate": 4e-05, + "loss": 5.1477, + "loss/crossentropy": 2.558901071548462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21013934910297394, + "step": 5108 + }, + { + "epoch": 0.42583333333333334, + "grad_norm": 5.375, + "grad_norm_var": 15.538212076822917, + "learning_rate": 4e-05, + "loss": 4.8511, + "loss/crossentropy": 1.5620201379060745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16541289910674095, + "step": 5110 + }, + { + "epoch": 0.426, + "grad_norm": 4.75, + "grad_norm_var": 15.622782389322916, + "learning_rate": 4e-05, + "loss": 4.8213, + "loss/crossentropy": 2.680659532546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138008326292038, + "step": 5112 + }, + { + "epoch": 0.4261666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.06482747395833334, + "learning_rate": 4e-05, + "loss": 4.7686, + "loss/crossentropy": 1.9398740530014038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988565307110548, + "step": 5114 + }, + { + "epoch": 0.42633333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.0630859375, + "learning_rate": 4e-05, + "loss": 5.5553, + "loss/crossentropy": 2.090523838996887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20620887726545334, + "step": 5116 + }, + { + "epoch": 0.4265, + "grad_norm": 6.21875, + "grad_norm_var": 0.15349934895833334, + "learning_rate": 4e-05, + "loss": 4.8973, + "loss/crossentropy": 1.4458559900522232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23421939089894295, + "step": 5118 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.15455322265625, + "learning_rate": 4e-05, + "loss": 4.678, + "loss/crossentropy": 1.8547161743044853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17052751407027245, + "step": 5120 + }, + { + "epoch": 0.42683333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.15501302083333332, + "learning_rate": 4e-05, + "loss": 4.5453, + "loss/crossentropy": 1.6307990327477455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15353327803313732, + "step": 5122 + }, + { + "epoch": 0.427, + "grad_norm": 4.6875, + "grad_norm_var": 0.17401936848958333, + "learning_rate": 4e-05, + "loss": 4.6401, + "loss/crossentropy": 2.0444329008460045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21640102565288544, + "step": 5124 + }, + { + "epoch": 0.42716666666666664, + "grad_norm": 4.96875, + "grad_norm_var": 0.1701171875, + "learning_rate": 4e-05, + "loss": 5.3645, + "loss/crossentropy": 2.4515466690063477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284807682037354, + "step": 5126 + }, + { + "epoch": 0.42733333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.162109375, + "learning_rate": 4e-05, + "loss": 4.9106, + "loss/crossentropy": 1.507865995168686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1698614414781332, + "step": 5128 + }, + { + "epoch": 0.4275, + "grad_norm": 5.25, + "grad_norm_var": 0.16126302083333333, + "learning_rate": 4e-05, + "loss": 4.8938, + "loss/crossentropy": 1.7518939077854156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1792406179010868, + "step": 5130 + }, + { + "epoch": 0.42766666666666664, + "grad_norm": 5.40625, + "grad_norm_var": 0.17263997395833333, + "learning_rate": 4e-05, + "loss": 4.729, + "loss/crossentropy": 1.932149201631546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19870978593826294, + "step": 5132 + }, + { + "epoch": 0.42783333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.069921875, + "learning_rate": 4e-05, + "loss": 5.2444, + "loss/crossentropy": 2.447461187839508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20525626838207245, + "step": 5134 + }, + { + "epoch": 0.428, + "grad_norm": 4.875, + "grad_norm_var": 0.06666259765625, + "learning_rate": 4e-05, + "loss": 4.9806, + "loss/crossentropy": 2.315503776073456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193039208650589, + "step": 5136 + }, + { + "epoch": 0.42816666666666664, + "grad_norm": 5.5, + "grad_norm_var": 0.08899332682291666, + "learning_rate": 4e-05, + "loss": 4.8899, + "loss/crossentropy": 2.2293468713760376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100820429623127, + "step": 5138 + }, + { + "epoch": 0.42833333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.06516927083333333, + "learning_rate": 4e-05, + "loss": 5.0528, + "loss/crossentropy": 1.8010507598519325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925482451915741, + "step": 5140 + }, + { + "epoch": 0.4285, + "grad_norm": 5.0, + "grad_norm_var": 0.060933430989583336, + "learning_rate": 4e-05, + "loss": 4.9691, + "loss/crossentropy": 2.4426570534706116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21383054926991463, + "step": 5142 + }, + { + "epoch": 0.42866666666666664, + "grad_norm": 4.5625, + "grad_norm_var": 0.06350504557291667, + "learning_rate": 4e-05, + "loss": 5.1129, + "loss/crossentropy": 1.7086549401283264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16771548986434937, + "step": 5144 + }, + { + "epoch": 0.42883333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.06678059895833334, + "learning_rate": 4e-05, + "loss": 5.1706, + "loss/crossentropy": 2.5270156860351562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23303020745515823, + "step": 5146 + }, + { + "epoch": 0.429, + "grad_norm": 5.0625, + "grad_norm_var": 0.05377197265625, + "learning_rate": 4e-05, + "loss": 5.3792, + "loss/crossentropy": 2.2300464510917664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2331315577030182, + "step": 5148 + }, + { + "epoch": 0.42916666666666664, + "grad_norm": 5.28125, + "grad_norm_var": 0.060286458333333334, + "learning_rate": 4e-05, + "loss": 4.8926, + "loss/crossentropy": 1.5624526962637901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16189817152917385, + "step": 5150 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.06730143229166667, + "learning_rate": 4e-05, + "loss": 5.3266, + "loss/crossentropy": 2.736234724521637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2452528215944767, + "step": 5152 + }, + { + "epoch": 0.4295, + "grad_norm": 4.75, + "grad_norm_var": 0.04892171223958333, + "learning_rate": 4e-05, + "loss": 4.0593, + "loss/crossentropy": 1.1300897151231766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1562607455998659, + "step": 5154 + }, + { + "epoch": 0.42966666666666664, + "grad_norm": 5.46875, + "grad_norm_var": 0.06806233723958334, + "learning_rate": 4e-05, + "loss": 5.2706, + "loss/crossentropy": 2.048402391374111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18175113759934902, + "step": 5156 + }, + { + "epoch": 0.42983333333333335, + "grad_norm": 4.53125, + "grad_norm_var": 0.08134358723958333, + "learning_rate": 4e-05, + "loss": 4.3642, + "loss/crossentropy": 2.2617290019989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256079152226448, + "step": 5158 + }, + { + "epoch": 0.43, + "grad_norm": 4.75, + "grad_norm_var": 0.07506103515625, + "learning_rate": 4e-05, + "loss": 4.7489, + "loss/crossentropy": 1.670315831899643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18331779912114143, + "step": 5160 + }, + { + "epoch": 0.43016666666666664, + "grad_norm": 5.1875, + "grad_norm_var": 0.06717122395833333, + "learning_rate": 4e-05, + "loss": 5.3245, + "loss/crossentropy": 1.1678467690944672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1482110098004341, + "step": 5162 + }, + { + "epoch": 0.43033333333333335, + "grad_norm": 5.1875, + "grad_norm_var": 0.07580973307291666, + "learning_rate": 4e-05, + "loss": 4.2844, + "loss/crossentropy": 2.235573798418045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24548039212822914, + "step": 5164 + }, + { + "epoch": 0.4305, + "grad_norm": 5.28125, + "grad_norm_var": 0.092578125, + "learning_rate": 4e-05, + "loss": 4.5618, + "loss/crossentropy": 2.2987032532691956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20700116828083992, + "step": 5166 + }, + { + "epoch": 0.43066666666666664, + "grad_norm": 5.25, + "grad_norm_var": 0.11443684895833334, + "learning_rate": 4e-05, + "loss": 5.6446, + "loss/crossentropy": 2.2951321601867676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21205606684088707, + "step": 5168 + }, + { + "epoch": 0.43083333333333335, + "grad_norm": 5.375, + "grad_norm_var": 0.11767171223958334, + "learning_rate": 4e-05, + "loss": 5.4096, + "loss/crossentropy": 2.568745195865631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844319835305214, + "step": 5170 + }, + { + "epoch": 0.431, + "grad_norm": 5.25, + "grad_norm_var": 0.11890869140625, + "learning_rate": 4e-05, + "loss": 5.3175, + "loss/crossentropy": 2.58900648355484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21212553977966309, + "step": 5172 + }, + { + "epoch": 0.43116666666666664, + "grad_norm": 4.5625, + "grad_norm_var": 0.11170247395833334, + "learning_rate": 4e-05, + "loss": 4.6694, + "loss/crossentropy": 1.557967871427536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21451672539114952, + "step": 5174 + }, + { + "epoch": 0.43133333333333335, + "grad_norm": 4.59375, + "grad_norm_var": 0.11808268229166667, + "learning_rate": 4e-05, + "loss": 4.841, + "loss/crossentropy": 2.51181161403656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21414360031485558, + "step": 5176 + }, + { + "epoch": 0.4315, + "grad_norm": 5.40625, + "grad_norm_var": 0.12913004557291666, + "learning_rate": 4e-05, + "loss": 5.4822, + "loss/crossentropy": 2.557952344417572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21239551529288292, + "step": 5178 + }, + { + "epoch": 0.43166666666666664, + "grad_norm": 4.71875, + "grad_norm_var": 0.13228759765625, + "learning_rate": 4e-05, + "loss": 4.3222, + "loss/crossentropy": 0.8792792856693268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10647947527468204, + "step": 5180 + }, + { + "epoch": 0.43183333333333335, + "grad_norm": 5.0, + "grad_norm_var": 0.10898030598958333, + "learning_rate": 4e-05, + "loss": 4.4222, + "loss/crossentropy": 1.6791368499398232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18098345771431923, + "step": 5182 + }, + { + "epoch": 0.432, + "grad_norm": 4.875, + "grad_norm_var": 0.08318684895833334, + "learning_rate": 4e-05, + "loss": 4.7292, + "loss/crossentropy": 2.429518163204193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2394704520702362, + "step": 5184 + }, + { + "epoch": 0.43216666666666664, + "grad_norm": 4.8125, + "grad_norm_var": 0.065625, + "learning_rate": 4e-05, + "loss": 5.0259, + "loss/crossentropy": 1.6873462200164795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16258270666003227, + "step": 5186 + }, + { + "epoch": 0.43233333333333335, + "grad_norm": 4.875, + "grad_norm_var": 0.051285807291666666, + "learning_rate": 4e-05, + "loss": 4.8518, + "loss/crossentropy": 2.167505532503128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19529220461845398, + "step": 5188 + }, + { + "epoch": 0.4325, + "grad_norm": 4.84375, + "grad_norm_var": 0.04334309895833333, + "learning_rate": 4e-05, + "loss": 5.1402, + "loss/crossentropy": 1.5462488010525703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15644877590239048, + "step": 5190 + }, + { + "epoch": 0.43266666666666664, + "grad_norm": 5.25, + "grad_norm_var": 0.046284993489583336, + "learning_rate": 4e-05, + "loss": 5.1826, + "loss/crossentropy": 1.9765098094940186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178989090025425, + "step": 5192 + }, + { + "epoch": 0.43283333333333335, + "grad_norm": 5.125, + "grad_norm_var": 0.02320556640625, + "learning_rate": 4e-05, + "loss": 4.9383, + "loss/crossentropy": 2.181311994791031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18956802040338516, + "step": 5194 + }, + { + "epoch": 0.433, + "grad_norm": 4.75, + "grad_norm_var": 0.02037353515625, + "learning_rate": 4e-05, + "loss": 4.545, + "loss/crossentropy": 2.1313266456127167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24127675592899323, + "step": 5196 + }, + { + "epoch": 0.43316666666666664, + "grad_norm": 4.78125, + "grad_norm_var": 0.020052083333333335, + "learning_rate": 4e-05, + "loss": 4.9439, + "loss/crossentropy": 1.9447228908538818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084346041083336, + "step": 5198 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 4.65625, + "grad_norm_var": 0.023697916666666666, + "learning_rate": 4e-05, + "loss": 4.7933, + "loss/crossentropy": 2.6325061917304993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256360411643982, + "step": 5200 + }, + { + "epoch": 0.4335, + "grad_norm": 4.875, + "grad_norm_var": 0.023291015625, + "learning_rate": 4e-05, + "loss": 4.7973, + "loss/crossentropy": 1.343225508928299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1388614997267723, + "step": 5202 + }, + { + "epoch": 0.43366666666666664, + "grad_norm": 4.90625, + "grad_norm_var": 0.031571451822916666, + "learning_rate": 4e-05, + "loss": 5.1977, + "loss/crossentropy": 2.4446049332618713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23277466744184494, + "step": 5204 + }, + { + "epoch": 0.43383333333333335, + "grad_norm": 5.21875, + "grad_norm_var": 0.04006754557291667, + "learning_rate": 4e-05, + "loss": 4.991, + "loss/crossentropy": 2.2790105640888214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22518282383680344, + "step": 5206 + }, + { + "epoch": 0.434, + "grad_norm": 5.0, + "grad_norm_var": 0.02945556640625, + "learning_rate": 4e-05, + "loss": 4.7089, + "loss/crossentropy": 2.0995849072933197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328692339360714, + "step": 5208 + }, + { + "epoch": 0.43416666666666665, + "grad_norm": 4.75, + "grad_norm_var": 0.03365478515625, + "learning_rate": 4e-05, + "loss": 5.2833, + "loss/crossentropy": 2.38425749540329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22432530671358109, + "step": 5210 + }, + { + "epoch": 0.43433333333333335, + "grad_norm": 5.3125, + "grad_norm_var": 0.04550374348958333, + "learning_rate": 4e-05, + "loss": 4.9349, + "loss/crossentropy": 2.3226277828216553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002677097916603, + "step": 5212 + }, + { + "epoch": 0.4345, + "grad_norm": 4.75, + "grad_norm_var": 0.045947265625, + "learning_rate": 4e-05, + "loss": 5.1692, + "loss/crossentropy": 2.459660768508911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22898616641759872, + "step": 5214 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 5.03125, + "grad_norm_var": 0.052587890625, + "learning_rate": 4e-05, + "loss": 4.7851, + "loss/crossentropy": 1.7144055142998695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18077606335282326, + "step": 5216 + }, + { + "epoch": 0.43483333333333335, + "grad_norm": 5.46875, + "grad_norm_var": 0.06946614583333334, + "learning_rate": 4e-05, + "loss": 5.4372, + "loss/crossentropy": 1.9776408672332764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18408696725964546, + "step": 5218 + }, + { + "epoch": 0.435, + "grad_norm": 5.53125, + "grad_norm_var": 0.07146809895833334, + "learning_rate": 4e-05, + "loss": 4.7817, + "loss/crossentropy": 1.2995164021849632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1558908000588417, + "step": 5220 + }, + { + "epoch": 0.43516666666666665, + "grad_norm": 4.6875, + "grad_norm_var": 0.07316080729166667, + "learning_rate": 4e-05, + "loss": 4.5949, + "loss/crossentropy": 1.725936196744442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1701155286282301, + "step": 5222 + }, + { + "epoch": 0.43533333333333335, + "grad_norm": 4.9375, + "grad_norm_var": 0.09664306640625, + "learning_rate": 4e-05, + "loss": 5.2427, + "loss/crossentropy": 2.4887034595012665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21807188540697098, + "step": 5224 + }, + { + "epoch": 0.4355, + "grad_norm": 5.0, + "grad_norm_var": 0.09021809895833334, + "learning_rate": 4e-05, + "loss": 4.3535, + "loss/crossentropy": 1.597044050693512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17133367992937565, + "step": 5226 + }, + { + "epoch": 0.43566666666666665, + "grad_norm": 4.75, + "grad_norm_var": 0.09550374348958333, + "learning_rate": 4e-05, + "loss": 5.2829, + "loss/crossentropy": 2.532274305820465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22373450174927711, + "step": 5228 + }, + { + "epoch": 0.43583333333333335, + "grad_norm": 5.3125, + "grad_norm_var": 0.10360921223958333, + "learning_rate": 4e-05, + "loss": 4.6998, + "loss/crossentropy": 2.2400224208831787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20386741310358047, + "step": 5230 + }, + { + "epoch": 0.436, + "grad_norm": 4.8125, + "grad_norm_var": 0.10012613932291667, + "learning_rate": 4e-05, + "loss": 4.8858, + "loss/crossentropy": 2.234561562538147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22216036915779114, + "step": 5232 + }, + { + "epoch": 0.43616666666666665, + "grad_norm": 4.96875, + "grad_norm_var": 0.09280192057291667, + "learning_rate": 4e-05, + "loss": 4.6863, + "loss/crossentropy": 1.9974671080708504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18877053633332253, + "step": 5234 + }, + { + "epoch": 0.43633333333333335, + "grad_norm": 4.8125, + "grad_norm_var": 0.06643473307291667, + "learning_rate": 4e-05, + "loss": 4.8685, + "loss/crossentropy": 1.6357809603214264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16103867627680302, + "step": 5236 + }, + { + "epoch": 0.4365, + "grad_norm": 4.5625, + "grad_norm_var": 0.07776285807291666, + "learning_rate": 4e-05, + "loss": 4.5777, + "loss/crossentropy": 1.2994728162884712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1460007168352604, + "step": 5238 + }, + { + "epoch": 0.43666666666666665, + "grad_norm": 4.84375, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 4e-05, + "loss": 4.9275, + "loss/crossentropy": 2.199091613292694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144564837217331, + "step": 5240 + }, + { + "epoch": 0.43683333333333335, + "grad_norm": 5.0625, + "grad_norm_var": 0.07066650390625, + "learning_rate": 4e-05, + "loss": 4.7809, + "loss/crossentropy": 2.668358266353607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22134612500667572, + "step": 5242 + }, + { + "epoch": 0.437, + "grad_norm": 4.71875, + "grad_norm_var": 0.051102701822916666, + "learning_rate": 4e-05, + "loss": 4.9155, + "loss/crossentropy": 1.8332997113466263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924358643591404, + "step": 5244 + }, + { + "epoch": 0.43716666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.056864420572916664, + "learning_rate": 4e-05, + "loss": 5.0669, + "loss/crossentropy": 2.1800646483898163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20784780383110046, + "step": 5246 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 5.15625, + "grad_norm_var": 0.06339518229166667, + "learning_rate": 4e-05, + "loss": 4.6012, + "loss/crossentropy": 1.463565170764923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1575961671769619, + "step": 5248 + }, + { + "epoch": 0.4375, + "grad_norm": 5.53125, + "grad_norm_var": 0.08502604166666666, + "learning_rate": 4e-05, + "loss": 5.103, + "loss/crossentropy": 2.0702124536037445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20388288795948029, + "step": 5250 + }, + { + "epoch": 0.43766666666666665, + "grad_norm": 5.15625, + "grad_norm_var": 0.09503580729166666, + "learning_rate": 4e-05, + "loss": 5.2124, + "loss/crossentropy": 2.0828773379325867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22234292700886726, + "step": 5252 + }, + { + "epoch": 0.43783333333333335, + "grad_norm": 4.65625, + "grad_norm_var": 0.08854166666666667, + "learning_rate": 4e-05, + "loss": 5.07, + "loss/crossentropy": 2.2166521549224854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087559849023819, + "step": 5254 + }, + { + "epoch": 0.438, + "grad_norm": 4.6875, + "grad_norm_var": 0.09374593098958334, + "learning_rate": 4e-05, + "loss": 5.3975, + "loss/crossentropy": 2.173314794898033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19079249911010265, + "step": 5256 + }, + { + "epoch": 0.43816666666666665, + "grad_norm": 5.3125, + "grad_norm_var": 0.09436442057291666, + "learning_rate": 4e-05, + "loss": 5.0493, + "loss/crossentropy": 2.52229842543602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2559218965470791, + "step": 5258 + }, + { + "epoch": 0.43833333333333335, + "grad_norm": 4.71875, + "grad_norm_var": 0.08440348307291666, + "learning_rate": 4e-05, + "loss": 5.0227, + "loss/crossentropy": 2.1676777005195618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21036133915185928, + "step": 5260 + }, + { + "epoch": 0.4385, + "grad_norm": 8.5, + "grad_norm_var": 0.8345011393229167, + "learning_rate": 4e-05, + "loss": 5.1052, + "loss/crossentropy": 1.9067611992359161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087782323360443, + "step": 5262 + }, + { + "epoch": 0.43866666666666665, + "grad_norm": 5.3125, + "grad_norm_var": 0.82457275390625, + "learning_rate": 4e-05, + "loss": 5.1143, + "loss/crossentropy": 2.3210455179214478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18462146818637848, + "step": 5264 + }, + { + "epoch": 0.43883333333333335, + "grad_norm": 4.65625, + "grad_norm_var": 0.83912353515625, + "learning_rate": 4e-05, + "loss": 4.959, + "loss/crossentropy": 1.3365162461996078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16913891211152077, + "step": 5266 + }, + { + "epoch": 0.439, + "grad_norm": 5.625, + "grad_norm_var": 3.219038899739583, + "learning_rate": 4e-05, + "loss": 4.8483, + "loss/crossentropy": 2.61382520198822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22090576961636543, + "step": 5268 + }, + { + "epoch": 0.43916666666666665, + "grad_norm": 4.71875, + "grad_norm_var": 3.216727701822917, + "learning_rate": 4e-05, + "loss": 4.9946, + "loss/crossentropy": 2.3929781913757324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2272535227239132, + "step": 5270 + }, + { + "epoch": 0.43933333333333335, + "grad_norm": 5.25, + "grad_norm_var": 3.1981608072916665, + "learning_rate": 4e-05, + "loss": 4.844, + "loss/crossentropy": 2.355992078781128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21010024100542068, + "step": 5272 + }, + { + "epoch": 0.4395, + "grad_norm": 4.65625, + "grad_norm_var": 3.239306640625, + "learning_rate": 4e-05, + "loss": 5.1044, + "loss/crossentropy": 2.033922716975212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21508634090423584, + "step": 5274 + }, + { + "epoch": 0.43966666666666665, + "grad_norm": 5.46875, + "grad_norm_var": 3.19302978515625, + "learning_rate": 4e-05, + "loss": 5.1998, + "loss/crossentropy": 2.0046669840812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982124038040638, + "step": 5276 + }, + { + "epoch": 0.43983333333333335, + "grad_norm": 4.875, + "grad_norm_var": 2.647591145833333, + "learning_rate": 4e-05, + "loss": 5.5324, + "loss/crossentropy": 2.5933563113212585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21993929892778397, + "step": 5278 + }, + { + "epoch": 0.44, + "grad_norm": 4.78125, + "grad_norm_var": 2.7052042643229166, + "learning_rate": 4e-05, + "loss": 4.7645, + "loss/crossentropy": 1.6767731830477715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17139611765742302, + "step": 5280 + }, + { + "epoch": 0.44016666666666665, + "grad_norm": 5.0, + "grad_norm_var": 2.71695556640625, + "learning_rate": 4e-05, + "loss": 5.2355, + "loss/crossentropy": 1.993862234055996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19030576571822166, + "step": 5282 + }, + { + "epoch": 0.44033333333333335, + "grad_norm": 5.15625, + "grad_norm_var": 0.10299072265625, + "learning_rate": 4e-05, + "loss": 4.9784, + "loss/crossentropy": 1.885690025985241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19479876197874546, + "step": 5284 + }, + { + "epoch": 0.4405, + "grad_norm": 4.9375, + "grad_norm_var": 0.09381510416666666, + "learning_rate": 4e-05, + "loss": 5.0184, + "loss/crossentropy": 2.0076128244400024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18920595943927765, + "step": 5286 + }, + { + "epoch": 0.44066666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.08638916015625, + "learning_rate": 4e-05, + "loss": 5.2074, + "loss/crossentropy": 2.4117564260959625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313983030617237, + "step": 5288 + }, + { + "epoch": 0.44083333333333335, + "grad_norm": 4.5625, + "grad_norm_var": 0.110009765625, + "learning_rate": 4e-05, + "loss": 4.6849, + "loss/crossentropy": 1.497763104736805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16671940125524998, + "step": 5290 + }, + { + "epoch": 0.441, + "grad_norm": 5.21875, + "grad_norm_var": 0.094775390625, + "learning_rate": 4e-05, + "loss": 5.2303, + "loss/crossentropy": 1.4492312595248222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15011000633239746, + "step": 5292 + }, + { + "epoch": 0.44116666666666665, + "grad_norm": 5.84375, + "grad_norm_var": 0.13225504557291667, + "learning_rate": 4e-05, + "loss": 4.7391, + "loss/crossentropy": 2.051460087299347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20685597881674767, + "step": 5294 + }, + { + "epoch": 0.44133333333333336, + "grad_norm": 5.0, + "grad_norm_var": 0.13450520833333332, + "learning_rate": 4e-05, + "loss": 4.9108, + "loss/crossentropy": 2.753096580505371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23638440668582916, + "step": 5296 + }, + { + "epoch": 0.4415, + "grad_norm": 4.6875, + "grad_norm_var": 0.13847249348958332, + "learning_rate": 4e-05, + "loss": 4.4445, + "loss/crossentropy": 2.3885613679885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244817018508911, + "step": 5298 + }, + { + "epoch": 0.44166666666666665, + "grad_norm": 4.46875, + "grad_norm_var": 0.13592122395833334, + "learning_rate": 4e-05, + "loss": 5.1649, + "loss/crossentropy": 2.4205052852630615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21577660739421844, + "step": 5300 + }, + { + "epoch": 0.44183333333333336, + "grad_norm": 5.0625, + "grad_norm_var": 0.15230712890625, + "learning_rate": 4e-05, + "loss": 5.2738, + "loss/crossentropy": 2.2331049740314484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21633132547140121, + "step": 5302 + }, + { + "epoch": 0.442, + "grad_norm": 4.71875, + "grad_norm_var": 0.14698893229166668, + "learning_rate": 4e-05, + "loss": 4.7522, + "loss/crossentropy": 2.1147951781749725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21170584484934807, + "step": 5304 + }, + { + "epoch": 0.44216666666666665, + "grad_norm": 5.125, + "grad_norm_var": 0.12317301432291666, + "learning_rate": 4e-05, + "loss": 5.2276, + "loss/crossentropy": 2.4038414657115936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2141760066151619, + "step": 5306 + }, + { + "epoch": 0.44233333333333336, + "grad_norm": 5.46875, + "grad_norm_var": 0.13899739583333334, + "learning_rate": 4e-05, + "loss": 5.1861, + "loss/crossentropy": 1.9134333208203316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17545868270099163, + "step": 5308 + }, + { + "epoch": 0.4425, + "grad_norm": 5.5, + "grad_norm_var": 0.10569254557291667, + "learning_rate": 4e-05, + "loss": 4.6515, + "loss/crossentropy": 2.0091544091701508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976570524275303, + "step": 5310 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 4.875, + "grad_norm_var": 0.104931640625, + "learning_rate": 4e-05, + "loss": 5.3006, + "loss/crossentropy": 1.9432563707232475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17379920929670334, + "step": 5312 + }, + { + "epoch": 0.44283333333333336, + "grad_norm": 4.53125, + "grad_norm_var": 0.11717122395833333, + "learning_rate": 4e-05, + "loss": 4.6163, + "loss/crossentropy": 1.1306499615311623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12089996412396431, + "step": 5314 + }, + { + "epoch": 0.443, + "grad_norm": 4.78125, + "grad_norm_var": 0.094921875, + "learning_rate": 4e-05, + "loss": 4.8005, + "loss/crossentropy": 2.579859673976898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22911198064684868, + "step": 5316 + }, + { + "epoch": 0.44316666666666665, + "grad_norm": 4.65625, + "grad_norm_var": 0.08336181640625, + "learning_rate": 4e-05, + "loss": 5.6242, + "loss/crossentropy": 2.7599607706069946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21561214700341225, + "step": 5318 + }, + { + "epoch": 0.44333333333333336, + "grad_norm": 4.90625, + "grad_norm_var": 0.238525390625, + "learning_rate": 4e-05, + "loss": 5.1176, + "loss/crossentropy": 2.208019971847534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21972987428307533, + "step": 5320 + }, + { + "epoch": 0.4435, + "grad_norm": 5.0, + "grad_norm_var": 0.24205729166666667, + "learning_rate": 4e-05, + "loss": 5.1073, + "loss/crossentropy": 2.002477027475834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19431659020483494, + "step": 5322 + }, + { + "epoch": 0.44366666666666665, + "grad_norm": 4.9375, + "grad_norm_var": 0.22418212890625, + "learning_rate": 4e-05, + "loss": 4.8315, + "loss/crossentropy": 2.3529436886310577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23537345603108406, + "step": 5324 + }, + { + "epoch": 0.44383333333333336, + "grad_norm": 4.78125, + "grad_norm_var": 0.21044514973958334, + "learning_rate": 4e-05, + "loss": 4.9157, + "loss/crossentropy": 2.0489574670791626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18815740570425987, + "step": 5326 + }, + { + "epoch": 0.444, + "grad_norm": 5.59375, + "grad_norm_var": 0.23173421223958332, + "learning_rate": 4e-05, + "loss": 4.9874, + "loss/crossentropy": 1.70937280356884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18881511315703392, + "step": 5328 + }, + { + "epoch": 0.44416666666666665, + "grad_norm": 4.75, + "grad_norm_var": 0.20753580729166668, + "learning_rate": 4e-05, + "loss": 4.9372, + "loss/crossentropy": 2.4225960969924927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22473590821027756, + "step": 5330 + }, + { + "epoch": 0.44433333333333336, + "grad_norm": 4.96875, + "grad_norm_var": 0.20299479166666667, + "learning_rate": 4e-05, + "loss": 4.4347, + "loss/crossentropy": 2.0197691321372986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18901053816080093, + "step": 5332 + }, + { + "epoch": 0.4445, + "grad_norm": 5.84375, + "grad_norm_var": 0.23058268229166667, + "learning_rate": 4e-05, + "loss": 4.9402, + "loss/crossentropy": 2.1207685470581055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30563098564743996, + "step": 5334 + }, + { + "epoch": 0.44466666666666665, + "grad_norm": 5.375, + "grad_norm_var": 0.10266520182291666, + "learning_rate": 4e-05, + "loss": 5.0762, + "loss/crossentropy": 2.138897955417633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23109456151723862, + "step": 5336 + }, + { + "epoch": 0.44483333333333336, + "grad_norm": 5.625, + "grad_norm_var": 0.120947265625, + "learning_rate": 4e-05, + "loss": 5.2804, + "loss/crossentropy": 2.353893458843231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192895971238613, + "step": 5338 + }, + { + "epoch": 0.445, + "grad_norm": 5.21875, + "grad_norm_var": 0.12118733723958333, + "learning_rate": 4e-05, + "loss": 5.394, + "loss/crossentropy": 2.337477147579193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23079849779605865, + "step": 5340 + }, + { + "epoch": 0.44516666666666665, + "grad_norm": 4.96875, + "grad_norm_var": 0.11666259765625, + "learning_rate": 4e-05, + "loss": 4.7981, + "loss/crossentropy": 1.877238281071186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18820980936288834, + "step": 5342 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 5.0625, + "grad_norm_var": 0.08951416015625, + "learning_rate": 4e-05, + "loss": 5.0076, + "loss/crossentropy": 2.206613063812256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341401129961014, + "step": 5344 + }, + { + "epoch": 0.4455, + "grad_norm": 5.125, + "grad_norm_var": 0.07415364583333334, + "learning_rate": 4e-05, + "loss": 4.6358, + "loss/crossentropy": 1.4311936870217323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15251067653298378, + "step": 5346 + }, + { + "epoch": 0.44566666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.08357747395833333, + "learning_rate": 4e-05, + "loss": 5.0284, + "loss/crossentropy": 2.017084077000618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756956409662962, + "step": 5348 + }, + { + "epoch": 0.44583333333333336, + "grad_norm": 4.5, + "grad_norm_var": 0.068603515625, + "learning_rate": 4e-05, + "loss": 4.7167, + "loss/crossentropy": 1.0140413790941238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1561578195542097, + "step": 5350 + }, + { + "epoch": 0.446, + "grad_norm": 5.4375, + "grad_norm_var": 0.07177327473958334, + "learning_rate": 4e-05, + "loss": 5.2292, + "loss/crossentropy": 1.7443393990397453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18483179062604904, + "step": 5352 + }, + { + "epoch": 0.44616666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.06536458333333334, + "learning_rate": 4e-05, + "loss": 4.7097, + "loss/crossentropy": 2.484953999519348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995408609509468, + "step": 5354 + }, + { + "epoch": 0.44633333333333336, + "grad_norm": 5.125, + "grad_norm_var": 0.06027018229166667, + "learning_rate": 4e-05, + "loss": 4.8299, + "loss/crossentropy": 1.8088392987847328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17983365431427956, + "step": 5356 + }, + { + "epoch": 0.4465, + "grad_norm": 4.78125, + "grad_norm_var": 0.06532796223958333, + "learning_rate": 4e-05, + "loss": 5.2104, + "loss/crossentropy": 2.070494204759598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20856108516454697, + "step": 5358 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.08186442057291667, + "learning_rate": 4e-05, + "loss": 5.1108, + "loss/crossentropy": 1.826586052775383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19579278863966465, + "step": 5360 + }, + { + "epoch": 0.44683333333333336, + "grad_norm": 4.84375, + "grad_norm_var": 0.08690999348958334, + "learning_rate": 4e-05, + "loss": 4.6195, + "loss/crossentropy": 1.4399391859769821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14422345533967018, + "step": 5362 + }, + { + "epoch": 0.447, + "grad_norm": 5.40625, + "grad_norm_var": 0.10937093098958334, + "learning_rate": 4e-05, + "loss": 4.6871, + "loss/crossentropy": 1.527749978005886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15237411856651306, + "step": 5364 + }, + { + "epoch": 0.44716666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.10432535807291667, + "learning_rate": 4e-05, + "loss": 4.6966, + "loss/crossentropy": 2.0162869840860367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172578141093254, + "step": 5366 + }, + { + "epoch": 0.44733333333333336, + "grad_norm": 4.84375, + "grad_norm_var": 0.09322916666666667, + "learning_rate": 4e-05, + "loss": 5.3323, + "loss/crossentropy": 2.02753733843565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17405341938138008, + "step": 5368 + }, + { + "epoch": 0.4475, + "grad_norm": 5.25, + "grad_norm_var": 0.09101155598958334, + "learning_rate": 4e-05, + "loss": 4.9713, + "loss/crossentropy": 1.9596833288669586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608371704816818, + "step": 5370 + }, + { + "epoch": 0.44766666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.09021809895833334, + "learning_rate": 4e-05, + "loss": 4.8812, + "loss/crossentropy": 1.6146743893623352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17602908238768578, + "step": 5372 + }, + { + "epoch": 0.44783333333333336, + "grad_norm": 5.03125, + "grad_norm_var": 0.08720296223958333, + "learning_rate": 4e-05, + "loss": 5.2716, + "loss/crossentropy": 2.4960675835609436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139119803905487, + "step": 5374 + }, + { + "epoch": 0.448, + "grad_norm": 5.15625, + "grad_norm_var": 0.07763264973958334, + "learning_rate": 4e-05, + "loss": 5.3561, + "loss/crossentropy": 2.2325498163700104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21507864072918892, + "step": 5376 + }, + { + "epoch": 0.44816666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.09062093098958333, + "learning_rate": 4e-05, + "loss": 4.4405, + "loss/crossentropy": 1.2764653414487839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1504039615392685, + "step": 5378 + }, + { + "epoch": 0.4483333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.086181640625, + "learning_rate": 4e-05, + "loss": 5.2766, + "loss/crossentropy": 1.6668346375226974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16964828968048096, + "step": 5380 + }, + { + "epoch": 0.4485, + "grad_norm": 5.875, + "grad_norm_var": 1.3891764322916667, + "learning_rate": 4e-05, + "loss": 4.7435, + "loss/crossentropy": 2.3967296481132507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25826695933938026, + "step": 5382 + }, + { + "epoch": 0.44866666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 1.398291015625, + "learning_rate": 4e-05, + "loss": 4.6032, + "loss/crossentropy": 1.7601360231637955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18853917345404625, + "step": 5384 + }, + { + "epoch": 0.4488333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 1.4064453125, + "learning_rate": 4e-05, + "loss": 5.4187, + "loss/crossentropy": 1.5048007890582085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15315121971070766, + "step": 5386 + }, + { + "epoch": 0.449, + "grad_norm": 4.6875, + "grad_norm_var": 1.4166666666666667, + "learning_rate": 4e-05, + "loss": 4.8313, + "loss/crossentropy": 1.4428337439894676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15264881402254105, + "step": 5388 + }, + { + "epoch": 0.44916666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 1.4527994791666667, + "learning_rate": 4e-05, + "loss": 4.4607, + "loss/crossentropy": 1.3986869975924492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17954625561833382, + "step": 5390 + }, + { + "epoch": 0.4493333333333333, + "grad_norm": 4.625, + "grad_norm_var": 1.4571451822916666, + "learning_rate": 4e-05, + "loss": 4.9726, + "loss/crossentropy": 1.787402868270874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20106610283255577, + "step": 5392 + }, + { + "epoch": 0.4495, + "grad_norm": 4.8125, + "grad_norm_var": 1.4185831705729166, + "learning_rate": 4e-05, + "loss": 4.8828, + "loss/crossentropy": 1.5303082168102264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15836665406823158, + "step": 5394 + }, + { + "epoch": 0.44966666666666666, + "grad_norm": 5.25, + "grad_norm_var": 1.4359212239583334, + "learning_rate": 4e-05, + "loss": 5.115, + "loss/crossentropy": 1.9863907098770142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2083004079759121, + "step": 5396 + }, + { + "epoch": 0.4498333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.058817545572916664, + "learning_rate": 4e-05, + "loss": 5.0986, + "loss/crossentropy": 2.172573536634445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20910141617059708, + "step": 5398 + }, + { + "epoch": 0.45, + "grad_norm": 5.625, + "grad_norm_var": 0.09016520182291667, + "learning_rate": 4e-05, + "loss": 5.1335, + "loss/crossentropy": 1.42410147190094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17954059317708015, + "step": 5400 + }, + { + "epoch": 0.45016666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.09472249348958334, + "learning_rate": 4e-05, + "loss": 5.0265, + "loss/crossentropy": 2.2131763994693756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932872235774994, + "step": 5402 + }, + { + "epoch": 0.4503333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.09081624348958334, + "learning_rate": 4e-05, + "loss": 4.722, + "loss/crossentropy": 1.675975002348423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2252166699618101, + "step": 5404 + }, + { + "epoch": 0.4505, + "grad_norm": 5.46875, + "grad_norm_var": 0.10115559895833333, + "learning_rate": 4e-05, + "loss": 5.3349, + "loss/crossentropy": 2.5156899094581604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156628593802452, + "step": 5406 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.08883056640625, + "learning_rate": 4e-05, + "loss": 5.3603, + "loss/crossentropy": 1.6027273386716843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18063210882246494, + "step": 5408 + }, + { + "epoch": 0.4508333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.1025390625, + "learning_rate": 4e-05, + "loss": 5.3645, + "loss/crossentropy": 2.3822204172611237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17875253781676292, + "step": 5410 + }, + { + "epoch": 0.451, + "grad_norm": 5.3125, + "grad_norm_var": 0.11164957682291667, + "learning_rate": 4e-05, + "loss": 4.9495, + "loss/crossentropy": 1.816266119480133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20655769482254982, + "step": 5412 + }, + { + "epoch": 0.45116666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.10904947916666667, + "learning_rate": 4e-05, + "loss": 4.7501, + "loss/crossentropy": 2.2936626076698303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20902733877301216, + "step": 5414 + }, + { + "epoch": 0.4513333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.08551025390625, + "learning_rate": 4e-05, + "loss": 4.7972, + "loss/crossentropy": 1.6542549058794975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17094655334949493, + "step": 5416 + }, + { + "epoch": 0.4515, + "grad_norm": 4.65625, + "grad_norm_var": 0.08479410807291667, + "learning_rate": 4e-05, + "loss": 4.8174, + "loss/crossentropy": 2.0066296085715294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19182928279042244, + "step": 5418 + }, + { + "epoch": 0.45166666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.08948160807291666, + "learning_rate": 4e-05, + "loss": 4.8859, + "loss/crossentropy": 1.921394057571888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805327869951725, + "step": 5420 + }, + { + "epoch": 0.4518333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.06936442057291667, + "learning_rate": 4e-05, + "loss": 4.0731, + "loss/crossentropy": 2.1230402290821075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20618195086717606, + "step": 5422 + }, + { + "epoch": 0.452, + "grad_norm": 4.65625, + "grad_norm_var": 0.06545817057291667, + "learning_rate": 4e-05, + "loss": 5.0284, + "loss/crossentropy": 0.9875800833106041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1523461416363716, + "step": 5424 + }, + { + "epoch": 0.45216666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.058837890625, + "learning_rate": 4e-05, + "loss": 4.5052, + "loss/crossentropy": 1.5282204896211624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15887302160263062, + "step": 5426 + }, + { + "epoch": 0.4523333333333333, + "grad_norm": 6.5625, + "grad_norm_var": 0.23505452473958333, + "learning_rate": 4e-05, + "loss": 5.1988, + "loss/crossentropy": 2.2137202620506287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19665415585041046, + "step": 5428 + }, + { + "epoch": 0.4525, + "grad_norm": 4.71875, + "grad_norm_var": 0.23079427083333334, + "learning_rate": 4e-05, + "loss": 5.1186, + "loss/crossentropy": 1.8932070061564445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16527743637561798, + "step": 5430 + }, + { + "epoch": 0.45266666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.22766927083333333, + "learning_rate": 4e-05, + "loss": 4.5121, + "loss/crossentropy": 1.0359995141625404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15376388281583786, + "step": 5432 + }, + { + "epoch": 0.4528333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.22558186848958334, + "learning_rate": 4e-05, + "loss": 5.3009, + "loss/crossentropy": 2.4695218205451965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21178413927555084, + "step": 5434 + }, + { + "epoch": 0.453, + "grad_norm": 6.28125, + "grad_norm_var": 0.31881103515625, + "learning_rate": 4e-05, + "loss": 5.5794, + "loss/crossentropy": 2.6623180508613586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121230624616146, + "step": 5436 + }, + { + "epoch": 0.45316666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.29618733723958335, + "learning_rate": 4e-05, + "loss": 5.2521, + "loss/crossentropy": 1.2481238469481468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171453032642603, + "step": 5438 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.2942708333333333, + "learning_rate": 4e-05, + "loss": 4.5362, + "loss/crossentropy": 1.9702692329883575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18803063035011292, + "step": 5440 + }, + { + "epoch": 0.4535, + "grad_norm": 4.625, + "grad_norm_var": 0.28427327473958336, + "learning_rate": 4e-05, + "loss": 4.7512, + "loss/crossentropy": 1.6472477465867996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19744962453842163, + "step": 5442 + }, + { + "epoch": 0.45366666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.15514322916666667, + "learning_rate": 4e-05, + "loss": 5.3208, + "loss/crossentropy": 2.093437224626541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20477236062288284, + "step": 5444 + }, + { + "epoch": 0.4538333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.14885660807291667, + "learning_rate": 4e-05, + "loss": 5.3538, + "loss/crossentropy": 2.378512978553772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103431336581707, + "step": 5446 + }, + { + "epoch": 0.454, + "grad_norm": 4.875, + "grad_norm_var": 0.15907796223958334, + "learning_rate": 4e-05, + "loss": 4.6674, + "loss/crossentropy": 2.076447993516922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18661235831677914, + "step": 5448 + }, + { + "epoch": 0.45416666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.15637613932291666, + "learning_rate": 4e-05, + "loss": 5.0477, + "loss/crossentropy": 2.223031312227249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23521675914525986, + "step": 5450 + }, + { + "epoch": 0.4543333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.05597330729166667, + "learning_rate": 4e-05, + "loss": 4.7088, + "loss/crossentropy": 1.7138950303196907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18349325843155384, + "step": 5452 + }, + { + "epoch": 0.4545, + "grad_norm": 5.34375, + "grad_norm_var": 0.06467692057291667, + "learning_rate": 4e-05, + "loss": 4.9164, + "loss/crossentropy": 1.6598485931754112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1732875034213066, + "step": 5454 + }, + { + "epoch": 0.45466666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.058268229166666664, + "learning_rate": 4e-05, + "loss": 4.7924, + "loss/crossentropy": 1.519950993359089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1434491015970707, + "step": 5456 + }, + { + "epoch": 0.4548333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.05245768229166667, + "learning_rate": 4e-05, + "loss": 5.2509, + "loss/crossentropy": 1.6798207014799118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1604145709425211, + "step": 5458 + }, + { + "epoch": 0.455, + "grad_norm": 5.0625, + "grad_norm_var": 0.04397379557291667, + "learning_rate": 4e-05, + "loss": 4.9022, + "loss/crossentropy": 1.5287635251879692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15692395344376564, + "step": 5460 + }, + { + "epoch": 0.45516666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.03996988932291667, + "learning_rate": 4e-05, + "loss": 4.9569, + "loss/crossentropy": 1.8003231510519981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980554684996605, + "step": 5462 + }, + { + "epoch": 0.4553333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.04309488932291667, + "learning_rate": 4e-05, + "loss": 5.4458, + "loss/crossentropy": 1.9575251713395119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18819259479641914, + "step": 5464 + }, + { + "epoch": 0.4555, + "grad_norm": 8.1875, + "grad_norm_var": 0.6995930989583333, + "learning_rate": 4e-05, + "loss": 4.6394, + "loss/crossentropy": 0.7658378258347511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11739085428416729, + "step": 5466 + }, + { + "epoch": 0.45566666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.7234659830729167, + "learning_rate": 4e-05, + "loss": 5.2724, + "loss/crossentropy": 1.6100464090704918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208255335688591, + "step": 5468 + }, + { + "epoch": 0.4558333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.7325358072916667, + "learning_rate": 4e-05, + "loss": 4.5175, + "loss/crossentropy": 1.5317303538322449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17215605825185776, + "step": 5470 + }, + { + "epoch": 0.456, + "grad_norm": 4.875, + "grad_norm_var": 0.7384724934895833, + "learning_rate": 4e-05, + "loss": 5.0997, + "loss/crossentropy": 2.1992835104465485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21746817231178284, + "step": 5472 + }, + { + "epoch": 0.45616666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.72574462890625, + "learning_rate": 4e-05, + "loss": 4.9513, + "loss/crossentropy": 2.353798985481262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22958621755242348, + "step": 5474 + }, + { + "epoch": 0.4563333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.7391764322916666, + "learning_rate": 4e-05, + "loss": 4.6827, + "loss/crossentropy": 1.9713662266731262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430748745799065, + "step": 5476 + }, + { + "epoch": 0.4565, + "grad_norm": 5.21875, + "grad_norm_var": 0.7601847330729167, + "learning_rate": 4e-05, + "loss": 5.227, + "loss/crossentropy": 2.5195890069007874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20874114707112312, + "step": 5478 + }, + { + "epoch": 0.45666666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.7831339518229167, + "learning_rate": 4e-05, + "loss": 5.0046, + "loss/crossentropy": 2.294678211212158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244003266096115, + "step": 5480 + }, + { + "epoch": 0.4568333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.13560791015625, + "learning_rate": 4e-05, + "loss": 5.0747, + "loss/crossentropy": 2.0025685876607895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1901235617697239, + "step": 5482 + }, + { + "epoch": 0.457, + "grad_norm": 5.0, + "grad_norm_var": 0.03528645833333333, + "learning_rate": 4e-05, + "loss": 4.8902, + "loss/crossentropy": 1.941299356520176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1734217330813408, + "step": 5484 + }, + { + "epoch": 0.45716666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.04095052083333333, + "learning_rate": 4e-05, + "loss": 4.7396, + "loss/crossentropy": 2.118344932794571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2606281489133835, + "step": 5486 + }, + { + "epoch": 0.4573333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.042704264322916664, + "learning_rate": 4e-05, + "loss": 5.2228, + "loss/crossentropy": 2.236980974674225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161729745566845, + "step": 5488 + }, + { + "epoch": 0.4575, + "grad_norm": 4.59375, + "grad_norm_var": 0.04644775390625, + "learning_rate": 4e-05, + "loss": 4.4538, + "loss/crossentropy": 1.025296412408352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13174685835838318, + "step": 5490 + }, + { + "epoch": 0.45766666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.051171875, + "learning_rate": 4e-05, + "loss": 4.9829, + "loss/crossentropy": 1.4498857855796814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1556770447641611, + "step": 5492 + }, + { + "epoch": 0.4578333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.044384765625, + "learning_rate": 4e-05, + "loss": 5.0496, + "loss/crossentropy": 2.1010265946388245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19020748883485794, + "step": 5494 + }, + { + "epoch": 0.458, + "grad_norm": 4.96875, + "grad_norm_var": 0.042578125, + "learning_rate": 4e-05, + "loss": 4.7502, + "loss/crossentropy": 1.1143567636609077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16215180046856403, + "step": 5496 + }, + { + "epoch": 0.45816666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.04140625, + "learning_rate": 4e-05, + "loss": 4.9584, + "loss/crossentropy": 1.6512196511030197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1721491888165474, + "step": 5498 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.04664306640625, + "learning_rate": 4e-05, + "loss": 4.8581, + "loss/crossentropy": 1.7085549235343933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1505817025899887, + "step": 5500 + }, + { + "epoch": 0.4585, + "grad_norm": 4.875, + "grad_norm_var": 0.03931884765625, + "learning_rate": 4e-05, + "loss": 4.7432, + "loss/crossentropy": 1.8823091089725494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18096166849136353, + "step": 5502 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.03802083333333333, + "learning_rate": 4e-05, + "loss": 4.7849, + "loss/crossentropy": 2.0908593386411667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18026093766093254, + "step": 5504 + }, + { + "epoch": 0.4588333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.02974853515625, + "learning_rate": 4e-05, + "loss": 4.611, + "loss/crossentropy": 1.2987871691584587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13752495497465134, + "step": 5506 + }, + { + "epoch": 0.459, + "grad_norm": 5.03125, + "grad_norm_var": 0.02760009765625, + "learning_rate": 4e-05, + "loss": 5.1585, + "loss/crossentropy": 1.8118347227573395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16236663702875376, + "step": 5508 + }, + { + "epoch": 0.45916666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.06907552083333333, + "learning_rate": 4e-05, + "loss": 5.3165, + "loss/crossentropy": 2.391406774520874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20025787502527237, + "step": 5510 + }, + { + "epoch": 0.4593333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.06890869140625, + "learning_rate": 4e-05, + "loss": 5.2343, + "loss/crossentropy": 2.0247348248958588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17275281623005867, + "step": 5512 + }, + { + "epoch": 0.4595, + "grad_norm": 5.125, + "grad_norm_var": 0.07890625, + "learning_rate": 4e-05, + "loss": 4.3175, + "loss/crossentropy": 1.8554309457540512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19631312042474747, + "step": 5514 + }, + { + "epoch": 0.45966666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.07928059895833334, + "learning_rate": 4e-05, + "loss": 4.6764, + "loss/crossentropy": 1.0043694823980331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1664719507098198, + "step": 5516 + }, + { + "epoch": 0.4598333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.07398681640625, + "learning_rate": 4e-05, + "loss": 5.132, + "loss/crossentropy": 2.534608483314514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207382507622242, + "step": 5518 + }, + { + "epoch": 0.46, + "grad_norm": 4.625, + "grad_norm_var": 0.0775390625, + "learning_rate": 4e-05, + "loss": 4.7972, + "loss/crossentropy": 1.9613143354654312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146427035331726, + "step": 5520 + }, + { + "epoch": 0.46016666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.08069254557291666, + "learning_rate": 4e-05, + "loss": 5.2331, + "loss/crossentropy": 2.5223607420921326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20144547894597054, + "step": 5522 + }, + { + "epoch": 0.4603333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.09659830729166667, + "learning_rate": 4e-05, + "loss": 5.4748, + "loss/crossentropy": 2.462041199207306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21255720406770706, + "step": 5524 + }, + { + "epoch": 0.4605, + "grad_norm": 4.84375, + "grad_norm_var": 0.06861979166666667, + "learning_rate": 4e-05, + "loss": 5.4283, + "loss/crossentropy": 1.8817952871322632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882878541946411, + "step": 5526 + }, + { + "epoch": 0.46066666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.07229410807291667, + "learning_rate": 4e-05, + "loss": 5.3448, + "loss/crossentropy": 2.304149329662323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20610129833221436, + "step": 5528 + }, + { + "epoch": 0.4608333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.06282145182291667, + "learning_rate": 4e-05, + "loss": 5.3979, + "loss/crossentropy": 2.2624170780181885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21408792212605476, + "step": 5530 + }, + { + "epoch": 0.461, + "grad_norm": 5.03125, + "grad_norm_var": 0.05579020182291667, + "learning_rate": 4e-05, + "loss": 4.6877, + "loss/crossentropy": 1.5460020303726196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15484758466482162, + "step": 5532 + }, + { + "epoch": 0.46116666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.05836181640625, + "learning_rate": 4e-05, + "loss": 5.0696, + "loss/crossentropy": 1.4943357408046722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16306064277887344, + "step": 5534 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.0611328125, + "learning_rate": 4e-05, + "loss": 5.3117, + "loss/crossentropy": 2.556882083415985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21715038269758224, + "step": 5536 + }, + { + "epoch": 0.4615, + "grad_norm": 5.5, + "grad_norm_var": 0.08513997395833334, + "learning_rate": 4e-05, + "loss": 4.7835, + "loss/crossentropy": 1.7690436989068985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21724151819944382, + "step": 5538 + }, + { + "epoch": 0.46166666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.08079427083333333, + "learning_rate": 4e-05, + "loss": 5.0873, + "loss/crossentropy": 1.8304852917790413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18460811860859394, + "step": 5540 + }, + { + "epoch": 0.4618333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.08059895833333333, + "learning_rate": 4e-05, + "loss": 4.4272, + "loss/crossentropy": 1.6574642956256866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16919926181435585, + "step": 5542 + }, + { + "epoch": 0.462, + "grad_norm": 4.34375, + "grad_norm_var": 0.10556233723958333, + "learning_rate": 4e-05, + "loss": 4.5421, + "loss/crossentropy": 2.3935444951057434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20081859827041626, + "step": 5544 + }, + { + "epoch": 0.46216666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.10155843098958334, + "learning_rate": 4e-05, + "loss": 5.1131, + "loss/crossentropy": 2.4130229353904724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20666294917464256, + "step": 5546 + }, + { + "epoch": 0.4623333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.10748697916666666, + "learning_rate": 4e-05, + "loss": 4.7261, + "loss/crossentropy": 2.4770246744155884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202097252011299, + "step": 5548 + }, + { + "epoch": 0.4625, + "grad_norm": 4.71875, + "grad_norm_var": 0.118994140625, + "learning_rate": 4e-05, + "loss": 4.9954, + "loss/crossentropy": 1.7167896926403046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16790836118161678, + "step": 5550 + }, + { + "epoch": 0.46266666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.10950520833333334, + "learning_rate": 4e-05, + "loss": 5.0256, + "loss/crossentropy": 2.050472140312195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17495042085647583, + "step": 5552 + }, + { + "epoch": 0.4628333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.07980143229166667, + "learning_rate": 4e-05, + "loss": 5.3496, + "loss/crossentropy": 2.4477387070655823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21591287106275558, + "step": 5554 + }, + { + "epoch": 0.463, + "grad_norm": 5.25, + "grad_norm_var": 0.20500895182291667, + "learning_rate": 4e-05, + "loss": 5.2569, + "loss/crossentropy": 2.42499315738678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20981594547629356, + "step": 5556 + }, + { + "epoch": 0.46316666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.20562744140625, + "learning_rate": 4e-05, + "loss": 4.5267, + "loss/crossentropy": 1.9163185358047485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18663611635565758, + "step": 5558 + }, + { + "epoch": 0.4633333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.17222900390625, + "learning_rate": 4e-05, + "loss": 4.9847, + "loss/crossentropy": 2.0122427120804787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882903315126896, + "step": 5560 + }, + { + "epoch": 0.4635, + "grad_norm": 4.96875, + "grad_norm_var": 0.169775390625, + "learning_rate": 4e-05, + "loss": 5.2916, + "loss/crossentropy": 2.5393239855766296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253870852291584, + "step": 5562 + }, + { + "epoch": 0.46366666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.34957275390625, + "learning_rate": 4e-05, + "loss": 4.9768, + "loss/crossentropy": 2.6330828070640564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185838483273983, + "step": 5564 + }, + { + "epoch": 0.4638333333333333, + "grad_norm": 6.84375, + "grad_norm_var": 0.54713134765625, + "learning_rate": 4e-05, + "loss": 4.5474, + "loss/crossentropy": 1.7194873318076134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1570914275944233, + "step": 5566 + }, + { + "epoch": 0.464, + "grad_norm": 4.5625, + "grad_norm_var": 0.55894775390625, + "learning_rate": 4e-05, + "loss": 4.655, + "loss/crossentropy": 2.1608819663524628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23339637368917465, + "step": 5568 + }, + { + "epoch": 0.46416666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.56539306640625, + "learning_rate": 4e-05, + "loss": 5.0446, + "loss/crossentropy": 2.66201913356781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21940989419817924, + "step": 5570 + }, + { + "epoch": 0.4643333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.4955078125, + "learning_rate": 4e-05, + "loss": 5.2771, + "loss/crossentropy": 2.2475315630435944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19876830279827118, + "step": 5572 + }, + { + "epoch": 0.4645, + "grad_norm": 4.59375, + "grad_norm_var": 0.5167805989583333, + "learning_rate": 4e-05, + "loss": 4.3096, + "loss/crossentropy": 1.4486872777342796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1465356983244419, + "step": 5574 + }, + { + "epoch": 0.4646666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.5151652018229167, + "learning_rate": 4e-05, + "loss": 4.9242, + "loss/crossentropy": 2.028968036174774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18292693980038166, + "step": 5576 + }, + { + "epoch": 0.4648333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.5244425455729167, + "learning_rate": 4e-05, + "loss": 4.9943, + "loss/crossentropy": 2.321197360754013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20540225505828857, + "step": 5578 + }, + { + "epoch": 0.465, + "grad_norm": 4.8125, + "grad_norm_var": 0.3117472330729167, + "learning_rate": 4e-05, + "loss": 5.019, + "loss/crossentropy": 1.9598820507526398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18344413861632347, + "step": 5580 + }, + { + "epoch": 0.4651666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.058186848958333336, + "learning_rate": 4e-05, + "loss": 4.6168, + "loss/crossentropy": 2.1863655149936676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22168518975377083, + "step": 5582 + }, + { + "epoch": 0.4653333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.11962483723958334, + "learning_rate": 4e-05, + "loss": 4.7241, + "loss/crossentropy": 2.301854968070984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22621898725628853, + "step": 5584 + }, + { + "epoch": 0.4655, + "grad_norm": 4.96875, + "grad_norm_var": 0.12932535807291667, + "learning_rate": 4e-05, + "loss": 5.5068, + "loss/crossentropy": 2.373853385448456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21982931718230247, + "step": 5586 + }, + { + "epoch": 0.4656666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.11672770182291667, + "learning_rate": 4e-05, + "loss": 4.9399, + "loss/crossentropy": 1.9027044028043747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18401046097278595, + "step": 5588 + }, + { + "epoch": 0.4658333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.105712890625, + "learning_rate": 4e-05, + "loss": 5.1164, + "loss/crossentropy": 2.4487122297286987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22322659194469452, + "step": 5590 + }, + { + "epoch": 0.466, + "grad_norm": 5.125, + "grad_norm_var": 0.10530192057291667, + "learning_rate": 4e-05, + "loss": 5.1738, + "loss/crossentropy": 2.392216980457306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22900298237800598, + "step": 5592 + }, + { + "epoch": 0.4661666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.11092122395833333, + "learning_rate": 4e-05, + "loss": 5.0379, + "loss/crossentropy": 2.20323982834816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22030623257160187, + "step": 5594 + }, + { + "epoch": 0.4663333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.10611979166666667, + "learning_rate": 4e-05, + "loss": 5.1038, + "loss/crossentropy": 1.8584736064076424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.181080624461174, + "step": 5596 + }, + { + "epoch": 0.4665, + "grad_norm": 5.1875, + "grad_norm_var": 0.10846354166666666, + "learning_rate": 4e-05, + "loss": 4.5158, + "loss/crossentropy": 1.710656851530075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18867123126983643, + "step": 5598 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.04211832682291667, + "learning_rate": 4e-05, + "loss": 5.3547, + "loss/crossentropy": 2.108785852789879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18798251450061798, + "step": 5600 + }, + { + "epoch": 0.4668333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.04000244140625, + "learning_rate": 4e-05, + "loss": 4.6394, + "loss/crossentropy": 2.492998719215393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24159801751375198, + "step": 5602 + }, + { + "epoch": 0.467, + "grad_norm": 4.96875, + "grad_norm_var": 0.03899332682291667, + "learning_rate": 4e-05, + "loss": 5.0782, + "loss/crossentropy": 1.8815812170505524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166886031627655, + "step": 5604 + }, + { + "epoch": 0.4671666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.06315104166666667, + "learning_rate": 4e-05, + "loss": 4.5261, + "loss/crossentropy": 1.4593137428164482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.157625250518322, + "step": 5606 + }, + { + "epoch": 0.4673333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.06783854166666667, + "learning_rate": 4e-05, + "loss": 4.8489, + "loss/crossentropy": 2.531603217124939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19394199922680855, + "step": 5608 + }, + { + "epoch": 0.4675, + "grad_norm": 4.75, + "grad_norm_var": 0.06327718098958333, + "learning_rate": 4e-05, + "loss": 4.801, + "loss/crossentropy": 1.6611417457461357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18344333954155445, + "step": 5610 + }, + { + "epoch": 0.4676666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.06311442057291666, + "learning_rate": 4e-05, + "loss": 4.9979, + "loss/crossentropy": 2.3051935136318207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960429958999157, + "step": 5612 + }, + { + "epoch": 0.4678333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 4e-05, + "loss": 4.5293, + "loss/crossentropy": 1.8542626649141312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17750085517764091, + "step": 5614 + }, + { + "epoch": 0.468, + "grad_norm": 4.71875, + "grad_norm_var": 0.03982747395833333, + "learning_rate": 4e-05, + "loss": 5.0748, + "loss/crossentropy": 2.1633825600147247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869092732667923, + "step": 5616 + }, + { + "epoch": 0.4681666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.056380208333333334, + "learning_rate": 4e-05, + "loss": 4.757, + "loss/crossentropy": 1.6803074106574059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20325062051415443, + "step": 5618 + }, + { + "epoch": 0.4683333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.06015218098958333, + "learning_rate": 4e-05, + "loss": 5.0692, + "loss/crossentropy": 2.228565901517868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19540579989552498, + "step": 5620 + }, + { + "epoch": 0.4685, + "grad_norm": 5.40625, + "grad_norm_var": 0.06326497395833333, + "learning_rate": 4e-05, + "loss": 4.9955, + "loss/crossentropy": 2.2330249547958374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2226766161620617, + "step": 5622 + }, + { + "epoch": 0.4686666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.05245768229166667, + "learning_rate": 4e-05, + "loss": 4.9578, + "loss/crossentropy": 1.7934229224920273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1788601279258728, + "step": 5624 + }, + { + "epoch": 0.4688333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.05230712890625, + "learning_rate": 4e-05, + "loss": 4.8806, + "loss/crossentropy": 1.8773228824138641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17370744049549103, + "step": 5626 + }, + { + "epoch": 0.469, + "grad_norm": 4.65625, + "grad_norm_var": 0.04778645833333333, + "learning_rate": 4e-05, + "loss": 5.0076, + "loss/crossentropy": 2.1669468581676483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194295935332775, + "step": 5628 + }, + { + "epoch": 0.4691666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.04664306640625, + "learning_rate": 4e-05, + "loss": 4.7817, + "loss/crossentropy": 1.9204804003238678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20061272010207176, + "step": 5630 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.04895426432291667, + "learning_rate": 4e-05, + "loss": 4.5336, + "loss/crossentropy": 1.5425023213028908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17089655436575413, + "step": 5632 + }, + { + "epoch": 0.4695, + "grad_norm": 5.125, + "grad_norm_var": 0.04635009765625, + "learning_rate": 4e-05, + "loss": 5.03, + "loss/crossentropy": 2.249193251132965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20843612030148506, + "step": 5634 + }, + { + "epoch": 0.4696666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.04583333333333333, + "learning_rate": 4e-05, + "loss": 4.9591, + "loss/crossentropy": 2.4124104380607605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710863336920738, + "step": 5636 + }, + { + "epoch": 0.4698333333333333, + "grad_norm": 5.9375, + "grad_norm_var": 0.10232747395833333, + "learning_rate": 4e-05, + "loss": 5.175, + "loss/crossentropy": 1.8956755921244621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18736336380243301, + "step": 5638 + }, + { + "epoch": 0.47, + "grad_norm": 4.75, + "grad_norm_var": 0.10364583333333334, + "learning_rate": 4e-05, + "loss": 4.5773, + "loss/crossentropy": 2.1982105374336243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23227747902274132, + "step": 5640 + }, + { + "epoch": 0.4701666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.10295817057291666, + "learning_rate": 4e-05, + "loss": 4.6193, + "loss/crossentropy": 1.927461177110672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18918763473629951, + "step": 5642 + }, + { + "epoch": 0.4703333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.14256184895833332, + "learning_rate": 4e-05, + "loss": 4.8908, + "loss/crossentropy": 2.513116717338562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22445828840136528, + "step": 5644 + }, + { + "epoch": 0.4705, + "grad_norm": 4.84375, + "grad_norm_var": 0.13472900390625, + "learning_rate": 4e-05, + "loss": 5.4053, + "loss/crossentropy": 1.8445745781064034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20689787901937962, + "step": 5646 + }, + { + "epoch": 0.4706666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.14055989583333334, + "learning_rate": 4e-05, + "loss": 4.5119, + "loss/crossentropy": 2.787231981754303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22696134075522423, + "step": 5648 + }, + { + "epoch": 0.4708333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.14680582682291668, + "learning_rate": 4e-05, + "loss": 4.6691, + "loss/crossentropy": 1.9298951923847198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20483119785785675, + "step": 5650 + }, + { + "epoch": 0.471, + "grad_norm": 5.28125, + "grad_norm_var": 0.16521809895833334, + "learning_rate": 4e-05, + "loss": 4.5899, + "loss/crossentropy": 1.9479724541306496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18522769957780838, + "step": 5652 + }, + { + "epoch": 0.4711666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.12180989583333333, + "learning_rate": 4e-05, + "loss": 4.9636, + "loss/crossentropy": 1.995558775961399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19092020578682423, + "step": 5654 + }, + { + "epoch": 0.4713333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.11763916015625, + "learning_rate": 4e-05, + "loss": 4.9866, + "loss/crossentropy": 1.3711708784103394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17136229574680328, + "step": 5656 + }, + { + "epoch": 0.4715, + "grad_norm": 4.84375, + "grad_norm_var": 0.115087890625, + "learning_rate": 4e-05, + "loss": 4.7872, + "loss/crossentropy": 2.1775683164596558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25280786678195, + "step": 5658 + }, + { + "epoch": 0.4716666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.06936442057291667, + "learning_rate": 4e-05, + "loss": 5.4841, + "loss/crossentropy": 2.1796337962150574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797861486673355, + "step": 5660 + }, + { + "epoch": 0.4718333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.09254150390625, + "learning_rate": 4e-05, + "loss": 4.8985, + "loss/crossentropy": 1.956167332828045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071479931473732, + "step": 5662 + }, + { + "epoch": 0.472, + "grad_norm": 4.9375, + "grad_norm_var": 0.08229166666666667, + "learning_rate": 4e-05, + "loss": 4.5594, + "loss/crossentropy": 1.6870819255709648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18948296085000038, + "step": 5664 + }, + { + "epoch": 0.4721666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.08292643229166667, + "learning_rate": 4e-05, + "loss": 4.3645, + "loss/crossentropy": 1.7608007118105888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16702809929847717, + "step": 5666 + }, + { + "epoch": 0.4723333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.07180582682291667, + "learning_rate": 4e-05, + "loss": 4.3888, + "loss/crossentropy": 1.1737506687641144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15226943045854568, + "step": 5668 + }, + { + "epoch": 0.4725, + "grad_norm": 5.0625, + "grad_norm_var": 0.050244140625, + "learning_rate": 4e-05, + "loss": 4.9121, + "loss/crossentropy": 2.3276381492614746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23176956549286842, + "step": 5670 + }, + { + "epoch": 0.4726666666666667, + "grad_norm": 8.375, + "grad_norm_var": 0.77730712890625, + "learning_rate": 4e-05, + "loss": 5.1473, + "loss/crossentropy": 1.7255630418658257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22438404709100723, + "step": 5672 + }, + { + "epoch": 0.4728333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.7973958333333333, + "learning_rate": 4e-05, + "loss": 4.6398, + "loss/crossentropy": 1.3483816534280777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1676221415400505, + "step": 5674 + }, + { + "epoch": 0.473, + "grad_norm": 5.5, + "grad_norm_var": 0.8156087239583333, + "learning_rate": 4e-05, + "loss": 5.5553, + "loss/crossentropy": 2.0339736565947533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17984124645590782, + "step": 5676 + }, + { + "epoch": 0.4731666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.7845052083333334, + "learning_rate": 4e-05, + "loss": 4.8993, + "loss/crossentropy": 1.665101781487465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18875110894441605, + "step": 5678 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.7824055989583333, + "learning_rate": 4e-05, + "loss": 4.9043, + "loss/crossentropy": 1.4573597237467766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1709212139248848, + "step": 5680 + }, + { + "epoch": 0.4735, + "grad_norm": 4.65625, + "grad_norm_var": 0.8001261393229167, + "learning_rate": 4e-05, + "loss": 5.3516, + "loss/crossentropy": 2.237562984228134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515824109315872, + "step": 5682 + }, + { + "epoch": 0.4736666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.8140462239583334, + "learning_rate": 4e-05, + "loss": 4.329, + "loss/crossentropy": 1.8115737065672874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17292770743370056, + "step": 5684 + }, + { + "epoch": 0.47383333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.84068603515625, + "learning_rate": 4e-05, + "loss": 4.4173, + "loss/crossentropy": 1.8549021109938622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1680241972208023, + "step": 5686 + }, + { + "epoch": 0.474, + "grad_norm": 4.8125, + "grad_norm_var": 0.11907552083333334, + "learning_rate": 4e-05, + "loss": 4.6369, + "loss/crossentropy": 1.2896523252129555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13351555354893208, + "step": 5688 + }, + { + "epoch": 0.4741666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.10696207682291667, + "learning_rate": 4e-05, + "loss": 4.8859, + "loss/crossentropy": 1.4676887169480324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1461086068302393, + "step": 5690 + }, + { + "epoch": 0.47433333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.09140625, + "learning_rate": 4e-05, + "loss": 5.577, + "loss/crossentropy": 1.770861029624939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1623640414327383, + "step": 5692 + }, + { + "epoch": 0.4745, + "grad_norm": 5.21875, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 4e-05, + "loss": 5.0672, + "loss/crossentropy": 2.0532081723213196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289374724030495, + "step": 5694 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 5.625, + "grad_norm_var": 0.08800455729166666, + "learning_rate": 4e-05, + "loss": 5.2478, + "loss/crossentropy": 1.9786882400512695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24169421941041946, + "step": 5696 + }, + { + "epoch": 0.47483333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.086181640625, + "learning_rate": 4e-05, + "loss": 4.8398, + "loss/crossentropy": 2.2744025588035583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21215546131134033, + "step": 5698 + }, + { + "epoch": 0.475, + "grad_norm": 5.21875, + "grad_norm_var": 0.07922770182291666, + "learning_rate": 4e-05, + "loss": 5.4644, + "loss/crossentropy": 2.2868226170539856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20553449168801308, + "step": 5700 + }, + { + "epoch": 0.4751666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.09225260416666667, + "learning_rate": 4e-05, + "loss": 4.792, + "loss/crossentropy": 2.4458898305892944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21513555943965912, + "step": 5702 + }, + { + "epoch": 0.47533333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.09256184895833333, + "learning_rate": 4e-05, + "loss": 4.9897, + "loss/crossentropy": 1.9003973007202148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19777457788586617, + "step": 5704 + }, + { + "epoch": 0.4755, + "grad_norm": 4.65625, + "grad_norm_var": 0.099853515625, + "learning_rate": 4e-05, + "loss": 4.1787, + "loss/crossentropy": 2.359318822622299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146679669618607, + "step": 5706 + }, + { + "epoch": 0.4756666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.096337890625, + "learning_rate": 4e-05, + "loss": 4.6372, + "loss/crossentropy": 2.3056346774101257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23220019042491913, + "step": 5708 + }, + { + "epoch": 0.47583333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.09208577473958333, + "learning_rate": 4e-05, + "loss": 5.5974, + "loss/crossentropy": 2.6773802042007446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22551625221967697, + "step": 5710 + }, + { + "epoch": 0.476, + "grad_norm": 5.1875, + "grad_norm_var": 0.061572265625, + "learning_rate": 4e-05, + "loss": 4.952, + "loss/crossentropy": 2.311539113521576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19972623512148857, + "step": 5712 + }, + { + "epoch": 0.4761666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05826416015625, + "learning_rate": 4e-05, + "loss": 4.5923, + "loss/crossentropy": 1.410909503698349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15724024921655655, + "step": 5714 + }, + { + "epoch": 0.47633333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.08722330729166666, + "learning_rate": 4e-05, + "loss": 4.6575, + "loss/crossentropy": 2.465664803981781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142348736524582, + "step": 5716 + }, + { + "epoch": 0.4765, + "grad_norm": 4.6875, + "grad_norm_var": 0.10777587890625, + "learning_rate": 4e-05, + "loss": 4.4435, + "loss/crossentropy": 2.299306809902191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054593190550804, + "step": 5718 + }, + { + "epoch": 0.4766666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.13987223307291666, + "learning_rate": 4e-05, + "loss": 4.5301, + "loss/crossentropy": 1.9581206738948822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18420690298080444, + "step": 5720 + }, + { + "epoch": 0.47683333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.13697916666666668, + "learning_rate": 4e-05, + "loss": 4.7601, + "loss/crossentropy": 1.5340687707066536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17616295255720615, + "step": 5722 + }, + { + "epoch": 0.477, + "grad_norm": 4.59375, + "grad_norm_var": 0.17867431640625, + "learning_rate": 4e-05, + "loss": 3.8758, + "loss/crossentropy": 0.8211743906140327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12213561870157719, + "step": 5724 + }, + { + "epoch": 0.4771666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.17720947265625, + "learning_rate": 4e-05, + "loss": 5.1329, + "loss/crossentropy": 2.3268213868141174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231154851615429, + "step": 5726 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.18736979166666667, + "learning_rate": 4e-05, + "loss": 5.0385, + "loss/crossentropy": 2.3459609746932983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21290796995162964, + "step": 5728 + }, + { + "epoch": 0.4775, + "grad_norm": 5.375, + "grad_norm_var": 0.21158854166666666, + "learning_rate": 4e-05, + "loss": 5.4045, + "loss/crossentropy": 2.478249251842499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360737085342407, + "step": 5730 + }, + { + "epoch": 0.4776666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.1826171875, + "learning_rate": 4e-05, + "loss": 5.2075, + "loss/crossentropy": 2.6454553604125977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2205870822072029, + "step": 5732 + }, + { + "epoch": 0.47783333333333333, + "grad_norm": 5.625, + "grad_norm_var": 0.18603108723958334, + "learning_rate": 4e-05, + "loss": 4.5918, + "loss/crossentropy": 2.2463018894195557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118573747575283, + "step": 5734 + }, + { + "epoch": 0.478, + "grad_norm": 4.96875, + "grad_norm_var": 0.15319010416666667, + "learning_rate": 4e-05, + "loss": 4.457, + "loss/crossentropy": 1.66935233771801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1990772783756256, + "step": 5736 + }, + { + "epoch": 0.4781666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.16799723307291667, + "learning_rate": 4e-05, + "loss": 5.0837, + "loss/crossentropy": 2.6120508909225464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278040051460266, + "step": 5738 + }, + { + "epoch": 0.47833333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.10572916666666667, + "learning_rate": 4e-05, + "loss": 5.4204, + "loss/crossentropy": 2.6417598128318787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22373153269290924, + "step": 5740 + }, + { + "epoch": 0.4785, + "grad_norm": 4.8125, + "grad_norm_var": 0.12102457682291666, + "learning_rate": 4e-05, + "loss": 4.3854, + "loss/crossentropy": 2.144157826900482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20237886905670166, + "step": 5742 + }, + { + "epoch": 0.4786666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.10663655598958334, + "learning_rate": 4e-05, + "loss": 4.6867, + "loss/crossentropy": 2.346675455570221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053035832941532, + "step": 5744 + }, + { + "epoch": 0.47883333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.1123046875, + "learning_rate": 4e-05, + "loss": 5.184, + "loss/crossentropy": 2.6332274079322815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171645276248455, + "step": 5746 + }, + { + "epoch": 0.479, + "grad_norm": 4.875, + "grad_norm_var": 0.131103515625, + "learning_rate": 4e-05, + "loss": 4.0919, + "loss/crossentropy": 1.4563121870160103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14949826709926128, + "step": 5748 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.10089518229166666, + "learning_rate": 4e-05, + "loss": 5.0827, + "loss/crossentropy": 2.034213662147522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20901034027338028, + "step": 5750 + }, + { + "epoch": 0.47933333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.10390625, + "learning_rate": 4e-05, + "loss": 4.4502, + "loss/crossentropy": 2.4095794558525085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22046563774347305, + "step": 5752 + }, + { + "epoch": 0.4795, + "grad_norm": 5.3125, + "grad_norm_var": 0.09759114583333334, + "learning_rate": 4e-05, + "loss": 5.2505, + "loss/crossentropy": 2.034136213362217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787286102771759, + "step": 5754 + }, + { + "epoch": 0.4796666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.09998372395833334, + "learning_rate": 4e-05, + "loss": 5.1684, + "loss/crossentropy": 2.084101490676403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18571361154317856, + "step": 5756 + }, + { + "epoch": 0.47983333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.09599202473958333, + "learning_rate": 4e-05, + "loss": 4.8376, + "loss/crossentropy": 1.6878532022237778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.179075812920928, + "step": 5758 + }, + { + "epoch": 0.48, + "grad_norm": 4.84375, + "grad_norm_var": 0.095947265625, + "learning_rate": 4e-05, + "loss": 4.6523, + "loss/crossentropy": 1.9728622436523438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1716511808335781, + "step": 5760 + }, + { + "epoch": 0.4801666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.09329427083333333, + "learning_rate": 4e-05, + "loss": 4.7272, + "loss/crossentropy": 1.4189490303397179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16327936947345734, + "step": 5762 + }, + { + "epoch": 0.48033333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.07408854166666666, + "learning_rate": 4e-05, + "loss": 5.0876, + "loss/crossentropy": 2.339596748352051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20580720156431198, + "step": 5764 + }, + { + "epoch": 0.4805, + "grad_norm": 5.59375, + "grad_norm_var": 0.09114583333333333, + "learning_rate": 4e-05, + "loss": 5.0123, + "loss/crossentropy": 1.7912172004580498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17723418772220612, + "step": 5766 + }, + { + "epoch": 0.4806666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.089697265625, + "learning_rate": 4e-05, + "loss": 4.9938, + "loss/crossentropy": 1.4451691582798958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14382265135645866, + "step": 5768 + }, + { + "epoch": 0.48083333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08567708333333333, + "learning_rate": 4e-05, + "loss": 4.8534, + "loss/crossentropy": 1.743345096707344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15984099358320236, + "step": 5770 + }, + { + "epoch": 0.481, + "grad_norm": 4.84375, + "grad_norm_var": 0.07342122395833334, + "learning_rate": 4e-05, + "loss": 4.7223, + "loss/crossentropy": 2.2444933652877808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19508197531104088, + "step": 5772 + }, + { + "epoch": 0.4811666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.07095947265625, + "learning_rate": 4e-05, + "loss": 4.6951, + "loss/crossentropy": 1.3921042084693909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14450440928339958, + "step": 5774 + }, + { + "epoch": 0.48133333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.07554931640625, + "learning_rate": 4e-05, + "loss": 5.5692, + "loss/crossentropy": 2.309263288974762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22191011533141136, + "step": 5776 + }, + { + "epoch": 0.4815, + "grad_norm": 5.15625, + "grad_norm_var": 0.06796875, + "learning_rate": 4e-05, + "loss": 5.0408, + "loss/crossentropy": 1.2080154567956924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14104336686432362, + "step": 5778 + }, + { + "epoch": 0.4816666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.06402587890625, + "learning_rate": 4e-05, + "loss": 4.5586, + "loss/crossentropy": 2.0010958090424538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1737840212881565, + "step": 5780 + }, + { + "epoch": 0.48183333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.047265625, + "learning_rate": 4e-05, + "loss": 4.8221, + "loss/crossentropy": 2.0100313425064087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19126557931303978, + "step": 5782 + }, + { + "epoch": 0.482, + "grad_norm": 5.09375, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 4e-05, + "loss": 5.491, + "loss/crossentropy": 1.8984070345759392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18304388225078583, + "step": 5784 + }, + { + "epoch": 0.4821666666666667, + "grad_norm": 5.90625, + "grad_norm_var": 0.0921875, + "learning_rate": 4e-05, + "loss": 5.3534, + "loss/crossentropy": 1.232503592967987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13390603102743626, + "step": 5786 + }, + { + "epoch": 0.48233333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.09010009765625, + "learning_rate": 4e-05, + "loss": 4.7457, + "loss/crossentropy": 1.4511554315686226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1644621528685093, + "step": 5788 + }, + { + "epoch": 0.4825, + "grad_norm": 4.71875, + "grad_norm_var": 0.08827718098958333, + "learning_rate": 4e-05, + "loss": 4.4682, + "loss/crossentropy": 1.909536212682724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18038103729486465, + "step": 5790 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.08603108723958333, + "learning_rate": 4e-05, + "loss": 4.6379, + "loss/crossentropy": 1.01704840362072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1405244879424572, + "step": 5792 + }, + { + "epoch": 0.48283333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.08162434895833333, + "learning_rate": 4e-05, + "loss": 5.1905, + "loss/crossentropy": 1.8805341720581055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17578154057264328, + "step": 5794 + }, + { + "epoch": 0.483, + "grad_norm": 4.96875, + "grad_norm_var": 0.08098958333333334, + "learning_rate": 4e-05, + "loss": 5.0812, + "loss/crossentropy": 2.587492525577545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23445945233106613, + "step": 5796 + }, + { + "epoch": 0.4831666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.07029622395833333, + "learning_rate": 4e-05, + "loss": 5.2722, + "loss/crossentropy": 1.608366496860981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2221035696566105, + "step": 5798 + }, + { + "epoch": 0.48333333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.1158203125, + "learning_rate": 4e-05, + "loss": 4.206, + "loss/crossentropy": 1.8016544580459595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18007242307066917, + "step": 5800 + }, + { + "epoch": 0.4835, + "grad_norm": 5.0625, + "grad_norm_var": 0.052197265625, + "learning_rate": 4e-05, + "loss": 5.1328, + "loss/crossentropy": 1.5784991532564163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1633721888065338, + "step": 5802 + }, + { + "epoch": 0.4836666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.05338134765625, + "learning_rate": 4e-05, + "loss": 5.2282, + "loss/crossentropy": 2.152147799730301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19616486132144928, + "step": 5804 + }, + { + "epoch": 0.48383333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.053055826822916666, + "learning_rate": 4e-05, + "loss": 4.6495, + "loss/crossentropy": 1.6643903106451035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19107349775731564, + "step": 5806 + }, + { + "epoch": 0.484, + "grad_norm": 4.78125, + "grad_norm_var": 0.05358072916666667, + "learning_rate": 4e-05, + "loss": 4.4893, + "loss/crossentropy": 1.6722280532121658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17675404995679855, + "step": 5808 + }, + { + "epoch": 0.4841666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.06399739583333333, + "learning_rate": 4e-05, + "loss": 4.9003, + "loss/crossentropy": 2.3588092923164368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119477353990078, + "step": 5810 + }, + { + "epoch": 0.48433333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.06327718098958333, + "learning_rate": 4e-05, + "loss": 4.5094, + "loss/crossentropy": 1.411509931087494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15218713879585266, + "step": 5812 + }, + { + "epoch": 0.4845, + "grad_norm": 4.84375, + "grad_norm_var": 0.18619384765625, + "learning_rate": 4e-05, + "loss": 4.9774, + "loss/crossentropy": 1.8252098262310028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2733977921307087, + "step": 5814 + }, + { + "epoch": 0.4846666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.17486979166666666, + "learning_rate": 4e-05, + "loss": 4.4479, + "loss/crossentropy": 1.9259876608848572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18517810851335526, + "step": 5816 + }, + { + "epoch": 0.48483333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.17278238932291667, + "learning_rate": 4e-05, + "loss": 4.8732, + "loss/crossentropy": 1.6283398121595383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14427206851541996, + "step": 5818 + }, + { + "epoch": 0.485, + "grad_norm": 4.9375, + "grad_norm_var": 0.16829020182291668, + "learning_rate": 4e-05, + "loss": 5.2812, + "loss/crossentropy": 1.330165982246399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16302420757710934, + "step": 5820 + }, + { + "epoch": 0.4851666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.18448893229166666, + "learning_rate": 4e-05, + "loss": 5.1328, + "loss/crossentropy": 1.5612802058458328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18193678371608257, + "step": 5822 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.18346354166666667, + "learning_rate": 4e-05, + "loss": 5.1239, + "loss/crossentropy": 2.374065101146698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2208290696144104, + "step": 5824 + }, + { + "epoch": 0.4855, + "grad_norm": 4.6875, + "grad_norm_var": 0.200390625, + "learning_rate": 4e-05, + "loss": 4.663, + "loss/crossentropy": 1.4791902005672455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15314685553312302, + "step": 5826 + }, + { + "epoch": 0.4856666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.19998372395833333, + "learning_rate": 4e-05, + "loss": 5.0844, + "loss/crossentropy": 1.3942490443587303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17971058189868927, + "step": 5828 + }, + { + "epoch": 0.48583333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.09544270833333333, + "learning_rate": 4e-05, + "loss": 5.1706, + "loss/crossentropy": 2.2101835906505585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22206401452422142, + "step": 5830 + }, + { + "epoch": 0.486, + "grad_norm": 4.8125, + "grad_norm_var": 0.06829427083333334, + "learning_rate": 4e-05, + "loss": 4.9569, + "loss/crossentropy": 1.806466780602932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16677996329963207, + "step": 5832 + }, + { + "epoch": 0.4861666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.08826497395833334, + "learning_rate": 4e-05, + "loss": 4.8622, + "loss/crossentropy": 1.182845950126648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15750311501324177, + "step": 5834 + }, + { + "epoch": 0.48633333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.09351806640625, + "learning_rate": 4e-05, + "loss": 4.9245, + "loss/crossentropy": 1.6443369388580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15455692261457443, + "step": 5836 + }, + { + "epoch": 0.4865, + "grad_norm": 4.9375, + "grad_norm_var": 0.130712890625, + "learning_rate": 4e-05, + "loss": 5.2779, + "loss/crossentropy": 2.343976229429245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20677870512008667, + "step": 5838 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 3.9375, + "grad_norm_var": 0.20206705729166666, + "learning_rate": 4e-05, + "loss": 4.0648, + "loss/crossentropy": 1.9890966042876244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19900896027684212, + "step": 5840 + }, + { + "epoch": 0.48683333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.17771809895833332, + "learning_rate": 4e-05, + "loss": 4.9045, + "loss/crossentropy": 1.9553634375333786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19644594565033913, + "step": 5842 + }, + { + "epoch": 0.487, + "grad_norm": 4.6875, + "grad_norm_var": 0.1740234375, + "learning_rate": 4e-05, + "loss": 4.5715, + "loss/crossentropy": 1.8074621483683586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17907745763659477, + "step": 5844 + }, + { + "epoch": 0.4871666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.16474202473958333, + "learning_rate": 4e-05, + "loss": 4.7783, + "loss/crossentropy": 2.4416297674179077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168984845280647, + "step": 5846 + }, + { + "epoch": 0.48733333333333334, + "grad_norm": 5.46875, + "grad_norm_var": 0.18603108723958334, + "learning_rate": 4e-05, + "loss": 4.6668, + "loss/crossentropy": 2.470840334892273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23379643633961678, + "step": 5848 + }, + { + "epoch": 0.4875, + "grad_norm": 5.15625, + "grad_norm_var": 0.16584879557291668, + "learning_rate": 4e-05, + "loss": 5.2838, + "loss/crossentropy": 2.0527156069874763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883339285850525, + "step": 5850 + }, + { + "epoch": 0.4876666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.17708333333333334, + "learning_rate": 4e-05, + "loss": 5.0268, + "loss/crossentropy": 2.331518530845642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710406616330147, + "step": 5852 + }, + { + "epoch": 0.48783333333333334, + "grad_norm": 5.875, + "grad_norm_var": 0.18625895182291666, + "learning_rate": 4e-05, + "loss": 4.8177, + "loss/crossentropy": 2.0739459693431854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22978762909770012, + "step": 5854 + }, + { + "epoch": 0.488, + "grad_norm": 4.71875, + "grad_norm_var": 0.11404622395833333, + "learning_rate": 4e-05, + "loss": 4.6849, + "loss/crossentropy": 1.164167359471321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13028892129659653, + "step": 5856 + }, + { + "epoch": 0.4881666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.11591389973958334, + "learning_rate": 4e-05, + "loss": 5.3475, + "loss/crossentropy": 2.5920631885528564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20778515562415123, + "step": 5858 + }, + { + "epoch": 0.48833333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.10546468098958334, + "learning_rate": 4e-05, + "loss": 5.1718, + "loss/crossentropy": 1.9089709669351578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964315064251423, + "step": 5860 + }, + { + "epoch": 0.4885, + "grad_norm": 5.0, + "grad_norm_var": 0.09967041015625, + "learning_rate": 4e-05, + "loss": 5.2793, + "loss/crossentropy": 1.874973475933075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18953284621238708, + "step": 5862 + }, + { + "epoch": 0.4886666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.09140218098958333, + "learning_rate": 4e-05, + "loss": 5.2577, + "loss/crossentropy": 2.303260922431946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19869283586740494, + "step": 5864 + }, + { + "epoch": 0.48883333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.09527587890625, + "learning_rate": 4e-05, + "loss": 4.5316, + "loss/crossentropy": 1.5516202598810196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037242390215397, + "step": 5866 + }, + { + "epoch": 0.489, + "grad_norm": 4.875, + "grad_norm_var": 0.08443603515625, + "learning_rate": 4e-05, + "loss": 4.3666, + "loss/crossentropy": 1.716199368238449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23071671649813652, + "step": 5868 + }, + { + "epoch": 0.4891666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 4e-05, + "loss": 5.1654, + "loss/crossentropy": 1.994306929409504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18211017921566963, + "step": 5870 + }, + { + "epoch": 0.48933333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.029020182291666665, + "learning_rate": 4e-05, + "loss": 4.7998, + "loss/crossentropy": 1.303352952003479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16013509593904018, + "step": 5872 + }, + { + "epoch": 0.4895, + "grad_norm": 4.8125, + "grad_norm_var": 0.02847900390625, + "learning_rate": 4e-05, + "loss": 5.3041, + "loss/crossentropy": 1.8330368399620056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1668311320245266, + "step": 5874 + }, + { + "epoch": 0.48966666666666664, + "grad_norm": 4.65625, + "grad_norm_var": 0.031640625, + "learning_rate": 4e-05, + "loss": 4.8962, + "loss/crossentropy": 2.1987491250038147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968229003250599, + "step": 5876 + }, + { + "epoch": 0.48983333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.023942057291666666, + "learning_rate": 4e-05, + "loss": 4.7749, + "loss/crossentropy": 2.274143636226654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117280811071396, + "step": 5878 + }, + { + "epoch": 0.49, + "grad_norm": 4.96875, + "grad_norm_var": 0.027762858072916667, + "learning_rate": 4e-05, + "loss": 4.2549, + "loss/crossentropy": 1.1879191398620605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13721632584929466, + "step": 5880 + }, + { + "epoch": 0.49016666666666664, + "grad_norm": 4.8125, + "grad_norm_var": 0.025113932291666665, + "learning_rate": 4e-05, + "loss": 4.8011, + "loss/crossentropy": 1.788271814584732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16815048828721046, + "step": 5882 + }, + { + "epoch": 0.49033333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.03131510416666667, + "learning_rate": 4e-05, + "loss": 4.6725, + "loss/crossentropy": 1.9695579707622528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19962314143776894, + "step": 5884 + }, + { + "epoch": 0.4905, + "grad_norm": 4.5, + "grad_norm_var": 0.03216145833333333, + "learning_rate": 4e-05, + "loss": 4.3649, + "loss/crossentropy": 1.6603060066699982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18644647859036922, + "step": 5886 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 4.8125, + "grad_norm_var": 0.036572265625, + "learning_rate": 4e-05, + "loss": 5.1558, + "loss/crossentropy": 1.9313186779618263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20377793721854687, + "step": 5888 + }, + { + "epoch": 0.49083333333333334, + "grad_norm": 5.5, + "grad_norm_var": 0.06510009765625, + "learning_rate": 4e-05, + "loss": 4.7934, + "loss/crossentropy": 1.7631231471896172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17797018960118294, + "step": 5890 + }, + { + "epoch": 0.491, + "grad_norm": 4.53125, + "grad_norm_var": 0.07079671223958334, + "learning_rate": 4e-05, + "loss": 4.7736, + "loss/crossentropy": 1.8846217468380928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16871808841824532, + "step": 5892 + }, + { + "epoch": 0.49116666666666664, + "grad_norm": 5.125, + "grad_norm_var": 0.071875, + "learning_rate": 4e-05, + "loss": 5.2073, + "loss/crossentropy": 2.2401039600372314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20969317108392715, + "step": 5894 + }, + { + "epoch": 0.49133333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.07029622395833333, + "learning_rate": 4e-05, + "loss": 5.0011, + "loss/crossentropy": 1.9672669917345047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19397153332829475, + "step": 5896 + }, + { + "epoch": 0.4915, + "grad_norm": 4.59375, + "grad_norm_var": 0.07460530598958333, + "learning_rate": 4e-05, + "loss": 4.0801, + "loss/crossentropy": 0.41661109030246735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.0850940328091383, + "step": 5898 + }, + { + "epoch": 0.49166666666666664, + "grad_norm": 4.96875, + "grad_norm_var": 0.0677734375, + "learning_rate": 4e-05, + "loss": 4.9046, + "loss/crossentropy": 1.5046052262187004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13949467055499554, + "step": 5900 + }, + { + "epoch": 0.49183333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.07603759765625, + "learning_rate": 4e-05, + "loss": 4.4854, + "loss/crossentropy": 1.941825695335865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18303821794688702, + "step": 5902 + }, + { + "epoch": 0.492, + "grad_norm": 4.875, + "grad_norm_var": 0.080712890625, + "learning_rate": 4e-05, + "loss": 4.8065, + "loss/crossentropy": 1.7237118035554886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17298521287739277, + "step": 5904 + }, + { + "epoch": 0.49216666666666664, + "grad_norm": 4.875, + "grad_norm_var": 0.06565348307291667, + "learning_rate": 4e-05, + "loss": 4.8444, + "loss/crossentropy": 1.5659492053091526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17988939210772514, + "step": 5906 + }, + { + "epoch": 0.49233333333333335, + "grad_norm": 4.78125, + "grad_norm_var": 0.059794108072916664, + "learning_rate": 4e-05, + "loss": 4.7725, + "loss/crossentropy": 2.2780866622924805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22615566104650497, + "step": 5908 + }, + { + "epoch": 0.4925, + "grad_norm": 4.90625, + "grad_norm_var": 0.06047770182291667, + "learning_rate": 4e-05, + "loss": 4.6949, + "loss/crossentropy": 2.317236602306366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197988148778677, + "step": 5910 + }, + { + "epoch": 0.49266666666666664, + "grad_norm": 4.875, + "grad_norm_var": 0.05944010416666667, + "learning_rate": 4e-05, + "loss": 4.452, + "loss/crossentropy": 1.369862139225006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1475118100643158, + "step": 5912 + }, + { + "epoch": 0.49283333333333335, + "grad_norm": 4.8125, + "grad_norm_var": 0.06951497395833334, + "learning_rate": 4e-05, + "loss": 4.9848, + "loss/crossentropy": 1.3414551764726639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16197285428643227, + "step": 5914 + }, + { + "epoch": 0.493, + "grad_norm": 4.96875, + "grad_norm_var": 0.09034830729166667, + "learning_rate": 4e-05, + "loss": 5.1543, + "loss/crossentropy": 2.490071475505829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21345427632331848, + "step": 5916 + }, + { + "epoch": 0.49316666666666664, + "grad_norm": 4.75, + "grad_norm_var": 0.06568603515625, + "learning_rate": 4e-05, + "loss": 5.0928, + "loss/crossentropy": 2.5531476736068726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24787741899490356, + "step": 5918 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 5.0, + "grad_norm_var": 0.060139973958333336, + "learning_rate": 4e-05, + "loss": 5.1398, + "loss/crossentropy": 1.9292317777872086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17992111667990685, + "step": 5920 + }, + { + "epoch": 0.4935, + "grad_norm": 4.71875, + "grad_norm_var": 0.07379150390625, + "learning_rate": 4e-05, + "loss": 5.0046, + "loss/crossentropy": 2.194719046354294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2291640229523182, + "step": 5922 + }, + { + "epoch": 0.49366666666666664, + "grad_norm": 5.09375, + "grad_norm_var": 0.07433268229166666, + "learning_rate": 4e-05, + "loss": 5.3898, + "loss/crossentropy": 2.461084246635437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223949883133173, + "step": 5924 + }, + { + "epoch": 0.49383333333333335, + "grad_norm": 5.34375, + "grad_norm_var": 0.07948811848958333, + "learning_rate": 4e-05, + "loss": 5.0161, + "loss/crossentropy": 1.2005824074149132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12999506667256355, + "step": 5926 + }, + { + "epoch": 0.494, + "grad_norm": 4.65625, + "grad_norm_var": 0.08186442057291667, + "learning_rate": 4e-05, + "loss": 5.2245, + "loss/crossentropy": 1.230931095778942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1690246555954218, + "step": 5928 + }, + { + "epoch": 0.49416666666666664, + "grad_norm": 5.09375, + "grad_norm_var": 0.07248942057291667, + "learning_rate": 4e-05, + "loss": 5.0945, + "loss/crossentropy": 2.3888412415981293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041553296148777, + "step": 5930 + }, + { + "epoch": 0.49433333333333335, + "grad_norm": 4.96875, + "grad_norm_var": 0.05597330729166667, + "learning_rate": 4e-05, + "loss": 4.8557, + "loss/crossentropy": 1.6737447902560234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22364658117294312, + "step": 5932 + }, + { + "epoch": 0.4945, + "grad_norm": 4.6875, + "grad_norm_var": 0.06412760416666667, + "learning_rate": 4e-05, + "loss": 4.6837, + "loss/crossentropy": 2.177261143922806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21531370282173157, + "step": 5934 + }, + { + "epoch": 0.49466666666666664, + "grad_norm": 4.5625, + "grad_norm_var": 0.068359375, + "learning_rate": 4e-05, + "loss": 4.3736, + "loss/crossentropy": 1.9087681472301483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18165162578225136, + "step": 5936 + }, + { + "epoch": 0.49483333333333335, + "grad_norm": 4.6875, + "grad_norm_var": 0.08919270833333333, + "learning_rate": 4e-05, + "loss": 4.684, + "loss/crossentropy": 2.2595274448394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23523679375648499, + "step": 5938 + }, + { + "epoch": 0.495, + "grad_norm": 6.6875, + "grad_norm_var": 0.2752888997395833, + "learning_rate": 4e-05, + "loss": 4.3276, + "loss/crossentropy": 1.8927638530731201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26145458966493607, + "step": 5940 + }, + { + "epoch": 0.49516666666666664, + "grad_norm": 5.15625, + "grad_norm_var": 0.27274983723958335, + "learning_rate": 4e-05, + "loss": 5.1695, + "loss/crossentropy": 2.340158134698868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244977205991745, + "step": 5942 + }, + { + "epoch": 0.49533333333333335, + "grad_norm": 5.28125, + "grad_norm_var": 0.27708333333333335, + "learning_rate": 4e-05, + "loss": 5.0179, + "loss/crossentropy": 1.9942467659711838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774813625961542, + "step": 5944 + }, + { + "epoch": 0.4955, + "grad_norm": 4.96875, + "grad_norm_var": 0.28964436848958336, + "learning_rate": 4e-05, + "loss": 5.1942, + "loss/crossentropy": 1.7467531263828278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19287551939487457, + "step": 5946 + }, + { + "epoch": 0.49566666666666664, + "grad_norm": 4.75, + "grad_norm_var": 0.30271809895833335, + "learning_rate": 4e-05, + "loss": 4.9724, + "loss/crossentropy": 2.288071572780609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23296159133315086, + "step": 5948 + }, + { + "epoch": 0.49583333333333335, + "grad_norm": 4.625, + "grad_norm_var": 0.2931925455729167, + "learning_rate": 4e-05, + "loss": 5.0832, + "loss/crossentropy": 1.5061530321836472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16233282163739204, + "step": 5950 + }, + { + "epoch": 0.496, + "grad_norm": 4.5, + "grad_norm_var": 0.30162353515625, + "learning_rate": 4e-05, + "loss": 4.4171, + "loss/crossentropy": 1.308549128472805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13862483203411102, + "step": 5952 + }, + { + "epoch": 0.49616666666666664, + "grad_norm": 4.59375, + "grad_norm_var": 0.294384765625, + "learning_rate": 4e-05, + "loss": 4.2719, + "loss/crossentropy": 1.2562294602394104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.136533934623003, + "step": 5954 + }, + { + "epoch": 0.49633333333333335, + "grad_norm": 4.75, + "grad_norm_var": 0.07823893229166666, + "learning_rate": 4e-05, + "loss": 4.1481, + "loss/crossentropy": 1.8677359819412231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17084606923162937, + "step": 5956 + }, + { + "epoch": 0.4965, + "grad_norm": 4.78125, + "grad_norm_var": 0.042952473958333334, + "learning_rate": 4e-05, + "loss": 5.1479, + "loss/crossentropy": 2.244591236114502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130465917289257, + "step": 5958 + }, + { + "epoch": 0.49666666666666665, + "grad_norm": 4.75, + "grad_norm_var": 0.022379557291666668, + "learning_rate": 4e-05, + "loss": 5.0567, + "loss/crossentropy": 2.3259174823760986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980872005224228, + "step": 5960 + }, + { + "epoch": 0.49683333333333335, + "grad_norm": 4.78125, + "grad_norm_var": 0.030729166666666665, + "learning_rate": 4e-05, + "loss": 4.8392, + "loss/crossentropy": 2.577570676803589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21869254857301712, + "step": 5962 + }, + { + "epoch": 0.497, + "grad_norm": 5.0, + "grad_norm_var": 0.043473307291666666, + "learning_rate": 4e-05, + "loss": 5.403, + "loss/crossentropy": 2.4040364921092987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173318862915039, + "step": 5964 + }, + { + "epoch": 0.49716666666666665, + "grad_norm": 5.09375, + "grad_norm_var": 0.047526041666666664, + "learning_rate": 4e-05, + "loss": 4.6977, + "loss/crossentropy": 1.6452934443950653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20848311856389046, + "step": 5966 + }, + { + "epoch": 0.49733333333333335, + "grad_norm": 4.6875, + "grad_norm_var": 0.05041910807291667, + "learning_rate": 4e-05, + "loss": 4.1572, + "loss/crossentropy": 2.2454931437969208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20870861783623695, + "step": 5968 + }, + { + "epoch": 0.4975, + "grad_norm": 5.0, + "grad_norm_var": 0.04763997395833333, + "learning_rate": 4e-05, + "loss": 4.9585, + "loss/crossentropy": 1.4836616143584251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17538996413350105, + "step": 5970 + }, + { + "epoch": 0.49766666666666665, + "grad_norm": 4.53125, + "grad_norm_var": 0.051416015625, + "learning_rate": 4e-05, + "loss": 4.847, + "loss/crossentropy": 1.3948156237602234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14281159173697233, + "step": 5972 + }, + { + "epoch": 0.49783333333333335, + "grad_norm": 4.78125, + "grad_norm_var": 0.05172119140625, + "learning_rate": 4e-05, + "loss": 5.1803, + "loss/crossentropy": 2.3170337080955505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23617496713995934, + "step": 5974 + }, + { + "epoch": 0.498, + "grad_norm": 4.46875, + "grad_norm_var": 0.060546875, + "learning_rate": 4e-05, + "loss": 4.885, + "loss/crossentropy": 1.7463590651750565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15852132812142372, + "step": 5976 + }, + { + "epoch": 0.49816666666666665, + "grad_norm": 4.625, + "grad_norm_var": 0.056233723958333336, + "learning_rate": 4e-05, + "loss": 4.8101, + "loss/crossentropy": 2.4009880125522614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22088930383324623, + "step": 5978 + }, + { + "epoch": 0.49833333333333335, + "grad_norm": 4.90625, + "grad_norm_var": 0.048567708333333334, + "learning_rate": 4e-05, + "loss": 5.1921, + "loss/crossentropy": 1.3776784762740135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.142588060349226, + "step": 5980 + }, + { + "epoch": 0.4985, + "grad_norm": 4.71875, + "grad_norm_var": 0.04544270833333333, + "learning_rate": 4e-05, + "loss": 4.3889, + "loss/crossentropy": 1.7223493158817291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928764395415783, + "step": 5982 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 4.8125, + "grad_norm_var": 0.03860677083333333, + "learning_rate": 4e-05, + "loss": 5.2635, + "loss/crossentropy": 1.702957108616829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879901885986328, + "step": 5984 + }, + { + "epoch": 0.49883333333333335, + "grad_norm": 4.4375, + "grad_norm_var": 0.03290608723958333, + "learning_rate": 4e-05, + "loss": 3.8626, + "loss/crossentropy": 0.9118227884173393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11705210246145725, + "step": 5986 + }, + { + "epoch": 0.499, + "grad_norm": 5.0625, + "grad_norm_var": 0.03631184895833333, + "learning_rate": 4e-05, + "loss": 4.7101, + "loss/crossentropy": 0.9368212670087814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14234120398759842, + "step": 5988 + }, + { + "epoch": 0.49916666666666665, + "grad_norm": 4.84375, + "grad_norm_var": 0.04215087890625, + "learning_rate": 4e-05, + "loss": 4.8971, + "loss/crossentropy": 2.262044668197632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21489252150058746, + "step": 5990 + }, + { + "epoch": 0.49933333333333335, + "grad_norm": 4.96875, + "grad_norm_var": 0.03534749348958333, + "learning_rate": 4e-05, + "loss": 5.4815, + "loss/crossentropy": 2.360913395881653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074732705950737, + "step": 5992 + }, + { + "epoch": 0.4995, + "grad_norm": 4.78125, + "grad_norm_var": 0.035445149739583334, + "learning_rate": 4e-05, + "loss": 5.2325, + "loss/crossentropy": 2.536450207233429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22463076934218407, + "step": 5994 + }, + { + "epoch": 0.49966666666666665, + "grad_norm": 4.90625, + "grad_norm_var": 0.048046875, + "learning_rate": 4e-05, + "loss": 4.2879, + "loss/crossentropy": 0.678945854306221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.0957408007234335, + "step": 5996 + }, + { + "epoch": 0.49983333333333335, + "grad_norm": 5.09375, + "grad_norm_var": 0.04803059895833333, + "learning_rate": 4e-05, + "loss": 4.6463, + "loss/crossentropy": 1.7992531657218933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17922177724540234, + "step": 5998 + }, + { + "epoch": 0.5, + "grad_norm": 4.96875, + "grad_norm_var": 0.049072265625, + "learning_rate": 4e-05, + "loss": 5.3301, + "loss/crossentropy": 1.8574589490890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18118897452950478, + "step": 6000 + }, + { + "epoch": 0.5001666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.029488118489583333, + "learning_rate": 4e-05, + "loss": 5.2455, + "loss/crossentropy": 1.382308728992939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16513625532388687, + "step": 6002 + }, + { + "epoch": 0.5003333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.03919270833333333, + "learning_rate": 4e-05, + "loss": 4.9577, + "loss/crossentropy": 1.8322591856122017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16798565536737442, + "step": 6004 + }, + { + "epoch": 0.5005, + "grad_norm": 5.625, + "grad_norm_var": 0.07190348307291666, + "learning_rate": 4e-05, + "loss": 5.3397, + "loss/crossentropy": 2.5801175832748413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178741730749607, + "step": 6006 + }, + { + "epoch": 0.5006666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.074462890625, + "learning_rate": 4e-05, + "loss": 4.9032, + "loss/crossentropy": 2.0123944729566574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762095633894205, + "step": 6008 + }, + { + "epoch": 0.5008333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.07369384765625, + "learning_rate": 4e-05, + "loss": 4.325, + "loss/crossentropy": 1.4120425209403038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16691729426383972, + "step": 6010 + }, + { + "epoch": 0.501, + "grad_norm": 5.03125, + "grad_norm_var": 0.07083333333333333, + "learning_rate": 4e-05, + "loss": 5.4007, + "loss/crossentropy": 2.539487302303314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22366130352020264, + "step": 6012 + }, + { + "epoch": 0.5011666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.07381184895833333, + "learning_rate": 4e-05, + "loss": 5.3621, + "loss/crossentropy": 2.442401111125946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21156932041049004, + "step": 6014 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.07408447265625, + "learning_rate": 4e-05, + "loss": 4.9838, + "loss/crossentropy": 1.941146932542324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16361217759549618, + "step": 6016 + }, + { + "epoch": 0.5015, + "grad_norm": 5.53125, + "grad_norm_var": 0.09104410807291667, + "learning_rate": 4e-05, + "loss": 4.8045, + "loss/crossentropy": 1.6995574682950974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17772125452756882, + "step": 6018 + }, + { + "epoch": 0.5016666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.07024332682291666, + "learning_rate": 4e-05, + "loss": 5.028, + "loss/crossentropy": 1.2951749190688133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13708342798054218, + "step": 6020 + }, + { + "epoch": 0.5018333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.05862223307291667, + "learning_rate": 4e-05, + "loss": 5.2686, + "loss/crossentropy": 1.886262372136116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18839742615818977, + "step": 6022 + }, + { + "epoch": 0.502, + "grad_norm": 5.3125, + "grad_norm_var": 0.06679280598958333, + "learning_rate": 4e-05, + "loss": 4.8845, + "loss/crossentropy": 2.3439903557300568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835384353995323, + "step": 6024 + }, + { + "epoch": 0.5021666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.09217122395833334, + "learning_rate": 4e-05, + "loss": 4.1525, + "loss/crossentropy": 1.9972389563918114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18215650878846645, + "step": 6026 + }, + { + "epoch": 0.5023333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.09192708333333334, + "learning_rate": 4e-05, + "loss": 5.0969, + "loss/crossentropy": 1.8251912593841553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17888565175235271, + "step": 6028 + }, + { + "epoch": 0.5025, + "grad_norm": 4.75, + "grad_norm_var": 0.08951416015625, + "learning_rate": 4e-05, + "loss": 4.6262, + "loss/crossentropy": 1.6110865250229836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845040712505579, + "step": 6030 + }, + { + "epoch": 0.5026666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.115869140625, + "learning_rate": 4e-05, + "loss": 4.6456, + "loss/crossentropy": 2.026170499622822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19897928833961487, + "step": 6032 + }, + { + "epoch": 0.5028333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.09097900390625, + "learning_rate": 4e-05, + "loss": 5.3882, + "loss/crossentropy": 2.2215274199843407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18462110683321953, + "step": 6034 + }, + { + "epoch": 0.503, + "grad_norm": 4.875, + "grad_norm_var": 0.10445556640625, + "learning_rate": 4e-05, + "loss": 4.7315, + "loss/crossentropy": 1.7266902402043343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15911870449781418, + "step": 6036 + }, + { + "epoch": 0.5031666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.09023030598958333, + "learning_rate": 4e-05, + "loss": 4.7497, + "loss/crossentropy": 1.2634576633572578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14578200317919254, + "step": 6038 + }, + { + "epoch": 0.5033333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.07203369140625, + "learning_rate": 4e-05, + "loss": 4.5089, + "loss/crossentropy": 1.9131877347826958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1862852443009615, + "step": 6040 + }, + { + "epoch": 0.5035, + "grad_norm": 5.25, + "grad_norm_var": 0.09212239583333333, + "learning_rate": 4e-05, + "loss": 4.9267, + "loss/crossentropy": 1.783204659819603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17316385731101036, + "step": 6042 + }, + { + "epoch": 0.5036666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.090087890625, + "learning_rate": 4e-05, + "loss": 4.7129, + "loss/crossentropy": 1.7621163725852966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1662435531616211, + "step": 6044 + }, + { + "epoch": 0.5038333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.10611979166666667, + "learning_rate": 4e-05, + "loss": 4.1912, + "loss/crossentropy": 1.622048631310463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18177297711372375, + "step": 6046 + }, + { + "epoch": 0.504, + "grad_norm": 5.15625, + "grad_norm_var": 0.11750895182291667, + "learning_rate": 4e-05, + "loss": 5.0865, + "loss/crossentropy": 1.6797449514269829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16225052624940872, + "step": 6048 + }, + { + "epoch": 0.5041666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.11949462890625, + "learning_rate": 4e-05, + "loss": 4.9798, + "loss/crossentropy": 1.8889866098761559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17602252773940563, + "step": 6050 + }, + { + "epoch": 0.5043333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.1078125, + "learning_rate": 4e-05, + "loss": 4.5108, + "loss/crossentropy": 2.5241169333457947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20590389892458916, + "step": 6052 + }, + { + "epoch": 0.5045, + "grad_norm": 4.6875, + "grad_norm_var": 0.09468994140625, + "learning_rate": 4e-05, + "loss": 4.8736, + "loss/crossentropy": 2.277352422475815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986519955098629, + "step": 6054 + }, + { + "epoch": 0.5046666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.08453369140625, + "learning_rate": 4e-05, + "loss": 4.6058, + "loss/crossentropy": 2.3752284348011017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21625737473368645, + "step": 6056 + }, + { + "epoch": 0.5048333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.08570556640625, + "learning_rate": 4e-05, + "loss": 5.2332, + "loss/crossentropy": 2.3863165974617004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20476922765374184, + "step": 6058 + }, + { + "epoch": 0.505, + "grad_norm": 4.875, + "grad_norm_var": 0.087353515625, + "learning_rate": 4e-05, + "loss": 4.7747, + "loss/crossentropy": 2.209844708442688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20129455253481865, + "step": 6060 + }, + { + "epoch": 0.5051666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.0900390625, + "learning_rate": 4e-05, + "loss": 5.2351, + "loss/crossentropy": 1.8845185935497284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019658386707306, + "step": 6062 + }, + { + "epoch": 0.5053333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.06399739583333333, + "learning_rate": 4e-05, + "loss": 4.499, + "loss/crossentropy": 1.9752169027924538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18215681612491608, + "step": 6064 + }, + { + "epoch": 0.5055, + "grad_norm": 5.09375, + "grad_norm_var": 0.06015218098958333, + "learning_rate": 4e-05, + "loss": 4.6792, + "loss/crossentropy": 1.0717046111822128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1789306327700615, + "step": 6066 + }, + { + "epoch": 0.5056666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.04303385416666667, + "learning_rate": 4e-05, + "loss": 5.2286, + "loss/crossentropy": 1.7968225330114365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18911275640130043, + "step": 6068 + }, + { + "epoch": 0.5058333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.05865885416666667, + "learning_rate": 4e-05, + "loss": 4.6731, + "loss/crossentropy": 2.1856305301189423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559626653790474, + "step": 6070 + }, + { + "epoch": 0.506, + "grad_norm": 4.6875, + "grad_norm_var": 0.060400390625, + "learning_rate": 4e-05, + "loss": 4.9951, + "loss/crossentropy": 1.5920969396829605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17270361259579659, + "step": 6072 + }, + { + "epoch": 0.5061666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.060400390625, + "learning_rate": 4e-05, + "loss": 4.8435, + "loss/crossentropy": 1.9251913204789162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16875243186950684, + "step": 6074 + }, + { + "epoch": 0.5063333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.06226806640625, + "learning_rate": 4e-05, + "loss": 5.0282, + "loss/crossentropy": 1.7353182584047318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15810954943299294, + "step": 6076 + }, + { + "epoch": 0.5065, + "grad_norm": 4.84375, + "grad_norm_var": 0.058426920572916666, + "learning_rate": 4e-05, + "loss": 5.5596, + "loss/crossentropy": 1.6096114814281464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17892133630812168, + "step": 6078 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.054036458333333336, + "learning_rate": 4e-05, + "loss": 5.0621, + "loss/crossentropy": 1.56081011146307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16593561880290508, + "step": 6080 + }, + { + "epoch": 0.5068333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.06131184895833333, + "learning_rate": 4e-05, + "loss": 4.4695, + "loss/crossentropy": 2.1837804913520813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104526273906231, + "step": 6082 + }, + { + "epoch": 0.507, + "grad_norm": 4.6875, + "grad_norm_var": 0.056233723958333336, + "learning_rate": 4e-05, + "loss": 5.0153, + "loss/crossentropy": 2.1096479892730713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877879686653614, + "step": 6084 + }, + { + "epoch": 0.5071666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.0396484375, + "learning_rate": 4e-05, + "loss": 4.3041, + "loss/crossentropy": 1.398997388780117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14874712191522121, + "step": 6086 + }, + { + "epoch": 0.5073333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.05792643229166667, + "learning_rate": 4e-05, + "loss": 4.0589, + "loss/crossentropy": 2.1173028349876404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18377171456813812, + "step": 6088 + }, + { + "epoch": 0.5075, + "grad_norm": 5.3125, + "grad_norm_var": 0.07200113932291667, + "learning_rate": 4e-05, + "loss": 5.1461, + "loss/crossentropy": 2.301940530538559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225518599152565, + "step": 6090 + }, + { + "epoch": 0.5076666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.07841389973958333, + "learning_rate": 4e-05, + "loss": 4.7475, + "loss/crossentropy": 1.870749220252037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839730478823185, + "step": 6092 + }, + { + "epoch": 0.5078333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.08440348307291666, + "learning_rate": 4e-05, + "loss": 4.3053, + "loss/crossentropy": 1.5619140639901161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14815925806760788, + "step": 6094 + }, + { + "epoch": 0.508, + "grad_norm": 4.59375, + "grad_norm_var": 0.088916015625, + "learning_rate": 4e-05, + "loss": 5.0066, + "loss/crossentropy": 2.1684592068195343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090744599699974, + "step": 6096 + }, + { + "epoch": 0.5081666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.0810546875, + "learning_rate": 4e-05, + "loss": 5.1679, + "loss/crossentropy": 2.247919350862503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19475699588656425, + "step": 6098 + }, + { + "epoch": 0.5083333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.08186442057291667, + "learning_rate": 4e-05, + "loss": 4.7223, + "loss/crossentropy": 1.5078487992286682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15344743058085442, + "step": 6100 + }, + { + "epoch": 0.5085, + "grad_norm": 4.84375, + "grad_norm_var": 0.09134114583333333, + "learning_rate": 4e-05, + "loss": 4.7778, + "loss/crossentropy": 2.0286532193422318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19421415030956268, + "step": 6102 + }, + { + "epoch": 0.5086666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.069775390625, + "learning_rate": 4e-05, + "loss": 4.9994, + "loss/crossentropy": 1.5452463030815125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15388812869787216, + "step": 6104 + }, + { + "epoch": 0.5088333333333334, + "grad_norm": 4.3125, + "grad_norm_var": 0.06877848307291666, + "learning_rate": 4e-05, + "loss": 4.6816, + "loss/crossentropy": 1.1183883771300316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1324480827897787, + "step": 6106 + }, + { + "epoch": 0.509, + "grad_norm": 4.5, + "grad_norm_var": 0.08717447916666667, + "learning_rate": 4e-05, + "loss": 4.3537, + "loss/crossentropy": 1.9015508890151978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20949223265051842, + "step": 6108 + }, + { + "epoch": 0.5091666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.08577067057291667, + "learning_rate": 4e-05, + "loss": 4.794, + "loss/crossentropy": 1.9852671101689339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18729106336832047, + "step": 6110 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.0826171875, + "learning_rate": 4e-05, + "loss": 5.6819, + "loss/crossentropy": 1.9998779222369194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17011355608701706, + "step": 6112 + }, + { + "epoch": 0.5095, + "grad_norm": 5.125, + "grad_norm_var": 0.08938802083333333, + "learning_rate": 4e-05, + "loss": 4.6003, + "loss/crossentropy": 2.7957329154014587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22699139639735222, + "step": 6114 + }, + { + "epoch": 0.5096666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.10341389973958333, + "learning_rate": 4e-05, + "loss": 4.765, + "loss/crossentropy": 2.4798883199691772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112659588456154, + "step": 6116 + }, + { + "epoch": 0.5098333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.09123942057291666, + "learning_rate": 4e-05, + "loss": 4.8308, + "loss/crossentropy": 1.5427830889821053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14681899175047874, + "step": 6118 + }, + { + "epoch": 0.51, + "grad_norm": 4.875, + "grad_norm_var": 0.091259765625, + "learning_rate": 4e-05, + "loss": 5.116, + "loss/crossentropy": 1.948954276740551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17982318252325058, + "step": 6120 + }, + { + "epoch": 0.5101666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.095556640625, + "learning_rate": 4e-05, + "loss": 5.2846, + "loss/crossentropy": 2.349464535713196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22288982570171356, + "step": 6122 + }, + { + "epoch": 0.5103333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.10142822265625, + "learning_rate": 4e-05, + "loss": 4.8594, + "loss/crossentropy": 1.2270648926496506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12373272702097893, + "step": 6124 + }, + { + "epoch": 0.5105, + "grad_norm": 5.0, + "grad_norm_var": 0.09576416015625, + "learning_rate": 4e-05, + "loss": 5.194, + "loss/crossentropy": 2.047919809818268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000446319580078, + "step": 6126 + }, + { + "epoch": 0.5106666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.094140625, + "learning_rate": 4e-05, + "loss": 5.3205, + "loss/crossentropy": 1.957608625292778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17870837077498436, + "step": 6128 + }, + { + "epoch": 0.5108333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.0912109375, + "learning_rate": 4e-05, + "loss": 5.3583, + "loss/crossentropy": 2.141770154237747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25270281732082367, + "step": 6130 + }, + { + "epoch": 0.511, + "grad_norm": 4.8125, + "grad_norm_var": 0.10217692057291666, + "learning_rate": 4e-05, + "loss": 4.4733, + "loss/crossentropy": 1.920982912182808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17835867032408714, + "step": 6132 + }, + { + "epoch": 0.5111666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.10384114583333333, + "learning_rate": 4e-05, + "loss": 4.8358, + "loss/crossentropy": 1.4349671453237534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14867802895605564, + "step": 6134 + }, + { + "epoch": 0.5113333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.10084635416666667, + "learning_rate": 4e-05, + "loss": 5.0694, + "loss/crossentropy": 2.0233269333839417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19398606568574905, + "step": 6136 + }, + { + "epoch": 0.5115, + "grad_norm": 4.46875, + "grad_norm_var": 0.07564697265625, + "learning_rate": 4e-05, + "loss": 4.6461, + "loss/crossentropy": 1.884231187403202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1786806397140026, + "step": 6138 + }, + { + "epoch": 0.5116666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.08941650390625, + "learning_rate": 4e-05, + "loss": 5.2206, + "loss/crossentropy": 1.6101298779249191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20260108821094036, + "step": 6140 + }, + { + "epoch": 0.5118333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.10045166015625, + "learning_rate": 4e-05, + "loss": 4.8658, + "loss/crossentropy": 2.56222003698349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21393615752458572, + "step": 6142 + }, + { + "epoch": 0.512, + "grad_norm": 5.1875, + "grad_norm_var": 0.11510009765625, + "learning_rate": 4e-05, + "loss": 5.1734, + "loss/crossentropy": 2.5355464816093445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23778853192925453, + "step": 6144 + }, + { + "epoch": 0.5121666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.12394205729166667, + "learning_rate": 4e-05, + "loss": 4.9488, + "loss/crossentropy": 1.8643578216433525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17698352597653866, + "step": 6146 + }, + { + "epoch": 0.5123333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.10491129557291666, + "learning_rate": 4e-05, + "loss": 5.2033, + "loss/crossentropy": 1.9535821378231049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19483724609017372, + "step": 6148 + }, + { + "epoch": 0.5125, + "grad_norm": 4.78125, + "grad_norm_var": 0.10349934895833333, + "learning_rate": 4e-05, + "loss": 4.9737, + "loss/crossentropy": 1.9888366162776947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200862318277359, + "step": 6150 + }, + { + "epoch": 0.5126666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.11552327473958333, + "learning_rate": 4e-05, + "loss": 4.453, + "loss/crossentropy": 2.2788354754447937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21191953867673874, + "step": 6152 + }, + { + "epoch": 0.5128333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.09972330729166666, + "learning_rate": 4e-05, + "loss": 5.0921, + "loss/crossentropy": 1.3500697389245033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16570369713008404, + "step": 6154 + }, + { + "epoch": 0.513, + "grad_norm": 5.625, + "grad_norm_var": 0.10514322916666667, + "learning_rate": 4e-05, + "loss": 4.9128, + "loss/crossentropy": 1.7618460059165955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831801161170006, + "step": 6156 + }, + { + "epoch": 0.5131666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.10064697265625, + "learning_rate": 4e-05, + "loss": 4.8258, + "loss/crossentropy": 1.8500609695911407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215445153415203, + "step": 6158 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.09073893229166667, + "learning_rate": 4e-05, + "loss": 4.9059, + "loss/crossentropy": 2.0788095593452454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21428800374269485, + "step": 6160 + }, + { + "epoch": 0.5135, + "grad_norm": 4.75, + "grad_norm_var": 0.08290608723958333, + "learning_rate": 4e-05, + "loss": 5.0222, + "loss/crossentropy": 2.1214606761932373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101825326681137, + "step": 6162 + }, + { + "epoch": 0.5136666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08365478515625, + "learning_rate": 4e-05, + "loss": 3.6792, + "loss/crossentropy": 1.6810518354177475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17640621587634087, + "step": 6164 + }, + { + "epoch": 0.5138333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.08957926432291667, + "learning_rate": 4e-05, + "loss": 4.7143, + "loss/crossentropy": 1.536653459072113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18369175493717194, + "step": 6166 + }, + { + "epoch": 0.514, + "grad_norm": 4.875, + "grad_norm_var": 0.07291666666666667, + "learning_rate": 4e-05, + "loss": 4.8028, + "loss/crossentropy": 2.130642056465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23484623059630394, + "step": 6168 + }, + { + "epoch": 0.5141666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.075244140625, + "learning_rate": 4e-05, + "loss": 5.0955, + "loss/crossentropy": 2.1398507356643677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19409223273396492, + "step": 6170 + }, + { + "epoch": 0.5143333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.07330322265625, + "learning_rate": 4e-05, + "loss": 5.0557, + "loss/crossentropy": 1.8753467500209808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19965557008981705, + "step": 6172 + }, + { + "epoch": 0.5145, + "grad_norm": 4.96875, + "grad_norm_var": 0.06314697265625, + "learning_rate": 4e-05, + "loss": 4.6537, + "loss/crossentropy": 2.6288909912109375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21764767169952393, + "step": 6174 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.069775390625, + "learning_rate": 4e-05, + "loss": 5.392, + "loss/crossentropy": 1.5557663962244987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104162685573101, + "step": 6176 + }, + { + "epoch": 0.5148333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.07288004557291666, + "learning_rate": 4e-05, + "loss": 4.5591, + "loss/crossentropy": 1.727570228278637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1829282995313406, + "step": 6178 + }, + { + "epoch": 0.515, + "grad_norm": 4.78125, + "grad_norm_var": 0.0646484375, + "learning_rate": 4e-05, + "loss": 4.7078, + "loss/crossentropy": 1.828254558146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17748953867703676, + "step": 6180 + }, + { + "epoch": 0.5151666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.059619140625, + "learning_rate": 4e-05, + "loss": 4.459, + "loss/crossentropy": 1.7436736971139908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18016472458839417, + "step": 6182 + }, + { + "epoch": 0.5153333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.06357014973958333, + "learning_rate": 4e-05, + "loss": 4.848, + "loss/crossentropy": 1.2710848674178123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1631698366254568, + "step": 6184 + }, + { + "epoch": 0.5155, + "grad_norm": 4.5625, + "grad_norm_var": 0.06633707682291666, + "learning_rate": 4e-05, + "loss": 4.2305, + "loss/crossentropy": 1.4865316599607468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17193517833948135, + "step": 6186 + }, + { + "epoch": 0.5156666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.03606363932291667, + "learning_rate": 4e-05, + "loss": 4.6158, + "loss/crossentropy": 1.702145777642727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.168340552598238, + "step": 6188 + }, + { + "epoch": 0.5158333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.04273681640625, + "learning_rate": 4e-05, + "loss": 4.5417, + "loss/crossentropy": 1.9604062549769878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17854683846235275, + "step": 6190 + }, + { + "epoch": 0.516, + "grad_norm": 5.15625, + "grad_norm_var": 0.0416015625, + "learning_rate": 4e-05, + "loss": 4.8238, + "loss/crossentropy": 2.2761952579021454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19496627151966095, + "step": 6192 + }, + { + "epoch": 0.5161666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.04178059895833333, + "learning_rate": 4e-05, + "loss": 4.8672, + "loss/crossentropy": 1.6333682388067245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18837878666818142, + "step": 6194 + }, + { + "epoch": 0.5163333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.049544270833333334, + "learning_rate": 4e-05, + "loss": 5.2122, + "loss/crossentropy": 2.6109012365341187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22294483333826065, + "step": 6196 + }, + { + "epoch": 0.5165, + "grad_norm": 4.8125, + "grad_norm_var": 0.058882649739583334, + "learning_rate": 4e-05, + "loss": 4.9916, + "loss/crossentropy": 2.4936388731002808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22632497176527977, + "step": 6198 + }, + { + "epoch": 0.5166666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 4e-05, + "loss": 4.752, + "loss/crossentropy": 2.026665262877941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111969329416752, + "step": 6200 + }, + { + "epoch": 0.5168333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.051953125, + "learning_rate": 4e-05, + "loss": 4.8235, + "loss/crossentropy": 2.217519849538803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086241953074932, + "step": 6202 + }, + { + "epoch": 0.517, + "grad_norm": 4.34375, + "grad_norm_var": 0.073046875, + "learning_rate": 4e-05, + "loss": 4.9519, + "loss/crossentropy": 2.1753300726413727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227613627910614, + "step": 6204 + }, + { + "epoch": 0.5171666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.06366780598958334, + "learning_rate": 4e-05, + "loss": 4.4093, + "loss/crossentropy": 1.853736698627472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19453158415853977, + "step": 6206 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.0796875, + "learning_rate": 4e-05, + "loss": 4.5713, + "loss/crossentropy": 1.8317307531833649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17688237130641937, + "step": 6208 + }, + { + "epoch": 0.5175, + "grad_norm": 4.84375, + "grad_norm_var": 0.07265625, + "learning_rate": 4e-05, + "loss": 4.6111, + "loss/crossentropy": 1.2401080802083015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18187956511974335, + "step": 6210 + }, + { + "epoch": 0.5176666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.10650634765625, + "learning_rate": 4e-05, + "loss": 4.5749, + "loss/crossentropy": 2.200599730014801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23270919546484947, + "step": 6212 + }, + { + "epoch": 0.5178333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.09986572265625, + "learning_rate": 4e-05, + "loss": 4.7749, + "loss/crossentropy": 1.5861978828907013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17271048948168755, + "step": 6214 + }, + { + "epoch": 0.518, + "grad_norm": 4.46875, + "grad_norm_var": 0.11744384765625, + "learning_rate": 4e-05, + "loss": 4.3848, + "loss/crossentropy": 1.9050172120332718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19479480385780334, + "step": 6216 + }, + { + "epoch": 0.5181666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.110009765625, + "learning_rate": 4e-05, + "loss": 4.8868, + "loss/crossentropy": 2.2683032155036926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20415033772587776, + "step": 6218 + }, + { + "epoch": 0.5183333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.09374593098958334, + "learning_rate": 4e-05, + "loss": 4.7636, + "loss/crossentropy": 1.3371253162622452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14753723703324795, + "step": 6220 + }, + { + "epoch": 0.5185, + "grad_norm": 4.5625, + "grad_norm_var": 0.08896077473958333, + "learning_rate": 4e-05, + "loss": 4.9785, + "loss/crossentropy": 1.7137665003538132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17877374030649662, + "step": 6222 + }, + { + "epoch": 0.5186666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.05022379557291667, + "learning_rate": 4e-05, + "loss": 4.9261, + "loss/crossentropy": 2.3642892837524414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21913127228617668, + "step": 6224 + }, + { + "epoch": 0.5188333333333334, + "grad_norm": 5.71875, + "grad_norm_var": 0.12693684895833332, + "learning_rate": 4e-05, + "loss": 5.1342, + "loss/crossentropy": 2.3839961886405945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21350585669279099, + "step": 6226 + }, + { + "epoch": 0.519, + "grad_norm": 4.84375, + "grad_norm_var": 0.11490885416666667, + "learning_rate": 4e-05, + "loss": 4.795, + "loss/crossentropy": 1.5752469822764397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1464600432664156, + "step": 6228 + }, + { + "epoch": 0.5191666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.14156494140625, + "learning_rate": 4e-05, + "loss": 5.2829, + "loss/crossentropy": 2.2646824717521667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26518501713871956, + "step": 6230 + }, + { + "epoch": 0.5193333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.12550455729166668, + "learning_rate": 4e-05, + "loss": 5.1681, + "loss/crossentropy": 2.501288950443268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19936766847968102, + "step": 6232 + }, + { + "epoch": 0.5195, + "grad_norm": 5.0625, + "grad_norm_var": 0.14386393229166666, + "learning_rate": 4e-05, + "loss": 5.4947, + "loss/crossentropy": 2.6087101250886917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20095998793840408, + "step": 6234 + }, + { + "epoch": 0.5196666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.12096354166666666, + "learning_rate": 4e-05, + "loss": 5.2234, + "loss/crossentropy": 2.112656258046627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1662088967859745, + "step": 6236 + }, + { + "epoch": 0.5198333333333334, + "grad_norm": 5.53125, + "grad_norm_var": 0.11321207682291666, + "learning_rate": 4e-05, + "loss": 5.4278, + "loss/crossentropy": 2.4622672498226166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646344870328903, + "step": 6238 + }, + { + "epoch": 0.52, + "grad_norm": 4.875, + "grad_norm_var": 0.09568684895833333, + "learning_rate": 4e-05, + "loss": 5.0155, + "loss/crossentropy": 2.2638790011405945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19504579156637192, + "step": 6240 + }, + { + "epoch": 0.5201666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08492431640625, + "learning_rate": 4e-05, + "loss": 4.8904, + "loss/crossentropy": 2.4643925726413727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21222343668341637, + "step": 6242 + }, + { + "epoch": 0.5203333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.10026041666666667, + "learning_rate": 4e-05, + "loss": 4.8743, + "loss/crossentropy": 2.4396408200263977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22357990220189095, + "step": 6244 + }, + { + "epoch": 0.5205, + "grad_norm": 5.03125, + "grad_norm_var": 0.09052327473958334, + "learning_rate": 4e-05, + "loss": 5.0041, + "loss/crossentropy": 1.6959142982959747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19862833246588707, + "step": 6246 + }, + { + "epoch": 0.5206666666666667, + "grad_norm": 5.65625, + "grad_norm_var": 0.10611979166666667, + "learning_rate": 4e-05, + "loss": 5.5068, + "loss/crossentropy": 2.444189488887787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21889224275946617, + "step": 6248 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.09104410807291667, + "learning_rate": 4e-05, + "loss": 5.0639, + "loss/crossentropy": 1.7642913609743118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18225438706576824, + "step": 6250 + }, + { + "epoch": 0.521, + "grad_norm": 5.03125, + "grad_norm_var": 0.09120686848958333, + "learning_rate": 4e-05, + "loss": 5.5272, + "loss/crossentropy": 1.9865412786602974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17439224757254124, + "step": 6252 + }, + { + "epoch": 0.5211666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.078759765625, + "learning_rate": 4e-05, + "loss": 4.7066, + "loss/crossentropy": 1.6813689842820168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18430567532777786, + "step": 6254 + }, + { + "epoch": 0.5213333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.08268229166666667, + "learning_rate": 4e-05, + "loss": 4.8611, + "loss/crossentropy": 1.4353890866041183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14516296423971653, + "step": 6256 + }, + { + "epoch": 0.5215, + "grad_norm": 5.0, + "grad_norm_var": 0.076025390625, + "learning_rate": 4e-05, + "loss": 4.7658, + "loss/crossentropy": 1.1730327010154724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.135228231549263, + "step": 6258 + }, + { + "epoch": 0.5216666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.058837890625, + "learning_rate": 4e-05, + "loss": 4.8194, + "loss/crossentropy": 1.7282505184412003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.166041424497962, + "step": 6260 + }, + { + "epoch": 0.5218333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.056233723958333336, + "learning_rate": 4e-05, + "loss": 4.8927, + "loss/crossentropy": 2.077870637178421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21072274073958397, + "step": 6262 + }, + { + "epoch": 0.522, + "grad_norm": 4.75, + "grad_norm_var": 0.019596354166666666, + "learning_rate": 4e-05, + "loss": 4.94, + "loss/crossentropy": 1.6653959900140762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1690856534987688, + "step": 6264 + }, + { + "epoch": 0.5221666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.017508951822916667, + "learning_rate": 4e-05, + "loss": 4.796, + "loss/crossentropy": 2.3305214643478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003948614001274, + "step": 6266 + }, + { + "epoch": 0.5223333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.033984375, + "learning_rate": 4e-05, + "loss": 4.9536, + "loss/crossentropy": 1.6702621951699257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18134449422359467, + "step": 6268 + }, + { + "epoch": 0.5225, + "grad_norm": 4.8125, + "grad_norm_var": 0.03811442057291667, + "learning_rate": 4e-05, + "loss": 4.5768, + "loss/crossentropy": 1.830449789762497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21275094151496887, + "step": 6270 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.03720296223958333, + "learning_rate": 4e-05, + "loss": 5.1588, + "loss/crossentropy": 2.4117337092757225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16365902870893478, + "step": 6272 + }, + { + "epoch": 0.5228333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.05169270833333333, + "learning_rate": 4e-05, + "loss": 5.2077, + "loss/crossentropy": 2.003813534975052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18846618384122849, + "step": 6274 + }, + { + "epoch": 0.523, + "grad_norm": 4.71875, + "grad_norm_var": 0.05474853515625, + "learning_rate": 4e-05, + "loss": 4.8736, + "loss/crossentropy": 1.146153837442398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14847451448440552, + "step": 6276 + }, + { + "epoch": 0.5231666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 4e-05, + "loss": 4.9438, + "loss/crossentropy": 2.475542187690735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23931154608726501, + "step": 6278 + }, + { + "epoch": 0.5233333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.07845052083333333, + "learning_rate": 4e-05, + "loss": 4.6778, + "loss/crossentropy": 1.7261252030730247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1629408374428749, + "step": 6280 + }, + { + "epoch": 0.5235, + "grad_norm": 4.8125, + "grad_norm_var": 0.07971598307291666, + "learning_rate": 4e-05, + "loss": 5.3948, + "loss/crossentropy": 2.4525701999664307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2252441942691803, + "step": 6282 + }, + { + "epoch": 0.5236666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.06601155598958333, + "learning_rate": 4e-05, + "loss": 5.304, + "loss/crossentropy": 1.8351141512393951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17090753465890884, + "step": 6284 + }, + { + "epoch": 0.5238333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.06482747395833334, + "learning_rate": 4e-05, + "loss": 5.3146, + "loss/crossentropy": 1.9516530483961105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1840406134724617, + "step": 6286 + }, + { + "epoch": 0.524, + "grad_norm": 4.71875, + "grad_norm_var": 0.07057291666666667, + "learning_rate": 4e-05, + "loss": 5.4645, + "loss/crossentropy": 1.8888097256422043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17698637954890728, + "step": 6288 + }, + { + "epoch": 0.5241666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.058447265625, + "learning_rate": 4e-05, + "loss": 5.0941, + "loss/crossentropy": 1.8953617364168167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1759849712252617, + "step": 6290 + }, + { + "epoch": 0.5243333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.05982666015625, + "learning_rate": 4e-05, + "loss": 4.7776, + "loss/crossentropy": 1.4816829040646553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15172210335731506, + "step": 6292 + }, + { + "epoch": 0.5245, + "grad_norm": 4.5625, + "grad_norm_var": 0.07691650390625, + "learning_rate": 4e-05, + "loss": 4.3964, + "loss/crossentropy": 1.4299919679760933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14875125512480736, + "step": 6294 + }, + { + "epoch": 0.5246666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.053125, + "learning_rate": 4e-05, + "loss": 4.6746, + "loss/crossentropy": 1.0042973533272743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1491411328315735, + "step": 6296 + }, + { + "epoch": 0.5248333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.05129801432291667, + "learning_rate": 4e-05, + "loss": 4.5342, + "loss/crossentropy": 2.0254055559635162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776164323091507, + "step": 6298 + }, + { + "epoch": 0.525, + "grad_norm": 5.1875, + "grad_norm_var": 0.058268229166666664, + "learning_rate": 4e-05, + "loss": 5.0111, + "loss/crossentropy": 2.035038098692894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1758381500840187, + "step": 6300 + }, + { + "epoch": 0.5251666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 4e-05, + "loss": 5.2614, + "loss/crossentropy": 2.0022889897227287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18497100099921227, + "step": 6302 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.044270833333333336, + "learning_rate": 4e-05, + "loss": 4.9755, + "loss/crossentropy": 2.0294620618224144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19521377608180046, + "step": 6304 + }, + { + "epoch": 0.5255, + "grad_norm": 5.40625, + "grad_norm_var": 0.09446614583333333, + "learning_rate": 4e-05, + "loss": 5.3874, + "loss/crossentropy": 1.909932941198349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23021681234240532, + "step": 6306 + }, + { + "epoch": 0.5256666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.09296875, + "learning_rate": 4e-05, + "loss": 4.9068, + "loss/crossentropy": 2.2554367184638977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20018938928842545, + "step": 6308 + }, + { + "epoch": 0.5258333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.06612955729166667, + "learning_rate": 4e-05, + "loss": 4.9175, + "loss/crossentropy": 1.856409564614296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1912154108285904, + "step": 6310 + }, + { + "epoch": 0.526, + "grad_norm": 4.75, + "grad_norm_var": 0.06750895182291666, + "learning_rate": 4e-05, + "loss": 4.9821, + "loss/crossentropy": 1.97897869348526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190910242497921, + "step": 6312 + }, + { + "epoch": 0.5261666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.0728515625, + "learning_rate": 4e-05, + "loss": 5.2764, + "loss/crossentropy": 1.6196852624416351, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1537347286939621, + "step": 6314 + }, + { + "epoch": 0.5263333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.07271728515625, + "learning_rate": 4e-05, + "loss": 4.9833, + "loss/crossentropy": 2.2614503502845764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053113467991352, + "step": 6316 + }, + { + "epoch": 0.5265, + "grad_norm": 4.71875, + "grad_norm_var": 9.188346354166667, + "learning_rate": 4e-05, + "loss": 5.0568, + "loss/crossentropy": 1.965643584728241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18929314240813255, + "step": 6318 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 9.125504557291666, + "learning_rate": 4e-05, + "loss": 5.2748, + "loss/crossentropy": 1.6978831887245178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468260884284973, + "step": 6320 + }, + { + "epoch": 0.5268333333333334, + "grad_norm": 5.0, + "grad_norm_var": 9.159228515625, + "learning_rate": 4e-05, + "loss": 5.1958, + "loss/crossentropy": 1.967081904411316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20972293615341187, + "step": 6322 + }, + { + "epoch": 0.527, + "grad_norm": 4.96875, + "grad_norm_var": 9.119755045572917, + "learning_rate": 4e-05, + "loss": 4.5926, + "loss/crossentropy": 1.6954425349831581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1549944244325161, + "step": 6324 + }, + { + "epoch": 0.5271666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 9.114957682291667, + "learning_rate": 4e-05, + "loss": 5.5283, + "loss/crossentropy": 2.016428828239441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225477620959282, + "step": 6326 + }, + { + "epoch": 0.5273333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 9.015478515625, + "learning_rate": 4e-05, + "loss": 5.3683, + "loss/crossentropy": 2.1403996646404266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20416608452796936, + "step": 6328 + }, + { + "epoch": 0.5275, + "grad_norm": 4.53125, + "grad_norm_var": 8.976236979166666, + "learning_rate": 4e-05, + "loss": 4.8732, + "loss/crossentropy": 1.7906945049762726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17962076887488365, + "step": 6330 + }, + { + "epoch": 0.5276666666666666, + "grad_norm": 4.75, + "grad_norm_var": 9.127864583333333, + "learning_rate": 4e-05, + "loss": 4.3215, + "loss/crossentropy": 1.274952009320259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.134726008400321, + "step": 6332 + }, + { + "epoch": 0.5278333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.23472900390625, + "learning_rate": 4e-05, + "loss": 4.8964, + "loss/crossentropy": 1.7225348949432373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21106019616127014, + "step": 6334 + }, + { + "epoch": 0.528, + "grad_norm": 5.46875, + "grad_norm_var": 0.24270833333333333, + "learning_rate": 4e-05, + "loss": 5.323, + "loss/crossentropy": 2.2533539831638336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20605823397636414, + "step": 6336 + }, + { + "epoch": 0.5281666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.239306640625, + "learning_rate": 4e-05, + "loss": 5.1128, + "loss/crossentropy": 2.207033485174179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21361444517970085, + "step": 6338 + }, + { + "epoch": 0.5283333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.2624308268229167, + "learning_rate": 4e-05, + "loss": 4.8296, + "loss/crossentropy": 2.4584856629371643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20966831594705582, + "step": 6340 + }, + { + "epoch": 0.5285, + "grad_norm": 5.03125, + "grad_norm_var": 0.25774739583333334, + "learning_rate": 4e-05, + "loss": 4.717, + "loss/crossentropy": 1.4203026741743088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15853681229054928, + "step": 6342 + }, + { + "epoch": 0.5286666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.21503499348958333, + "learning_rate": 4e-05, + "loss": 4.8863, + "loss/crossentropy": 1.733842521905899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21481941640377045, + "step": 6344 + }, + { + "epoch": 0.5288333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.104931640625, + "learning_rate": 4e-05, + "loss": 4.6211, + "loss/crossentropy": 2.055474132299423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19885016605257988, + "step": 6346 + }, + { + "epoch": 0.529, + "grad_norm": 4.75, + "grad_norm_var": 0.073681640625, + "learning_rate": 4e-05, + "loss": 4.5627, + "loss/crossentropy": 1.6423608288168907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17252543941140175, + "step": 6348 + }, + { + "epoch": 0.5291666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.072509765625, + "learning_rate": 4e-05, + "loss": 4.8631, + "loss/crossentropy": 2.213515669107437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20754803717136383, + "step": 6350 + }, + { + "epoch": 0.5293333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.0546875, + "learning_rate": 4e-05, + "loss": 4.9258, + "loss/crossentropy": 1.778283067047596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.174639031291008, + "step": 6352 + }, + { + "epoch": 0.5295, + "grad_norm": 5.4375, + "grad_norm_var": 0.07261962890625, + "learning_rate": 4e-05, + "loss": 4.9626, + "loss/crossentropy": 2.278725653886795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21602385863661766, + "step": 6354 + }, + { + "epoch": 0.5296666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.0583984375, + "learning_rate": 4e-05, + "loss": 4.9928, + "loss/crossentropy": 1.8386949226260185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17753521539270878, + "step": 6356 + }, + { + "epoch": 0.5298333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.063525390625, + "learning_rate": 4e-05, + "loss": 4.4814, + "loss/crossentropy": 1.8008562847971916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900603324174881, + "step": 6358 + }, + { + "epoch": 0.53, + "grad_norm": 4.59375, + "grad_norm_var": 0.07496337890625, + "learning_rate": 4e-05, + "loss": 4.5678, + "loss/crossentropy": 1.1889959871768951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15015914104878902, + "step": 6360 + }, + { + "epoch": 0.5301666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.07643229166666667, + "learning_rate": 4e-05, + "loss": 5.053, + "loss/crossentropy": 2.290656328201294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23251566290855408, + "step": 6362 + }, + { + "epoch": 0.5303333333333333, + "grad_norm": 5.84375, + "grad_norm_var": 0.11545817057291667, + "learning_rate": 4e-05, + "loss": 5.3294, + "loss/crossentropy": 2.3371450304985046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21618635579943657, + "step": 6364 + }, + { + "epoch": 0.5305, + "grad_norm": 5.125, + "grad_norm_var": 0.11022135416666666, + "learning_rate": 4e-05, + "loss": 5.2062, + "loss/crossentropy": 2.769070029258728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21105346828699112, + "step": 6366 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.10240885416666666, + "learning_rate": 4e-05, + "loss": 5.3696, + "loss/crossentropy": 2.346166968345642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124275527894497, + "step": 6368 + }, + { + "epoch": 0.5308333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.098046875, + "learning_rate": 4e-05, + "loss": 5.5516, + "loss/crossentropy": 2.1462940871715546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227632701396942, + "step": 6370 + }, + { + "epoch": 0.531, + "grad_norm": 4.625, + "grad_norm_var": 0.10780843098958333, + "learning_rate": 4e-05, + "loss": 4.9736, + "loss/crossentropy": 2.0071266889572144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20992008224129677, + "step": 6372 + }, + { + "epoch": 0.5311666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.108056640625, + "learning_rate": 4e-05, + "loss": 4.4188, + "loss/crossentropy": 1.9117163717746735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18441595882177353, + "step": 6374 + }, + { + "epoch": 0.5313333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.08993733723958333, + "learning_rate": 4e-05, + "loss": 4.6976, + "loss/crossentropy": 1.9666685834527016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19188417494297028, + "step": 6376 + }, + { + "epoch": 0.5315, + "grad_norm": 5.0625, + "grad_norm_var": 0.09898681640625, + "learning_rate": 4e-05, + "loss": 4.5196, + "loss/crossentropy": 1.652912124991417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16498231142759323, + "step": 6378 + }, + { + "epoch": 0.5316666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.05237223307291667, + "learning_rate": 4e-05, + "loss": 4.3164, + "loss/crossentropy": 1.8182547390460968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134299270808697, + "step": 6380 + }, + { + "epoch": 0.5318333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.05318603515625, + "learning_rate": 4e-05, + "loss": 5.0686, + "loss/crossentropy": 2.0795591175556183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21680190041661263, + "step": 6382 + }, + { + "epoch": 0.532, + "grad_norm": 5.03125, + "grad_norm_var": 0.05011393229166667, + "learning_rate": 4e-05, + "loss": 5.5201, + "loss/crossentropy": 1.865036815404892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2006813958287239, + "step": 6384 + }, + { + "epoch": 0.5321666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.07470296223958334, + "learning_rate": 4e-05, + "loss": 5.0107, + "loss/crossentropy": 2.5886573791503906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073320634663105, + "step": 6386 + }, + { + "epoch": 0.5323333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.06744384765625, + "learning_rate": 4e-05, + "loss": 4.2948, + "loss/crossentropy": 1.7393722236156464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19270563125610352, + "step": 6388 + }, + { + "epoch": 0.5325, + "grad_norm": 4.59375, + "grad_norm_var": 0.07693684895833333, + "learning_rate": 4e-05, + "loss": 4.8029, + "loss/crossentropy": 1.8167866170406342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17310548946261406, + "step": 6390 + }, + { + "epoch": 0.5326666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.10432535807291667, + "learning_rate": 4e-05, + "loss": 4.0045, + "loss/crossentropy": 1.742269590497017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17706291005015373, + "step": 6392 + }, + { + "epoch": 0.5328333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.10787353515625, + "learning_rate": 4e-05, + "loss": 5.0616, + "loss/crossentropy": 1.8916596919298172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17616764828562737, + "step": 6394 + }, + { + "epoch": 0.533, + "grad_norm": 4.6875, + "grad_norm_var": 0.10435791015625, + "learning_rate": 4e-05, + "loss": 5.1281, + "loss/crossentropy": 2.4375243186950684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24222580343484879, + "step": 6396 + }, + { + "epoch": 0.5331666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.10123291015625, + "learning_rate": 4e-05, + "loss": 4.469, + "loss/crossentropy": 2.201574385166168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19397074729204178, + "step": 6398 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.09850260416666666, + "learning_rate": 4e-05, + "loss": 4.7124, + "loss/crossentropy": 1.9350962042808533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19658659398555756, + "step": 6400 + }, + { + "epoch": 0.5335, + "grad_norm": 4.78125, + "grad_norm_var": 0.06560872395833334, + "learning_rate": 4e-05, + "loss": 5.2431, + "loss/crossentropy": 1.468439057469368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16111770644783974, + "step": 6402 + }, + { + "epoch": 0.5336666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.067822265625, + "learning_rate": 4e-05, + "loss": 5.0469, + "loss/crossentropy": 2.328448623418808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2267051674425602, + "step": 6404 + }, + { + "epoch": 0.5338333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.06275634765625, + "learning_rate": 4e-05, + "loss": 4.628, + "loss/crossentropy": 2.319155514240265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297566793859005, + "step": 6406 + }, + { + "epoch": 0.534, + "grad_norm": 5.1875, + "grad_norm_var": 0.06370035807291667, + "learning_rate": 4e-05, + "loss": 4.4522, + "loss/crossentropy": 1.6990758031606674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14935988560318947, + "step": 6408 + }, + { + "epoch": 0.5341666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.052083333333333336, + "learning_rate": 4e-05, + "loss": 4.8484, + "loss/crossentropy": 1.4202308654785156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16745430044829845, + "step": 6410 + }, + { + "epoch": 0.5343333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.05245768229166667, + "learning_rate": 4e-05, + "loss": 5.2703, + "loss/crossentropy": 2.6230361461639404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23828474059700966, + "step": 6412 + }, + { + "epoch": 0.5345, + "grad_norm": 4.6875, + "grad_norm_var": 0.05701497395833333, + "learning_rate": 4e-05, + "loss": 4.6883, + "loss/crossentropy": 2.146642565727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937054954469204, + "step": 6414 + }, + { + "epoch": 0.5346666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.06092122395833333, + "learning_rate": 4e-05, + "loss": 4.7823, + "loss/crossentropy": 1.6736676394939423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18550512567162514, + "step": 6416 + }, + { + "epoch": 0.5348333333333334, + "grad_norm": 5.21875, + "grad_norm_var": 0.07342122395833334, + "learning_rate": 4e-05, + "loss": 4.7432, + "loss/crossentropy": 1.7560848370194435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17786714434623718, + "step": 6418 + }, + { + "epoch": 0.535, + "grad_norm": 4.78125, + "grad_norm_var": 0.07454427083333333, + "learning_rate": 4e-05, + "loss": 5.1035, + "loss/crossentropy": 1.8662885650992393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1651218943297863, + "step": 6420 + }, + { + "epoch": 0.5351666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.072265625, + "learning_rate": 4e-05, + "loss": 5.3528, + "loss/crossentropy": 2.4681405425071716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21770339086651802, + "step": 6422 + }, + { + "epoch": 0.5353333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.05243733723958333, + "learning_rate": 4e-05, + "loss": 5.2777, + "loss/crossentropy": 1.8275192975997925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23562873154878616, + "step": 6424 + }, + { + "epoch": 0.5355, + "grad_norm": 4.5, + "grad_norm_var": 0.06552327473958333, + "learning_rate": 4e-05, + "loss": 4.8877, + "loss/crossentropy": 2.511595845222473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2235119715332985, + "step": 6426 + }, + { + "epoch": 0.5356666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.06607666015625, + "learning_rate": 4e-05, + "loss": 4.6033, + "loss/crossentropy": 2.1008317470550537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20949330553412437, + "step": 6428 + }, + { + "epoch": 0.5358333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.06357014973958333, + "learning_rate": 4e-05, + "loss": 4.7668, + "loss/crossentropy": 1.8257400766015053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19425482116639614, + "step": 6430 + }, + { + "epoch": 0.536, + "grad_norm": 5.21875, + "grad_norm_var": 0.0697265625, + "learning_rate": 4e-05, + "loss": 4.8881, + "loss/crossentropy": 2.2771336138248444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748082920908928, + "step": 6432 + }, + { + "epoch": 0.5361666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.06666666666666667, + "learning_rate": 4e-05, + "loss": 5.2098, + "loss/crossentropy": 1.7778798043727875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17552071809768677, + "step": 6434 + }, + { + "epoch": 0.5363333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.06443684895833333, + "learning_rate": 4e-05, + "loss": 4.7605, + "loss/crossentropy": 1.6425282210111618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1681121438741684, + "step": 6436 + }, + { + "epoch": 0.5365, + "grad_norm": 4.75, + "grad_norm_var": 0.06529541015625, + "learning_rate": 4e-05, + "loss": 5.4284, + "loss/crossentropy": 2.383900284767151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20339232310652733, + "step": 6438 + }, + { + "epoch": 0.5366666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.060139973958333336, + "learning_rate": 4e-05, + "loss": 5.2453, + "loss/crossentropy": 2.399388551712036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.196612898260355, + "step": 6440 + }, + { + "epoch": 0.5368333333333334, + "grad_norm": 5.375, + "grad_norm_var": 0.06261393229166666, + "learning_rate": 4e-05, + "loss": 5.3347, + "loss/crossentropy": 1.2793971821665764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15150474943220615, + "step": 6442 + }, + { + "epoch": 0.537, + "grad_norm": 4.65625, + "grad_norm_var": 0.07916259765625, + "learning_rate": 4e-05, + "loss": 4.8687, + "loss/crossentropy": 1.5082692801952362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1459091752767563, + "step": 6444 + }, + { + "epoch": 0.5371666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.07336832682291666, + "learning_rate": 4e-05, + "loss": 4.8662, + "loss/crossentropy": 2.350295513868332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22841082885861397, + "step": 6446 + }, + { + "epoch": 0.5373333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.05792643229166667, + "learning_rate": 4e-05, + "loss": 4.9448, + "loss/crossentropy": 2.585313081741333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21179723367094994, + "step": 6448 + }, + { + "epoch": 0.5375, + "grad_norm": 4.75, + "grad_norm_var": 0.05243733723958333, + "learning_rate": 4e-05, + "loss": 4.3929, + "loss/crossentropy": 2.019158661365509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110278531908989, + "step": 6450 + }, + { + "epoch": 0.5376666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.05935872395833333, + "learning_rate": 4e-05, + "loss": 4.6044, + "loss/crossentropy": 1.8365314081311226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1840047501027584, + "step": 6452 + }, + { + "epoch": 0.5378333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.06471354166666667, + "learning_rate": 4e-05, + "loss": 4.2156, + "loss/crossentropy": 2.1785257756710052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20807776227593422, + "step": 6454 + }, + { + "epoch": 0.538, + "grad_norm": 4.6875, + "grad_norm_var": 0.09576416015625, + "learning_rate": 4e-05, + "loss": 5.2141, + "loss/crossentropy": 2.0053779631853104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18812064826488495, + "step": 6456 + }, + { + "epoch": 0.5381666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.0833984375, + "learning_rate": 4e-05, + "loss": 5.6892, + "loss/crossentropy": 1.9643101394176483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20990155264735222, + "step": 6458 + }, + { + "epoch": 0.5383333333333333, + "grad_norm": 6.03125, + "grad_norm_var": 0.14892171223958334, + "learning_rate": 4e-05, + "loss": 5.1658, + "loss/crossentropy": 1.853164553642273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23421235010027885, + "step": 6460 + }, + { + "epoch": 0.5385, + "grad_norm": 4.625, + "grad_norm_var": 0.16213785807291667, + "learning_rate": 4e-05, + "loss": 5.281, + "loss/crossentropy": 1.398292914032936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15396402776241302, + "step": 6462 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.16419270833333333, + "learning_rate": 4e-05, + "loss": 4.6955, + "loss/crossentropy": 2.4291781187057495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074362002313137, + "step": 6464 + }, + { + "epoch": 0.5388333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.161181640625, + "learning_rate": 4e-05, + "loss": 4.573, + "loss/crossentropy": 1.8937534540891647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.178242489695549, + "step": 6466 + }, + { + "epoch": 0.539, + "grad_norm": 4.875, + "grad_norm_var": 0.15754801432291668, + "learning_rate": 4e-05, + "loss": 5.2892, + "loss/crossentropy": 2.6727017164230347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22374076396226883, + "step": 6468 + }, + { + "epoch": 0.5391666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.14425455729166667, + "learning_rate": 4e-05, + "loss": 4.9342, + "loss/crossentropy": 2.3451661467552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21111416071653366, + "step": 6470 + }, + { + "epoch": 0.5393333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.12112223307291667, + "learning_rate": 4e-05, + "loss": 4.7019, + "loss/crossentropy": 1.8271106332540512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1798660308122635, + "step": 6472 + }, + { + "epoch": 0.5395, + "grad_norm": 4.84375, + "grad_norm_var": 0.11998291015625, + "learning_rate": 4e-05, + "loss": 5.0453, + "loss/crossentropy": 2.053663656115532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18994637206196785, + "step": 6474 + }, + { + "epoch": 0.5396666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.03534749348958333, + "learning_rate": 4e-05, + "loss": 5.4068, + "loss/crossentropy": 2.02348530292511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2410239391028881, + "step": 6476 + }, + { + "epoch": 0.5398333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 4e-05, + "loss": 4.5837, + "loss/crossentropy": 1.0813293680548668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13367479853332043, + "step": 6478 + }, + { + "epoch": 0.54, + "grad_norm": 5.34375, + "grad_norm_var": 0.05956624348958333, + "learning_rate": 4e-05, + "loss": 4.6037, + "loss/crossentropy": 1.643570214509964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726165246218443, + "step": 6480 + }, + { + "epoch": 0.5401666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.063134765625, + "learning_rate": 4e-05, + "loss": 4.9758, + "loss/crossentropy": 2.1287569105625153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20907136425375938, + "step": 6482 + }, + { + "epoch": 0.5403333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.05318603515625, + "learning_rate": 4e-05, + "loss": 4.8937, + "loss/crossentropy": 1.5239028334617615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893562152981758, + "step": 6484 + }, + { + "epoch": 0.5405, + "grad_norm": 5.0, + "grad_norm_var": 0.0537109375, + "learning_rate": 4e-05, + "loss": 5.1236, + "loss/crossentropy": 2.2508333921432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21869652345776558, + "step": 6486 + }, + { + "epoch": 0.5406666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.12961832682291666, + "learning_rate": 4e-05, + "loss": 5.061, + "loss/crossentropy": 2.1860940158367157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25951458886265755, + "step": 6488 + }, + { + "epoch": 0.5408333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.12838541666666667, + "learning_rate": 4e-05, + "loss": 4.693, + "loss/crossentropy": 1.9693054556846619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21282178536057472, + "step": 6490 + }, + { + "epoch": 0.541, + "grad_norm": 5.09375, + "grad_norm_var": 0.12858072916666666, + "learning_rate": 4e-05, + "loss": 4.1882, + "loss/crossentropy": 1.9670451954007149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18181007727980614, + "step": 6492 + }, + { + "epoch": 0.5411666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.12255452473958334, + "learning_rate": 4e-05, + "loss": 4.8558, + "loss/crossentropy": 2.0306060314178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18981993943452835, + "step": 6494 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.10234375, + "learning_rate": 4e-05, + "loss": 5.1075, + "loss/crossentropy": 1.1169188246130943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11659727245569229, + "step": 6496 + }, + { + "epoch": 0.5415, + "grad_norm": 5.0, + "grad_norm_var": 0.10701497395833333, + "learning_rate": 4e-05, + "loss": 5.3556, + "loss/crossentropy": 2.5187776684761047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22361308708786964, + "step": 6498 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.11262613932291667, + "learning_rate": 4e-05, + "loss": 4.6649, + "loss/crossentropy": 2.0278166234493256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21406276151537895, + "step": 6500 + }, + { + "epoch": 0.5418333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.14423421223958333, + "learning_rate": 4e-05, + "loss": 4.5689, + "loss/crossentropy": 0.8223181739449501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11125335656106472, + "step": 6502 + }, + { + "epoch": 0.542, + "grad_norm": 4.90625, + "grad_norm_var": 0.0802734375, + "learning_rate": 4e-05, + "loss": 4.9923, + "loss/crossentropy": 1.8635541796684265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24781085550785065, + "step": 6504 + }, + { + "epoch": 0.5421666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.088671875, + "learning_rate": 4e-05, + "loss": 4.5678, + "loss/crossentropy": 1.9120320081710815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1711360290646553, + "step": 6506 + }, + { + "epoch": 0.5423333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08307291666666666, + "learning_rate": 4e-05, + "loss": 5.228, + "loss/crossentropy": 2.482433497905731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151547260582447, + "step": 6508 + }, + { + "epoch": 0.5425, + "grad_norm": 5.0, + "grad_norm_var": 0.08990885416666666, + "learning_rate": 4e-05, + "loss": 5.2356, + "loss/crossentropy": 1.492501512169838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15523040667176247, + "step": 6510 + }, + { + "epoch": 0.5426666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.10666910807291667, + "learning_rate": 4e-05, + "loss": 4.8739, + "loss/crossentropy": 1.1665829196572304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1341555155813694, + "step": 6512 + }, + { + "epoch": 0.5428333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.09289957682291666, + "learning_rate": 4e-05, + "loss": 4.411, + "loss/crossentropy": 1.2798718959093094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13985286466777325, + "step": 6514 + }, + { + "epoch": 0.543, + "grad_norm": 4.65625, + "grad_norm_var": 0.11809895833333334, + "learning_rate": 4e-05, + "loss": 4.8671, + "loss/crossentropy": 1.679627038538456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18767642788589, + "step": 6516 + }, + { + "epoch": 0.5431666666666667, + "grad_norm": 5.65625, + "grad_norm_var": 0.12561442057291666, + "learning_rate": 4e-05, + "loss": 4.9561, + "loss/crossentropy": 2.608290433883667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20927531272172928, + "step": 6518 + }, + { + "epoch": 0.5433333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.13268229166666667, + "learning_rate": 4e-05, + "loss": 4.8244, + "loss/crossentropy": 2.111786961555481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18980854004621506, + "step": 6520 + }, + { + "epoch": 0.5435, + "grad_norm": 5.09375, + "grad_norm_var": 0.12102457682291666, + "learning_rate": 4e-05, + "loss": 5.3594, + "loss/crossentropy": 2.082971006631851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20488620921969414, + "step": 6522 + }, + { + "epoch": 0.5436666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.12263997395833333, + "learning_rate": 4e-05, + "loss": 5.182, + "loss/crossentropy": 1.7450605109333992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21161355637013912, + "step": 6524 + }, + { + "epoch": 0.5438333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.11705729166666666, + "learning_rate": 4e-05, + "loss": 5.3596, + "loss/crossentropy": 2.4712526500225067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20881268754601479, + "step": 6526 + }, + { + "epoch": 0.544, + "grad_norm": 4.71875, + "grad_norm_var": 0.101025390625, + "learning_rate": 4e-05, + "loss": 5.2519, + "loss/crossentropy": 2.10575695335865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18501529842615128, + "step": 6528 + }, + { + "epoch": 0.5441666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.11418863932291666, + "learning_rate": 4e-05, + "loss": 5.2655, + "loss/crossentropy": 2.047022193670273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2391316220164299, + "step": 6530 + }, + { + "epoch": 0.5443333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.09221598307291666, + "learning_rate": 4e-05, + "loss": 4.3932, + "loss/crossentropy": 2.1609503626823425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19315851852297783, + "step": 6532 + }, + { + "epoch": 0.5445, + "grad_norm": 5.15625, + "grad_norm_var": 0.05872395833333333, + "learning_rate": 4e-05, + "loss": 5.2578, + "loss/crossentropy": 2.1153732389211655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17025521025061607, + "step": 6534 + }, + { + "epoch": 0.5446666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.055074055989583336, + "learning_rate": 4e-05, + "loss": 4.9669, + "loss/crossentropy": 1.997760385274887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18793719820678234, + "step": 6536 + }, + { + "epoch": 0.5448333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.06131184895833333, + "learning_rate": 4e-05, + "loss": 4.9012, + "loss/crossentropy": 1.9758115112781525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17915968596935272, + "step": 6538 + }, + { + "epoch": 0.545, + "grad_norm": 5.28125, + "grad_norm_var": 0.06653645833333334, + "learning_rate": 4e-05, + "loss": 5.3909, + "loss/crossentropy": 1.5755042657256126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.164706502109766, + "step": 6540 + }, + { + "epoch": 0.5451666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.06952718098958334, + "learning_rate": 4e-05, + "loss": 4.3076, + "loss/crossentropy": 0.9922100901603699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12771181762218475, + "step": 6542 + }, + { + "epoch": 0.5453333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.0744140625, + "learning_rate": 4e-05, + "loss": 5.0314, + "loss/crossentropy": 2.3435046076774597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274561271071434, + "step": 6544 + }, + { + "epoch": 0.5455, + "grad_norm": 4.65625, + "grad_norm_var": 0.05074462890625, + "learning_rate": 4e-05, + "loss": 5.3025, + "loss/crossentropy": 2.2309907376766205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21106267720460892, + "step": 6546 + }, + { + "epoch": 0.5456666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.058577473958333334, + "learning_rate": 4e-05, + "loss": 4.8551, + "loss/crossentropy": 1.9048963338136673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19614364951848984, + "step": 6548 + }, + { + "epoch": 0.5458333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.06712239583333333, + "learning_rate": 4e-05, + "loss": 5.2287, + "loss/crossentropy": 2.3945577144622803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21453485265374184, + "step": 6550 + }, + { + "epoch": 0.546, + "grad_norm": 4.8125, + "grad_norm_var": 0.066259765625, + "learning_rate": 4e-05, + "loss": 4.4469, + "loss/crossentropy": 1.5723232179880142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20032503828406334, + "step": 6552 + }, + { + "epoch": 0.5461666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.05904541015625, + "learning_rate": 4e-05, + "loss": 4.8559, + "loss/crossentropy": 1.4620828106999397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15637214109301567, + "step": 6554 + }, + { + "epoch": 0.5463333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.057535807291666664, + "learning_rate": 4e-05, + "loss": 4.9627, + "loss/crossentropy": 2.2152084708213806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839854922145605, + "step": 6556 + }, + { + "epoch": 0.5465, + "grad_norm": 4.625, + "grad_norm_var": 0.06353759765625, + "learning_rate": 4e-05, + "loss": 4.8659, + "loss/crossentropy": 1.2578969895839691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1370962280780077, + "step": 6558 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.11829427083333334, + "learning_rate": 4e-05, + "loss": 4.7406, + "loss/crossentropy": 1.8498205542564392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182185810059309, + "step": 6560 + }, + { + "epoch": 0.5468333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.11184895833333333, + "learning_rate": 4e-05, + "loss": 5.1587, + "loss/crossentropy": 2.459185838699341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21943962574005127, + "step": 6562 + }, + { + "epoch": 0.547, + "grad_norm": 4.65625, + "grad_norm_var": 0.10388997395833334, + "learning_rate": 4e-05, + "loss": 4.7703, + "loss/crossentropy": 1.976406842470169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1768235471099615, + "step": 6564 + }, + { + "epoch": 0.5471666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.10022379557291666, + "learning_rate": 4e-05, + "loss": 4.7495, + "loss/crossentropy": 2.067137509584427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20023474469780922, + "step": 6566 + }, + { + "epoch": 0.5473333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.101416015625, + "learning_rate": 4e-05, + "loss": 5.0631, + "loss/crossentropy": 2.285651355981827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20043394714593887, + "step": 6568 + }, + { + "epoch": 0.5475, + "grad_norm": 4.8125, + "grad_norm_var": 0.09970296223958333, + "learning_rate": 4e-05, + "loss": 5.0448, + "loss/crossentropy": 2.1361614763736725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.196052685379982, + "step": 6570 + }, + { + "epoch": 0.5476666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.09622395833333333, + "learning_rate": 4e-05, + "loss": 5.0062, + "loss/crossentropy": 2.1231243014335632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22360917553305626, + "step": 6572 + }, + { + "epoch": 0.5478333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.08665364583333333, + "learning_rate": 4e-05, + "loss": 4.873, + "loss/crossentropy": 1.7845403030514717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827959530055523, + "step": 6574 + }, + { + "epoch": 0.548, + "grad_norm": 5.0, + "grad_norm_var": 0.026302083333333334, + "learning_rate": 4e-05, + "loss": 4.5735, + "loss/crossentropy": 1.8567826747894287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2372213713824749, + "step": 6576 + }, + { + "epoch": 0.5481666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.038960774739583336, + "learning_rate": 4e-05, + "loss": 5.2786, + "loss/crossentropy": 2.3799859285354614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2384014129638672, + "step": 6578 + }, + { + "epoch": 0.5483333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.039322916666666666, + "learning_rate": 4e-05, + "loss": 4.8077, + "loss/crossentropy": 1.8382329940795898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2353934571146965, + "step": 6580 + }, + { + "epoch": 0.5485, + "grad_norm": 4.8125, + "grad_norm_var": 0.032666015625, + "learning_rate": 4e-05, + "loss": 5.2369, + "loss/crossentropy": 2.491079866886139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22125229239463806, + "step": 6582 + }, + { + "epoch": 0.5486666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.04107666015625, + "learning_rate": 4e-05, + "loss": 4.5376, + "loss/crossentropy": 1.3073586374521255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16539104282855988, + "step": 6584 + }, + { + "epoch": 0.5488333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.043863932291666664, + "learning_rate": 4e-05, + "loss": 4.9317, + "loss/crossentropy": 2.1026156544685364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20953651517629623, + "step": 6586 + }, + { + "epoch": 0.549, + "grad_norm": 5.0, + "grad_norm_var": 0.039957682291666664, + "learning_rate": 4e-05, + "loss": 4.5531, + "loss/crossentropy": 2.3541979789733887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21336714923381805, + "step": 6588 + }, + { + "epoch": 0.5491666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.0408203125, + "learning_rate": 4e-05, + "loss": 4.8354, + "loss/crossentropy": 1.3233697563409805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15271325409412384, + "step": 6590 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.039713541666666664, + "learning_rate": 4e-05, + "loss": 4.7838, + "loss/crossentropy": 1.797164410352707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19014303386211395, + "step": 6592 + }, + { + "epoch": 0.5495, + "grad_norm": 5.03125, + "grad_norm_var": 0.028369140625, + "learning_rate": 4e-05, + "loss": 4.5686, + "loss/crossentropy": 2.195107936859131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21631298959255219, + "step": 6594 + }, + { + "epoch": 0.5496666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.028804524739583334, + "learning_rate": 4e-05, + "loss": 5.015, + "loss/crossentropy": 1.7663453668355942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18998459354043007, + "step": 6596 + }, + { + "epoch": 0.5498333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.03534749348958333, + "learning_rate": 4e-05, + "loss": 5.1948, + "loss/crossentropy": 1.7619916647672653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1652333326637745, + "step": 6598 + }, + { + "epoch": 0.55, + "grad_norm": 5.25, + "grad_norm_var": 0.03240559895833333, + "learning_rate": 4e-05, + "loss": 5.046, + "loss/crossentropy": 2.5450315475463867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705002650618553, + "step": 6600 + }, + { + "epoch": 0.5501666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.028446451822916666, + "learning_rate": 4e-05, + "loss": 5.451, + "loss/crossentropy": 1.96433687210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157164253294468, + "step": 6602 + }, + { + "epoch": 0.5503333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.035477701822916666, + "learning_rate": 4e-05, + "loss": 4.8976, + "loss/crossentropy": 2.3359290957450867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21508868411183357, + "step": 6604 + }, + { + "epoch": 0.5505, + "grad_norm": 4.90625, + "grad_norm_var": 0.03186442057291667, + "learning_rate": 4e-05, + "loss": 4.457, + "loss/crossentropy": 1.537950836122036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15239088609814644, + "step": 6606 + }, + { + "epoch": 0.5506666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.050065104166666666, + "learning_rate": 4e-05, + "loss": 4.447, + "loss/crossentropy": 1.3219322934746742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14767685532569885, + "step": 6608 + }, + { + "epoch": 0.5508333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.05035400390625, + "learning_rate": 4e-05, + "loss": 5.0461, + "loss/crossentropy": 1.9098602086305618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19876160100102425, + "step": 6610 + }, + { + "epoch": 0.551, + "grad_norm": 5.1875, + "grad_norm_var": 0.04855143229166667, + "learning_rate": 4e-05, + "loss": 5.2442, + "loss/crossentropy": 2.309624195098877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23638612031936646, + "step": 6612 + }, + { + "epoch": 0.5511666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.047265625, + "learning_rate": 4e-05, + "loss": 4.8379, + "loss/crossentropy": 1.9966806918382645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17079764790832996, + "step": 6614 + }, + { + "epoch": 0.5513333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.04394124348958333, + "learning_rate": 4e-05, + "loss": 4.3536, + "loss/crossentropy": 2.0822777450084686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22634311020374298, + "step": 6616 + }, + { + "epoch": 0.5515, + "grad_norm": 4.8125, + "grad_norm_var": 0.04021809895833333, + "learning_rate": 4e-05, + "loss": 4.7073, + "loss/crossentropy": 1.2211828529834747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18051068857312202, + "step": 6618 + }, + { + "epoch": 0.5516666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.03762613932291667, + "learning_rate": 4e-05, + "loss": 5.5942, + "loss/crossentropy": 2.4567679166793823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20884644612669945, + "step": 6620 + }, + { + "epoch": 0.5518333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.05188395182291667, + "learning_rate": 4e-05, + "loss": 5.4697, + "loss/crossentropy": 2.2509296238422394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176789492368698, + "step": 6622 + }, + { + "epoch": 0.552, + "grad_norm": 4.6875, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 4e-05, + "loss": 5.278, + "loss/crossentropy": 1.8549513220787048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1713106855750084, + "step": 6624 + }, + { + "epoch": 0.5521666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.048681640625, + "learning_rate": 4e-05, + "loss": 4.3434, + "loss/crossentropy": 1.9512775838375092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18272232450544834, + "step": 6626 + }, + { + "epoch": 0.5523333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.05758056640625, + "learning_rate": 4e-05, + "loss": 4.2377, + "loss/crossentropy": 1.8292421698570251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19010495953261852, + "step": 6628 + }, + { + "epoch": 0.5525, + "grad_norm": 5.0, + "grad_norm_var": 0.06249593098958333, + "learning_rate": 4e-05, + "loss": 4.8372, + "loss/crossentropy": 1.668544426560402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17395753040909767, + "step": 6630 + }, + { + "epoch": 0.5526666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.05657145182291667, + "learning_rate": 4e-05, + "loss": 4.9377, + "loss/crossentropy": 1.7942739725112915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17174866795539856, + "step": 6632 + }, + { + "epoch": 0.5528333333333333, + "grad_norm": 5.5, + "grad_norm_var": 0.08566080729166667, + "learning_rate": 4e-05, + "loss": 5.575, + "loss/crossentropy": 2.2777758836746216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2089482769370079, + "step": 6634 + }, + { + "epoch": 0.553, + "grad_norm": 4.71875, + "grad_norm_var": 0.08332926432291667, + "learning_rate": 4e-05, + "loss": 4.9785, + "loss/crossentropy": 2.664782464504242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22308696061372757, + "step": 6636 + }, + { + "epoch": 0.5531666666666667, + "grad_norm": 4.1875, + "grad_norm_var": 0.1029296875, + "learning_rate": 4e-05, + "loss": 4.2995, + "loss/crossentropy": 1.231726422905922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14293777011334896, + "step": 6638 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.10354410807291667, + "learning_rate": 4e-05, + "loss": 4.8424, + "loss/crossentropy": 1.7617796063423157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960979737341404, + "step": 6640 + }, + { + "epoch": 0.5535, + "grad_norm": 4.8125, + "grad_norm_var": 0.0982421875, + "learning_rate": 4e-05, + "loss": 4.9507, + "loss/crossentropy": 2.1851932406425476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22447919100522995, + "step": 6642 + }, + { + "epoch": 0.5536666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.08658854166666667, + "learning_rate": 4e-05, + "loss": 4.4639, + "loss/crossentropy": 1.8027335181832314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19817146845161915, + "step": 6644 + }, + { + "epoch": 0.5538333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.08967692057291667, + "learning_rate": 4e-05, + "loss": 5.0155, + "loss/crossentropy": 1.1861390694975853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14436486922204494, + "step": 6646 + }, + { + "epoch": 0.554, + "grad_norm": 4.875, + "grad_norm_var": 0.09073893229166667, + "learning_rate": 4e-05, + "loss": 5.6074, + "loss/crossentropy": 2.3672031462192535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19832666590809822, + "step": 6648 + }, + { + "epoch": 0.5541666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.07629801432291666, + "learning_rate": 4e-05, + "loss": 5.1871, + "loss/crossentropy": 2.238451138138771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17222367227077484, + "step": 6650 + }, + { + "epoch": 0.5543333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.08801676432291666, + "learning_rate": 4e-05, + "loss": 4.3575, + "loss/crossentropy": 1.422397181391716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17442157492041588, + "step": 6652 + }, + { + "epoch": 0.5545, + "grad_norm": 4.84375, + "grad_norm_var": 0.05206705729166667, + "learning_rate": 4e-05, + "loss": 4.9704, + "loss/crossentropy": 2.4562787413597107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20161322504281998, + "step": 6654 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.057145182291666666, + "learning_rate": 4e-05, + "loss": 5.1275, + "loss/crossentropy": 2.188231348991394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168805122375488, + "step": 6656 + }, + { + "epoch": 0.5548333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.055078125, + "learning_rate": 4e-05, + "loss": 5.0781, + "loss/crossentropy": 2.32028391957283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20598405227065086, + "step": 6658 + }, + { + "epoch": 0.555, + "grad_norm": 5.34375, + "grad_norm_var": 0.05712483723958333, + "learning_rate": 4e-05, + "loss": 4.9123, + "loss/crossentropy": 1.8713824450969696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19735869392752647, + "step": 6660 + }, + { + "epoch": 0.5551666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.05240478515625, + "learning_rate": 4e-05, + "loss": 4.9659, + "loss/crossentropy": 1.9562078714370728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18440138176083565, + "step": 6662 + }, + { + "epoch": 0.5553333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.054976399739583334, + "learning_rate": 4e-05, + "loss": 5.1777, + "loss/crossentropy": 1.6304501295089722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18326781317591667, + "step": 6664 + }, + { + "epoch": 0.5555, + "grad_norm": 4.75, + "grad_norm_var": 0.0427734375, + "learning_rate": 4e-05, + "loss": 4.0937, + "loss/crossentropy": 0.6879162788391113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14016366563737392, + "step": 6666 + }, + { + "epoch": 0.5556666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.04429931640625, + "learning_rate": 4e-05, + "loss": 4.9816, + "loss/crossentropy": 1.8157347962260246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18013755604624748, + "step": 6668 + }, + { + "epoch": 0.5558333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.059098307291666666, + "learning_rate": 4e-05, + "loss": 5.1179, + "loss/crossentropy": 2.056758761405945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23633424937725067, + "step": 6670 + }, + { + "epoch": 0.556, + "grad_norm": 4.25, + "grad_norm_var": 0.08990885416666666, + "learning_rate": 4e-05, + "loss": 4.5153, + "loss/crossentropy": 2.417732000350952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401347428560257, + "step": 6672 + }, + { + "epoch": 0.5561666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.09514567057291666, + "learning_rate": 4e-05, + "loss": 5.3679, + "loss/crossentropy": 1.909975491464138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16564680822193623, + "step": 6674 + }, + { + "epoch": 0.5563333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08058268229166667, + "learning_rate": 4e-05, + "loss": 5.439, + "loss/crossentropy": 2.06892479211092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18954100832343102, + "step": 6676 + }, + { + "epoch": 0.5565, + "grad_norm": 4.4375, + "grad_norm_var": 0.09250895182291667, + "learning_rate": 4e-05, + "loss": 4.6291, + "loss/crossentropy": 2.054016627371311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1676958091557026, + "step": 6678 + }, + { + "epoch": 0.5566666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.10558268229166666, + "learning_rate": 4e-05, + "loss": 4.9014, + "loss/crossentropy": 2.0140282213687897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17690856754779816, + "step": 6680 + }, + { + "epoch": 0.5568333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.10507405598958333, + "learning_rate": 4e-05, + "loss": 4.8231, + "loss/crossentropy": 2.3354055285453796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20604095980525017, + "step": 6682 + }, + { + "epoch": 0.557, + "grad_norm": 5.1875, + "grad_norm_var": 0.10859375, + "learning_rate": 4e-05, + "loss": 5.232, + "loss/crossentropy": 2.084495782852173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18740139529109, + "step": 6684 + }, + { + "epoch": 0.5571666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.07734375, + "learning_rate": 4e-05, + "loss": 4.4036, + "loss/crossentropy": 1.638416811823845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15969400480389595, + "step": 6686 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.08196614583333334, + "learning_rate": 4e-05, + "loss": 4.7544, + "loss/crossentropy": 0.9400245994329453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12616140954196453, + "step": 6688 + }, + { + "epoch": 0.5575, + "grad_norm": 4.875, + "grad_norm_var": 0.07766927083333333, + "learning_rate": 4e-05, + "loss": 5.2425, + "loss/crossentropy": 2.1369471848011017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22332263365387917, + "step": 6690 + }, + { + "epoch": 0.5576666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.07616780598958334, + "learning_rate": 4e-05, + "loss": 5.1849, + "loss/crossentropy": 2.384535789489746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22261762246489525, + "step": 6692 + }, + { + "epoch": 0.5578333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.06249593098958333, + "learning_rate": 4e-05, + "loss": 4.6685, + "loss/crossentropy": 1.8818542137742043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18431300669908524, + "step": 6694 + }, + { + "epoch": 0.558, + "grad_norm": 4.5625, + "grad_norm_var": 0.05487874348958333, + "learning_rate": 4e-05, + "loss": 4.7571, + "loss/crossentropy": 0.9524921476840973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11215276457369328, + "step": 6696 + }, + { + "epoch": 0.5581666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.05974934895833333, + "learning_rate": 4e-05, + "loss": 4.6185, + "loss/crossentropy": 1.4619218409061432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1464745569974184, + "step": 6698 + }, + { + "epoch": 0.5583333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.04837239583333333, + "learning_rate": 4e-05, + "loss": 4.8486, + "loss/crossentropy": 2.051357090473175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964271783828735, + "step": 6700 + }, + { + "epoch": 0.5585, + "grad_norm": 4.71875, + "grad_norm_var": 0.051102701822916666, + "learning_rate": 4e-05, + "loss": 4.0687, + "loss/crossentropy": 1.5994284898042679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16306093521416187, + "step": 6702 + }, + { + "epoch": 0.5586666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.029817708333333335, + "learning_rate": 4e-05, + "loss": 4.4709, + "loss/crossentropy": 2.0101925432682037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20254620909690857, + "step": 6704 + }, + { + "epoch": 0.5588333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.029150390625, + "learning_rate": 4e-05, + "loss": 4.2, + "loss/crossentropy": 1.8986879587173462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18273010104894638, + "step": 6706 + }, + { + "epoch": 0.559, + "grad_norm": 5.28125, + "grad_norm_var": 0.056929524739583334, + "learning_rate": 4e-05, + "loss": 4.9344, + "loss/crossentropy": 1.509821131825447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933259814977646, + "step": 6708 + }, + { + "epoch": 0.5591666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.06949462890625, + "learning_rate": 4e-05, + "loss": 4.5876, + "loss/crossentropy": 1.5221448838710785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1912355963140726, + "step": 6710 + }, + { + "epoch": 0.5593333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.06962483723958333, + "learning_rate": 4e-05, + "loss": 4.812, + "loss/crossentropy": 2.3658363819122314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21914341673254967, + "step": 6712 + }, + { + "epoch": 0.5595, + "grad_norm": 4.96875, + "grad_norm_var": 0.0685546875, + "learning_rate": 4e-05, + "loss": 5.2591, + "loss/crossentropy": 2.0233709514141083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21164459735155106, + "step": 6714 + }, + { + "epoch": 0.5596666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.10091145833333333, + "learning_rate": 4e-05, + "loss": 5.6136, + "loss/crossentropy": 2.1298616975545883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18793256022036076, + "step": 6716 + }, + { + "epoch": 0.5598333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.10846354166666666, + "learning_rate": 4e-05, + "loss": 5.0254, + "loss/crossentropy": 1.5498792678117752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15589362382888794, + "step": 6718 + }, + { + "epoch": 0.56, + "grad_norm": 5.46875, + "grad_norm_var": 0.12470296223958334, + "learning_rate": 4e-05, + "loss": 5.0242, + "loss/crossentropy": 2.247347056865692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19609899818897247, + "step": 6720 + }, + { + "epoch": 0.5601666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.09837239583333333, + "learning_rate": 4e-05, + "loss": 5.0665, + "loss/crossentropy": 1.7242010906338692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1808859072625637, + "step": 6722 + }, + { + "epoch": 0.5603333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.06534830729166667, + "learning_rate": 4e-05, + "loss": 5.3355, + "loss/crossentropy": 2.5626547932624817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478646084666252, + "step": 6724 + }, + { + "epoch": 0.5605, + "grad_norm": 9.75, + "grad_norm_var": 1.4786417643229166, + "learning_rate": 4e-05, + "loss": 5.0271, + "loss/crossentropy": 2.60895174741745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22574814409017563, + "step": 6726 + }, + { + "epoch": 0.5606666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 1.514453125, + "learning_rate": 4e-05, + "loss": 4.5541, + "loss/crossentropy": 1.6328820586204529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18884171918034554, + "step": 6728 + }, + { + "epoch": 0.5608333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 1.5160115559895833, + "learning_rate": 4e-05, + "loss": 4.5824, + "loss/crossentropy": 0.8445823714137077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11832733266055584, + "step": 6730 + }, + { + "epoch": 0.561, + "grad_norm": 4.875, + "grad_norm_var": 1.5098917643229166, + "learning_rate": 4e-05, + "loss": 4.8565, + "loss/crossentropy": 2.287008821964264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20950447022914886, + "step": 6732 + }, + { + "epoch": 0.5611666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 1.5292277018229166, + "learning_rate": 4e-05, + "loss": 4.6073, + "loss/crossentropy": 2.079039454460144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198956910520792, + "step": 6734 + }, + { + "epoch": 0.5613333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 1.56275634765625, + "learning_rate": 4e-05, + "loss": 5.5193, + "loss/crossentropy": 2.555041193962097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20681289210915565, + "step": 6736 + }, + { + "epoch": 0.5615, + "grad_norm": 5.0625, + "grad_norm_var": 1.5798828125, + "learning_rate": 4e-05, + "loss": 5.5108, + "loss/crossentropy": 2.1208256036043167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17366488836705685, + "step": 6738 + }, + { + "epoch": 0.5616666666666666, + "grad_norm": 4.75, + "grad_norm_var": 1.5755167643229167, + "learning_rate": 4e-05, + "loss": 5.2524, + "loss/crossentropy": 2.362846553325653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502614036202431, + "step": 6740 + }, + { + "epoch": 0.5618333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.104931640625, + "learning_rate": 4e-05, + "loss": 4.9545, + "loss/crossentropy": 2.3334818482398987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19263581559062004, + "step": 6742 + }, + { + "epoch": 0.562, + "grad_norm": 5.125, + "grad_norm_var": 0.09755452473958333, + "learning_rate": 4e-05, + "loss": 5.4646, + "loss/crossentropy": 2.254566043615341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652917608618736, + "step": 6744 + }, + { + "epoch": 0.5621666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.10774739583333333, + "learning_rate": 4e-05, + "loss": 4.4787, + "loss/crossentropy": 1.0434362962841988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.122630275785923, + "step": 6746 + }, + { + "epoch": 0.5623333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.10755208333333334, + "learning_rate": 4e-05, + "loss": 5.2754, + "loss/crossentropy": 2.429089665412903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22747541218996048, + "step": 6748 + }, + { + "epoch": 0.5625, + "grad_norm": 4.40625, + "grad_norm_var": 0.11708577473958333, + "learning_rate": 4e-05, + "loss": 4.9665, + "loss/crossentropy": 1.912009358406067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17672326415777206, + "step": 6750 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.06705322265625, + "learning_rate": 4e-05, + "loss": 4.5696, + "loss/crossentropy": 1.6353125348687172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.179355189204216, + "step": 6752 + }, + { + "epoch": 0.5628333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.06363525390625, + "learning_rate": 4e-05, + "loss": 5.0427, + "loss/crossentropy": 1.8525151684880257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19017129763960838, + "step": 6754 + }, + { + "epoch": 0.563, + "grad_norm": 5.0625, + "grad_norm_var": 0.07001546223958334, + "learning_rate": 4e-05, + "loss": 4.8502, + "loss/crossentropy": 2.0079415440559387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22154979780316353, + "step": 6756 + }, + { + "epoch": 0.5631666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.07353108723958333, + "learning_rate": 4e-05, + "loss": 5.2495, + "loss/crossentropy": 2.432105541229248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2078854702413082, + "step": 6758 + }, + { + "epoch": 0.5633333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.07310791015625, + "learning_rate": 4e-05, + "loss": 4.8135, + "loss/crossentropy": 2.318072497844696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19287996366620064, + "step": 6760 + }, + { + "epoch": 0.5635, + "grad_norm": 5.03125, + "grad_norm_var": 0.06873372395833334, + "learning_rate": 4e-05, + "loss": 4.9044, + "loss/crossentropy": 2.040756940841675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234269879758358, + "step": 6762 + }, + { + "epoch": 0.5636666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.31116129557291666, + "learning_rate": 4e-05, + "loss": 4.9236, + "loss/crossentropy": 2.3868152499198914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909792795777321, + "step": 6764 + }, + { + "epoch": 0.5638333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.30419514973958334, + "learning_rate": 4e-05, + "loss": 4.947, + "loss/crossentropy": 2.2354209423065186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142213024199009, + "step": 6766 + }, + { + "epoch": 0.564, + "grad_norm": 5.125, + "grad_norm_var": 0.29257405598958336, + "learning_rate": 4e-05, + "loss": 5.5168, + "loss/crossentropy": 2.0357046499848366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17524536699056625, + "step": 6768 + }, + { + "epoch": 0.5641666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.29412434895833334, + "learning_rate": 4e-05, + "loss": 4.8833, + "loss/crossentropy": 1.6228312253952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1723469439893961, + "step": 6770 + }, + { + "epoch": 0.5643333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.2816243489583333, + "learning_rate": 4e-05, + "loss": 5.1799, + "loss/crossentropy": 1.7683988213539124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16633950546383858, + "step": 6772 + }, + { + "epoch": 0.5645, + "grad_norm": 5.53125, + "grad_norm_var": 0.29003499348958334, + "learning_rate": 4e-05, + "loss": 4.7752, + "loss/crossentropy": 1.9569614827632904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17459377273917198, + "step": 6774 + }, + { + "epoch": 0.5646666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.3126953125, + "learning_rate": 4e-05, + "loss": 4.7691, + "loss/crossentropy": 1.7250538617372513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17075425572693348, + "step": 6776 + }, + { + "epoch": 0.5648333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.30500895182291665, + "learning_rate": 4e-05, + "loss": 5.0686, + "loss/crossentropy": 2.08631694316864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18906661123037338, + "step": 6778 + }, + { + "epoch": 0.565, + "grad_norm": 4.84375, + "grad_norm_var": 0.08043212890625, + "learning_rate": 4e-05, + "loss": 5.6042, + "loss/crossentropy": 2.473922371864319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215004812926054, + "step": 6780 + }, + { + "epoch": 0.5651666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.06443684895833333, + "learning_rate": 4e-05, + "loss": 5.2425, + "loss/crossentropy": 2.2826786637306213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24417144805192947, + "step": 6782 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.06868489583333333, + "learning_rate": 4e-05, + "loss": 4.7002, + "loss/crossentropy": 2.471294343471527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048209197819233, + "step": 6784 + }, + { + "epoch": 0.5655, + "grad_norm": 5.09375, + "grad_norm_var": 0.06669514973958333, + "learning_rate": 4e-05, + "loss": 5.3737, + "loss/crossentropy": 2.359356611967087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23297657072544098, + "step": 6786 + }, + { + "epoch": 0.5656666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.057145182291666666, + "learning_rate": 4e-05, + "loss": 4.8805, + "loss/crossentropy": 1.9158901870250702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967592779546976, + "step": 6788 + }, + { + "epoch": 0.5658333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.04816080729166667, + "learning_rate": 4e-05, + "loss": 5.2392, + "loss/crossentropy": 1.9844098091125488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17694342508912086, + "step": 6790 + }, + { + "epoch": 0.566, + "grad_norm": 4.84375, + "grad_norm_var": 0.05026041666666667, + "learning_rate": 4e-05, + "loss": 4.4574, + "loss/crossentropy": 1.1681826636195183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12684665992856026, + "step": 6792 + }, + { + "epoch": 0.5661666666666667, + "grad_norm": 5.65625, + "grad_norm_var": 0.086181640625, + "learning_rate": 4e-05, + "loss": 5.2646, + "loss/crossentropy": 1.9428167939186096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1700345128774643, + "step": 6794 + }, + { + "epoch": 0.5663333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.08409830729166666, + "learning_rate": 4e-05, + "loss": 5.1086, + "loss/crossentropy": 1.7468329519033432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17568138800561428, + "step": 6796 + }, + { + "epoch": 0.5665, + "grad_norm": 5.03125, + "grad_norm_var": 0.086572265625, + "learning_rate": 4e-05, + "loss": 4.9981, + "loss/crossentropy": 1.8271580636501312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1846771389245987, + "step": 6798 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.089306640625, + "learning_rate": 4e-05, + "loss": 4.9397, + "loss/crossentropy": 1.8120107203722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16303765773773193, + "step": 6800 + }, + { + "epoch": 0.5668333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.114306640625, + "learning_rate": 4e-05, + "loss": 5.0096, + "loss/crossentropy": 2.0087318643927574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18550613150000572, + "step": 6802 + }, + { + "epoch": 0.567, + "grad_norm": 4.90625, + "grad_norm_var": 0.11334228515625, + "learning_rate": 4e-05, + "loss": 4.6148, + "loss/crossentropy": 2.5129515528678894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20342765003442764, + "step": 6804 + }, + { + "epoch": 0.5671666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.1056640625, + "learning_rate": 4e-05, + "loss": 4.8834, + "loss/crossentropy": 2.082836002111435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934790275990963, + "step": 6806 + }, + { + "epoch": 0.5673333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.08605143229166666, + "learning_rate": 4e-05, + "loss": 4.6374, + "loss/crossentropy": 2.0405823960900307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18986310623586178, + "step": 6808 + }, + { + "epoch": 0.5675, + "grad_norm": 5.0625, + "grad_norm_var": 0.04537353515625, + "learning_rate": 4e-05, + "loss": 5.0763, + "loss/crossentropy": 2.2310905158519745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792350336909294, + "step": 6810 + }, + { + "epoch": 0.5676666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.04186197916666667, + "learning_rate": 4e-05, + "loss": 4.9157, + "loss/crossentropy": 2.1328996419906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22394980490207672, + "step": 6812 + }, + { + "epoch": 0.5678333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.04071858723958333, + "learning_rate": 4e-05, + "loss": 4.3477, + "loss/crossentropy": 1.644135631620884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680170506238937, + "step": 6814 + }, + { + "epoch": 0.568, + "grad_norm": 4.84375, + "grad_norm_var": 0.02926025390625, + "learning_rate": 4e-05, + "loss": 4.8556, + "loss/crossentropy": 2.437521994113922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220199353992939, + "step": 6816 + }, + { + "epoch": 0.5681666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.0193359375, + "learning_rate": 4e-05, + "loss": 4.9518, + "loss/crossentropy": 2.237753540277481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20501817017793655, + "step": 6818 + }, + { + "epoch": 0.5683333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.022135416666666668, + "learning_rate": 4e-05, + "loss": 4.4411, + "loss/crossentropy": 1.9096024632453918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19846992194652557, + "step": 6820 + }, + { + "epoch": 0.5685, + "grad_norm": 4.6875, + "grad_norm_var": 0.028934733072916666, + "learning_rate": 4e-05, + "loss": 4.6075, + "loss/crossentropy": 2.0617934688925743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183325186371803, + "step": 6822 + }, + { + "epoch": 0.5686666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.03420817057291667, + "learning_rate": 4e-05, + "loss": 5.1295, + "loss/crossentropy": 1.7742072641849518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17162226885557175, + "step": 6824 + }, + { + "epoch": 0.5688333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.031245930989583334, + "learning_rate": 4e-05, + "loss": 4.8558, + "loss/crossentropy": 2.2291614413261414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22294742614030838, + "step": 6826 + }, + { + "epoch": 0.569, + "grad_norm": 4.71875, + "grad_norm_var": 0.031083170572916666, + "learning_rate": 4e-05, + "loss": 5.1684, + "loss/crossentropy": 2.303519546985626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20848600566387177, + "step": 6828 + }, + { + "epoch": 0.5691666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.03515218098958333, + "learning_rate": 4e-05, + "loss": 5.1765, + "loss/crossentropy": 1.977641612291336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18245596811175346, + "step": 6830 + }, + { + "epoch": 0.5693333333333334, + "grad_norm": 5.78125, + "grad_norm_var": 0.10136311848958333, + "learning_rate": 4e-05, + "loss": 4.9752, + "loss/crossentropy": 2.1543014645576477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22348742187023163, + "step": 6832 + }, + { + "epoch": 0.5695, + "grad_norm": 4.75, + "grad_norm_var": 0.3146769205729167, + "learning_rate": 4e-05, + "loss": 4.4685, + "loss/crossentropy": 1.5171415954828262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1520980577915907, + "step": 6834 + }, + { + "epoch": 0.5696666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.3045572916666667, + "learning_rate": 4e-05, + "loss": 5.0365, + "loss/crossentropy": 1.4744467735290527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1628479491919279, + "step": 6836 + }, + { + "epoch": 0.5698333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.278515625, + "learning_rate": 4e-05, + "loss": 5.1384, + "loss/crossentropy": 2.248887836933136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23819807171821594, + "step": 6838 + }, + { + "epoch": 0.57, + "grad_norm": 4.6875, + "grad_norm_var": 0.274853515625, + "learning_rate": 4e-05, + "loss": 5.007, + "loss/crossentropy": 1.9553302228450775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953351702541113, + "step": 6840 + }, + { + "epoch": 0.5701666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.27421875, + "learning_rate": 4e-05, + "loss": 4.758, + "loss/crossentropy": 1.7406494319438934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21699757128953934, + "step": 6842 + }, + { + "epoch": 0.5703333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.276416015625, + "learning_rate": 4e-05, + "loss": 4.3834, + "loss/crossentropy": 1.7711199223995209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848914809525013, + "step": 6844 + }, + { + "epoch": 0.5705, + "grad_norm": 4.53125, + "grad_norm_var": 0.2936482747395833, + "learning_rate": 4e-05, + "loss": 4.9482, + "loss/crossentropy": 2.113058567047119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19523290544748306, + "step": 6846 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.25104166666666666, + "learning_rate": 4e-05, + "loss": 4.7867, + "loss/crossentropy": 1.378658078610897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14851684123277664, + "step": 6848 + }, + { + "epoch": 0.5708333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.03561197916666667, + "learning_rate": 4e-05, + "loss": 5.1184, + "loss/crossentropy": 2.6506794095039368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21903324127197266, + "step": 6850 + }, + { + "epoch": 0.571, + "grad_norm": 5.0, + "grad_norm_var": 0.03209228515625, + "learning_rate": 4e-05, + "loss": 4.8231, + "loss/crossentropy": 1.9687937498092651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869981847703457, + "step": 6852 + }, + { + "epoch": 0.5711666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.03723551432291667, + "learning_rate": 4e-05, + "loss": 4.8397, + "loss/crossentropy": 1.3870623856782913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16671552881598473, + "step": 6854 + }, + { + "epoch": 0.5713333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.040478515625, + "learning_rate": 4e-05, + "loss": 4.7197, + "loss/crossentropy": 1.9801999479532242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19567257165908813, + "step": 6856 + }, + { + "epoch": 0.5715, + "grad_norm": 4.5, + "grad_norm_var": 0.048421223958333336, + "learning_rate": 4e-05, + "loss": 5.1194, + "loss/crossentropy": 1.1472266912460327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11946773529052734, + "step": 6858 + }, + { + "epoch": 0.5716666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.048563639322916664, + "learning_rate": 4e-05, + "loss": 5.605, + "loss/crossentropy": 2.2561517730355263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16934522055089474, + "step": 6860 + }, + { + "epoch": 0.5718333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.04241129557291667, + "learning_rate": 4e-05, + "loss": 5.3357, + "loss/crossentropy": 2.355598211288452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21308866888284683, + "step": 6862 + }, + { + "epoch": 0.572, + "grad_norm": 4.5, + "grad_norm_var": 0.04537760416666667, + "learning_rate": 4e-05, + "loss": 5.0089, + "loss/crossentropy": 2.11463263630867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152191400527954, + "step": 6864 + }, + { + "epoch": 0.5721666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 1.681494140625, + "learning_rate": 4e-05, + "loss": 4.8888, + "loss/crossentropy": 1.9020079374313354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944811660796404, + "step": 6866 + }, + { + "epoch": 0.5723333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 1.6887003580729167, + "learning_rate": 4e-05, + "loss": 4.6484, + "loss/crossentropy": 1.2562482431530952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15086103230714798, + "step": 6868 + }, + { + "epoch": 0.5725, + "grad_norm": 4.75, + "grad_norm_var": 1.6973592122395833, + "learning_rate": 4e-05, + "loss": 4.9415, + "loss/crossentropy": 2.1037851870059967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20293112844228745, + "step": 6870 + }, + { + "epoch": 0.5726666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 1.678369140625, + "learning_rate": 4e-05, + "loss": 5.1511, + "loss/crossentropy": 2.3142004013061523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23257246613502502, + "step": 6872 + }, + { + "epoch": 0.5728333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 1.6506144205729167, + "learning_rate": 4e-05, + "loss": 4.994, + "loss/crossentropy": 1.8683245033025742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040301226079464, + "step": 6874 + }, + { + "epoch": 0.573, + "grad_norm": 4.53125, + "grad_norm_var": 1.6702962239583334, + "learning_rate": 4e-05, + "loss": 4.4974, + "loss/crossentropy": 2.2722477316856384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20786982402205467, + "step": 6876 + }, + { + "epoch": 0.5731666666666667, + "grad_norm": 5.0, + "grad_norm_var": 1.6706868489583333, + "learning_rate": 4e-05, + "loss": 5.012, + "loss/crossentropy": 2.5765005946159363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2184704802930355, + "step": 6878 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 4.75, + "grad_norm_var": 1.642041015625, + "learning_rate": 4e-05, + "loss": 5.3578, + "loss/crossentropy": 2.108662247657776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19640013948082924, + "step": 6880 + }, + { + "epoch": 0.5735, + "grad_norm": 4.84375, + "grad_norm_var": 0.027278645833333334, + "learning_rate": 4e-05, + "loss": 5.3062, + "loss/crossentropy": 2.601102828979492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24048221856355667, + "step": 6882 + }, + { + "epoch": 0.5736666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.03229166666666667, + "learning_rate": 4e-05, + "loss": 5.0257, + "loss/crossentropy": 0.8844295963644981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1019323579967022, + "step": 6884 + }, + { + "epoch": 0.5738333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.03280843098958333, + "learning_rate": 4e-05, + "loss": 4.7654, + "loss/crossentropy": 2.2570102512836456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22596049681305885, + "step": 6886 + }, + { + "epoch": 0.574, + "grad_norm": 5.28125, + "grad_norm_var": 0.04843343098958333, + "learning_rate": 4e-05, + "loss": 5.5993, + "loss/crossentropy": 2.4165295362472534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355419434607029, + "step": 6888 + }, + { + "epoch": 0.5741666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.0533203125, + "learning_rate": 4e-05, + "loss": 4.3093, + "loss/crossentropy": 1.5194483771920204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1469650249928236, + "step": 6890 + }, + { + "epoch": 0.5743333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.04732666015625, + "learning_rate": 4e-05, + "loss": 5.1506, + "loss/crossentropy": 2.3721578419208527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981596052646637, + "step": 6892 + }, + { + "epoch": 0.5745, + "grad_norm": 4.875, + "grad_norm_var": 0.047526041666666664, + "learning_rate": 4e-05, + "loss": 5.0537, + "loss/crossentropy": 2.3216958045959473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19841821864247322, + "step": 6894 + }, + { + "epoch": 0.5746666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.06116129557291667, + "learning_rate": 4e-05, + "loss": 4.1328, + "loss/crossentropy": 1.9104135930538177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16793107241392136, + "step": 6896 + }, + { + "epoch": 0.5748333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.091650390625, + "learning_rate": 4e-05, + "loss": 4.1164, + "loss/crossentropy": 0.9877064228057861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12135954014956951, + "step": 6898 + }, + { + "epoch": 0.575, + "grad_norm": 4.75, + "grad_norm_var": 0.08938802083333333, + "learning_rate": 4e-05, + "loss": 4.8793, + "loss/crossentropy": 2.3174608945846558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19842035695910454, + "step": 6900 + }, + { + "epoch": 0.5751666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.09859619140625, + "learning_rate": 4e-05, + "loss": 5.2412, + "loss/crossentropy": 2.267774134874344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262442149221897, + "step": 6902 + }, + { + "epoch": 0.5753333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.0853515625, + "learning_rate": 4e-05, + "loss": 5.2218, + "loss/crossentropy": 1.7064669355750084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17715736478567123, + "step": 6904 + }, + { + "epoch": 0.5755, + "grad_norm": 5.1875, + "grad_norm_var": 0.084619140625, + "learning_rate": 4e-05, + "loss": 5.1483, + "loss/crossentropy": 2.3594651222229004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23974528908729553, + "step": 6906 + }, + { + "epoch": 0.5756666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.09683837890625, + "learning_rate": 4e-05, + "loss": 5.252, + "loss/crossentropy": 2.312884032726288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21780531853437424, + "step": 6908 + }, + { + "epoch": 0.5758333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.10323893229166667, + "learning_rate": 4e-05, + "loss": 4.6783, + "loss/crossentropy": 1.9019780158996582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16511645540595055, + "step": 6910 + }, + { + "epoch": 0.576, + "grad_norm": 5.46875, + "grad_norm_var": 0.09452718098958333, + "learning_rate": 4e-05, + "loss": 4.7793, + "loss/crossentropy": 2.478718101978302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21854493767023087, + "step": 6912 + }, + { + "epoch": 0.5761666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.05383707682291667, + "learning_rate": 4e-05, + "loss": 4.687, + "loss/crossentropy": 1.895049013197422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19955835677683353, + "step": 6914 + }, + { + "epoch": 0.5763333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.06718343098958333, + "learning_rate": 4e-05, + "loss": 4.9585, + "loss/crossentropy": 1.823567271232605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18378795869648457, + "step": 6916 + }, + { + "epoch": 0.5765, + "grad_norm": 9.125, + "grad_norm_var": 1.12574462890625, + "learning_rate": 4e-05, + "loss": 5.0141, + "loss/crossentropy": 1.417652204632759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1524863112717867, + "step": 6918 + }, + { + "epoch": 0.5766666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 1.175634765625, + "learning_rate": 4e-05, + "loss": 4.7746, + "loss/crossentropy": 1.9156860336661339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19187748990952969, + "step": 6920 + }, + { + "epoch": 0.5768333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 1.1823527018229167, + "learning_rate": 4e-05, + "loss": 4.6849, + "loss/crossentropy": 1.7854193970561028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17549366503953934, + "step": 6922 + }, + { + "epoch": 0.577, + "grad_norm": 4.375, + "grad_norm_var": 1.2242024739583333, + "learning_rate": 4e-05, + "loss": 4.8228, + "loss/crossentropy": 1.835989773273468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17153911851346493, + "step": 6924 + }, + { + "epoch": 0.5771666666666667, + "grad_norm": 6.65625, + "grad_norm_var": 1.3544230143229166, + "learning_rate": 4e-05, + "loss": 5.2973, + "loss/crossentropy": 2.4750488996505737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117002233862877, + "step": 6926 + }, + { + "epoch": 0.5773333333333334, + "grad_norm": 4.625, + "grad_norm_var": 1.3676717122395834, + "learning_rate": 4e-05, + "loss": 5.4856, + "loss/crossentropy": 2.3786118626594543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20669499039649963, + "step": 6928 + }, + { + "epoch": 0.5775, + "grad_norm": 4.75, + "grad_norm_var": 1.3824503580729166, + "learning_rate": 4e-05, + "loss": 4.343, + "loss/crossentropy": 1.9303578808903694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886876206845045, + "step": 6930 + }, + { + "epoch": 0.5776666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 1.350634765625, + "learning_rate": 4e-05, + "loss": 4.7448, + "loss/crossentropy": 2.3733231723308563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21326325461268425, + "step": 6932 + }, + { + "epoch": 0.5778333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.2679646809895833, + "learning_rate": 4e-05, + "loss": 4.925, + "loss/crossentropy": 2.333590805530548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030404508113861, + "step": 6934 + }, + { + "epoch": 0.578, + "grad_norm": 4.71875, + "grad_norm_var": 0.24911702473958333, + "learning_rate": 4e-05, + "loss": 4.8514, + "loss/crossentropy": 2.50896617770195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22089635208249092, + "step": 6936 + }, + { + "epoch": 0.5781666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.2536417643229167, + "learning_rate": 4e-05, + "loss": 4.5747, + "loss/crossentropy": 1.3991278186440468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19910966977477074, + "step": 6938 + }, + { + "epoch": 0.5783333333333334, + "grad_norm": 5.625, + "grad_norm_var": 0.243603515625, + "learning_rate": 4e-05, + "loss": 4.7782, + "loss/crossentropy": 1.2879233956336975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14194754138588905, + "step": 6940 + }, + { + "epoch": 0.5785, + "grad_norm": 4.84375, + "grad_norm_var": 0.07394205729166667, + "learning_rate": 4e-05, + "loss": 4.8375, + "loss/crossentropy": 2.180011212825775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19299915060400963, + "step": 6942 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.08391927083333334, + "learning_rate": 4e-05, + "loss": 5.0074, + "loss/crossentropy": 1.9094977304339409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777942907065153, + "step": 6944 + }, + { + "epoch": 0.5788333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.09099934895833334, + "learning_rate": 4e-05, + "loss": 4.9617, + "loss/crossentropy": 2.395069420337677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748833194375038, + "step": 6946 + }, + { + "epoch": 0.579, + "grad_norm": 5.0625, + "grad_norm_var": 0.08644205729166667, + "learning_rate": 4e-05, + "loss": 5.1167, + "loss/crossentropy": 2.3528851866722107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21444562450051308, + "step": 6948 + }, + { + "epoch": 0.5791666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.08899739583333334, + "learning_rate": 4e-05, + "loss": 5.3257, + "loss/crossentropy": 1.9694509357213974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1694854572415352, + "step": 6950 + }, + { + "epoch": 0.5793333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.08683268229166667, + "learning_rate": 4e-05, + "loss": 5.4056, + "loss/crossentropy": 1.936231642961502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19177152961492538, + "step": 6952 + }, + { + "epoch": 0.5795, + "grad_norm": 4.6875, + "grad_norm_var": 0.083837890625, + "learning_rate": 4e-05, + "loss": 5.1056, + "loss/crossentropy": 2.1117332875728607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018162995576859, + "step": 6954 + }, + { + "epoch": 0.5796666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.06443684895833333, + "learning_rate": 4e-05, + "loss": 5.1666, + "loss/crossentropy": 2.5744773745536804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2220405377447605, + "step": 6956 + }, + { + "epoch": 0.5798333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.06927083333333334, + "learning_rate": 4e-05, + "loss": 4.5248, + "loss/crossentropy": 2.1000839695334435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16782992519438267, + "step": 6958 + }, + { + "epoch": 0.58, + "grad_norm": 4.90625, + "grad_norm_var": 0.061442057291666664, + "learning_rate": 4e-05, + "loss": 5.0504, + "loss/crossentropy": 2.1625142991542816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22695934772491455, + "step": 6960 + }, + { + "epoch": 0.5801666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.060546875, + "learning_rate": 4e-05, + "loss": 5.3221, + "loss/crossentropy": 2.1675389409065247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19221728295087814, + "step": 6962 + }, + { + "epoch": 0.5803333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.07320556640625, + "learning_rate": 4e-05, + "loss": 4.8257, + "loss/crossentropy": 1.7454765737056732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795752108097076, + "step": 6964 + }, + { + "epoch": 0.5805, + "grad_norm": 4.53125, + "grad_norm_var": 0.07224934895833333, + "learning_rate": 4e-05, + "loss": 5.3364, + "loss/crossentropy": 2.321167379617691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211841382086277, + "step": 6966 + }, + { + "epoch": 0.5806666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.07450764973958333, + "learning_rate": 4e-05, + "loss": 4.9915, + "loss/crossentropy": 1.9849184900522232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19291441701352596, + "step": 6968 + }, + { + "epoch": 0.5808333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.07317301432291666, + "learning_rate": 4e-05, + "loss": 4.9854, + "loss/crossentropy": 1.7237927541136742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16498488560318947, + "step": 6970 + }, + { + "epoch": 0.581, + "grad_norm": 5.0, + "grad_norm_var": 0.05777587890625, + "learning_rate": 4e-05, + "loss": 4.9658, + "loss/crossentropy": 2.640030264854431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21197672188282013, + "step": 6972 + }, + { + "epoch": 0.5811666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.0595703125, + "learning_rate": 4e-05, + "loss": 4.6484, + "loss/crossentropy": 1.8918295353651047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703386828303337, + "step": 6974 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.061848958333333336, + "learning_rate": 4e-05, + "loss": 4.6603, + "loss/crossentropy": 1.0444296523928642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16650191694498062, + "step": 6976 + }, + { + "epoch": 0.5815, + "grad_norm": 4.59375, + "grad_norm_var": 0.058333333333333334, + "learning_rate": 4e-05, + "loss": 5.0858, + "loss/crossentropy": 2.292292296886444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22024738788604736, + "step": 6978 + }, + { + "epoch": 0.5816666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.059619140625, + "learning_rate": 4e-05, + "loss": 5.0752, + "loss/crossentropy": 2.2760500013828278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20995662361383438, + "step": 6980 + }, + { + "epoch": 0.5818333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.046122233072916664, + "learning_rate": 4e-05, + "loss": 5.1017, + "loss/crossentropy": 2.164345234632492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2195327877998352, + "step": 6982 + }, + { + "epoch": 0.582, + "grad_norm": 5.1875, + "grad_norm_var": 0.051025390625, + "learning_rate": 4e-05, + "loss": 4.9941, + "loss/crossentropy": 1.9390491545200348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251364566385746, + "step": 6984 + }, + { + "epoch": 0.5821666666666667, + "grad_norm": 5.5625, + "grad_norm_var": 0.07655843098958333, + "learning_rate": 4e-05, + "loss": 5.4158, + "loss/crossentropy": 2.521660327911377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530570298433304, + "step": 6986 + }, + { + "epoch": 0.5823333333333334, + "grad_norm": 4.28125, + "grad_norm_var": 0.10627848307291667, + "learning_rate": 4e-05, + "loss": 4.8529, + "loss/crossentropy": 2.434011995792389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161153182387352, + "step": 6988 + }, + { + "epoch": 0.5825, + "grad_norm": 4.9375, + "grad_norm_var": 0.10323893229166667, + "learning_rate": 4e-05, + "loss": 5.091, + "loss/crossentropy": 2.19405135512352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19611788541078568, + "step": 6990 + }, + { + "epoch": 0.5826666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.1046875, + "learning_rate": 4e-05, + "loss": 4.9027, + "loss/crossentropy": 2.0773863047361374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199423398822546, + "step": 6992 + }, + { + "epoch": 0.5828333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.116015625, + "learning_rate": 4e-05, + "loss": 5.4493, + "loss/crossentropy": 2.2322590053081512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22856701165437698, + "step": 6994 + }, + { + "epoch": 0.583, + "grad_norm": 4.875, + "grad_norm_var": 0.11282145182291667, + "learning_rate": 4e-05, + "loss": 5.4103, + "loss/crossentropy": 2.223317414522171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666733711957932, + "step": 6996 + }, + { + "epoch": 0.5831666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.14628499348958332, + "learning_rate": 4e-05, + "loss": 5.0099, + "loss/crossentropy": 2.306236118078232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21099332347512245, + "step": 6998 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.16679280598958332, + "learning_rate": 4e-05, + "loss": 4.9629, + "loss/crossentropy": 2.1312642991542816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20312974229454994, + "step": 7000 + }, + { + "epoch": 0.5835, + "grad_norm": 5.125, + "grad_norm_var": 0.14654947916666666, + "learning_rate": 4e-05, + "loss": 5.2156, + "loss/crossentropy": 2.47050142288208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21174683049321175, + "step": 7002 + }, + { + "epoch": 0.5836666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.11672770182291667, + "learning_rate": 4e-05, + "loss": 4.2701, + "loss/crossentropy": 2.2160302698612213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22286521643400192, + "step": 7004 + }, + { + "epoch": 0.5838333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.11373697916666667, + "learning_rate": 4e-05, + "loss": 4.5535, + "loss/crossentropy": 1.4888541474938393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15817414224147797, + "step": 7006 + }, + { + "epoch": 0.584, + "grad_norm": 5.65625, + "grad_norm_var": 0.14075520833333333, + "learning_rate": 4e-05, + "loss": 4.574, + "loss/crossentropy": 2.411957621574402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22817474603652954, + "step": 7008 + }, + { + "epoch": 0.5841666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.14016927083333333, + "learning_rate": 4e-05, + "loss": 4.7713, + "loss/crossentropy": 1.4711432233452797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15142634697258472, + "step": 7010 + }, + { + "epoch": 0.5843333333333334, + "grad_norm": 4.28125, + "grad_norm_var": 0.16521809895833334, + "learning_rate": 4e-05, + "loss": 4.5345, + "loss/crossentropy": 1.834930658340454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19916201382875443, + "step": 7012 + }, + { + "epoch": 0.5845, + "grad_norm": 4.71875, + "grad_norm_var": 0.13837483723958333, + "learning_rate": 4e-05, + "loss": 4.5504, + "loss/crossentropy": 1.7337545081973076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18489226698875427, + "step": 7014 + }, + { + "epoch": 0.5846666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.11373291015625, + "learning_rate": 4e-05, + "loss": 4.3701, + "loss/crossentropy": 1.3228175267577171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15250758454203606, + "step": 7016 + }, + { + "epoch": 0.5848333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.117041015625, + "learning_rate": 4e-05, + "loss": 4.8881, + "loss/crossentropy": 2.3420262932777405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1912631243467331, + "step": 7018 + }, + { + "epoch": 0.585, + "grad_norm": 6.03125, + "grad_norm_var": 0.22639567057291668, + "learning_rate": 4e-05, + "loss": 5.0456, + "loss/crossentropy": 1.3368181511759758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14204655215144157, + "step": 7020 + }, + { + "epoch": 0.5851666666666666, + "grad_norm": 5.28125, + "grad_norm_var": 0.24104410807291668, + "learning_rate": 4e-05, + "loss": 4.6956, + "loss/crossentropy": 1.5645621120929718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1582186594605446, + "step": 7022 + }, + { + "epoch": 0.5853333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.2123046875, + "learning_rate": 4e-05, + "loss": 5.308, + "loss/crossentropy": 2.4173710644245148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20801213383674622, + "step": 7024 + }, + { + "epoch": 0.5855, + "grad_norm": 4.875, + "grad_norm_var": 0.21207275390625, + "learning_rate": 4e-05, + "loss": 5.1631, + "loss/crossentropy": 2.3601708114147186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22110438346862793, + "step": 7026 + }, + { + "epoch": 0.5856666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.19423421223958334, + "learning_rate": 4e-05, + "loss": 5.0976, + "loss/crossentropy": 2.0202344954013824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23262187093496323, + "step": 7028 + }, + { + "epoch": 0.5858333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.426025390625, + "learning_rate": 4e-05, + "loss": 5.1829, + "loss/crossentropy": 2.080652594566345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20888086408376694, + "step": 7030 + }, + { + "epoch": 0.586, + "grad_norm": 4.84375, + "grad_norm_var": 0.3563435872395833, + "learning_rate": 4e-05, + "loss": 5.0934, + "loss/crossentropy": 2.4877448081970215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21074169129133224, + "step": 7032 + }, + { + "epoch": 0.5861666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.34060872395833336, + "learning_rate": 4e-05, + "loss": 4.7548, + "loss/crossentropy": 1.833594799041748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944413259625435, + "step": 7034 + }, + { + "epoch": 0.5863333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.31399739583333336, + "learning_rate": 4e-05, + "loss": 5.1317, + "loss/crossentropy": 1.8810269013047218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18199764378368855, + "step": 7036 + }, + { + "epoch": 0.5865, + "grad_norm": 4.8125, + "grad_norm_var": 0.3049112955729167, + "learning_rate": 4e-05, + "loss": 5.0683, + "loss/crossentropy": 2.4290638267993927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21032685041427612, + "step": 7038 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.3036417643229167, + "learning_rate": 4e-05, + "loss": 5.0505, + "loss/crossentropy": 2.0547506511211395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208097193390131, + "step": 7040 + }, + { + "epoch": 0.5868333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.32157796223958335, + "learning_rate": 4e-05, + "loss": 4.4601, + "loss/crossentropy": 1.7334094047546387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20311187207698822, + "step": 7042 + }, + { + "epoch": 0.587, + "grad_norm": 4.46875, + "grad_norm_var": 0.3348592122395833, + "learning_rate": 4e-05, + "loss": 4.74, + "loss/crossentropy": 2.338983803987503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23134483397006989, + "step": 7044 + }, + { + "epoch": 0.5871666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.026981608072916666, + "learning_rate": 4e-05, + "loss": 4.4947, + "loss/crossentropy": 1.6076650097966194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.170147143304348, + "step": 7046 + }, + { + "epoch": 0.5873333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.044775390625, + "learning_rate": 4e-05, + "loss": 5.3415, + "loss/crossentropy": 2.398401141166687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23924263939261436, + "step": 7048 + }, + { + "epoch": 0.5875, + "grad_norm": 5.125, + "grad_norm_var": 0.058915201822916666, + "learning_rate": 4e-05, + "loss": 5.33, + "loss/crossentropy": 1.9874465465545654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18284693360328674, + "step": 7050 + }, + { + "epoch": 0.5876666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.06334228515625, + "learning_rate": 4e-05, + "loss": 4.3923, + "loss/crossentropy": 2.0420145988464355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16938142478466034, + "step": 7052 + }, + { + "epoch": 0.5878333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.08411051432291666, + "learning_rate": 4e-05, + "loss": 4.8238, + "loss/crossentropy": 2.0706692337989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137084066867828, + "step": 7054 + }, + { + "epoch": 0.588, + "grad_norm": 4.78125, + "grad_norm_var": 0.08052978515625, + "learning_rate": 4e-05, + "loss": 4.834, + "loss/crossentropy": 2.1098215356469154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18781218118965626, + "step": 7056 + }, + { + "epoch": 0.5881666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.09280192057291667, + "learning_rate": 4e-05, + "loss": 4.623, + "loss/crossentropy": 1.389645166695118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1635691300034523, + "step": 7058 + }, + { + "epoch": 0.5883333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.08860677083333333, + "learning_rate": 4e-05, + "loss": 4.7686, + "loss/crossentropy": 2.2585472464561462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468041092157364, + "step": 7060 + }, + { + "epoch": 0.5885, + "grad_norm": 5.0, + "grad_norm_var": 0.088916015625, + "learning_rate": 4e-05, + "loss": 5.1589, + "loss/crossentropy": 2.4640525579452515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21823417022824287, + "step": 7062 + }, + { + "epoch": 0.5886666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.06848551432291666, + "learning_rate": 4e-05, + "loss": 4.6937, + "loss/crossentropy": 2.095867395401001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18419134989380836, + "step": 7064 + }, + { + "epoch": 0.5888333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.060546875, + "learning_rate": 4e-05, + "loss": 4.4197, + "loss/crossentropy": 2.244805634021759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19950323924422264, + "step": 7066 + }, + { + "epoch": 0.589, + "grad_norm": 4.75, + "grad_norm_var": 0.055078125, + "learning_rate": 4e-05, + "loss": 4.805, + "loss/crossentropy": 2.1685686707496643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20743118226528168, + "step": 7068 + }, + { + "epoch": 0.5891666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.04694010416666667, + "learning_rate": 4e-05, + "loss": 4.5978, + "loss/crossentropy": 1.2904389277100563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14947044663131237, + "step": 7070 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.045182291666666666, + "learning_rate": 4e-05, + "loss": 5.1091, + "loss/crossentropy": 1.944626122713089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1817072257399559, + "step": 7072 + }, + { + "epoch": 0.5895, + "grad_norm": 4.53125, + "grad_norm_var": 0.035400390625, + "learning_rate": 4e-05, + "loss": 4.663, + "loss/crossentropy": 1.7342427596449852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18484355323016644, + "step": 7074 + }, + { + "epoch": 0.5896666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.03592122395833333, + "learning_rate": 4e-05, + "loss": 4.5442, + "loss/crossentropy": 1.944994330406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828309655189514, + "step": 7076 + }, + { + "epoch": 0.5898333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.03284098307291667, + "learning_rate": 4e-05, + "loss": 5.1822, + "loss/crossentropy": 1.6501160487532616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18159450963139534, + "step": 7078 + }, + { + "epoch": 0.59, + "grad_norm": 4.8125, + "grad_norm_var": 0.03658854166666667, + "learning_rate": 4e-05, + "loss": 4.8808, + "loss/crossentropy": 2.1811063289642334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19801967963576317, + "step": 7080 + }, + { + "epoch": 0.5901666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 4e-05, + "loss": 4.81, + "loss/crossentropy": 2.1836779415607452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19767899811267853, + "step": 7082 + }, + { + "epoch": 0.5903333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.04412434895833333, + "learning_rate": 4e-05, + "loss": 4.4882, + "loss/crossentropy": 1.9349441826343536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839659884572029, + "step": 7084 + }, + { + "epoch": 0.5905, + "grad_norm": 5.1875, + "grad_norm_var": 0.06760660807291667, + "learning_rate": 4e-05, + "loss": 4.8502, + "loss/crossentropy": 1.5366474315524101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1782370787113905, + "step": 7086 + }, + { + "epoch": 0.5906666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.06760660807291667, + "learning_rate": 4e-05, + "loss": 4.4517, + "loss/crossentropy": 1.4672022983431816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14767078682780266, + "step": 7088 + }, + { + "epoch": 0.5908333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.08331705729166666, + "learning_rate": 4e-05, + "loss": 4.8765, + "loss/crossentropy": 2.0351298972964287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845032162964344, + "step": 7090 + }, + { + "epoch": 0.591, + "grad_norm": 5.03125, + "grad_norm_var": 0.07682291666666667, + "learning_rate": 4e-05, + "loss": 5.2773, + "loss/crossentropy": 1.6402394473552704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16715828701853752, + "step": 7092 + }, + { + "epoch": 0.5911666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.08498942057291667, + "learning_rate": 4e-05, + "loss": 4.7207, + "loss/crossentropy": 2.1903760731220245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975751407444477, + "step": 7094 + }, + { + "epoch": 0.5913333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.06490478515625, + "learning_rate": 4e-05, + "loss": 5.4179, + "loss/crossentropy": 2.4919445514678955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21300790458917618, + "step": 7096 + }, + { + "epoch": 0.5915, + "grad_norm": 4.84375, + "grad_norm_var": 0.06760660807291667, + "learning_rate": 4e-05, + "loss": 5.2518, + "loss/crossentropy": 2.238806664943695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21941359341144562, + "step": 7098 + }, + { + "epoch": 0.5916666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.07082926432291667, + "learning_rate": 4e-05, + "loss": 5.3808, + "loss/crossentropy": 2.211260676383972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18881267309188843, + "step": 7100 + }, + { + "epoch": 0.5918333333333333, + "grad_norm": 6.03125, + "grad_norm_var": 0.13489176432291666, + "learning_rate": 4e-05, + "loss": 4.523, + "loss/crossentropy": 1.7687021493911743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20016569271683693, + "step": 7102 + }, + { + "epoch": 0.592, + "grad_norm": 4.6875, + "grad_norm_var": 0.13357747395833333, + "learning_rate": 4e-05, + "loss": 4.8254, + "loss/crossentropy": 1.8385881558060646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18539466150105, + "step": 7104 + }, + { + "epoch": 0.5921666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.11302083333333333, + "learning_rate": 4e-05, + "loss": 5.0105, + "loss/crossentropy": 1.5129987746477127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15814178064465523, + "step": 7106 + }, + { + "epoch": 0.5923333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.11754150390625, + "learning_rate": 4e-05, + "loss": 5.0845, + "loss/crossentropy": 1.2784345149993896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1473633572459221, + "step": 7108 + }, + { + "epoch": 0.5925, + "grad_norm": 4.65625, + "grad_norm_var": 0.11926676432291666, + "learning_rate": 4e-05, + "loss": 5.1103, + "loss/crossentropy": 2.039908640086651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18187290988862514, + "step": 7110 + }, + { + "epoch": 0.5926666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.11959228515625, + "learning_rate": 4e-05, + "loss": 4.4894, + "loss/crossentropy": 1.7723973244428635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224184051156044, + "step": 7112 + }, + { + "epoch": 0.5928333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.1162109375, + "learning_rate": 4e-05, + "loss": 4.787, + "loss/crossentropy": 1.9356326535344124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17493806034326553, + "step": 7114 + }, + { + "epoch": 0.593, + "grad_norm": 4.75, + "grad_norm_var": 0.13271077473958334, + "learning_rate": 4e-05, + "loss": 5.3488, + "loss/crossentropy": 1.8205601423978806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18349266424775124, + "step": 7116 + }, + { + "epoch": 0.5931666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.058186848958333336, + "learning_rate": 4e-05, + "loss": 4.8379, + "loss/crossentropy": 1.5122859254479408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1873751487582922, + "step": 7118 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.05779622395833333, + "learning_rate": 4e-05, + "loss": 4.7082, + "loss/crossentropy": 2.456811845302582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22585484385490417, + "step": 7120 + }, + { + "epoch": 0.5935, + "grad_norm": 4.59375, + "grad_norm_var": 0.06979166666666667, + "learning_rate": 4e-05, + "loss": 4.7246, + "loss/crossentropy": 1.925706960260868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17225532606244087, + "step": 7122 + }, + { + "epoch": 0.5936666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.10154622395833333, + "learning_rate": 4e-05, + "loss": 4.2237, + "loss/crossentropy": 1.8182199075818062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815893966704607, + "step": 7124 + }, + { + "epoch": 0.5938333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.09735921223958334, + "learning_rate": 4e-05, + "loss": 4.6819, + "loss/crossentropy": 1.501136690378189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14409414678812027, + "step": 7126 + }, + { + "epoch": 0.594, + "grad_norm": 4.78125, + "grad_norm_var": 0.09283854166666666, + "learning_rate": 4e-05, + "loss": 5.0641, + "loss/crossentropy": 1.9566605687141418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1724204383790493, + "step": 7128 + }, + { + "epoch": 0.5941666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.08902587890625, + "learning_rate": 4e-05, + "loss": 4.3695, + "loss/crossentropy": 1.9445571303367615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21243244409561157, + "step": 7130 + }, + { + "epoch": 0.5943333333333334, + "grad_norm": 4.1875, + "grad_norm_var": 0.09998372395833334, + "learning_rate": 4e-05, + "loss": 4.8659, + "loss/crossentropy": 2.362126111984253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181460306048393, + "step": 7132 + }, + { + "epoch": 0.5945, + "grad_norm": 4.9375, + "grad_norm_var": 0.11194254557291666, + "learning_rate": 4e-05, + "loss": 4.295, + "loss/crossentropy": 1.7851531505584717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17282582819461823, + "step": 7134 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.11448160807291667, + "learning_rate": 4e-05, + "loss": 4.6094, + "loss/crossentropy": 1.4123722687363625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1530421730130911, + "step": 7136 + }, + { + "epoch": 0.5948333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.11262613932291667, + "learning_rate": 4e-05, + "loss": 4.9726, + "loss/crossentropy": 1.876221090555191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224697545170784, + "step": 7138 + }, + { + "epoch": 0.595, + "grad_norm": 4.65625, + "grad_norm_var": 0.08733317057291666, + "learning_rate": 4e-05, + "loss": 5.2092, + "loss/crossentropy": 1.2281717583537102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12058732472360134, + "step": 7140 + }, + { + "epoch": 0.5951666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.087109375, + "learning_rate": 4e-05, + "loss": 5.1486, + "loss/crossentropy": 2.3574686646461487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202539652585983, + "step": 7142 + }, + { + "epoch": 0.5953333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.08839518229166667, + "learning_rate": 4e-05, + "loss": 4.394, + "loss/crossentropy": 1.3628652021288872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17282887548208237, + "step": 7144 + }, + { + "epoch": 0.5955, + "grad_norm": 4.6875, + "grad_norm_var": 0.1013671875, + "learning_rate": 4e-05, + "loss": 5.1276, + "loss/crossentropy": 2.1821780800819397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21669187769293785, + "step": 7146 + }, + { + "epoch": 0.5956666666666667, + "grad_norm": 5.34375, + "grad_norm_var": 0.07962239583333333, + "learning_rate": 4e-05, + "loss": 4.6122, + "loss/crossentropy": 2.189154863357544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20566852018237114, + "step": 7148 + }, + { + "epoch": 0.5958333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.09544270833333333, + "learning_rate": 4e-05, + "loss": 3.5778, + "loss/crossentropy": 1.0040019303560257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1260563600808382, + "step": 7150 + }, + { + "epoch": 0.596, + "grad_norm": 4.625, + "grad_norm_var": 0.13378499348958334, + "learning_rate": 4e-05, + "loss": 4.8097, + "loss/crossentropy": 2.3464534282684326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20970812812447548, + "step": 7152 + }, + { + "epoch": 0.5961666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.13209228515625, + "learning_rate": 4e-05, + "loss": 4.6268, + "loss/crossentropy": 1.6716122701764107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19498688727617264, + "step": 7154 + }, + { + "epoch": 0.5963333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.13245035807291666, + "learning_rate": 4e-05, + "loss": 4.5956, + "loss/crossentropy": 1.6933601424098015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20203623175621033, + "step": 7156 + }, + { + "epoch": 0.5965, + "grad_norm": 5.21875, + "grad_norm_var": 0.13417561848958334, + "learning_rate": 4e-05, + "loss": 5.1846, + "loss/crossentropy": 2.599808931350708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22032839059829712, + "step": 7158 + }, + { + "epoch": 0.5966666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.1357421875, + "learning_rate": 4e-05, + "loss": 4.5418, + "loss/crossentropy": 1.4756020605564117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14345270954072475, + "step": 7160 + }, + { + "epoch": 0.5968333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.12616780598958333, + "learning_rate": 4e-05, + "loss": 4.5142, + "loss/crossentropy": 1.753265455365181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787324883043766, + "step": 7162 + }, + { + "epoch": 0.597, + "grad_norm": 4.3125, + "grad_norm_var": 0.12823893229166666, + "learning_rate": 4e-05, + "loss": 4.7166, + "loss/crossentropy": 2.619646430015564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22686060145497322, + "step": 7164 + }, + { + "epoch": 0.5971666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.1130859375, + "learning_rate": 4e-05, + "loss": 4.1743, + "loss/crossentropy": 1.353781297802925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14187632501125336, + "step": 7166 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.08345947265625, + "learning_rate": 4e-05, + "loss": 4.5593, + "loss/crossentropy": 1.8787141367793083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18714572489261627, + "step": 7168 + }, + { + "epoch": 0.5975, + "grad_norm": 4.78125, + "grad_norm_var": 0.07535400390625, + "learning_rate": 4e-05, + "loss": 4.8598, + "loss/crossentropy": 2.3827298283576965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606054157018661, + "step": 7170 + }, + { + "epoch": 0.5976666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.06848958333333334, + "learning_rate": 4e-05, + "loss": 4.9034, + "loss/crossentropy": 2.169047087430954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20684335753321648, + "step": 7172 + }, + { + "epoch": 0.5978333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.05670572916666667, + "learning_rate": 4e-05, + "loss": 4.7007, + "loss/crossentropy": 0.976897768676281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15186763554811478, + "step": 7174 + }, + { + "epoch": 0.598, + "grad_norm": 4.84375, + "grad_norm_var": 0.05636393229166667, + "learning_rate": 4e-05, + "loss": 4.4089, + "loss/crossentropy": 1.524364024400711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1366695910692215, + "step": 7176 + }, + { + "epoch": 0.5981666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.06011962890625, + "learning_rate": 4e-05, + "loss": 5.1123, + "loss/crossentropy": 2.1570041179656982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20499436184763908, + "step": 7178 + }, + { + "epoch": 0.5983333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.05718994140625, + "learning_rate": 4e-05, + "loss": 5.104, + "loss/crossentropy": 2.673922121524811, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113421931862831, + "step": 7180 + }, + { + "epoch": 0.5985, + "grad_norm": 4.6875, + "grad_norm_var": 0.038134765625, + "learning_rate": 4e-05, + "loss": 4.8084, + "loss/crossentropy": 2.6139089465141296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268553152680397, + "step": 7182 + }, + { + "epoch": 0.5986666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.03899739583333333, + "learning_rate": 4e-05, + "loss": 4.821, + "loss/crossentropy": 2.4553991556167603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23415345698595047, + "step": 7184 + }, + { + "epoch": 0.5988333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.05054931640625, + "learning_rate": 4e-05, + "loss": 5.1587, + "loss/crossentropy": 2.134661704301834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19149811938405037, + "step": 7186 + }, + { + "epoch": 0.599, + "grad_norm": 5.0, + "grad_norm_var": 0.053515625, + "learning_rate": 4e-05, + "loss": 5.1202, + "loss/crossentropy": 2.0437642335891724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21648352220654488, + "step": 7188 + }, + { + "epoch": 0.5991666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.052534993489583334, + "learning_rate": 4e-05, + "loss": 4.8893, + "loss/crossentropy": 1.434989832341671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14296963065862656, + "step": 7190 + }, + { + "epoch": 0.5993333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.050455729166666664, + "learning_rate": 4e-05, + "loss": 4.9558, + "loss/crossentropy": 1.83430977165699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16601720824837685, + "step": 7192 + }, + { + "epoch": 0.5995, + "grad_norm": 5.0, + "grad_norm_var": 0.050764973958333334, + "learning_rate": 4e-05, + "loss": 4.8337, + "loss/crossentropy": 2.1914051175117493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19750383496284485, + "step": 7194 + }, + { + "epoch": 0.5996666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.4123006184895833, + "learning_rate": 4e-05, + "loss": 4.896, + "loss/crossentropy": 1.6491469144821167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20295769348740578, + "step": 7196 + }, + { + "epoch": 0.5998333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.40487874348958336, + "learning_rate": 4e-05, + "loss": 4.8493, + "loss/crossentropy": 1.4159668385982513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1523029077798128, + "step": 7198 + }, + { + "epoch": 0.6, + "grad_norm": 4.90625, + "grad_norm_var": 0.3759073893229167, + "learning_rate": 4e-05, + "loss": 5.1873, + "loss/crossentropy": 2.0024573504924774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217225655913353, + "step": 7200 + }, + { + "epoch": 0.6001666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.38033447265625, + "learning_rate": 4e-05, + "loss": 4.803, + "loss/crossentropy": 2.018974833190441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17445803619921207, + "step": 7202 + }, + { + "epoch": 0.6003333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.38218994140625, + "learning_rate": 4e-05, + "loss": 4.4835, + "loss/crossentropy": 2.6279727816581726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21568148583173752, + "step": 7204 + }, + { + "epoch": 0.6005, + "grad_norm": 6.0, + "grad_norm_var": 0.46005452473958336, + "learning_rate": 4e-05, + "loss": 4.5529, + "loss/crossentropy": 2.3042045533657074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125856727361679, + "step": 7206 + }, + { + "epoch": 0.6006666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.43339436848958335, + "learning_rate": 4e-05, + "loss": 5.0155, + "loss/crossentropy": 1.9355166032910347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18338329531252384, + "step": 7208 + }, + { + "epoch": 0.6008333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.44442952473958336, + "learning_rate": 4e-05, + "loss": 4.7165, + "loss/crossentropy": 1.6675493568181992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16726251505315304, + "step": 7210 + }, + { + "epoch": 0.601, + "grad_norm": 4.59375, + "grad_norm_var": 0.11536458333333334, + "learning_rate": 4e-05, + "loss": 4.7583, + "loss/crossentropy": 2.4564692974090576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22821878641843796, + "step": 7212 + }, + { + "epoch": 0.6011666666666666, + "grad_norm": 7.5625, + "grad_norm_var": 0.58125, + "learning_rate": 4e-05, + "loss": 4.6558, + "loss/crossentropy": 1.9894456341862679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19508633390069008, + "step": 7214 + }, + { + "epoch": 0.6013333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.5986287434895833, + "learning_rate": 4e-05, + "loss": 4.4333, + "loss/crossentropy": 2.0114836394786835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24474351853132248, + "step": 7216 + }, + { + "epoch": 0.6015, + "grad_norm": 4.90625, + "grad_norm_var": 0.6019816080729167, + "learning_rate": 4e-05, + "loss": 4.8185, + "loss/crossentropy": 1.790744572877884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19527454674243927, + "step": 7218 + }, + { + "epoch": 0.6016666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.60279541015625, + "learning_rate": 4e-05, + "loss": 4.7642, + "loss/crossentropy": 1.9990643709897995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20038366317749023, + "step": 7220 + }, + { + "epoch": 0.6018333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.530712890625, + "learning_rate": 4e-05, + "loss": 5.0586, + "loss/crossentropy": 1.5327222123742104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18884196318686008, + "step": 7222 + }, + { + "epoch": 0.602, + "grad_norm": 4.84375, + "grad_norm_var": 0.5283162434895833, + "learning_rate": 4e-05, + "loss": 5.0143, + "loss/crossentropy": 2.18381866812706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2246886007487774, + "step": 7224 + }, + { + "epoch": 0.6021666666666666, + "grad_norm": 5.8125, + "grad_norm_var": 0.56920166015625, + "learning_rate": 4e-05, + "loss": 5.0421, + "loss/crossentropy": 2.5714552998542786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21956535056233406, + "step": 7226 + }, + { + "epoch": 0.6023333333333334, + "grad_norm": 6.53125, + "grad_norm_var": 0.6641886393229167, + "learning_rate": 4e-05, + "loss": 5.1027, + "loss/crossentropy": 2.4347121119499207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895084530115128, + "step": 7228 + }, + { + "epoch": 0.6025, + "grad_norm": 4.1875, + "grad_norm_var": 0.31151936848958334, + "learning_rate": 4e-05, + "loss": 4.6368, + "loss/crossentropy": 1.82130666077137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18457898125052452, + "step": 7230 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.29791259765625, + "learning_rate": 4e-05, + "loss": 4.8637, + "loss/crossentropy": 2.497299909591675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19863051548600197, + "step": 7232 + }, + { + "epoch": 0.6028333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.31643473307291664, + "learning_rate": 4e-05, + "loss": 4.5341, + "loss/crossentropy": 2.023450642824173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18142622709274292, + "step": 7234 + }, + { + "epoch": 0.603, + "grad_norm": 4.75, + "grad_norm_var": 0.31177978515625, + "learning_rate": 4e-05, + "loss": 4.2234, + "loss/crossentropy": 1.542768731713295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14925481751561165, + "step": 7236 + }, + { + "epoch": 0.6031666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.3067545572916667, + "learning_rate": 4e-05, + "loss": 5.2911, + "loss/crossentropy": 2.219216376543045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20528903231024742, + "step": 7238 + }, + { + "epoch": 0.6033333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.32060139973958335, + "learning_rate": 4e-05, + "loss": 4.9617, + "loss/crossentropy": 1.9249942675232887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838216297328472, + "step": 7240 + }, + { + "epoch": 0.6035, + "grad_norm": 4.40625, + "grad_norm_var": 0.27928059895833335, + "learning_rate": 4e-05, + "loss": 4.6277, + "loss/crossentropy": 1.7711387276649475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17735876329243183, + "step": 7242 + }, + { + "epoch": 0.6036666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.09462483723958333, + "learning_rate": 4e-05, + "loss": 5.0941, + "loss/crossentropy": 1.6195759177207947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17690261639654636, + "step": 7244 + }, + { + "epoch": 0.6038333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.06443684895833333, + "learning_rate": 4e-05, + "loss": 5.2268, + "loss/crossentropy": 2.1065678000450134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140568532049656, + "step": 7246 + }, + { + "epoch": 0.604, + "grad_norm": 4.65625, + "grad_norm_var": 0.05831705729166667, + "learning_rate": 4e-05, + "loss": 4.7785, + "loss/crossentropy": 2.4431713819503784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23185305297374725, + "step": 7248 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.04993082682291667, + "learning_rate": 4e-05, + "loss": 5.1371, + "loss/crossentropy": 1.6138255596160889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841035783290863, + "step": 7250 + }, + { + "epoch": 0.6043333333333333, + "grad_norm": 5.71875, + "grad_norm_var": 0.10154622395833333, + "learning_rate": 4e-05, + "loss": 4.811, + "loss/crossentropy": 1.9559299945831299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17412950471043587, + "step": 7252 + }, + { + "epoch": 0.6045, + "grad_norm": 4.6875, + "grad_norm_var": 0.10126546223958334, + "learning_rate": 4e-05, + "loss": 4.9001, + "loss/crossentropy": 1.585697665810585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17267544195055962, + "step": 7254 + }, + { + "epoch": 0.6046666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.11376546223958334, + "learning_rate": 4e-05, + "loss": 4.6544, + "loss/crossentropy": 2.1703633964061737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22978663071990013, + "step": 7256 + }, + { + "epoch": 0.6048333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.09869384765625, + "learning_rate": 4e-05, + "loss": 5.1685, + "loss/crossentropy": 2.224595546722412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069346234202385, + "step": 7258 + }, + { + "epoch": 0.605, + "grad_norm": 4.25, + "grad_norm_var": 0.12902018229166667, + "learning_rate": 4e-05, + "loss": 4.3922, + "loss/crossentropy": 1.8856493830680847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18326758965849876, + "step": 7260 + }, + { + "epoch": 0.6051666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.127587890625, + "learning_rate": 4e-05, + "loss": 5.6004, + "loss/crossentropy": 2.3320149183273315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202293783426285, + "step": 7262 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.12056884765625, + "learning_rate": 4e-05, + "loss": 5.1004, + "loss/crossentropy": 2.044320672750473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24393825232982635, + "step": 7264 + }, + { + "epoch": 0.6055, + "grad_norm": 5.0, + "grad_norm_var": 0.11912434895833333, + "learning_rate": 4e-05, + "loss": 5.1452, + "loss/crossentropy": 2.3974433541297913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2371625006198883, + "step": 7266 + }, + { + "epoch": 0.6056666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.08511962890625, + "learning_rate": 4e-05, + "loss": 5.325, + "loss/crossentropy": 2.257835239171982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960177905857563, + "step": 7268 + }, + { + "epoch": 0.6058333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.11458333333333333, + "learning_rate": 4e-05, + "loss": 5.4307, + "loss/crossentropy": 1.788444608449936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2374886106699705, + "step": 7270 + }, + { + "epoch": 0.606, + "grad_norm": 4.84375, + "grad_norm_var": 0.09384358723958333, + "learning_rate": 4e-05, + "loss": 4.9727, + "loss/crossentropy": 1.87558251619339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17660346627235413, + "step": 7272 + }, + { + "epoch": 0.6061666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.09933268229166667, + "learning_rate": 4e-05, + "loss": 4.775, + "loss/crossentropy": 2.021895110607147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23071924597024918, + "step": 7274 + }, + { + "epoch": 0.6063333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.07789306640625, + "learning_rate": 4e-05, + "loss": 4.7348, + "loss/crossentropy": 1.7142114639282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17037047445774078, + "step": 7276 + }, + { + "epoch": 0.6065, + "grad_norm": 4.78125, + "grad_norm_var": 0.075244140625, + "learning_rate": 4e-05, + "loss": 5.193, + "loss/crossentropy": 2.4209593534469604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22260507941246033, + "step": 7278 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.07877604166666667, + "learning_rate": 4e-05, + "loss": 4.8547, + "loss/crossentropy": 1.5159368366003036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.155701145529747, + "step": 7280 + }, + { + "epoch": 0.6068333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08313395182291666, + "learning_rate": 4e-05, + "loss": 4.9023, + "loss/crossentropy": 1.7176839709281921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18741050362586975, + "step": 7282 + }, + { + "epoch": 0.607, + "grad_norm": 4.96875, + "grad_norm_var": 0.06588134765625, + "learning_rate": 4e-05, + "loss": 4.9645, + "loss/crossentropy": 1.6779725551605225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16802698001265526, + "step": 7284 + }, + { + "epoch": 0.6071666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.032275390625, + "learning_rate": 4e-05, + "loss": 4.805, + "loss/crossentropy": 2.100606143474579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785485044121742, + "step": 7286 + }, + { + "epoch": 0.6073333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.038802083333333334, + "learning_rate": 4e-05, + "loss": 5.5269, + "loss/crossentropy": 2.4047402143478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22336392104625702, + "step": 7288 + }, + { + "epoch": 0.6075, + "grad_norm": 5.09375, + "grad_norm_var": 0.041304524739583334, + "learning_rate": 4e-05, + "loss": 5.0355, + "loss/crossentropy": 2.4837300777435303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262895181775093, + "step": 7290 + }, + { + "epoch": 0.6076666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.0435546875, + "learning_rate": 4e-05, + "loss": 4.5059, + "loss/crossentropy": 0.9277792125940323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13263830170035362, + "step": 7292 + }, + { + "epoch": 0.6078333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.23424479166666667, + "learning_rate": 4e-05, + "loss": 5.1121, + "loss/crossentropy": 2.300870269536972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2035658285021782, + "step": 7294 + }, + { + "epoch": 0.608, + "grad_norm": 4.84375, + "grad_norm_var": 0.23014322916666666, + "learning_rate": 4e-05, + "loss": 4.4993, + "loss/crossentropy": 2.54541939496994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21243872493505478, + "step": 7296 + }, + { + "epoch": 0.6081666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.26519775390625, + "learning_rate": 4e-05, + "loss": 4.1937, + "loss/crossentropy": 1.6491773575544357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16693933680653572, + "step": 7298 + }, + { + "epoch": 0.6083333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.27346598307291664, + "learning_rate": 4e-05, + "loss": 5.6926, + "loss/crossentropy": 2.7386369705200195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123209573328495, + "step": 7300 + }, + { + "epoch": 0.6085, + "grad_norm": 4.65625, + "grad_norm_var": 0.27667643229166666, + "learning_rate": 4e-05, + "loss": 4.6207, + "loss/crossentropy": 2.468148171901703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21846356615424156, + "step": 7302 + }, + { + "epoch": 0.6086666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.28013916015625, + "learning_rate": 4e-05, + "loss": 4.7455, + "loss/crossentropy": 2.2846281826496124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20414729788899422, + "step": 7304 + }, + { + "epoch": 0.6088333333333333, + "grad_norm": 5.4375, + "grad_norm_var": 0.29498291015625, + "learning_rate": 4e-05, + "loss": 5.2679, + "loss/crossentropy": 2.3819915056228638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434510171413422, + "step": 7306 + }, + { + "epoch": 0.609, + "grad_norm": 4.71875, + "grad_norm_var": 0.2815755208333333, + "learning_rate": 4e-05, + "loss": 5.0059, + "loss/crossentropy": 1.7940093278884888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17341401614248753, + "step": 7308 + }, + { + "epoch": 0.6091666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.05950113932291667, + "learning_rate": 4e-05, + "loss": 5.0488, + "loss/crossentropy": 2.2041455805301666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19079307839274406, + "step": 7310 + }, + { + "epoch": 0.6093333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.06679280598958333, + "learning_rate": 4e-05, + "loss": 4.3424, + "loss/crossentropy": 2.475742816925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21777204051613808, + "step": 7312 + }, + { + "epoch": 0.6095, + "grad_norm": 4.71875, + "grad_norm_var": 0.11339518229166666, + "learning_rate": 4e-05, + "loss": 5.1092, + "loss/crossentropy": 1.4416265115141869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1719446536153555, + "step": 7314 + }, + { + "epoch": 0.6096666666666667, + "grad_norm": 5.21875, + "grad_norm_var": 0.119775390625, + "learning_rate": 4e-05, + "loss": 5.4393, + "loss/crossentropy": 2.565885007381439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22275138646364212, + "step": 7316 + }, + { + "epoch": 0.6098333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.12745768229166668, + "learning_rate": 4e-05, + "loss": 5.0595, + "loss/crossentropy": 1.6449436023831367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839300487190485, + "step": 7318 + }, + { + "epoch": 0.61, + "grad_norm": 5.03125, + "grad_norm_var": 0.12981363932291667, + "learning_rate": 4e-05, + "loss": 5.0024, + "loss/crossentropy": 1.2995290160179138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17145265080034733, + "step": 7320 + }, + { + "epoch": 0.6101666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.11057535807291667, + "learning_rate": 4e-05, + "loss": 4.8652, + "loss/crossentropy": 2.277883529663086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.240261971950531, + "step": 7322 + }, + { + "epoch": 0.6103333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.11011962890625, + "learning_rate": 4e-05, + "loss": 4.9789, + "loss/crossentropy": 1.3303010761737823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1409695502370596, + "step": 7324 + }, + { + "epoch": 0.6105, + "grad_norm": 4.625, + "grad_norm_var": 0.11197916666666667, + "learning_rate": 4e-05, + "loss": 4.695, + "loss/crossentropy": 2.046277731657028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989829368889332, + "step": 7326 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.10054931640625, + "learning_rate": 4e-05, + "loss": 5.6208, + "loss/crossentropy": 2.087553530931473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23087459430098534, + "step": 7328 + }, + { + "epoch": 0.6108333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.06222330729166667, + "learning_rate": 4e-05, + "loss": 4.5278, + "loss/crossentropy": 1.748817302286625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16369684785604477, + "step": 7330 + }, + { + "epoch": 0.611, + "grad_norm": 4.78125, + "grad_norm_var": 0.05284830729166667, + "learning_rate": 4e-05, + "loss": 4.5774, + "loss/crossentropy": 2.064897298812866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19502196460962296, + "step": 7332 + }, + { + "epoch": 0.6111666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.045308430989583336, + "learning_rate": 4e-05, + "loss": 4.9905, + "loss/crossentropy": 1.2127100005745888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13186858594417572, + "step": 7334 + }, + { + "epoch": 0.6113333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.06873372395833334, + "learning_rate": 4e-05, + "loss": 4.3084, + "loss/crossentropy": 1.5775117054581642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1517433263361454, + "step": 7336 + }, + { + "epoch": 0.6115, + "grad_norm": 5.6875, + "grad_norm_var": 0.11087239583333333, + "learning_rate": 4e-05, + "loss": 5.3517, + "loss/crossentropy": 2.102786421775818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23160099610686302, + "step": 7338 + }, + { + "epoch": 0.6116666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.11573893229166667, + "learning_rate": 4e-05, + "loss": 4.8422, + "loss/crossentropy": 1.8997114524245262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1689708549529314, + "step": 7340 + }, + { + "epoch": 0.6118333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.15728759765625, + "learning_rate": 4e-05, + "loss": 4.8982, + "loss/crossentropy": 2.1801356077194214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23223424702882767, + "step": 7342 + }, + { + "epoch": 0.612, + "grad_norm": 5.03125, + "grad_norm_var": 0.14894205729166668, + "learning_rate": 4e-05, + "loss": 4.9091, + "loss/crossentropy": 2.1929211616516113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20502665266394615, + "step": 7344 + }, + { + "epoch": 0.6121666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.14576822916666668, + "learning_rate": 4e-05, + "loss": 4.6603, + "loss/crossentropy": 1.4914572164416313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15952800959348679, + "step": 7346 + }, + { + "epoch": 0.6123333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.15396728515625, + "learning_rate": 4e-05, + "loss": 4.6071, + "loss/crossentropy": 1.3951681330800056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1512138657271862, + "step": 7348 + }, + { + "epoch": 0.6125, + "grad_norm": 4.53125, + "grad_norm_var": 0.15078125, + "learning_rate": 4e-05, + "loss": 4.837, + "loss/crossentropy": 1.9563019052147865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891426406800747, + "step": 7350 + }, + { + "epoch": 0.6126666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.12941080729166668, + "learning_rate": 4e-05, + "loss": 5.0656, + "loss/crossentropy": 2.1818154975771904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1797526851296425, + "step": 7352 + }, + { + "epoch": 0.6128333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.09466145833333334, + "learning_rate": 4e-05, + "loss": 4.9402, + "loss/crossentropy": 2.425243556499481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22774088010191917, + "step": 7354 + }, + { + "epoch": 0.613, + "grad_norm": 4.5, + "grad_norm_var": 1.2548177083333334, + "learning_rate": 4e-05, + "loss": 4.8706, + "loss/crossentropy": 2.3366977274417877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20710158720612526, + "step": 7356 + }, + { + "epoch": 0.6131666666666666, + "grad_norm": 4.875, + "grad_norm_var": 1.1992146809895834, + "learning_rate": 4e-05, + "loss": 5.0991, + "loss/crossentropy": 2.3591907620429993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22993936762213707, + "step": 7358 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 1.1929646809895833, + "learning_rate": 4e-05, + "loss": 5.0655, + "loss/crossentropy": 1.9983976259827614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18225537240505219, + "step": 7360 + }, + { + "epoch": 0.6135, + "grad_norm": 4.75, + "grad_norm_var": 1.1788411458333334, + "learning_rate": 4e-05, + "loss": 4.6148, + "loss/crossentropy": 1.7058027386665344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1712406538426876, + "step": 7362 + }, + { + "epoch": 0.6136666666666667, + "grad_norm": 5.0, + "grad_norm_var": 1.16724853515625, + "learning_rate": 4e-05, + "loss": 5.3608, + "loss/crossentropy": 1.91152223944664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19123814068734646, + "step": 7364 + }, + { + "epoch": 0.6138333333333333, + "grad_norm": 6.0, + "grad_norm_var": 1.1889322916666667, + "learning_rate": 4e-05, + "loss": 5.179, + "loss/crossentropy": 1.94556924700737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352980338037014, + "step": 7366 + }, + { + "epoch": 0.614, + "grad_norm": 4.8125, + "grad_norm_var": 1.21763916015625, + "learning_rate": 4e-05, + "loss": 4.8045, + "loss/crossentropy": 2.543876588344574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20267504453659058, + "step": 7368 + }, + { + "epoch": 0.6141666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 1.2033162434895834, + "learning_rate": 4e-05, + "loss": 4.7137, + "loss/crossentropy": 2.6393585205078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21013517677783966, + "step": 7370 + }, + { + "epoch": 0.6143333333333333, + "grad_norm": 5.40625, + "grad_norm_var": 0.12252197265625, + "learning_rate": 4e-05, + "loss": 5.2704, + "loss/crossentropy": 2.2312487065792084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21544279530644417, + "step": 7372 + }, + { + "epoch": 0.6145, + "grad_norm": 4.9375, + "grad_norm_var": 0.12138264973958333, + "learning_rate": 4e-05, + "loss": 4.8709, + "loss/crossentropy": 1.9392684027552605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17965877056121826, + "step": 7374 + }, + { + "epoch": 0.6146666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.11951497395833334, + "learning_rate": 4e-05, + "loss": 4.9357, + "loss/crossentropy": 1.7078978717327118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18131383322179317, + "step": 7376 + }, + { + "epoch": 0.6148333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.12294514973958333, + "learning_rate": 4e-05, + "loss": 4.9165, + "loss/crossentropy": 1.4037601873278618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14496637880802155, + "step": 7378 + }, + { + "epoch": 0.615, + "grad_norm": 4.28125, + "grad_norm_var": 0.15657145182291668, + "learning_rate": 4e-05, + "loss": 4.5314, + "loss/crossentropy": 2.505313813686371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22614199295639992, + "step": 7380 + }, + { + "epoch": 0.6151666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.08131510416666667, + "learning_rate": 4e-05, + "loss": 4.7505, + "loss/crossentropy": 1.5547932982444763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1560380645096302, + "step": 7382 + }, + { + "epoch": 0.6153333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.08300374348958334, + "learning_rate": 4e-05, + "loss": 4.4938, + "loss/crossentropy": 1.399954080581665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15437544509768486, + "step": 7384 + }, + { + "epoch": 0.6155, + "grad_norm": 4.75, + "grad_norm_var": 0.07844645182291667, + "learning_rate": 4e-05, + "loss": 4.8019, + "loss/crossentropy": 2.378181368112564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19977695494890213, + "step": 7386 + }, + { + "epoch": 0.6156666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.06213785807291667, + "learning_rate": 4e-05, + "loss": 4.9903, + "loss/crossentropy": 2.203928917646408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22653977200388908, + "step": 7388 + }, + { + "epoch": 0.6158333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.07096354166666667, + "learning_rate": 4e-05, + "loss": 5.456, + "loss/crossentropy": 2.281449168920517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20882750302553177, + "step": 7390 + }, + { + "epoch": 0.616, + "grad_norm": 4.28125, + "grad_norm_var": 0.09495035807291667, + "learning_rate": 4e-05, + "loss": 3.8264, + "loss/crossentropy": 1.2839400321245193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15107953548431396, + "step": 7392 + }, + { + "epoch": 0.6161666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.09646809895833333, + "learning_rate": 4e-05, + "loss": 4.3006, + "loss/crossentropy": 1.7372924834489822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894401628524065, + "step": 7394 + }, + { + "epoch": 0.6163333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.06378580729166666, + "learning_rate": 4e-05, + "loss": 4.738, + "loss/crossentropy": 2.499216377735138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21850312128663063, + "step": 7396 + }, + { + "epoch": 0.6165, + "grad_norm": 4.53125, + "grad_norm_var": 0.0611328125, + "learning_rate": 4e-05, + "loss": 4.3971, + "loss/crossentropy": 2.232160449028015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22296073287725449, + "step": 7398 + }, + { + "epoch": 0.6166666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.05774332682291667, + "learning_rate": 4e-05, + "loss": 4.5329, + "loss/crossentropy": 1.8162498325109482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18512892350554466, + "step": 7400 + }, + { + "epoch": 0.6168333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.06106363932291667, + "learning_rate": 4e-05, + "loss": 4.7576, + "loss/crossentropy": 2.1651022732257843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21410580724477768, + "step": 7402 + }, + { + "epoch": 0.617, + "grad_norm": 4.65625, + "grad_norm_var": 0.053385416666666664, + "learning_rate": 4e-05, + "loss": 4.4337, + "loss/crossentropy": 1.6273088455200195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19039241410791874, + "step": 7404 + }, + { + "epoch": 0.6171666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.08040364583333333, + "learning_rate": 4e-05, + "loss": 5.1997, + "loss/crossentropy": 2.3385795950889587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21196548268198967, + "step": 7406 + }, + { + "epoch": 0.6173333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.06519775390625, + "learning_rate": 4e-05, + "loss": 4.7667, + "loss/crossentropy": 2.227331906557083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21265869960188866, + "step": 7408 + }, + { + "epoch": 0.6175, + "grad_norm": 5.71875, + "grad_norm_var": 0.12955322265625, + "learning_rate": 4e-05, + "loss": 5.4909, + "loss/crossentropy": 2.0978061258792877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349342331290245, + "step": 7410 + }, + { + "epoch": 0.6176666666666667, + "grad_norm": 5.96875, + "grad_norm_var": 0.20193684895833333, + "learning_rate": 4e-05, + "loss": 5.0476, + "loss/crossentropy": 2.3273271322250366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20061522722244263, + "step": 7412 + }, + { + "epoch": 0.6178333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.19308268229166667, + "learning_rate": 4e-05, + "loss": 4.736, + "loss/crossentropy": 1.521895594894886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1687323059886694, + "step": 7414 + }, + { + "epoch": 0.618, + "grad_norm": 4.53125, + "grad_norm_var": 0.18791910807291667, + "learning_rate": 4e-05, + "loss": 4.4298, + "loss/crossentropy": 2.3158479034900665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2391265109181404, + "step": 7416 + }, + { + "epoch": 0.6181666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.17003580729166667, + "learning_rate": 4e-05, + "loss": 5.0987, + "loss/crossentropy": 1.928847923874855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19217653200030327, + "step": 7418 + }, + { + "epoch": 0.6183333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.14069010416666666, + "learning_rate": 4e-05, + "loss": 5.3043, + "loss/crossentropy": 2.1934374272823334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24121011793613434, + "step": 7420 + }, + { + "epoch": 0.6185, + "grad_norm": 5.0, + "grad_norm_var": 0.17079671223958334, + "learning_rate": 4e-05, + "loss": 5.1373, + "loss/crossentropy": 1.699301615357399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16717437282204628, + "step": 7422 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.17146809895833334, + "learning_rate": 4e-05, + "loss": 5.0396, + "loss/crossentropy": 2.2893040478229523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21165359392762184, + "step": 7424 + }, + { + "epoch": 0.6188333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.1810546875, + "learning_rate": 4e-05, + "loss": 4.2959, + "loss/crossentropy": 1.883228361606598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17740708962082863, + "step": 7426 + }, + { + "epoch": 0.619, + "grad_norm": 4.96875, + "grad_norm_var": 0.11334635416666666, + "learning_rate": 4e-05, + "loss": 4.9929, + "loss/crossentropy": 1.7570676431059837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17012443765997887, + "step": 7428 + }, + { + "epoch": 0.6191666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.10549723307291667, + "learning_rate": 4e-05, + "loss": 5.0408, + "loss/crossentropy": 1.8213330134749413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1704744454473257, + "step": 7430 + }, + { + "epoch": 0.6193333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.09777018229166666, + "learning_rate": 4e-05, + "loss": 5.172, + "loss/crossentropy": 1.6421629637479782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18456821888685226, + "step": 7432 + }, + { + "epoch": 0.6195, + "grad_norm": 4.625, + "grad_norm_var": 0.103125, + "learning_rate": 4e-05, + "loss": 4.587, + "loss/crossentropy": 1.8029606891795993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16571101709268987, + "step": 7434 + }, + { + "epoch": 0.6196666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.10384114583333333, + "learning_rate": 4e-05, + "loss": 5.0261, + "loss/crossentropy": 0.9726422056555748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1137815099209547, + "step": 7436 + }, + { + "epoch": 0.6198333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.045817057291666664, + "learning_rate": 4e-05, + "loss": 5.0299, + "loss/crossentropy": 1.7549875676631927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18394076451659203, + "step": 7438 + }, + { + "epoch": 0.62, + "grad_norm": 4.78125, + "grad_norm_var": 0.04550374348958333, + "learning_rate": 4e-05, + "loss": 4.4456, + "loss/crossentropy": 1.8778206706047058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20654203370213509, + "step": 7440 + }, + { + "epoch": 0.6201666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.02984619140625, + "learning_rate": 4e-05, + "loss": 4.7478, + "loss/crossentropy": 2.4427354633808136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256711333990097, + "step": 7442 + }, + { + "epoch": 0.6203333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.03866780598958333, + "learning_rate": 4e-05, + "loss": 4.5859, + "loss/crossentropy": 2.2122822999954224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503214538097382, + "step": 7444 + }, + { + "epoch": 0.6205, + "grad_norm": 4.65625, + "grad_norm_var": 0.042252604166666666, + "learning_rate": 4e-05, + "loss": 4.5618, + "loss/crossentropy": 2.0259710252285004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19461403414607048, + "step": 7446 + }, + { + "epoch": 0.6206666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.041910807291666664, + "learning_rate": 4e-05, + "loss": 5.1102, + "loss/crossentropy": 2.308306932449341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225881177932024, + "step": 7448 + }, + { + "epoch": 0.6208333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.04159749348958333, + "learning_rate": 4e-05, + "loss": 5.0064, + "loss/crossentropy": 2.103279024362564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2261260412633419, + "step": 7450 + }, + { + "epoch": 0.621, + "grad_norm": 4.8125, + "grad_norm_var": 0.026936848958333332, + "learning_rate": 4e-05, + "loss": 5.1623, + "loss/crossentropy": 1.6729852855205536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.169599249958992, + "step": 7452 + }, + { + "epoch": 0.6211666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.01734619140625, + "learning_rate": 4e-05, + "loss": 4.3505, + "loss/crossentropy": 1.8214651942253113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827968992292881, + "step": 7454 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 4e-05, + "loss": 4.4708, + "loss/crossentropy": 1.6592305526137352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15527719631791115, + "step": 7456 + }, + { + "epoch": 0.6215, + "grad_norm": 5.03125, + "grad_norm_var": 0.03435872395833333, + "learning_rate": 4e-05, + "loss": 5.0017, + "loss/crossentropy": 2.079524904489517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18644290789961815, + "step": 7458 + }, + { + "epoch": 0.6216666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.029423014322916666, + "learning_rate": 4e-05, + "loss": 4.6251, + "loss/crossentropy": 1.945441111922264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1680115982890129, + "step": 7460 + }, + { + "epoch": 0.6218333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.03189697265625, + "learning_rate": 4e-05, + "loss": 4.8618, + "loss/crossentropy": 2.3849238753318787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19866472855210304, + "step": 7462 + }, + { + "epoch": 0.622, + "grad_norm": 4.46875, + "grad_norm_var": 0.04309895833333333, + "learning_rate": 4e-05, + "loss": 4.4236, + "loss/crossentropy": 1.5672996863722801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.160741176456213, + "step": 7464 + }, + { + "epoch": 0.6221666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.04309895833333333, + "learning_rate": 4e-05, + "loss": 4.7951, + "loss/crossentropy": 1.4478224888443947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15010821633040905, + "step": 7466 + }, + { + "epoch": 0.6223333333333333, + "grad_norm": 4.21875, + "grad_norm_var": 0.09257405598958333, + "learning_rate": 4e-05, + "loss": 4.695, + "loss/crossentropy": 1.4308890029788017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18856440111994743, + "step": 7468 + }, + { + "epoch": 0.6225, + "grad_norm": 4.6875, + "grad_norm_var": 0.09608968098958333, + "learning_rate": 4e-05, + "loss": 4.9713, + "loss/crossentropy": 1.7661740481853485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19487347826361656, + "step": 7470 + }, + { + "epoch": 0.6226666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.09607747395833334, + "learning_rate": 4e-05, + "loss": 4.7833, + "loss/crossentropy": 1.2931400835514069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15353696048259735, + "step": 7472 + }, + { + "epoch": 0.6228333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.08606770833333334, + "learning_rate": 4e-05, + "loss": 5.6731, + "loss/crossentropy": 2.2657946050167084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23214036226272583, + "step": 7474 + }, + { + "epoch": 0.623, + "grad_norm": 4.5625, + "grad_norm_var": 0.104150390625, + "learning_rate": 4e-05, + "loss": 4.544, + "loss/crossentropy": 1.616468869149685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15361887589097023, + "step": 7476 + }, + { + "epoch": 0.6231666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.100634765625, + "learning_rate": 4e-05, + "loss": 5.0915, + "loss/crossentropy": 1.716950848698616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18519181199371815, + "step": 7478 + }, + { + "epoch": 0.6233333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.08511962890625, + "learning_rate": 4e-05, + "loss": 4.7246, + "loss/crossentropy": 2.2773490250110626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976032555103302, + "step": 7480 + }, + { + "epoch": 0.6235, + "grad_norm": 4.65625, + "grad_norm_var": 0.11910400390625, + "learning_rate": 4e-05, + "loss": 4.9252, + "loss/crossentropy": 1.9374547228217125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17996854707598686, + "step": 7482 + }, + { + "epoch": 0.6236666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.088134765625, + "learning_rate": 4e-05, + "loss": 4.6941, + "loss/crossentropy": 2.4025683403015137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20034634694457054, + "step": 7484 + }, + { + "epoch": 0.6238333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.08339436848958333, + "learning_rate": 4e-05, + "loss": 5.1253, + "loss/crossentropy": 1.978261910378933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603325076401234, + "step": 7486 + }, + { + "epoch": 0.624, + "grad_norm": 4.625, + "grad_norm_var": 0.08707275390625, + "learning_rate": 4e-05, + "loss": 5.0676, + "loss/crossentropy": 2.561383068561554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25106481462717056, + "step": 7488 + }, + { + "epoch": 0.6241666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.09231363932291667, + "learning_rate": 4e-05, + "loss": 5.1759, + "loss/crossentropy": 1.8795787617564201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17086850106716156, + "step": 7490 + }, + { + "epoch": 0.6243333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.08017171223958333, + "learning_rate": 4e-05, + "loss": 4.4529, + "loss/crossentropy": 2.1656236350536346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20800131186842918, + "step": 7492 + }, + { + "epoch": 0.6245, + "grad_norm": 4.84375, + "grad_norm_var": 0.07763264973958334, + "learning_rate": 4e-05, + "loss": 5.2549, + "loss/crossentropy": 2.350065529346466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21179821342229843, + "step": 7494 + }, + { + "epoch": 0.6246666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.08951822916666667, + "learning_rate": 4e-05, + "loss": 4.4729, + "loss/crossentropy": 2.0494428873062134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21732918173074722, + "step": 7496 + }, + { + "epoch": 0.6248333333333334, + "grad_norm": 5.40625, + "grad_norm_var": 0.07083333333333333, + "learning_rate": 4e-05, + "loss": 5.1736, + "loss/crossentropy": 2.5712009966373444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21002069488167763, + "step": 7498 + }, + { + "epoch": 0.625, + "grad_norm": 5.0625, + "grad_norm_var": 0.058056640625, + "learning_rate": 4e-05, + "loss": 4.6009, + "loss/crossentropy": 1.6923210993409157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1651032567024231, + "step": 7500 + }, + { + "epoch": 0.6251666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.06161702473958333, + "learning_rate": 4e-05, + "loss": 4.2048, + "loss/crossentropy": 0.818359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10744648613035679, + "step": 7502 + }, + { + "epoch": 0.6253333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.06638997395833333, + "learning_rate": 4e-05, + "loss": 4.3508, + "loss/crossentropy": 1.6889969930052757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20452985167503357, + "step": 7504 + }, + { + "epoch": 0.6255, + "grad_norm": 5.25, + "grad_norm_var": 0.06679280598958333, + "learning_rate": 4e-05, + "loss": 4.9417, + "loss/crossentropy": 1.9900458455085754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413101464509964, + "step": 7506 + }, + { + "epoch": 0.6256666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.07245686848958334, + "learning_rate": 4e-05, + "loss": 4.9704, + "loss/crossentropy": 2.0661009550094604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114063873887062, + "step": 7508 + }, + { + "epoch": 0.6258333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.07298177083333333, + "learning_rate": 4e-05, + "loss": 4.8257, + "loss/crossentropy": 1.449072316288948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15182125568389893, + "step": 7510 + }, + { + "epoch": 0.626, + "grad_norm": 4.96875, + "grad_norm_var": 0.06640218098958334, + "learning_rate": 4e-05, + "loss": 5.5008, + "loss/crossentropy": 2.6559234857559204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21250774711370468, + "step": 7512 + }, + { + "epoch": 0.6261666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.05933837890625, + "learning_rate": 4e-05, + "loss": 5.0174, + "loss/crossentropy": 1.3103863522410393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14163080044090748, + "step": 7514 + }, + { + "epoch": 0.6263333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.06064046223958333, + "learning_rate": 4e-05, + "loss": 5.2193, + "loss/crossentropy": 2.289563000202179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21135593950748444, + "step": 7516 + }, + { + "epoch": 0.6265, + "grad_norm": 4.5625, + "grad_norm_var": 0.06496988932291667, + "learning_rate": 4e-05, + "loss": 5.1146, + "loss/crossentropy": 2.310837507247925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19352519512176514, + "step": 7518 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.07411702473958333, + "learning_rate": 4e-05, + "loss": 4.5612, + "loss/crossentropy": 1.3394847959280014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13622993975877762, + "step": 7520 + }, + { + "epoch": 0.6268333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.07394205729166667, + "learning_rate": 4e-05, + "loss": 5.2805, + "loss/crossentropy": 1.9260080009698868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18235324323177338, + "step": 7522 + }, + { + "epoch": 0.627, + "grad_norm": 5.15625, + "grad_norm_var": 0.056494140625, + "learning_rate": 4e-05, + "loss": 5.5087, + "loss/crossentropy": 2.140301376581192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202216237783432, + "step": 7524 + }, + { + "epoch": 0.6271666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.06822916666666666, + "learning_rate": 4e-05, + "loss": 4.4574, + "loss/crossentropy": 2.1392059326171875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24074340984225273, + "step": 7526 + }, + { + "epoch": 0.6273333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.064697265625, + "learning_rate": 4e-05, + "loss": 4.4673, + "loss/crossentropy": 1.8771500810980797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791480854153633, + "step": 7528 + }, + { + "epoch": 0.6275, + "grad_norm": 4.96875, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 4e-05, + "loss": 4.9868, + "loss/crossentropy": 1.9266760498285294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410490244627, + "step": 7530 + }, + { + "epoch": 0.6276666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.06809488932291667, + "learning_rate": 4e-05, + "loss": 5.5456, + "loss/crossentropy": 2.4995266795158386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117721870541573, + "step": 7532 + }, + { + "epoch": 0.6278333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.073681640625, + "learning_rate": 4e-05, + "loss": 4.5547, + "loss/crossentropy": 2.4195366203784943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22145430743694305, + "step": 7534 + }, + { + "epoch": 0.628, + "grad_norm": 4.3125, + "grad_norm_var": 0.072509765625, + "learning_rate": 4e-05, + "loss": 4.1866, + "loss/crossentropy": 1.2680030390620232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13905576802790165, + "step": 7536 + }, + { + "epoch": 0.6281666666666667, + "grad_norm": 5.40625, + "grad_norm_var": 0.08404947916666666, + "learning_rate": 4e-05, + "loss": 5.0348, + "loss/crossentropy": 1.3818910643458366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14404772967100143, + "step": 7538 + }, + { + "epoch": 0.6283333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.07902018229166667, + "learning_rate": 4e-05, + "loss": 5.0873, + "loss/crossentropy": 1.975064903497696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19577939808368683, + "step": 7540 + }, + { + "epoch": 0.6285, + "grad_norm": 4.46875, + "grad_norm_var": 0.078515625, + "learning_rate": 4e-05, + "loss": 4.6554, + "loss/crossentropy": 1.7756718024611473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17718594893813133, + "step": 7542 + }, + { + "epoch": 0.6286666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.08674723307291667, + "learning_rate": 4e-05, + "loss": 4.5838, + "loss/crossentropy": 1.5631278902292252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101314038038254, + "step": 7544 + }, + { + "epoch": 0.6288333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.09472249348958334, + "learning_rate": 4e-05, + "loss": 4.3893, + "loss/crossentropy": 2.0111151337623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1673036515712738, + "step": 7546 + }, + { + "epoch": 0.629, + "grad_norm": 4.34375, + "grad_norm_var": 0.10676676432291667, + "learning_rate": 4e-05, + "loss": 4.9599, + "loss/crossentropy": 1.497259370982647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15227816626429558, + "step": 7548 + }, + { + "epoch": 0.6291666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.11248372395833334, + "learning_rate": 4e-05, + "loss": 4.2137, + "loss/crossentropy": 2.412768602371216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20763478055596352, + "step": 7550 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.10826822916666666, + "learning_rate": 4e-05, + "loss": 4.8871, + "loss/crossentropy": 1.8732339143753052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120702899992466, + "step": 7552 + }, + { + "epoch": 0.6295, + "grad_norm": 5.125, + "grad_norm_var": 0.09029541015625, + "learning_rate": 4e-05, + "loss": 4.7153, + "loss/crossentropy": 2.2625193893909454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922021098434925, + "step": 7554 + }, + { + "epoch": 0.6296666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.09811197916666667, + "learning_rate": 4e-05, + "loss": 5.2024, + "loss/crossentropy": 2.018505483865738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894335299730301, + "step": 7556 + }, + { + "epoch": 0.6298333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.09052327473958334, + "learning_rate": 4e-05, + "loss": 5.1375, + "loss/crossentropy": 1.974723607301712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21125193685293198, + "step": 7558 + }, + { + "epoch": 0.63, + "grad_norm": 4.625, + "grad_norm_var": 0.12745768229166668, + "learning_rate": 4e-05, + "loss": 4.7333, + "loss/crossentropy": 1.1625150069594383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1372869722545147, + "step": 7560 + }, + { + "epoch": 0.6301666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.911328125, + "learning_rate": 4e-05, + "loss": 4.9296, + "loss/crossentropy": 1.8650590181350708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1866096295416355, + "step": 7562 + }, + { + "epoch": 0.6303333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.8719889322916666, + "learning_rate": 4e-05, + "loss": 4.8647, + "loss/crossentropy": 2.071987845003605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294048868119717, + "step": 7564 + }, + { + "epoch": 0.6305, + "grad_norm": 5.3125, + "grad_norm_var": 0.82847900390625, + "learning_rate": 4e-05, + "loss": 4.9965, + "loss/crossentropy": 1.72061687707901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1748257651925087, + "step": 7566 + }, + { + "epoch": 0.6306666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.895166015625, + "learning_rate": 4e-05, + "loss": 4.1251, + "loss/crossentropy": 2.17667618393898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039562426507473, + "step": 7568 + }, + { + "epoch": 0.6308333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.882421875, + "learning_rate": 4e-05, + "loss": 4.9057, + "loss/crossentropy": 1.8860152289271355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19144929759204388, + "step": 7570 + }, + { + "epoch": 0.631, + "grad_norm": 4.4375, + "grad_norm_var": 0.9200480143229167, + "learning_rate": 4e-05, + "loss": 4.4815, + "loss/crossentropy": 2.2932342290878296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954507753252983, + "step": 7572 + }, + { + "epoch": 0.6311666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.9251139322916667, + "learning_rate": 4e-05, + "loss": 5.395, + "loss/crossentropy": 2.4433215856552124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20751015469431877, + "step": 7574 + }, + { + "epoch": 0.6313333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.9438802083333333, + "learning_rate": 4e-05, + "loss": 4.6266, + "loss/crossentropy": 2.486309230327606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23241155594587326, + "step": 7576 + }, + { + "epoch": 0.6315, + "grad_norm": 4.5625, + "grad_norm_var": 0.092041015625, + "learning_rate": 4e-05, + "loss": 4.7142, + "loss/crossentropy": 1.9126665592193604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041993960738182, + "step": 7578 + }, + { + "epoch": 0.6316666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.09306233723958333, + "learning_rate": 4e-05, + "loss": 5.2434, + "loss/crossentropy": 1.77960654348135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16748891957104206, + "step": 7580 + }, + { + "epoch": 0.6318333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.07392171223958334, + "learning_rate": 4e-05, + "loss": 4.9963, + "loss/crossentropy": 1.656422920525074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18162991851568222, + "step": 7582 + }, + { + "epoch": 0.632, + "grad_norm": 4.75, + "grad_norm_var": 0.05924072265625, + "learning_rate": 4e-05, + "loss": 5.164, + "loss/crossentropy": 1.8230694606900215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17624171450734138, + "step": 7584 + }, + { + "epoch": 0.6321666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.06884358723958334, + "learning_rate": 4e-05, + "loss": 5.0233, + "loss/crossentropy": 1.4461549371480942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15429365262389183, + "step": 7586 + }, + { + "epoch": 0.6323333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.06164957682291667, + "learning_rate": 4e-05, + "loss": 5.1898, + "loss/crossentropy": 2.1677410900592804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.191360954195261, + "step": 7588 + }, + { + "epoch": 0.6325, + "grad_norm": 4.84375, + "grad_norm_var": 0.05924072265625, + "learning_rate": 4e-05, + "loss": 5.0244, + "loss/crossentropy": 1.9723278135061264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18035429902374744, + "step": 7590 + }, + { + "epoch": 0.6326666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.048567708333333334, + "learning_rate": 4e-05, + "loss": 4.6673, + "loss/crossentropy": 1.7898750752210617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19324330985546112, + "step": 7592 + }, + { + "epoch": 0.6328333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.053446451822916664, + "learning_rate": 4e-05, + "loss": 4.982, + "loss/crossentropy": 2.2220281660556793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20492373406887054, + "step": 7594 + }, + { + "epoch": 0.633, + "grad_norm": 4.5625, + "grad_norm_var": 0.057535807291666664, + "learning_rate": 4e-05, + "loss": 5.0144, + "loss/crossentropy": 1.870813049376011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17919510789215565, + "step": 7596 + }, + { + "epoch": 0.6331666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.05536702473958333, + "learning_rate": 4e-05, + "loss": 4.7825, + "loss/crossentropy": 1.8371545374393463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20575408078730106, + "step": 7598 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.057417805989583334, + "learning_rate": 4e-05, + "loss": 4.8203, + "loss/crossentropy": 1.3351125866174698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14372671209275723, + "step": 7600 + }, + { + "epoch": 0.6335, + "grad_norm": 5.09375, + "grad_norm_var": 0.05584309895833333, + "learning_rate": 4e-05, + "loss": 5.2663, + "loss/crossentropy": 2.3292965590953827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20524241402745247, + "step": 7602 + }, + { + "epoch": 0.6336666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.05517171223958333, + "learning_rate": 4e-05, + "loss": 4.7048, + "loss/crossentropy": 2.4347563982009888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2168492116034031, + "step": 7604 + }, + { + "epoch": 0.6338333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.05689697265625, + "learning_rate": 4e-05, + "loss": 4.607, + "loss/crossentropy": 2.1245033144950867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22462333366274834, + "step": 7606 + }, + { + "epoch": 0.634, + "grad_norm": 5.6875, + "grad_norm_var": 0.08586832682291666, + "learning_rate": 4e-05, + "loss": 4.44, + "loss/crossentropy": 1.540616787970066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18548547476530075, + "step": 7608 + }, + { + "epoch": 0.6341666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.09299723307291667, + "learning_rate": 4e-05, + "loss": 4.502, + "loss/crossentropy": 1.3562129810452461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18716447241604328, + "step": 7610 + }, + { + "epoch": 0.6343333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.08883056640625, + "learning_rate": 4e-05, + "loss": 4.3424, + "loss/crossentropy": 1.135543867945671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14085247367620468, + "step": 7612 + }, + { + "epoch": 0.6345, + "grad_norm": 4.9375, + "grad_norm_var": 0.10015869140625, + "learning_rate": 4e-05, + "loss": 5.2584, + "loss/crossentropy": 2.25225293636322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21712664887309074, + "step": 7614 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.09390869140625, + "learning_rate": 4e-05, + "loss": 5.0247, + "loss/crossentropy": 1.9538735672831535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17630420625209808, + "step": 7616 + }, + { + "epoch": 0.6348333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.112353515625, + "learning_rate": 4e-05, + "loss": 3.9214, + "loss/crossentropy": 1.6917081847786903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17729554325342178, + "step": 7618 + }, + { + "epoch": 0.635, + "grad_norm": 6.4375, + "grad_norm_var": 0.48606770833333335, + "learning_rate": 4e-05, + "loss": 5.2245, + "loss/crossentropy": 2.008490338921547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19701511040329933, + "step": 7620 + }, + { + "epoch": 0.6351666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.49247639973958335, + "learning_rate": 4e-05, + "loss": 5.2463, + "loss/crossentropy": 1.7468746230006218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2234695442020893, + "step": 7622 + }, + { + "epoch": 0.6353333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.4777628580729167, + "learning_rate": 4e-05, + "loss": 4.6194, + "loss/crossentropy": 1.1543590053915977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1463829204440117, + "step": 7624 + }, + { + "epoch": 0.6355, + "grad_norm": 4.96875, + "grad_norm_var": 0.49244791666666665, + "learning_rate": 4e-05, + "loss": 4.8176, + "loss/crossentropy": 1.722088746726513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1737196370959282, + "step": 7626 + }, + { + "epoch": 0.6356666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.4774698893229167, + "learning_rate": 4e-05, + "loss": 5.0913, + "loss/crossentropy": 2.0701277554035187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123371586203575, + "step": 7628 + }, + { + "epoch": 0.6358333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.4684244791666667, + "learning_rate": 4e-05, + "loss": 5.1948, + "loss/crossentropy": 1.5187406539916992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752415895462036, + "step": 7630 + }, + { + "epoch": 0.636, + "grad_norm": 5.3125, + "grad_norm_var": 0.48138020833333334, + "learning_rate": 4e-05, + "loss": 5.2926, + "loss/crossentropy": 2.065512478351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2165343016386032, + "step": 7632 + }, + { + "epoch": 0.6361666666666667, + "grad_norm": 4.125, + "grad_norm_var": 0.5079060872395833, + "learning_rate": 4e-05, + "loss": 4.5435, + "loss/crossentropy": 1.003174789249897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11726399697363377, + "step": 7634 + }, + { + "epoch": 0.6363333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.09542643229166667, + "learning_rate": 4e-05, + "loss": 4.9272, + "loss/crossentropy": 2.772099256515503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22614924237132072, + "step": 7636 + }, + { + "epoch": 0.6365, + "grad_norm": 4.5, + "grad_norm_var": 0.10167643229166666, + "learning_rate": 4e-05, + "loss": 5.1145, + "loss/crossentropy": 2.4037272334098816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22226005792617798, + "step": 7638 + }, + { + "epoch": 0.6366666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.08762613932291667, + "learning_rate": 4e-05, + "loss": 5.2337, + "loss/crossentropy": 2.631228506565094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077028527855873, + "step": 7640 + }, + { + "epoch": 0.6368333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.080322265625, + "learning_rate": 4e-05, + "loss": 5.4376, + "loss/crossentropy": 2.5499427318573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166195549070835, + "step": 7642 + }, + { + "epoch": 0.637, + "grad_norm": 4.9375, + "grad_norm_var": 0.0669921875, + "learning_rate": 4e-05, + "loss": 5.0609, + "loss/crossentropy": 2.2906173169612885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19342058897018433, + "step": 7644 + }, + { + "epoch": 0.6371666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.08687744140625, + "learning_rate": 4e-05, + "loss": 4.6802, + "loss/crossentropy": 1.5942266285419464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20375887118279934, + "step": 7646 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.07571207682291667, + "learning_rate": 4e-05, + "loss": 4.6925, + "loss/crossentropy": 1.7787268534302711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17380007728934288, + "step": 7648 + }, + { + "epoch": 0.6375, + "grad_norm": 4.84375, + "grad_norm_var": 0.08821614583333333, + "learning_rate": 4e-05, + "loss": 5.239, + "loss/crossentropy": 2.2324607968330383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22638193517923355, + "step": 7650 + }, + { + "epoch": 0.6376666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.1076171875, + "learning_rate": 4e-05, + "loss": 5.1601, + "loss/crossentropy": 2.2216447293758392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244611196219921, + "step": 7652 + }, + { + "epoch": 0.6378333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.102978515625, + "learning_rate": 4e-05, + "loss": 5.0098, + "loss/crossentropy": 1.621771179139614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16405992954969406, + "step": 7654 + }, + { + "epoch": 0.638, + "grad_norm": 4.59375, + "grad_norm_var": 0.108056640625, + "learning_rate": 4e-05, + "loss": 4.9768, + "loss/crossentropy": 1.562087506055832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15578421019017696, + "step": 7656 + }, + { + "epoch": 0.6381666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.113134765625, + "learning_rate": 4e-05, + "loss": 4.6871, + "loss/crossentropy": 1.9255924224853516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294049426913261, + "step": 7658 + }, + { + "epoch": 0.6383333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.14205322265625, + "learning_rate": 4e-05, + "loss": 4.3626, + "loss/crossentropy": 1.799445129930973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18437882140278816, + "step": 7660 + }, + { + "epoch": 0.6385, + "grad_norm": 4.8125, + "grad_norm_var": 0.12337239583333333, + "learning_rate": 4e-05, + "loss": 4.7016, + "loss/crossentropy": 1.9358659163117409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17831017449498177, + "step": 7662 + }, + { + "epoch": 0.6386666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.11278889973958334, + "learning_rate": 4e-05, + "loss": 4.6857, + "loss/crossentropy": 2.1596154496073723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186475221067667, + "step": 7664 + }, + { + "epoch": 0.6388333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.07310791015625, + "learning_rate": 4e-05, + "loss": 4.7901, + "loss/crossentropy": 1.5760779231786728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756380796432495, + "step": 7666 + }, + { + "epoch": 0.639, + "grad_norm": 4.6875, + "grad_norm_var": 0.05640869140625, + "learning_rate": 4e-05, + "loss": 5.2057, + "loss/crossentropy": 2.23982173204422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1747075356543064, + "step": 7668 + }, + { + "epoch": 0.6391666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.053450520833333334, + "learning_rate": 4e-05, + "loss": 4.9526, + "loss/crossentropy": 1.7911747694015503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22356000542640686, + "step": 7670 + }, + { + "epoch": 0.6393333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.04879150390625, + "learning_rate": 4e-05, + "loss": 5.1562, + "loss/crossentropy": 1.8702715337276459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837063878774643, + "step": 7672 + }, + { + "epoch": 0.6395, + "grad_norm": 4.84375, + "grad_norm_var": 0.04827067057291667, + "learning_rate": 4e-05, + "loss": 5.0209, + "loss/crossentropy": 1.9256580173969269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18390294164419174, + "step": 7674 + }, + { + "epoch": 0.6396666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.019917805989583332, + "learning_rate": 4e-05, + "loss": 4.7435, + "loss/crossentropy": 1.92479457706213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19825425744056702, + "step": 7676 + }, + { + "epoch": 0.6398333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.0185546875, + "learning_rate": 4e-05, + "loss": 5.3588, + "loss/crossentropy": 2.2302474975585938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21018041297793388, + "step": 7678 + }, + { + "epoch": 0.64, + "grad_norm": 4.8125, + "grad_norm_var": 0.017431640625, + "learning_rate": 4e-05, + "loss": 4.9013, + "loss/crossentropy": 2.454475373029709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20873108878731728, + "step": 7680 + }, + { + "epoch": 0.6401666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.02008056640625, + "learning_rate": 4e-05, + "loss": 5.0343, + "loss/crossentropy": 1.556854486465454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1480635069310665, + "step": 7682 + }, + { + "epoch": 0.6403333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.029166666666666667, + "learning_rate": 4e-05, + "loss": 4.6421, + "loss/crossentropy": 1.8543910756707191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16730140149593353, + "step": 7684 + }, + { + "epoch": 0.6405, + "grad_norm": 4.875, + "grad_norm_var": 0.04205322265625, + "learning_rate": 4e-05, + "loss": 4.8912, + "loss/crossentropy": 2.4121593832969666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19549377635121346, + "step": 7686 + }, + { + "epoch": 0.6406666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.04146728515625, + "learning_rate": 4e-05, + "loss": 4.1813, + "loss/crossentropy": 1.6175341084599495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17516151070594788, + "step": 7688 + }, + { + "epoch": 0.6408333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.051656087239583336, + "learning_rate": 4e-05, + "loss": 5.1848, + "loss/crossentropy": 2.531389057636261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26043013855814934, + "step": 7690 + }, + { + "epoch": 0.641, + "grad_norm": 5.03125, + "grad_norm_var": 0.0525390625, + "learning_rate": 4e-05, + "loss": 5.6131, + "loss/crossentropy": 1.8920731022953987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1748149711638689, + "step": 7692 + }, + { + "epoch": 0.6411666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.054947916666666666, + "learning_rate": 4e-05, + "loss": 5.2251, + "loss/crossentropy": 2.11933171749115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19750068709254265, + "step": 7694 + }, + { + "epoch": 0.6413333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.30357666015625, + "learning_rate": 4e-05, + "loss": 4.493, + "loss/crossentropy": 2.253427028656006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19837215542793274, + "step": 7696 + }, + { + "epoch": 0.6415, + "grad_norm": 4.96875, + "grad_norm_var": 0.30155843098958335, + "learning_rate": 4e-05, + "loss": 4.5354, + "loss/crossentropy": 0.9745614528656006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10782595910131931, + "step": 7698 + }, + { + "epoch": 0.6416666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.2813639322916667, + "learning_rate": 4e-05, + "loss": 5.3764, + "loss/crossentropy": 2.150584578514099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23296813294291496, + "step": 7700 + }, + { + "epoch": 0.6418333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.26998697916666664, + "learning_rate": 4e-05, + "loss": 5.004, + "loss/crossentropy": 2.3049569725990295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985363632440567, + "step": 7702 + }, + { + "epoch": 0.642, + "grad_norm": 4.84375, + "grad_norm_var": 0.2631144205729167, + "learning_rate": 4e-05, + "loss": 4.3756, + "loss/crossentropy": 1.7743771374225616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020826954394579, + "step": 7704 + }, + { + "epoch": 0.6421666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.25484619140625, + "learning_rate": 4e-05, + "loss": 5.093, + "loss/crossentropy": 1.6929708272218704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18312661536037922, + "step": 7706 + }, + { + "epoch": 0.6423333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.26470947265625, + "learning_rate": 4e-05, + "loss": 5.1116, + "loss/crossentropy": 2.522721290588379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22130097448825836, + "step": 7708 + }, + { + "epoch": 0.6425, + "grad_norm": 4.90625, + "grad_norm_var": 0.2642578125, + "learning_rate": 4e-05, + "loss": 5.7148, + "loss/crossentropy": 2.4536512196063995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.236469566822052, + "step": 7710 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.04290364583333333, + "learning_rate": 4e-05, + "loss": 4.4442, + "loss/crossentropy": 2.4721298217773438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147650420665741, + "step": 7712 + }, + { + "epoch": 0.6428333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.039286295572916664, + "learning_rate": 4e-05, + "loss": 5.2998, + "loss/crossentropy": 1.9547239020466805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19260060787200928, + "step": 7714 + }, + { + "epoch": 0.643, + "grad_norm": 4.84375, + "grad_norm_var": 0.037919108072916666, + "learning_rate": 4e-05, + "loss": 4.6844, + "loss/crossentropy": 1.1047619432210922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15300701186060905, + "step": 7716 + }, + { + "epoch": 0.6431666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.03424479166666667, + "learning_rate": 4e-05, + "loss": 4.967, + "loss/crossentropy": 1.8797119855880737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17335743829607964, + "step": 7718 + }, + { + "epoch": 0.6433333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.035445149739583334, + "learning_rate": 4e-05, + "loss": 5.6001, + "loss/crossentropy": 1.82417381554842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801386997103691, + "step": 7720 + }, + { + "epoch": 0.6435, + "grad_norm": 4.71875, + "grad_norm_var": 0.03954671223958333, + "learning_rate": 4e-05, + "loss": 5.1648, + "loss/crossentropy": 1.1887472122907639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17347997426986694, + "step": 7722 + }, + { + "epoch": 0.6436666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.04073893229166667, + "learning_rate": 4e-05, + "loss": 4.9365, + "loss/crossentropy": 2.210069417953491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106613926589489, + "step": 7724 + }, + { + "epoch": 0.6438333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.04937744140625, + "learning_rate": 4e-05, + "loss": 4.6794, + "loss/crossentropy": 1.8511146306991577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16800183057785034, + "step": 7726 + }, + { + "epoch": 0.644, + "grad_norm": 4.875, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 4e-05, + "loss": 5.1563, + "loss/crossentropy": 2.2941965758800507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198324017226696, + "step": 7728 + }, + { + "epoch": 0.6441666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.04576822916666667, + "learning_rate": 4e-05, + "loss": 5.0422, + "loss/crossentropy": 2.0403133034706116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22183560580015182, + "step": 7730 + }, + { + "epoch": 0.6443333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.054036458333333336, + "learning_rate": 4e-05, + "loss": 4.7523, + "loss/crossentropy": 2.1832509338855743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20109236985445023, + "step": 7732 + }, + { + "epoch": 0.6445, + "grad_norm": 4.65625, + "grad_norm_var": 0.046728515625, + "learning_rate": 4e-05, + "loss": 5.1595, + "loss/crossentropy": 2.494588077068329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531383857131004, + "step": 7734 + }, + { + "epoch": 0.6446666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.03834228515625, + "learning_rate": 4e-05, + "loss": 4.8467, + "loss/crossentropy": 2.134985640645027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793641895055771, + "step": 7736 + }, + { + "epoch": 0.6448333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 4e-05, + "loss": 4.5305, + "loss/crossentropy": 1.2465531900525093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13715138658881187, + "step": 7738 + }, + { + "epoch": 0.645, + "grad_norm": 4.84375, + "grad_norm_var": 0.0361328125, + "learning_rate": 4e-05, + "loss": 4.0631, + "loss/crossentropy": 1.8641687408089638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15544311329722404, + "step": 7740 + }, + { + "epoch": 0.6451666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.06328125, + "learning_rate": 4e-05, + "loss": 5.2158, + "loss/crossentropy": 2.3051227927207947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19798275083303452, + "step": 7742 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.08329671223958333, + "learning_rate": 4e-05, + "loss": 5.2336, + "loss/crossentropy": 2.479782283306122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21739281341433525, + "step": 7744 + }, + { + "epoch": 0.6455, + "grad_norm": 4.75, + "grad_norm_var": 0.08508707682291666, + "learning_rate": 4e-05, + "loss": 4.7113, + "loss/crossentropy": 1.2954497933387756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15466006100177765, + "step": 7746 + }, + { + "epoch": 0.6456666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.07316080729166667, + "learning_rate": 4e-05, + "loss": 5.0855, + "loss/crossentropy": 2.1735799908638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.240029476583004, + "step": 7748 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.072900390625, + "learning_rate": 4e-05, + "loss": 4.7663, + "loss/crossentropy": 2.0483902767300606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18308479711413383, + "step": 7750 + }, + { + "epoch": 0.646, + "grad_norm": 4.625, + "grad_norm_var": 0.07317708333333334, + "learning_rate": 4e-05, + "loss": 4.6245, + "loss/crossentropy": 1.47433602809906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1538691483438015, + "step": 7752 + }, + { + "epoch": 0.6461666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.06796468098958333, + "learning_rate": 4e-05, + "loss": 4.8267, + "loss/crossentropy": 2.449679970741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22104224935173988, + "step": 7754 + }, + { + "epoch": 0.6463333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.10439046223958333, + "learning_rate": 4e-05, + "loss": 5.3427, + "loss/crossentropy": 2.2434877157211304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18906432762742043, + "step": 7756 + }, + { + "epoch": 0.6465, + "grad_norm": 5.03125, + "grad_norm_var": 0.09078369140625, + "learning_rate": 4e-05, + "loss": 4.9494, + "loss/crossentropy": 2.0335726141929626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20091040432453156, + "step": 7758 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.08088785807291667, + "learning_rate": 4e-05, + "loss": 5.1213, + "loss/crossentropy": 2.3035090565681458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087782993912697, + "step": 7760 + }, + { + "epoch": 0.6468333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.09371337890625, + "learning_rate": 4e-05, + "loss": 4.4378, + "loss/crossentropy": 1.9872702360153198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20616934821009636, + "step": 7762 + }, + { + "epoch": 0.647, + "grad_norm": 4.96875, + "grad_norm_var": 0.09544270833333333, + "learning_rate": 4e-05, + "loss": 5.3507, + "loss/crossentropy": 1.9482222869992256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16111544147133827, + "step": 7764 + }, + { + "epoch": 0.6471666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.09641927083333333, + "learning_rate": 4e-05, + "loss": 4.7896, + "loss/crossentropy": 1.0095570906996727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13183519057929516, + "step": 7766 + }, + { + "epoch": 0.6473333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.09052327473958334, + "learning_rate": 4e-05, + "loss": 4.8455, + "loss/crossentropy": 1.3340253606438637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13347483985126019, + "step": 7768 + }, + { + "epoch": 0.6475, + "grad_norm": 4.78125, + "grad_norm_var": 0.08826497395833334, + "learning_rate": 4e-05, + "loss": 5.1285, + "loss/crossentropy": 2.3872081637382507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22220738232135773, + "step": 7770 + }, + { + "epoch": 0.6476666666666666, + "grad_norm": 5.53125, + "grad_norm_var": 0.077734375, + "learning_rate": 4e-05, + "loss": 4.8552, + "loss/crossentropy": 1.511668123304844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15176920779049397, + "step": 7772 + }, + { + "epoch": 0.6478333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.074609375, + "learning_rate": 4e-05, + "loss": 4.8755, + "loss/crossentropy": 2.030316300690174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17819524556398392, + "step": 7774 + }, + { + "epoch": 0.648, + "grad_norm": 4.40625, + "grad_norm_var": 0.07812093098958334, + "learning_rate": 4e-05, + "loss": 4.4122, + "loss/crossentropy": 1.7088908702135086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17149026691913605, + "step": 7776 + }, + { + "epoch": 0.6481666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.073291015625, + "learning_rate": 4e-05, + "loss": 5.2213, + "loss/crossentropy": 2.429815411567688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040964849293232, + "step": 7778 + }, + { + "epoch": 0.6483333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.071875, + "learning_rate": 4e-05, + "loss": 4.5178, + "loss/crossentropy": 0.8023601695895195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13062872551381588, + "step": 7780 + }, + { + "epoch": 0.6485, + "grad_norm": 4.90625, + "grad_norm_var": 0.073291015625, + "learning_rate": 4e-05, + "loss": 4.7439, + "loss/crossentropy": 2.149062544107437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019280605018139, + "step": 7782 + }, + { + "epoch": 0.6486666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.07073160807291666, + "learning_rate": 4e-05, + "loss": 5.0257, + "loss/crossentropy": 1.923517182469368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21313883364200592, + "step": 7784 + }, + { + "epoch": 0.6488333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.11425374348958334, + "learning_rate": 4e-05, + "loss": 4.9594, + "loss/crossentropy": 1.1835918948054314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14380118064582348, + "step": 7786 + }, + { + "epoch": 0.649, + "grad_norm": 5.875, + "grad_norm_var": 0.157275390625, + "learning_rate": 4e-05, + "loss": 5.5351, + "loss/crossentropy": 2.353056013584137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2323654629290104, + "step": 7788 + }, + { + "epoch": 0.6491666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.16378580729166667, + "learning_rate": 4e-05, + "loss": 5.4731, + "loss/crossentropy": 2.396900922060013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21475185081362724, + "step": 7790 + }, + { + "epoch": 0.6493333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.13557535807291668, + "learning_rate": 4e-05, + "loss": 5.4009, + "loss/crossentropy": 1.9842062294483185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102402187883854, + "step": 7792 + }, + { + "epoch": 0.6495, + "grad_norm": 5.1875, + "grad_norm_var": 0.134228515625, + "learning_rate": 4e-05, + "loss": 4.9706, + "loss/crossentropy": 2.2572127282619476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21562613546848297, + "step": 7794 + }, + { + "epoch": 0.6496666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.1216796875, + "learning_rate": 4e-05, + "loss": 4.9391, + "loss/crossentropy": 1.6953379437327385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19858373701572418, + "step": 7796 + }, + { + "epoch": 0.6498333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.11487223307291666, + "learning_rate": 4e-05, + "loss": 4.6978, + "loss/crossentropy": 1.352690041065216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1570559199899435, + "step": 7798 + }, + { + "epoch": 0.65, + "grad_norm": 4.96875, + "grad_norm_var": 0.10323893229166667, + "learning_rate": 4e-05, + "loss": 4.8123, + "loss/crossentropy": 1.9550207555294037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17728200927376747, + "step": 7800 + }, + { + "epoch": 0.6501666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.08469645182291667, + "learning_rate": 4e-05, + "loss": 4.508, + "loss/crossentropy": 2.0163797438144684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042006030678749, + "step": 7802 + }, + { + "epoch": 0.6503333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.032666015625, + "learning_rate": 4e-05, + "loss": 4.978, + "loss/crossentropy": 2.063169986009598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335309386253357, + "step": 7804 + }, + { + "epoch": 0.6505, + "grad_norm": 4.625, + "grad_norm_var": 0.03357747395833333, + "learning_rate": 4e-05, + "loss": 4.9353, + "loss/crossentropy": 2.104892671108246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22495409473776817, + "step": 7806 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.05907796223958333, + "learning_rate": 4e-05, + "loss": 5.4607, + "loss/crossentropy": 2.243934750556946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479833886027336, + "step": 7808 + }, + { + "epoch": 0.6508333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.056473795572916666, + "learning_rate": 4e-05, + "loss": 4.7778, + "loss/crossentropy": 1.9320513978600502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17580274306237698, + "step": 7810 + }, + { + "epoch": 0.651, + "grad_norm": 4.59375, + "grad_norm_var": 0.06705322265625, + "learning_rate": 4e-05, + "loss": 5.3106, + "loss/crossentropy": 2.0712881311774254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777807716280222, + "step": 7812 + }, + { + "epoch": 0.6511666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.06222330729166667, + "learning_rate": 4e-05, + "loss": 4.9737, + "loss/crossentropy": 2.4067393839359283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23931296914815903, + "step": 7814 + }, + { + "epoch": 0.6513333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.061572265625, + "learning_rate": 4e-05, + "loss": 5.1568, + "loss/crossentropy": 1.9289019256830215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855563879013062, + "step": 7816 + }, + { + "epoch": 0.6515, + "grad_norm": 4.9375, + "grad_norm_var": 0.06404622395833333, + "learning_rate": 4e-05, + "loss": 4.5316, + "loss/crossentropy": 2.230655610561371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553424537181854, + "step": 7818 + }, + { + "epoch": 0.6516666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.062353515625, + "learning_rate": 4e-05, + "loss": 4.1795, + "loss/crossentropy": 2.017218291759491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18330439925193787, + "step": 7820 + }, + { + "epoch": 0.6518333333333334, + "grad_norm": 5.25, + "grad_norm_var": 0.063916015625, + "learning_rate": 4e-05, + "loss": 5.1191, + "loss/crossentropy": 1.7517078816890717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20479023829102516, + "step": 7822 + }, + { + "epoch": 0.652, + "grad_norm": 4.78125, + "grad_norm_var": 0.03424072265625, + "learning_rate": 4e-05, + "loss": 4.5937, + "loss/crossentropy": 2.2532346844673157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107340209186077, + "step": 7824 + }, + { + "epoch": 0.6521666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.03717447916666667, + "learning_rate": 4e-05, + "loss": 5.0257, + "loss/crossentropy": 1.69417604804039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16969127021729946, + "step": 7826 + }, + { + "epoch": 0.6523333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.03306884765625, + "learning_rate": 4e-05, + "loss": 4.7318, + "loss/crossentropy": 1.354017548263073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15083578042685986, + "step": 7828 + }, + { + "epoch": 0.6525, + "grad_norm": 4.34375, + "grad_norm_var": 0.0708984375, + "learning_rate": 4e-05, + "loss": 4.5279, + "loss/crossentropy": 1.8364375233650208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1852422822266817, + "step": 7830 + }, + { + "epoch": 0.6526666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.074462890625, + "learning_rate": 4e-05, + "loss": 5.042, + "loss/crossentropy": 2.439578354358673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096551414579153, + "step": 7832 + }, + { + "epoch": 0.6528333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.0798828125, + "learning_rate": 4e-05, + "loss": 5.3277, + "loss/crossentropy": 2.394729971885681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21963701769709587, + "step": 7834 + }, + { + "epoch": 0.653, + "grad_norm": 4.8125, + "grad_norm_var": 0.07838541666666667, + "learning_rate": 4e-05, + "loss": 4.3895, + "loss/crossentropy": 1.419870764017105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18464216031134129, + "step": 7836 + }, + { + "epoch": 0.6531666666666667, + "grad_norm": 5.71875, + "grad_norm_var": 0.15162760416666668, + "learning_rate": 4e-05, + "loss": 5.1471, + "loss/crossentropy": 1.7623902410268784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1748678907752037, + "step": 7838 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.16365559895833334, + "learning_rate": 4e-05, + "loss": 4.3374, + "loss/crossentropy": 1.9380313530564308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16397200338542461, + "step": 7840 + }, + { + "epoch": 0.6535, + "grad_norm": 4.84375, + "grad_norm_var": 0.15388997395833334, + "learning_rate": 4e-05, + "loss": 5.1512, + "loss/crossentropy": 1.9005918502807617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17274107970297337, + "step": 7842 + }, + { + "epoch": 0.6536666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.16404622395833332, + "learning_rate": 4e-05, + "loss": 4.846, + "loss/crossentropy": 1.4002360850572586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827119942754507, + "step": 7844 + }, + { + "epoch": 0.6538333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.12750244140625, + "learning_rate": 4e-05, + "loss": 5.2873, + "loss/crossentropy": 2.5820748805999756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22619497776031494, + "step": 7846 + }, + { + "epoch": 0.654, + "grad_norm": 5.4375, + "grad_norm_var": 0.13632405598958333, + "learning_rate": 4e-05, + "loss": 5.1209, + "loss/crossentropy": 2.4924808740615845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21206573769450188, + "step": 7848 + }, + { + "epoch": 0.6541666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.13551025390625, + "learning_rate": 4e-05, + "loss": 5.3657, + "loss/crossentropy": 2.2897271811962128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21945805847644806, + "step": 7850 + }, + { + "epoch": 0.6543333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.13984375, + "learning_rate": 4e-05, + "loss": 5.1915, + "loss/crossentropy": 2.0121906995773315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069852635264397, + "step": 7852 + }, + { + "epoch": 0.6545, + "grad_norm": 4.625, + "grad_norm_var": 0.06678059895833334, + "learning_rate": 4e-05, + "loss": 4.7018, + "loss/crossentropy": 1.3283841013908386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14112715609371662, + "step": 7854 + }, + { + "epoch": 0.6546666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.059370930989583334, + "learning_rate": 4e-05, + "loss": 4.966, + "loss/crossentropy": 2.381036549806595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22558944672346115, + "step": 7856 + }, + { + "epoch": 0.6548333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.06064046223958333, + "learning_rate": 4e-05, + "loss": 4.6811, + "loss/crossentropy": 2.217640519142151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052822895348072, + "step": 7858 + }, + { + "epoch": 0.655, + "grad_norm": 5.0, + "grad_norm_var": 0.05896809895833333, + "learning_rate": 4e-05, + "loss": 4.4663, + "loss/crossentropy": 1.9649121761322021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994537878781557, + "step": 7860 + }, + { + "epoch": 0.6551666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.058837890625, + "learning_rate": 4e-05, + "loss": 4.5006, + "loss/crossentropy": 1.4738883003592491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674226433038712, + "step": 7862 + }, + { + "epoch": 0.6553333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.06842041015625, + "learning_rate": 4e-05, + "loss": 5.0418, + "loss/crossentropy": 2.4609346985816956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20670482888817787, + "step": 7864 + }, + { + "epoch": 0.6555, + "grad_norm": 4.65625, + "grad_norm_var": 0.05831705729166667, + "learning_rate": 4e-05, + "loss": 4.8698, + "loss/crossentropy": 2.553082287311554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218223724514246, + "step": 7866 + }, + { + "epoch": 0.6556666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.06835530598958334, + "learning_rate": 4e-05, + "loss": 5.264, + "loss/crossentropy": 2.2438072860240936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2258710041642189, + "step": 7868 + }, + { + "epoch": 0.6558333333333334, + "grad_norm": 4.3125, + "grad_norm_var": 0.08420817057291667, + "learning_rate": 4e-05, + "loss": 4.5106, + "loss/crossentropy": 1.7579245939850807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17779571935534477, + "step": 7870 + }, + { + "epoch": 0.656, + "grad_norm": 4.75, + "grad_norm_var": 0.07965087890625, + "learning_rate": 4e-05, + "loss": 4.7868, + "loss/crossentropy": 2.037400543689728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17647516541182995, + "step": 7872 + }, + { + "epoch": 0.6561666666666667, + "grad_norm": 5.28125, + "grad_norm_var": 0.10354410807291667, + "learning_rate": 4e-05, + "loss": 4.9759, + "loss/crossentropy": 2.227533906698227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21963432803750038, + "step": 7874 + }, + { + "epoch": 0.6563333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.09895426432291667, + "learning_rate": 4e-05, + "loss": 4.9878, + "loss/crossentropy": 2.487546145915985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22717060893774033, + "step": 7876 + }, + { + "epoch": 0.6565, + "grad_norm": 4.53125, + "grad_norm_var": 0.10475260416666667, + "learning_rate": 4e-05, + "loss": 4.3423, + "loss/crossentropy": 1.5704541355371475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1635904349386692, + "step": 7878 + }, + { + "epoch": 0.6566666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.07662760416666667, + "learning_rate": 4e-05, + "loss": 5.1448, + "loss/crossentropy": 2.16184538602829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24196895584464073, + "step": 7880 + }, + { + "epoch": 0.6568333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.07323811848958334, + "learning_rate": 4e-05, + "loss": 5.1361, + "loss/crossentropy": 2.436331033706665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24281031265854836, + "step": 7882 + }, + { + "epoch": 0.657, + "grad_norm": 4.5, + "grad_norm_var": 0.07433268229166666, + "learning_rate": 4e-05, + "loss": 4.6996, + "loss/crossentropy": 2.312442362308502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044270522892475, + "step": 7884 + }, + { + "epoch": 0.6571666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.06145833333333333, + "learning_rate": 4e-05, + "loss": 5.1136, + "loss/crossentropy": 2.0718987584114075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22442586347460747, + "step": 7886 + }, + { + "epoch": 0.6573333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.06789957682291667, + "learning_rate": 4e-05, + "loss": 5.0928, + "loss/crossentropy": 2.368440628051758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424367070198059, + "step": 7888 + }, + { + "epoch": 0.6575, + "grad_norm": 5.65625, + "grad_norm_var": 0.08917643229166666, + "learning_rate": 4e-05, + "loss": 5.4366, + "loss/crossentropy": 1.8111486434936523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17810833640396595, + "step": 7890 + }, + { + "epoch": 0.6576666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.08948160807291666, + "learning_rate": 4e-05, + "loss": 5.411, + "loss/crossentropy": 2.2830857932567596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076493538916111, + "step": 7892 + }, + { + "epoch": 0.6578333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.08710530598958334, + "learning_rate": 4e-05, + "loss": 4.8047, + "loss/crossentropy": 1.8694885224103928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18636782839894295, + "step": 7894 + }, + { + "epoch": 0.658, + "grad_norm": 5.125, + "grad_norm_var": 0.09218343098958333, + "learning_rate": 4e-05, + "loss": 4.9802, + "loss/crossentropy": 2.084577538073063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19515875913202763, + "step": 7896 + }, + { + "epoch": 0.6581666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.09505208333333333, + "learning_rate": 4e-05, + "loss": 4.1378, + "loss/crossentropy": 2.35347381234169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936819739639759, + "step": 7898 + }, + { + "epoch": 0.6583333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.09334309895833333, + "learning_rate": 4e-05, + "loss": 5.2054, + "loss/crossentropy": 2.1593450531363487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632697872817516, + "step": 7900 + }, + { + "epoch": 0.6585, + "grad_norm": 5.3125, + "grad_norm_var": 0.10419514973958334, + "learning_rate": 4e-05, + "loss": 5.343, + "loss/crossentropy": 2.141828954219818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113342024385929, + "step": 7902 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 5.25, + "grad_norm_var": 0.11951497395833334, + "learning_rate": 4e-05, + "loss": 5.1705, + "loss/crossentropy": 2.043430596590042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20793258398771286, + "step": 7904 + }, + { + "epoch": 0.6588333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.07897135416666666, + "learning_rate": 4e-05, + "loss": 4.4168, + "loss/crossentropy": 2.251025438308716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22544211894273758, + "step": 7906 + }, + { + "epoch": 0.659, + "grad_norm": 5.1875, + "grad_norm_var": 0.08293863932291666, + "learning_rate": 4e-05, + "loss": 5.2882, + "loss/crossentropy": 1.9112261459231377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17405965924263, + "step": 7908 + }, + { + "epoch": 0.6591666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.103125, + "learning_rate": 4e-05, + "loss": 4.5415, + "loss/crossentropy": 2.078736662864685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160506434738636, + "step": 7910 + }, + { + "epoch": 0.6593333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.10813802083333333, + "learning_rate": 4e-05, + "loss": 4.7367, + "loss/crossentropy": 0.9118631333112717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10875993594527245, + "step": 7912 + }, + { + "epoch": 0.6595, + "grad_norm": 4.75, + "grad_norm_var": 0.08681233723958333, + "learning_rate": 4e-05, + "loss": 4.7211, + "loss/crossentropy": 2.090813308954239, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23957878351211548, + "step": 7914 + }, + { + "epoch": 0.6596666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.09996337890625, + "learning_rate": 4e-05, + "loss": 4.9209, + "loss/crossentropy": 1.5122576355934143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19297392293810844, + "step": 7916 + }, + { + "epoch": 0.6598333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.08917643229166666, + "learning_rate": 4e-05, + "loss": 5.0074, + "loss/crossentropy": 2.0265481024980545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22811248153448105, + "step": 7918 + }, + { + "epoch": 0.66, + "grad_norm": 4.5625, + "grad_norm_var": 0.07057291666666667, + "learning_rate": 4e-05, + "loss": 4.9616, + "loss/crossentropy": 1.9131137356162071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1730040479451418, + "step": 7920 + }, + { + "epoch": 0.6601666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.08290608723958333, + "learning_rate": 4e-05, + "loss": 4.8872, + "loss/crossentropy": 2.4343717992305756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30416465178132057, + "step": 7922 + }, + { + "epoch": 0.6603333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.09394124348958334, + "learning_rate": 4e-05, + "loss": 4.7282, + "loss/crossentropy": 1.0288232266902924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13559334725141525, + "step": 7924 + }, + { + "epoch": 0.6605, + "grad_norm": 5.03125, + "grad_norm_var": 0.08899739583333334, + "learning_rate": 4e-05, + "loss": 5.2148, + "loss/crossentropy": 2.2694281935691833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19374223798513412, + "step": 7926 + }, + { + "epoch": 0.6606666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.07346598307291667, + "learning_rate": 4e-05, + "loss": 4.9215, + "loss/crossentropy": 2.181885540485382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091890163719654, + "step": 7928 + }, + { + "epoch": 0.6608333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.07870686848958333, + "learning_rate": 4e-05, + "loss": 4.5869, + "loss/crossentropy": 1.8901971653103828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878662332892418, + "step": 7930 + }, + { + "epoch": 0.661, + "grad_norm": 5.3125, + "grad_norm_var": 0.083056640625, + "learning_rate": 4e-05, + "loss": 4.8568, + "loss/crossentropy": 1.9051894173026085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481675140559673, + "step": 7932 + }, + { + "epoch": 0.6611666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.08756103515625, + "learning_rate": 4e-05, + "loss": 4.6283, + "loss/crossentropy": 1.8722472786903381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24349390342831612, + "step": 7934 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 5.5625, + "grad_norm_var": 0.10948893229166666, + "learning_rate": 4e-05, + "loss": 4.563, + "loss/crossentropy": 1.7739543914794922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16185857728123665, + "step": 7936 + }, + { + "epoch": 0.6615, + "grad_norm": 4.6875, + "grad_norm_var": 0.10233968098958333, + "learning_rate": 4e-05, + "loss": 5.0303, + "loss/crossentropy": 2.4332188963890076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997020602226257, + "step": 7938 + }, + { + "epoch": 0.6616666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.07991129557291667, + "learning_rate": 4e-05, + "loss": 5.1234, + "loss/crossentropy": 2.0778674632310867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.176247363910079, + "step": 7940 + }, + { + "epoch": 0.6618333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.06897379557291666, + "learning_rate": 4e-05, + "loss": 5.1767, + "loss/crossentropy": 2.163965940475464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073141485452652, + "step": 7942 + }, + { + "epoch": 0.662, + "grad_norm": 4.1875, + "grad_norm_var": 0.10546468098958334, + "learning_rate": 4e-05, + "loss": 4.2894, + "loss/crossentropy": 0.9995723515748978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11586864292621613, + "step": 7944 + }, + { + "epoch": 0.6621666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.10299072265625, + "learning_rate": 4e-05, + "loss": 4.6738, + "loss/crossentropy": 2.281874358654022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23054268583655357, + "step": 7946 + }, + { + "epoch": 0.6623333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.09205322265625, + "learning_rate": 4e-05, + "loss": 4.9369, + "loss/crossentropy": 2.096975475549698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18830178305506706, + "step": 7948 + }, + { + "epoch": 0.6625, + "grad_norm": 5.0, + "grad_norm_var": 0.09138997395833333, + "learning_rate": 4e-05, + "loss": 4.9355, + "loss/crossentropy": 1.595092996954918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848563738167286, + "step": 7950 + }, + { + "epoch": 0.6626666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.06002197265625, + "learning_rate": 4e-05, + "loss": 5.0445, + "loss/crossentropy": 1.7920393347740173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1907125525176525, + "step": 7952 + }, + { + "epoch": 0.6628333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.05959879557291667, + "learning_rate": 4e-05, + "loss": 4.4764, + "loss/crossentropy": 1.4217949956655502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15368330664932728, + "step": 7954 + }, + { + "epoch": 0.663, + "grad_norm": 4.8125, + "grad_norm_var": 0.05276285807291667, + "learning_rate": 4e-05, + "loss": 5.1335, + "loss/crossentropy": 2.128646582365036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609202608466148, + "step": 7956 + }, + { + "epoch": 0.6631666666666667, + "grad_norm": 4.1875, + "grad_norm_var": 0.07659098307291666, + "learning_rate": 4e-05, + "loss": 4.6351, + "loss/crossentropy": 1.244727723300457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13210402987897396, + "step": 7958 + }, + { + "epoch": 0.6633333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.05608317057291667, + "learning_rate": 4e-05, + "loss": 4.7433, + "loss/crossentropy": 1.747454211115837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16521551832556725, + "step": 7960 + }, + { + "epoch": 0.6635, + "grad_norm": 5.03125, + "grad_norm_var": 0.05468343098958333, + "learning_rate": 4e-05, + "loss": 4.939, + "loss/crossentropy": 2.3972195982933044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21212521940469742, + "step": 7962 + }, + { + "epoch": 0.6636666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.050244140625, + "learning_rate": 4e-05, + "loss": 4.5736, + "loss/crossentropy": 2.157024621963501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839892379939556, + "step": 7964 + }, + { + "epoch": 0.6638333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.04920247395833333, + "learning_rate": 4e-05, + "loss": 4.3667, + "loss/crossentropy": 1.5357392206788063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14107412099838257, + "step": 7966 + }, + { + "epoch": 0.664, + "grad_norm": 4.40625, + "grad_norm_var": 0.07037760416666666, + "learning_rate": 4e-05, + "loss": 4.8995, + "loss/crossentropy": 2.07469192892313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992726493626833, + "step": 7968 + }, + { + "epoch": 0.6641666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.07597249348958333, + "learning_rate": 4e-05, + "loss": 5.0096, + "loss/crossentropy": 1.7797765955328941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16863209381699562, + "step": 7970 + }, + { + "epoch": 0.6643333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.08508707682291666, + "learning_rate": 4e-05, + "loss": 4.9404, + "loss/crossentropy": 1.9922729581594467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18705208972096443, + "step": 7972 + }, + { + "epoch": 0.6645, + "grad_norm": 4.875, + "grad_norm_var": 0.054427083333333334, + "learning_rate": 4e-05, + "loss": 4.8838, + "loss/crossentropy": 2.2012872993946075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21382101997733116, + "step": 7974 + }, + { + "epoch": 0.6646666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.04801025390625, + "learning_rate": 4e-05, + "loss": 4.9927, + "loss/crossentropy": 1.9671935513615608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18664169497787952, + "step": 7976 + }, + { + "epoch": 0.6648333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.0658203125, + "learning_rate": 4e-05, + "loss": 4.6905, + "loss/crossentropy": 1.8137053772807121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16209018975496292, + "step": 7978 + }, + { + "epoch": 0.665, + "grad_norm": 4.84375, + "grad_norm_var": 0.063134765625, + "learning_rate": 4e-05, + "loss": 4.7317, + "loss/crossentropy": 2.605897605419159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145151011645794, + "step": 7980 + }, + { + "epoch": 0.6651666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.06926676432291666, + "learning_rate": 4e-05, + "loss": 4.7387, + "loss/crossentropy": 1.8791739642620087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18151278793811798, + "step": 7982 + }, + { + "epoch": 0.6653333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 4e-05, + "loss": 4.7535, + "loss/crossentropy": 2.591217875480652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23360199108719826, + "step": 7984 + }, + { + "epoch": 0.6655, + "grad_norm": 5.0, + "grad_norm_var": 0.07307535807291667, + "learning_rate": 4e-05, + "loss": 5.5989, + "loss/crossentropy": 2.3981366753578186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21705374494194984, + "step": 7986 + }, + { + "epoch": 0.6656666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.081884765625, + "learning_rate": 4e-05, + "loss": 4.7857, + "loss/crossentropy": 2.2135126292705536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088680252432823, + "step": 7988 + }, + { + "epoch": 0.6658333333333334, + "grad_norm": 5.53125, + "grad_norm_var": 0.10944010416666666, + "learning_rate": 4e-05, + "loss": 5.1127, + "loss/crossentropy": 2.6215781569480896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20877355709671974, + "step": 7990 + }, + { + "epoch": 0.666, + "grad_norm": 5.09375, + "grad_norm_var": 0.11129150390625, + "learning_rate": 4e-05, + "loss": 4.6155, + "loss/crossentropy": 1.5245660692453384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1763434261083603, + "step": 7992 + }, + { + "epoch": 0.6661666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.08307291666666666, + "learning_rate": 4e-05, + "loss": 5.0979, + "loss/crossentropy": 1.9757955074310303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18706656992435455, + "step": 7994 + }, + { + "epoch": 0.6663333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.07701822916666666, + "learning_rate": 4e-05, + "loss": 5.22, + "loss/crossentropy": 2.126909226179123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23862241953611374, + "step": 7996 + }, + { + "epoch": 0.6665, + "grad_norm": 4.84375, + "grad_norm_var": 0.075244140625, + "learning_rate": 4e-05, + "loss": 4.9789, + "loss/crossentropy": 2.364075005054474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21161803603172302, + "step": 7998 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.06415608723958334, + "learning_rate": 4e-05, + "loss": 4.4117, + "loss/crossentropy": 2.0129463002085686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17588723078370094, + "step": 8000 + }, + { + "epoch": 0.6668333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.05810139973958333, + "learning_rate": 3.999998026079526e-05, + "loss": 4.7781, + "loss/crossentropy": 1.8196282386779785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22529863938689232, + "step": 8002 + }, + { + "epoch": 0.667, + "grad_norm": 4.875, + "grad_norm_var": 0.0544921875, + "learning_rate": 3.9999921043229736e-05, + "loss": 4.6482, + "loss/crossentropy": 2.20221546292305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19770921766757965, + "step": 8004 + }, + { + "epoch": 0.6671666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.029410807291666667, + "learning_rate": 3.9999822347449543e-05, + "loss": 5.1931, + "loss/crossentropy": 1.919562578201294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418702088296413, + "step": 8006 + }, + { + "epoch": 0.6673333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.02633056640625, + "learning_rate": 3.99996841736982e-05, + "loss": 5.1969, + "loss/crossentropy": 2.100528210401535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1811746470630169, + "step": 8008 + }, + { + "epoch": 0.6675, + "grad_norm": 4.96875, + "grad_norm_var": 0.02330322265625, + "learning_rate": 3.999950652231664e-05, + "loss": 4.5698, + "loss/crossentropy": 1.828367918729782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19560833647847176, + "step": 8010 + }, + { + "epoch": 0.6676666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 1940.4690714518229, + "learning_rate": 3.99992893937432e-05, + "loss": 4.8554, + "loss/crossentropy": 2.049312300980091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17662812024354935, + "step": 8012 + }, + { + "epoch": 0.6678333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 1939.649149576823, + "learning_rate": 3.9999032788513625e-05, + "loss": 5.1692, + "loss/crossentropy": 2.4652374386787415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22743260487914085, + "step": 8014 + }, + { + "epoch": 0.668, + "grad_norm": 4.46875, + "grad_norm_var": 1940.2542317708333, + "learning_rate": 3.999873670726106e-05, + "loss": 4.8301, + "loss/crossentropy": 1.8886771276593208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19605330377817154, + "step": 8016 + }, + { + "epoch": 0.6681666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 1938.9496704101562, + "learning_rate": 3.999840115071606e-05, + "loss": 5.126, + "loss/crossentropy": 2.337659776210785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21413381025195122, + "step": 8018 + }, + { + "epoch": 0.6683333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 1938.8692993164063, + "learning_rate": 3.9998026119706576e-05, + "loss": 4.5696, + "loss/crossentropy": 2.1056962609291077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247220054268837, + "step": 8020 + }, + { + "epoch": 0.6685, + "grad_norm": 5.25, + "grad_norm_var": 1937.8184733072917, + "learning_rate": 3.999761161515795e-05, + "loss": 5.6617, + "loss/crossentropy": 2.4179972410202026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223285723477602, + "step": 8022 + }, + { + "epoch": 0.6686666666666666, + "grad_norm": 5.25, + "grad_norm_var": 1937.643603515625, + "learning_rate": 3.9997157638092944e-05, + "loss": 5.005, + "loss/crossentropy": 1.895853579044342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043449804186821, + "step": 8024 + }, + { + "epoch": 0.6688333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 1937.6156860351562, + "learning_rate": 3.999666418963171e-05, + "loss": 4.7058, + "loss/crossentropy": 2.025197595357895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19659354910254478, + "step": 8026 + }, + { + "epoch": 0.669, + "grad_norm": 4.8125, + "grad_norm_var": 0.107666015625, + "learning_rate": 3.999613127099175e-05, + "loss": 4.2922, + "loss/crossentropy": 1.6821075975894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879300493746996, + "step": 8028 + }, + { + "epoch": 0.6691666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.10526936848958333, + "learning_rate": 3.999555888348801e-05, + "loss": 4.9131, + "loss/crossentropy": 2.2622806429862976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22173702344298363, + "step": 8030 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.092431640625, + "learning_rate": 3.99949470285328e-05, + "loss": 5.4962, + "loss/crossentropy": 2.7419689893722534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21366914361715317, + "step": 8032 + }, + { + "epoch": 0.6695, + "grad_norm": 6.84375, + "grad_norm_var": 0.3025390625, + "learning_rate": 3.999429570763581e-05, + "loss": 5.3351, + "loss/crossentropy": 2.4165670573711395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20715122669935226, + "step": 8034 + }, + { + "epoch": 0.6696666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.27447509765625, + "learning_rate": 3.999360492240411e-05, + "loss": 5.1791, + "loss/crossentropy": 1.8621023744344711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17758541740477085, + "step": 8036 + }, + { + "epoch": 0.6698333333333333, + "grad_norm": 5.96875, + "grad_norm_var": 0.32437744140625, + "learning_rate": 3.999287467454214e-05, + "loss": 5.3983, + "loss/crossentropy": 1.1967885345220566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17341040447354317, + "step": 8038 + }, + { + "epoch": 0.67, + "grad_norm": 5.09375, + "grad_norm_var": 0.3204264322916667, + "learning_rate": 3.999210496585171e-05, + "loss": 5.1228, + "loss/crossentropy": 1.89522323012352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980042066425085, + "step": 8040 + }, + { + "epoch": 0.6701666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.30373942057291664, + "learning_rate": 3.9991295798232e-05, + "loss": 5.0264, + "loss/crossentropy": 1.2332193851470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1300823837518692, + "step": 8042 + }, + { + "epoch": 0.6703333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.4376953125, + "learning_rate": 3.999044717367957e-05, + "loss": 4.7728, + "loss/crossentropy": 2.335726737976074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21574588492512703, + "step": 8044 + }, + { + "epoch": 0.6705, + "grad_norm": 5.125, + "grad_norm_var": 0.4343058268229167, + "learning_rate": 3.99895590942883e-05, + "loss": 4.919, + "loss/crossentropy": 2.3052841424942017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2212902568280697, + "step": 8046 + }, + { + "epoch": 0.6706666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.4610514322916667, + "learning_rate": 3.9988631562249435e-05, + "loss": 4.6745, + "loss/crossentropy": 2.4768862426280975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305079013109207, + "step": 8048 + }, + { + "epoch": 0.6708333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.28905843098958334, + "learning_rate": 3.9987664579851574e-05, + "loss": 4.3145, + "loss/crossentropy": 1.7946585342288017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17041044309735298, + "step": 8050 + }, + { + "epoch": 0.671, + "grad_norm": 5.03125, + "grad_norm_var": 0.2899576822916667, + "learning_rate": 3.998665814948065e-05, + "loss": 5.0085, + "loss/crossentropy": 2.1682648360729218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21787675842642784, + "step": 8052 + }, + { + "epoch": 0.6711666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.23605143229166667, + "learning_rate": 3.9985612273619924e-05, + "loss": 4.1648, + "loss/crossentropy": 0.8689647540450096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1191219910979271, + "step": 8054 + }, + { + "epoch": 0.6713333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.2359375, + "learning_rate": 3.9984526954850003e-05, + "loss": 4.8656, + "loss/crossentropy": 1.978205069899559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915741264820099, + "step": 8056 + }, + { + "epoch": 0.6715, + "grad_norm": 4.78125, + "grad_norm_var": 0.2380859375, + "learning_rate": 3.9983402195848796e-05, + "loss": 4.519, + "loss/crossentropy": 1.9606445729732513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727950543165207, + "step": 8058 + }, + { + "epoch": 0.6716666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.051285807291666666, + "learning_rate": 3.998223799939153e-05, + "loss": 4.43, + "loss/crossentropy": 1.5983156114816666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1602168083190918, + "step": 8060 + }, + { + "epoch": 0.6718333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.06291910807291666, + "learning_rate": 3.9981034368350744e-05, + "loss": 5.0673, + "loss/crossentropy": 2.5904553532600403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175530567765236, + "step": 8062 + }, + { + "epoch": 0.672, + "grad_norm": 5.34375, + "grad_norm_var": 0.09251302083333333, + "learning_rate": 3.997979130569628e-05, + "loss": 5.1659, + "loss/crossentropy": 2.0036058127880096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17185713909566402, + "step": 8064 + }, + { + "epoch": 0.6721666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.08782145182291666, + "learning_rate": 3.9978508814495287e-05, + "loss": 5.0144, + "loss/crossentropy": 1.981281191110611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.172686118632555, + "step": 8066 + }, + { + "epoch": 0.6723333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.08162434895833333, + "learning_rate": 3.9977186897912166e-05, + "loss": 4.8809, + "loss/crossentropy": 1.9285836815834045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19455789402127266, + "step": 8068 + }, + { + "epoch": 0.6725, + "grad_norm": 4.375, + "grad_norm_var": 0.09081624348958334, + "learning_rate": 3.997582555920861e-05, + "loss": 4.3465, + "loss/crossentropy": 0.8334982395172119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1363061834126711, + "step": 8070 + }, + { + "epoch": 0.6726666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.1041015625, + "learning_rate": 3.997442480174361e-05, + "loss": 5.0726, + "loss/crossentropy": 1.5748215094208717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17971625551581383, + "step": 8072 + }, + { + "epoch": 0.6728333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.10546468098958334, + "learning_rate": 3.997298462897336e-05, + "loss": 4.7462, + "loss/crossentropy": 1.3154399320483208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15049860626459122, + "step": 8074 + }, + { + "epoch": 0.673, + "grad_norm": 4.9375, + "grad_norm_var": 0.08870035807291667, + "learning_rate": 3.99715050444514e-05, + "loss": 5.0491, + "loss/crossentropy": 0.9168932139873505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11337370611727238, + "step": 8076 + }, + { + "epoch": 0.6731666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.07183837890625, + "learning_rate": 3.9969986051828394e-05, + "loss": 4.5893, + "loss/crossentropy": 2.0407353341579437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022136151790619, + "step": 8078 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.04529622395833333, + "learning_rate": 3.996842765485235e-05, + "loss": 5.294, + "loss/crossentropy": 2.0593449771404266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1811201088130474, + "step": 8080 + }, + { + "epoch": 0.6735, + "grad_norm": 5.03125, + "grad_norm_var": 0.0453125, + "learning_rate": 3.9966829857368434e-05, + "loss": 4.9266, + "loss/crossentropy": 1.1097619906067848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1245882660150528, + "step": 8082 + }, + { + "epoch": 0.6736666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.052046712239583334, + "learning_rate": 3.996519266331907e-05, + "loss": 4.6228, + "loss/crossentropy": 1.8012422546744347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16311753541231155, + "step": 8084 + }, + { + "epoch": 0.6738333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.042867024739583336, + "learning_rate": 3.9963516076743856e-05, + "loss": 4.9789, + "loss/crossentropy": 2.0890884697437286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20314529910683632, + "step": 8086 + }, + { + "epoch": 0.674, + "grad_norm": 4.8125, + "grad_norm_var": 0.03216145833333333, + "learning_rate": 3.996180010177961e-05, + "loss": 4.9535, + "loss/crossentropy": 1.5582296922802925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15331477485597134, + "step": 8088 + }, + { + "epoch": 0.6741666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.02750244140625, + "learning_rate": 3.996004474266033e-05, + "loss": 5.154, + "loss/crossentropy": 2.3426169753074646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202622890472412, + "step": 8090 + }, + { + "epoch": 0.6743333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 3.9958250003717184e-05, + "loss": 5.4054, + "loss/crossentropy": 2.036056786775589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18563580885529518, + "step": 8092 + }, + { + "epoch": 0.6745, + "grad_norm": 4.6875, + "grad_norm_var": 0.049723307291666664, + "learning_rate": 3.995641588937852e-05, + "loss": 4.4591, + "loss/crossentropy": 1.8210117146372795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20190994441509247, + "step": 8094 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.051005045572916664, + "learning_rate": 3.995454240416982e-05, + "loss": 5.1467, + "loss/crossentropy": 1.740770123898983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19278324209153652, + "step": 8096 + }, + { + "epoch": 0.6748333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.05422770182291667, + "learning_rate": 3.9952629552713745e-05, + "loss": 4.8513, + "loss/crossentropy": 2.1134003698825836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23501833155751228, + "step": 8098 + }, + { + "epoch": 0.675, + "grad_norm": 4.75, + "grad_norm_var": 0.03857014973958333, + "learning_rate": 3.995067733973005e-05, + "loss": 5.3558, + "loss/crossentropy": 1.907908834517002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19258636608719826, + "step": 8100 + }, + { + "epoch": 0.6751666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.03892822265625, + "learning_rate": 3.994868577003563e-05, + "loss": 4.6607, + "loss/crossentropy": 1.755495123565197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17224089056253433, + "step": 8102 + }, + { + "epoch": 0.6753333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.0474609375, + "learning_rate": 3.9946654848544477e-05, + "loss": 4.3648, + "loss/crossentropy": 1.9235477447509766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895141527056694, + "step": 8104 + }, + { + "epoch": 0.6755, + "grad_norm": 4.625, + "grad_norm_var": 0.051106770833333336, + "learning_rate": 3.9944584580267706e-05, + "loss": 4.8674, + "loss/crossentropy": 2.156323105096817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22421907261013985, + "step": 8106 + }, + { + "epoch": 0.6756666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.047770182291666664, + "learning_rate": 3.9942474970313485e-05, + "loss": 4.2895, + "loss/crossentropy": 2.0829486325383186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18314629420638084, + "step": 8108 + }, + { + "epoch": 0.6758333333333333, + "grad_norm": 4.15625, + "grad_norm_var": 0.05015869140625, + "learning_rate": 3.994032602388706e-05, + "loss": 4.8896, + "loss/crossentropy": 1.6454570293426514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16752009466290474, + "step": 8110 + }, + { + "epoch": 0.676, + "grad_norm": 4.75, + "grad_norm_var": 0.043603515625, + "learning_rate": 3.993813774629076e-05, + "loss": 4.8309, + "loss/crossentropy": 2.204928934574127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959094911813736, + "step": 8112 + }, + { + "epoch": 0.6761666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.052978515625, + "learning_rate": 3.9935910142923934e-05, + "loss": 4.8202, + "loss/crossentropy": 1.6364878937602043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1698533445596695, + "step": 8114 + }, + { + "epoch": 0.6763333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.04820556640625, + "learning_rate": 3.993364321928298e-05, + "loss": 4.8303, + "loss/crossentropy": 1.7198282107710838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16924752667546272, + "step": 8116 + }, + { + "epoch": 0.6765, + "grad_norm": 4.5, + "grad_norm_var": 0.050093587239583334, + "learning_rate": 3.993133698096129e-05, + "loss": 4.3707, + "loss/crossentropy": 1.4806988760828972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13833208149299026, + "step": 8118 + }, + { + "epoch": 0.6766666666666666, + "grad_norm": 4.34375, + "grad_norm_var": 0.09440104166666667, + "learning_rate": 3.9928991433649284e-05, + "loss": 4.9286, + "loss/crossentropy": 2.1212473809719086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18642381206154823, + "step": 8120 + }, + { + "epoch": 0.6768333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.09680582682291666, + "learning_rate": 3.992660658313438e-05, + "loss": 4.7489, + "loss/crossentropy": 1.7608193159103394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17317914962768555, + "step": 8122 + }, + { + "epoch": 0.677, + "grad_norm": 4.5, + "grad_norm_var": 0.10084635416666667, + "learning_rate": 3.992418243530094e-05, + "loss": 4.4682, + "loss/crossentropy": 1.4573740735650063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14598331600427628, + "step": 8124 + }, + { + "epoch": 0.6771666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.081494140625, + "learning_rate": 3.9921718996130326e-05, + "loss": 4.8036, + "loss/crossentropy": 2.064685195684433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20185653865337372, + "step": 8126 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.10362955729166666, + "learning_rate": 3.991921627170081e-05, + "loss": 4.5552, + "loss/crossentropy": 1.5148289203643799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1409279704093933, + "step": 8128 + }, + { + "epoch": 0.6775, + "grad_norm": 4.8125, + "grad_norm_var": 0.10191650390625, + "learning_rate": 3.9916674268187625e-05, + "loss": 5.0554, + "loss/crossentropy": 2.1727925539016724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20459099113941193, + "step": 8130 + }, + { + "epoch": 0.6776666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.10625, + "learning_rate": 3.991409299186292e-05, + "loss": 4.9051, + "loss/crossentropy": 1.7988258972764015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256174892187119, + "step": 8132 + }, + { + "epoch": 0.6778333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.10076497395833334, + "learning_rate": 3.9911472449095726e-05, + "loss": 5.0322, + "loss/crossentropy": 2.352916330099106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223893664777279, + "step": 8134 + }, + { + "epoch": 0.678, + "grad_norm": 4.6875, + "grad_norm_var": 0.058394368489583334, + "learning_rate": 3.990881264635198e-05, + "loss": 5.0109, + "loss/crossentropy": 1.9112081602215767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446013167500496, + "step": 8136 + }, + { + "epoch": 0.6781666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.05572509765625, + "learning_rate": 3.990611359019449e-05, + "loss": 5.1502, + "loss/crossentropy": 1.8885821998119354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19072378613054752, + "step": 8138 + }, + { + "epoch": 0.6783333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.05416259765625, + "learning_rate": 3.9903375287282886e-05, + "loss": 4.6403, + "loss/crossentropy": 1.7756406664848328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463722944259644, + "step": 8140 + }, + { + "epoch": 0.6785, + "grad_norm": 4.90625, + "grad_norm_var": 0.05475260416666667, + "learning_rate": 3.990059774437366e-05, + "loss": 4.6098, + "loss/crossentropy": 1.41706994920969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16518711298704147, + "step": 8142 + }, + { + "epoch": 0.6786666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.04700113932291667, + "learning_rate": 3.989778096832014e-05, + "loss": 4.5598, + "loss/crossentropy": 1.7809503972530365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18425824865698814, + "step": 8144 + }, + { + "epoch": 0.6788333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.043192545572916664, + "learning_rate": 3.989492496607243e-05, + "loss": 4.8872, + "loss/crossentropy": 1.4895486384630203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15744570270180702, + "step": 8146 + }, + { + "epoch": 0.679, + "grad_norm": 4.8125, + "grad_norm_var": 0.03967692057291667, + "learning_rate": 3.989202974467744e-05, + "loss": 4.9619, + "loss/crossentropy": 2.420316845178604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21129243820905685, + "step": 8148 + }, + { + "epoch": 0.6791666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.04114176432291667, + "learning_rate": 3.988909531127883e-05, + "loss": 4.6627, + "loss/crossentropy": 1.2393221259117126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15679911896586418, + "step": 8150 + }, + { + "epoch": 0.6793333333333333, + "grad_norm": 5.28125, + "grad_norm_var": 0.05513916015625, + "learning_rate": 3.988612167311703e-05, + "loss": 5.4727, + "loss/crossentropy": 1.9080857932567596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22484339028596878, + "step": 8152 + }, + { + "epoch": 0.6795, + "grad_norm": 4.8125, + "grad_norm_var": 0.060009765625, + "learning_rate": 3.988310883752918e-05, + "loss": 5.0678, + "loss/crossentropy": 2.225371241569519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21629876643419266, + "step": 8154 + }, + { + "epoch": 0.6796666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.055013020833333336, + "learning_rate": 3.9880056811949186e-05, + "loss": 4.8673, + "loss/crossentropy": 1.2217218354344368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14047273807227612, + "step": 8156 + }, + { + "epoch": 0.6798333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.05299072265625, + "learning_rate": 3.9876965603907585e-05, + "loss": 4.8334, + "loss/crossentropy": 1.7158519104123116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16879184916615486, + "step": 8158 + }, + { + "epoch": 0.68, + "grad_norm": 4.78125, + "grad_norm_var": 0.042561848958333336, + "learning_rate": 3.987383522103165e-05, + "loss": 4.9505, + "loss/crossentropy": 2.0144409984350204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20102826319634914, + "step": 8160 + }, + { + "epoch": 0.6801666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.0390625, + "learning_rate": 3.987066567104528e-05, + "loss": 5.6142, + "loss/crossentropy": 2.599462568759918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125120908021927, + "step": 8162 + }, + { + "epoch": 0.6803333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.05136311848958333, + "learning_rate": 3.986745696176901e-05, + "loss": 5.547, + "loss/crossentropy": 2.1270923539996147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18128559738397598, + "step": 8164 + }, + { + "epoch": 0.6805, + "grad_norm": 5.21875, + "grad_norm_var": 0.053759765625, + "learning_rate": 3.986420910112003e-05, + "loss": 5.038, + "loss/crossentropy": 1.7307285517454147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1786797009408474, + "step": 8166 + }, + { + "epoch": 0.6806666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.06728108723958333, + "learning_rate": 3.986092209711211e-05, + "loss": 4.7684, + "loss/crossentropy": 1.9707505255937576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1803161595016718, + "step": 8168 + }, + { + "epoch": 0.6808333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.08736572265625, + "learning_rate": 3.98575959578556e-05, + "loss": 3.7211, + "loss/crossentropy": 1.5715525671839714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16525976173579693, + "step": 8170 + }, + { + "epoch": 0.681, + "grad_norm": 5.15625, + "grad_norm_var": 0.09810791015625, + "learning_rate": 3.9854230691557425e-05, + "loss": 4.322, + "loss/crossentropy": 1.225288264453411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12830936163663864, + "step": 8172 + }, + { + "epoch": 0.6811666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.114306640625, + "learning_rate": 3.9850826306521036e-05, + "loss": 4.7835, + "loss/crossentropy": 1.218340426683426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12600534223020077, + "step": 8174 + }, + { + "epoch": 0.6813333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.11770426432291667, + "learning_rate": 3.984738281114642e-05, + "loss": 5.2156, + "loss/crossentropy": 2.4113438725471497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21955899521708488, + "step": 8176 + }, + { + "epoch": 0.6815, + "grad_norm": 4.46875, + "grad_norm_var": 0.13287760416666666, + "learning_rate": 3.984390021393007e-05, + "loss": 5.0152, + "loss/crossentropy": 1.543459229171276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1737987082451582, + "step": 8178 + }, + { + "epoch": 0.6816666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.13944905598958332, + "learning_rate": 3.9840378523464924e-05, + "loss": 4.7537, + "loss/crossentropy": 1.7875322103500366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1727149300277233, + "step": 8180 + }, + { + "epoch": 0.6818333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.12537434895833333, + "learning_rate": 3.9836817748440424e-05, + "loss": 4.9742, + "loss/crossentropy": 2.043481595814228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18028966709971428, + "step": 8182 + }, + { + "epoch": 0.682, + "grad_norm": 4.78125, + "grad_norm_var": 0.11300455729166667, + "learning_rate": 3.983321789764242e-05, + "loss": 4.878, + "loss/crossentropy": 1.3496059402823448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1595297921448946, + "step": 8184 + }, + { + "epoch": 0.6821666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.10204671223958334, + "learning_rate": 3.9829578979953195e-05, + "loss": 5.0171, + "loss/crossentropy": 2.472753405570984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23509525135159492, + "step": 8186 + }, + { + "epoch": 0.6823333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.08912353515625, + "learning_rate": 3.982590100435139e-05, + "loss": 5.0676, + "loss/crossentropy": 2.0272571817040443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17441200464963913, + "step": 8188 + }, + { + "epoch": 0.6825, + "grad_norm": 5.15625, + "grad_norm_var": 0.07385660807291666, + "learning_rate": 3.982218397991208e-05, + "loss": 5.6034, + "loss/crossentropy": 1.9016704335808754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17647476121783257, + "step": 8190 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.0853515625, + "learning_rate": 3.981842791580663e-05, + "loss": 4.9338, + "loss/crossentropy": 2.406555950641632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20240669697523117, + "step": 8192 + }, + { + "epoch": 0.6828333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.07428385416666666, + "learning_rate": 3.981463282130277e-05, + "loss": 5.2944, + "loss/crossentropy": 2.195953816175461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23554722219705582, + "step": 8194 + }, + { + "epoch": 0.683, + "grad_norm": 4.84375, + "grad_norm_var": 0.05347900390625, + "learning_rate": 3.98107987057645e-05, + "loss": 4.6559, + "loss/crossentropy": 2.159575939178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401450619101524, + "step": 8196 + }, + { + "epoch": 0.6831666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.05390218098958333, + "learning_rate": 3.9806925578652125e-05, + "loss": 5.1964, + "loss/crossentropy": 1.8091963231563568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16383634880185127, + "step": 8198 + }, + { + "epoch": 0.6833333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.05966389973958333, + "learning_rate": 3.980301344952221e-05, + "loss": 4.4383, + "loss/crossentropy": 1.9880341216921806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19586319476366043, + "step": 8200 + }, + { + "epoch": 0.6835, + "grad_norm": 4.625, + "grad_norm_var": 0.05774739583333333, + "learning_rate": 3.979906232802754e-05, + "loss": 5.3086, + "loss/crossentropy": 2.4634940028190613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22515181452035904, + "step": 8202 + }, + { + "epoch": 0.6836666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.05872395833333333, + "learning_rate": 3.9795072223917115e-05, + "loss": 4.8084, + "loss/crossentropy": 1.6714323610067368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18010685592889786, + "step": 8204 + }, + { + "epoch": 0.6838333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.056441243489583334, + "learning_rate": 3.9791043147036114e-05, + "loss": 4.746, + "loss/crossentropy": 2.0559864789247513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17964443936944008, + "step": 8206 + }, + { + "epoch": 0.684, + "grad_norm": 5.0625, + "grad_norm_var": 0.04459228515625, + "learning_rate": 3.978697510732589e-05, + "loss": 5.1632, + "loss/crossentropy": 2.4614692330360413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225230485200882, + "step": 8208 + }, + { + "epoch": 0.6841666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.045796712239583336, + "learning_rate": 3.9782868114823936e-05, + "loss": 4.6625, + "loss/crossentropy": 1.9659627079963684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717267900705338, + "step": 8210 + }, + { + "epoch": 0.6843333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.047749837239583336, + "learning_rate": 3.9778722179663826e-05, + "loss": 5.2607, + "loss/crossentropy": 2.311997652053833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20555757358670235, + "step": 8212 + }, + { + "epoch": 0.6845, + "grad_norm": 4.9375, + "grad_norm_var": 0.046708170572916666, + "learning_rate": 3.9774537312075254e-05, + "loss": 4.8792, + "loss/crossentropy": 1.5093220099806786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14826051332056522, + "step": 8214 + }, + { + "epoch": 0.6846666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.04023030598958333, + "learning_rate": 3.977031352238397e-05, + "loss": 4.7302, + "loss/crossentropy": 1.310145728290081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15178019180893898, + "step": 8216 + }, + { + "epoch": 0.6848333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.05331624348958333, + "learning_rate": 3.976605082101175e-05, + "loss": 4.8753, + "loss/crossentropy": 1.9999983832240105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20402120612561703, + "step": 8218 + }, + { + "epoch": 0.685, + "grad_norm": 4.625, + "grad_norm_var": 0.05364176432291667, + "learning_rate": 3.976174921847639e-05, + "loss": 5.1049, + "loss/crossentropy": 2.197313755750656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19855907186865807, + "step": 8220 + }, + { + "epoch": 0.6851666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 3.975740872539166e-05, + "loss": 4.9291, + "loss/crossentropy": 2.1756413877010345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19770053774118423, + "step": 8222 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.04763997395833333, + "learning_rate": 3.975302935246729e-05, + "loss": 5.3972, + "loss/crossentropy": 2.2892523109912872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869375966489315, + "step": 8224 + }, + { + "epoch": 0.6855, + "grad_norm": 4.90625, + "grad_norm_var": 0.04488525390625, + "learning_rate": 3.9748611110508964e-05, + "loss": 5.0387, + "loss/crossentropy": 2.574119806289673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21818042173981667, + "step": 8226 + }, + { + "epoch": 0.6856666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.033984375, + "learning_rate": 3.974415401041824e-05, + "loss": 5.1188, + "loss/crossentropy": 2.054434657096863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17497070133686066, + "step": 8228 + }, + { + "epoch": 0.6858333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.030192057291666668, + "learning_rate": 3.9739658063192575e-05, + "loss": 4.8558, + "loss/crossentropy": 1.874234914779663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1658182181417942, + "step": 8230 + }, + { + "epoch": 0.686, + "grad_norm": 4.65625, + "grad_norm_var": 0.04016927083333333, + "learning_rate": 3.973512327992528e-05, + "loss": 5.2414, + "loss/crossentropy": 2.5035698115825653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155991941690445, + "step": 8232 + }, + { + "epoch": 0.6861666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.02890625, + "learning_rate": 3.973054967180547e-05, + "loss": 4.7098, + "loss/crossentropy": 1.3449292182922363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23684380762279034, + "step": 8234 + }, + { + "epoch": 0.6863333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.03860677083333333, + "learning_rate": 3.972593725011807e-05, + "loss": 4.3327, + "loss/crossentropy": 0.3794103041291237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.08249375969171524, + "step": 8236 + }, + { + "epoch": 0.6865, + "grad_norm": 5.25, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 3.972128602624378e-05, + "loss": 5.2133, + "loss/crossentropy": 1.9596935585141182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16619658470153809, + "step": 8238 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.1208984375, + "learning_rate": 3.971659601165903e-05, + "loss": 5.5184, + "loss/crossentropy": 2.493978977203369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21593138948082924, + "step": 8240 + }, + { + "epoch": 0.6868333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.1208984375, + "learning_rate": 3.971186721793595e-05, + "loss": 4.7815, + "loss/crossentropy": 1.8379368782043457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18796057254076004, + "step": 8242 + }, + { + "epoch": 0.687, + "grad_norm": 4.8125, + "grad_norm_var": 0.12193603515625, + "learning_rate": 3.970709965674239e-05, + "loss": 5.2517, + "loss/crossentropy": 2.4483371675014496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22528450563549995, + "step": 8244 + }, + { + "epoch": 0.6871666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.12923177083333334, + "learning_rate": 3.970229333984182e-05, + "loss": 5.2067, + "loss/crossentropy": 1.752450205385685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1706908978521824, + "step": 8246 + }, + { + "epoch": 0.6873333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.12902018229166667, + "learning_rate": 3.9697448279093346e-05, + "loss": 5.0662, + "loss/crossentropy": 1.8285248652100563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17460808157920837, + "step": 8248 + }, + { + "epoch": 0.6875, + "grad_norm": 4.625, + "grad_norm_var": 0.13079020182291667, + "learning_rate": 3.969256448645169e-05, + "loss": 5.042, + "loss/crossentropy": 1.361061304807663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13284870982170105, + "step": 8250 + }, + { + "epoch": 0.6876666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.11708577473958333, + "learning_rate": 3.968764197396712e-05, + "loss": 5.0501, + "loss/crossentropy": 2.5378739833831787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144315354526043, + "step": 8252 + }, + { + "epoch": 0.6878333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.10657145182291666, + "learning_rate": 3.968268075378543e-05, + "loss": 4.9037, + "loss/crossentropy": 2.0044125020504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190082147717476, + "step": 8254 + }, + { + "epoch": 0.688, + "grad_norm": 4.625, + "grad_norm_var": 0.03590087890625, + "learning_rate": 3.967768083814796e-05, + "loss": 4.8427, + "loss/crossentropy": 2.0641330182552338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22053687274456024, + "step": 8256 + }, + { + "epoch": 0.6881666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.052734375, + "learning_rate": 3.9672642239391486e-05, + "loss": 4.4691, + "loss/crossentropy": 2.459451824426651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19304845109581947, + "step": 8258 + }, + { + "epoch": 0.6883333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.056315104166666664, + "learning_rate": 3.966756496994825e-05, + "loss": 4.9431, + "loss/crossentropy": 1.7906870916485786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16057110950350761, + "step": 8260 + }, + { + "epoch": 0.6885, + "grad_norm": 5.1875, + "grad_norm_var": 0.06370035807291667, + "learning_rate": 3.966244904234594e-05, + "loss": 5.1878, + "loss/crossentropy": 1.838362380862236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1645890176296234, + "step": 8262 + }, + { + "epoch": 0.6886666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.06640625, + "learning_rate": 3.965729446920755e-05, + "loss": 5.6991, + "loss/crossentropy": 1.9389175474643707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18414141610264778, + "step": 8264 + }, + { + "epoch": 0.6888333333333333, + "grad_norm": 5.1875, + "grad_norm_var": 0.07884114583333333, + "learning_rate": 3.965210126325153e-05, + "loss": 5.2045, + "loss/crossentropy": 2.096444122493267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1620175652205944, + "step": 8266 + }, + { + "epoch": 0.689, + "grad_norm": 4.90625, + "grad_norm_var": 0.07902018229166667, + "learning_rate": 3.964686943729155e-05, + "loss": 4.2653, + "loss/crossentropy": 1.6484524458646774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16247570887207985, + "step": 8268 + }, + { + "epoch": 0.6891666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.07941080729166666, + "learning_rate": 3.964159900423666e-05, + "loss": 5.6302, + "loss/crossentropy": 2.382517784833908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19215881079435349, + "step": 8270 + }, + { + "epoch": 0.6893333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.07743733723958333, + "learning_rate": 3.9636289977091104e-05, + "loss": 4.8313, + "loss/crossentropy": 1.8752048686146736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1806214228272438, + "step": 8272 + }, + { + "epoch": 0.6895, + "grad_norm": 4.90625, + "grad_norm_var": 0.052958170572916664, + "learning_rate": 3.963094236895439e-05, + "loss": 4.4626, + "loss/crossentropy": 2.2586154341697693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22339026257395744, + "step": 8274 + }, + { + "epoch": 0.6896666666666667, + "grad_norm": 5.375, + "grad_norm_var": 0.04334309895833333, + "learning_rate": 3.96255561930212e-05, + "loss": 4.9586, + "loss/crossentropy": 2.25962632894516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24645965546369553, + "step": 8276 + }, + { + "epoch": 0.6898333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.05657552083333333, + "learning_rate": 3.96201314625814e-05, + "loss": 4.8752, + "loss/crossentropy": 1.8089765384793282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19039946794509888, + "step": 8278 + }, + { + "epoch": 0.69, + "grad_norm": 4.71875, + "grad_norm_var": 0.05618489583333333, + "learning_rate": 3.961466819101996e-05, + "loss": 4.3569, + "loss/crossentropy": 1.9538972973823547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19948555529117584, + "step": 8280 + }, + { + "epoch": 0.6901666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.050374348958333336, + "learning_rate": 3.960916639181697e-05, + "loss": 5.1086, + "loss/crossentropy": 2.0360175147652626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17786091938614845, + "step": 8282 + }, + { + "epoch": 0.6903333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.05133056640625, + "learning_rate": 3.960362607854758e-05, + "loss": 4.9383, + "loss/crossentropy": 2.5083267092704773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21976816654205322, + "step": 8284 + }, + { + "epoch": 0.6905, + "grad_norm": 4.90625, + "grad_norm_var": 0.048811848958333334, + "learning_rate": 3.9598047264881946e-05, + "loss": 5.0042, + "loss/crossentropy": 2.357286214828491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20402342081069946, + "step": 8286 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 5.71875, + "grad_norm_var": 0.09566650390625, + "learning_rate": 3.959242996458524e-05, + "loss": 5.4444, + "loss/crossentropy": 1.7867268919944763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19183974713087082, + "step": 8288 + }, + { + "epoch": 0.6908333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.09709879557291666, + "learning_rate": 3.95867741915176e-05, + "loss": 4.9998, + "loss/crossentropy": 1.7130176201462746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19901876337826252, + "step": 8290 + }, + { + "epoch": 0.691, + "grad_norm": 4.9375, + "grad_norm_var": 0.07997639973958333, + "learning_rate": 3.958107995963406e-05, + "loss": 5.3135, + "loss/crossentropy": 2.3072088062763214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21593845263123512, + "step": 8292 + }, + { + "epoch": 0.6911666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.071728515625, + "learning_rate": 3.957534728298461e-05, + "loss": 4.991, + "loss/crossentropy": 2.2377262711524963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21270013973116875, + "step": 8294 + }, + { + "epoch": 0.6913333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.07509358723958333, + "learning_rate": 3.956957617571403e-05, + "loss": 4.5893, + "loss/crossentropy": 1.2319274619221687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14451301470398903, + "step": 8296 + }, + { + "epoch": 0.6915, + "grad_norm": 4.9375, + "grad_norm_var": 0.07672119140625, + "learning_rate": 3.956376665206196e-05, + "loss": 5.013, + "loss/crossentropy": 2.2056290805339813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942349076271057, + "step": 8298 + }, + { + "epoch": 0.6916666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.07068684895833334, + "learning_rate": 3.955791872636283e-05, + "loss": 4.9761, + "loss/crossentropy": 2.356662005186081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107817068696022, + "step": 8300 + }, + { + "epoch": 0.6918333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.07646077473958333, + "learning_rate": 3.95520324130458e-05, + "loss": 4.8055, + "loss/crossentropy": 1.8195563182234764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17385952547192574, + "step": 8302 + }, + { + "epoch": 0.692, + "grad_norm": 5.375, + "grad_norm_var": 0.04503580729166667, + "learning_rate": 3.954610772663479e-05, + "loss": 4.8658, + "loss/crossentropy": 1.4324785694479942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1714583933353424, + "step": 8304 + }, + { + "epoch": 0.6921666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.05318603515625, + "learning_rate": 3.9540144681748343e-05, + "loss": 4.4637, + "loss/crossentropy": 1.8403845950961113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19282516837120056, + "step": 8306 + }, + { + "epoch": 0.6923333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.05845947265625, + "learning_rate": 3.95341432930997e-05, + "loss": 4.6448, + "loss/crossentropy": 2.415658622980118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018880546092987, + "step": 8308 + }, + { + "epoch": 0.6925, + "grad_norm": 4.46875, + "grad_norm_var": 0.06295572916666667, + "learning_rate": 3.952810357549669e-05, + "loss": 4.9796, + "loss/crossentropy": 1.79848662763834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1668144389986992, + "step": 8310 + }, + { + "epoch": 0.6926666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.07053629557291667, + "learning_rate": 3.95220255438417e-05, + "loss": 4.2825, + "loss/crossentropy": 1.9337844401597977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18568622693419456, + "step": 8312 + }, + { + "epoch": 0.6928333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.07552083333333333, + "learning_rate": 3.951590921313169e-05, + "loss": 4.5296, + "loss/crossentropy": 1.2211369574069977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14676949754357338, + "step": 8314 + }, + { + "epoch": 0.693, + "grad_norm": 4.65625, + "grad_norm_var": 0.06962483723958333, + "learning_rate": 3.950975459845807e-05, + "loss": 4.7562, + "loss/crossentropy": 1.9180386438965797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833595335483551, + "step": 8316 + }, + { + "epoch": 0.6931666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.06532796223958333, + "learning_rate": 3.9503561715006775e-05, + "loss": 4.5347, + "loss/crossentropy": 1.9306902810931206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19284119084477425, + "step": 8318 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 3.94973305780581e-05, + "loss": 5.1963, + "loss/crossentropy": 2.3883610665798187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22568393871188164, + "step": 8320 + }, + { + "epoch": 0.6935, + "grad_norm": 4.78125, + "grad_norm_var": 0.04607747395833333, + "learning_rate": 3.9491061202986776e-05, + "loss": 4.8591, + "loss/crossentropy": 1.6141887456178665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17095645144581795, + "step": 8322 + }, + { + "epoch": 0.6936666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.08450113932291667, + "learning_rate": 3.9484753605261856e-05, + "loss": 4.9119, + "loss/crossentropy": 1.8141232132911682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1782233528792858, + "step": 8324 + }, + { + "epoch": 0.6938333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.08157552083333333, + "learning_rate": 3.94784078004467e-05, + "loss": 4.696, + "loss/crossentropy": 1.6317244544625282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.161282641813159, + "step": 8326 + }, + { + "epoch": 0.694, + "grad_norm": 5.125, + "grad_norm_var": 0.07226155598958334, + "learning_rate": 3.9472023804198966e-05, + "loss": 4.7567, + "loss/crossentropy": 2.129282683134079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21127504110336304, + "step": 8328 + }, + { + "epoch": 0.6941666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.14412434895833334, + "learning_rate": 3.946560163227052e-05, + "loss": 4.7442, + "loss/crossentropy": 1.7705247178673744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15480435080826283, + "step": 8330 + }, + { + "epoch": 0.6943333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.13527018229166668, + "learning_rate": 3.945914130050744e-05, + "loss": 4.2676, + "loss/crossentropy": 1.9477231204509735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19183451309800148, + "step": 8332 + }, + { + "epoch": 0.6945, + "grad_norm": 4.53125, + "grad_norm_var": 0.13759358723958334, + "learning_rate": 3.9452642824849944e-05, + "loss": 4.8556, + "loss/crossentropy": 2.5035791397094727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22337283194065094, + "step": 8334 + }, + { + "epoch": 0.6946666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.13567708333333334, + "learning_rate": 3.9446106221332384e-05, + "loss": 4.5276, + "loss/crossentropy": 1.919069766998291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20139261707663536, + "step": 8336 + }, + { + "epoch": 0.6948333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.1396484375, + "learning_rate": 3.943953150608318e-05, + "loss": 4.5294, + "loss/crossentropy": 1.7460784316062927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1773892566561699, + "step": 8338 + }, + { + "epoch": 0.695, + "grad_norm": 4.71875, + "grad_norm_var": 0.119384765625, + "learning_rate": 3.9432918695324775e-05, + "loss": 4.8103, + "loss/crossentropy": 1.711449757218361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17536772415041924, + "step": 8340 + }, + { + "epoch": 0.6951666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.11343994140625, + "learning_rate": 3.9426267805373626e-05, + "loss": 5.095, + "loss/crossentropy": 1.6863776296377182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16977461241185665, + "step": 8342 + }, + { + "epoch": 0.6953333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.11013997395833333, + "learning_rate": 3.941957885264017e-05, + "loss": 4.9459, + "loss/crossentropy": 2.0749086141586304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21645259484648705, + "step": 8344 + }, + { + "epoch": 0.6955, + "grad_norm": 4.9375, + "grad_norm_var": 0.03443603515625, + "learning_rate": 3.941285185362868e-05, + "loss": 5.0884, + "loss/crossentropy": 2.0320696011185646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18671347945928574, + "step": 8346 + }, + { + "epoch": 0.6956666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.03765869140625, + "learning_rate": 3.940608682493741e-05, + "loss": 5.2533, + "loss/crossentropy": 1.2317260429263115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13544929958879948, + "step": 8348 + }, + { + "epoch": 0.6958333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.03215738932291667, + "learning_rate": 3.939928378325836e-05, + "loss": 5.1139, + "loss/crossentropy": 2.150977909564972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224623154848814, + "step": 8350 + }, + { + "epoch": 0.696, + "grad_norm": 4.9375, + "grad_norm_var": 0.03323160807291667, + "learning_rate": 3.939244274537738e-05, + "loss": 5.476, + "loss/crossentropy": 2.4628894329071045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21918050199747086, + "step": 8352 + }, + { + "epoch": 0.6961666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.026822916666666665, + "learning_rate": 3.938556372817404e-05, + "loss": 5.2391, + "loss/crossentropy": 2.3663055896759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987644210457802, + "step": 8354 + }, + { + "epoch": 0.6963333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.022591145833333333, + "learning_rate": 3.937864674862163e-05, + "loss": 4.7939, + "loss/crossentropy": 2.5858985781669617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24316586554050446, + "step": 8356 + }, + { + "epoch": 0.6965, + "grad_norm": 4.90625, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 3.937169182378712e-05, + "loss": 5.0335, + "loss/crossentropy": 1.2397507652640343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.162556741386652, + "step": 8358 + }, + { + "epoch": 0.6966666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.05911051432291667, + "learning_rate": 3.936469897083109e-05, + "loss": 4.7473, + "loss/crossentropy": 1.5406540408730507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426093369722366, + "step": 8360 + }, + { + "epoch": 0.6968333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.058268229166666664, + "learning_rate": 3.935766820700771e-05, + "loss": 5.0673, + "loss/crossentropy": 2.4371124505996704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19626253098249435, + "step": 8362 + }, + { + "epoch": 0.697, + "grad_norm": 5.15625, + "grad_norm_var": 0.05093994140625, + "learning_rate": 3.935059954966469e-05, + "loss": 5.4225, + "loss/crossentropy": 2.520164370536804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460576295852661, + "step": 8364 + }, + { + "epoch": 0.6971666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.08385009765625, + "learning_rate": 3.934349301624324e-05, + "loss": 4.5993, + "loss/crossentropy": 2.55728417634964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20566748455166817, + "step": 8366 + }, + { + "epoch": 0.6973333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.08424479166666667, + "learning_rate": 3.933634862427802e-05, + "loss": 4.9533, + "loss/crossentropy": 1.4302483797073364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16049204394221306, + "step": 8368 + }, + { + "epoch": 0.6975, + "grad_norm": 4.28125, + "grad_norm_var": 0.11964518229166667, + "learning_rate": 3.9329166391397116e-05, + "loss": 4.5304, + "loss/crossentropy": 2.4105213284492493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21557223051786423, + "step": 8370 + }, + { + "epoch": 0.6976666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.13444010416666666, + "learning_rate": 3.932194633532196e-05, + "loss": 4.6363, + "loss/crossentropy": 1.7642899453639984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1885760761797428, + "step": 8372 + }, + { + "epoch": 0.6978333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.10345052083333334, + "learning_rate": 3.931468847386734e-05, + "loss": 4.4008, + "loss/crossentropy": 1.898881435394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19243913888931274, + "step": 8374 + }, + { + "epoch": 0.698, + "grad_norm": 5.15625, + "grad_norm_var": 0.097119140625, + "learning_rate": 3.93073928249413e-05, + "loss": 5.3528, + "loss/crossentropy": 2.535469710826874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360251680016518, + "step": 8376 + }, + { + "epoch": 0.6981666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.08599853515625, + "learning_rate": 3.930005940654511e-05, + "loss": 5.0111, + "loss/crossentropy": 2.345153719186783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27940667420625687, + "step": 8378 + }, + { + "epoch": 0.6983333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.070947265625, + "learning_rate": 3.9292688236773286e-05, + "loss": 4.7551, + "loss/crossentropy": 1.8373343795537949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17152095399796963, + "step": 8380 + }, + { + "epoch": 0.6985, + "grad_norm": 4.9375, + "grad_norm_var": 0.06575113932291667, + "learning_rate": 3.928527933381344e-05, + "loss": 4.8176, + "loss/crossentropy": 2.3495190739631653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21102092787623405, + "step": 8382 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.06389567057291666, + "learning_rate": 3.92778327159463e-05, + "loss": 4.7387, + "loss/crossentropy": 1.8806327432394028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15044947527348995, + "step": 8384 + }, + { + "epoch": 0.6988333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.041666666666666664, + "learning_rate": 3.9270348401545646e-05, + "loss": 5.4241, + "loss/crossentropy": 2.6885854601860046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21951700001955032, + "step": 8386 + }, + { + "epoch": 0.699, + "grad_norm": 5.625, + "grad_norm_var": 0.07209879557291667, + "learning_rate": 3.92628264090783e-05, + "loss": 4.5313, + "loss/crossentropy": 1.8490508198738098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25332874804735184, + "step": 8388 + }, + { + "epoch": 0.6991666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.07044270833333334, + "learning_rate": 3.9255266757104025e-05, + "loss": 4.2339, + "loss/crossentropy": 2.2976879477500916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21890880912542343, + "step": 8390 + }, + { + "epoch": 0.6993333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.07303059895833333, + "learning_rate": 3.924766946427551e-05, + "loss": 4.7814, + "loss/crossentropy": 1.930149868130684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19942124374210835, + "step": 8392 + }, + { + "epoch": 0.6995, + "grad_norm": 4.6875, + "grad_norm_var": 0.07916259765625, + "learning_rate": 3.9240034549338315e-05, + "loss": 4.9137, + "loss/crossentropy": 2.1267817318439484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221748698502779, + "step": 8394 + }, + { + "epoch": 0.6996666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.4212849934895833, + "learning_rate": 3.9232362031130836e-05, + "loss": 4.4343, + "loss/crossentropy": 0.8540246337652206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1094401404261589, + "step": 8396 + }, + { + "epoch": 0.6998333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.42107747395833334, + "learning_rate": 3.9224651928584246e-05, + "loss": 4.9973, + "loss/crossentropy": 2.14926341176033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886780634522438, + "step": 8398 + }, + { + "epoch": 0.7, + "grad_norm": 5.21875, + "grad_norm_var": 0.41067708333333336, + "learning_rate": 3.921690426072246e-05, + "loss": 5.3, + "loss/crossentropy": 1.972128227353096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19318870827555656, + "step": 8400 + }, + { + "epoch": 0.7001666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.41717122395833334, + "learning_rate": 3.9209119046662085e-05, + "loss": 4.8935, + "loss/crossentropy": 1.6673232913017273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16348423436284065, + "step": 8402 + }, + { + "epoch": 0.7003333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.397119140625, + "learning_rate": 3.920129630561235e-05, + "loss": 4.7038, + "loss/crossentropy": 0.7960238456726074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10878065042197704, + "step": 8404 + }, + { + "epoch": 0.7005, + "grad_norm": 4.625, + "grad_norm_var": 0.38163655598958335, + "learning_rate": 3.9193436056875106e-05, + "loss": 4.913, + "loss/crossentropy": 2.3446280360221863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20906392112374306, + "step": 8406 + }, + { + "epoch": 0.7006666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.40325520833333334, + "learning_rate": 3.918553831984472e-05, + "loss": 4.7213, + "loss/crossentropy": 1.2463391497731209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13812248408794403, + "step": 8408 + }, + { + "epoch": 0.7008333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.39319254557291666, + "learning_rate": 3.917760311400808e-05, + "loss": 4.6313, + "loss/crossentropy": 2.367984890937805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071567215025425, + "step": 8410 + }, + { + "epoch": 0.701, + "grad_norm": 5.03125, + "grad_norm_var": 0.04934488932291667, + "learning_rate": 3.9169630458944515e-05, + "loss": 5.3454, + "loss/crossentropy": 2.3054229021072388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.243374515324831, + "step": 8412 + }, + { + "epoch": 0.7011666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.06051025390625, + "learning_rate": 3.916162037432576e-05, + "loss": 5.1157, + "loss/crossentropy": 2.388631224632263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20708012580871582, + "step": 8414 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.04605712890625, + "learning_rate": 3.915357287991591e-05, + "loss": 5.0288, + "loss/crossentropy": 1.7904658913612366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20545685291290283, + "step": 8416 + }, + { + "epoch": 0.7015, + "grad_norm": 4.75, + "grad_norm_var": 0.047119140625, + "learning_rate": 3.914548799557135e-05, + "loss": 4.7721, + "loss/crossentropy": 2.0648528784513474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930786818265915, + "step": 8418 + }, + { + "epoch": 0.7016666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 3.9137365741240734e-05, + "loss": 5.0071, + "loss/crossentropy": 2.4154167771339417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22159772738814354, + "step": 8420 + }, + { + "epoch": 0.7018333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.05435791015625, + "learning_rate": 3.9129206136964903e-05, + "loss": 4.4603, + "loss/crossentropy": 1.9558910503983498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17343437299132347, + "step": 8422 + }, + { + "epoch": 0.702, + "grad_norm": 5.0, + "grad_norm_var": 0.04010009765625, + "learning_rate": 3.912100920287688e-05, + "loss": 4.9526, + "loss/crossentropy": 2.4329636991024017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20050939917564392, + "step": 8424 + }, + { + "epoch": 0.7021666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.04713134765625, + "learning_rate": 3.911277495920179e-05, + "loss": 5.5551, + "loss/crossentropy": 2.05565532296896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18992381542921066, + "step": 8426 + }, + { + "epoch": 0.7023333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.04583333333333333, + "learning_rate": 3.91045034262568e-05, + "loss": 4.5382, + "loss/crossentropy": 1.910774514079094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17602653056383133, + "step": 8428 + }, + { + "epoch": 0.7025, + "grad_norm": 5.03125, + "grad_norm_var": 0.043229166666666666, + "learning_rate": 3.9096194624451104e-05, + "loss": 4.8823, + "loss/crossentropy": 2.303587019443512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22907770797610283, + "step": 8430 + }, + { + "epoch": 0.7026666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.051025390625, + "learning_rate": 3.908784857428583e-05, + "loss": 4.411, + "loss/crossentropy": 1.3701264038681984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1463383063673973, + "step": 8432 + }, + { + "epoch": 0.7028333333333333, + "grad_norm": 5.3125, + "grad_norm_var": 0.06806233723958334, + "learning_rate": 3.907946529635405e-05, + "loss": 5.2984, + "loss/crossentropy": 1.987550988793373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17481104657053947, + "step": 8434 + }, + { + "epoch": 0.703, + "grad_norm": 4.75, + "grad_norm_var": 0.07459309895833334, + "learning_rate": 3.907104481134066e-05, + "loss": 4.4821, + "loss/crossentropy": 1.3826638907194138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15103114023804665, + "step": 8436 + }, + { + "epoch": 0.7031666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.06278889973958333, + "learning_rate": 3.906258714002236e-05, + "loss": 4.6062, + "loss/crossentropy": 1.6567226275801659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17642017267644405, + "step": 8438 + }, + { + "epoch": 0.7033333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.07336832682291666, + "learning_rate": 3.905409230326761e-05, + "loss": 4.3813, + "loss/crossentropy": 1.7283048927783966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893971562385559, + "step": 8440 + }, + { + "epoch": 0.7035, + "grad_norm": 4.78125, + "grad_norm_var": 0.06656494140625, + "learning_rate": 3.90455603220366e-05, + "loss": 4.974, + "loss/crossentropy": 2.4172494411468506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795316621661186, + "step": 8442 + }, + { + "epoch": 0.7036666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.0798828125, + "learning_rate": 3.903699121738112e-05, + "loss": 4.3795, + "loss/crossentropy": 1.8104211688041687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2350565455853939, + "step": 8444 + }, + { + "epoch": 0.7038333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.07877197265625, + "learning_rate": 3.9028385010444593e-05, + "loss": 4.6677, + "loss/crossentropy": 1.9519396424293518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947762556374073, + "step": 8446 + }, + { + "epoch": 0.704, + "grad_norm": 4.78125, + "grad_norm_var": 0.07434488932291666, + "learning_rate": 3.901974172246199e-05, + "loss": 5.0585, + "loss/crossentropy": 2.076841115951538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1911839358508587, + "step": 8448 + }, + { + "epoch": 0.7041666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.04772135416666667, + "learning_rate": 3.9011061374759756e-05, + "loss": 4.869, + "loss/crossentropy": 2.004154473543167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17821232788264751, + "step": 8450 + }, + { + "epoch": 0.7043333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 3.900234398875578e-05, + "loss": 5.1626, + "loss/crossentropy": 2.0224228501319885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1770163904875517, + "step": 8452 + }, + { + "epoch": 0.7045, + "grad_norm": 4.90625, + "grad_norm_var": 0.05178629557291667, + "learning_rate": 3.899358958595935e-05, + "loss": 5.1691, + "loss/crossentropy": 2.1597854495048523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20969395712018013, + "step": 8454 + }, + { + "epoch": 0.7046666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.04895833333333333, + "learning_rate": 3.898479818797108e-05, + "loss": 5.0392, + "loss/crossentropy": 2.580492913722992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20104866474866867, + "step": 8456 + }, + { + "epoch": 0.7048333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.05481770833333333, + "learning_rate": 3.8975969816482884e-05, + "loss": 5.0146, + "loss/crossentropy": 2.439221531152725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20938636735081673, + "step": 8458 + }, + { + "epoch": 0.705, + "grad_norm": 4.65625, + "grad_norm_var": 0.04034830729166667, + "learning_rate": 3.896710449327788e-05, + "loss": 4.6385, + "loss/crossentropy": 2.402409076690674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21491533517837524, + "step": 8460 + }, + { + "epoch": 0.7051666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.03411051432291667, + "learning_rate": 3.8958202240230376e-05, + "loss": 5.0256, + "loss/crossentropy": 1.7773304283618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1971539780497551, + "step": 8462 + }, + { + "epoch": 0.7053333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.05380452473958333, + "learning_rate": 3.89492630793058e-05, + "loss": 5.1185, + "loss/crossentropy": 2.059245079755783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149088755249977, + "step": 8464 + }, + { + "epoch": 0.7055, + "grad_norm": 4.59375, + "grad_norm_var": 0.05224202473958333, + "learning_rate": 3.894028703256063e-05, + "loss": 4.4259, + "loss/crossentropy": 1.6723755300045013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16442426852881908, + "step": 8466 + }, + { + "epoch": 0.7056666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.049609375, + "learning_rate": 3.893127412214238e-05, + "loss": 4.72, + "loss/crossentropy": 1.6438677459955215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16178004071116447, + "step": 8468 + }, + { + "epoch": 0.7058333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.07011311848958333, + "learning_rate": 3.8922224370289517e-05, + "loss": 4.9171, + "loss/crossentropy": 2.445003926753998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24207842722535133, + "step": 8470 + }, + { + "epoch": 0.706, + "grad_norm": 5.0, + "grad_norm_var": 0.06881510416666667, + "learning_rate": 3.891313779933138e-05, + "loss": 5.1656, + "loss/crossentropy": 2.066555440425873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20761993527412415, + "step": 8472 + }, + { + "epoch": 0.7061666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.06461181640625, + "learning_rate": 3.89040144316882e-05, + "loss": 4.9594, + "loss/crossentropy": 2.183867871761322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912669971585274, + "step": 8474 + }, + { + "epoch": 0.7063333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.06148681640625, + "learning_rate": 3.889485428987097e-05, + "loss": 5.1491, + "loss/crossentropy": 2.3668190836906433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22393818199634552, + "step": 8476 + }, + { + "epoch": 0.7065, + "grad_norm": 4.28125, + "grad_norm_var": 0.090869140625, + "learning_rate": 3.888565739648145e-05, + "loss": 4.7785, + "loss/crossentropy": 2.4078583121299744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20308464020490646, + "step": 8478 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.0720703125, + "learning_rate": 3.887642377421203e-05, + "loss": 4.7113, + "loss/crossentropy": 1.0246254950761795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14504259265959263, + "step": 8480 + }, + { + "epoch": 0.7068333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.07873942057291666, + "learning_rate": 3.886715344584577e-05, + "loss": 5.002, + "loss/crossentropy": 2.1630527675151825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843829050660133, + "step": 8482 + }, + { + "epoch": 0.707, + "grad_norm": 4.90625, + "grad_norm_var": 0.07499593098958333, + "learning_rate": 3.885784643425628e-05, + "loss": 4.9709, + "loss/crossentropy": 1.5243459790945053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1480504274368286, + "step": 8484 + }, + { + "epoch": 0.7071666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.06018473307291667, + "learning_rate": 3.884850276240769e-05, + "loss": 4.9724, + "loss/crossentropy": 2.170057028532028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22325589135289192, + "step": 8486 + }, + { + "epoch": 0.7073333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.05816650390625, + "learning_rate": 3.8839122453354584e-05, + "loss": 4.9309, + "loss/crossentropy": 1.7268838658928871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19534722715616226, + "step": 8488 + }, + { + "epoch": 0.7075, + "grad_norm": 4.5, + "grad_norm_var": 0.06337483723958333, + "learning_rate": 3.882970553024193e-05, + "loss": 4.21, + "loss/crossentropy": 2.080011636018753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039019875228405, + "step": 8490 + }, + { + "epoch": 0.7076666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.068994140625, + "learning_rate": 3.8820252016305066e-05, + "loss": 4.5204, + "loss/crossentropy": 2.4818327128887177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21026042476296425, + "step": 8492 + }, + { + "epoch": 0.7078333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.04407552083333333, + "learning_rate": 3.881076193486959e-05, + "loss": 5.0249, + "loss/crossentropy": 2.102166533470154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555450603365898, + "step": 8494 + }, + { + "epoch": 0.708, + "grad_norm": 4.625, + "grad_norm_var": 0.04455973307291667, + "learning_rate": 3.8801235309351326e-05, + "loss": 4.5784, + "loss/crossentropy": 1.0780503954738379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10496869310736656, + "step": 8496 + }, + { + "epoch": 0.7081666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.037398274739583334, + "learning_rate": 3.87916721632563e-05, + "loss": 4.8757, + "loss/crossentropy": 1.9232516288757324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2197490967810154, + "step": 8498 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.044331868489583336, + "learning_rate": 3.878207252018059e-05, + "loss": 4.8544, + "loss/crossentropy": 2.091248132288456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18537914380431175, + "step": 8500 + }, + { + "epoch": 0.7085, + "grad_norm": 4.8125, + "grad_norm_var": 0.030989583333333334, + "learning_rate": 3.877243640381038e-05, + "loss": 4.8074, + "loss/crossentropy": 1.759295605123043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1624660287052393, + "step": 8502 + }, + { + "epoch": 0.7086666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.033589680989583336, + "learning_rate": 3.876276383792184e-05, + "loss": 4.4416, + "loss/crossentropy": 1.1989781931042671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1289712656289339, + "step": 8504 + }, + { + "epoch": 0.7088333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.030061848958333335, + "learning_rate": 3.875305484638105e-05, + "loss": 4.9461, + "loss/crossentropy": 1.7124052718281746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17960615642368793, + "step": 8506 + }, + { + "epoch": 0.709, + "grad_norm": 4.96875, + "grad_norm_var": 0.03209635416666667, + "learning_rate": 3.874330945314398e-05, + "loss": 4.7397, + "loss/crossentropy": 1.9230768084526062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19067246094346046, + "step": 8508 + }, + { + "epoch": 0.7091666666666666, + "grad_norm": 4.28125, + "grad_norm_var": 0.0423828125, + "learning_rate": 3.873352768225643e-05, + "loss": 4.2579, + "loss/crossentropy": 1.9107168316841125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18083369359374046, + "step": 8510 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 5.5, + "grad_norm_var": 0.08183186848958333, + "learning_rate": 3.8723709557853935e-05, + "loss": 5.4434, + "loss/crossentropy": 2.5605077147483826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24422496184706688, + "step": 8512 + }, + { + "epoch": 0.7095, + "grad_norm": 4.5, + "grad_norm_var": 0.0818359375, + "learning_rate": 3.871385510416175e-05, + "loss": 4.6419, + "loss/crossentropy": 1.6347190141677856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.167040653526783, + "step": 8514 + }, + { + "epoch": 0.7096666666666667, + "grad_norm": 7.25, + "grad_norm_var": 0.47897135416666664, + "learning_rate": 3.8703964345494747e-05, + "loss": 4.8956, + "loss/crossentropy": 1.5641431733965874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1506042554974556, + "step": 8516 + }, + { + "epoch": 0.7098333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.49269205729166665, + "learning_rate": 3.869403730625741e-05, + "loss": 4.9614, + "loss/crossentropy": 2.4893141984939575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214422095566988, + "step": 8518 + }, + { + "epoch": 0.71, + "grad_norm": 4.15625, + "grad_norm_var": 0.5141764322916667, + "learning_rate": 3.86840740109437e-05, + "loss": 4.093, + "loss/crossentropy": 1.1999619975686073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1375366821885109, + "step": 8520 + }, + { + "epoch": 0.7101666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.514306640625, + "learning_rate": 3.8674074484137075e-05, + "loss": 4.9127, + "loss/crossentropy": 1.6504128351807594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15782082825899124, + "step": 8522 + }, + { + "epoch": 0.7103333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.506103515625, + "learning_rate": 3.866403875051037e-05, + "loss": 4.9036, + "loss/crossentropy": 1.891563042998314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19191880524158478, + "step": 8524 + }, + { + "epoch": 0.7105, + "grad_norm": 5.53125, + "grad_norm_var": 0.5021769205729166, + "learning_rate": 3.865396683482575e-05, + "loss": 5.1794, + "loss/crossentropy": 2.358280599117279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268039658665657, + "step": 8526 + }, + { + "epoch": 0.7106666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.47706705729166665, + "learning_rate": 3.864385876193469e-05, + "loss": 4.8347, + "loss/crossentropy": 2.0874394476413727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218267060816288, + "step": 8528 + }, + { + "epoch": 0.7108333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.46678059895833335, + "learning_rate": 3.8633714556777817e-05, + "loss": 5.0689, + "loss/crossentropy": 0.9933740720152855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.116140803322196, + "step": 8530 + }, + { + "epoch": 0.711, + "grad_norm": 4.9375, + "grad_norm_var": 0.09361979166666666, + "learning_rate": 3.8623534244384984e-05, + "loss": 5.2475, + "loss/crossentropy": 2.2980607450008392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18490897864103317, + "step": 8532 + }, + { + "epoch": 0.7111666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.08357747395833333, + "learning_rate": 3.861331784987508e-05, + "loss": 4.93, + "loss/crossentropy": 1.8772684335708618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812591589987278, + "step": 8534 + }, + { + "epoch": 0.7113333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.05115559895833333, + "learning_rate": 3.8603065398456056e-05, + "loss": 5.0756, + "loss/crossentropy": 1.7558320760726929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000910758972168, + "step": 8536 + }, + { + "epoch": 0.7115, + "grad_norm": 5.0625, + "grad_norm_var": 0.05191650390625, + "learning_rate": 3.85927769154248e-05, + "loss": 4.9146, + "loss/crossentropy": 1.9606410264968872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17154713906347752, + "step": 8538 + }, + { + "epoch": 0.7116666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.05181884765625, + "learning_rate": 3.858245242616713e-05, + "loss": 4.8698, + "loss/crossentropy": 1.5781254023313522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16780022345483303, + "step": 8540 + }, + { + "epoch": 0.7118333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.022196451822916668, + "learning_rate": 3.857209195615769e-05, + "loss": 5.4512, + "loss/crossentropy": 2.496997833251953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793800756335258, + "step": 8542 + }, + { + "epoch": 0.712, + "grad_norm": 4.5, + "grad_norm_var": 0.07580973307291666, + "learning_rate": 3.856169553095994e-05, + "loss": 4.7199, + "loss/crossentropy": 1.7486284598708153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16116427071392536, + "step": 8544 + }, + { + "epoch": 0.7121666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.08079020182291667, + "learning_rate": 3.855126317622598e-05, + "loss": 4.5685, + "loss/crossentropy": 1.554433859884739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16488460823893547, + "step": 8546 + }, + { + "epoch": 0.7123333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.08307291666666666, + "learning_rate": 3.854079491769665e-05, + "loss": 5.0647, + "loss/crossentropy": 1.7844418436288834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16470178216695786, + "step": 8548 + }, + { + "epoch": 0.7125, + "grad_norm": 4.65625, + "grad_norm_var": 0.09338785807291666, + "learning_rate": 3.853029078120131e-05, + "loss": 4.7518, + "loss/crossentropy": 1.723744347691536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1574373096227646, + "step": 8550 + }, + { + "epoch": 0.7126666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.09439697265625, + "learning_rate": 3.851975079265788e-05, + "loss": 5.3427, + "loss/crossentropy": 2.4624204635620117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21660296246409416, + "step": 8552 + }, + { + "epoch": 0.7128333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.09768473307291667, + "learning_rate": 3.850917497807273e-05, + "loss": 4.9311, + "loss/crossentropy": 2.056380547583103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827603466808796, + "step": 8554 + }, + { + "epoch": 0.713, + "grad_norm": 4.625, + "grad_norm_var": 0.10230712890625, + "learning_rate": 3.849856336354064e-05, + "loss": 5.3591, + "loss/crossentropy": 2.038110814988613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19497457519173622, + "step": 8556 + }, + { + "epoch": 0.7131666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.108447265625, + "learning_rate": 3.8487915975244715e-05, + "loss": 5.4798, + "loss/crossentropy": 2.337386816740036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22147011384367943, + "step": 8558 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.0740234375, + "learning_rate": 3.847723283945632e-05, + "loss": 5.0219, + "loss/crossentropy": 2.2437821328639984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1856003701686859, + "step": 8560 + }, + { + "epoch": 0.7135, + "grad_norm": 5.4375, + "grad_norm_var": 0.08938802083333333, + "learning_rate": 3.846651398253503e-05, + "loss": 4.9609, + "loss/crossentropy": 1.8475798070430756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18377842381596565, + "step": 8562 + }, + { + "epoch": 0.7136666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.08642171223958334, + "learning_rate": 3.845575943092857e-05, + "loss": 4.8802, + "loss/crossentropy": 1.9503494873642921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18385779485106468, + "step": 8564 + }, + { + "epoch": 0.7138333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.070556640625, + "learning_rate": 3.8444969211172704e-05, + "loss": 5.2131, + "loss/crossentropy": 2.542492926120758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972789764404297, + "step": 8566 + }, + { + "epoch": 0.714, + "grad_norm": 4.8125, + "grad_norm_var": 0.083837890625, + "learning_rate": 3.843414334989125e-05, + "loss": 4.7795, + "loss/crossentropy": 2.350885808467865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130325853824615, + "step": 8568 + }, + { + "epoch": 0.7141666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.31819254557291665, + "learning_rate": 3.842328187379593e-05, + "loss": 4.414, + "loss/crossentropy": 1.7121087461709976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17743447422981262, + "step": 8570 + }, + { + "epoch": 0.7143333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.3076171875, + "learning_rate": 3.841238480968637e-05, + "loss": 4.8087, + "loss/crossentropy": 2.0848701670765877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18633460253477097, + "step": 8572 + }, + { + "epoch": 0.7145, + "grad_norm": 4.625, + "grad_norm_var": 0.3153483072916667, + "learning_rate": 3.840145218444999e-05, + "loss": 4.5425, + "loss/crossentropy": 1.3898528441786766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17466096580028534, + "step": 8574 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.30987955729166666, + "learning_rate": 3.839048402506194e-05, + "loss": 4.7232, + "loss/crossentropy": 1.9185372442007065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18338378705084324, + "step": 8576 + }, + { + "epoch": 0.7148333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.28677978515625, + "learning_rate": 3.837948035858508e-05, + "loss": 4.7647, + "loss/crossentropy": 2.4466370940208435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22165575250983238, + "step": 8578 + }, + { + "epoch": 0.715, + "grad_norm": 5.15625, + "grad_norm_var": 0.29127604166666665, + "learning_rate": 3.8368441212169856e-05, + "loss": 4.9122, + "loss/crossentropy": 2.3279071152210236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20262134820222855, + "step": 8580 + }, + { + "epoch": 0.7151666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.2951131184895833, + "learning_rate": 3.8357366613054265e-05, + "loss": 4.4034, + "loss/crossentropy": 1.2494020238518715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17146893590688705, + "step": 8582 + }, + { + "epoch": 0.7153333333333334, + "grad_norm": 5.34375, + "grad_norm_var": 0.29737955729166665, + "learning_rate": 3.834625658856378e-05, + "loss": 4.92, + "loss/crossentropy": 2.271469384431839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555164128541946, + "step": 8584 + }, + { + "epoch": 0.7155, + "grad_norm": 5.125, + "grad_norm_var": 0.062235514322916664, + "learning_rate": 3.833511116611128e-05, + "loss": 4.9768, + "loss/crossentropy": 2.541659891605377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345850616693497, + "step": 8586 + }, + { + "epoch": 0.7156666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.0658203125, + "learning_rate": 3.8323930373196994e-05, + "loss": 4.9504, + "loss/crossentropy": 1.7460681796073914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19049026630818844, + "step": 8588 + }, + { + "epoch": 0.7158333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.08502604166666666, + "learning_rate": 3.83127142374084e-05, + "loss": 5.0767, + "loss/crossentropy": 2.4137668907642365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288975328207016, + "step": 8590 + }, + { + "epoch": 0.716, + "grad_norm": 4.78125, + "grad_norm_var": 0.08006184895833333, + "learning_rate": 3.830146278642023e-05, + "loss": 4.8836, + "loss/crossentropy": 2.0978833809494972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21597089432179928, + "step": 8592 + }, + { + "epoch": 0.7161666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.08245035807291666, + "learning_rate": 3.829017604799428e-05, + "loss": 4.8027, + "loss/crossentropy": 1.518560267984867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15250182338058949, + "step": 8594 + }, + { + "epoch": 0.7163333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.07903238932291666, + "learning_rate": 3.8278854049979495e-05, + "loss": 5.1256, + "loss/crossentropy": 1.6443413645029068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18051048554480076, + "step": 8596 + }, + { + "epoch": 0.7165, + "grad_norm": 4.8125, + "grad_norm_var": 0.07862955729166667, + "learning_rate": 3.826749682031174e-05, + "loss": 5.122, + "loss/crossentropy": 1.3595605567097664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12122759409248829, + "step": 8598 + }, + { + "epoch": 0.7166666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.07864583333333333, + "learning_rate": 3.8256104387013886e-05, + "loss": 4.3628, + "loss/crossentropy": 1.3635896146297455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14798260852694511, + "step": 8600 + }, + { + "epoch": 0.7168333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.08088785807291667, + "learning_rate": 3.824467677819562e-05, + "loss": 5.4972, + "loss/crossentropy": 1.7752568274736404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16927327774465084, + "step": 8602 + }, + { + "epoch": 0.717, + "grad_norm": 5.09375, + "grad_norm_var": 0.076806640625, + "learning_rate": 3.8233214022053414e-05, + "loss": 4.3435, + "loss/crossentropy": 2.242382973432541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19916261360049248, + "step": 8604 + }, + { + "epoch": 0.7171666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.05725504557291667, + "learning_rate": 3.822171614687049e-05, + "loss": 4.8377, + "loss/crossentropy": 1.064011611044407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1281402837485075, + "step": 8606 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.05732014973958333, + "learning_rate": 3.821018318101672e-05, + "loss": 5.0572, + "loss/crossentropy": 2.07350555062294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168068632483482, + "step": 8608 + }, + { + "epoch": 0.7175, + "grad_norm": 4.59375, + "grad_norm_var": 0.06131184895833333, + "learning_rate": 3.8198615152948534e-05, + "loss": 5.551, + "loss/crossentropy": 1.9728271514177322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18988211080431938, + "step": 8610 + }, + { + "epoch": 0.7176666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.061356608072916666, + "learning_rate": 3.818701209120891e-05, + "loss": 5.1561, + "loss/crossentropy": 2.1180964708328247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928955800831318, + "step": 8612 + }, + { + "epoch": 0.7178333333333333, + "grad_norm": 5.0625, + "grad_norm_var": 0.05520426432291667, + "learning_rate": 3.8175374024427233e-05, + "loss": 5.2349, + "loss/crossentropy": 2.4339922070503235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179810367524624, + "step": 8614 + }, + { + "epoch": 0.718, + "grad_norm": 4.78125, + "grad_norm_var": 0.03258056640625, + "learning_rate": 3.816370098131929e-05, + "loss": 5.0511, + "loss/crossentropy": 2.0979133695364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20639016665518284, + "step": 8616 + }, + { + "epoch": 0.7181666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.039388020833333336, + "learning_rate": 3.815199299068714e-05, + "loss": 5.1404, + "loss/crossentropy": 1.4955974891781807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16872404888272285, + "step": 8618 + }, + { + "epoch": 0.7183333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.045638020833333334, + "learning_rate": 3.8140250081419105e-05, + "loss": 4.4802, + "loss/crossentropy": 0.6960400566458702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11802476830780506, + "step": 8620 + }, + { + "epoch": 0.7185, + "grad_norm": 5.03125, + "grad_norm_var": 0.05211181640625, + "learning_rate": 3.812847228248962e-05, + "loss": 5.2375, + "loss/crossentropy": 2.1082040667533875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18984466418623924, + "step": 8622 + }, + { + "epoch": 0.7186666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.060400390625, + "learning_rate": 3.811665962295925e-05, + "loss": 5.2651, + "loss/crossentropy": 2.195831149816513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19905410893261433, + "step": 8624 + }, + { + "epoch": 0.7188333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.06471354166666667, + "learning_rate": 3.8104812131974565e-05, + "loss": 4.4659, + "loss/crossentropy": 1.7844280079007149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.194871723651886, + "step": 8626 + }, + { + "epoch": 0.719, + "grad_norm": 4.5625, + "grad_norm_var": 0.06649983723958333, + "learning_rate": 3.809292983876806e-05, + "loss": 4.8771, + "loss/crossentropy": 2.350356310606003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22957272082567215, + "step": 8628 + }, + { + "epoch": 0.7191666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.0626953125, + "learning_rate": 3.8081012772658125e-05, + "loss": 5.2387, + "loss/crossentropy": 2.595982849597931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795315876603127, + "step": 8630 + }, + { + "epoch": 0.7193333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.06503499348958333, + "learning_rate": 3.8069060963048904e-05, + "loss": 4.9113, + "loss/crossentropy": 1.6220935881137848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17125827446579933, + "step": 8632 + }, + { + "epoch": 0.7195, + "grad_norm": 4.90625, + "grad_norm_var": 0.05388997395833333, + "learning_rate": 3.8057074439430326e-05, + "loss": 5.3481, + "loss/crossentropy": 2.598345994949341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20299434289336205, + "step": 8634 + }, + { + "epoch": 0.7196666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.042769368489583334, + "learning_rate": 3.804505323137796e-05, + "loss": 4.5609, + "loss/crossentropy": 1.8253272399306297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018994107842445, + "step": 8636 + }, + { + "epoch": 0.7198333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.0361328125, + "learning_rate": 3.80329973685529e-05, + "loss": 5.1573, + "loss/crossentropy": 2.299446254968643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21431740745902061, + "step": 8638 + }, + { + "epoch": 0.72, + "grad_norm": 5.1875, + "grad_norm_var": 0.04149983723958333, + "learning_rate": 3.802090688070182e-05, + "loss": 4.6841, + "loss/crossentropy": 1.5538584291934967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2675389163196087, + "step": 8640 + }, + { + "epoch": 0.7201666666666666, + "grad_norm": 5.75, + "grad_norm_var": 0.08362223307291666, + "learning_rate": 3.800878179765679e-05, + "loss": 5.4819, + "loss/crossentropy": 1.719673328101635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20472576469182968, + "step": 8642 + }, + { + "epoch": 0.7203333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.08683268229166667, + "learning_rate": 3.799662214933525e-05, + "loss": 4.9331, + "loss/crossentropy": 1.8904408514499664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1706375703215599, + "step": 8644 + }, + { + "epoch": 0.7205, + "grad_norm": 4.65625, + "grad_norm_var": 0.08547770182291667, + "learning_rate": 3.7984427965739914e-05, + "loss": 5.1704, + "loss/crossentropy": 1.9823104441165924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20099147781729698, + "step": 8646 + }, + { + "epoch": 0.7206666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.10026041666666667, + "learning_rate": 3.7972199276958726e-05, + "loss": 4.5451, + "loss/crossentropy": 1.9567938223481178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17212271131575108, + "step": 8648 + }, + { + "epoch": 0.7208333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.10146077473958333, + "learning_rate": 3.795993611316476e-05, + "loss": 5.1056, + "loss/crossentropy": 2.315292328596115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21807067096233368, + "step": 8650 + }, + { + "epoch": 0.721, + "grad_norm": 4.78125, + "grad_norm_var": 0.10428059895833333, + "learning_rate": 3.794763850461615e-05, + "loss": 4.9331, + "loss/crossentropy": 1.7855356112122536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16460080444812775, + "step": 8652 + }, + { + "epoch": 0.7211666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.104150390625, + "learning_rate": 3.793530648165602e-05, + "loss": 4.8287, + "loss/crossentropy": 1.8243321552872658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18671520613133907, + "step": 8654 + }, + { + "epoch": 0.7213333333333334, + "grad_norm": 4.28125, + "grad_norm_var": 0.122900390625, + "learning_rate": 3.792294007471242e-05, + "loss": 5.3489, + "loss/crossentropy": 1.904386505484581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1730734258890152, + "step": 8656 + }, + { + "epoch": 0.7215, + "grad_norm": 4.46875, + "grad_norm_var": 0.083447265625, + "learning_rate": 3.791053931429821e-05, + "loss": 4.2947, + "loss/crossentropy": 2.0987056344747543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18425085954368114, + "step": 8658 + }, + { + "epoch": 0.7216666666666667, + "grad_norm": 5.53125, + "grad_norm_var": 0.1107421875, + "learning_rate": 3.7898104231011065e-05, + "loss": 5.074, + "loss/crossentropy": 2.2960754334926605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22611650452017784, + "step": 8660 + }, + { + "epoch": 0.7218333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.10351155598958334, + "learning_rate": 3.788563485553329e-05, + "loss": 5.2099, + "loss/crossentropy": 2.3180200457572937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195101797580719, + "step": 8662 + }, + { + "epoch": 0.722, + "grad_norm": 4.6875, + "grad_norm_var": 0.09192301432291666, + "learning_rate": 3.787313121863185e-05, + "loss": 4.5433, + "loss/crossentropy": 1.462849237024784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.160456795245409, + "step": 8664 + }, + { + "epoch": 0.7221666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.0955078125, + "learning_rate": 3.7860593351158205e-05, + "loss": 4.4727, + "loss/crossentropy": 2.4242143034934998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20100093632936478, + "step": 8666 + }, + { + "epoch": 0.7223333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.08951416015625, + "learning_rate": 3.784802128404831e-05, + "loss": 4.9187, + "loss/crossentropy": 1.6116406470537186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766587197780609, + "step": 8668 + }, + { + "epoch": 0.7225, + "grad_norm": 4.90625, + "grad_norm_var": 0.09071858723958333, + "learning_rate": 3.7835415048322486e-05, + "loss": 4.7858, + "loss/crossentropy": 2.023942083120346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17556135542690754, + "step": 8670 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.06656494140625, + "learning_rate": 3.782277467508537e-05, + "loss": 4.8446, + "loss/crossentropy": 1.9526407718658447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2195216380059719, + "step": 8672 + }, + { + "epoch": 0.7228333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.073046875, + "learning_rate": 3.7810100195525825e-05, + "loss": 5.1322, + "loss/crossentropy": 2.075814664363861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2480899766087532, + "step": 8674 + }, + { + "epoch": 0.723, + "grad_norm": 4.5625, + "grad_norm_var": 0.054280598958333336, + "learning_rate": 3.7797391640916865e-05, + "loss": 4.4837, + "loss/crossentropy": 1.9470646530389786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18394303135573864, + "step": 8676 + }, + { + "epoch": 0.7231666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.06907552083333333, + "learning_rate": 3.7784649042615594e-05, + "loss": 4.3918, + "loss/crossentropy": 1.4085872247815132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14427426271140575, + "step": 8678 + }, + { + "epoch": 0.7233333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.08391927083333334, + "learning_rate": 3.7771872432063104e-05, + "loss": 5.0086, + "loss/crossentropy": 1.5730762034654617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427705347537994, + "step": 8680 + }, + { + "epoch": 0.7235, + "grad_norm": 4.4375, + "grad_norm_var": 0.08664957682291667, + "learning_rate": 3.775906184078441e-05, + "loss": 4.7252, + "loss/crossentropy": 1.6773125976324081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17820323258638382, + "step": 8682 + }, + { + "epoch": 0.7236666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.08824462890625, + "learning_rate": 3.7746217300388364e-05, + "loss": 4.8914, + "loss/crossentropy": 1.9949692338705063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16665949299931526, + "step": 8684 + }, + { + "epoch": 0.7238333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.08761393229166667, + "learning_rate": 3.7733338842567604e-05, + "loss": 4.4487, + "loss/crossentropy": 1.8528357148170471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21053320541977882, + "step": 8686 + }, + { + "epoch": 0.724, + "grad_norm": 4.65625, + "grad_norm_var": 0.08865559895833333, + "learning_rate": 3.772042649909845e-05, + "loss": 4.2227, + "loss/crossentropy": 1.7226733341813087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1530767474323511, + "step": 8688 + }, + { + "epoch": 0.7241666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.05767822265625, + "learning_rate": 3.77074803018408e-05, + "loss": 4.3961, + "loss/crossentropy": 1.7308517843484879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16562194749712944, + "step": 8690 + }, + { + "epoch": 0.7243333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.10458577473958333, + "learning_rate": 3.769450028273814e-05, + "loss": 5.2013, + "loss/crossentropy": 2.163651943206787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21022338047623634, + "step": 8692 + }, + { + "epoch": 0.7245, + "grad_norm": 4.875, + "grad_norm_var": 0.09110921223958333, + "learning_rate": 3.768148647381735e-05, + "loss": 4.9322, + "loss/crossentropy": 2.3275624215602875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20518775284290314, + "step": 8694 + }, + { + "epoch": 0.7246666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.07489827473958334, + "learning_rate": 3.766843890718873e-05, + "loss": 5.2929, + "loss/crossentropy": 2.217619091272354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21295088529586792, + "step": 8696 + }, + { + "epoch": 0.7248333333333333, + "grad_norm": 5.34375, + "grad_norm_var": 0.08983968098958334, + "learning_rate": 3.765535761504584e-05, + "loss": 4.8328, + "loss/crossentropy": 1.4501753821969032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1492973156273365, + "step": 8698 + }, + { + "epoch": 0.725, + "grad_norm": 5.03125, + "grad_norm_var": 0.08943684895833333, + "learning_rate": 3.764224262966548e-05, + "loss": 4.6842, + "loss/crossentropy": 2.3924825191497803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22406524047255516, + "step": 8700 + }, + { + "epoch": 0.7251666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.09021809895833334, + "learning_rate": 3.7629093983407565e-05, + "loss": 5.6793, + "loss/crossentropy": 2.3391090631484985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21643299236893654, + "step": 8702 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.07743733723958333, + "learning_rate": 3.761591170871507e-05, + "loss": 5.411, + "loss/crossentropy": 2.587038576602936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19993890821933746, + "step": 8704 + }, + { + "epoch": 0.7255, + "grad_norm": 5.28125, + "grad_norm_var": 0.05618489583333333, + "learning_rate": 3.760269583811396e-05, + "loss": 4.9137, + "loss/crossentropy": 2.3714127242565155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080126442015171, + "step": 8706 + }, + { + "epoch": 0.7256666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.03183186848958333, + "learning_rate": 3.758944640421307e-05, + "loss": 5.3241, + "loss/crossentropy": 1.6202038303017616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17815641313791275, + "step": 8708 + }, + { + "epoch": 0.7258333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.04556884765625, + "learning_rate": 3.7576163439704066e-05, + "loss": 4.9787, + "loss/crossentropy": 1.8317546993494034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2205317411571741, + "step": 8710 + }, + { + "epoch": 0.726, + "grad_norm": 4.875, + "grad_norm_var": 0.055078125, + "learning_rate": 3.756284697736134e-05, + "loss": 5.2559, + "loss/crossentropy": 2.6816230416297913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20220203697681427, + "step": 8712 + }, + { + "epoch": 0.7261666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.05284830729166667, + "learning_rate": 3.7549497050041936e-05, + "loss": 5.216, + "loss/crossentropy": 2.396820366382599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21334237977862358, + "step": 8714 + }, + { + "epoch": 0.7263333333333334, + "grad_norm": 5.40625, + "grad_norm_var": 0.08918863932291667, + "learning_rate": 3.753611369068548e-05, + "loss": 5.0165, + "loss/crossentropy": 2.489267408847809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773302927613258, + "step": 8716 + }, + { + "epoch": 0.7265, + "grad_norm": 4.78125, + "grad_norm_var": 0.09117431640625, + "learning_rate": 3.7522696932314076e-05, + "loss": 4.9107, + "loss/crossentropy": 2.376191943883896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045009806752205, + "step": 8718 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 4.15625, + "grad_norm_var": 0.124462890625, + "learning_rate": 3.750924680803224e-05, + "loss": 4.8471, + "loss/crossentropy": 1.4641473963856697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14925377815961838, + "step": 8720 + }, + { + "epoch": 0.7268333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.11754150390625, + "learning_rate": 3.749576335102683e-05, + "loss": 4.9705, + "loss/crossentropy": 1.9205654561519623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1824258267879486, + "step": 8722 + }, + { + "epoch": 0.727, + "grad_norm": 4.625, + "grad_norm_var": 0.11812744140625, + "learning_rate": 3.748224659456692e-05, + "loss": 4.912, + "loss/crossentropy": 1.3279554545879364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.147995226085186, + "step": 8724 + }, + { + "epoch": 0.7271666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.08865559895833333, + "learning_rate": 3.7468696572003773e-05, + "loss": 5.0937, + "loss/crossentropy": 2.371949940919876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19854775071144104, + "step": 8726 + }, + { + "epoch": 0.7273333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 719.303153483073, + "learning_rate": 3.7455113316770714e-05, + "loss": 5.064, + "loss/crossentropy": 1.8971856757998466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301062777638435, + "step": 8728 + }, + { + "epoch": 0.7275, + "grad_norm": 4.75, + "grad_norm_var": 718.9658813476562, + "learning_rate": 3.7441496862383074e-05, + "loss": 4.6886, + "loss/crossentropy": 1.646051250398159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17237389832735062, + "step": 8730 + }, + { + "epoch": 0.7276666666666667, + "grad_norm": 4.875, + "grad_norm_var": 718.8195597330729, + "learning_rate": 3.742784724243811e-05, + "loss": 5.0265, + "loss/crossentropy": 2.2616125643253326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21027613058686256, + "step": 8732 + }, + { + "epoch": 0.7278333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 718.8765258789062, + "learning_rate": 3.74141644906149e-05, + "loss": 4.8026, + "loss/crossentropy": 1.7153765857219696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16985262744128704, + "step": 8734 + }, + { + "epoch": 0.728, + "grad_norm": 5.46875, + "grad_norm_var": 717.9027180989583, + "learning_rate": 3.740044864067428e-05, + "loss": 4.9463, + "loss/crossentropy": 1.7610290348529816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300682961940765, + "step": 8736 + }, + { + "epoch": 0.7281666666666666, + "grad_norm": 4.75, + "grad_norm_var": 717.5597290039062, + "learning_rate": 3.7386699726458725e-05, + "loss": 5.2234, + "loss/crossentropy": 1.659450389444828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17890477553009987, + "step": 8738 + }, + { + "epoch": 0.7283333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 717.5659790039062, + "learning_rate": 3.7372917781892335e-05, + "loss": 4.771, + "loss/crossentropy": 2.1801012456417084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21790499612689018, + "step": 8740 + }, + { + "epoch": 0.7285, + "grad_norm": 4.84375, + "grad_norm_var": 717.3072265625, + "learning_rate": 3.735910284098068e-05, + "loss": 5.2941, + "loss/crossentropy": 2.0166616439819336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17672479711472988, + "step": 8742 + }, + { + "epoch": 0.7286666666666667, + "grad_norm": 5.1875, + "grad_norm_var": 0.07498372395833333, + "learning_rate": 3.7345254937810746e-05, + "loss": 5.0446, + "loss/crossentropy": 1.6543507799506187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18381119146943092, + "step": 8744 + }, + { + "epoch": 0.7288333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.07459309895833334, + "learning_rate": 3.733137410655087e-05, + "loss": 5.1135, + "loss/crossentropy": 2.328328937292099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19200216978788376, + "step": 8746 + }, + { + "epoch": 0.729, + "grad_norm": 5.46875, + "grad_norm_var": 0.09269205729166667, + "learning_rate": 3.7317460381450616e-05, + "loss": 5.5765, + "loss/crossentropy": 2.3491774797439575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052808813750744, + "step": 8748 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.08795166015625, + "learning_rate": 3.7303513796840724e-05, + "loss": 4.8585, + "loss/crossentropy": 1.4594552740454674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14136701077222824, + "step": 8750 + }, + { + "epoch": 0.7293333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.07408854166666666, + "learning_rate": 3.7289534387133e-05, + "loss": 4.7951, + "loss/crossentropy": 2.2565941512584686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282186567783356, + "step": 8752 + }, + { + "epoch": 0.7295, + "grad_norm": 4.875, + "grad_norm_var": 0.05584309895833333, + "learning_rate": 3.727552218682026e-05, + "loss": 5.109, + "loss/crossentropy": 1.2331131994724274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13799532130360603, + "step": 8754 + }, + { + "epoch": 0.7296666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05237223307291667, + "learning_rate": 3.7261477230476194e-05, + "loss": 5.5758, + "loss/crossentropy": 2.656588077545166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211183350533247, + "step": 8756 + }, + { + "epoch": 0.7298333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.059228515625, + "learning_rate": 3.724739955275535e-05, + "loss": 4.9283, + "loss/crossentropy": 1.393723078072071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15066486410796642, + "step": 8758 + }, + { + "epoch": 0.73, + "grad_norm": 4.5625, + "grad_norm_var": 0.060807291666666666, + "learning_rate": 3.7233289188392994e-05, + "loss": 4.314, + "loss/crossentropy": 1.7706470787525177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16479334980249405, + "step": 8760 + }, + { + "epoch": 0.7301666666666666, + "grad_norm": 5.625, + "grad_norm_var": 0.09605712890625, + "learning_rate": 3.7219146172205054e-05, + "loss": 4.926, + "loss/crossentropy": 1.4691421911120415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15686069056391716, + "step": 8762 + }, + { + "epoch": 0.7303333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.07431233723958333, + "learning_rate": 3.7204970539088005e-05, + "loss": 4.3474, + "loss/crossentropy": 1.8123872131109238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17503276839852333, + "step": 8764 + }, + { + "epoch": 0.7305, + "grad_norm": 5.53125, + "grad_norm_var": 0.10562744140625, + "learning_rate": 3.719076232401881e-05, + "loss": 5.5435, + "loss/crossentropy": 2.8346092104911804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20754270255565643, + "step": 8766 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.11506754557291667, + "learning_rate": 3.717652156205485e-05, + "loss": 4.4182, + "loss/crossentropy": 1.4685752764344215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1536390818655491, + "step": 8768 + }, + { + "epoch": 0.7308333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.12760416666666666, + "learning_rate": 3.716224828833376e-05, + "loss": 4.502, + "loss/crossentropy": 1.5342864394187927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14854193665087223, + "step": 8770 + }, + { + "epoch": 0.731, + "grad_norm": 4.71875, + "grad_norm_var": 0.12336832682291667, + "learning_rate": 3.714794253807345e-05, + "loss": 4.9686, + "loss/crossentropy": 2.0194079279899597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18694549053907394, + "step": 8772 + }, + { + "epoch": 0.7311666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.12018229166666666, + "learning_rate": 3.7133604346571923e-05, + "loss": 4.5597, + "loss/crossentropy": 1.5956484377384186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843536987900734, + "step": 8774 + }, + { + "epoch": 0.7313333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.12237955729166666, + "learning_rate": 3.711923374920724e-05, + "loss": 4.5708, + "loss/crossentropy": 1.9300574213266373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20277779176831245, + "step": 8776 + }, + { + "epoch": 0.7315, + "grad_norm": 4.96875, + "grad_norm_var": 0.0798828125, + "learning_rate": 3.7104830781437435e-05, + "loss": 5.3787, + "loss/crossentropy": 1.9987846314907074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19426734000444412, + "step": 8778 + }, + { + "epoch": 0.7316666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.08232014973958333, + "learning_rate": 3.709039547880038e-05, + "loss": 5.3359, + "loss/crossentropy": 1.9980473741889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17278934083878994, + "step": 8780 + }, + { + "epoch": 0.7318333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.04505208333333333, + "learning_rate": 3.7075927876913765e-05, + "loss": 5.0242, + "loss/crossentropy": 1.8991079032421112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16986040398478508, + "step": 8782 + }, + { + "epoch": 0.732, + "grad_norm": 4.875, + "grad_norm_var": 0.041796875, + "learning_rate": 3.706142801147495e-05, + "loss": 4.6015, + "loss/crossentropy": 2.006039559841156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17685510218143463, + "step": 8784 + }, + { + "epoch": 0.7321666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.034098307291666664, + "learning_rate": 3.7046895918260916e-05, + "loss": 4.8398, + "loss/crossentropy": 1.5066589415073395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15621322393417358, + "step": 8786 + }, + { + "epoch": 0.7323333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.046858723958333334, + "learning_rate": 3.703233163312816e-05, + "loss": 4.8451, + "loss/crossentropy": 2.0729255378246307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18724722787737846, + "step": 8788 + }, + { + "epoch": 0.7325, + "grad_norm": 4.875, + "grad_norm_var": 0.04940999348958333, + "learning_rate": 3.70177351920126e-05, + "loss": 5.5098, + "loss/crossentropy": 2.3151272237300873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22275524586439133, + "step": 8790 + }, + { + "epoch": 0.7326666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.0490234375, + "learning_rate": 3.700310663092951e-05, + "loss": 4.5555, + "loss/crossentropy": 1.7588667497038841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17178992182016373, + "step": 8792 + }, + { + "epoch": 0.7328333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.05675455729166667, + "learning_rate": 3.698844598597341e-05, + "loss": 5.298, + "loss/crossentropy": 2.5000760555267334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21289020031690598, + "step": 8794 + }, + { + "epoch": 0.733, + "grad_norm": 4.375, + "grad_norm_var": 0.07838541666666667, + "learning_rate": 3.6973753293317975e-05, + "loss": 4.4096, + "loss/crossentropy": 1.6862092539668083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16006714291870594, + "step": 8796 + }, + { + "epoch": 0.7331666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.07721354166666666, + "learning_rate": 3.6959028589215986e-05, + "loss": 4.6138, + "loss/crossentropy": 1.5368055179715157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15705449134111404, + "step": 8798 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.07858072916666667, + "learning_rate": 3.6944271909999166e-05, + "loss": 5.0054, + "loss/crossentropy": 2.119047671556473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983763799071312, + "step": 8800 + }, + { + "epoch": 0.7335, + "grad_norm": 4.78125, + "grad_norm_var": 0.07515869140625, + "learning_rate": 3.6929483292078156e-05, + "loss": 4.8087, + "loss/crossentropy": 1.6268837228417397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1544271968305111, + "step": 8802 + }, + { + "epoch": 0.7336666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.06884358723958334, + "learning_rate": 3.69146627719424e-05, + "loss": 5.0262, + "loss/crossentropy": 2.24159637093544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014218308031559, + "step": 8804 + }, + { + "epoch": 0.7338333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.0875, + "learning_rate": 3.689981038616008e-05, + "loss": 5.1324, + "loss/crossentropy": 1.908030480146408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18177608400583267, + "step": 8806 + }, + { + "epoch": 0.734, + "grad_norm": 4.96875, + "grad_norm_var": 0.07862955729166667, + "learning_rate": 3.6884926171377955e-05, + "loss": 4.4723, + "loss/crossentropy": 1.1692449301481247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12820919789373875, + "step": 8808 + }, + { + "epoch": 0.7341666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.07277018229166667, + "learning_rate": 3.6870010164321354e-05, + "loss": 4.5284, + "loss/crossentropy": 1.6573495715856552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189057357609272, + "step": 8810 + }, + { + "epoch": 0.7343333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.05279541015625, + "learning_rate": 3.685506240179405e-05, + "loss": 5.107, + "loss/crossentropy": 1.549419716000557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16974429041147232, + "step": 8812 + }, + { + "epoch": 0.7345, + "grad_norm": 5.3125, + "grad_norm_var": 0.07511393229166667, + "learning_rate": 3.684008292067814e-05, + "loss": 4.5746, + "loss/crossentropy": 2.343492843210697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1637168973684311, + "step": 8814 + }, + { + "epoch": 0.7346666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.08798421223958333, + "learning_rate": 3.6825071757934034e-05, + "loss": 4.8706, + "loss/crossentropy": 1.5027789995074272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1454361155629158, + "step": 8816 + }, + { + "epoch": 0.7348333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.09440104166666667, + "learning_rate": 3.681002895060026e-05, + "loss": 4.7813, + "loss/crossentropy": 2.3462014198303223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297021485865116, + "step": 8818 + }, + { + "epoch": 0.735, + "grad_norm": 4.71875, + "grad_norm_var": 0.11614583333333334, + "learning_rate": 3.679495453579345e-05, + "loss": 4.4424, + "loss/crossentropy": 1.9516724050045013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17961198277771473, + "step": 8820 + }, + { + "epoch": 0.7351666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.10041910807291667, + "learning_rate": 3.677984855070824e-05, + "loss": 4.989, + "loss/crossentropy": 1.7319712713360786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1646728366613388, + "step": 8822 + }, + { + "epoch": 0.7353333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.09280192057291667, + "learning_rate": 3.6764711032617146e-05, + "loss": 5.0011, + "loss/crossentropy": 1.472038134932518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16302834451198578, + "step": 8824 + }, + { + "epoch": 0.7355, + "grad_norm": 4.375, + "grad_norm_var": 0.07450764973958333, + "learning_rate": 3.6749542018870464e-05, + "loss": 4.7102, + "loss/crossentropy": 2.418479323387146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23650340735912323, + "step": 8826 + }, + { + "epoch": 0.7356666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08409830729166666, + "learning_rate": 3.673434154689626e-05, + "loss": 4.9501, + "loss/crossentropy": 1.8115656673908234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16492757201194763, + "step": 8828 + }, + { + "epoch": 0.7358333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.057275390625, + "learning_rate": 3.671910965420017e-05, + "loss": 4.6584, + "loss/crossentropy": 1.308703638613224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15016014873981476, + "step": 8830 + }, + { + "epoch": 0.736, + "grad_norm": 4.8125, + "grad_norm_var": 0.049853515625, + "learning_rate": 3.6703846378365374e-05, + "loss": 4.7533, + "loss/crossentropy": 2.000911645591259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17556572891771793, + "step": 8832 + }, + { + "epoch": 0.7361666666666666, + "grad_norm": 5.15625, + "grad_norm_var": 0.06378580729166666, + "learning_rate": 3.668855175705249e-05, + "loss": 5.1723, + "loss/crossentropy": 2.0119586139917374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.176089683547616, + "step": 8834 + }, + { + "epoch": 0.7363333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.09065348307291667, + "learning_rate": 3.6673225827999475e-05, + "loss": 5.2057, + "loss/crossentropy": 1.8934685587882996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1788729391992092, + "step": 8836 + }, + { + "epoch": 0.7365, + "grad_norm": 5.15625, + "grad_norm_var": 0.08587239583333334, + "learning_rate": 3.665786862902155e-05, + "loss": 5.2444, + "loss/crossentropy": 1.591829739511013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17553285881876945, + "step": 8838 + }, + { + "epoch": 0.7366666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.08079427083333333, + "learning_rate": 3.664248019801105e-05, + "loss": 5.3846, + "loss/crossentropy": 1.8678766191005707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833246424794197, + "step": 8840 + }, + { + "epoch": 0.7368333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.08619791666666667, + "learning_rate": 3.662706057293743e-05, + "loss": 4.9773, + "loss/crossentropy": 2.21164333820343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19135256856679916, + "step": 8842 + }, + { + "epoch": 0.737, + "grad_norm": 5.3125, + "grad_norm_var": 0.10657145182291666, + "learning_rate": 3.661160979184705e-05, + "loss": 4.5775, + "loss/crossentropy": 2.1143080592155457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20492056012153625, + "step": 8844 + }, + { + "epoch": 0.7371666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.11614176432291666, + "learning_rate": 3.659612789286319e-05, + "loss": 5.7674, + "loss/crossentropy": 2.6996708512306213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100643552839756, + "step": 8846 + }, + { + "epoch": 0.7373333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.11809895833333334, + "learning_rate": 3.658061491418591e-05, + "loss": 4.9873, + "loss/crossentropy": 1.7583412826061249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17828240431845188, + "step": 8848 + }, + { + "epoch": 0.7375, + "grad_norm": 4.40625, + "grad_norm_var": 0.13606770833333334, + "learning_rate": 3.656507089409192e-05, + "loss": 4.5474, + "loss/crossentropy": 1.7216490358114243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15573652274906635, + "step": 8850 + }, + { + "epoch": 0.7376666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.1318359375, + "learning_rate": 3.654949587093456e-05, + "loss": 5.4153, + "loss/crossentropy": 1.8738081008195877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18332227692008018, + "step": 8852 + }, + { + "epoch": 0.7378333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.13045247395833334, + "learning_rate": 3.653388988314365e-05, + "loss": 4.8978, + "loss/crossentropy": 2.121942013502121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189912848174572, + "step": 8854 + }, + { + "epoch": 0.738, + "grad_norm": 4.78125, + "grad_norm_var": 0.130322265625, + "learning_rate": 3.651825296922541e-05, + "loss": 5.0859, + "loss/crossentropy": 2.175195187330246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22438178583979607, + "step": 8856 + }, + { + "epoch": 0.7381666666666666, + "grad_norm": 5.4375, + "grad_norm_var": 0.123291015625, + "learning_rate": 3.6502585167762374e-05, + "loss": 4.8487, + "loss/crossentropy": 2.214916653931141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20211252197623253, + "step": 8858 + }, + { + "epoch": 0.7383333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.10701497395833333, + "learning_rate": 3.648688651741328e-05, + "loss": 5.1052, + "loss/crossentropy": 1.367978423833847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1402355097234249, + "step": 8860 + }, + { + "epoch": 0.7385, + "grad_norm": 5.46875, + "grad_norm_var": 0.17057291666666666, + "learning_rate": 3.647115705691299e-05, + "loss": 5.2058, + "loss/crossentropy": 2.0758024752140045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20703085139393806, + "step": 8862 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.19576416015625, + "learning_rate": 3.645539682507238e-05, + "loss": 4.8559, + "loss/crossentropy": 2.5436695218086243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21577593311667442, + "step": 8864 + }, + { + "epoch": 0.7388333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.17511393229166666, + "learning_rate": 3.6439605860778255e-05, + "loss": 5.2327, + "loss/crossentropy": 2.345476508140564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21046575531363487, + "step": 8866 + }, + { + "epoch": 0.739, + "grad_norm": 4.6875, + "grad_norm_var": 0.18541666666666667, + "learning_rate": 3.642378420299326e-05, + "loss": 4.6524, + "loss/crossentropy": 2.3013267815113068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18924878537654877, + "step": 8868 + }, + { + "epoch": 0.7391666666666666, + "grad_norm": 6.53125, + "grad_norm_var": 0.33944905598958336, + "learning_rate": 3.640793189075576e-05, + "loss": 5.1503, + "loss/crossentropy": 1.9642613977193832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24627995491027832, + "step": 8870 + }, + { + "epoch": 0.7393333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.35559488932291666, + "learning_rate": 3.639204896317974e-05, + "loss": 4.6375, + "loss/crossentropy": 1.7182991802692413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16883813217282295, + "step": 8872 + }, + { + "epoch": 0.7395, + "grad_norm": 4.53125, + "grad_norm_var": 0.35794270833333336, + "learning_rate": 3.6376135459454775e-05, + "loss": 4.9347, + "loss/crossentropy": 2.0507873594760895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1764291562139988, + "step": 8874 + }, + { + "epoch": 0.7396666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.3622355143229167, + "learning_rate": 3.636019141884584e-05, + "loss": 4.8631, + "loss/crossentropy": 2.2530939877033234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20754842087626457, + "step": 8876 + }, + { + "epoch": 0.7398333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.2731730143229167, + "learning_rate": 3.634421688069326e-05, + "loss": 4.3604, + "loss/crossentropy": 1.693645179271698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18852153420448303, + "step": 8878 + }, + { + "epoch": 0.74, + "grad_norm": 4.59375, + "grad_norm_var": 0.2623046875, + "learning_rate": 3.632821188441264e-05, + "loss": 5.0167, + "loss/crossentropy": 2.5044451355934143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22125179693102837, + "step": 8880 + }, + { + "epoch": 0.7401666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.262890625, + "learning_rate": 3.631217646949469e-05, + "loss": 6.0373, + "loss/crossentropy": 2.3829991221427917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22329098731279373, + "step": 8882 + }, + { + "epoch": 0.7403333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.249853515625, + "learning_rate": 3.629611067550523e-05, + "loss": 5.285, + "loss/crossentropy": 2.3799403607845306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19088751077651978, + "step": 8884 + }, + { + "epoch": 0.7405, + "grad_norm": 4.71875, + "grad_norm_var": 0.044514973958333336, + "learning_rate": 3.6280014542084996e-05, + "loss": 4.5858, + "loss/crossentropy": 1.8241610005497932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17614194378256798, + "step": 8886 + }, + { + "epoch": 0.7406666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.061263020833333334, + "learning_rate": 3.62638881089496e-05, + "loss": 4.832, + "loss/crossentropy": 1.690228745341301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766295973211527, + "step": 8888 + }, + { + "epoch": 0.7408333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.05585530598958333, + "learning_rate": 3.624773141588942e-05, + "loss": 4.5258, + "loss/crossentropy": 1.9454505145549774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19673113897442818, + "step": 8890 + }, + { + "epoch": 0.741, + "grad_norm": 4.625, + "grad_norm_var": 0.05585530598958333, + "learning_rate": 3.623154450276947e-05, + "loss": 5.4359, + "loss/crossentropy": 1.4792904779314995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1534273698925972, + "step": 8892 + }, + { + "epoch": 0.7411666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.059228515625, + "learning_rate": 3.621532740952937e-05, + "loss": 4.7134, + "loss/crossentropy": 2.3156608641147614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20420100539922714, + "step": 8894 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.058915201822916666, + "learning_rate": 3.6199080176183174e-05, + "loss": 5.1667, + "loss/crossentropy": 2.1931245625019073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118113823235035, + "step": 8896 + }, + { + "epoch": 0.7415, + "grad_norm": 5.0, + "grad_norm_var": 0.04755452473958333, + "learning_rate": 3.618280284281931e-05, + "loss": 4.8052, + "loss/crossentropy": 2.5484583973884583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23700064048171043, + "step": 8898 + }, + { + "epoch": 0.7416666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.055192057291666666, + "learning_rate": 3.616649544960051e-05, + "loss": 4.474, + "loss/crossentropy": 2.38634717464447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101435624063015, + "step": 8900 + }, + { + "epoch": 0.7418333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.8354451497395833, + "learning_rate": 3.61501580367636e-05, + "loss": 4.8441, + "loss/crossentropy": 2.2860844433307648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810862421989441, + "step": 8902 + }, + { + "epoch": 0.742, + "grad_norm": 4.65625, + "grad_norm_var": 0.8156087239583333, + "learning_rate": 3.613379064461955e-05, + "loss": 4.2613, + "loss/crossentropy": 1.6806901693344116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1674313172698021, + "step": 8904 + }, + { + "epoch": 0.7421666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.84283447265625, + "learning_rate": 3.6117393313553276e-05, + "loss": 4.8571, + "loss/crossentropy": 2.260170102119446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418415799736977, + "step": 8906 + }, + { + "epoch": 0.7423333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.8539998372395833, + "learning_rate": 3.610096608402356e-05, + "loss": 4.6393, + "loss/crossentropy": 2.18383064866066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21976784616708755, + "step": 8908 + }, + { + "epoch": 0.7425, + "grad_norm": 5.0, + "grad_norm_var": 0.82359619140625, + "learning_rate": 3.6084508996562945e-05, + "loss": 5.3469, + "loss/crossentropy": 1.9949154406785965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17736472189426422, + "step": 8910 + }, + { + "epoch": 0.7426666666666667, + "grad_norm": 5.15625, + "grad_norm_var": 0.8265462239583333, + "learning_rate": 3.606802209177766e-05, + "loss": 5.2167, + "loss/crossentropy": 2.670268714427948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21612682566046715, + "step": 8912 + }, + { + "epoch": 0.7428333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.8615193684895833, + "learning_rate": 3.605150541034752e-05, + "loss": 4.6836, + "loss/crossentropy": 1.777511551976204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17616182379424572, + "step": 8914 + }, + { + "epoch": 0.743, + "grad_norm": 4.71875, + "grad_norm_var": 0.8450154622395833, + "learning_rate": 3.603495899302579e-05, + "loss": 5.0119, + "loss/crossentropy": 1.4489165171980858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15824375115334988, + "step": 8916 + }, + { + "epoch": 0.7431666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.06897379557291666, + "learning_rate": 3.60183828806391e-05, + "loss": 4.708, + "loss/crossentropy": 2.3155589401721954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166098654270172, + "step": 8918 + }, + { + "epoch": 0.7433333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.07502848307291667, + "learning_rate": 3.6001777114087364e-05, + "loss": 5.0273, + "loss/crossentropy": 2.040462851524353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932218372821808, + "step": 8920 + }, + { + "epoch": 0.7435, + "grad_norm": 5.0, + "grad_norm_var": 0.06405843098958333, + "learning_rate": 3.598514173434366e-05, + "loss": 5.1278, + "loss/crossentropy": 1.7882616519927979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18532326072454453, + "step": 8922 + }, + { + "epoch": 0.7436666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05627848307291667, + "learning_rate": 3.5968476782454126e-05, + "loss": 5.381, + "loss/crossentropy": 2.228593498468399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952892802655697, + "step": 8924 + }, + { + "epoch": 0.7438333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.06327718098958333, + "learning_rate": 3.595178229953789e-05, + "loss": 4.7112, + "loss/crossentropy": 1.9281839057803154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19580445811152458, + "step": 8926 + }, + { + "epoch": 0.744, + "grad_norm": 4.96875, + "grad_norm_var": 0.057515462239583336, + "learning_rate": 3.593505832678692e-05, + "loss": 5.1409, + "loss/crossentropy": 2.7174503803253174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320035845041275, + "step": 8928 + }, + { + "epoch": 0.7441666666666666, + "grad_norm": 5.40625, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 3.591830490546596e-05, + "loss": 4.9454, + "loss/crossentropy": 1.646928757429123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15842007473111153, + "step": 8930 + }, + { + "epoch": 0.7443333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.0865234375, + "learning_rate": 3.59015220769124e-05, + "loss": 5.0737, + "loss/crossentropy": 1.9822481498122215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17872773855924606, + "step": 8932 + }, + { + "epoch": 0.7445, + "grad_norm": 4.6875, + "grad_norm_var": 0.08538004557291666, + "learning_rate": 3.588470988253622e-05, + "loss": 4.9158, + "loss/crossentropy": 2.2272008657455444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22174109145998955, + "step": 8934 + }, + { + "epoch": 0.7446666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.08404947916666666, + "learning_rate": 3.5867868363819836e-05, + "loss": 5.3303, + "loss/crossentropy": 1.3292552679777145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13451922498643398, + "step": 8936 + }, + { + "epoch": 0.7448333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.08240559895833334, + "learning_rate": 3.5850997562318006e-05, + "loss": 4.8913, + "loss/crossentropy": 2.4469125866889954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009117528796196, + "step": 8938 + }, + { + "epoch": 0.745, + "grad_norm": 4.59375, + "grad_norm_var": 0.08245035807291666, + "learning_rate": 3.583409751965776e-05, + "loss": 4.9503, + "loss/crossentropy": 1.9291583746671677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17131099104881287, + "step": 8940 + }, + { + "epoch": 0.7451666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.08800455729166666, + "learning_rate": 3.5817168277538286e-05, + "loss": 5.0222, + "loss/crossentropy": 1.9054948091506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17595425620675087, + "step": 8942 + }, + { + "epoch": 0.7453333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.08059488932291667, + "learning_rate": 3.580020987773079e-05, + "loss": 5.0269, + "loss/crossentropy": 2.2758138179779053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193683236837387, + "step": 8944 + }, + { + "epoch": 0.7455, + "grad_norm": 4.71875, + "grad_norm_var": 0.14568684895833334, + "learning_rate": 3.578322236207845e-05, + "loss": 5.4236, + "loss/crossentropy": 2.432409644126892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22839342057704926, + "step": 8946 + }, + { + "epoch": 0.7456666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.11780192057291666, + "learning_rate": 3.576620577249626e-05, + "loss": 4.6177, + "loss/crossentropy": 1.9744727090001106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.168179078027606, + "step": 8948 + }, + { + "epoch": 0.7458333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.13570556640625, + "learning_rate": 3.574916015097097e-05, + "loss": 4.4341, + "loss/crossentropy": 2.073435589671135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18709623627364635, + "step": 8950 + }, + { + "epoch": 0.746, + "grad_norm": 4.4375, + "grad_norm_var": 0.143603515625, + "learning_rate": 3.5732085539560965e-05, + "loss": 4.8361, + "loss/crossentropy": 1.7923277840018272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17312389239668846, + "step": 8952 + }, + { + "epoch": 0.7461666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.14683837890625, + "learning_rate": 3.5714981980396144e-05, + "loss": 4.6879, + "loss/crossentropy": 1.9266493320465088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17960952781140804, + "step": 8954 + }, + { + "epoch": 0.7463333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.1439453125, + "learning_rate": 3.5697849515677836e-05, + "loss": 4.9524, + "loss/crossentropy": 2.126235529780388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18564694002270699, + "step": 8956 + }, + { + "epoch": 0.7465, + "grad_norm": 4.28125, + "grad_norm_var": 0.1525390625, + "learning_rate": 3.568068818767869e-05, + "loss": 4.7523, + "loss/crossentropy": 2.540748953819275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21789639070630074, + "step": 8958 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.15510660807291668, + "learning_rate": 3.5663498038742585e-05, + "loss": 5.043, + "loss/crossentropy": 2.259571760892868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22068355977535248, + "step": 8960 + }, + { + "epoch": 0.7468333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.09560139973958333, + "learning_rate": 3.564627911128451e-05, + "loss": 4.5511, + "loss/crossentropy": 1.8589156866073608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19934171438217163, + "step": 8962 + }, + { + "epoch": 0.747, + "grad_norm": 5.21875, + "grad_norm_var": 0.10930582682291666, + "learning_rate": 3.562903144779045e-05, + "loss": 5.0822, + "loss/crossentropy": 1.9405502825975418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18191159516572952, + "step": 8964 + }, + { + "epoch": 0.7471666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.095947265625, + "learning_rate": 3.5611755090817294e-05, + "loss": 4.4999, + "loss/crossentropy": 2.3405293822288513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21351077407598495, + "step": 8966 + }, + { + "epoch": 0.7473333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.107421875, + "learning_rate": 3.559445008299276e-05, + "loss": 4.3914, + "loss/crossentropy": 1.6899859458208084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1662097293883562, + "step": 8968 + }, + { + "epoch": 0.7475, + "grad_norm": 4.65625, + "grad_norm_var": 0.104150390625, + "learning_rate": 3.55771164670152e-05, + "loss": 4.6844, + "loss/crossentropy": 1.4420829191803932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13939605839550495, + "step": 8970 + }, + { + "epoch": 0.7476666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.10428059895833333, + "learning_rate": 3.555975428565361e-05, + "loss": 5.1911, + "loss/crossentropy": 2.3206411004066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20939678698778152, + "step": 8972 + }, + { + "epoch": 0.7478333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.08381754557291667, + "learning_rate": 3.554236358174743e-05, + "loss": 4.8117, + "loss/crossentropy": 1.8067068308591843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19425064884126186, + "step": 8974 + }, + { + "epoch": 0.748, + "grad_norm": 4.4375, + "grad_norm_var": 0.08238525390625, + "learning_rate": 3.5524944398206516e-05, + "loss": 4.4304, + "loss/crossentropy": 2.1453236043453217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19834518060088158, + "step": 8976 + }, + { + "epoch": 0.7481666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.045817057291666664, + "learning_rate": 3.5507496778010964e-05, + "loss": 4.7814, + "loss/crossentropy": 2.388603627681732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22290612012147903, + "step": 8978 + }, + { + "epoch": 0.7483333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.023563639322916666, + "learning_rate": 3.549002076421102e-05, + "loss": 4.7707, + "loss/crossentropy": 2.538282871246338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229936309158802, + "step": 8980 + }, + { + "epoch": 0.7485, + "grad_norm": 5.125, + "grad_norm_var": 0.037613932291666666, + "learning_rate": 3.5472516399927047e-05, + "loss": 5.0232, + "loss/crossentropy": 2.1156225353479385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18581297993659973, + "step": 8982 + }, + { + "epoch": 0.7486666666666667, + "grad_norm": 9.0625, + "grad_norm_var": 1.2496053059895833, + "learning_rate": 3.5454983728349305e-05, + "loss": 4.5365, + "loss/crossentropy": 2.3697937726974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17561548948287964, + "step": 8984 + }, + { + "epoch": 0.7488333333333334, + "grad_norm": 4.625, + "grad_norm_var": 1.249853515625, + "learning_rate": 3.543742279273792e-05, + "loss": 5.2116, + "loss/crossentropy": 2.474073052406311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21809647977352142, + "step": 8986 + }, + { + "epoch": 0.749, + "grad_norm": 4.75, + "grad_norm_var": 1.2510416666666666, + "learning_rate": 3.541983363642275e-05, + "loss": 4.8202, + "loss/crossentropy": 1.280080884695053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1427099145948887, + "step": 8988 + }, + { + "epoch": 0.7491666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 1.2544108072916667, + "learning_rate": 3.5402216302803296e-05, + "loss": 4.8564, + "loss/crossentropy": 2.147283583879471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20935291796922684, + "step": 8990 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 1.2329264322916667, + "learning_rate": 3.538457083534858e-05, + "loss": 5.1555, + "loss/crossentropy": 2.4556437134742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24310623481869698, + "step": 8992 + }, + { + "epoch": 0.7495, + "grad_norm": 4.71875, + "grad_norm_var": 1.2042277018229166, + "learning_rate": 3.536689727759702e-05, + "loss": 4.7679, + "loss/crossentropy": 2.4080257415771484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176681011915207, + "step": 8994 + }, + { + "epoch": 0.7496666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 1.193994140625, + "learning_rate": 3.5349195673156385e-05, + "loss": 4.7875, + "loss/crossentropy": 2.589709520339966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113696150481701, + "step": 8996 + }, + { + "epoch": 0.7498333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 1.192578125, + "learning_rate": 3.533146606570362e-05, + "loss": 5.1712, + "loss/crossentropy": 2.271325647830963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2238401062786579, + "step": 8998 + }, + { + "epoch": 0.75, + "grad_norm": 4.75, + "grad_norm_var": 0.02838134765625, + "learning_rate": 3.531370849898476e-05, + "loss": 4.8331, + "loss/crossentropy": 1.5186460092663765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.152960903942585, + "step": 9000 + }, + { + "epoch": 0.7501666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.03674723307291667, + "learning_rate": 3.5295923016814856e-05, + "loss": 4.5761, + "loss/crossentropy": 1.513818047940731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13120390567928553, + "step": 9002 + }, + { + "epoch": 0.7503333333333333, + "grad_norm": 5.46875, + "grad_norm_var": 0.08802083333333334, + "learning_rate": 3.527810966307779e-05, + "loss": 5.1174, + "loss/crossentropy": 1.9752911627292633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18689009174704552, + "step": 9004 + }, + { + "epoch": 0.7505, + "grad_norm": 5.40625, + "grad_norm_var": 0.10621337890625, + "learning_rate": 3.5260268481726256e-05, + "loss": 4.9953, + "loss/crossentropy": 2.0769334733486176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100028581917286, + "step": 9006 + }, + { + "epoch": 0.7506666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.129296875, + "learning_rate": 3.5242399516781595e-05, + "loss": 4.5564, + "loss/crossentropy": 1.2513089552521706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14101386442780495, + "step": 9008 + }, + { + "epoch": 0.7508333333333334, + "grad_norm": 5.09375, + "grad_norm_var": 0.12589518229166666, + "learning_rate": 3.5224502812333694e-05, + "loss": 4.8856, + "loss/crossentropy": 1.1583551615476608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1508794203400612, + "step": 9010 + }, + { + "epoch": 0.751, + "grad_norm": 4.59375, + "grad_norm_var": 0.12643229166666667, + "learning_rate": 3.520657841254091e-05, + "loss": 4.6876, + "loss/crossentropy": 2.0375124514102936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591862082481384, + "step": 9012 + }, + { + "epoch": 0.7511666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.130712890625, + "learning_rate": 3.51886263616299e-05, + "loss": 4.9499, + "loss/crossentropy": 2.563260316848755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23242852464318275, + "step": 9014 + }, + { + "epoch": 0.7513333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.13368733723958334, + "learning_rate": 3.517064670389557e-05, + "loss": 5.2829, + "loss/crossentropy": 2.5851563215255737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215106524527073, + "step": 9016 + }, + { + "epoch": 0.7515, + "grad_norm": 5.1875, + "grad_norm_var": 0.12610677083333333, + "learning_rate": 3.5152639483700936e-05, + "loss": 5.1981, + "loss/crossentropy": 2.2794704139232635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125004678964615, + "step": 9018 + }, + { + "epoch": 0.7516666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.0740234375, + "learning_rate": 3.513460474547703e-05, + "loss": 5.3094, + "loss/crossentropy": 2.4865044355392456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22369953617453575, + "step": 9020 + }, + { + "epoch": 0.7518333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 0.05260009765625, + "learning_rate": 3.5116542533722775e-05, + "loss": 4.71, + "loss/crossentropy": 2.2633769810199738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011515162885189, + "step": 9022 + }, + { + "epoch": 0.752, + "grad_norm": 4.90625, + "grad_norm_var": 0.038525390625, + "learning_rate": 3.509845289300488e-05, + "loss": 4.6829, + "loss/crossentropy": 2.047989845275879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18051733635365963, + "step": 9024 + }, + { + "epoch": 0.7521666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.033036295572916666, + "learning_rate": 3.5080335867957744e-05, + "loss": 4.6169, + "loss/crossentropy": 1.6083292067050934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1622084677219391, + "step": 9026 + }, + { + "epoch": 0.7523333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.04763997395833333, + "learning_rate": 3.50621915032833e-05, + "loss": 4.2666, + "loss/crossentropy": 2.077538877725601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20892773941159248, + "step": 9028 + }, + { + "epoch": 0.7525, + "grad_norm": 4.3125, + "grad_norm_var": 0.059488932291666664, + "learning_rate": 3.5044019843751e-05, + "loss": 4.1963, + "loss/crossentropy": 2.139458805322647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20242808759212494, + "step": 9030 + }, + { + "epoch": 0.7526666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.061812337239583334, + "learning_rate": 3.502582093419758e-05, + "loss": 5.3637, + "loss/crossentropy": 2.3458738029003143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19767314568161964, + "step": 9032 + }, + { + "epoch": 0.7528333333333334, + "grad_norm": 4.3125, + "grad_norm_var": 0.06243489583333333, + "learning_rate": 3.5007594819527054e-05, + "loss": 5.1482, + "loss/crossentropy": 1.56951804459095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15344494953751564, + "step": 9034 + }, + { + "epoch": 0.753, + "grad_norm": 4.90625, + "grad_norm_var": 0.06510416666666667, + "learning_rate": 3.4989341544710543e-05, + "loss": 5.106, + "loss/crossentropy": 2.4358200430870056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194847881793976, + "step": 9036 + }, + { + "epoch": 0.7531666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.06614176432291667, + "learning_rate": 3.497106115478618e-05, + "loss": 4.6428, + "loss/crossentropy": 2.2978862822055817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043941207230091, + "step": 9038 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.0623046875, + "learning_rate": 3.495275369485902e-05, + "loss": 4.9244, + "loss/crossentropy": 2.148787200450897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20230147242546082, + "step": 9040 + }, + { + "epoch": 0.7535, + "grad_norm": 5.03125, + "grad_norm_var": 0.07042643229166666, + "learning_rate": 3.4934419210100906e-05, + "loss": 4.5074, + "loss/crossentropy": 1.943856105208397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838915403932333, + "step": 9042 + }, + { + "epoch": 0.7536666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.07066650390625, + "learning_rate": 3.491605774575034e-05, + "loss": 4.4208, + "loss/crossentropy": 1.7964168190956116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17760272696614265, + "step": 9044 + }, + { + "epoch": 0.7538333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.06513264973958334, + "learning_rate": 3.489766934711243e-05, + "loss": 4.7686, + "loss/crossentropy": 2.1065956354141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23584268614649773, + "step": 9046 + }, + { + "epoch": 0.754, + "grad_norm": 5.0625, + "grad_norm_var": 0.08222249348958334, + "learning_rate": 3.487925405955872e-05, + "loss": 5.1146, + "loss/crossentropy": 2.0449488013982773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19515445083379745, + "step": 9048 + }, + { + "epoch": 0.7541666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.06780192057291666, + "learning_rate": 3.486081192852708e-05, + "loss": 5.348, + "loss/crossentropy": 2.6749900579452515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22742577642202377, + "step": 9050 + }, + { + "epoch": 0.7543333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.06324462890625, + "learning_rate": 3.4842342999521644e-05, + "loss": 4.9745, + "loss/crossentropy": 1.704499438405037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16869292221963406, + "step": 9052 + }, + { + "epoch": 0.7545, + "grad_norm": 4.65625, + "grad_norm_var": 0.07303059895833333, + "learning_rate": 3.482384731811267e-05, + "loss": 4.5628, + "loss/crossentropy": 2.1640761494636536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1985640451312065, + "step": 9054 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.07727864583333334, + "learning_rate": 3.4805324929936394e-05, + "loss": 5.0844, + "loss/crossentropy": 2.241463929414749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031102292239666, + "step": 9056 + }, + { + "epoch": 0.7548333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.07125244140625, + "learning_rate": 3.478677588069499e-05, + "loss": 5.1096, + "loss/crossentropy": 2.2909657061100006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24316620454192162, + "step": 9058 + }, + { + "epoch": 0.755, + "grad_norm": 5.46875, + "grad_norm_var": 0.06946614583333334, + "learning_rate": 3.4768200216156374e-05, + "loss": 4.8016, + "loss/crossentropy": 1.831287831068039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837274581193924, + "step": 9060 + }, + { + "epoch": 0.7551666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.06702067057291666, + "learning_rate": 3.4749597982154166e-05, + "loss": 4.6735, + "loss/crossentropy": 1.2566091194748878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1334764687344432, + "step": 9062 + }, + { + "epoch": 0.7553333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.06536051432291666, + "learning_rate": 3.4730969224587525e-05, + "loss": 5.2192, + "loss/crossentropy": 1.172278493642807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15206410735845566, + "step": 9064 + }, + { + "epoch": 0.7555, + "grad_norm": 5.625, + "grad_norm_var": 0.10373942057291667, + "learning_rate": 3.471231398942105e-05, + "loss": 4.9221, + "loss/crossentropy": 1.4909002631902695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15898577868938446, + "step": 9066 + }, + { + "epoch": 0.7556666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.10611572265625, + "learning_rate": 3.469363232268469e-05, + "loss": 5.1231, + "loss/crossentropy": 2.3849256336688995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019842453300953, + "step": 9068 + }, + { + "epoch": 0.7558333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.09842122395833333, + "learning_rate": 3.4674924270473607e-05, + "loss": 4.3009, + "loss/crossentropy": 1.8146369010210037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19062471948564053, + "step": 9070 + }, + { + "epoch": 0.756, + "grad_norm": 4.96875, + "grad_norm_var": 0.09713134765625, + "learning_rate": 3.465618987894803e-05, + "loss": 4.413, + "loss/crossentropy": 2.3734322786331177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2367638796567917, + "step": 9072 + }, + { + "epoch": 0.7561666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.10637613932291666, + "learning_rate": 3.463742919433323e-05, + "loss": 5.1331, + "loss/crossentropy": 2.602075159549713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20435499399900436, + "step": 9074 + }, + { + "epoch": 0.7563333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.08162434895833333, + "learning_rate": 3.461864226291934e-05, + "loss": 4.7168, + "loss/crossentropy": 2.37225678563118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2264081910252571, + "step": 9076 + }, + { + "epoch": 0.7565, + "grad_norm": 4.5, + "grad_norm_var": 0.08435872395833334, + "learning_rate": 3.4599829131061225e-05, + "loss": 4.6292, + "loss/crossentropy": 1.7959840223193169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16916048899292946, + "step": 9078 + }, + { + "epoch": 0.7566666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.09055989583333333, + "learning_rate": 3.458098984517843e-05, + "loss": 4.7667, + "loss/crossentropy": 1.5935606062412262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16453899256885052, + "step": 9080 + }, + { + "epoch": 0.7568333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.0314453125, + "learning_rate": 3.456212445175502e-05, + "loss": 5.0477, + "loss/crossentropy": 2.24808931350708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21594270691275597, + "step": 9082 + }, + { + "epoch": 0.757, + "grad_norm": 4.59375, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 3.454323299733948e-05, + "loss": 4.6522, + "loss/crossentropy": 1.8796441107988358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952054239809513, + "step": 9084 + }, + { + "epoch": 0.7571666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.042252604166666666, + "learning_rate": 3.452431552854458e-05, + "loss": 4.597, + "loss/crossentropy": 1.292886197566986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13541504368185997, + "step": 9086 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 4.125, + "grad_norm_var": 0.04885660807291667, + "learning_rate": 3.450537209204731e-05, + "loss": 4.3493, + "loss/crossentropy": 1.193882331252098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13158982805907726, + "step": 9088 + }, + { + "epoch": 0.7575, + "grad_norm": 5.0, + "grad_norm_var": 0.07349853515625, + "learning_rate": 3.44864027345887e-05, + "loss": 4.9141, + "loss/crossentropy": 2.6352381110191345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736566931009293, + "step": 9090 + }, + { + "epoch": 0.7576666666666667, + "grad_norm": 4.9375, + "grad_norm_var": 0.07591145833333333, + "learning_rate": 3.446740750297378e-05, + "loss": 4.9487, + "loss/crossentropy": 2.3482372760772705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2141609936952591, + "step": 9092 + }, + { + "epoch": 0.7578333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.20416666666666666, + "learning_rate": 3.444838644407138e-05, + "loss": 5.0464, + "loss/crossentropy": 1.3707880079746246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13958771526813507, + "step": 9094 + }, + { + "epoch": 0.758, + "grad_norm": 5.125, + "grad_norm_var": 0.35090738932291665, + "learning_rate": 3.442933960481407e-05, + "loss": 5.0027, + "loss/crossentropy": 1.7247644662857056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777170617133379, + "step": 9096 + }, + { + "epoch": 0.7581666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.35113525390625, + "learning_rate": 3.441026703219803e-05, + "loss": 5.351, + "loss/crossentropy": 2.6863598823547363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22101808339357376, + "step": 9098 + }, + { + "epoch": 0.7583333333333333, + "grad_norm": 5.375, + "grad_norm_var": 0.35640869140625, + "learning_rate": 3.439116877328294e-05, + "loss": 4.6485, + "loss/crossentropy": 2.349628359079361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090320773422718, + "step": 9100 + }, + { + "epoch": 0.7585, + "grad_norm": 4.71875, + "grad_norm_var": 0.3270467122395833, + "learning_rate": 3.437204487519186e-05, + "loss": 5.0351, + "loss/crossentropy": 2.57798308134079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016715221107006, + "step": 9102 + }, + { + "epoch": 0.7586666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.27294514973958334, + "learning_rate": 3.435289538511111e-05, + "loss": 5.1272, + "loss/crossentropy": 2.202754706144333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18523726612329483, + "step": 9104 + }, + { + "epoch": 0.7588333333333334, + "grad_norm": 5.90625, + "grad_norm_var": 0.32711181640625, + "learning_rate": 3.433372035029015e-05, + "loss": 5.1368, + "loss/crossentropy": 2.47508043050766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21155068278312683, + "step": 9106 + }, + { + "epoch": 0.759, + "grad_norm": 4.84375, + "grad_norm_var": 0.329541015625, + "learning_rate": 3.4314519818041466e-05, + "loss": 5.2032, + "loss/crossentropy": 2.169353663921356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20374875143170357, + "step": 9108 + }, + { + "epoch": 0.7591666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.2648396809895833, + "learning_rate": 3.429529383574047e-05, + "loss": 4.7644, + "loss/crossentropy": 1.9236654192209244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17431189492344856, + "step": 9110 + }, + { + "epoch": 0.7593333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.128759765625, + "learning_rate": 3.4276042450825355e-05, + "loss": 4.6065, + "loss/crossentropy": 1.8178698271512985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19074127450585365, + "step": 9112 + }, + { + "epoch": 0.7595, + "grad_norm": 4.53125, + "grad_norm_var": 0.13720296223958334, + "learning_rate": 3.4256765710797006e-05, + "loss": 4.9505, + "loss/crossentropy": 1.6263808757066727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17589782178401947, + "step": 9114 + }, + { + "epoch": 0.7596666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.114453125, + "learning_rate": 3.4237463663218853e-05, + "loss": 4.7901, + "loss/crossentropy": 2.326656460762024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385944351553917, + "step": 9116 + }, + { + "epoch": 0.7598333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.12330729166666667, + "learning_rate": 3.42181363557168e-05, + "loss": 4.6892, + "loss/crossentropy": 2.12799334526062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20836078003048897, + "step": 9118 + }, + { + "epoch": 0.76, + "grad_norm": 5.0625, + "grad_norm_var": 0.12297770182291666, + "learning_rate": 3.4198783835979034e-05, + "loss": 5.1581, + "loss/crossentropy": 2.33326955139637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998424343764782, + "step": 9120 + }, + { + "epoch": 0.7601666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.04657796223958333, + "learning_rate": 3.417940615175599e-05, + "loss": 5.0946, + "loss/crossentropy": 1.7350738197565079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1729459259659052, + "step": 9122 + }, + { + "epoch": 0.7603333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.050679524739583336, + "learning_rate": 3.4160003350860176e-05, + "loss": 4.9605, + "loss/crossentropy": 2.4162497520446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20327191054821014, + "step": 9124 + }, + { + "epoch": 0.7605, + "grad_norm": 4.46875, + "grad_norm_var": 0.05579427083333333, + "learning_rate": 3.4140575481166066e-05, + "loss": 4.5611, + "loss/crossentropy": 1.4746525883674622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13860525004565716, + "step": 9126 + }, + { + "epoch": 0.7606666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.04638264973958333, + "learning_rate": 3.412112259061e-05, + "loss": 4.2153, + "loss/crossentropy": 1.289049193263054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16961247101426125, + "step": 9128 + }, + { + "epoch": 0.7608333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.0521484375, + "learning_rate": 3.410164472719005e-05, + "loss": 4.4881, + "loss/crossentropy": 1.420512616634369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15469906572252512, + "step": 9130 + }, + { + "epoch": 0.761, + "grad_norm": 4.875, + "grad_norm_var": 0.050390625, + "learning_rate": 3.4082141938965915e-05, + "loss": 5.0561, + "loss/crossentropy": 2.3881621956825256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22114987671375275, + "step": 9132 + }, + { + "epoch": 0.7611666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.0599609375, + "learning_rate": 3.406261427405878e-05, + "loss": 4.7353, + "loss/crossentropy": 2.0259829089045525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19267887063324451, + "step": 9134 + }, + { + "epoch": 0.7613333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.05152587890625, + "learning_rate": 3.404306178065121e-05, + "loss": 4.8248, + "loss/crossentropy": 2.682315409183502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208783358335495, + "step": 9136 + }, + { + "epoch": 0.7615, + "grad_norm": 4.5, + "grad_norm_var": 0.05089518229166667, + "learning_rate": 3.4023484506987064e-05, + "loss": 4.6531, + "loss/crossentropy": 2.019272468984127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19835746102035046, + "step": 9138 + }, + { + "epoch": 0.7616666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.04771728515625, + "learning_rate": 3.4003882501371296e-05, + "loss": 4.4325, + "loss/crossentropy": 1.8939252644777298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1691152397543192, + "step": 9140 + }, + { + "epoch": 0.7618333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.04503580729166667, + "learning_rate": 3.39842558121699e-05, + "loss": 4.8541, + "loss/crossentropy": 1.8951895460486412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793217770755291, + "step": 9142 + }, + { + "epoch": 0.762, + "grad_norm": 4.78125, + "grad_norm_var": 0.042704264322916664, + "learning_rate": 3.3964604487809806e-05, + "loss": 5.0357, + "loss/crossentropy": 1.9603685438632965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17539142072200775, + "step": 9144 + }, + { + "epoch": 0.7621666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.03372395833333333, + "learning_rate": 3.3944928576778694e-05, + "loss": 4.9157, + "loss/crossentropy": 1.379582405090332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1512586548924446, + "step": 9146 + }, + { + "epoch": 0.7623333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.0396484375, + "learning_rate": 3.39252281276249e-05, + "loss": 4.8829, + "loss/crossentropy": 1.871251530945301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19466626271605492, + "step": 9148 + }, + { + "epoch": 0.7625, + "grad_norm": 4.4375, + "grad_norm_var": 0.028369140625, + "learning_rate": 3.3905503188957354e-05, + "loss": 4.3252, + "loss/crossentropy": 1.522882029414177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1446057464927435, + "step": 9150 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.08661702473958334, + "learning_rate": 3.388575380944535e-05, + "loss": 5.5142, + "loss/crossentropy": 2.8795396983623505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21250687539577484, + "step": 9152 + }, + { + "epoch": 0.7628333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.08860677083333333, + "learning_rate": 3.386598003781855e-05, + "loss": 5.0395, + "loss/crossentropy": 2.2749998569488525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20907314494252205, + "step": 9154 + }, + { + "epoch": 0.763, + "grad_norm": 5.15625, + "grad_norm_var": 0.08435872395833334, + "learning_rate": 3.3846181922866746e-05, + "loss": 4.9776, + "loss/crossentropy": 1.7697783410549164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16960179060697556, + "step": 9156 + }, + { + "epoch": 0.7631666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.08255208333333333, + "learning_rate": 3.382635951343983e-05, + "loss": 4.7008, + "loss/crossentropy": 1.9313219785690308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19448504596948624, + "step": 9158 + }, + { + "epoch": 0.7633333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.07860921223958334, + "learning_rate": 3.3806512858447626e-05, + "loss": 4.5204, + "loss/crossentropy": 1.810696929693222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1649162843823433, + "step": 9160 + }, + { + "epoch": 0.7635, + "grad_norm": 4.59375, + "grad_norm_var": 0.07343343098958334, + "learning_rate": 3.378664200685978e-05, + "loss": 4.6922, + "loss/crossentropy": 1.2390378192067146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14889622665941715, + "step": 9162 + }, + { + "epoch": 0.7636666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.08101806640625, + "learning_rate": 3.376674700770564e-05, + "loss": 4.617, + "loss/crossentropy": 1.7813236862421036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18585302121937275, + "step": 9164 + }, + { + "epoch": 0.7638333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.07858072916666667, + "learning_rate": 3.3746827910074154e-05, + "loss": 4.4724, + "loss/crossentropy": 1.8114677891135216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18985752575099468, + "step": 9166 + }, + { + "epoch": 0.764, + "grad_norm": 4.53125, + "grad_norm_var": 0.039306640625, + "learning_rate": 3.3726884763113693e-05, + "loss": 5.1584, + "loss/crossentropy": 1.3699896410107613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.154366759583354, + "step": 9168 + }, + { + "epoch": 0.7641666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.07047119140625, + "learning_rate": 3.3706917616032e-05, + "loss": 5.6044, + "loss/crossentropy": 1.8340007662773132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330247655510902, + "step": 9170 + }, + { + "epoch": 0.7643333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.0587890625, + "learning_rate": 3.3686926518096026e-05, + "loss": 5.2916, + "loss/crossentropy": 2.0827116072177887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1957671195268631, + "step": 9172 + }, + { + "epoch": 0.7645, + "grad_norm": 4.28125, + "grad_norm_var": 0.08700764973958333, + "learning_rate": 3.366691151863182e-05, + "loss": 4.1972, + "loss/crossentropy": 1.7677662521600723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18560653924942017, + "step": 9174 + }, + { + "epoch": 0.7646666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.08938395182291667, + "learning_rate": 3.36468726670244e-05, + "loss": 4.7011, + "loss/crossentropy": 1.5455302894115448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1788640320301056, + "step": 9176 + }, + { + "epoch": 0.7648333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.10976155598958333, + "learning_rate": 3.3626810012717646e-05, + "loss": 5.3509, + "loss/crossentropy": 1.846191093325615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068962249904871, + "step": 9178 + }, + { + "epoch": 0.765, + "grad_norm": 4.59375, + "grad_norm_var": 0.11092122395833333, + "learning_rate": 3.360672360521415e-05, + "loss": 4.898, + "loss/crossentropy": 1.9306019470095634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1716877445578575, + "step": 9180 + }, + { + "epoch": 0.7651666666666667, + "grad_norm": 4.09375, + "grad_norm_var": 0.13440348307291666, + "learning_rate": 3.3586613494075135e-05, + "loss": 4.7658, + "loss/crossentropy": 2.1522144228219986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841568574309349, + "step": 9182 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.13489176432291666, + "learning_rate": 3.356647972892031e-05, + "loss": 4.8803, + "loss/crossentropy": 1.5114280879497528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15638689696788788, + "step": 9184 + }, + { + "epoch": 0.7655, + "grad_norm": 4.90625, + "grad_norm_var": 0.12597249348958334, + "learning_rate": 3.3546322359427726e-05, + "loss": 5.2088, + "loss/crossentropy": 1.538307212293148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17916510254144669, + "step": 9186 + }, + { + "epoch": 0.7656666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.13098551432291666, + "learning_rate": 3.3526141435333684e-05, + "loss": 5.1529, + "loss/crossentropy": 1.8003139421343803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915070228278637, + "step": 9188 + }, + { + "epoch": 0.7658333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.10657552083333334, + "learning_rate": 3.350593700643262e-05, + "loss": 5.0874, + "loss/crossentropy": 1.7107343226671219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16837585344910622, + "step": 9190 + }, + { + "epoch": 0.766, + "grad_norm": 4.5, + "grad_norm_var": 0.13522135416666667, + "learning_rate": 3.348570912257695e-05, + "loss": 4.7146, + "loss/crossentropy": 2.154672235250473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126365266740322, + "step": 9192 + }, + { + "epoch": 0.7661666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.12069905598958333, + "learning_rate": 3.346545783367697e-05, + "loss": 4.6693, + "loss/crossentropy": 2.692975878715515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21004782244563103, + "step": 9194 + }, + { + "epoch": 0.7663333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.11728108723958333, + "learning_rate": 3.3445183189700716e-05, + "loss": 4.913, + "loss/crossentropy": 1.548985317349434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15373594500124454, + "step": 9196 + }, + { + "epoch": 0.7665, + "grad_norm": 4.78125, + "grad_norm_var": 0.10428059895833333, + "learning_rate": 3.3424885240673866e-05, + "loss": 4.9349, + "loss/crossentropy": 1.9586029201745987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17618397064507008, + "step": 9198 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.09163004557291667, + "learning_rate": 3.340456403667958e-05, + "loss": 4.3688, + "loss/crossentropy": 1.9201075732707977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19281110912561417, + "step": 9200 + }, + { + "epoch": 0.7668333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.08001302083333334, + "learning_rate": 3.338421962785841e-05, + "loss": 4.7653, + "loss/crossentropy": 1.4313116371631622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1664351150393486, + "step": 9202 + }, + { + "epoch": 0.767, + "grad_norm": 4.96875, + "grad_norm_var": 0.07958577473958334, + "learning_rate": 3.3363852064408165e-05, + "loss": 5.2229, + "loss/crossentropy": 2.160913795232773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19118301942944527, + "step": 9204 + }, + { + "epoch": 0.7671666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.08670247395833333, + "learning_rate": 3.3343461396583784e-05, + "loss": 4.8138, + "loss/crossentropy": 1.6962665170431137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16171543672680855, + "step": 9206 + }, + { + "epoch": 0.7673333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.06145833333333333, + "learning_rate": 3.3323047674697224e-05, + "loss": 5.163, + "loss/crossentropy": 2.673032522201538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21397614479064941, + "step": 9208 + }, + { + "epoch": 0.7675, + "grad_norm": 4.78125, + "grad_norm_var": 0.06483968098958333, + "learning_rate": 3.330261094911729e-05, + "loss": 4.8433, + "loss/crossentropy": 1.8518261313438416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22120384871959686, + "step": 9210 + }, + { + "epoch": 0.7676666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.06728108723958333, + "learning_rate": 3.328215127026959e-05, + "loss": 4.8807, + "loss/crossentropy": 1.8135938048362732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18345564603805542, + "step": 9212 + }, + { + "epoch": 0.7678333333333334, + "grad_norm": 4.3125, + "grad_norm_var": 0.05364583333333333, + "learning_rate": 3.326166868863634e-05, + "loss": 4.4959, + "loss/crossentropy": 1.942594051361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999501317739487, + "step": 9214 + }, + { + "epoch": 0.768, + "grad_norm": 4.5625, + "grad_norm_var": 0.05480143229166667, + "learning_rate": 3.324116325475628e-05, + "loss": 4.7555, + "loss/crossentropy": 1.651905320584774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13776842784136534, + "step": 9216 + }, + { + "epoch": 0.7681666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.05709228515625, + "learning_rate": 3.322063501922453e-05, + "loss": 5.1112, + "loss/crossentropy": 2.402420222759247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454120218753815, + "step": 9218 + }, + { + "epoch": 0.7683333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.055582682291666664, + "learning_rate": 3.320008403269246e-05, + "loss": 4.7769, + "loss/crossentropy": 1.7973673362284899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15573907841462642, + "step": 9220 + }, + { + "epoch": 0.7685, + "grad_norm": 4.4375, + "grad_norm_var": 0.06456705729166666, + "learning_rate": 3.317951034586759e-05, + "loss": 4.6862, + "loss/crossentropy": 2.4214308857917786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21835685148835182, + "step": 9222 + }, + { + "epoch": 0.7686666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.05526936848958333, + "learning_rate": 3.315891400951346e-05, + "loss": 4.6974, + "loss/crossentropy": 1.9672070443630219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1876031868159771, + "step": 9224 + }, + { + "epoch": 0.7688333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.055338541666666664, + "learning_rate": 3.313829507444946e-05, + "loss": 4.8264, + "loss/crossentropy": 1.510596327483654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15548838302493095, + "step": 9226 + }, + { + "epoch": 0.769, + "grad_norm": 4.625, + "grad_norm_var": 0.07864176432291667, + "learning_rate": 3.311765359155079e-05, + "loss": 4.6752, + "loss/crossentropy": 2.789728343486786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263692542910576, + "step": 9228 + }, + { + "epoch": 0.7691666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.05896809895833333, + "learning_rate": 3.309698961174823e-05, + "loss": 4.2339, + "loss/crossentropy": 2.3551100194454193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22738180309534073, + "step": 9230 + }, + { + "epoch": 0.7693333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.058577473958333334, + "learning_rate": 3.307630318602811e-05, + "loss": 5.6364, + "loss/crossentropy": 2.2606790959835052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105814702808857, + "step": 9232 + }, + { + "epoch": 0.7695, + "grad_norm": 4.4375, + "grad_norm_var": 0.062890625, + "learning_rate": 3.3055594365432124e-05, + "loss": 4.6461, + "loss/crossentropy": 1.849206268787384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17526361718773842, + "step": 9234 + }, + { + "epoch": 0.7696666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.074853515625, + "learning_rate": 3.303486320105724e-05, + "loss": 4.9883, + "loss/crossentropy": 1.5751914456486702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15906815975904465, + "step": 9236 + }, + { + "epoch": 0.7698333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.06951497395833334, + "learning_rate": 3.3014109744055524e-05, + "loss": 5.1432, + "loss/crossentropy": 1.3298326507210732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15809830278158188, + "step": 9238 + }, + { + "epoch": 0.77, + "grad_norm": 5.0, + "grad_norm_var": 0.06217041015625, + "learning_rate": 3.29933340456341e-05, + "loss": 5.0645, + "loss/crossentropy": 1.6231756582856178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22281446680426598, + "step": 9240 + }, + { + "epoch": 0.7701666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.06326497395833333, + "learning_rate": 3.29725361570549e-05, + "loss": 5.0422, + "loss/crossentropy": 2.274557799100876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104065828025341, + "step": 9242 + }, + { + "epoch": 0.7703333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.04876302083333333, + "learning_rate": 3.2951716129634675e-05, + "loss": 5.0096, + "loss/crossentropy": 1.4321745932102203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16160675697028637, + "step": 9244 + }, + { + "epoch": 0.7705, + "grad_norm": 4.5, + "grad_norm_var": 0.049609375, + "learning_rate": 3.293087401474476e-05, + "loss": 4.3341, + "loss/crossentropy": 1.8292163461446762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17593476921319962, + "step": 9246 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.04986979166666667, + "learning_rate": 3.291000986381101e-05, + "loss": 5.4821, + "loss/crossentropy": 2.490095376968384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170156165957451, + "step": 9248 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 5.03125, + "grad_norm_var": 0.04694010416666667, + "learning_rate": 3.288912372831364e-05, + "loss": 5.1315, + "loss/crossentropy": 2.3378700017929077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23920363187789917, + "step": 9250 + }, + { + "epoch": 0.771, + "grad_norm": 4.96875, + "grad_norm_var": 0.035791015625, + "learning_rate": 3.286821565978711e-05, + "loss": 5.3335, + "loss/crossentropy": 2.1893119513988495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198990032076836, + "step": 9252 + }, + { + "epoch": 0.7711666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.0548828125, + "learning_rate": 3.284728570982e-05, + "loss": 4.9941, + "loss/crossentropy": 1.962324395775795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17419682629406452, + "step": 9254 + }, + { + "epoch": 0.7713333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.0517578125, + "learning_rate": 3.282633393005489e-05, + "loss": 5.0291, + "loss/crossentropy": 1.6728404238820076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20481682196259499, + "step": 9256 + }, + { + "epoch": 0.7715, + "grad_norm": 4.59375, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 3.28053603721882e-05, + "loss": 5.759, + "loss/crossentropy": 1.7875987961888313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16401218622922897, + "step": 9258 + }, + { + "epoch": 0.7716666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.05237223307291667, + "learning_rate": 3.278436508797011e-05, + "loss": 4.9107, + "loss/crossentropy": 2.0292908400297165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.175339225679636, + "step": 9260 + }, + { + "epoch": 0.7718333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.04108072916666667, + "learning_rate": 3.2763348129204396e-05, + "loss": 4.749, + "loss/crossentropy": 1.8519446104764938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16933886520564556, + "step": 9262 + }, + { + "epoch": 0.772, + "grad_norm": 4.65625, + "grad_norm_var": 0.05286051432291667, + "learning_rate": 3.2742309547748314e-05, + "loss": 5.0763, + "loss/crossentropy": 2.253142923116684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19896778464317322, + "step": 9264 + }, + { + "epoch": 0.7721666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.06002604166666667, + "learning_rate": 3.272124939551247e-05, + "loss": 5.3582, + "loss/crossentropy": 2.13630610704422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20657182484865189, + "step": 9266 + }, + { + "epoch": 0.7723333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.053238932291666666, + "learning_rate": 3.2700167724460685e-05, + "loss": 4.8069, + "loss/crossentropy": 1.8119681552052498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17984369210898876, + "step": 9268 + }, + { + "epoch": 0.7725, + "grad_norm": 4.875, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 3.26790645866099e-05, + "loss": 5.1939, + "loss/crossentropy": 2.4118016362190247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21238981559872627, + "step": 9270 + }, + { + "epoch": 0.7726666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.04440104166666667, + "learning_rate": 3.265794003403002e-05, + "loss": 4.5605, + "loss/crossentropy": 2.2320240437984467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951523795723915, + "step": 9272 + }, + { + "epoch": 0.7728333333333334, + "grad_norm": 6.375, + "grad_norm_var": 0.18730061848958332, + "learning_rate": 3.263679411884375e-05, + "loss": 5.361, + "loss/crossentropy": 1.4449108317494392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16431713104248047, + "step": 9274 + }, + { + "epoch": 0.773, + "grad_norm": 4.875, + "grad_norm_var": 0.18987223307291667, + "learning_rate": 3.2615626893226564e-05, + "loss": 5.3566, + "loss/crossentropy": 2.320689380168915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18379362672567368, + "step": 9276 + }, + { + "epoch": 0.7731666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.19381510416666667, + "learning_rate": 3.2594438409406475e-05, + "loss": 5.102, + "loss/crossentropy": 1.8211688697338104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20534205064177513, + "step": 9278 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.19495035807291666, + "learning_rate": 3.2573228719663944e-05, + "loss": 5.5899, + "loss/crossentropy": 2.3344379365444183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23429518193006516, + "step": 9280 + }, + { + "epoch": 0.7735, + "grad_norm": 4.71875, + "grad_norm_var": 0.20771077473958333, + "learning_rate": 3.2551997876331805e-05, + "loss": 4.9082, + "loss/crossentropy": 1.7352554872632027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16921756975352764, + "step": 9282 + }, + { + "epoch": 0.7736666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.21252848307291666, + "learning_rate": 3.253074593179502e-05, + "loss": 4.9955, + "loss/crossentropy": 2.216016709804535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066405951976776, + "step": 9284 + }, + { + "epoch": 0.7738333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.22278238932291666, + "learning_rate": 3.2509472938490674e-05, + "loss": 4.9624, + "loss/crossentropy": 1.9695368334650993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958236563950777, + "step": 9286 + }, + { + "epoch": 0.774, + "grad_norm": 4.9375, + "grad_norm_var": 0.23668212890625, + "learning_rate": 3.2488178948907746e-05, + "loss": 4.6204, + "loss/crossentropy": 1.8463789224624634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17317039147019386, + "step": 9288 + }, + { + "epoch": 0.7741666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.07766927083333333, + "learning_rate": 3.2466864015587054e-05, + "loss": 4.4168, + "loss/crossentropy": 1.736086145043373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16505318135023117, + "step": 9290 + }, + { + "epoch": 0.7743333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.07763264973958334, + "learning_rate": 3.244552819112107e-05, + "loss": 4.71, + "loss/crossentropy": 1.9710333943367004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19168449193239212, + "step": 9292 + }, + { + "epoch": 0.7745, + "grad_norm": 4.71875, + "grad_norm_var": 0.057145182291666666, + "learning_rate": 3.242417152815381e-05, + "loss": 5.183, + "loss/crossentropy": 2.472516894340515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013029269874096, + "step": 9294 + }, + { + "epoch": 0.7746666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.04885660807291667, + "learning_rate": 3.240279407938074e-05, + "loss": 5.2434, + "loss/crossentropy": 2.4441158175468445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029016651213169, + "step": 9296 + }, + { + "epoch": 0.7748333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 0.0490234375, + "learning_rate": 3.2381395897548563e-05, + "loss": 5.1106, + "loss/crossentropy": 1.6266438364982605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16166551038622856, + "step": 9298 + }, + { + "epoch": 0.775, + "grad_norm": 4.71875, + "grad_norm_var": 0.06200764973958333, + "learning_rate": 3.2359977035455185e-05, + "loss": 4.833, + "loss/crossentropy": 1.9613151028752327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19063337706029415, + "step": 9300 + }, + { + "epoch": 0.7751666666666667, + "grad_norm": 6.0, + "grad_norm_var": 0.158203125, + "learning_rate": 3.233853754594951e-05, + "loss": 5.113, + "loss/crossentropy": 2.270957499742508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21738599985837936, + "step": 9302 + }, + { + "epoch": 0.7753333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.148828125, + "learning_rate": 3.2317077481931355e-05, + "loss": 4.8684, + "loss/crossentropy": 1.7900439202785492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20083611086010933, + "step": 9304 + }, + { + "epoch": 0.7755, + "grad_norm": 5.0625, + "grad_norm_var": 0.1513671875, + "learning_rate": 3.229559689635129e-05, + "loss": 4.8177, + "loss/crossentropy": 1.4736758545041084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13412130996584892, + "step": 9306 + }, + { + "epoch": 0.7756666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.186572265625, + "learning_rate": 3.227409584221052e-05, + "loss": 4.3715, + "loss/crossentropy": 1.3929030001163483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14130380935966969, + "step": 9308 + }, + { + "epoch": 0.7758333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.19280192057291667, + "learning_rate": 3.225257437256076e-05, + "loss": 5.3773, + "loss/crossentropy": 2.001873791217804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029735241085291, + "step": 9310 + }, + { + "epoch": 0.776, + "grad_norm": 4.625, + "grad_norm_var": 0.18925374348958332, + "learning_rate": 3.22310325405041e-05, + "loss": 5.4514, + "loss/crossentropy": 2.3293404579162598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19467134773731232, + "step": 9312 + }, + { + "epoch": 0.7761666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.18466389973958333, + "learning_rate": 3.220947039919288e-05, + "loss": 4.6479, + "loss/crossentropy": 1.5509610995650291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20042119547724724, + "step": 9314 + }, + { + "epoch": 0.7763333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.19384358723958334, + "learning_rate": 3.218788800182952e-05, + "loss": 4.2091, + "loss/crossentropy": 1.2999482825398445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1503276266157627, + "step": 9316 + }, + { + "epoch": 0.7765, + "grad_norm": 5.5, + "grad_norm_var": 0.12343343098958333, + "learning_rate": 3.216628540166645e-05, + "loss": 4.8884, + "loss/crossentropy": 1.7634951025247574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17525204457342625, + "step": 9318 + }, + { + "epoch": 0.7766666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.10983072916666667, + "learning_rate": 3.214466265200595e-05, + "loss": 5.2293, + "loss/crossentropy": 2.389511853456497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21278628706932068, + "step": 9320 + }, + { + "epoch": 0.7768333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.10089518229166666, + "learning_rate": 3.212301980619998e-05, + "loss": 4.6147, + "loss/crossentropy": 2.306184262037277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19868546724319458, + "step": 9322 + }, + { + "epoch": 0.777, + "grad_norm": 4.375, + "grad_norm_var": 0.08619791666666667, + "learning_rate": 3.210135691765012e-05, + "loss": 4.4001, + "loss/crossentropy": 1.817717507481575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16615832969546318, + "step": 9324 + }, + { + "epoch": 0.7771666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.09478759765625, + "learning_rate": 3.2079674039807404e-05, + "loss": 4.6573, + "loss/crossentropy": 1.8692854642868042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17973013408482075, + "step": 9326 + }, + { + "epoch": 0.7773333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.09153238932291667, + "learning_rate": 3.2057971226172174e-05, + "loss": 5.1534, + "loss/crossentropy": 2.0862750113010406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20505443587899208, + "step": 9328 + }, + { + "epoch": 0.7775, + "grad_norm": 4.59375, + "grad_norm_var": 0.08401285807291667, + "learning_rate": 3.203624853029396e-05, + "loss": 5.1276, + "loss/crossentropy": 2.589634597301483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21999847516417503, + "step": 9330 + }, + { + "epoch": 0.7776666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.08013916015625, + "learning_rate": 3.2014506005771364e-05, + "loss": 4.9304, + "loss/crossentropy": 2.1367976665496826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22664149850606918, + "step": 9332 + }, + { + "epoch": 0.7778333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 3.199274370625189e-05, + "loss": 4.6075, + "loss/crossentropy": 1.7962630540132523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17372861318290234, + "step": 9334 + }, + { + "epoch": 0.778, + "grad_norm": 4.34375, + "grad_norm_var": 0.04996337890625, + "learning_rate": 3.197096168543186e-05, + "loss": 4.6964, + "loss/crossentropy": 1.823552280664444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19311099871993065, + "step": 9336 + }, + { + "epoch": 0.7781666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.04934895833333333, + "learning_rate": 3.1949159997056235e-05, + "loss": 4.4811, + "loss/crossentropy": 1.6573946326971054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16165916807949543, + "step": 9338 + }, + { + "epoch": 0.7783333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.05517171223958333, + "learning_rate": 3.192733869491853e-05, + "loss": 4.574, + "loss/crossentropy": 2.3132286369800568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21056906878948212, + "step": 9340 + }, + { + "epoch": 0.7785, + "grad_norm": 4.6875, + "grad_norm_var": 0.04735921223958333, + "learning_rate": 3.190549783286062e-05, + "loss": 4.2814, + "loss/crossentropy": 1.885126568377018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18343673273921013, + "step": 9342 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.042708333333333334, + "learning_rate": 3.1883637464772665e-05, + "loss": 4.2948, + "loss/crossentropy": 1.6604382917284966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1493774987757206, + "step": 9344 + }, + { + "epoch": 0.7788333333333334, + "grad_norm": 5.46875, + "grad_norm_var": 0.08863525390625, + "learning_rate": 3.1861757644592963e-05, + "loss": 5.6045, + "loss/crossentropy": 2.4773399233818054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180468514561653, + "step": 9346 + }, + { + "epoch": 0.779, + "grad_norm": 4.8125, + "grad_norm_var": 0.08704020182291666, + "learning_rate": 3.1839858426307784e-05, + "loss": 4.8078, + "loss/crossentropy": 2.260939121246338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197475366294384, + "step": 9348 + }, + { + "epoch": 0.7791666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.07862955729166667, + "learning_rate": 3.1817939863951284e-05, + "loss": 4.7583, + "loss/crossentropy": 1.5021531581878662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16176176443696022, + "step": 9350 + }, + { + "epoch": 0.7793333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.08121337890625, + "learning_rate": 3.179600201160532e-05, + "loss": 4.8546, + "loss/crossentropy": 1.4230839125812054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14267605170607567, + "step": 9352 + }, + { + "epoch": 0.7795, + "grad_norm": 4.625, + "grad_norm_var": 0.091259765625, + "learning_rate": 3.177404492339937e-05, + "loss": 5.7562, + "loss/crossentropy": 2.712648332118988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741987600922585, + "step": 9354 + }, + { + "epoch": 0.7796666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.08375244140625, + "learning_rate": 3.175206865351038e-05, + "loss": 4.9446, + "loss/crossentropy": 2.2300324141979218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21562551707029343, + "step": 9356 + }, + { + "epoch": 0.7798333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.08150634765625, + "learning_rate": 3.173007325616258e-05, + "loss": 4.9502, + "loss/crossentropy": 2.4897924661636353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22505389899015427, + "step": 9358 + }, + { + "epoch": 0.78, + "grad_norm": 4.6875, + "grad_norm_var": 0.09602457682291667, + "learning_rate": 3.170805878562745e-05, + "loss": 5.3735, + "loss/crossentropy": 1.269807867705822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1314986441284418, + "step": 9360 + }, + { + "epoch": 0.7801666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.07063395182291667, + "learning_rate": 3.1686025296223505e-05, + "loss": 4.8595, + "loss/crossentropy": 1.7563334554433823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16725478693842888, + "step": 9362 + }, + { + "epoch": 0.7803333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.069140625, + "learning_rate": 3.166397284231618e-05, + "loss": 4.3453, + "loss/crossentropy": 2.3237995505332947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043895721435547, + "step": 9364 + }, + { + "epoch": 0.7805, + "grad_norm": 4.59375, + "grad_norm_var": 0.06842041015625, + "learning_rate": 3.1641901478317725e-05, + "loss": 4.8902, + "loss/crossentropy": 2.6030715703964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23151734843850136, + "step": 9366 + }, + { + "epoch": 0.7806666666666666, + "grad_norm": 5.375, + "grad_norm_var": 0.07511393229166667, + "learning_rate": 3.1619811258687035e-05, + "loss": 4.9789, + "loss/crossentropy": 1.7157761678099632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061714269220829, + "step": 9368 + }, + { + "epoch": 0.7808333333333334, + "grad_norm": 5.125, + "grad_norm_var": 0.07776285807291666, + "learning_rate": 3.159770223792952e-05, + "loss": 4.9378, + "loss/crossentropy": 2.217753827571869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20972023904323578, + "step": 9370 + }, + { + "epoch": 0.781, + "grad_norm": 4.3125, + "grad_norm_var": 0.09915364583333333, + "learning_rate": 3.1575574470596996e-05, + "loss": 4.2703, + "loss/crossentropy": 0.9079168289899826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10304674133658409, + "step": 9372 + }, + { + "epoch": 0.7811666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.09931233723958334, + "learning_rate": 3.155342801128754e-05, + "loss": 4.9554, + "loss/crossentropy": 1.7163361012935638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16222969815135002, + "step": 9374 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.09996337890625, + "learning_rate": 3.153126291464533e-05, + "loss": 4.9338, + "loss/crossentropy": 1.8250629603862762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23775455728173256, + "step": 9376 + }, + { + "epoch": 0.7815, + "grad_norm": 4.9375, + "grad_norm_var": 0.08956705729166667, + "learning_rate": 3.1509079235360534e-05, + "loss": 4.9173, + "loss/crossentropy": 2.315726935863495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20576253160834312, + "step": 9378 + }, + { + "epoch": 0.7816666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.08209635416666666, + "learning_rate": 3.1486877028169174e-05, + "loss": 4.8982, + "loss/crossentropy": 1.8110148012638092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19745264574885368, + "step": 9380 + }, + { + "epoch": 0.7818333333333334, + "grad_norm": 6.15625, + "grad_norm_var": 0.20013020833333334, + "learning_rate": 3.146465634785301e-05, + "loss": 4.7892, + "loss/crossentropy": 1.715107500553131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18671299517154694, + "step": 9382 + }, + { + "epoch": 0.782, + "grad_norm": 4.9375, + "grad_norm_var": 0.18372395833333333, + "learning_rate": 3.144241724923934e-05, + "loss": 4.0597, + "loss/crossentropy": 2.425957441329956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160644568502903, + "step": 9384 + }, + { + "epoch": 0.7821666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.18396809895833333, + "learning_rate": 3.1420159787200934e-05, + "loss": 5.1833, + "loss/crossentropy": 1.9202543646097183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16482832096517086, + "step": 9386 + }, + { + "epoch": 0.7823333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.170556640625, + "learning_rate": 3.1397884016655876e-05, + "loss": 4.7629, + "loss/crossentropy": 2.2666742503643036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014549858868122, + "step": 9388 + }, + { + "epoch": 0.7825, + "grad_norm": 4.96875, + "grad_norm_var": 0.17939046223958333, + "learning_rate": 3.13755899925674e-05, + "loss": 4.6324, + "loss/crossentropy": 1.1317738816142082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14520836994051933, + "step": 9390 + }, + { + "epoch": 0.7826666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.18013916015625, + "learning_rate": 3.1353277769943815e-05, + "loss": 4.8204, + "loss/crossentropy": 1.505985789000988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1771007440984249, + "step": 9392 + }, + { + "epoch": 0.7828333333333334, + "grad_norm": 4.15625, + "grad_norm_var": 0.225390625, + "learning_rate": 3.133094740383829e-05, + "loss": 3.7728, + "loss/crossentropy": 1.7403645440936089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842222698032856, + "step": 9394 + }, + { + "epoch": 0.783, + "grad_norm": 4.4375, + "grad_norm_var": 0.23824462890625, + "learning_rate": 3.1308598949348796e-05, + "loss": 4.7321, + "loss/crossentropy": 1.2648060396313667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13886748626828194, + "step": 9396 + }, + { + "epoch": 0.7831666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.1080078125, + "learning_rate": 3.1286232461617926e-05, + "loss": 4.4106, + "loss/crossentropy": 2.309142082929611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22487016394734383, + "step": 9398 + }, + { + "epoch": 0.7833333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.09801025390625, + "learning_rate": 3.1263847995832755e-05, + "loss": 4.8193, + "loss/crossentropy": 2.0457848384976387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1811012402176857, + "step": 9400 + }, + { + "epoch": 0.7835, + "grad_norm": 4.84375, + "grad_norm_var": 0.07929280598958334, + "learning_rate": 3.124144560722473e-05, + "loss": 4.9667, + "loss/crossentropy": 2.0342873632907867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22702273726463318, + "step": 9402 + }, + { + "epoch": 0.7836666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.07317708333333334, + "learning_rate": 3.1219025351069524e-05, + "loss": 4.3217, + "loss/crossentropy": 1.3957015573978424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.142100278288126, + "step": 9404 + }, + { + "epoch": 0.7838333333333334, + "grad_norm": 5.28125, + "grad_norm_var": 0.08587239583333334, + "learning_rate": 3.119658728268689e-05, + "loss": 4.8953, + "loss/crossentropy": 1.9280966818332672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1804770566523075, + "step": 9406 + }, + { + "epoch": 0.784, + "grad_norm": 4.34375, + "grad_norm_var": 0.09166666666666666, + "learning_rate": 3.1174131457440524e-05, + "loss": 5.0648, + "loss/crossentropy": 2.139180600643158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20616818591952324, + "step": 9408 + }, + { + "epoch": 0.7841666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.055497233072916666, + "learning_rate": 3.115165793073795e-05, + "loss": 5.0601, + "loss/crossentropy": 2.2152554094791412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19830195233225822, + "step": 9410 + }, + { + "epoch": 0.7843333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.04425455729166667, + "learning_rate": 3.1129166758030344e-05, + "loss": 4.9589, + "loss/crossentropy": 1.9690501242876053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18640769086778164, + "step": 9412 + }, + { + "epoch": 0.7845, + "grad_norm": 4.625, + "grad_norm_var": 0.041910807291666664, + "learning_rate": 3.110665799481246e-05, + "loss": 5.2133, + "loss/crossentropy": 2.339284062385559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21954040229320526, + "step": 9414 + }, + { + "epoch": 0.7846666666666666, + "grad_norm": 5.0, + "grad_norm_var": 0.05175374348958333, + "learning_rate": 3.1084131696622435e-05, + "loss": 5.2811, + "loss/crossentropy": 1.925955355167389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19665737450122833, + "step": 9416 + }, + { + "epoch": 0.7848333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.052978515625, + "learning_rate": 3.106158791904164e-05, + "loss": 5.0989, + "loss/crossentropy": 1.8825834766030312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18833110481500626, + "step": 9418 + }, + { + "epoch": 0.785, + "grad_norm": 4.625, + "grad_norm_var": 0.05054931640625, + "learning_rate": 3.103902671769465e-05, + "loss": 4.9473, + "loss/crossentropy": 2.5766605734825134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20159252732992172, + "step": 9420 + }, + { + "epoch": 0.7851666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.035791015625, + "learning_rate": 3.1016448148248955e-05, + "loss": 5.0349, + "loss/crossentropy": 1.7484197169542313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18348832800984383, + "step": 9422 + }, + { + "epoch": 0.7853333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.02877197265625, + "learning_rate": 3.099385226641493e-05, + "loss": 4.5261, + "loss/crossentropy": 1.3600659668445587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14355047047138214, + "step": 9424 + }, + { + "epoch": 0.7855, + "grad_norm": 4.625, + "grad_norm_var": 0.0302734375, + "learning_rate": 3.097123912794569e-05, + "loss": 5.0282, + "loss/crossentropy": 2.190640449523926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21116185933351517, + "step": 9426 + }, + { + "epoch": 0.7856666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.033919270833333334, + "learning_rate": 3.0948608788636875e-05, + "loss": 5.4235, + "loss/crossentropy": 2.3429831862449646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19852236658334732, + "step": 9428 + }, + { + "epoch": 0.7858333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.03362223307291667, + "learning_rate": 3.0925961304326634e-05, + "loss": 4.8775, + "loss/crossentropy": 2.2385424375534058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20526361465454102, + "step": 9430 + }, + { + "epoch": 0.786, + "grad_norm": 4.6875, + "grad_norm_var": 0.023681640625, + "learning_rate": 3.0903296730895354e-05, + "loss": 4.6021, + "loss/crossentropy": 1.8072471469640732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18658201955258846, + "step": 9432 + }, + { + "epoch": 0.7861666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.026025390625, + "learning_rate": 3.088061512426563e-05, + "loss": 4.7022, + "loss/crossentropy": 2.5440629720687866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039373368024826, + "step": 9434 + }, + { + "epoch": 0.7863333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.0326171875, + "learning_rate": 3.085791654040206e-05, + "loss": 5.3461, + "loss/crossentropy": 1.874840334057808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19369103014469147, + "step": 9436 + }, + { + "epoch": 0.7865, + "grad_norm": 5.21875, + "grad_norm_var": 0.04568684895833333, + "learning_rate": 3.083520103531115e-05, + "loss": 4.5034, + "loss/crossentropy": 2.5266154408454895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19745132699608803, + "step": 9438 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.04784749348958333, + "learning_rate": 3.0812468665041165e-05, + "loss": 5.1865, + "loss/crossentropy": 1.9486228823661804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20566469430923462, + "step": 9440 + }, + { + "epoch": 0.7868333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.051981608072916664, + "learning_rate": 3.078971948568195e-05, + "loss": 5.0026, + "loss/crossentropy": 2.0742533802986145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22526142373681068, + "step": 9442 + }, + { + "epoch": 0.787, + "grad_norm": 5.125, + "grad_norm_var": 0.067578125, + "learning_rate": 3.076695355336486e-05, + "loss": 4.9311, + "loss/crossentropy": 1.456095166504383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14632639847695827, + "step": 9444 + }, + { + "epoch": 0.7871666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.08843994140625, + "learning_rate": 3.0744170924262546e-05, + "loss": 4.2479, + "loss/crossentropy": 1.304304301738739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14765658788383007, + "step": 9446 + }, + { + "epoch": 0.7873333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.092822265625, + "learning_rate": 3.072137165458891e-05, + "loss": 4.3201, + "loss/crossentropy": 1.4218315780162811, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14105146750807762, + "step": 9448 + }, + { + "epoch": 0.7875, + "grad_norm": 4.625, + "grad_norm_var": 0.087744140625, + "learning_rate": 3.069855580059885e-05, + "loss": 5.2229, + "loss/crossentropy": 1.6259961053729057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994849592447281, + "step": 9450 + }, + { + "epoch": 0.7876666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.09256184895833333, + "learning_rate": 3.067572341858825e-05, + "loss": 5.2859, + "loss/crossentropy": 2.0877254605293274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24095411598682404, + "step": 9452 + }, + { + "epoch": 0.7878333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.07789306640625, + "learning_rate": 3.065287456489372e-05, + "loss": 4.8259, + "loss/crossentropy": 0.8618222922086716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12632916308939457, + "step": 9454 + }, + { + "epoch": 0.788, + "grad_norm": 4.625, + "grad_norm_var": 0.06678059895833334, + "learning_rate": 3.063000929589255e-05, + "loss": 4.6287, + "loss/crossentropy": 2.2364018261432648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24320093169808388, + "step": 9456 + }, + { + "epoch": 0.7881666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.07047119140625, + "learning_rate": 3.0607127668002506e-05, + "loss": 5.0525, + "loss/crossentropy": 2.4567251205444336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22456028684973717, + "step": 9458 + }, + { + "epoch": 0.7883333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.07724202473958333, + "learning_rate": 3.058422973768175e-05, + "loss": 5.3273, + "loss/crossentropy": 1.9736377596855164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19915513694286346, + "step": 9460 + }, + { + "epoch": 0.7885, + "grad_norm": 4.40625, + "grad_norm_var": 0.06350504557291667, + "learning_rate": 3.056131556142861e-05, + "loss": 4.8735, + "loss/crossentropy": 1.8944967985153198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18105413764715195, + "step": 9462 + }, + { + "epoch": 0.7886666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.059488932291666664, + "learning_rate": 3.0538385195781594e-05, + "loss": 4.5573, + "loss/crossentropy": 1.660656489431858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15973762422800064, + "step": 9464 + }, + { + "epoch": 0.7888333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.06197916666666667, + "learning_rate": 3.051543869731905e-05, + "loss": 5.3056, + "loss/crossentropy": 2.2200306951999664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009948380291462, + "step": 9466 + }, + { + "epoch": 0.789, + "grad_norm": 4.875, + "grad_norm_var": 0.05468343098958333, + "learning_rate": 3.04924761226592e-05, + "loss": 4.9293, + "loss/crossentropy": 1.854020357131958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18061123974621296, + "step": 9468 + }, + { + "epoch": 0.7891666666666667, + "grad_norm": 4.09375, + "grad_norm_var": 0.09368489583333334, + "learning_rate": 3.0469497528459924e-05, + "loss": 4.5673, + "loss/crossentropy": 1.4358838349580765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1429310292005539, + "step": 9470 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.09322509765625, + "learning_rate": 3.0446502971418607e-05, + "loss": 4.8726, + "loss/crossentropy": 1.8105507493019104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17184137552976608, + "step": 9472 + }, + { + "epoch": 0.7895, + "grad_norm": 4.59375, + "grad_norm_var": 0.20871988932291666, + "learning_rate": 3.0423492508272036e-05, + "loss": 4.8678, + "loss/crossentropy": 1.4386665895581245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14651955105364323, + "step": 9474 + }, + { + "epoch": 0.7896666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.18596598307291667, + "learning_rate": 3.0400466195796238e-05, + "loss": 5.3026, + "loss/crossentropy": 2.505110263824463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073451578617096, + "step": 9476 + }, + { + "epoch": 0.7898333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.18605143229166668, + "learning_rate": 3.037742409080636e-05, + "loss": 5.1978, + "loss/crossentropy": 1.93652855604887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220738735049963, + "step": 9478 + }, + { + "epoch": 0.79, + "grad_norm": 5.0625, + "grad_norm_var": 0.19530843098958334, + "learning_rate": 3.035436625015649e-05, + "loss": 4.8821, + "loss/crossentropy": 1.0793606489896774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1280514094978571, + "step": 9480 + }, + { + "epoch": 0.7901666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.19407552083333332, + "learning_rate": 3.0331292730739583e-05, + "loss": 4.8433, + "loss/crossentropy": 1.9875987321138382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17475885152816772, + "step": 9482 + }, + { + "epoch": 0.7903333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.19924723307291667, + "learning_rate": 3.030820358948722e-05, + "loss": 4.6097, + "loss/crossentropy": 2.2878986299037933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20894601568579674, + "step": 9484 + }, + { + "epoch": 0.7905, + "grad_norm": 4.4375, + "grad_norm_var": 0.16497395833333334, + "learning_rate": 3.0285098883369587e-05, + "loss": 4.6595, + "loss/crossentropy": 1.6879505664110184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16620928049087524, + "step": 9486 + }, + { + "epoch": 0.7906666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.16573893229166667, + "learning_rate": 3.0261978669395246e-05, + "loss": 4.6243, + "loss/crossentropy": 1.9199838414788246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18426301330327988, + "step": 9488 + }, + { + "epoch": 0.7908333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.042252604166666666, + "learning_rate": 3.0238843004611014e-05, + "loss": 5.0213, + "loss/crossentropy": 2.375735104084015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2256057783961296, + "step": 9490 + }, + { + "epoch": 0.791, + "grad_norm": 4.84375, + "grad_norm_var": 0.059098307291666666, + "learning_rate": 3.0215691946101865e-05, + "loss": 5.7123, + "loss/crossentropy": 2.1991631910204887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18081208877265453, + "step": 9492 + }, + { + "epoch": 0.7911666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.06379801432291667, + "learning_rate": 3.0192525550990715e-05, + "loss": 4.9129, + "loss/crossentropy": 1.7236268892884254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19379206374287605, + "step": 9494 + }, + { + "epoch": 0.7913333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 3.0169343876438354e-05, + "loss": 4.6993, + "loss/crossentropy": 2.3961364030838013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21208176389336586, + "step": 9496 + }, + { + "epoch": 0.7915, + "grad_norm": 4.28125, + "grad_norm_var": 0.06428629557291667, + "learning_rate": 3.0146146979643248e-05, + "loss": 4.5953, + "loss/crossentropy": 1.9669974148273468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18356753140687943, + "step": 9498 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.07668863932291667, + "learning_rate": 3.012293491784144e-05, + "loss": 5.4238, + "loss/crossentropy": 2.3267141580581665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22954250872135162, + "step": 9500 + }, + { + "epoch": 0.7918333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.07040608723958333, + "learning_rate": 3.009970774830639e-05, + "loss": 4.6428, + "loss/crossentropy": 1.8814620971679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21085096895694733, + "step": 9502 + }, + { + "epoch": 0.792, + "grad_norm": 4.5, + "grad_norm_var": 0.073046875, + "learning_rate": 3.0076465528348825e-05, + "loss": 4.9866, + "loss/crossentropy": 1.8643994852900505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17112858220934868, + "step": 9504 + }, + { + "epoch": 0.7921666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.08059488932291667, + "learning_rate": 3.0053208315316608e-05, + "loss": 4.9273, + "loss/crossentropy": 1.931690700352192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17383356019854546, + "step": 9506 + }, + { + "epoch": 0.7923333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.070556640625, + "learning_rate": 3.0029936166594606e-05, + "loss": 5.0846, + "loss/crossentropy": 1.6738494783639908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17133232951164246, + "step": 9508 + }, + { + "epoch": 0.7925, + "grad_norm": 4.78125, + "grad_norm_var": 0.06183268229166667, + "learning_rate": 3.0006649139604537e-05, + "loss": 4.6146, + "loss/crossentropy": 1.1198093742132187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1320702861994505, + "step": 9510 + }, + { + "epoch": 0.7926666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.06119384765625, + "learning_rate": 2.9983347291804805e-05, + "loss": 4.7334, + "loss/crossentropy": 2.1339576840400696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965150125324726, + "step": 9512 + }, + { + "epoch": 0.7928333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.04986572265625, + "learning_rate": 2.996003068069043e-05, + "loss": 4.9968, + "loss/crossentropy": 1.8556120991706848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21033604815602303, + "step": 9514 + }, + { + "epoch": 0.793, + "grad_norm": 4.5625, + "grad_norm_var": 0.04166259765625, + "learning_rate": 2.9936699363792816e-05, + "loss": 5.1152, + "loss/crossentropy": 2.0661367923021317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21299026906490326, + "step": 9516 + }, + { + "epoch": 0.7931666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.04459228515625, + "learning_rate": 2.991335339867968e-05, + "loss": 4.6905, + "loss/crossentropy": 1.0509056076407433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1323660109192133, + "step": 9518 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.04654541015625, + "learning_rate": 2.9889992842954858e-05, + "loss": 5.3345, + "loss/crossentropy": 2.2113268077373505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20679356902837753, + "step": 9520 + }, + { + "epoch": 0.7935, + "grad_norm": 4.9375, + "grad_norm_var": 0.04537760416666667, + "learning_rate": 2.9866617754258197e-05, + "loss": 4.7611, + "loss/crossentropy": 1.4634685143828392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1467580944299698, + "step": 9522 + }, + { + "epoch": 0.7936666666666666, + "grad_norm": 5.625, + "grad_norm_var": 0.08841145833333333, + "learning_rate": 2.984322819026541e-05, + "loss": 5.2701, + "loss/crossentropy": 2.333310306072235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280527651309967, + "step": 9524 + }, + { + "epoch": 0.7938333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.09016520182291667, + "learning_rate": 2.981982420868792e-05, + "loss": 5.1634, + "loss/crossentropy": 1.7275255471467972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19213207438588142, + "step": 9526 + }, + { + "epoch": 0.794, + "grad_norm": 4.78125, + "grad_norm_var": 0.09178059895833333, + "learning_rate": 2.979640586727274e-05, + "loss": 5.0111, + "loss/crossentropy": 1.9508731663227081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20765764266252518, + "step": 9528 + }, + { + "epoch": 0.7941666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.08651936848958333, + "learning_rate": 2.977297322380227e-05, + "loss": 4.674, + "loss/crossentropy": 2.0042631030082703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18659953027963638, + "step": 9530 + }, + { + "epoch": 0.7943333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.08294270833333334, + "learning_rate": 2.9749526336094255e-05, + "loss": 4.7152, + "loss/crossentropy": 1.7674919664859772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17473849654197693, + "step": 9532 + }, + { + "epoch": 0.7945, + "grad_norm": 4.4375, + "grad_norm_var": 0.08474934895833333, + "learning_rate": 2.9726065262001545e-05, + "loss": 4.6997, + "loss/crossentropy": 2.370339721441269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148333080112934, + "step": 9534 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 5.1875, + "grad_norm_var": 0.09361572265625, + "learning_rate": 2.970259005941201e-05, + "loss": 5.1521, + "loss/crossentropy": 1.9504710137844086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21056831628084183, + "step": 9536 + }, + { + "epoch": 0.7948333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.08626302083333333, + "learning_rate": 2.967910078624839e-05, + "loss": 4.9986, + "loss/crossentropy": 1.970159761607647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18404271081089973, + "step": 9538 + }, + { + "epoch": 0.795, + "grad_norm": 4.78125, + "grad_norm_var": 0.040425618489583336, + "learning_rate": 2.9655597500468122e-05, + "loss": 5.1379, + "loss/crossentropy": 1.9936151206493378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17894721776247025, + "step": 9540 + }, + { + "epoch": 0.7951666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.04761962890625, + "learning_rate": 2.9632080260063224e-05, + "loss": 4.9476, + "loss/crossentropy": 1.8085922375321388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799444779753685, + "step": 9542 + }, + { + "epoch": 0.7953333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.04332275390625, + "learning_rate": 2.9608549123060145e-05, + "loss": 5.2912, + "loss/crossentropy": 1.8493381887674332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1686496790498495, + "step": 9544 + }, + { + "epoch": 0.7955, + "grad_norm": 4.625, + "grad_norm_var": 0.040087890625, + "learning_rate": 2.9585004147519644e-05, + "loss": 5.4614, + "loss/crossentropy": 2.317984402179718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21581026539206505, + "step": 9546 + }, + { + "epoch": 0.7956666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.04117431640625, + "learning_rate": 2.95614453915366e-05, + "loss": 4.834, + "loss/crossentropy": 1.8879902809858322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18581601977348328, + "step": 9548 + }, + { + "epoch": 0.7958333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.03463134765625, + "learning_rate": 2.9537872913239892e-05, + "loss": 4.4235, + "loss/crossentropy": 1.325361706316471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13260122202336788, + "step": 9550 + }, + { + "epoch": 0.796, + "grad_norm": 4.6875, + "grad_norm_var": 0.024995930989583335, + "learning_rate": 2.9514286770792275e-05, + "loss": 5.1549, + "loss/crossentropy": 1.5360118001699448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1876349337399006, + "step": 9552 + }, + { + "epoch": 0.7961666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.026302083333333334, + "learning_rate": 2.9490687022390215e-05, + "loss": 4.6383, + "loss/crossentropy": 1.5563682615756989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847225520759821, + "step": 9554 + }, + { + "epoch": 0.7963333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.03883056640625, + "learning_rate": 2.9467073726263736e-05, + "loss": 4.6207, + "loss/crossentropy": 1.750094898045063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1583553459495306, + "step": 9556 + }, + { + "epoch": 0.7965, + "grad_norm": 4.75, + "grad_norm_var": 0.030367024739583335, + "learning_rate": 2.9443446940676305e-05, + "loss": 4.4711, + "loss/crossentropy": 2.0352243930101395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18048042058944702, + "step": 9558 + }, + { + "epoch": 0.7966666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.028515625, + "learning_rate": 2.9419806723924673e-05, + "loss": 4.8025, + "loss/crossentropy": 2.5280230045318604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519477494060993, + "step": 9560 + }, + { + "epoch": 0.7968333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.042041015625, + "learning_rate": 2.93961531343387e-05, + "loss": 4.5649, + "loss/crossentropy": 2.3008410036563873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132852002978325, + "step": 9562 + }, + { + "epoch": 0.797, + "grad_norm": 4.71875, + "grad_norm_var": 0.14338785807291668, + "learning_rate": 2.937248623028129e-05, + "loss": 4.7427, + "loss/crossentropy": 2.0998832881450653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082512266933918, + "step": 9564 + }, + { + "epoch": 0.7971666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.14140625, + "learning_rate": 2.9348806070148178e-05, + "loss": 5.0877, + "loss/crossentropy": 2.1150071918964386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22998831048607826, + "step": 9566 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.1619140625, + "learning_rate": 2.9325112712367788e-05, + "loss": 4.4993, + "loss/crossentropy": 0.7897375747561455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13665344007313251, + "step": 9568 + }, + { + "epoch": 0.7975, + "grad_norm": 4.53125, + "grad_norm_var": 0.15969645182291667, + "learning_rate": 2.9301406215401136e-05, + "loss": 5.0347, + "loss/crossentropy": 2.2986485958099365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23015353456139565, + "step": 9570 + }, + { + "epoch": 0.7976666666666666, + "grad_norm": 5.03125, + "grad_norm_var": 0.15565999348958334, + "learning_rate": 2.927768663774165e-05, + "loss": 5.3172, + "loss/crossentropy": 1.9723598957061768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20267291367053986, + "step": 9572 + }, + { + "epoch": 0.7978333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.16689046223958334, + "learning_rate": 2.9253954037915016e-05, + "loss": 4.6553, + "loss/crossentropy": 2.072106420993805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16975676827132702, + "step": 9574 + }, + { + "epoch": 0.798, + "grad_norm": 4.4375, + "grad_norm_var": 0.17294514973958333, + "learning_rate": 2.9230208474479077e-05, + "loss": 4.7887, + "loss/crossentropy": 2.2334997951984406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652622565627098, + "step": 9576 + }, + { + "epoch": 0.7981666666666667, + "grad_norm": 5.03125, + "grad_norm_var": 0.16925455729166666, + "learning_rate": 2.920645000602366e-05, + "loss": 5.0195, + "loss/crossentropy": 1.7784138470888138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887425109744072, + "step": 9578 + }, + { + "epoch": 0.7983333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.0705078125, + "learning_rate": 2.9182678691170392e-05, + "loss": 4.4802, + "loss/crossentropy": 1.1156157106161118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.121920857578516, + "step": 9580 + }, + { + "epoch": 0.7985, + "grad_norm": 5.09375, + "grad_norm_var": 0.07708333333333334, + "learning_rate": 2.915889458857266e-05, + "loss": 4.9223, + "loss/crossentropy": 1.6830340400338173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1527923159301281, + "step": 9582 + }, + { + "epoch": 0.7986666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.07379150390625, + "learning_rate": 2.9135097756915357e-05, + "loss": 5.501, + "loss/crossentropy": 2.1039809063076973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1852792724967003, + "step": 9584 + }, + { + "epoch": 0.7988333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.07128499348958334, + "learning_rate": 2.9111288254914803e-05, + "loss": 4.6736, + "loss/crossentropy": 2.2788360714912415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24036888033151627, + "step": 9586 + }, + { + "epoch": 0.799, + "grad_norm": 4.9375, + "grad_norm_var": 0.06496988932291667, + "learning_rate": 2.9087466141318573e-05, + "loss": 4.8876, + "loss/crossentropy": 2.326200306415558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151721641421318, + "step": 9588 + }, + { + "epoch": 0.7991666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.052079264322916666, + "learning_rate": 2.9063631474905382e-05, + "loss": 4.9736, + "loss/crossentropy": 2.0066977441310883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977986916899681, + "step": 9590 + }, + { + "epoch": 0.7993333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.046675618489583334, + "learning_rate": 2.9039784314484884e-05, + "loss": 5.3869, + "loss/crossentropy": 2.163942277431488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1966785341501236, + "step": 9592 + }, + { + "epoch": 0.7995, + "grad_norm": 4.46875, + "grad_norm_var": 0.046468098958333336, + "learning_rate": 2.9015924718897577e-05, + "loss": 5.0935, + "loss/crossentropy": 2.0210544764995575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1696550790220499, + "step": 9594 + }, + { + "epoch": 0.7996666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.045166015625, + "learning_rate": 2.8992052747014648e-05, + "loss": 4.3569, + "loss/crossentropy": 0.8158632516860962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10923364758491516, + "step": 9596 + }, + { + "epoch": 0.7998333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.034228515625, + "learning_rate": 2.8968168457737805e-05, + "loss": 4.3597, + "loss/crossentropy": 2.0650247782468796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20714912563562393, + "step": 9598 + }, + { + "epoch": 0.8, + "grad_norm": 4.4375, + "grad_norm_var": 0.027278645833333334, + "learning_rate": 2.894427190999916e-05, + "loss": 4.945, + "loss/crossentropy": 2.01848566532135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932220160961151, + "step": 9600 + }, + { + "epoch": 0.8001666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.034505208333333336, + "learning_rate": 2.8920363162761078e-05, + "loss": 4.6152, + "loss/crossentropy": 2.331765651702881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20866511762142181, + "step": 9602 + }, + { + "epoch": 0.8003333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 0.04485270182291667, + "learning_rate": 2.8896442275016014e-05, + "loss": 4.9153, + "loss/crossentropy": 2.022328555583954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19683456048369408, + "step": 9604 + }, + { + "epoch": 0.8005, + "grad_norm": 4.46875, + "grad_norm_var": 0.04869791666666667, + "learning_rate": 2.8872509305786375e-05, + "loss": 5.2109, + "loss/crossentropy": 2.3472258746623993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208266019821167, + "step": 9606 + }, + { + "epoch": 0.8006666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.05182291666666667, + "learning_rate": 2.8848564314124386e-05, + "loss": 4.0321, + "loss/crossentropy": 1.353347197175026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18437976483255625, + "step": 9608 + }, + { + "epoch": 0.8008333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.047135416666666666, + "learning_rate": 2.8824607359111935e-05, + "loss": 4.8684, + "loss/crossentropy": 1.3156828135252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15243668109178543, + "step": 9610 + }, + { + "epoch": 0.801, + "grad_norm": 4.75, + "grad_norm_var": 0.04537353515625, + "learning_rate": 2.8800638499860425e-05, + "loss": 4.6918, + "loss/crossentropy": 1.940132163465023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18036840856075287, + "step": 9612 + }, + { + "epoch": 0.8011666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.04763997395833333, + "learning_rate": 2.8776657795510634e-05, + "loss": 5.0228, + "loss/crossentropy": 1.8538916110992432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1759151928126812, + "step": 9614 + }, + { + "epoch": 0.8013333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.04478759765625, + "learning_rate": 2.8752665305232565e-05, + "loss": 4.695, + "loss/crossentropy": 1.3918126970529556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13803145475685596, + "step": 9616 + }, + { + "epoch": 0.8015, + "grad_norm": 4.59375, + "grad_norm_var": 0.03713785807291667, + "learning_rate": 2.87286610882253e-05, + "loss": 4.7662, + "loss/crossentropy": 2.2778570353984833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131534069776535, + "step": 9618 + }, + { + "epoch": 0.8016666666666666, + "grad_norm": 5.9375, + "grad_norm_var": 0.11991780598958333, + "learning_rate": 2.8704645203716864e-05, + "loss": 5.0824, + "loss/crossentropy": 2.300149440765381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163809835910797, + "step": 9620 + }, + { + "epoch": 0.8018333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.11591389973958334, + "learning_rate": 2.8680617710964064e-05, + "loss": 4.2839, + "loss/crossentropy": 2.0871397852897644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19335642829537392, + "step": 9622 + }, + { + "epoch": 0.802, + "grad_norm": 4.84375, + "grad_norm_var": 0.10818684895833333, + "learning_rate": 2.8656578669252355e-05, + "loss": 5.2061, + "loss/crossentropy": 2.6689072847366333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22146183252334595, + "step": 9624 + }, + { + "epoch": 0.8021666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.10741780598958334, + "learning_rate": 2.8632528137895677e-05, + "loss": 5.1304, + "loss/crossentropy": 1.9833775535225868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18096048012375832, + "step": 9626 + }, + { + "epoch": 0.8023333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.10771077473958333, + "learning_rate": 2.860846617623631e-05, + "loss": 4.7472, + "loss/crossentropy": 2.048772692680359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1738439705222845, + "step": 9628 + }, + { + "epoch": 0.8025, + "grad_norm": 4.59375, + "grad_norm_var": 0.11308186848958333, + "learning_rate": 2.8584392843644777e-05, + "loss": 4.5055, + "loss/crossentropy": 1.6863243579864502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1545063853263855, + "step": 9630 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.12261962890625, + "learning_rate": 2.856030819951962e-05, + "loss": 5.2981, + "loss/crossentropy": 2.2145843654870987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180853221565485, + "step": 9632 + }, + { + "epoch": 0.8028333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.15, + "learning_rate": 2.853621230328732e-05, + "loss": 4.8301, + "loss/crossentropy": 2.0582179874181747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821475587785244, + "step": 9634 + }, + { + "epoch": 0.803, + "grad_norm": 4.5, + "grad_norm_var": 0.068603515625, + "learning_rate": 2.851210521440208e-05, + "loss": 4.9309, + "loss/crossentropy": 2.4601719677448273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22456849366426468, + "step": 9636 + }, + { + "epoch": 0.8031666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.06881103515625, + "learning_rate": 2.8487986992345756e-05, + "loss": 5.1272, + "loss/crossentropy": 2.1401634514331818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21643871068954468, + "step": 9638 + }, + { + "epoch": 0.8033333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.07799072265625, + "learning_rate": 2.846385769662767e-05, + "loss": 4.8438, + "loss/crossentropy": 1.3249876573681831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18818671256303787, + "step": 9640 + }, + { + "epoch": 0.8035, + "grad_norm": 4.9375, + "grad_norm_var": 0.079931640625, + "learning_rate": 2.8439717386784464e-05, + "loss": 4.5864, + "loss/crossentropy": 2.387733817100525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22228409722447395, + "step": 9642 + }, + { + "epoch": 0.8036666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.08235270182291667, + "learning_rate": 2.8415566122379937e-05, + "loss": 4.8169, + "loss/crossentropy": 1.6120276674628258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1609376147389412, + "step": 9644 + }, + { + "epoch": 0.8038333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.08834228515625, + "learning_rate": 2.8391403963004943e-05, + "loss": 4.539, + "loss/crossentropy": 2.0225760638713837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24541480466723442, + "step": 9646 + }, + { + "epoch": 0.804, + "grad_norm": 4.71875, + "grad_norm_var": 0.07415364583333334, + "learning_rate": 2.8367230968277213e-05, + "loss": 4.6075, + "loss/crossentropy": 2.2183853089809418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087656781077385, + "step": 9648 + }, + { + "epoch": 0.8041666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.05618082682291667, + "learning_rate": 2.8343047197841192e-05, + "loss": 4.7195, + "loss/crossentropy": 2.461825728416443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21937000378966331, + "step": 9650 + }, + { + "epoch": 0.8043333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.05480143229166667, + "learning_rate": 2.831885271136795e-05, + "loss": 4.9463, + "loss/crossentropy": 2.300680994987488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21464381739497185, + "step": 9652 + }, + { + "epoch": 0.8045, + "grad_norm": 5.0, + "grad_norm_var": 0.07053629557291667, + "learning_rate": 2.8294647568554956e-05, + "loss": 4.4777, + "loss/crossentropy": 1.5217168852686882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1655261069536209, + "step": 9654 + }, + { + "epoch": 0.8046666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.061197916666666664, + "learning_rate": 2.8270431829126015e-05, + "loss": 4.5535, + "loss/crossentropy": 1.9004539996385574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1946029346436262, + "step": 9656 + }, + { + "epoch": 0.8048333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.05618489583333333, + "learning_rate": 2.8246205552831047e-05, + "loss": 4.8719, + "loss/crossentropy": 2.249520570039749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344881922006607, + "step": 9658 + }, + { + "epoch": 0.805, + "grad_norm": 4.8125, + "grad_norm_var": 0.05618489583333333, + "learning_rate": 2.8221968799445973e-05, + "loss": 4.6141, + "loss/crossentropy": 1.7338961511850357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16334066353738308, + "step": 9660 + }, + { + "epoch": 0.8051666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.05126546223958333, + "learning_rate": 2.819772162877258e-05, + "loss": 5.2458, + "loss/crossentropy": 2.602845251560211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20655466988682747, + "step": 9662 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.058837890625, + "learning_rate": 2.817346410063835e-05, + "loss": 5.3631, + "loss/crossentropy": 1.9564568027853966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18752934224903584, + "step": 9664 + }, + { + "epoch": 0.8055, + "grad_norm": 4.53125, + "grad_norm_var": 0.054541015625, + "learning_rate": 2.8149196274896334e-05, + "loss": 4.7131, + "loss/crossentropy": 2.6057686805725098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19163187593221664, + "step": 9666 + }, + { + "epoch": 0.8056666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.07291666666666667, + "learning_rate": 2.812491821142496e-05, + "loss": 4.4306, + "loss/crossentropy": 2.0900171995162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22258975729346275, + "step": 9668 + }, + { + "epoch": 0.8058333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.07522379557291667, + "learning_rate": 2.8100629970127955e-05, + "loss": 5.1028, + "loss/crossentropy": 1.5734562277793884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16964438371360302, + "step": 9670 + }, + { + "epoch": 0.806, + "grad_norm": 4.8125, + "grad_norm_var": 0.06819254557291667, + "learning_rate": 2.8076331610934117e-05, + "loss": 4.7141, + "loss/crossentropy": 0.9716506451368332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1379980333149433, + "step": 9672 + }, + { + "epoch": 0.8061666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.06724853515625, + "learning_rate": 2.805202319379725e-05, + "loss": 5.4769, + "loss/crossentropy": 2.2032998502254486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792401000857353, + "step": 9674 + }, + { + "epoch": 0.8063333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.06024983723958333, + "learning_rate": 2.8027704778695962e-05, + "loss": 4.949, + "loss/crossentropy": 2.4872482419013977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23049870505928993, + "step": 9676 + }, + { + "epoch": 0.8065, + "grad_norm": 4.90625, + "grad_norm_var": 0.062093098958333336, + "learning_rate": 2.80033764256335e-05, + "loss": 5.4824, + "loss/crossentropy": 2.6353384852409363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21061846613883972, + "step": 9678 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.06451416015625, + "learning_rate": 2.7979038194637683e-05, + "loss": 5.5415, + "loss/crossentropy": 2.14662966132164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17899227887392044, + "step": 9680 + }, + { + "epoch": 0.8068333333333333, + "grad_norm": 4.21875, + "grad_norm_var": 0.084228515625, + "learning_rate": 2.7954690145760656e-05, + "loss": 4.889, + "loss/crossentropy": 2.0768280178308487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17490868642926216, + "step": 9682 + }, + { + "epoch": 0.807, + "grad_norm": 4.75, + "grad_norm_var": 0.060139973958333336, + "learning_rate": 2.793033233907883e-05, + "loss": 4.6971, + "loss/crossentropy": 2.2648986876010895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21894178539514542, + "step": 9684 + }, + { + "epoch": 0.8071666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.04556884765625, + "learning_rate": 2.7905964834692648e-05, + "loss": 4.091, + "loss/crossentropy": 1.7198558524250984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17033100128173828, + "step": 9686 + }, + { + "epoch": 0.8073333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.044905598958333334, + "learning_rate": 2.788158769272652e-05, + "loss": 4.7955, + "loss/crossentropy": 1.8393462970852852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787256933748722, + "step": 9688 + }, + { + "epoch": 0.8075, + "grad_norm": 4.9375, + "grad_norm_var": 0.05115559895833333, + "learning_rate": 2.7857200973328624e-05, + "loss": 4.9282, + "loss/crossentropy": 1.9202441275119781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960473172366619, + "step": 9690 + }, + { + "epoch": 0.8076666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.05513916015625, + "learning_rate": 2.7832804736670754e-05, + "loss": 4.404, + "loss/crossentropy": 2.503723382949829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644175052642822, + "step": 9692 + }, + { + "epoch": 0.8078333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.049540201822916664, + "learning_rate": 2.78083990429482e-05, + "loss": 5.0705, + "loss/crossentropy": 2.5052966475486755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255629561841488, + "step": 9694 + }, + { + "epoch": 0.808, + "grad_norm": 4.71875, + "grad_norm_var": 0.03518473307291667, + "learning_rate": 2.77839839523796e-05, + "loss": 5.1541, + "loss/crossentropy": 2.4121972918510437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21121644973754883, + "step": 9696 + }, + { + "epoch": 0.8081666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.028999837239583333, + "learning_rate": 2.775955952520675e-05, + "loss": 5.3142, + "loss/crossentropy": 2.036761313676834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21132348477840424, + "step": 9698 + }, + { + "epoch": 0.8083333333333333, + "grad_norm": 5.125, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 2.7735125821694492e-05, + "loss": 5.3845, + "loss/crossentropy": 2.2135225534439087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23048130422830582, + "step": 9700 + }, + { + "epoch": 0.8085, + "grad_norm": 4.96875, + "grad_norm_var": 0.04491780598958333, + "learning_rate": 2.771068290213057e-05, + "loss": 5.3062, + "loss/crossentropy": 2.3597964346408844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20864511281251907, + "step": 9702 + }, + { + "epoch": 0.8086666666666666, + "grad_norm": 4.25, + "grad_norm_var": 0.06451416015625, + "learning_rate": 2.7686230826825453e-05, + "loss": 4.4362, + "loss/crossentropy": 2.1759497225284576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25393587350845337, + "step": 9704 + }, + { + "epoch": 0.8088333333333333, + "grad_norm": 5.15625, + "grad_norm_var": 0.0767578125, + "learning_rate": 2.766176965611221e-05, + "loss": 4.7853, + "loss/crossentropy": 1.5988976433873177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22598283365368843, + "step": 9706 + }, + { + "epoch": 0.809, + "grad_norm": 4.875, + "grad_norm_var": 0.07576497395833333, + "learning_rate": 2.7637299450346345e-05, + "loss": 5.5758, + "loss/crossentropy": 2.635635018348694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24151397868990898, + "step": 9708 + }, + { + "epoch": 0.8091666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.08878580729166667, + "learning_rate": 2.7612820269905665e-05, + "loss": 4.2942, + "loss/crossentropy": 1.248511366546154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17816242016851902, + "step": 9710 + }, + { + "epoch": 0.8093333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.09659830729166667, + "learning_rate": 2.7588332175190102e-05, + "loss": 5.3882, + "loss/crossentropy": 2.1789558827877045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20762834697961807, + "step": 9712 + }, + { + "epoch": 0.8095, + "grad_norm": 4.625, + "grad_norm_var": 0.08642171223958334, + "learning_rate": 2.7563835226621606e-05, + "loss": 4.8216, + "loss/crossentropy": 2.2960515320301056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117706909775734, + "step": 9714 + }, + { + "epoch": 0.8096666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.07844645182291667, + "learning_rate": 2.753932948464396e-05, + "loss": 4.8336, + "loss/crossentropy": 2.016137406229973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17403754591941833, + "step": 9716 + }, + { + "epoch": 0.8098333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.078515625, + "learning_rate": 2.751481500972264e-05, + "loss": 4.9171, + "loss/crossentropy": 1.544787235558033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772163286805153, + "step": 9718 + }, + { + "epoch": 0.81, + "grad_norm": 4.53125, + "grad_norm_var": 0.06425374348958333, + "learning_rate": 2.7490291862344686e-05, + "loss": 5.0292, + "loss/crossentropy": 1.6385806947946548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17973128333687782, + "step": 9720 + }, + { + "epoch": 0.8101666666666667, + "grad_norm": 5.75, + "grad_norm_var": 0.11888020833333333, + "learning_rate": 2.7465760103018516e-05, + "loss": 4.8005, + "loss/crossentropy": 2.2076993584632874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21309059485793114, + "step": 9722 + }, + { + "epoch": 0.8103333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.125390625, + "learning_rate": 2.744121979227382e-05, + "loss": 4.6816, + "loss/crossentropy": 1.827271208167076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17865308560431004, + "step": 9724 + }, + { + "epoch": 0.8105, + "grad_norm": 4.3125, + "grad_norm_var": 0.1271484375, + "learning_rate": 2.7416670990661365e-05, + "loss": 4.8622, + "loss/crossentropy": 2.055354692041874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18429454416036606, + "step": 9726 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 5.125, + "grad_norm_var": 1.784228515625, + "learning_rate": 2.739211375875288e-05, + "loss": 5.4435, + "loss/crossentropy": 2.217236667871475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077849544584751, + "step": 9728 + }, + { + "epoch": 0.8108333333333333, + "grad_norm": 5.21875, + "grad_norm_var": 1.763134765625, + "learning_rate": 2.7367548157140888e-05, + "loss": 4.8811, + "loss/crossentropy": 2.5398449301719666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2191455401480198, + "step": 9730 + }, + { + "epoch": 0.811, + "grad_norm": 4.28125, + "grad_norm_var": 1.8050130208333333, + "learning_rate": 2.7342974246438586e-05, + "loss": 4.1932, + "loss/crossentropy": 1.9677127003669739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19635247439146042, + "step": 9732 + }, + { + "epoch": 0.8111666666666667, + "grad_norm": 4.875, + "grad_norm_var": 1.7762654622395833, + "learning_rate": 2.7318392087279648e-05, + "loss": 4.9551, + "loss/crossentropy": 2.4844985604286194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22329407185316086, + "step": 9734 + }, + { + "epoch": 0.8113333333333334, + "grad_norm": 4.375, + "grad_norm_var": 1.811181640625, + "learning_rate": 2.7293801740318104e-05, + "loss": 4.2019, + "loss/crossentropy": 0.9313739463686943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12556781060993671, + "step": 9736 + }, + { + "epoch": 0.8115, + "grad_norm": 4.5625, + "grad_norm_var": 1.80064697265625, + "learning_rate": 2.7269203266228196e-05, + "loss": 4.8225, + "loss/crossentropy": 2.24322646856308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20698126405477524, + "step": 9738 + }, + { + "epoch": 0.8116666666666666, + "grad_norm": 4.125, + "grad_norm_var": 1.8283162434895834, + "learning_rate": 2.7244596725704204e-05, + "loss": 4.3012, + "loss/crossentropy": 1.607184648513794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15676794946193695, + "step": 9740 + }, + { + "epoch": 0.8118333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 1.79058837890625, + "learning_rate": 2.7219982179460333e-05, + "loss": 4.9994, + "loss/crossentropy": 1.8960078209638596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17588616535067558, + "step": 9742 + }, + { + "epoch": 0.812, + "grad_norm": 5.0625, + "grad_norm_var": 0.08196614583333334, + "learning_rate": 2.7195359688230514e-05, + "loss": 4.9943, + "loss/crossentropy": 2.059798449277878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20114361122250557, + "step": 9744 + }, + { + "epoch": 0.8121666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.07496337890625, + "learning_rate": 2.717072931276832e-05, + "loss": 5.2328, + "loss/crossentropy": 2.3908294439315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179257869720459, + "step": 9746 + }, + { + "epoch": 0.8123333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.06873372395833334, + "learning_rate": 2.7146091113846723e-05, + "loss": 4.8158, + "loss/crossentropy": 2.3867982923984528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20485028997063637, + "step": 9748 + }, + { + "epoch": 0.8125, + "grad_norm": 4.375, + "grad_norm_var": 0.0720703125, + "learning_rate": 2.7121445152258056e-05, + "loss": 4.4072, + "loss/crossentropy": 1.6835525631904602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599596947431564, + "step": 9750 + }, + { + "epoch": 0.8126666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.07486979166666667, + "learning_rate": 2.7096791488813772e-05, + "loss": 5.2649, + "loss/crossentropy": 2.514313280582428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239019237458706, + "step": 9752 + }, + { + "epoch": 0.8128333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.07277018229166667, + "learning_rate": 2.7072130184344324e-05, + "loss": 4.9717, + "loss/crossentropy": 2.6661786437034607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23028026148676872, + "step": 9754 + }, + { + "epoch": 0.813, + "grad_norm": 4.84375, + "grad_norm_var": 0.04407552083333333, + "learning_rate": 2.7047461299699045e-05, + "loss": 5.3989, + "loss/crossentropy": 2.0026678144931793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17631691321730614, + "step": 9756 + }, + { + "epoch": 0.8131666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.04778645833333333, + "learning_rate": 2.7022784895745942e-05, + "loss": 5.3907, + "loss/crossentropy": 1.366507887840271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16259120404720306, + "step": 9758 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.04959309895833333, + "learning_rate": 2.6998101033371598e-05, + "loss": 4.5622, + "loss/crossentropy": 1.3968349806964397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14244722109287977, + "step": 9760 + }, + { + "epoch": 0.8135, + "grad_norm": 4.4375, + "grad_norm_var": 0.06288655598958333, + "learning_rate": 2.6973409773480983e-05, + "loss": 4.5042, + "loss/crossentropy": 1.9490249156951904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17202415689826012, + "step": 9762 + }, + { + "epoch": 0.8136666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.063525390625, + "learning_rate": 2.6948711176997338e-05, + "loss": 4.5769, + "loss/crossentropy": 2.4208853244781494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22516579553484917, + "step": 9764 + }, + { + "epoch": 0.8138333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.05347900390625, + "learning_rate": 2.6924005304861976e-05, + "loss": 4.3336, + "loss/crossentropy": 1.5947562903165817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1803539153188467, + "step": 9766 + }, + { + "epoch": 0.814, + "grad_norm": 5.15625, + "grad_norm_var": 0.0922515869140625, + "learning_rate": 2.6899292218034202e-05, + "loss": 4.4487, + "loss/crossentropy": 1.780159056186676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1578317228704691, + "step": 9768 + }, + { + "epoch": 0.8141666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.09517313639322916, + "learning_rate": 2.6874571977491087e-05, + "loss": 4.5388, + "loss/crossentropy": 1.6249744519591331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15210830606520176, + "step": 9770 + }, + { + "epoch": 0.8143333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.0975738525390625, + "learning_rate": 2.684984464422736e-05, + "loss": 4.7648, + "loss/crossentropy": 1.5476508736610413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978946216404438, + "step": 9772 + }, + { + "epoch": 0.8145, + "grad_norm": 4.59375, + "grad_norm_var": 0.08953348795572917, + "learning_rate": 2.6825110279255286e-05, + "loss": 5.0773, + "loss/crossentropy": 1.603047177195549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18620534241199493, + "step": 9774 + }, + { + "epoch": 0.8146666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.08882548014322916, + "learning_rate": 2.680036894360442e-05, + "loss": 4.8267, + "loss/crossentropy": 2.0404116213321686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23134134337306023, + "step": 9776 + }, + { + "epoch": 0.8148333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.0740142822265625, + "learning_rate": 2.6775620698321568e-05, + "loss": 5.0537, + "loss/crossentropy": 0.9913991242647171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11973785981535912, + "step": 9778 + }, + { + "epoch": 0.815, + "grad_norm": 4.5625, + "grad_norm_var": 0.1052642822265625, + "learning_rate": 2.6750865604470554e-05, + "loss": 4.9444, + "loss/crossentropy": 1.849538080394268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18608948774635792, + "step": 9780 + }, + { + "epoch": 0.8151666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.11380106608072917, + "learning_rate": 2.6726103723132122e-05, + "loss": 4.6016, + "loss/crossentropy": 1.9843580573797226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17255659773945808, + "step": 9782 + }, + { + "epoch": 0.8153333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.07148030598958334, + "learning_rate": 2.6701335115403747e-05, + "loss": 4.8852, + "loss/crossentropy": 1.60471910238266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17869190871715546, + "step": 9784 + }, + { + "epoch": 0.8155, + "grad_norm": 4.4375, + "grad_norm_var": 0.08479410807291667, + "learning_rate": 2.66765598423995e-05, + "loss": 4.1572, + "loss/crossentropy": 2.1039493903517723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869774367660284, + "step": 9786 + }, + { + "epoch": 0.8156666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.08381754557291667, + "learning_rate": 2.665177796524992e-05, + "loss": 5.0702, + "loss/crossentropy": 2.3638014793395996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606038138270378, + "step": 9788 + }, + { + "epoch": 0.8158333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.08101806640625, + "learning_rate": 2.662698954510181e-05, + "loss": 5.1611, + "loss/crossentropy": 2.450950562953949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21199432387948036, + "step": 9790 + }, + { + "epoch": 0.816, + "grad_norm": 4.46875, + "grad_norm_var": 0.08368733723958334, + "learning_rate": 2.6602194643118142e-05, + "loss": 4.5387, + "loss/crossentropy": 2.1611936390399933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382005743682384, + "step": 9792 + }, + { + "epoch": 0.8161666666666667, + "grad_norm": 5.25, + "grad_norm_var": 0.10491129557291666, + "learning_rate": 2.6577393320477868e-05, + "loss": 5.4112, + "loss/crossentropy": 2.7818479537963867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20983505249023438, + "step": 9794 + }, + { + "epoch": 0.8163333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.06769205729166666, + "learning_rate": 2.6552585638375786e-05, + "loss": 5.1181, + "loss/crossentropy": 2.173560857772827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22058776766061783, + "step": 9796 + }, + { + "epoch": 0.8165, + "grad_norm": 4.59375, + "grad_norm_var": 0.05987955729166667, + "learning_rate": 2.6527771658022384e-05, + "loss": 4.9415, + "loss/crossentropy": 1.7440677136182785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19470586255192757, + "step": 9798 + }, + { + "epoch": 0.8166666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.05279947916666667, + "learning_rate": 2.65029514406437e-05, + "loss": 4.4766, + "loss/crossentropy": 2.262479215860367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19687332212924957, + "step": 9800 + }, + { + "epoch": 0.8168333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.06916910807291667, + "learning_rate": 2.6478125047481138e-05, + "loss": 5.0711, + "loss/crossentropy": 2.4867063760757446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21951580047607422, + "step": 9802 + }, + { + "epoch": 0.817, + "grad_norm": 4.78125, + "grad_norm_var": 0.058919270833333336, + "learning_rate": 2.6453292539791374e-05, + "loss": 4.5929, + "loss/crossentropy": 2.3370174169540405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19681474193930626, + "step": 9804 + }, + { + "epoch": 0.8171666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.07511393229166667, + "learning_rate": 2.642845397884614e-05, + "loss": 5.0763, + "loss/crossentropy": 1.8368133306503296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001226581633091, + "step": 9806 + }, + { + "epoch": 0.8173333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.07662760416666667, + "learning_rate": 2.640360942593212e-05, + "loss": 4.9234, + "loss/crossentropy": 1.2672517523169518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805922258645296, + "step": 9808 + }, + { + "epoch": 0.8175, + "grad_norm": 4.65625, + "grad_norm_var": 0.06122639973958333, + "learning_rate": 2.6378758942350775e-05, + "loss": 4.7927, + "loss/crossentropy": 1.7979520708322525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17544420808553696, + "step": 9810 + }, + { + "epoch": 0.8176666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.05725504557291667, + "learning_rate": 2.6353902589418204e-05, + "loss": 4.6291, + "loss/crossentropy": 1.7286670580506325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17648831382393837, + "step": 9812 + }, + { + "epoch": 0.8178333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.05862223307291667, + "learning_rate": 2.632904042846499e-05, + "loss": 4.9676, + "loss/crossentropy": 2.0067897960543633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861281618475914, + "step": 9814 + }, + { + "epoch": 0.818, + "grad_norm": 4.4375, + "grad_norm_var": 0.064697265625, + "learning_rate": 2.6304172520836034e-05, + "loss": 5.1672, + "loss/crossentropy": 2.495649516582489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20421000942587852, + "step": 9816 + }, + { + "epoch": 0.8181666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.041796875, + "learning_rate": 2.6279298927890447e-05, + "loss": 5.1169, + "loss/crossentropy": 2.244947165250778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18983267061412334, + "step": 9818 + }, + { + "epoch": 0.8183333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.04247639973958333, + "learning_rate": 2.6254419711001325e-05, + "loss": 5.1038, + "loss/crossentropy": 2.5352413654327393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20742696896195412, + "step": 9820 + }, + { + "epoch": 0.8185, + "grad_norm": 5.03125, + "grad_norm_var": 0.03839518229166667, + "learning_rate": 2.6229534931555675e-05, + "loss": 4.776, + "loss/crossentropy": 2.2929417490959167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830854937434196, + "step": 9822 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.04303385416666667, + "learning_rate": 2.6204644650954212e-05, + "loss": 4.0816, + "loss/crossentropy": 1.6211243867874146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405551463365555, + "step": 9824 + }, + { + "epoch": 0.8188333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.0560546875, + "learning_rate": 2.6179748930611227e-05, + "loss": 4.984, + "loss/crossentropy": 2.5118577778339386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24132562428712845, + "step": 9826 + }, + { + "epoch": 0.819, + "grad_norm": 4.84375, + "grad_norm_var": 0.07498372395833333, + "learning_rate": 2.615484783195444e-05, + "loss": 4.371, + "loss/crossentropy": 1.8359337151050568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16906186938285828, + "step": 9828 + }, + { + "epoch": 0.8191666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.08030192057291667, + "learning_rate": 2.6129941416424844e-05, + "loss": 4.7897, + "loss/crossentropy": 2.5312774777412415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21105752140283585, + "step": 9830 + }, + { + "epoch": 0.8193333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.08176676432291667, + "learning_rate": 2.6105029745476524e-05, + "loss": 4.8841, + "loss/crossentropy": 1.6842858046293259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15886010602116585, + "step": 9832 + }, + { + "epoch": 0.8195, + "grad_norm": 4.8125, + "grad_norm_var": 0.08046875, + "learning_rate": 2.6080112880576564e-05, + "loss": 5.2355, + "loss/crossentropy": 2.119531899690628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937863491475582, + "step": 9834 + }, + { + "epoch": 0.8196666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.08267822265625, + "learning_rate": 2.605519088320485e-05, + "loss": 5.2572, + "loss/crossentropy": 2.3377262353897095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22751981765031815, + "step": 9836 + }, + { + "epoch": 0.8198333333333333, + "grad_norm": 5.25, + "grad_norm_var": 0.097119140625, + "learning_rate": 2.6030263814853928e-05, + "loss": 4.5545, + "loss/crossentropy": 2.4934067130088806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20816605538129807, + "step": 9838 + }, + { + "epoch": 0.82, + "grad_norm": 4.625, + "grad_norm_var": 0.0767578125, + "learning_rate": 2.6005331737028875e-05, + "loss": 4.9717, + "loss/crossentropy": 2.5231724977493286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19992580264806747, + "step": 9840 + }, + { + "epoch": 0.8201666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.06790364583333333, + "learning_rate": 2.598039471124709e-05, + "loss": 4.7991, + "loss/crossentropy": 1.669595293700695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16963953524827957, + "step": 9842 + }, + { + "epoch": 0.8203333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.049605305989583334, + "learning_rate": 2.5955452799038235e-05, + "loss": 5.3925, + "loss/crossentropy": 2.155703604221344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19724318757653236, + "step": 9844 + }, + { + "epoch": 0.8205, + "grad_norm": 4.375, + "grad_norm_var": 0.04657796223958333, + "learning_rate": 2.593050606194398e-05, + "loss": 4.3041, + "loss/crossentropy": 2.2015575766563416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218138948082924, + "step": 9846 + }, + { + "epoch": 0.8206666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.059488932291666664, + "learning_rate": 2.5905554561517923e-05, + "loss": 5.0173, + "loss/crossentropy": 1.5508754402399063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1607704758644104, + "step": 9848 + }, + { + "epoch": 0.8208333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.07763264973958334, + "learning_rate": 2.5880598359325405e-05, + "loss": 4.7229, + "loss/crossentropy": 1.8121439665555954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22899498790502548, + "step": 9850 + }, + { + "epoch": 0.821, + "grad_norm": 4.6875, + "grad_norm_var": 0.07708333333333334, + "learning_rate": 2.5855637516943386e-05, + "loss": 4.6838, + "loss/crossentropy": 1.7467404007911682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17679844796657562, + "step": 9852 + }, + { + "epoch": 0.8211666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.06638997395833333, + "learning_rate": 2.5830672095960258e-05, + "loss": 4.9116, + "loss/crossentropy": 2.3431586623191833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21176928654313087, + "step": 9854 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.06470947265625, + "learning_rate": 2.580570215797571e-05, + "loss": 4.7261, + "loss/crossentropy": 1.483575701713562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21149089373648167, + "step": 9856 + }, + { + "epoch": 0.8215, + "grad_norm": 4.59375, + "grad_norm_var": 0.06311442057291666, + "learning_rate": 2.5780727764600588e-05, + "loss": 4.9978, + "loss/crossentropy": 1.9482173770666122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.173264279961586, + "step": 9858 + }, + { + "epoch": 0.8216666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.07541910807291667, + "learning_rate": 2.5755748977456722e-05, + "loss": 4.2775, + "loss/crossentropy": 1.2856212258338928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16561558470129967, + "step": 9860 + }, + { + "epoch": 0.8218333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.08508707682291666, + "learning_rate": 2.57307658581768e-05, + "loss": 4.6755, + "loss/crossentropy": 1.4765914678573608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14335180632770061, + "step": 9862 + }, + { + "epoch": 0.822, + "grad_norm": 4.8125, + "grad_norm_var": 0.06695556640625, + "learning_rate": 2.5705778468404158e-05, + "loss": 5.0093, + "loss/crossentropy": 2.4915146827697754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20742217451334, + "step": 9864 + }, + { + "epoch": 0.8221666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.05487874348958333, + "learning_rate": 2.568078686979272e-05, + "loss": 4.3963, + "loss/crossentropy": 2.16377916932106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22294186055660248, + "step": 9866 + }, + { + "epoch": 0.8223333333333334, + "grad_norm": 5.1875, + "grad_norm_var": 0.07623697916666666, + "learning_rate": 2.565579112400676e-05, + "loss": 5.5977, + "loss/crossentropy": 2.2194560170173645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2272995077073574, + "step": 9868 + }, + { + "epoch": 0.8225, + "grad_norm": 4.40625, + "grad_norm_var": 0.07727864583333334, + "learning_rate": 2.5630791292720804e-05, + "loss": 4.823, + "loss/crossentropy": 1.572212889790535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896784622222185, + "step": 9870 + }, + { + "epoch": 0.8226666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.07727457682291666, + "learning_rate": 2.5605787437619443e-05, + "loss": 4.8361, + "loss/crossentropy": 1.8452747240662575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20604299381375313, + "step": 9872 + }, + { + "epoch": 0.8228333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.07838134765625, + "learning_rate": 2.55807796203972e-05, + "loss": 4.599, + "loss/crossentropy": 1.5376994386315346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13793596252799034, + "step": 9874 + }, + { + "epoch": 0.823, + "grad_norm": 5.375, + "grad_norm_var": 0.105712890625, + "learning_rate": 2.5555767902758398e-05, + "loss": 5.1798, + "loss/crossentropy": 1.973397634923458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19167595729231834, + "step": 9876 + }, + { + "epoch": 0.8231666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08648681640625, + "learning_rate": 2.5530752346416934e-05, + "loss": 4.8992, + "loss/crossentropy": 1.2014049515128136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13308897987008095, + "step": 9878 + }, + { + "epoch": 0.8233333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.0916015625, + "learning_rate": 2.5505733013096236e-05, + "loss": 4.9195, + "loss/crossentropy": 2.278904974460602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010067068040371, + "step": 9880 + }, + { + "epoch": 0.8235, + "grad_norm": 4.75, + "grad_norm_var": 0.06940104166666666, + "learning_rate": 2.5480709964529e-05, + "loss": 5.2708, + "loss/crossentropy": 2.5370509028434753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21254604309797287, + "step": 9882 + }, + { + "epoch": 0.8236666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.060139973958333336, + "learning_rate": 2.5455683262457127e-05, + "loss": 4.6252, + "loss/crossentropy": 1.7036000490188599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1745048500597477, + "step": 9884 + }, + { + "epoch": 0.8238333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.05310872395833333, + "learning_rate": 2.54306529686315e-05, + "loss": 5.043, + "loss/crossentropy": 2.425258755683899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19860392063856125, + "step": 9886 + }, + { + "epoch": 0.824, + "grad_norm": 4.40625, + "grad_norm_var": 0.05771077473958333, + "learning_rate": 2.54056191448119e-05, + "loss": 5.3246, + "loss/crossentropy": 2.094254046678543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20764774084091187, + "step": 9888 + }, + { + "epoch": 0.8241666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.05481770833333333, + "learning_rate": 2.538058185276678e-05, + "loss": 5.0141, + "loss/crossentropy": 2.3725812435150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19575026631355286, + "step": 9890 + }, + { + "epoch": 0.8243333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.030712890625, + "learning_rate": 2.535554115427318e-05, + "loss": 5.0221, + "loss/crossentropy": 2.5913625955581665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21525685489177704, + "step": 9892 + }, + { + "epoch": 0.8245, + "grad_norm": 4.71875, + "grad_norm_var": 0.04927978515625, + "learning_rate": 2.5330497111116536e-05, + "loss": 4.4302, + "loss/crossentropy": 2.011984132230282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17140764743089676, + "step": 9894 + }, + { + "epoch": 0.8246666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 2.5305449785090526e-05, + "loss": 4.5671, + "loss/crossentropy": 2.1562889516353607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18358014523983002, + "step": 9896 + }, + { + "epoch": 0.8248333333333333, + "grad_norm": 3.875, + "grad_norm_var": 0.07486979166666667, + "learning_rate": 2.5280399237996946e-05, + "loss": 4.2635, + "loss/crossentropy": 1.3045726418495178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14434479922056198, + "step": 9898 + }, + { + "epoch": 0.825, + "grad_norm": 4.40625, + "grad_norm_var": 0.07786051432291667, + "learning_rate": 2.525534553164552e-05, + "loss": 4.1657, + "loss/crossentropy": 1.8639636635780334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1716986782848835, + "step": 9900 + }, + { + "epoch": 0.8251666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.07552083333333333, + "learning_rate": 2.5230288727853794e-05, + "loss": 4.9954, + "loss/crossentropy": 2.545925259590149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545745261013508, + "step": 9902 + }, + { + "epoch": 0.8253333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.0828125, + "learning_rate": 2.520522888844693e-05, + "loss": 5.1413, + "loss/crossentropy": 2.4325710237026215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21113605424761772, + "step": 9904 + }, + { + "epoch": 0.8255, + "grad_norm": 5.0625, + "grad_norm_var": 0.109228515625, + "learning_rate": 2.518016607525759e-05, + "loss": 5.6142, + "loss/crossentropy": 2.188798248767853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2361520528793335, + "step": 9906 + }, + { + "epoch": 0.8256666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.10181884765625, + "learning_rate": 2.5155100350125777e-05, + "loss": 4.8668, + "loss/crossentropy": 2.0001536905765533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016005963087082, + "step": 9908 + }, + { + "epoch": 0.8258333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.08548177083333333, + "learning_rate": 2.513003177489867e-05, + "loss": 4.6742, + "loss/crossentropy": 1.218208484351635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1346103437244892, + "step": 9910 + }, + { + "epoch": 0.826, + "grad_norm": 4.4375, + "grad_norm_var": 0.09514567057291666, + "learning_rate": 2.5104960411430498e-05, + "loss": 4.0595, + "loss/crossentropy": 1.4826791658997536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15879861637949944, + "step": 9912 + }, + { + "epoch": 0.8261666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.06496988932291667, + "learning_rate": 2.507988632158235e-05, + "loss": 4.3835, + "loss/crossentropy": 1.2572619915008545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14165034517645836, + "step": 9914 + }, + { + "epoch": 0.8263333333333334, + "grad_norm": 4.21875, + "grad_norm_var": 0.09234619140625, + "learning_rate": 2.505480956722205e-05, + "loss": 4.9511, + "loss/crossentropy": 1.2375790998339653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.134348314255476, + "step": 9916 + }, + { + "epoch": 0.8265, + "grad_norm": 3.890625, + "grad_norm_var": 0.13383687337239583, + "learning_rate": 2.5029730210224e-05, + "loss": 4.1284, + "loss/crossentropy": 2.0290369763970375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17147668451070786, + "step": 9918 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.14077860514322918, + "learning_rate": 2.5004648312469017e-05, + "loss": 4.6698, + "loss/crossentropy": 1.3962792977690697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14946845173835754, + "step": 9920 + }, + { + "epoch": 0.8268333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.14575093587239582, + "learning_rate": 2.4979563935844192e-05, + "loss": 4.7106, + "loss/crossentropy": 2.4267687797546387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070532701909542, + "step": 9922 + }, + { + "epoch": 0.827, + "grad_norm": 4.9375, + "grad_norm_var": 0.15075581868489582, + "learning_rate": 2.4954477142242738e-05, + "loss": 5.7162, + "loss/crossentropy": 2.0101495683193207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21650001779198647, + "step": 9924 + }, + { + "epoch": 0.8271666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.15583394368489584, + "learning_rate": 2.492938799356381e-05, + "loss": 4.6152, + "loss/crossentropy": 2.140102058649063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18941906467080116, + "step": 9926 + }, + { + "epoch": 0.8273333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.13820699055989583, + "learning_rate": 2.49042965517124e-05, + "loss": 4.2472, + "loss/crossentropy": 2.1069682240486145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918292548507452, + "step": 9928 + }, + { + "epoch": 0.8275, + "grad_norm": 5.03125, + "grad_norm_var": 0.1607818603515625, + "learning_rate": 2.4879202878599137e-05, + "loss": 4.9035, + "loss/crossentropy": 2.698134481906891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24367520585656166, + "step": 9930 + }, + { + "epoch": 0.8276666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.1223541259765625, + "learning_rate": 2.485410703614017e-05, + "loss": 4.5958, + "loss/crossentropy": 2.365455448627472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22331998497247696, + "step": 9932 + }, + { + "epoch": 0.8278333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.08313395182291666, + "learning_rate": 2.4829009086257e-05, + "loss": 5.3997, + "loss/crossentropy": 2.537370502948761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775676891207695, + "step": 9934 + }, + { + "epoch": 0.828, + "grad_norm": 4.84375, + "grad_norm_var": 0.07506510416666666, + "learning_rate": 2.4803909090876318e-05, + "loss": 5.3511, + "loss/crossentropy": 2.4864864349365234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117943949997425, + "step": 9936 + }, + { + "epoch": 0.8281666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.09146728515625, + "learning_rate": 2.4778807111929868e-05, + "loss": 5.305, + "loss/crossentropy": 2.207614630460739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929798237979412, + "step": 9938 + }, + { + "epoch": 0.8283333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.09073893229166667, + "learning_rate": 2.4753703211354285e-05, + "loss": 5.2632, + "loss/crossentropy": 1.7467438951134682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1611561868339777, + "step": 9940 + }, + { + "epoch": 0.8285, + "grad_norm": 4.53125, + "grad_norm_var": 0.08084309895833333, + "learning_rate": 2.472859745109096e-05, + "loss": 5.0141, + "loss/crossentropy": 2.3705212473869324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19435375928878784, + "step": 9942 + }, + { + "epoch": 0.8286666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.06728108723958333, + "learning_rate": 2.4703489893085842e-05, + "loss": 4.592, + "loss/crossentropy": 1.6773193031549454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19796937331557274, + "step": 9944 + }, + { + "epoch": 0.8288333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.040999348958333334, + "learning_rate": 2.4678380599289352e-05, + "loss": 5.1897, + "loss/crossentropy": 2.335031569004059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20903323218226433, + "step": 9946 + }, + { + "epoch": 0.829, + "grad_norm": 4.5, + "grad_norm_var": 0.04599202473958333, + "learning_rate": 2.4653269631656164e-05, + "loss": 4.9803, + "loss/crossentropy": 1.9900788962841034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772918961942196, + "step": 9948 + }, + { + "epoch": 0.8291666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.06456705729166666, + "learning_rate": 2.46281570521451e-05, + "loss": 4.5932, + "loss/crossentropy": 1.3242772594094276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14987998828291893, + "step": 9950 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.0826171875, + "learning_rate": 2.4603042922718956e-05, + "loss": 5.2458, + "loss/crossentropy": 2.7143561840057373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748181268572807, + "step": 9952 + }, + { + "epoch": 0.8295, + "grad_norm": 4.625, + "grad_norm_var": 0.04381103515625, + "learning_rate": 2.4577927305344343e-05, + "loss": 4.8155, + "loss/crossentropy": 2.2978959679603577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21883852034807205, + "step": 9954 + }, + { + "epoch": 0.8296666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.03828125, + "learning_rate": 2.4552810261991564e-05, + "loss": 4.653, + "loss/crossentropy": 2.2232907116413116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2165231816470623, + "step": 9956 + }, + { + "epoch": 0.8298333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.04869791666666667, + "learning_rate": 2.4527691854634405e-05, + "loss": 5.1506, + "loss/crossentropy": 1.845031201839447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16416467167437077, + "step": 9958 + }, + { + "epoch": 0.83, + "grad_norm": 4.3125, + "grad_norm_var": 0.055013020833333336, + "learning_rate": 2.4502572145250055e-05, + "loss": 4.6119, + "loss/crossentropy": 1.3500414192676544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15559666231274605, + "step": 9960 + }, + { + "epoch": 0.8301666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.05870768229166667, + "learning_rate": 2.4477451195818896e-05, + "loss": 5.065, + "loss/crossentropy": 1.7964719235897064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1803671047091484, + "step": 9962 + }, + { + "epoch": 0.8303333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.06027018229166667, + "learning_rate": 2.4452329068324377e-05, + "loss": 5.0383, + "loss/crossentropy": 2.097862370312214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18102774024009705, + "step": 9964 + }, + { + "epoch": 0.8305, + "grad_norm": 4.625, + "grad_norm_var": 0.04924723307291667, + "learning_rate": 2.4427205824752846e-05, + "loss": 5.3962, + "loss/crossentropy": 2.1204554736614227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536362007260323, + "step": 9966 + }, + { + "epoch": 0.8306666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.036421712239583334, + "learning_rate": 2.4402081527093407e-05, + "loss": 4.4037, + "loss/crossentropy": 1.9657414183020592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005226742476225, + "step": 9968 + }, + { + "epoch": 0.8308333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.0361328125, + "learning_rate": 2.4376956237337765e-05, + "loss": 5.0019, + "loss/crossentropy": 2.4795849323272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20067550241947174, + "step": 9970 + }, + { + "epoch": 0.831, + "grad_norm": 4.5625, + "grad_norm_var": 0.035416666666666666, + "learning_rate": 2.4351830017480085e-05, + "loss": 4.7479, + "loss/crossentropy": 1.7610122039914131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1636796798557043, + "step": 9972 + }, + { + "epoch": 0.8311666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.02574462890625, + "learning_rate": 2.4326702929516813e-05, + "loss": 5.3112, + "loss/crossentropy": 2.6836007833480835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438656002283096, + "step": 9974 + }, + { + "epoch": 0.8313333333333334, + "grad_norm": 4.28125, + "grad_norm_var": 0.029427083333333333, + "learning_rate": 2.4301575035446536e-05, + "loss": 4.3757, + "loss/crossentropy": 1.2276001051068306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13492096588015556, + "step": 9976 + }, + { + "epoch": 0.8315, + "grad_norm": 4.4375, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 2.4276446397269836e-05, + "loss": 4.5018, + "loss/crossentropy": 1.9868988022208214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16871058754622936, + "step": 9978 + }, + { + "epoch": 0.8316666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.025419108072916665, + "learning_rate": 2.4251317076989134e-05, + "loss": 4.4441, + "loss/crossentropy": 1.9331146478652954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816711686551571, + "step": 9980 + }, + { + "epoch": 0.8318333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.026236979166666667, + "learning_rate": 2.422618713660853e-05, + "loss": 4.791, + "loss/crossentropy": 2.5316545963287354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427104085683823, + "step": 9982 + }, + { + "epoch": 0.832, + "grad_norm": 4.5, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 2.4201056638133647e-05, + "loss": 4.9682, + "loss/crossentropy": 1.7062507718801498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16023046895861626, + "step": 9984 + }, + { + "epoch": 0.8321666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 2.4175925643571495e-05, + "loss": 5.2089, + "loss/crossentropy": 2.221466898918152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19790564477443695, + "step": 9986 + }, + { + "epoch": 0.8323333333333334, + "grad_norm": 4.3125, + "grad_norm_var": 0.03511962890625, + "learning_rate": 2.4150794214930314e-05, + "loss": 5.0092, + "loss/crossentropy": 2.4697205424308777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727880507707596, + "step": 9988 + }, + { + "epoch": 0.8325, + "grad_norm": 4.71875, + "grad_norm_var": 0.03404947916666667, + "learning_rate": 2.4125662414219387e-05, + "loss": 4.8074, + "loss/crossentropy": 1.8578790351748466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1584503035992384, + "step": 9990 + }, + { + "epoch": 0.8326666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.024674479166666666, + "learning_rate": 2.4100530303448946e-05, + "loss": 4.1005, + "loss/crossentropy": 2.0569470822811127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2027450054883957, + "step": 9992 + }, + { + "epoch": 0.8328333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.026416015625, + "learning_rate": 2.4075397944629976e-05, + "loss": 4.3906, + "loss/crossentropy": 1.9774124771356583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17903640680015087, + "step": 9994 + }, + { + "epoch": 0.833, + "grad_norm": 4.28125, + "grad_norm_var": 0.04400634765625, + "learning_rate": 2.4050265399774072e-05, + "loss": 4.7254, + "loss/crossentropy": 1.9097615107893944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1601880006492138, + "step": 9996 + }, + { + "epoch": 0.8331666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.04309488932291667, + "learning_rate": 2.4025132730893298e-05, + "loss": 5.4207, + "loss/crossentropy": 2.268464207649231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21927351877093315, + "step": 9998 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.045182291666666666, + "learning_rate": 2.4000000000000004e-05, + "loss": 4.8914, + "loss/crossentropy": 2.4459827542304993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19904977455735207, + "step": 10000 + }, + { + "epoch": 0.8335, + "grad_norm": 5.8125, + "grad_norm_var": 0.13088785807291667, + "learning_rate": 2.397486726910671e-05, + "loss": 5.2496, + "loss/crossentropy": 2.393665611743927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21754077449440956, + "step": 10002 + }, + { + "epoch": 0.8336666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.12320556640625, + "learning_rate": 2.394973460022593e-05, + "loss": 4.242, + "loss/crossentropy": 1.3170356079936028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14232172816991806, + "step": 10004 + }, + { + "epoch": 0.8338333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.125244140625, + "learning_rate": 2.392460205537003e-05, + "loss": 4.5769, + "loss/crossentropy": 1.8860590308904648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884509939700365, + "step": 10006 + }, + { + "epoch": 0.834, + "grad_norm": 4.53125, + "grad_norm_var": 0.12538655598958334, + "learning_rate": 2.3899469696551058e-05, + "loss": 5.1368, + "loss/crossentropy": 2.2285009026527405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22375357151031494, + "step": 10008 + }, + { + "epoch": 0.8341666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.12401936848958334, + "learning_rate": 2.3874337585780624e-05, + "loss": 4.5004, + "loss/crossentropy": 1.4645071625709534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14867208898067474, + "step": 10010 + }, + { + "epoch": 0.8343333333333334, + "grad_norm": 5.15625, + "grad_norm_var": 0.11417643229166667, + "learning_rate": 2.3849205785069698e-05, + "loss": 5.1683, + "loss/crossentropy": 2.0563749074935913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22186800837516785, + "step": 10012 + }, + { + "epoch": 0.8345, + "grad_norm": 4.4375, + "grad_norm_var": 0.11985270182291667, + "learning_rate": 2.3824074356428513e-05, + "loss": 4.7704, + "loss/crossentropy": 1.532589927315712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1682576658204198, + "step": 10014 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.134375, + "learning_rate": 2.379894336186636e-05, + "loss": 4.8072, + "loss/crossentropy": 1.8966995403170586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17377831041812897, + "step": 10016 + }, + { + "epoch": 0.8348333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.06295166015625, + "learning_rate": 2.3773812863391483e-05, + "loss": 4.8095, + "loss/crossentropy": 2.5535982847213745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20435542613267899, + "step": 10018 + }, + { + "epoch": 0.835, + "grad_norm": 4.25, + "grad_norm_var": 0.07265625, + "learning_rate": 2.3748682923010877e-05, + "loss": 4.0512, + "loss/crossentropy": 1.9317252039909363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969212256371975, + "step": 10020 + }, + { + "epoch": 0.8351666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.07431233723958333, + "learning_rate": 2.3723553602730176e-05, + "loss": 4.6774, + "loss/crossentropy": 2.494425058364868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20477550849318504, + "step": 10022 + }, + { + "epoch": 0.8353333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.07909749348958334, + "learning_rate": 2.3698424964553475e-05, + "loss": 5.1466, + "loss/crossentropy": 2.3530495166778564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201245229691267, + "step": 10024 + }, + { + "epoch": 0.8355, + "grad_norm": 4.8125, + "grad_norm_var": 0.061258951822916664, + "learning_rate": 2.3673297070483198e-05, + "loss": 5.5255, + "loss/crossentropy": 2.239256501197815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2388983592391014, + "step": 10026 + }, + { + "epoch": 0.8356666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.038916015625, + "learning_rate": 2.3648169982519923e-05, + "loss": 5.1636, + "loss/crossentropy": 1.9721302911639214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19041937962174416, + "step": 10028 + }, + { + "epoch": 0.8358333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.03964436848958333, + "learning_rate": 2.3623043762662247e-05, + "loss": 5.3836, + "loss/crossentropy": 2.394495278596878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.254102174192667, + "step": 10030 + }, + { + "epoch": 0.836, + "grad_norm": 4.625, + "grad_norm_var": 0.059305826822916664, + "learning_rate": 2.3597918472906605e-05, + "loss": 4.5989, + "loss/crossentropy": 2.5070912837982178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160685546696186, + "step": 10032 + }, + { + "epoch": 0.8361666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.05276285807291667, + "learning_rate": 2.3572794175247165e-05, + "loss": 5.0263, + "loss/crossentropy": 2.2234988510608673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21419087797403336, + "step": 10034 + }, + { + "epoch": 0.8363333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 2.3547670931675635e-05, + "loss": 4.6435, + "loss/crossentropy": 1.7220133692026138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973039098083973, + "step": 10036 + }, + { + "epoch": 0.8365, + "grad_norm": 4.46875, + "grad_norm_var": 0.0587890625, + "learning_rate": 2.3522548804181112e-05, + "loss": 4.8134, + "loss/crossentropy": 1.4040338397026062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1425228714942932, + "step": 10038 + }, + { + "epoch": 0.8366666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.058056640625, + "learning_rate": 2.349742785474995e-05, + "loss": 5.0962, + "loss/crossentropy": 2.302014708518982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808390691876411, + "step": 10040 + }, + { + "epoch": 0.8368333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.07574462890625, + "learning_rate": 2.3472308145365603e-05, + "loss": 4.7411, + "loss/crossentropy": 2.5134881734848022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2047807015478611, + "step": 10042 + }, + { + "epoch": 0.837, + "grad_norm": 4.34375, + "grad_norm_var": 0.08153889973958334, + "learning_rate": 2.3447189738008448e-05, + "loss": 4.4727, + "loss/crossentropy": 1.8269146978855133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2027146816253662, + "step": 10044 + }, + { + "epoch": 0.8371666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.08370768229166667, + "learning_rate": 2.3422072694655668e-05, + "loss": 4.5877, + "loss/crossentropy": 1.7809841856360435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944657415151596, + "step": 10046 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.057840983072916664, + "learning_rate": 2.3396957077281045e-05, + "loss": 4.9396, + "loss/crossentropy": 2.4442911744117737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2242444083094597, + "step": 10048 + }, + { + "epoch": 0.8375, + "grad_norm": 5.03125, + "grad_norm_var": 0.07003580729166667, + "learning_rate": 2.337184294785491e-05, + "loss": 5.2301, + "loss/crossentropy": 1.9728785753250122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22261008620262146, + "step": 10050 + }, + { + "epoch": 0.8376666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.061747233072916664, + "learning_rate": 2.334673036834384e-05, + "loss": 4.4604, + "loss/crossentropy": 1.158107079565525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1579295713454485, + "step": 10052 + }, + { + "epoch": 0.8378333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.05162760416666667, + "learning_rate": 2.3321619400710656e-05, + "loss": 4.4305, + "loss/crossentropy": 1.447898805141449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16212822496891022, + "step": 10054 + }, + { + "epoch": 0.838, + "grad_norm": 4.46875, + "grad_norm_var": 0.03839518229166667, + "learning_rate": 2.329651010691417e-05, + "loss": 4.9035, + "loss/crossentropy": 2.616535484790802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21434472128748894, + "step": 10056 + }, + { + "epoch": 0.8381666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.03218994140625, + "learning_rate": 2.3271402548909054e-05, + "loss": 4.8799, + "loss/crossentropy": 1.4755475595593452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1441640816628933, + "step": 10058 + }, + { + "epoch": 0.8383333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.025809733072916667, + "learning_rate": 2.324629678864572e-05, + "loss": 5.3515, + "loss/crossentropy": 2.7159085869789124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20213079079985619, + "step": 10060 + }, + { + "epoch": 0.8385, + "grad_norm": 4.40625, + "grad_norm_var": 0.030973307291666665, + "learning_rate": 2.3221192888070144e-05, + "loss": 4.4702, + "loss/crossentropy": 2.061556816101074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886168085038662, + "step": 10062 + }, + { + "epoch": 0.8386666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.033707682291666666, + "learning_rate": 2.319609090912369e-05, + "loss": 4.614, + "loss/crossentropy": 1.3036933615803719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14548559859395027, + "step": 10064 + }, + { + "epoch": 0.8388333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.025846354166666665, + "learning_rate": 2.317099091374301e-05, + "loss": 4.9908, + "loss/crossentropy": 1.1547905504703522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16585730761289597, + "step": 10066 + }, + { + "epoch": 0.839, + "grad_norm": 5.40625, + "grad_norm_var": 0.07138264973958333, + "learning_rate": 2.3145892963859834e-05, + "loss": 5.044, + "loss/crossentropy": 1.499816857278347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1658524088561535, + "step": 10068 + }, + { + "epoch": 0.8391666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.07818603515625, + "learning_rate": 2.3120797121400874e-05, + "loss": 4.9807, + "loss/crossentropy": 2.348451852798462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19513800367712975, + "step": 10070 + }, + { + "epoch": 0.8393333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.078369140625, + "learning_rate": 2.309570344828761e-05, + "loss": 4.866, + "loss/crossentropy": 2.185619443655014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21363262459635735, + "step": 10072 + }, + { + "epoch": 0.8395, + "grad_norm": 4.21875, + "grad_norm_var": 0.218603515625, + "learning_rate": 2.3070612006436202e-05, + "loss": 4.7391, + "loss/crossentropy": 2.4252246618270874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19878173992037773, + "step": 10074 + }, + { + "epoch": 0.8396666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.24859619140625, + "learning_rate": 2.304552285775727e-05, + "loss": 5.27, + "loss/crossentropy": 2.406407594680786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21240625903010368, + "step": 10076 + }, + { + "epoch": 0.8398333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.22805989583333333, + "learning_rate": 2.3020436064155813e-05, + "loss": 4.9108, + "loss/crossentropy": 1.5165601968765259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16400650143623352, + "step": 10078 + }, + { + "epoch": 0.84, + "grad_norm": 4.71875, + "grad_norm_var": 0.22141520182291666, + "learning_rate": 2.2995351687530988e-05, + "loss": 4.7594, + "loss/crossentropy": 1.1999849155545235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13244791328907013, + "step": 10080 + }, + { + "epoch": 0.8401666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.23917643229166666, + "learning_rate": 2.297026978977601e-05, + "loss": 4.7325, + "loss/crossentropy": 2.0893143713474274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20484750717878342, + "step": 10082 + }, + { + "epoch": 0.8403333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.21301676432291666, + "learning_rate": 2.294519043277796e-05, + "loss": 5.2237, + "loss/crossentropy": 2.0791936218738556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17905857414007187, + "step": 10084 + }, + { + "epoch": 0.8405, + "grad_norm": 4.375, + "grad_norm_var": 0.21646728515625, + "learning_rate": 2.2920113678417666e-05, + "loss": 4.2325, + "loss/crossentropy": 1.9750349968671799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17366132140159607, + "step": 10086 + }, + { + "epoch": 0.8406666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.22083333333333333, + "learning_rate": 2.289503958856951e-05, + "loss": 5.316, + "loss/crossentropy": 2.199047952890396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029794305562973, + "step": 10088 + }, + { + "epoch": 0.8408333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.08069254557291666, + "learning_rate": 2.2869968225101342e-05, + "loss": 4.9191, + "loss/crossentropy": 1.9835616052150726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18883539736270905, + "step": 10090 + }, + { + "epoch": 0.841, + "grad_norm": 4.5625, + "grad_norm_var": 0.04021809895833333, + "learning_rate": 2.2844899649874234e-05, + "loss": 4.5576, + "loss/crossentropy": 0.9617295414209366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14817167818546295, + "step": 10092 + }, + { + "epoch": 0.8411666666666666, + "grad_norm": 5.21875, + "grad_norm_var": 0.06594645182291667, + "learning_rate": 2.281983392474242e-05, + "loss": 5.2977, + "loss/crossentropy": 2.2602842450141907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18473618105053902, + "step": 10094 + }, + { + "epoch": 0.8413333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.0697265625, + "learning_rate": 2.2794771111553082e-05, + "loss": 4.7655, + "loss/crossentropy": 1.6119416430592537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1529501359909773, + "step": 10096 + }, + { + "epoch": 0.8415, + "grad_norm": 4.46875, + "grad_norm_var": 0.060347493489583334, + "learning_rate": 2.2769711272146217e-05, + "loss": 4.9663, + "loss/crossentropy": 1.7058739140629768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1737150400876999, + "step": 10098 + }, + { + "epoch": 0.8416666666666667, + "grad_norm": 5.3125, + "grad_norm_var": 0.08670247395833333, + "learning_rate": 2.2744654468354485e-05, + "loss": 5.3056, + "loss/crossentropy": 2.452378123998642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19069649651646614, + "step": 10100 + }, + { + "epoch": 0.8418333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.08199462890625, + "learning_rate": 2.2719600762003066e-05, + "loss": 5.1376, + "loss/crossentropy": 2.5809699296951294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19522631168365479, + "step": 10102 + }, + { + "epoch": 0.842, + "grad_norm": 4.71875, + "grad_norm_var": 0.083984375, + "learning_rate": 2.2694550214909485e-05, + "loss": 5.2027, + "loss/crossentropy": 1.7924980521202087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17402834072709084, + "step": 10104 + }, + { + "epoch": 0.8421666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.08214518229166666, + "learning_rate": 2.2669502888883476e-05, + "loss": 5.0221, + "loss/crossentropy": 2.1637089550495148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585686340928078, + "step": 10106 + }, + { + "epoch": 0.8423333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.07693684895833333, + "learning_rate": 2.264445884572683e-05, + "loss": 5.1873, + "loss/crossentropy": 2.483547031879425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202965080738068, + "step": 10108 + }, + { + "epoch": 0.8425, + "grad_norm": 4.4375, + "grad_norm_var": 0.07967122395833333, + "learning_rate": 2.261941814723323e-05, + "loss": 4.2006, + "loss/crossentropy": 1.1384671851992607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.128371087834239, + "step": 10110 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.073291015625, + "learning_rate": 2.2594380855188113e-05, + "loss": 4.9687, + "loss/crossentropy": 2.5191361010074615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2324436865746975, + "step": 10112 + }, + { + "epoch": 0.8428333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.072509765625, + "learning_rate": 2.2569347031368506e-05, + "loss": 4.51, + "loss/crossentropy": 2.028899200260639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18104288540780544, + "step": 10114 + }, + { + "epoch": 0.843, + "grad_norm": 4.78125, + "grad_norm_var": 0.04250895182291667, + "learning_rate": 2.2544316737542884e-05, + "loss": 5.0327, + "loss/crossentropy": 1.9726526737213135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18508725985884666, + "step": 10116 + }, + { + "epoch": 0.8431666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.03573811848958333, + "learning_rate": 2.251929003547101e-05, + "loss": 4.9454, + "loss/crossentropy": 2.0947405397892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19356149435043335, + "step": 10118 + }, + { + "epoch": 0.8433333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 2.2494266986903775e-05, + "loss": 4.5932, + "loss/crossentropy": 1.8272379711270332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1713714934885502, + "step": 10120 + }, + { + "epoch": 0.8435, + "grad_norm": 4.875, + "grad_norm_var": 0.042317708333333336, + "learning_rate": 2.2469247653583074e-05, + "loss": 5.0663, + "loss/crossentropy": 2.5811198949813843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20255010202527046, + "step": 10122 + }, + { + "epoch": 0.8436666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.04529622395833333, + "learning_rate": 2.244423209724161e-05, + "loss": 5.0194, + "loss/crossentropy": 2.0795632749795914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1663488782942295, + "step": 10124 + }, + { + "epoch": 0.8438333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 2.2419220379602808e-05, + "loss": 5.2162, + "loss/crossentropy": 2.4612520933151245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20894955843687057, + "step": 10126 + }, + { + "epoch": 0.844, + "grad_norm": 4.3125, + "grad_norm_var": 0.04950764973958333, + "learning_rate": 2.239421256238056e-05, + "loss": 4.9813, + "loss/crossentropy": 2.0469383597373962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180439043790102, + "step": 10128 + }, + { + "epoch": 0.8441666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.06415608723958334, + "learning_rate": 2.2369208707279207e-05, + "loss": 5.0879, + "loss/crossentropy": 2.3578098118305206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20639550685882568, + "step": 10130 + }, + { + "epoch": 0.8443333333333334, + "grad_norm": 4.34375, + "grad_norm_var": 0.06575113932291667, + "learning_rate": 2.234420887599324e-05, + "loss": 4.605, + "loss/crossentropy": 1.8820787519216537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16899964958429337, + "step": 10132 + }, + { + "epoch": 0.8445, + "grad_norm": 5.125, + "grad_norm_var": 0.080322265625, + "learning_rate": 2.2319213130207284e-05, + "loss": 5.2779, + "loss/crossentropy": 2.2708524763584137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19943390414118767, + "step": 10134 + }, + { + "epoch": 0.8446666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.0740234375, + "learning_rate": 2.2294221531595843e-05, + "loss": 4.7132, + "loss/crossentropy": 2.0893598422408104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17067280784249306, + "step": 10136 + }, + { + "epoch": 0.8448333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.08592122395833333, + "learning_rate": 2.226923414182321e-05, + "loss": 4.5546, + "loss/crossentropy": 0.9215280041098595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10505077801644802, + "step": 10138 + }, + { + "epoch": 0.845, + "grad_norm": 4.21875, + "grad_norm_var": 0.08592122395833333, + "learning_rate": 2.224425102254328e-05, + "loss": 4.1395, + "loss/crossentropy": 2.439699411392212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093421071767807, + "step": 10140 + }, + { + "epoch": 0.8451666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.55406494140625, + "learning_rate": 2.2219272235399417e-05, + "loss": 4.4381, + "loss/crossentropy": 1.3956974297761917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1445833072066307, + "step": 10142 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 4.34375, + "grad_norm_var": 0.53970947265625, + "learning_rate": 2.2194297842024293e-05, + "loss": 5.2216, + "loss/crossentropy": 2.28021776676178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18985600024461746, + "step": 10144 + }, + { + "epoch": 0.8455, + "grad_norm": 4.625, + "grad_norm_var": 0.5352864583333333, + "learning_rate": 2.2169327904039754e-05, + "loss": 5.1932, + "loss/crossentropy": 1.7583889961242676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17270105704665184, + "step": 10146 + }, + { + "epoch": 0.8456666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.5301717122395834, + "learning_rate": 2.2144362483056622e-05, + "loss": 4.6843, + "loss/crossentropy": 1.759839728474617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18034739792346954, + "step": 10148 + }, + { + "epoch": 0.8458333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.5265462239583333, + "learning_rate": 2.2119401640674606e-05, + "loss": 5.4814, + "loss/crossentropy": 2.5909521877765656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22270163893699646, + "step": 10150 + }, + { + "epoch": 0.846, + "grad_norm": 4.78125, + "grad_norm_var": 0.52265625, + "learning_rate": 2.209444543848209e-05, + "loss": 4.5192, + "loss/crossentropy": 1.9088744521141052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738979637622833, + "step": 10152 + }, + { + "epoch": 0.8461666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.5177083333333333, + "learning_rate": 2.2069493938056033e-05, + "loss": 4.2045, + "loss/crossentropy": 2.0631661638617516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19043312687426805, + "step": 10154 + }, + { + "epoch": 0.8463333333333334, + "grad_norm": 4.625, + "grad_norm_var": 2.4303385416666665, + "learning_rate": 2.204454720096177e-05, + "loss": 4.0872, + "loss/crossentropy": 2.1815578043460846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19806847721338272, + "step": 10156 + }, + { + "epoch": 0.8465, + "grad_norm": 4.78125, + "grad_norm_var": 2.1123697916666666, + "learning_rate": 2.2019605288752914e-05, + "loss": 4.9222, + "loss/crossentropy": 1.1190447807312012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1384220365434885, + "step": 10158 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 2.0984212239583333, + "learning_rate": 2.1994668262971133e-05, + "loss": 4.9997, + "loss/crossentropy": 1.367589220404625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1640252321958542, + "step": 10160 + }, + { + "epoch": 0.8468333333333333, + "grad_norm": 4.625, + "grad_norm_var": 2.0991170247395834, + "learning_rate": 2.1969736185146077e-05, + "loss": 4.6507, + "loss/crossentropy": 1.1029272973537445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1302457209676504, + "step": 10162 + }, + { + "epoch": 0.847, + "grad_norm": 4.8125, + "grad_norm_var": 2.0749837239583333, + "learning_rate": 2.1944809116795156e-05, + "loss": 4.5281, + "loss/crossentropy": 1.1658693552017212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12650441750884056, + "step": 10164 + }, + { + "epoch": 0.8471666666666666, + "grad_norm": 5.28125, + "grad_norm_var": 2.0757120768229167, + "learning_rate": 2.1919887119423447e-05, + "loss": 5.1508, + "loss/crossentropy": 1.959177941083908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851571910083294, + "step": 10166 + }, + { + "epoch": 0.8473333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 2.111747233072917, + "learning_rate": 2.189497025452348e-05, + "loss": 5.093, + "loss/crossentropy": 2.24192276597023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19968308880925179, + "step": 10168 + }, + { + "epoch": 0.8475, + "grad_norm": 4.78125, + "grad_norm_var": 2.107405598958333, + "learning_rate": 2.1870058583575168e-05, + "loss": 4.9912, + "loss/crossentropy": 2.1087652146816254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21696339920163155, + "step": 10170 + }, + { + "epoch": 0.8476666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.05703125, + "learning_rate": 2.184515216804556e-05, + "loss": 5.3077, + "loss/crossentropy": 2.6343509554862976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213670052587986, + "step": 10172 + }, + { + "epoch": 0.8478333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 2.1820251069388778e-05, + "loss": 4.9912, + "loss/crossentropy": 2.2396809458732605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050902657210827, + "step": 10174 + }, + { + "epoch": 0.848, + "grad_norm": 4.96875, + "grad_norm_var": 0.09166259765625, + "learning_rate": 2.1795355349045796e-05, + "loss": 5.2577, + "loss/crossentropy": 2.42908251285553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073732390999794, + "step": 10176 + }, + { + "epoch": 0.8481666666666666, + "grad_norm": 5.125, + "grad_norm_var": 0.09986979166666667, + "learning_rate": 2.177046506844433e-05, + "loss": 5.0577, + "loss/crossentropy": 1.525025613605976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19690265133976936, + "step": 10178 + }, + { + "epoch": 0.8483333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.09830729166666667, + "learning_rate": 2.174558028899868e-05, + "loss": 5.1652, + "loss/crossentropy": 2.395362615585327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21092161908745766, + "step": 10180 + }, + { + "epoch": 0.8485, + "grad_norm": 5.1875, + "grad_norm_var": 0.09722900390625, + "learning_rate": 2.1720701072109564e-05, + "loss": 5.2021, + "loss/crossentropy": 1.5952081009745598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27194925770163536, + "step": 10182 + }, + { + "epoch": 0.8486666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.07851155598958333, + "learning_rate": 2.1695827479163967e-05, + "loss": 4.9746, + "loss/crossentropy": 2.3871026039123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2411057911813259, + "step": 10184 + }, + { + "epoch": 0.8488333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.07057291666666667, + "learning_rate": 2.167095957153502e-05, + "loss": 5.1706, + "loss/crossentropy": 1.297366164624691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14902203157544136, + "step": 10186 + }, + { + "epoch": 0.849, + "grad_norm": 4.71875, + "grad_norm_var": 0.08006184895833333, + "learning_rate": 2.1646097410581804e-05, + "loss": 4.6391, + "loss/crossentropy": 1.8485392034053802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16621309891343117, + "step": 10188 + }, + { + "epoch": 0.8491666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.065869140625, + "learning_rate": 2.1621241057649236e-05, + "loss": 4.9836, + "loss/crossentropy": 2.1049680411815643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2324654832482338, + "step": 10190 + }, + { + "epoch": 0.8493333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.05709635416666667, + "learning_rate": 2.159639057406789e-05, + "loss": 4.6144, + "loss/crossentropy": 1.404382936656475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13865314982831478, + "step": 10192 + }, + { + "epoch": 0.8495, + "grad_norm": 5.15625, + "grad_norm_var": 0.058577473958333334, + "learning_rate": 2.1571546021153863e-05, + "loss": 5.2336, + "loss/crossentropy": 2.1182867288589478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24229948967695236, + "step": 10194 + }, + { + "epoch": 0.8496666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.06638997395833333, + "learning_rate": 2.1546707460208634e-05, + "loss": 4.7806, + "loss/crossentropy": 1.6670853942632675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16587894223630428, + "step": 10196 + }, + { + "epoch": 0.8498333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.07177327473958334, + "learning_rate": 2.1521874952518863e-05, + "loss": 5.4903, + "loss/crossentropy": 1.9937995970249176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17633923329412937, + "step": 10198 + }, + { + "epoch": 0.85, + "grad_norm": 5.25, + "grad_norm_var": 0.09464518229166667, + "learning_rate": 2.149704855935631e-05, + "loss": 5.1717, + "loss/crossentropy": 1.7172137647867203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18349767476320267, + "step": 10200 + }, + { + "epoch": 0.8501666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.09670817057291667, + "learning_rate": 2.1472228341977624e-05, + "loss": 5.0091, + "loss/crossentropy": 2.4879343509674072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22031700983643532, + "step": 10202 + }, + { + "epoch": 0.8503333333333334, + "grad_norm": 22.375, + "grad_norm_var": 19.593550618489584, + "learning_rate": 2.1447414361624216e-05, + "loss": 4.6357, + "loss/crossentropy": 1.5000810474157333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13241293840110302, + "step": 10204 + }, + { + "epoch": 0.8505, + "grad_norm": 4.78125, + "grad_norm_var": 19.64205322265625, + "learning_rate": 2.142260667952214e-05, + "loss": 4.5955, + "loss/crossentropy": 2.1218055486679077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866939589381218, + "step": 10206 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 19.614046223958333, + "learning_rate": 2.1397805356881863e-05, + "loss": 4.6238, + "loss/crossentropy": 2.0160721242427826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22799114137887955, + "step": 10208 + }, + { + "epoch": 0.8508333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 19.658268229166666, + "learning_rate": 2.1373010454898198e-05, + "loss": 4.6067, + "loss/crossentropy": 1.691223792731762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17581338994204998, + "step": 10210 + }, + { + "epoch": 0.851, + "grad_norm": 4.375, + "grad_norm_var": 19.744755045572916, + "learning_rate": 2.1348222034750083e-05, + "loss": 4.1484, + "loss/crossentropy": 1.6983703970909119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15516823064535856, + "step": 10212 + }, + { + "epoch": 0.8511666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 19.739567057291666, + "learning_rate": 2.13234401576005e-05, + "loss": 4.5592, + "loss/crossentropy": 1.9170349910855293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19698243960738182, + "step": 10214 + }, + { + "epoch": 0.8513333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 19.742708333333333, + "learning_rate": 2.129866488459626e-05, + "loss": 4.6056, + "loss/crossentropy": 2.0503681302070618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23426194116473198, + "step": 10216 + }, + { + "epoch": 0.8515, + "grad_norm": 4.59375, + "grad_norm_var": 19.671354166666667, + "learning_rate": 2.1273896276867886e-05, + "loss": 4.7673, + "loss/crossentropy": 2.1819980144500732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21132346987724304, + "step": 10218 + }, + { + "epoch": 0.8516666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.09136962890625, + "learning_rate": 2.1249134395529447e-05, + "loss": 5.0062, + "loss/crossentropy": 2.1964263021945953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328333593904972, + "step": 10220 + }, + { + "epoch": 0.8518333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.08355712890625, + "learning_rate": 2.122437930167844e-05, + "loss": 4.7946, + "loss/crossentropy": 1.8490115702152252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21102577820420265, + "step": 10222 + }, + { + "epoch": 0.852, + "grad_norm": 4.84375, + "grad_norm_var": 0.08271077473958334, + "learning_rate": 2.1199631056395583e-05, + "loss": 5.2719, + "loss/crossentropy": 1.9915091469883919, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18897178769111633, + "step": 10224 + }, + { + "epoch": 0.8521666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.074609375, + "learning_rate": 2.1174889720744725e-05, + "loss": 5.1246, + "loss/crossentropy": 2.4218207597732544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24127807095646858, + "step": 10226 + }, + { + "epoch": 0.8523333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.053369140625, + "learning_rate": 2.1150155355772642e-05, + "loss": 5.446, + "loss/crossentropy": 2.052487760782242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854759305715561, + "step": 10228 + }, + { + "epoch": 0.8525, + "grad_norm": 4.59375, + "grad_norm_var": 0.026167805989583334, + "learning_rate": 2.112542802250892e-05, + "loss": 5.3682, + "loss/crossentropy": 2.172104150056839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20149703696370125, + "step": 10230 + }, + { + "epoch": 0.8526666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.027457682291666667, + "learning_rate": 2.1100707781965806e-05, + "loss": 5.1003, + "loss/crossentropy": 1.58599391579628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1738888956606388, + "step": 10232 + }, + { + "epoch": 0.8528333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.043778483072916666, + "learning_rate": 2.1075994695138025e-05, + "loss": 5.1155, + "loss/crossentropy": 2.560574531555176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391079783439636, + "step": 10234 + }, + { + "epoch": 0.853, + "grad_norm": 4.875, + "grad_norm_var": 0.04412434895833333, + "learning_rate": 2.1051288823002663e-05, + "loss": 5.4341, + "loss/crossentropy": 2.2714935541152954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219677422195673, + "step": 10236 + }, + { + "epoch": 0.8531666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.04608968098958333, + "learning_rate": 2.1026590226519018e-05, + "loss": 5.1789, + "loss/crossentropy": 1.3990702331066132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14530890248715878, + "step": 10238 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.046614583333333334, + "learning_rate": 2.1001898966628403e-05, + "loss": 4.559, + "loss/crossentropy": 2.0317687690258026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17451823875308037, + "step": 10240 + }, + { + "epoch": 0.8535, + "grad_norm": 4.3125, + "grad_norm_var": 0.05474853515625, + "learning_rate": 2.097721510425407e-05, + "loss": 4.9075, + "loss/crossentropy": 2.2877692580223083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19254744797945023, + "step": 10242 + }, + { + "epoch": 0.8536666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.05673421223958333, + "learning_rate": 2.0952538700300966e-05, + "loss": 4.7423, + "loss/crossentropy": 1.2009011879563332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12820916064083576, + "step": 10244 + }, + { + "epoch": 0.8538333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.057417805989583334, + "learning_rate": 2.0927869815655684e-05, + "loss": 4.9864, + "loss/crossentropy": 2.1787761747837067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240334264934063, + "step": 10246 + }, + { + "epoch": 0.854, + "grad_norm": 4.625, + "grad_norm_var": 0.061442057291666664, + "learning_rate": 2.090320851118624e-05, + "loss": 5.1471, + "loss/crossentropy": 2.5853514075279236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1966766081750393, + "step": 10248 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.037984212239583336, + "learning_rate": 2.0878554847741956e-05, + "loss": 5.1916, + "loss/crossentropy": 2.4860697388648987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21677172929048538, + "step": 10250 + }, + { + "epoch": 0.8543333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.046875, + "learning_rate": 2.0853908886153285e-05, + "loss": 4.6001, + "loss/crossentropy": 1.6758419573307037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1749027669429779, + "step": 10252 + }, + { + "epoch": 0.8545, + "grad_norm": 4.5, + "grad_norm_var": 0.03707275390625, + "learning_rate": 2.0829270687231693e-05, + "loss": 4.2543, + "loss/crossentropy": 2.1500919461250305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609234645962715, + "step": 10254 + }, + { + "epoch": 0.8546666666666667, + "grad_norm": 4.1875, + "grad_norm_var": 0.042822265625, + "learning_rate": 2.0804640311769494e-05, + "loss": 4.4822, + "loss/crossentropy": 1.4717730283737183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15207753330469131, + "step": 10256 + }, + { + "epoch": 0.8548333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.045166015625, + "learning_rate": 2.078001782053968e-05, + "loss": 5.0939, + "loss/crossentropy": 2.1801829636096954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21367578580975533, + "step": 10258 + }, + { + "epoch": 0.855, + "grad_norm": 4.53125, + "grad_norm_var": 0.0419921875, + "learning_rate": 2.0755403274295807e-05, + "loss": 4.4958, + "loss/crossentropy": 2.5639131665229797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22637486085295677, + "step": 10260 + }, + { + "epoch": 0.8551666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.04491780598958333, + "learning_rate": 2.0730796733771815e-05, + "loss": 5.2363, + "loss/crossentropy": 1.9212607964873314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1764792650938034, + "step": 10262 + }, + { + "epoch": 0.8553333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.04918212890625, + "learning_rate": 2.0706198259681907e-05, + "loss": 4.3208, + "loss/crossentropy": 2.1319038569927216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085258960723877, + "step": 10264 + }, + { + "epoch": 0.8555, + "grad_norm": 4.96875, + "grad_norm_var": 0.07272135416666667, + "learning_rate": 2.0681607912720353e-05, + "loss": 5.7044, + "loss/crossentropy": 2.014607787132263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217660091817379, + "step": 10266 + }, + { + "epoch": 0.8556666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.06360270182291666, + "learning_rate": 2.065702575356142e-05, + "loss": 4.9487, + "loss/crossentropy": 1.988630086183548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18287063390016556, + "step": 10268 + }, + { + "epoch": 0.8558333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.0615234375, + "learning_rate": 2.0632451842859113e-05, + "loss": 4.5084, + "loss/crossentropy": 2.0185526311397552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114994414150715, + "step": 10270 + }, + { + "epoch": 0.856, + "grad_norm": 4.53125, + "grad_norm_var": 0.052197265625, + "learning_rate": 2.0607886241247135e-05, + "loss": 4.7235, + "loss/crossentropy": 2.469718277454376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23021500930190086, + "step": 10272 + }, + { + "epoch": 0.8561666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.0750152587890625, + "learning_rate": 2.0583329009338646e-05, + "loss": 4.3133, + "loss/crossentropy": 1.1823057383298874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1447651106864214, + "step": 10274 + }, + { + "epoch": 0.8563333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.08342997233072917, + "learning_rate": 2.0558780207726193e-05, + "loss": 4.6926, + "loss/crossentropy": 1.8804874122142792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16276229731738567, + "step": 10276 + }, + { + "epoch": 0.8565, + "grad_norm": 4.09375, + "grad_norm_var": 0.0946685791015625, + "learning_rate": 2.0534239896981488e-05, + "loss": 4.8669, + "loss/crossentropy": 2.515592932701111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22185379639267921, + "step": 10278 + }, + { + "epoch": 0.8566666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.08923238118489583, + "learning_rate": 2.050970813765533e-05, + "loss": 4.7441, + "loss/crossentropy": 1.2782834395766258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1427957322448492, + "step": 10280 + }, + { + "epoch": 0.8568333333333333, + "grad_norm": 4.21875, + "grad_norm_var": 0.06330464680989584, + "learning_rate": 2.0485184990277367e-05, + "loss": 4.3806, + "loss/crossentropy": 2.2299217581748962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995675452053547, + "step": 10282 + }, + { + "epoch": 0.857, + "grad_norm": 4.40625, + "grad_norm_var": 0.05628153483072917, + "learning_rate": 2.046067051535605e-05, + "loss": 4.4903, + "loss/crossentropy": 1.19523473829031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16348830796778202, + "step": 10284 + }, + { + "epoch": 0.8571666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.05873921712239583, + "learning_rate": 2.0436164773378402e-05, + "loss": 5.01, + "loss/crossentropy": 1.890808716416359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17876779288053513, + "step": 10286 + }, + { + "epoch": 0.8573333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.05934956868489583, + "learning_rate": 2.041166782480991e-05, + "loss": 4.8465, + "loss/crossentropy": 1.7806348651647568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16407202184200287, + "step": 10288 + }, + { + "epoch": 0.8575, + "grad_norm": 4.5, + "grad_norm_var": 0.03943684895833333, + "learning_rate": 2.0387179730094343e-05, + "loss": 4.7088, + "loss/crossentropy": 1.9317216500639915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17379080504179, + "step": 10290 + }, + { + "epoch": 0.8576666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.03470052083333333, + "learning_rate": 2.0362700549653663e-05, + "loss": 5.1756, + "loss/crossentropy": 2.081753820180893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230081208050251, + "step": 10292 + }, + { + "epoch": 0.8578333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.025846354166666665, + "learning_rate": 2.03382303438878e-05, + "loss": 4.9575, + "loss/crossentropy": 1.958198145031929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16479731537401676, + "step": 10294 + }, + { + "epoch": 0.858, + "grad_norm": 4.53125, + "grad_norm_var": 0.02613525390625, + "learning_rate": 2.031376917317456e-05, + "loss": 4.717, + "loss/crossentropy": 1.3425401076674461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1326687391847372, + "step": 10296 + }, + { + "epoch": 0.8581666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.02265625, + "learning_rate": 2.028931709786944e-05, + "loss": 4.9235, + "loss/crossentropy": 1.9076469615101814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17414027452468872, + "step": 10298 + }, + { + "epoch": 0.8583333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.04192708333333333, + "learning_rate": 2.026487417830552e-05, + "loss": 4.4676, + "loss/crossentropy": 1.4255196824669838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15519500523805618, + "step": 10300 + }, + { + "epoch": 0.8585, + "grad_norm": 4.3125, + "grad_norm_var": 0.04315999348958333, + "learning_rate": 2.024044047479326e-05, + "loss": 4.8051, + "loss/crossentropy": 1.4676533862948418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14330729097127914, + "step": 10302 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.0408203125, + "learning_rate": 2.021601604762041e-05, + "loss": 5.3457, + "loss/crossentropy": 2.270623505115509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200910672545433, + "step": 10304 + }, + { + "epoch": 0.8588333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.04218343098958333, + "learning_rate": 2.0191600957051802e-05, + "loss": 4.7869, + "loss/crossentropy": 2.379778265953064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987297348678112, + "step": 10306 + }, + { + "epoch": 0.859, + "grad_norm": 4.84375, + "grad_norm_var": 0.05764567057291667, + "learning_rate": 2.016719526332926e-05, + "loss": 4.9602, + "loss/crossentropy": 1.8067995011806488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1776756253093481, + "step": 10308 + }, + { + "epoch": 0.8591666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.0548828125, + "learning_rate": 2.0142799026671387e-05, + "loss": 5.0338, + "loss/crossentropy": 1.6648282185196877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1662379615008831, + "step": 10310 + }, + { + "epoch": 0.8593333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.05435791015625, + "learning_rate": 2.011841230727349e-05, + "loss": 5.0295, + "loss/crossentropy": 2.2086196839809418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117423191666603, + "step": 10312 + }, + { + "epoch": 0.8595, + "grad_norm": 4.78125, + "grad_norm_var": 0.05445556640625, + "learning_rate": 2.009403516530736e-05, + "loss": 4.9409, + "loss/crossentropy": 1.8415422439575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215627308934927, + "step": 10314 + }, + { + "epoch": 0.8596666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.0404296875, + "learning_rate": 2.0069667660921183e-05, + "loss": 5.2048, + "loss/crossentropy": 2.0204322412610054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18452575244009495, + "step": 10316 + }, + { + "epoch": 0.8598333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.032059733072916666, + "learning_rate": 2.004530985423935e-05, + "loss": 4.4636, + "loss/crossentropy": 1.3031515032052994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13595690671354532, + "step": 10318 + }, + { + "epoch": 0.86, + "grad_norm": 4.59375, + "grad_norm_var": 0.031966145833333334, + "learning_rate": 2.002096180536233e-05, + "loss": 4.989, + "loss/crossentropy": 2.1542540416121483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19251862913370132, + "step": 10320 + }, + { + "epoch": 0.8601666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.03435872395833333, + "learning_rate": 1.9996623574366506e-05, + "loss": 4.6652, + "loss/crossentropy": 1.7660870179533958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1680949106812477, + "step": 10322 + }, + { + "epoch": 0.8603333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.038655598958333336, + "learning_rate": 1.997229522130405e-05, + "loss": 5.0513, + "loss/crossentropy": 1.868349775671959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752658188343048, + "step": 10324 + }, + { + "epoch": 0.8605, + "grad_norm": 4.875, + "grad_norm_var": 0.0478515625, + "learning_rate": 1.994797680620275e-05, + "loss": 4.5671, + "loss/crossentropy": 2.250141680240631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370268814265728, + "step": 10326 + }, + { + "epoch": 0.8606666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.04659830729166667, + "learning_rate": 1.992366838906589e-05, + "loss": 4.7318, + "loss/crossentropy": 1.7279707714915276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15387518890202045, + "step": 10328 + }, + { + "epoch": 0.8608333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.041910807291666664, + "learning_rate": 1.9899370029872056e-05, + "loss": 5.0707, + "loss/crossentropy": 2.460710883140564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20482278987765312, + "step": 10330 + }, + { + "epoch": 0.861, + "grad_norm": 4.40625, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 1.9875081788575047e-05, + "loss": 4.2985, + "loss/crossentropy": 2.021486707031727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17060382664203644, + "step": 10332 + }, + { + "epoch": 0.8611666666666666, + "grad_norm": 4.34375, + "grad_norm_var": 0.05325113932291667, + "learning_rate": 1.9850803725103674e-05, + "loss": 5.1125, + "loss/crossentropy": 1.4065138399600983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1290155854076147, + "step": 10334 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.06417643229166667, + "learning_rate": 1.9826535899361657e-05, + "loss": 4.1981, + "loss/crossentropy": 1.3243483901023865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14576907455921173, + "step": 10336 + }, + { + "epoch": 0.8615, + "grad_norm": 4.78125, + "grad_norm_var": 0.07786458333333333, + "learning_rate": 1.9802278371227427e-05, + "loss": 4.6557, + "loss/crossentropy": 1.7146344780921936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19457191228866577, + "step": 10338 + }, + { + "epoch": 0.8616666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.06560872395833334, + "learning_rate": 1.9778031200554038e-05, + "loss": 4.6025, + "loss/crossentropy": 0.9829111769795418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13816312327980995, + "step": 10340 + }, + { + "epoch": 0.8618333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.07193603515625, + "learning_rate": 1.9753794447168965e-05, + "loss": 4.0344, + "loss/crossentropy": 1.5932381376624107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16735537350177765, + "step": 10342 + }, + { + "epoch": 0.862, + "grad_norm": 4.875, + "grad_norm_var": 0.0759765625, + "learning_rate": 1.9729568170873997e-05, + "loss": 5.3311, + "loss/crossentropy": 2.4922866225242615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20134862512350082, + "step": 10344 + }, + { + "epoch": 0.8621666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.08033854166666667, + "learning_rate": 1.970535243144505e-05, + "loss": 4.7939, + "loss/crossentropy": 1.3205213844776154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20733513124287128, + "step": 10346 + }, + { + "epoch": 0.8623333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.07483317057291666, + "learning_rate": 1.9681147288632063e-05, + "loss": 5.342, + "loss/crossentropy": 1.4077673107385635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15148967504501343, + "step": 10348 + }, + { + "epoch": 0.8625, + "grad_norm": 4.875, + "grad_norm_var": 0.087109375, + "learning_rate": 1.9656952802158816e-05, + "loss": 5.494, + "loss/crossentropy": 1.784704715013504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.165837112814188, + "step": 10350 + }, + { + "epoch": 0.8626666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.06754150390625, + "learning_rate": 1.96327690317228e-05, + "loss": 4.6651, + "loss/crossentropy": 1.8337150737643242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1464794110506773, + "step": 10352 + }, + { + "epoch": 0.8628333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.06523030598958333, + "learning_rate": 1.9608596036995065e-05, + "loss": 4.8617, + "loss/crossentropy": 2.221179723739624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20009664818644524, + "step": 10354 + }, + { + "epoch": 0.863, + "grad_norm": 4.90625, + "grad_norm_var": 0.06724853515625, + "learning_rate": 1.9584433877620075e-05, + "loss": 4.8329, + "loss/crossentropy": 2.1157293617725372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24074679613113403, + "step": 10356 + }, + { + "epoch": 0.8631666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.04254150390625, + "learning_rate": 1.9560282613215547e-05, + "loss": 5.1425, + "loss/crossentropy": 2.292990207672119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21054679155349731, + "step": 10358 + }, + { + "epoch": 0.8633333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.04568684895833333, + "learning_rate": 1.9536142303372337e-05, + "loss": 4.7158, + "loss/crossentropy": 1.4450874850153923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13776488974690437, + "step": 10360 + }, + { + "epoch": 0.8635, + "grad_norm": 4.875, + "grad_norm_var": 0.05545247395833333, + "learning_rate": 1.9512013007654248e-05, + "loss": 4.4627, + "loss/crossentropy": 1.927704095840454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18812527135014534, + "step": 10362 + }, + { + "epoch": 0.8636666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.06100260416666667, + "learning_rate": 1.9487894785597933e-05, + "loss": 4.3119, + "loss/crossentropy": 1.5006299167871475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726713590323925, + "step": 10364 + }, + { + "epoch": 0.8638333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.05050455729166667, + "learning_rate": 1.9463787696712696e-05, + "loss": 5.0052, + "loss/crossentropy": 1.9660705775022507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18909884989261627, + "step": 10366 + }, + { + "epoch": 0.864, + "grad_norm": 4.1875, + "grad_norm_var": 0.06276041666666667, + "learning_rate": 1.9439691800480384e-05, + "loss": 4.5395, + "loss/crossentropy": 1.9638324081897736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17081649415194988, + "step": 10368 + }, + { + "epoch": 0.8641666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.05974934895833333, + "learning_rate": 1.9415607156355228e-05, + "loss": 5.0068, + "loss/crossentropy": 1.952818602323532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18014013394713402, + "step": 10370 + }, + { + "epoch": 0.8643333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.06614176432291667, + "learning_rate": 1.93915338237637e-05, + "loss": 4.9283, + "loss/crossentropy": 1.7149565666913986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17573698051273823, + "step": 10372 + }, + { + "epoch": 0.8645, + "grad_norm": 4.59375, + "grad_norm_var": 0.067822265625, + "learning_rate": 1.9367471862104334e-05, + "loss": 5.0251, + "loss/crossentropy": 1.1846980601549149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13160541094839573, + "step": 10374 + }, + { + "epoch": 0.8646666666666667, + "grad_norm": 4.15625, + "grad_norm_var": 0.729931640625, + "learning_rate": 1.9343421330747656e-05, + "loss": 4.4774, + "loss/crossentropy": 1.783796139061451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938588246703148, + "step": 10376 + }, + { + "epoch": 0.8648333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.72105712890625, + "learning_rate": 1.9319382289035937e-05, + "loss": 4.528, + "loss/crossentropy": 2.5256577730178833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21793661266565323, + "step": 10378 + }, + { + "epoch": 0.865, + "grad_norm": 4.65625, + "grad_norm_var": 0.7090779622395833, + "learning_rate": 1.929535479628314e-05, + "loss": 5.0167, + "loss/crossentropy": 1.3702474012970924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1659705750644207, + "step": 10380 + }, + { + "epoch": 0.8651666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.7242838541666666, + "learning_rate": 1.9271338911774705e-05, + "loss": 4.5104, + "loss/crossentropy": 2.072055459022522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18684400990605354, + "step": 10382 + }, + { + "epoch": 0.8653333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.7132649739583333, + "learning_rate": 1.9247334694767446e-05, + "loss": 5.0498, + "loss/crossentropy": 1.4851181358098984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1375308893620968, + "step": 10384 + }, + { + "epoch": 0.8655, + "grad_norm": 4.625, + "grad_norm_var": 0.70621337890625, + "learning_rate": 1.9223342204489377e-05, + "loss": 5.1391, + "loss/crossentropy": 2.250565826892853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24506643041968346, + "step": 10386 + }, + { + "epoch": 0.8656666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.6966756184895834, + "learning_rate": 1.9199361500139587e-05, + "loss": 4.4246, + "loss/crossentropy": 1.9518256038427353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17143848165869713, + "step": 10388 + }, + { + "epoch": 0.8658333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.7186482747395834, + "learning_rate": 1.9175392640888073e-05, + "loss": 4.6129, + "loss/crossentropy": 2.004493474960327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18428033217787743, + "step": 10390 + }, + { + "epoch": 0.866, + "grad_norm": 4.75, + "grad_norm_var": 0.059765625, + "learning_rate": 1.9151435685875622e-05, + "loss": 4.9474, + "loss/crossentropy": 1.5391086861491203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15014921128749847, + "step": 10392 + }, + { + "epoch": 0.8661666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.06064046223958333, + "learning_rate": 1.912749069421363e-05, + "loss": 5.3938, + "loss/crossentropy": 2.432666063308716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20987457409501076, + "step": 10394 + }, + { + "epoch": 0.8663333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.07272135416666667, + "learning_rate": 1.910355772498399e-05, + "loss": 4.319, + "loss/crossentropy": 0.9199870005249977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10082777217030525, + "step": 10396 + }, + { + "epoch": 0.8665, + "grad_norm": 4.46875, + "grad_norm_var": 0.06672770182291667, + "learning_rate": 1.9079636837238923e-05, + "loss": 4.5733, + "loss/crossentropy": 1.7452101185917854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17284293472766876, + "step": 10398 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.05514322916666667, + "learning_rate": 1.9055728090000843e-05, + "loss": 4.6767, + "loss/crossentropy": 1.7718966230750084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16622867062687874, + "step": 10400 + }, + { + "epoch": 0.8668333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.04529622395833333, + "learning_rate": 1.9031831542262203e-05, + "loss": 4.9882, + "loss/crossentropy": 1.9369821846485138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18681341037154198, + "step": 10402 + }, + { + "epoch": 0.867, + "grad_norm": 4.53125, + "grad_norm_var": 0.04973551432291667, + "learning_rate": 1.9007947252985367e-05, + "loss": 4.8377, + "loss/crossentropy": 2.347927749156952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116454504430294, + "step": 10404 + }, + { + "epoch": 0.8671666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.07431233723958333, + "learning_rate": 1.898407528110243e-05, + "loss": 5.2396, + "loss/crossentropy": 2.3202788531780243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18752939999103546, + "step": 10406 + }, + { + "epoch": 0.8673333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.06751302083333334, + "learning_rate": 1.8960215685515128e-05, + "loss": 4.4656, + "loss/crossentropy": 1.361263856291771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15047047100961208, + "step": 10408 + }, + { + "epoch": 0.8675, + "grad_norm": 4.53125, + "grad_norm_var": 0.07115885416666666, + "learning_rate": 1.8936368525094623e-05, + "loss": 4.9721, + "loss/crossentropy": 2.467073440551758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162875458598137, + "step": 10410 + }, + { + "epoch": 0.8676666666666667, + "grad_norm": 5.4375, + "grad_norm_var": 0.10846354166666666, + "learning_rate": 1.891253385868143e-05, + "loss": 4.8256, + "loss/crossentropy": 2.3389711380004883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039964720606804, + "step": 10412 + }, + { + "epoch": 0.8678333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.11080322265625, + "learning_rate": 1.88887117450852e-05, + "loss": 4.7788, + "loss/crossentropy": 1.9970930740237236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18604559637606144, + "step": 10414 + }, + { + "epoch": 0.868, + "grad_norm": 4.6875, + "grad_norm_var": 0.11174723307291666, + "learning_rate": 1.8864902243084654e-05, + "loss": 4.6257, + "loss/crossentropy": 1.6402384638786316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1494317278265953, + "step": 10416 + }, + { + "epoch": 0.8681666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.10662434895833334, + "learning_rate": 1.884110541142735e-05, + "loss": 4.7642, + "loss/crossentropy": 1.9170377254486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19645358622074127, + "step": 10418 + }, + { + "epoch": 0.8683333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.10836181640625, + "learning_rate": 1.8817321308829616e-05, + "loss": 4.8152, + "loss/crossentropy": 1.6103285178542137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402664922177792, + "step": 10420 + }, + { + "epoch": 0.8685, + "grad_norm": 4.59375, + "grad_norm_var": 0.077587890625, + "learning_rate": 1.879354999397635e-05, + "loss": 5.0893, + "loss/crossentropy": 2.393158346414566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20873240754008293, + "step": 10422 + }, + { + "epoch": 0.8686666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.07649739583333333, + "learning_rate": 1.8769791525520924e-05, + "loss": 4.9919, + "loss/crossentropy": 1.6317023634910583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17474069818854332, + "step": 10424 + }, + { + "epoch": 0.8688333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.07336832682291666, + "learning_rate": 1.8746045962084985e-05, + "loss": 5.0873, + "loss/crossentropy": 2.3630973398685455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21914517134428024, + "step": 10426 + }, + { + "epoch": 0.869, + "grad_norm": 4.34375, + "grad_norm_var": 0.03619791666666667, + "learning_rate": 1.8722313362258357e-05, + "loss": 4.7096, + "loss/crossentropy": 1.8483033329248428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20285437256097794, + "step": 10428 + }, + { + "epoch": 0.8691666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.03404947916666667, + "learning_rate": 1.8698593784598865e-05, + "loss": 4.4115, + "loss/crossentropy": 2.0544984862208366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418617151677608, + "step": 10430 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.03448893229166667, + "learning_rate": 1.8674887287632217e-05, + "loss": 4.746, + "loss/crossentropy": 2.0632302463054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20877571776509285, + "step": 10432 + }, + { + "epoch": 0.8695, + "grad_norm": 4.78125, + "grad_norm_var": 0.035139973958333334, + "learning_rate": 1.865119392985183e-05, + "loss": 4.7295, + "loss/crossentropy": 2.368138611316681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977866180241108, + "step": 10434 + }, + { + "epoch": 0.8696666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.03883056640625, + "learning_rate": 1.8627513769718714e-05, + "loss": 5.0828, + "loss/crossentropy": 1.7253614962100983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17387551069259644, + "step": 10436 + }, + { + "epoch": 0.8698333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.04732666015625, + "learning_rate": 1.86038468656613e-05, + "loss": 5.0308, + "loss/crossentropy": 1.5639515295624733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19789879396557808, + "step": 10438 + }, + { + "epoch": 0.87, + "grad_norm": 5.34375, + "grad_norm_var": 0.08151041666666667, + "learning_rate": 1.858019327607534e-05, + "loss": 4.7967, + "loss/crossentropy": 2.375213235616684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19904804602265358, + "step": 10440 + }, + { + "epoch": 0.8701666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.08590087890625, + "learning_rate": 1.85565530593237e-05, + "loss": 4.2816, + "loss/crossentropy": 2.289244920015335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22079583629965782, + "step": 10442 + }, + { + "epoch": 0.8703333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.074853515625, + "learning_rate": 1.853292627373627e-05, + "loss": 4.6217, + "loss/crossentropy": 1.7237029895186424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15398419462144375, + "step": 10444 + }, + { + "epoch": 0.8705, + "grad_norm": 4.5625, + "grad_norm_var": 0.07408854166666666, + "learning_rate": 1.850931297760979e-05, + "loss": 5.191, + "loss/crossentropy": 1.8904145956039429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17381685227155685, + "step": 10446 + }, + { + "epoch": 0.8706666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.06754150390625, + "learning_rate": 1.8485713229207733e-05, + "loss": 4.889, + "loss/crossentropy": 1.340662695467472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1496703438460827, + "step": 10448 + }, + { + "epoch": 0.8708333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.05670572916666667, + "learning_rate": 1.8462127086760112e-05, + "loss": 5.1296, + "loss/crossentropy": 2.0262687131762505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17123587429523468, + "step": 10450 + }, + { + "epoch": 0.871, + "grad_norm": 4.1875, + "grad_norm_var": 0.07317708333333334, + "learning_rate": 1.843855460846341e-05, + "loss": 4.965, + "loss/crossentropy": 2.0109422728419304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17585521936416626, + "step": 10452 + }, + { + "epoch": 0.8711666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.06731770833333334, + "learning_rate": 1.8414995852480357e-05, + "loss": 5.0754, + "loss/crossentropy": 2.5963427424430847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299847975373268, + "step": 10454 + }, + { + "epoch": 0.8713333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.03345947265625, + "learning_rate": 1.839145087693986e-05, + "loss": 5.1549, + "loss/crossentropy": 1.8669070899486542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17287633568048477, + "step": 10456 + }, + { + "epoch": 0.8715, + "grad_norm": 4.9375, + "grad_norm_var": 0.03785400390625, + "learning_rate": 1.8367919739936788e-05, + "loss": 4.9381, + "loss/crossentropy": 1.4938563853502274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16861506551504135, + "step": 10458 + }, + { + "epoch": 0.8716666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.04176025390625, + "learning_rate": 1.834440249953189e-05, + "loss": 4.9393, + "loss/crossentropy": 1.6703289598226547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17855097353458405, + "step": 10460 + }, + { + "epoch": 0.8718333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 1.8320899213751614e-05, + "loss": 4.6414, + "loss/crossentropy": 1.7185562402009964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17512892372906208, + "step": 10462 + }, + { + "epoch": 0.872, + "grad_norm": 4.625, + "grad_norm_var": 0.0478515625, + "learning_rate": 1.829740994058799e-05, + "loss": 5.2955, + "loss/crossentropy": 1.9877119585871696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850297786295414, + "step": 10464 + }, + { + "epoch": 0.8721666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.046773274739583336, + "learning_rate": 1.827393473799846e-05, + "loss": 5.2927, + "loss/crossentropy": 2.5606048107147217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21541300043463707, + "step": 10466 + }, + { + "epoch": 0.8723333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.03528238932291667, + "learning_rate": 1.8250473663905756e-05, + "loss": 4.6312, + "loss/crossentropy": 2.208735913038254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19427185505628586, + "step": 10468 + }, + { + "epoch": 0.8725, + "grad_norm": 4.4375, + "grad_norm_var": 0.034077962239583336, + "learning_rate": 1.8227026776197735e-05, + "loss": 4.8591, + "loss/crossentropy": 1.5274348929524422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14625133015215397, + "step": 10470 + }, + { + "epoch": 0.8726666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.038671875, + "learning_rate": 1.820359413272727e-05, + "loss": 5.2586, + "loss/crossentropy": 1.855014145374298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736547350883484, + "step": 10472 + }, + { + "epoch": 0.8728333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.036181640625, + "learning_rate": 1.818017579131208e-05, + "loss": 4.579, + "loss/crossentropy": 1.6626396775245667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938914842903614, + "step": 10474 + }, + { + "epoch": 0.873, + "grad_norm": 4.65625, + "grad_norm_var": 0.042643229166666664, + "learning_rate": 1.81567718097346e-05, + "loss": 5.3581, + "loss/crossentropy": 1.793995201587677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17096725851297379, + "step": 10476 + }, + { + "epoch": 0.8731666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.042952473958333334, + "learning_rate": 1.8133382245741814e-05, + "loss": 5.1202, + "loss/crossentropy": 1.8239585757255554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21488109789788723, + "step": 10478 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 4.1875, + "grad_norm_var": 0.052978515625, + "learning_rate": 1.8110007157045157e-05, + "loss": 4.63, + "loss/crossentropy": 2.220126062631607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20925140008330345, + "step": 10480 + }, + { + "epoch": 0.8735, + "grad_norm": 4.6875, + "grad_norm_var": 0.05797119140625, + "learning_rate": 1.8086646601320327e-05, + "loss": 4.891, + "loss/crossentropy": 1.5069852694869041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14469253458082676, + "step": 10482 + }, + { + "epoch": 0.8736666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.05728759765625, + "learning_rate": 1.806330063620719e-05, + "loss": 4.045, + "loss/crossentropy": 1.8212331235408783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20925015211105347, + "step": 10484 + }, + { + "epoch": 0.8738333333333334, + "grad_norm": 5.0, + "grad_norm_var": 0.06790364583333333, + "learning_rate": 1.8039969319309573e-05, + "loss": 4.6383, + "loss/crossentropy": 1.6038372293114662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1596646960824728, + "step": 10486 + }, + { + "epoch": 0.874, + "grad_norm": 4.84375, + "grad_norm_var": 0.06614583333333333, + "learning_rate": 1.8016652708195196e-05, + "loss": 5.2458, + "loss/crossentropy": 1.9456142485141754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19872823730111122, + "step": 10488 + }, + { + "epoch": 0.8741666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.05896809895833333, + "learning_rate": 1.799335086039547e-05, + "loss": 5.0188, + "loss/crossentropy": 2.1081501841545105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923701912164688, + "step": 10490 + }, + { + "epoch": 0.8743333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.0552734375, + "learning_rate": 1.79700638334054e-05, + "loss": 5.2991, + "loss/crossentropy": 1.814670369029045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19245347008109093, + "step": 10492 + }, + { + "epoch": 0.8745, + "grad_norm": 4.4375, + "grad_norm_var": 0.05188802083333333, + "learning_rate": 1.79467916846834e-05, + "loss": 4.6803, + "loss/crossentropy": 2.246580570936203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071346677839756, + "step": 10494 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 5.0625, + "grad_norm_var": 0.06112874348958333, + "learning_rate": 1.7923534471651186e-05, + "loss": 5.0703, + "loss/crossentropy": 2.5355364084243774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118670642375946, + "step": 10496 + }, + { + "epoch": 0.8748333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.05584309895833333, + "learning_rate": 1.7900292251693618e-05, + "loss": 4.8563, + "loss/crossentropy": 1.5048917829990387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14453205838799477, + "step": 10498 + }, + { + "epoch": 0.875, + "grad_norm": 4.59375, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 1.7877065082158567e-05, + "loss": 4.7278, + "loss/crossentropy": 2.3392655849456787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20232314616441727, + "step": 10500 + }, + { + "epoch": 0.8751666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 1.7853853020356763e-05, + "loss": 5.1296, + "loss/crossentropy": 1.437909610569477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20237400382757187, + "step": 10502 + }, + { + "epoch": 0.8753333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.04455973307291667, + "learning_rate": 1.7830656123561658e-05, + "loss": 5.0175, + "loss/crossentropy": 1.020252212882042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12232361361384392, + "step": 10504 + }, + { + "epoch": 0.8755, + "grad_norm": 4.6875, + "grad_norm_var": 0.04407145182291667, + "learning_rate": 1.7807474449009293e-05, + "loss": 5.1914, + "loss/crossentropy": 1.643537849187851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.177352674305439, + "step": 10506 + }, + { + "epoch": 0.8756666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.0515625, + "learning_rate": 1.7784308053898147e-05, + "loss": 5.2378, + "loss/crossentropy": 1.8363083899021149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17326289042830467, + "step": 10508 + }, + { + "epoch": 0.8758333333333334, + "grad_norm": 4.96875, + "grad_norm_var": 0.057535807291666664, + "learning_rate": 1.7761156995388994e-05, + "loss": 4.86, + "loss/crossentropy": 2.0081919208168983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17866789363324642, + "step": 10510 + }, + { + "epoch": 0.876, + "grad_norm": 4.25, + "grad_norm_var": 0.04763997395833333, + "learning_rate": 1.7738021330604765e-05, + "loss": 4.4745, + "loss/crossentropy": 0.8506453335285187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1177255641669035, + "step": 10512 + }, + { + "epoch": 0.8761666666666666, + "grad_norm": 4.15625, + "grad_norm_var": 0.06155192057291667, + "learning_rate": 1.7714901116630424e-05, + "loss": 4.8159, + "loss/crossentropy": 2.144615203142166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23711377754807472, + "step": 10514 + }, + { + "epoch": 0.8763333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.08253580729166667, + "learning_rate": 1.7691796410512784e-05, + "loss": 5.3608, + "loss/crossentropy": 2.1386347115039825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22258469462394714, + "step": 10516 + }, + { + "epoch": 0.8765, + "grad_norm": 4.65625, + "grad_norm_var": 0.08435872395833334, + "learning_rate": 1.7668707269260435e-05, + "loss": 5.1316, + "loss/crossentropy": 2.270026445388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18253154307603836, + "step": 10518 + }, + { + "epoch": 0.8766666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08253580729166667, + "learning_rate": 1.7645633749843512e-05, + "loss": 5.412, + "loss/crossentropy": 2.1419003307819366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19707633554935455, + "step": 10520 + }, + { + "epoch": 0.8768333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.08495686848958334, + "learning_rate": 1.762257590919365e-05, + "loss": 5.1619, + "loss/crossentropy": 1.8701740205287933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16680870950222015, + "step": 10522 + }, + { + "epoch": 0.877, + "grad_norm": 4.53125, + "grad_norm_var": 0.09342447916666667, + "learning_rate": 1.7599533804203767e-05, + "loss": 4.7449, + "loss/crossentropy": 1.9194505885243416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18774541094899178, + "step": 10524 + }, + { + "epoch": 0.8771666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.09550374348958333, + "learning_rate": 1.7576507491727975e-05, + "loss": 5.3806, + "loss/crossentropy": 2.5566156804561615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26438262313604355, + "step": 10526 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.08609619140625, + "learning_rate": 1.75534970285814e-05, + "loss": 4.9388, + "loss/crossentropy": 1.4661534652113914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15747330151498318, + "step": 10528 + }, + { + "epoch": 0.8775, + "grad_norm": 4.40625, + "grad_norm_var": 0.07174072265625, + "learning_rate": 1.7530502471540084e-05, + "loss": 4.6847, + "loss/crossentropy": 2.2137043476104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23433979973196983, + "step": 10530 + }, + { + "epoch": 0.8776666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.055712890625, + "learning_rate": 1.7507523877340803e-05, + "loss": 5.2944, + "loss/crossentropy": 1.5667135491967201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16603326424956322, + "step": 10532 + }, + { + "epoch": 0.8778333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.04195556640625, + "learning_rate": 1.748456130268096e-05, + "loss": 4.5756, + "loss/crossentropy": 1.7613427862524986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17346342653036118, + "step": 10534 + }, + { + "epoch": 0.878, + "grad_norm": 5.0, + "grad_norm_var": 0.054671223958333334, + "learning_rate": 1.7461614804218417e-05, + "loss": 5.0538, + "loss/crossentropy": 2.336539626121521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21434544026851654, + "step": 10536 + }, + { + "epoch": 0.8781666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.05572916666666667, + "learning_rate": 1.7438684438571386e-05, + "loss": 4.9415, + "loss/crossentropy": 2.0774486362934113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19223777204751968, + "step": 10538 + }, + { + "epoch": 0.8783333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.03319905598958333, + "learning_rate": 1.7415770262318262e-05, + "loss": 4.9802, + "loss/crossentropy": 2.266049236059189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19595278799533844, + "step": 10540 + }, + { + "epoch": 0.8785, + "grad_norm": 4.4375, + "grad_norm_var": 0.033528645833333336, + "learning_rate": 1.7392872331997495e-05, + "loss": 5.2426, + "loss/crossentropy": 1.9866546764969826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19384957291185856, + "step": 10542 + }, + { + "epoch": 0.8786666666666667, + "grad_norm": 4.21875, + "grad_norm_var": 0.044384765625, + "learning_rate": 1.7369990704107458e-05, + "loss": 4.8644, + "loss/crossentropy": 2.1645276844501495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19293129071593285, + "step": 10544 + }, + { + "epoch": 0.8788333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.04269205729166667, + "learning_rate": 1.7347125435106287e-05, + "loss": 5.3348, + "loss/crossentropy": 2.1431443095207214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157515287399292, + "step": 10546 + }, + { + "epoch": 0.879, + "grad_norm": 4.65625, + "grad_norm_var": 0.042708333333333334, + "learning_rate": 1.732427658141176e-05, + "loss": 4.9541, + "loss/crossentropy": 2.2488779723644257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067658081650734, + "step": 10548 + }, + { + "epoch": 0.8791666666666667, + "grad_norm": 4.90625, + "grad_norm_var": 0.053629557291666664, + "learning_rate": 1.7301444199401158e-05, + "loss": 4.7835, + "loss/crossentropy": 2.4230023622512817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769268438220024, + "step": 10550 + }, + { + "epoch": 0.8793333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.043355305989583336, + "learning_rate": 1.7278628345411102e-05, + "loss": 4.7411, + "loss/crossentropy": 2.400269329547882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22978059947490692, + "step": 10552 + }, + { + "epoch": 0.8795, + "grad_norm": 4.9375, + "grad_norm_var": 0.05089518229166667, + "learning_rate": 1.725582907573746e-05, + "loss": 4.8078, + "loss/crossentropy": 2.343903511762619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20802440866827965, + "step": 10554 + }, + { + "epoch": 0.8796666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.046610514322916664, + "learning_rate": 1.7233046446635152e-05, + "loss": 5.1468, + "loss/crossentropy": 1.8718384355306625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16846632957458496, + "step": 10556 + }, + { + "epoch": 0.8798333333333334, + "grad_norm": 4.90625, + "grad_norm_var": 0.0537109375, + "learning_rate": 1.7210280514318055e-05, + "loss": 4.7226, + "loss/crossentropy": 1.6889416128396988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929287426173687, + "step": 10558 + }, + { + "epoch": 0.88, + "grad_norm": 4.65625, + "grad_norm_var": 0.04700520833333333, + "learning_rate": 1.718753133495884e-05, + "loss": 5.1345, + "loss/crossentropy": 2.15225350856781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042902112007141, + "step": 10560 + }, + { + "epoch": 0.8801666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.04607747395833333, + "learning_rate": 1.7164798964688853e-05, + "loss": 5.0313, + "loss/crossentropy": 2.0885613709688187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21012815833091736, + "step": 10562 + }, + { + "epoch": 0.8803333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.05234375, + "learning_rate": 1.7142083459597953e-05, + "loss": 4.4626, + "loss/crossentropy": 1.7584224492311478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17640075087547302, + "step": 10564 + }, + { + "epoch": 0.8805, + "grad_norm": 4.59375, + "grad_norm_var": 0.039778645833333334, + "learning_rate": 1.7119384875734388e-05, + "loss": 5.147, + "loss/crossentropy": 2.086060971021652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050802819430828, + "step": 10566 + }, + { + "epoch": 0.8806666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.03892822265625, + "learning_rate": 1.7096703269104658e-05, + "loss": 4.9094, + "loss/crossentropy": 2.1914361715316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21584604680538177, + "step": 10568 + }, + { + "epoch": 0.8808333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.03411458333333333, + "learning_rate": 1.7074038695673384e-05, + "loss": 5.261, + "loss/crossentropy": 1.7987454533576965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16365987062454224, + "step": 10570 + }, + { + "epoch": 0.881, + "grad_norm": 4.5625, + "grad_norm_var": 0.05520833333333333, + "learning_rate": 1.705139121136313e-05, + "loss": 5.002, + "loss/crossentropy": 2.0218057334423065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18577923253178596, + "step": 10572 + }, + { + "epoch": 0.8811666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.04934488932291667, + "learning_rate": 1.7028760872054327e-05, + "loss": 5.1821, + "loss/crossentropy": 2.079995185136795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19173727184534073, + "step": 10574 + }, + { + "epoch": 0.8813333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.048173014322916666, + "learning_rate": 1.700614773358508e-05, + "loss": 4.5502, + "loss/crossentropy": 2.2938634157180786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193524319678545, + "step": 10576 + }, + { + "epoch": 0.8815, + "grad_norm": 4.53125, + "grad_norm_var": 0.048811848958333334, + "learning_rate": 1.698355185175106e-05, + "loss": 5.2259, + "loss/crossentropy": 1.8245511278510094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17827428877353668, + "step": 10578 + }, + { + "epoch": 0.8816666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.041341145833333336, + "learning_rate": 1.696097328230536e-05, + "loss": 4.975, + "loss/crossentropy": 1.785768836736679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040269337594509, + "step": 10580 + }, + { + "epoch": 0.8818333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.05520833333333333, + "learning_rate": 1.693841208095836e-05, + "loss": 4.3794, + "loss/crossentropy": 1.9544300138950348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18160400912165642, + "step": 10582 + }, + { + "epoch": 0.882, + "grad_norm": 5.0625, + "grad_norm_var": 0.06428629557291667, + "learning_rate": 1.691586830337758e-05, + "loss": 5.0472, + "loss/crossentropy": 1.6755925416946411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18401020765304565, + "step": 10584 + }, + { + "epoch": 0.8821666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.06256103515625, + "learning_rate": 1.6893342005187546e-05, + "loss": 4.4777, + "loss/crossentropy": 2.553809404373169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22219187021255493, + "step": 10586 + }, + { + "epoch": 0.8823333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.044514973958333336, + "learning_rate": 1.687083324196966e-05, + "loss": 5.5024, + "loss/crossentropy": 2.220675617456436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19723805040121078, + "step": 10588 + }, + { + "epoch": 0.8825, + "grad_norm": 4.75, + "grad_norm_var": 0.042378743489583336, + "learning_rate": 1.6848342069262065e-05, + "loss": 5.155, + "loss/crossentropy": 1.6391087174415588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860688552260399, + "step": 10590 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.047509765625, + "learning_rate": 1.682586854255949e-05, + "loss": 4.7607, + "loss/crossentropy": 1.832770362496376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15771115943789482, + "step": 10592 + }, + { + "epoch": 0.8828333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.06396077473958334, + "learning_rate": 1.6803412717313123e-05, + "loss": 5.167, + "loss/crossentropy": 2.5300718545913696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20988282933831215, + "step": 10594 + }, + { + "epoch": 0.883, + "grad_norm": 4.78125, + "grad_norm_var": 0.06796468098958333, + "learning_rate": 1.678097464893048e-05, + "loss": 5.2368, + "loss/crossentropy": 2.185933083295822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19851680099964142, + "step": 10596 + }, + { + "epoch": 0.8831666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.05705973307291667, + "learning_rate": 1.6758554392775276e-05, + "loss": 4.483, + "loss/crossentropy": 1.6705860868096352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17615766637027264, + "step": 10598 + }, + { + "epoch": 0.8833333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.0455078125, + "learning_rate": 1.6736152004167256e-05, + "loss": 4.4796, + "loss/crossentropy": 1.8610120490193367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18658486753702164, + "step": 10600 + }, + { + "epoch": 0.8835, + "grad_norm": 4.625, + "grad_norm_var": 0.041666666666666664, + "learning_rate": 1.6713767538382085e-05, + "loss": 4.563, + "loss/crossentropy": 2.159834563732147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609938725829124, + "step": 10602 + }, + { + "epoch": 0.8836666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.03720296223958333, + "learning_rate": 1.669140105065121e-05, + "loss": 4.6492, + "loss/crossentropy": 2.3123832046985626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20040404424071312, + "step": 10604 + }, + { + "epoch": 0.8838333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 1.6669052596161722e-05, + "loss": 5.2065, + "loss/crossentropy": 2.3895527720451355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24119574576616287, + "step": 10606 + }, + { + "epoch": 0.884, + "grad_norm": 4.9375, + "grad_norm_var": 0.04894205729166667, + "learning_rate": 1.66467222300562e-05, + "loss": 4.7021, + "loss/crossentropy": 1.9245961979031563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17225523851811886, + "step": 10608 + }, + { + "epoch": 0.8841666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.040690104166666664, + "learning_rate": 1.6624410007432606e-05, + "loss": 4.8996, + "loss/crossentropy": 1.4526910781860352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15226059220731258, + "step": 10610 + }, + { + "epoch": 0.8843333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.051656087239583336, + "learning_rate": 1.6602115983344136e-05, + "loss": 5.193, + "loss/crossentropy": 2.605428993701935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21228623762726784, + "step": 10612 + }, + { + "epoch": 0.8845, + "grad_norm": 4.90625, + "grad_norm_var": 0.05709228515625, + "learning_rate": 1.6579840212799077e-05, + "loss": 4.6514, + "loss/crossentropy": 1.4337237551808357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17335276678204536, + "step": 10614 + }, + { + "epoch": 0.8846666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.05894775390625, + "learning_rate": 1.655758275076067e-05, + "loss": 5.3302, + "loss/crossentropy": 2.0682147443294525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20112159848213196, + "step": 10616 + }, + { + "epoch": 0.8848333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.06066080729166667, + "learning_rate": 1.6535343652147e-05, + "loss": 4.8437, + "loss/crossentropy": 2.116395853459835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18656159937381744, + "step": 10618 + }, + { + "epoch": 0.885, + "grad_norm": 4.5, + "grad_norm_var": 0.05676676432291667, + "learning_rate": 1.651312297183083e-05, + "loss": 4.8351, + "loss/crossentropy": 1.824868343770504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845719050616026, + "step": 10620 + }, + { + "epoch": 0.8851666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.06573893229166666, + "learning_rate": 1.6490920764639477e-05, + "loss": 4.4162, + "loss/crossentropy": 1.5410160273313522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15590156242251396, + "step": 10622 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.06474202473958333, + "learning_rate": 1.646873708535468e-05, + "loss": 5.4161, + "loss/crossentropy": 1.3780269846320152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17345618084073067, + "step": 10624 + }, + { + "epoch": 0.8855, + "grad_norm": 5.03125, + "grad_norm_var": 0.0734375, + "learning_rate": 1.644657198871247e-05, + "loss": 4.9033, + "loss/crossentropy": 2.166410952806473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21835574507713318, + "step": 10626 + }, + { + "epoch": 0.8856666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.062483723958333334, + "learning_rate": 1.642442552940301e-05, + "loss": 5.2132, + "loss/crossentropy": 2.2428570091724396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22547245025634766, + "step": 10628 + }, + { + "epoch": 0.8858333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.06100260416666667, + "learning_rate": 1.640229776207049e-05, + "loss": 4.6793, + "loss/crossentropy": 2.4395949244499207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21656383946537971, + "step": 10630 + }, + { + "epoch": 0.886, + "grad_norm": 4.34375, + "grad_norm_var": 0.06560872395833334, + "learning_rate": 1.6380188741312976e-05, + "loss": 4.8013, + "loss/crossentropy": 1.8665351793169975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16414397209882736, + "step": 10632 + }, + { + "epoch": 0.8861666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.06419270833333333, + "learning_rate": 1.6358098521682283e-05, + "loss": 5.2442, + "loss/crossentropy": 2.3931703567504883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21243423223495483, + "step": 10634 + }, + { + "epoch": 0.8863333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.5460245768229167, + "learning_rate": 1.6336027157683828e-05, + "loss": 4.546, + "loss/crossentropy": 1.274245411157608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13230569288134575, + "step": 10636 + }, + { + "epoch": 0.8865, + "grad_norm": 4.6875, + "grad_norm_var": 0.5261067708333333, + "learning_rate": 1.6313974703776507e-05, + "loss": 4.9338, + "loss/crossentropy": 1.9635001122951508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17589573562145233, + "step": 10638 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.5380045572916666, + "learning_rate": 1.6291941214372554e-05, + "loss": 4.3675, + "loss/crossentropy": 1.907171793282032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1749830450862646, + "step": 10640 + }, + { + "epoch": 0.8868333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.5325358072916667, + "learning_rate": 1.6269926743837432e-05, + "loss": 5.4834, + "loss/crossentropy": 2.321804314851761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19393545016646385, + "step": 10642 + }, + { + "epoch": 0.887, + "grad_norm": 4.3125, + "grad_norm_var": 0.5464680989583334, + "learning_rate": 1.6247931346489637e-05, + "loss": 4.9566, + "loss/crossentropy": 2.445066601037979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21393660083413124, + "step": 10644 + }, + { + "epoch": 0.8871666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.6845052083333333, + "learning_rate": 1.6225955076600636e-05, + "loss": 4.7099, + "loss/crossentropy": 2.3382493257522583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21867894008755684, + "step": 10646 + }, + { + "epoch": 0.8873333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.67222900390625, + "learning_rate": 1.620399798839468e-05, + "loss": 4.9019, + "loss/crossentropy": 2.1961640417575836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18073081225156784, + "step": 10648 + }, + { + "epoch": 0.8875, + "grad_norm": 4.8125, + "grad_norm_var": 0.6851399739583334, + "learning_rate": 1.6182060136048727e-05, + "loss": 5.0117, + "loss/crossentropy": 1.1794405281543732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13596356473863125, + "step": 10650 + }, + { + "epoch": 0.8876666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.20790608723958334, + "learning_rate": 1.6160141573692217e-05, + "loss": 4.9391, + "loss/crossentropy": 2.450029969215393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20907938852906227, + "step": 10652 + }, + { + "epoch": 0.8878333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.21145833333333333, + "learning_rate": 1.613824235540704e-05, + "loss": 4.8929, + "loss/crossentropy": 1.566991001367569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968870833516121, + "step": 10654 + }, + { + "epoch": 0.888, + "grad_norm": 4.53125, + "grad_norm_var": 0.20331624348958333, + "learning_rate": 1.611636253522734e-05, + "loss": 4.4838, + "loss/crossentropy": 2.345241993665695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920331008732319, + "step": 10656 + }, + { + "epoch": 0.8881666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.21090087890625, + "learning_rate": 1.6094502167139393e-05, + "loss": 4.8726, + "loss/crossentropy": 2.4404727816581726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2480272240936756, + "step": 10658 + }, + { + "epoch": 0.8883333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.20390625, + "learning_rate": 1.607266130508148e-05, + "loss": 5.1869, + "loss/crossentropy": 1.9991124272346497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056628830730915, + "step": 10660 + }, + { + "epoch": 0.8885, + "grad_norm": 4.5, + "grad_norm_var": 0.05543212890625, + "learning_rate": 1.605084000294377e-05, + "loss": 4.8648, + "loss/crossentropy": 2.3871697783470154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21722237020730972, + "step": 10662 + }, + { + "epoch": 0.8886666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.04950764973958333, + "learning_rate": 1.602903831456815e-05, + "loss": 4.9332, + "loss/crossentropy": 2.383307009935379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19519124925136566, + "step": 10664 + }, + { + "epoch": 0.8888333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.04198811848958333, + "learning_rate": 1.600725629374812e-05, + "loss": 4.9307, + "loss/crossentropy": 1.549317441880703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15623271651566029, + "step": 10666 + }, + { + "epoch": 0.889, + "grad_norm": 4.8125, + "grad_norm_var": 0.06510416666666667, + "learning_rate": 1.598549399422864e-05, + "loss": 4.5819, + "loss/crossentropy": 2.4213827252388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201927214860916, + "step": 10668 + }, + { + "epoch": 0.8891666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.060791015625, + "learning_rate": 1.596375146970604e-05, + "loss": 4.9754, + "loss/crossentropy": 2.050051510334015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18452263250946999, + "step": 10670 + }, + { + "epoch": 0.8893333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.05618489583333333, + "learning_rate": 1.5942028773827827e-05, + "loss": 4.357, + "loss/crossentropy": 2.5166059732437134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21152303367853165, + "step": 10672 + }, + { + "epoch": 0.8895, + "grad_norm": 4.5, + "grad_norm_var": 0.054423014322916664, + "learning_rate": 1.59203259601926e-05, + "loss": 4.7636, + "loss/crossentropy": 1.8929245918989182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16911195777356625, + "step": 10674 + }, + { + "epoch": 0.8896666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.04348551432291667, + "learning_rate": 1.589864308234988e-05, + "loss": 4.8104, + "loss/crossentropy": 2.1835354566574097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070077657699585, + "step": 10676 + }, + { + "epoch": 0.8898333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.052408854166666664, + "learning_rate": 1.5876980193800033e-05, + "loss": 4.1576, + "loss/crossentropy": 1.198787048459053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13902889378368855, + "step": 10678 + }, + { + "epoch": 0.89, + "grad_norm": 4.53125, + "grad_norm_var": 0.04641927083333333, + "learning_rate": 1.5855337347994062e-05, + "loss": 5.2937, + "loss/crossentropy": 1.7133802622556686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17863846570253372, + "step": 10680 + }, + { + "epoch": 0.8901666666666667, + "grad_norm": 4.0625, + "grad_norm_var": 0.055078125, + "learning_rate": 1.5833714598333553e-05, + "loss": 4.3412, + "loss/crossentropy": 2.547089695930481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081010527908802, + "step": 10682 + }, + { + "epoch": 0.8903333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.03878580729166667, + "learning_rate": 1.581211199817048e-05, + "loss": 4.5989, + "loss/crossentropy": 1.652288556098938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16221519000828266, + "step": 10684 + }, + { + "epoch": 0.8905, + "grad_norm": 4.46875, + "grad_norm_var": 0.03616129557291667, + "learning_rate": 1.579052960080713e-05, + "loss": 5.0203, + "loss/crossentropy": 1.7919713705778122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1675052735954523, + "step": 10686 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 4.0, + "grad_norm_var": 0.041259765625, + "learning_rate": 1.57689674594959e-05, + "loss": 4.2805, + "loss/crossentropy": 1.9634157121181488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854908987879753, + "step": 10688 + }, + { + "epoch": 0.8908333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.04934488932291667, + "learning_rate": 1.5747425627439242e-05, + "loss": 5.0934, + "loss/crossentropy": 1.4715142846107483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14984281174838543, + "step": 10690 + }, + { + "epoch": 0.891, + "grad_norm": 5.15625, + "grad_norm_var": 0.08531494140625, + "learning_rate": 1.5725904157789487e-05, + "loss": 5.2262, + "loss/crossentropy": 2.327080875635147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025107815861702, + "step": 10692 + }, + { + "epoch": 0.8911666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.08049723307291666, + "learning_rate": 1.570440310364872e-05, + "loss": 5.3723, + "loss/crossentropy": 1.475014977157116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1731820423156023, + "step": 10694 + }, + { + "epoch": 0.8913333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.0837890625, + "learning_rate": 1.568292251806865e-05, + "loss": 5.0512, + "loss/crossentropy": 2.648381471633911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21056640520691872, + "step": 10696 + }, + { + "epoch": 0.8915, + "grad_norm": 4.5625, + "grad_norm_var": 0.06897379557291666, + "learning_rate": 1.5661462454050492e-05, + "loss": 4.5428, + "loss/crossentropy": 1.6267412602901459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14332648366689682, + "step": 10698 + }, + { + "epoch": 0.8916666666666667, + "grad_norm": 5.6875, + "grad_norm_var": 0.14615885416666666, + "learning_rate": 1.564002296454482e-05, + "loss": 5.2771, + "loss/crossentropy": 2.4379181265830994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064468339085579, + "step": 10700 + }, + { + "epoch": 0.8918333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.1453125, + "learning_rate": 1.5618604102451445e-05, + "loss": 5.2395, + "loss/crossentropy": 2.3840895295143127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20120671018958092, + "step": 10702 + }, + { + "epoch": 0.892, + "grad_norm": 4.84375, + "grad_norm_var": 0.10777587890625, + "learning_rate": 1.559720592061927e-05, + "loss": 5.0845, + "loss/crossentropy": 1.6713618710637093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16508845053613186, + "step": 10704 + }, + { + "epoch": 0.8921666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.1103515625, + "learning_rate": 1.5575828471846192e-05, + "loss": 5.2038, + "loss/crossentropy": 2.4455989003181458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22217406332492828, + "step": 10706 + }, + { + "epoch": 0.8923333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.10245768229166667, + "learning_rate": 1.555447180887894e-05, + "loss": 5.047, + "loss/crossentropy": 1.8809728920459747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19522210396826267, + "step": 10708 + }, + { + "epoch": 0.8925, + "grad_norm": 4.4375, + "grad_norm_var": 0.10338541666666666, + "learning_rate": 1.5533135984412954e-05, + "loss": 4.2285, + "loss/crossentropy": 1.6795168668031693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18034653179347515, + "step": 10710 + }, + { + "epoch": 0.8926666666666667, + "grad_norm": 4.09375, + "grad_norm_var": 0.12014567057291667, + "learning_rate": 1.5511821051092252e-05, + "loss": 4.6844, + "loss/crossentropy": 0.85299401730299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.10577042773365974, + "step": 10712 + }, + { + "epoch": 0.8928333333333334, + "grad_norm": 4.21875, + "grad_norm_var": 0.13359375, + "learning_rate": 1.5490527061509338e-05, + "loss": 4.7831, + "loss/crossentropy": 1.6696652993559837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17354433611035347, + "step": 10714 + }, + { + "epoch": 0.893, + "grad_norm": 4.78125, + "grad_norm_var": 0.057450358072916666, + "learning_rate": 1.5469254068204985e-05, + "loss": 4.9723, + "loss/crossentropy": 2.289715588092804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24060748890042305, + "step": 10716 + }, + { + "epoch": 0.8931666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.0599609375, + "learning_rate": 1.5448002123668207e-05, + "loss": 4.5862, + "loss/crossentropy": 1.8170486837625504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18546131625771523, + "step": 10718 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.05465087890625, + "learning_rate": 1.5426771280336054e-05, + "loss": 5.1344, + "loss/crossentropy": 1.6859957575798035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1697954684495926, + "step": 10720 + }, + { + "epoch": 0.8935, + "grad_norm": 4.46875, + "grad_norm_var": 0.03912760416666667, + "learning_rate": 1.540556159059354e-05, + "loss": 4.8421, + "loss/crossentropy": 0.9791104048490524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11764907836914062, + "step": 10722 + }, + { + "epoch": 0.8936666666666667, + "grad_norm": 7.375, + "grad_norm_var": 0.5524576822916667, + "learning_rate": 1.5384373106773437e-05, + "loss": 4.7781, + "loss/crossentropy": 1.7341388911008835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16041403263807297, + "step": 10724 + }, + { + "epoch": 0.8938333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.5555826822916666, + "learning_rate": 1.5363205881156248e-05, + "loss": 4.6986, + "loss/crossentropy": 1.618898868560791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18397285975515842, + "step": 10726 + }, + { + "epoch": 0.894, + "grad_norm": 4.75, + "grad_norm_var": 0.534228515625, + "learning_rate": 1.5342059965969988e-05, + "loss": 4.6764, + "loss/crossentropy": 2.125587046146393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1921604499220848, + "step": 10728 + }, + { + "epoch": 0.8941666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.5205037434895833, + "learning_rate": 1.5320935413390107e-05, + "loss": 4.8454, + "loss/crossentropy": 1.786333590745926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17975015565752983, + "step": 10730 + }, + { + "epoch": 0.8943333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.5203125, + "learning_rate": 1.529983227553932e-05, + "loss": 5.2461, + "loss/crossentropy": 1.7129372730851173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15703948214650154, + "step": 10732 + }, + { + "epoch": 0.8945, + "grad_norm": 4.78125, + "grad_norm_var": 0.50650634765625, + "learning_rate": 1.5278750604487543e-05, + "loss": 5.5317, + "loss/crossentropy": 2.582478642463684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20744846016168594, + "step": 10734 + }, + { + "epoch": 0.8946666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.50650634765625, + "learning_rate": 1.52576904522517e-05, + "loss": 4.7874, + "loss/crossentropy": 1.4274420738220215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17440488934516907, + "step": 10736 + }, + { + "epoch": 0.8948333333333334, + "grad_norm": 4.34375, + "grad_norm_var": 0.510546875, + "learning_rate": 1.5236651870795612e-05, + "loss": 4.8968, + "loss/crossentropy": 2.414700925350189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064298652112484, + "step": 10738 + }, + { + "epoch": 0.895, + "grad_norm": 4.90625, + "grad_norm_var": 0.031233723958333334, + "learning_rate": 1.521563491202989e-05, + "loss": 5.3921, + "loss/crossentropy": 2.2058697938919067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21623685210943222, + "step": 10740 + }, + { + "epoch": 0.8951666666666667, + "grad_norm": 5.0, + "grad_norm_var": 0.030887858072916666, + "learning_rate": 1.5194639627811803e-05, + "loss": 5.3514, + "loss/crossentropy": 2.025493770837784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22063399851322174, + "step": 10742 + }, + { + "epoch": 0.8953333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.04021809895833333, + "learning_rate": 1.5173666069945118e-05, + "loss": 4.816, + "loss/crossentropy": 1.9025913998484612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19442759454250336, + "step": 10744 + }, + { + "epoch": 0.8955, + "grad_norm": 4.8125, + "grad_norm_var": 0.03730061848958333, + "learning_rate": 1.5152714290180006e-05, + "loss": 5.2007, + "loss/crossentropy": 1.9195226430892944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1758854277431965, + "step": 10746 + }, + { + "epoch": 0.8956666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.04599202473958333, + "learning_rate": 1.5131784340212893e-05, + "loss": 4.1424, + "loss/crossentropy": 1.8893222734332085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18168200738728046, + "step": 10748 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 4.875, + "grad_norm_var": 0.04638264973958333, + "learning_rate": 1.511087627168637e-05, + "loss": 5.4183, + "loss/crossentropy": 2.4070481657981873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728513225913048, + "step": 10750 + }, + { + "epoch": 0.896, + "grad_norm": 4.59375, + "grad_norm_var": 0.061844889322916666, + "learning_rate": 1.5089990136189e-05, + "loss": 4.6507, + "loss/crossentropy": 2.1939191222190857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22148913517594337, + "step": 10752 + }, + { + "epoch": 0.8961666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.05748697916666667, + "learning_rate": 1.5069125985255242e-05, + "loss": 5.3693, + "loss/crossentropy": 2.1805800199508667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069510854780674, + "step": 10754 + }, + { + "epoch": 0.8963333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.05050455729166667, + "learning_rate": 1.5048283870365332e-05, + "loss": 4.8997, + "loss/crossentropy": 2.116938143968582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18929652869701385, + "step": 10756 + }, + { + "epoch": 0.8965, + "grad_norm": 4.5, + "grad_norm_var": 0.04334309895833333, + "learning_rate": 1.5027463842945109e-05, + "loss": 4.6614, + "loss/crossentropy": 1.7113404273986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19923411309719086, + "step": 10758 + }, + { + "epoch": 0.8966666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.037434895833333336, + "learning_rate": 1.5006665954365915e-05, + "loss": 5.2853, + "loss/crossentropy": 2.2329909205436707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22023383155465126, + "step": 10760 + }, + { + "epoch": 0.8968333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.04195556640625, + "learning_rate": 1.4985890255944477e-05, + "loss": 5.0832, + "loss/crossentropy": 2.5310455560684204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189432941377163, + "step": 10762 + }, + { + "epoch": 0.897, + "grad_norm": 4.8125, + "grad_norm_var": 0.036051432291666664, + "learning_rate": 1.4965136798942772e-05, + "loss": 5.0833, + "loss/crossentropy": 2.420046091079712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084801085293293, + "step": 10764 + }, + { + "epoch": 0.8971666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.03553059895833333, + "learning_rate": 1.4944405634567883e-05, + "loss": 4.9867, + "loss/crossentropy": 1.9696931019425392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19564705714583397, + "step": 10766 + }, + { + "epoch": 0.8973333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.024853515625, + "learning_rate": 1.4923696813971903e-05, + "loss": 5.0961, + "loss/crossentropy": 2.3114156424999237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21169547736644745, + "step": 10768 + }, + { + "epoch": 0.8975, + "grad_norm": 4.65625, + "grad_norm_var": 0.5457682291666667, + "learning_rate": 1.4903010388251777e-05, + "loss": 4.6873, + "loss/crossentropy": 2.131648153066635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21583443507552147, + "step": 10770 + }, + { + "epoch": 0.8976666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.5373006184895833, + "learning_rate": 1.4882346408449222e-05, + "loss": 5.288, + "loss/crossentropy": 2.201010137796402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20310936123132706, + "step": 10772 + }, + { + "epoch": 0.8978333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.5461588541666667, + "learning_rate": 1.4861704925550545e-05, + "loss": 4.5638, + "loss/crossentropy": 1.9012616276741028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18746894598007202, + "step": 10774 + }, + { + "epoch": 0.898, + "grad_norm": 4.53125, + "grad_norm_var": 0.55416259765625, + "learning_rate": 1.4841085990486552e-05, + "loss": 5.3634, + "loss/crossentropy": 1.9615696221590042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19145160540938377, + "step": 10776 + }, + { + "epoch": 0.8981666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.573681640625, + "learning_rate": 1.4820489654132408e-05, + "loss": 5.4252, + "loss/crossentropy": 1.756884180009365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741621494293213, + "step": 10778 + }, + { + "epoch": 0.8983333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.5929646809895833, + "learning_rate": 1.479991596730755e-05, + "loss": 4.2336, + "loss/crossentropy": 1.196315884590149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1336789671331644, + "step": 10780 + }, + { + "epoch": 0.8985, + "grad_norm": 4.53125, + "grad_norm_var": 0.5968587239583333, + "learning_rate": 1.4779364980775476e-05, + "loss": 4.7495, + "loss/crossentropy": 1.9746932983398438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18156530149281025, + "step": 10782 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 7.46875, + "grad_norm_var": 1.0577473958333334, + "learning_rate": 1.4758836745243723e-05, + "loss": 4.9054, + "loss/crossentropy": 1.986648440361023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18949030712246895, + "step": 10784 + }, + { + "epoch": 0.8988333333333334, + "grad_norm": 4.78125, + "grad_norm_var": 0.5587890625, + "learning_rate": 1.4738331311363659e-05, + "loss": 4.9686, + "loss/crossentropy": 2.134302496910095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217218816280365, + "step": 10786 + }, + { + "epoch": 0.899, + "grad_norm": 4.8125, + "grad_norm_var": 0.58033447265625, + "learning_rate": 1.4717848729730417e-05, + "loss": 4.996, + "loss/crossentropy": 1.5596438944339752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14135419204831123, + "step": 10788 + }, + { + "epoch": 0.8991666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.5920572916666667, + "learning_rate": 1.4697389050882713e-05, + "loss": 4.6219, + "loss/crossentropy": 1.9281855672597885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18955061584711075, + "step": 10790 + }, + { + "epoch": 0.8993333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.5873697916666667, + "learning_rate": 1.4676952325302787e-05, + "loss": 4.8212, + "loss/crossentropy": 2.278311848640442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20813852548599243, + "step": 10792 + }, + { + "epoch": 0.8995, + "grad_norm": 4.09375, + "grad_norm_var": 0.6040323893229167, + "learning_rate": 1.4656538603416222e-05, + "loss": 4.4675, + "loss/crossentropy": 2.52884042263031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21629564464092255, + "step": 10794 + }, + { + "epoch": 0.8996666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.592578125, + "learning_rate": 1.4636147935591845e-05, + "loss": 4.779, + "loss/crossentropy": 1.7139496207237244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17117717862129211, + "step": 10796 + }, + { + "epoch": 0.8998333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.59508056640625, + "learning_rate": 1.46157803721416e-05, + "loss": 4.4386, + "loss/crossentropy": 2.108862668275833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20810528099536896, + "step": 10798 + }, + { + "epoch": 0.9, + "grad_norm": 4.5625, + "grad_norm_var": 0.067041015625, + "learning_rate": 1.4595435963320435e-05, + "loss": 4.8122, + "loss/crossentropy": 2.3672678768634796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21068596094846725, + "step": 10800 + }, + { + "epoch": 0.9001666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.06847330729166666, + "learning_rate": 1.4575114759326147e-05, + "loss": 5.0223, + "loss/crossentropy": 1.967505268752575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17484956234693527, + "step": 10802 + }, + { + "epoch": 0.9003333333333333, + "grad_norm": 4.96875, + "grad_norm_var": 0.08310139973958333, + "learning_rate": 1.4554816810299292e-05, + "loss": 4.1566, + "loss/crossentropy": 0.8119016736745834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1141384020447731, + "step": 10804 + }, + { + "epoch": 0.9005, + "grad_norm": 4.5, + "grad_norm_var": 0.07125244140625, + "learning_rate": 1.4534542166323037e-05, + "loss": 5.0435, + "loss/crossentropy": 1.9839187264442444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827479749917984, + "step": 10806 + }, + { + "epoch": 0.9006666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.07076416015625, + "learning_rate": 1.4514290877423055e-05, + "loss": 4.8319, + "loss/crossentropy": 2.3538177013397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22079696133732796, + "step": 10808 + }, + { + "epoch": 0.9008333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 1.4494062993567386e-05, + "loss": 4.9642, + "loss/crossentropy": 2.0697861313819885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22400951385498047, + "step": 10810 + }, + { + "epoch": 0.901, + "grad_norm": 4.375, + "grad_norm_var": 0.05858968098958333, + "learning_rate": 1.4473858564666326e-05, + "loss": 4.7692, + "loss/crossentropy": 2.2374483346939087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18740571290254593, + "step": 10812 + }, + { + "epoch": 0.9011666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.04299723307291667, + "learning_rate": 1.4453677640572284e-05, + "loss": 4.4197, + "loss/crossentropy": 1.543198212981224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1734531968832016, + "step": 10814 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.054541015625, + "learning_rate": 1.4433520271079706e-05, + "loss": 5.1892, + "loss/crossentropy": 2.6357452273368835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21056770905852318, + "step": 10816 + }, + { + "epoch": 0.9015, + "grad_norm": 4.34375, + "grad_norm_var": 0.05071614583333333, + "learning_rate": 1.441338650592487e-05, + "loss": 4.9244, + "loss/crossentropy": 1.6901346743106842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1798687893897295, + "step": 10818 + }, + { + "epoch": 0.9016666666666666, + "grad_norm": 4.25, + "grad_norm_var": 0.02779541015625, + "learning_rate": 1.439327639478586e-05, + "loss": 4.5571, + "loss/crossentropy": 2.5649845004081726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21501518785953522, + "step": 10820 + }, + { + "epoch": 0.9018333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.027473958333333333, + "learning_rate": 1.4373189987282364e-05, + "loss": 5.2804, + "loss/crossentropy": 1.5373041331768036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15145167149603367, + "step": 10822 + }, + { + "epoch": 0.902, + "grad_norm": 4.40625, + "grad_norm_var": 0.027762858072916667, + "learning_rate": 1.4353127332975611e-05, + "loss": 4.7435, + "loss/crossentropy": 1.7477587014436722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1664668321609497, + "step": 10824 + }, + { + "epoch": 0.9021666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.02720947265625, + "learning_rate": 1.4333088481368188e-05, + "loss": 4.8537, + "loss/crossentropy": 1.6127407774329185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1708746962249279, + "step": 10826 + }, + { + "epoch": 0.9023333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 1.431307348190398e-05, + "loss": 4.8039, + "loss/crossentropy": 1.6628762856125832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15167899429798126, + "step": 10828 + }, + { + "epoch": 0.9025, + "grad_norm": 4.46875, + "grad_norm_var": 0.030497233072916668, + "learning_rate": 1.4293082383968008e-05, + "loss": 4.8124, + "loss/crossentropy": 1.5773266032338142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15623841248452663, + "step": 10830 + }, + { + "epoch": 0.9026666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.04462483723958333, + "learning_rate": 1.427311523688632e-05, + "loss": 4.3818, + "loss/crossentropy": 1.007303848862648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11725516617298126, + "step": 10832 + }, + { + "epoch": 0.9028333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.043843587239583336, + "learning_rate": 1.4253172089925857e-05, + "loss": 5.0789, + "loss/crossentropy": 2.202227681875229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483517810702324, + "step": 10834 + }, + { + "epoch": 0.903, + "grad_norm": 4.40625, + "grad_norm_var": 0.04309488932291667, + "learning_rate": 1.4233252992294361e-05, + "loss": 5.2643, + "loss/crossentropy": 2.5935566425323486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313656397163868, + "step": 10836 + }, + { + "epoch": 0.9031666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.04230143229166667, + "learning_rate": 1.4213357993140226e-05, + "loss": 5.0049, + "loss/crossentropy": 1.849151723086834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18203931860625744, + "step": 10838 + }, + { + "epoch": 0.9033333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 1.4193487141552382e-05, + "loss": 5.2893, + "loss/crossentropy": 1.7852751687169075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18054143711924553, + "step": 10840 + }, + { + "epoch": 0.9035, + "grad_norm": 4.34375, + "grad_norm_var": 0.05266520182291667, + "learning_rate": 1.4173640486560172e-05, + "loss": 4.579, + "loss/crossentropy": 1.3571243658661842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14290621131658554, + "step": 10842 + }, + { + "epoch": 0.9036666666666666, + "grad_norm": 4.28125, + "grad_norm_var": 0.06300455729166667, + "learning_rate": 1.4153818077133257e-05, + "loss": 4.8163, + "loss/crossentropy": 1.5496264174580574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15174993872642517, + "step": 10844 + }, + { + "epoch": 0.9038333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.06573893229166666, + "learning_rate": 1.4134019962181458e-05, + "loss": 4.7493, + "loss/crossentropy": 2.2139610946178436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19633793458342552, + "step": 10846 + }, + { + "epoch": 0.904, + "grad_norm": 5.03125, + "grad_norm_var": 0.060770670572916664, + "learning_rate": 1.4114246190554654e-05, + "loss": 4.8895, + "loss/crossentropy": 1.8826258331537247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785743087530136, + "step": 10848 + }, + { + "epoch": 0.9041666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.059956868489583336, + "learning_rate": 1.4094496811042657e-05, + "loss": 5.1005, + "loss/crossentropy": 2.1995404064655304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22843296453356743, + "step": 10850 + }, + { + "epoch": 0.9043333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.05972900390625, + "learning_rate": 1.4074771872375111e-05, + "loss": 4.3757, + "loss/crossentropy": 1.0549268051981926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17528251186013222, + "step": 10852 + }, + { + "epoch": 0.9045, + "grad_norm": 4.5, + "grad_norm_var": 0.059403483072916666, + "learning_rate": 1.4055071423221321e-05, + "loss": 5.1859, + "loss/crossentropy": 2.5413814783096313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125440388917923, + "step": 10854 + }, + { + "epoch": 0.9046666666666666, + "grad_norm": 4.9375, + "grad_norm_var": 0.06370035807291667, + "learning_rate": 1.4035395512190204e-05, + "loss": 4.6172, + "loss/crossentropy": 2.1019559502601624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21940137073397636, + "step": 10856 + }, + { + "epoch": 0.9048333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 0.064306640625, + "learning_rate": 1.4015744187830102e-05, + "loss": 4.7331, + "loss/crossentropy": 2.2734327018260956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995382085442543, + "step": 10858 + }, + { + "epoch": 0.905, + "grad_norm": 4.34375, + "grad_norm_var": 0.062174479166666664, + "learning_rate": 1.3996117498628726e-05, + "loss": 4.8909, + "loss/crossentropy": 2.4530131220817566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104918509721756, + "step": 10860 + }, + { + "epoch": 0.9051666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.055989583333333336, + "learning_rate": 1.397651549301295e-05, + "loss": 4.5454, + "loss/crossentropy": 1.7010397166013718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17811237461864948, + "step": 10862 + }, + { + "epoch": 0.9053333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.04263916015625, + "learning_rate": 1.3956938219348795e-05, + "loss": 4.2376, + "loss/crossentropy": 1.8406718373298645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19351908564567566, + "step": 10864 + }, + { + "epoch": 0.9055, + "grad_norm": 4.125, + "grad_norm_var": 0.052587890625, + "learning_rate": 1.3937385725941234e-05, + "loss": 4.4156, + "loss/crossentropy": 2.0871371999382973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19260753318667412, + "step": 10866 + }, + { + "epoch": 0.9056666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.053971354166666666, + "learning_rate": 1.39178580610341e-05, + "loss": 4.9657, + "loss/crossentropy": 2.59759783744812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034224160015583, + "step": 10868 + }, + { + "epoch": 0.9058333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 612.4866821289063, + "learning_rate": 1.3898355272809958e-05, + "loss": 4.1001, + "loss/crossentropy": 1.2347158193588257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14453835971653461, + "step": 10870 + }, + { + "epoch": 0.906, + "grad_norm": 4.34375, + "grad_norm_var": 612.8882446289062, + "learning_rate": 1.387887740939001e-05, + "loss": 4.843, + "loss/crossentropy": 2.1636237651109695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18093391880393028, + "step": 10872 + }, + { + "epoch": 0.9061666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 613.5347005208333, + "learning_rate": 1.3859424518833944e-05, + "loss": 4.4894, + "loss/crossentropy": 1.5998671725392342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15799619629979134, + "step": 10874 + }, + { + "epoch": 0.9063333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 613.0373697916667, + "learning_rate": 1.3839996649139834e-05, + "loss": 5.1337, + "loss/crossentropy": 2.1352964639663696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174740508198738, + "step": 10876 + }, + { + "epoch": 0.9065, + "grad_norm": 4.5, + "grad_norm_var": 612.7711873372396, + "learning_rate": 1.382059384824401e-05, + "loss": 5.4513, + "loss/crossentropy": 2.504655659198761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20499487966299057, + "step": 10878 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 4.5, + "grad_norm_var": 612.481103515625, + "learning_rate": 1.3801216164020966e-05, + "loss": 4.6749, + "loss/crossentropy": 2.438272774219513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21250415965914726, + "step": 10880 + }, + { + "epoch": 0.9068333333333334, + "grad_norm": 4.75, + "grad_norm_var": 612.1158203125, + "learning_rate": 1.3781863644283204e-05, + "loss": 5.1241, + "loss/crossentropy": 2.322550445795059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21630997583270073, + "step": 10882 + }, + { + "epoch": 0.907, + "grad_norm": 4.84375, + "grad_norm_var": 611.60078125, + "learning_rate": 1.376253633678115e-05, + "loss": 5.3166, + "loss/crossentropy": 2.158540368080139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1832444816827774, + "step": 10884 + }, + { + "epoch": 0.9071666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.06900634765625, + "learning_rate": 1.3743234289202998e-05, + "loss": 4.8893, + "loss/crossentropy": 1.7209921851754189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16882705315947533, + "step": 10886 + }, + { + "epoch": 0.9073333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.07086181640625, + "learning_rate": 1.3723957549174652e-05, + "loss": 5.1099, + "loss/crossentropy": 2.0559261441230774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762427855283022, + "step": 10888 + }, + { + "epoch": 0.9075, + "grad_norm": 4.15625, + "grad_norm_var": 0.06623942057291667, + "learning_rate": 1.370470616425954e-05, + "loss": 4.6185, + "loss/crossentropy": 1.8271742761135101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17320568673312664, + "step": 10890 + }, + { + "epoch": 0.9076666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.05911458333333333, + "learning_rate": 1.3685480181958544e-05, + "loss": 5.4106, + "loss/crossentropy": 2.7878470420837402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20561690255999565, + "step": 10892 + }, + { + "epoch": 0.9078333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.04664306640625, + "learning_rate": 1.3666279649709855e-05, + "loss": 4.8514, + "loss/crossentropy": 1.7777061834931374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18127823993563652, + "step": 10894 + }, + { + "epoch": 0.908, + "grad_norm": 4.46875, + "grad_norm_var": 0.04895426432291667, + "learning_rate": 1.3647104614888897e-05, + "loss": 4.7707, + "loss/crossentropy": 1.4975739419460297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14876068755984306, + "step": 10896 + }, + { + "epoch": 0.9081666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.044661458333333334, + "learning_rate": 1.362795512480814e-05, + "loss": 4.7532, + "loss/crossentropy": 2.1489458978176117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19379611685872078, + "step": 10898 + }, + { + "epoch": 0.9083333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.03919270833333333, + "learning_rate": 1.3608831226717065e-05, + "loss": 4.7385, + "loss/crossentropy": 2.505477249622345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21521490439772606, + "step": 10900 + }, + { + "epoch": 0.9085, + "grad_norm": 4.71875, + "grad_norm_var": 0.0435546875, + "learning_rate": 1.358973296780198e-05, + "loss": 5.414, + "loss/crossentropy": 2.2798091173171997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20843492820858955, + "step": 10902 + }, + { + "epoch": 0.9086666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.044270833333333336, + "learning_rate": 1.3570660395185943e-05, + "loss": 4.9674, + "loss/crossentropy": 2.234380006790161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19471841678023338, + "step": 10904 + }, + { + "epoch": 0.9088333333333334, + "grad_norm": 4.21875, + "grad_norm_var": 0.036458333333333336, + "learning_rate": 1.355161355592863e-05, + "loss": 4.7723, + "loss/crossentropy": 1.673406831920147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17296934872865677, + "step": 10906 + }, + { + "epoch": 0.909, + "grad_norm": 4.71875, + "grad_norm_var": 0.02578125, + "learning_rate": 1.3532592497026228e-05, + "loss": 4.7906, + "loss/crossentropy": 1.6391168981790543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15169396623969078, + "step": 10908 + }, + { + "epoch": 0.9091666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.02896728515625, + "learning_rate": 1.35135972654113e-05, + "loss": 5.0298, + "loss/crossentropy": 2.1111242473125458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054269053041935, + "step": 10910 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.027669270833333332, + "learning_rate": 1.3494627907952702e-05, + "loss": 5.09, + "loss/crossentropy": 1.5478358790278435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15873121563345194, + "step": 10912 + }, + { + "epoch": 0.9095, + "grad_norm": 4.71875, + "grad_norm_var": 0.027978515625, + "learning_rate": 1.3475684471455423e-05, + "loss": 5.0438, + "loss/crossentropy": 1.9041509926319122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19300096109509468, + "step": 10914 + }, + { + "epoch": 0.9096666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 1.345676700266053e-05, + "loss": 5.0814, + "loss/crossentropy": 2.0266382694244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19544285908341408, + "step": 10916 + }, + { + "epoch": 0.9098333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 0.027734375, + "learning_rate": 1.3437875548244986e-05, + "loss": 4.7484, + "loss/crossentropy": 1.905693419277668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17530391551554203, + "step": 10918 + }, + { + "epoch": 0.91, + "grad_norm": 4.875, + "grad_norm_var": 0.03209228515625, + "learning_rate": 1.3419010154821575e-05, + "loss": 4.9243, + "loss/crossentropy": 1.5361211821436882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17113251611590385, + "step": 10920 + }, + { + "epoch": 0.9101666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 1.05982666015625, + "learning_rate": 1.3400170868938775e-05, + "loss": 5.146, + "loss/crossentropy": 2.2465337216854095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19919180497527122, + "step": 10922 + }, + { + "epoch": 0.9103333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 1.0621744791666667, + "learning_rate": 1.3381357737080665e-05, + "loss": 4.9156, + "loss/crossentropy": 2.347740739583969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325716003775597, + "step": 10924 + }, + { + "epoch": 0.9105, + "grad_norm": 4.5625, + "grad_norm_var": 1.0676920572916666, + "learning_rate": 1.336257080566677e-05, + "loss": 5.1032, + "loss/crossentropy": 1.686830684542656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18585955165326595, + "step": 10926 + }, + { + "epoch": 0.9106666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 1.075634765625, + "learning_rate": 1.3343810121051977e-05, + "loss": 4.5115, + "loss/crossentropy": 2.1991084814071655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18302945792675018, + "step": 10928 + }, + { + "epoch": 0.9108333333333334, + "grad_norm": 4.75, + "grad_norm_var": 1.0740885416666666, + "learning_rate": 1.3325075729526401e-05, + "loss": 5.1483, + "loss/crossentropy": 2.5179224014282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21267320960760117, + "step": 10930 + }, + { + "epoch": 0.911, + "grad_norm": 4.28125, + "grad_norm_var": 1.1177734375, + "learning_rate": 1.3306367677315315e-05, + "loss": 4.0085, + "loss/crossentropy": 0.39225253462791443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.07208161056041718, + "step": 10932 + }, + { + "epoch": 0.9111666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 1.1162109375, + "learning_rate": 1.3287686010578954e-05, + "loss": 4.9949, + "loss/crossentropy": 2.163930505514145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22702163830399513, + "step": 10934 + }, + { + "epoch": 0.9113333333333333, + "grad_norm": 4.5, + "grad_norm_var": 1.1220011393229166, + "learning_rate": 1.3269030775412481e-05, + "loss": 4.8736, + "loss/crossentropy": 2.1811038851737976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20420709252357483, + "step": 10936 + }, + { + "epoch": 0.9115, + "grad_norm": 4.46875, + "grad_norm_var": 0.06148681640625, + "learning_rate": 1.3250402017845839e-05, + "loss": 4.7587, + "loss/crossentropy": 1.7884374484419823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15043306909501553, + "step": 10938 + }, + { + "epoch": 0.9116666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.06451822916666666, + "learning_rate": 1.323179978384363e-05, + "loss": 5.5021, + "loss/crossentropy": 2.5016011595726013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943902149796486, + "step": 10940 + }, + { + "epoch": 0.9118333333333334, + "grad_norm": 4.0625, + "grad_norm_var": 0.07628580729166666, + "learning_rate": 1.3213224119305017e-05, + "loss": 4.4301, + "loss/crossentropy": 0.9773362800478935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11285099945962429, + "step": 10942 + }, + { + "epoch": 0.912, + "grad_norm": 4.65625, + "grad_norm_var": 0.07906494140625, + "learning_rate": 1.319467507006361e-05, + "loss": 4.7561, + "loss/crossentropy": 1.7318015322089195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21848906949162483, + "step": 10944 + }, + { + "epoch": 0.9121666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.07810872395833333, + "learning_rate": 1.3176152681887345e-05, + "loss": 4.764, + "loss/crossentropy": 2.075657568871975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18956227414309978, + "step": 10946 + }, + { + "epoch": 0.9123333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.05273030598958333, + "learning_rate": 1.3157657000478367e-05, + "loss": 4.7062, + "loss/crossentropy": 1.3858967423439026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14041148498654366, + "step": 10948 + }, + { + "epoch": 0.9125, + "grad_norm": 4.6875, + "grad_norm_var": 0.030712890625, + "learning_rate": 1.3139188071472933e-05, + "loss": 4.9684, + "loss/crossentropy": 2.010605439543724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19537147507071495, + "step": 10950 + }, + { + "epoch": 0.9126666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.03980712890625, + "learning_rate": 1.3120745940441295e-05, + "loss": 5.0574, + "loss/crossentropy": 2.2360286712646484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22419559210538864, + "step": 10952 + }, + { + "epoch": 0.9128333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.054150390625, + "learning_rate": 1.3102330652887573e-05, + "loss": 4.4787, + "loss/crossentropy": 1.4420068562030792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13918348774313927, + "step": 10954 + }, + { + "epoch": 0.913, + "grad_norm": 4.46875, + "grad_norm_var": 0.0525390625, + "learning_rate": 1.308394225424966e-05, + "loss": 5.179, + "loss/crossentropy": 1.8898730650544167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17681991308927536, + "step": 10956 + }, + { + "epoch": 0.9131666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 1.3065580789899097e-05, + "loss": 4.9317, + "loss/crossentropy": 1.9032281190156937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17939336225390434, + "step": 10958 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.04269205729166667, + "learning_rate": 1.3047246305140982e-05, + "loss": 5.2684, + "loss/crossentropy": 2.2830842435359955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606996655464172, + "step": 10960 + }, + { + "epoch": 0.9135, + "grad_norm": 4.875, + "grad_norm_var": 0.04361572265625, + "learning_rate": 1.3028938845213828e-05, + "loss": 4.613, + "loss/crossentropy": 2.186072915792465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21759696677327156, + "step": 10962 + }, + { + "epoch": 0.9136666666666666, + "grad_norm": 4.90625, + "grad_norm_var": 0.04894205729166667, + "learning_rate": 1.3010658455289471e-05, + "loss": 5.0303, + "loss/crossentropy": 2.524782419204712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118838131427765, + "step": 10964 + }, + { + "epoch": 0.9138333333333334, + "grad_norm": 4.84375, + "grad_norm_var": 0.05349934895833333, + "learning_rate": 1.2992405180472953e-05, + "loss": 5.3547, + "loss/crossentropy": 2.450446605682373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266518585383892, + "step": 10966 + }, + { + "epoch": 0.914, + "grad_norm": 4.40625, + "grad_norm_var": 0.04920247395833333, + "learning_rate": 1.297417906580243e-05, + "loss": 4.9993, + "loss/crossentropy": 2.5028828382492065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21859336644411087, + "step": 10968 + }, + { + "epoch": 0.9141666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.0435546875, + "learning_rate": 1.2955980156249006e-05, + "loss": 4.9976, + "loss/crossentropy": 1.8704118728637695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18769513443112373, + "step": 10970 + }, + { + "epoch": 0.9143333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.04302978515625, + "learning_rate": 1.2937808496716699e-05, + "loss": 4.9612, + "loss/crossentropy": 2.005654275417328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18199091777205467, + "step": 10972 + }, + { + "epoch": 0.9145, + "grad_norm": 4.375, + "grad_norm_var": 0.049149576822916666, + "learning_rate": 1.291966413204227e-05, + "loss": 4.6524, + "loss/crossentropy": 1.4702882021665573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16470220685005188, + "step": 10974 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.04498697916666667, + "learning_rate": 1.2901547106995125e-05, + "loss": 4.5217, + "loss/crossentropy": 1.9486939013004303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17468804866075516, + "step": 10976 + }, + { + "epoch": 0.9148333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.0376953125, + "learning_rate": 1.2883457466277226e-05, + "loss": 4.2599, + "loss/crossentropy": 2.089116282761097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19068294391036034, + "step": 10978 + }, + { + "epoch": 0.915, + "grad_norm": 4.5625, + "grad_norm_var": 0.028108723958333335, + "learning_rate": 1.2865395254522972e-05, + "loss": 5.3167, + "loss/crossentropy": 2.388074040412903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232646606862545, + "step": 10980 + }, + { + "epoch": 0.9151666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.021744791666666666, + "learning_rate": 1.284736051629907e-05, + "loss": 4.6976, + "loss/crossentropy": 1.801637277007103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17922187969088554, + "step": 10982 + }, + { + "epoch": 0.9153333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 1.282935329610444e-05, + "loss": 4.4747, + "loss/crossentropy": 0.9352747425436974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11156686767935753, + "step": 10984 + }, + { + "epoch": 0.9155, + "grad_norm": 4.40625, + "grad_norm_var": 0.032145182291666664, + "learning_rate": 1.2811373638370108e-05, + "loss": 5.2869, + "loss/crossentropy": 1.7718759551644325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15538059920072556, + "step": 10986 + }, + { + "epoch": 0.9156666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.04332275390625, + "learning_rate": 1.27934215874591e-05, + "loss": 5.5746, + "loss/crossentropy": 2.574616312980652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20968396216630936, + "step": 10988 + }, + { + "epoch": 0.9158333333333334, + "grad_norm": 4.34375, + "grad_norm_var": 0.03982747395833333, + "learning_rate": 1.277549718766631e-05, + "loss": 4.4059, + "loss/crossentropy": 0.8218652456998825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11896517686545849, + "step": 10990 + }, + { + "epoch": 0.916, + "grad_norm": 4.90625, + "grad_norm_var": 0.04698893229166667, + "learning_rate": 1.2757600483218418e-05, + "loss": 5.491, + "loss/crossentropy": 1.6398277059197426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1669867243617773, + "step": 10992 + }, + { + "epoch": 0.9161666666666667, + "grad_norm": 5.09375, + "grad_norm_var": 0.07248942057291667, + "learning_rate": 1.273973151827375e-05, + "loss": 5.5911, + "loss/crossentropy": 2.2634086310863495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104131430387497, + "step": 10994 + }, + { + "epoch": 0.9163333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.07828369140625, + "learning_rate": 1.2721890336922219e-05, + "loss": 5.099, + "loss/crossentropy": 2.065896801650524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17549334466457367, + "step": 10996 + }, + { + "epoch": 0.9165, + "grad_norm": 4.5625, + "grad_norm_var": 0.06643473307291667, + "learning_rate": 1.2704076983185156e-05, + "loss": 5.3278, + "loss/crossentropy": 1.8941172808408737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17898527905344963, + "step": 10998 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 4.21875, + "grad_norm_var": 0.0703125, + "learning_rate": 1.2686291501015243e-05, + "loss": 4.571, + "loss/crossentropy": 1.3857896998524666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1634297538548708, + "step": 11000 + }, + { + "epoch": 0.9168333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.07737223307291667, + "learning_rate": 1.2668533934296388e-05, + "loss": 4.5265, + "loss/crossentropy": 1.7431185841560364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19076452031731606, + "step": 11002 + }, + { + "epoch": 0.917, + "grad_norm": 4.71875, + "grad_norm_var": 0.08941650390625, + "learning_rate": 1.2650804326843624e-05, + "loss": 4.7464, + "loss/crossentropy": 2.4716763496398926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999698057770729, + "step": 11004 + }, + { + "epoch": 0.9171666666666667, + "grad_norm": 4.84375, + "grad_norm_var": 0.08411458333333334, + "learning_rate": 1.2633102722402993e-05, + "loss": 4.9842, + "loss/crossentropy": 1.752097338438034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20358425192534924, + "step": 11006 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.08411458333333334, + "learning_rate": 1.2615429164651437e-05, + "loss": 4.9499, + "loss/crossentropy": 1.851276509463787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18495296128094196, + "step": 11008 + }, + { + "epoch": 0.9175, + "grad_norm": 4.6875, + "grad_norm_var": 0.054150390625, + "learning_rate": 1.2597783697196717e-05, + "loss": 4.9879, + "loss/crossentropy": 2.3310405611991882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24937022477388382, + "step": 11010 + }, + { + "epoch": 0.9176666666666666, + "grad_norm": 4.28125, + "grad_norm_var": 0.048563639322916664, + "learning_rate": 1.2580166363577262e-05, + "loss": 4.2779, + "loss/crossentropy": 1.6911320835351944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16903837397694588, + "step": 11012 + }, + { + "epoch": 0.9178333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.06197916666666667, + "learning_rate": 1.2562577207262094e-05, + "loss": 4.5681, + "loss/crossentropy": 2.2415121346712112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18968461081385612, + "step": 11014 + }, + { + "epoch": 0.918, + "grad_norm": 4.6875, + "grad_norm_var": 0.05308837890625, + "learning_rate": 1.2545016271650703e-05, + "loss": 4.945, + "loss/crossentropy": 2.0836883261799812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19096532836556435, + "step": 11016 + }, + { + "epoch": 0.9181666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.0548828125, + "learning_rate": 1.2527483600072958e-05, + "loss": 4.5348, + "loss/crossentropy": 1.9989722445607185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17667778953909874, + "step": 11018 + }, + { + "epoch": 0.9183333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.06900634765625, + "learning_rate": 1.2509979235788983e-05, + "loss": 4.9737, + "loss/crossentropy": 1.6975673288106918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16448040679097176, + "step": 11020 + }, + { + "epoch": 0.9185, + "grad_norm": 4.9375, + "grad_norm_var": 0.07389322916666667, + "learning_rate": 1.2492503221989052e-05, + "loss": 5.2162, + "loss/crossentropy": 1.9220678880810738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19711147621273994, + "step": 11022 + }, + { + "epoch": 0.9186666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.07893473307291667, + "learning_rate": 1.247505560179349e-05, + "loss": 4.6263, + "loss/crossentropy": 2.1792136132717133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1773302685469389, + "step": 11024 + }, + { + "epoch": 0.9188333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.07760416666666667, + "learning_rate": 1.2457636418252576e-05, + "loss": 4.8479, + "loss/crossentropy": 1.406090959906578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16720456257462502, + "step": 11026 + }, + { + "epoch": 0.919, + "grad_norm": 4.40625, + "grad_norm_var": 0.09107666015625, + "learning_rate": 1.2440245714346406e-05, + "loss": 5.0846, + "loss/crossentropy": 1.9798620790243149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17173392325639725, + "step": 11028 + }, + { + "epoch": 0.9191666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.07174479166666667, + "learning_rate": 1.2422883532984816e-05, + "loss": 4.3883, + "loss/crossentropy": 1.6991348788142204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16558572091162205, + "step": 11030 + }, + { + "epoch": 0.9193333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.07125244140625, + "learning_rate": 1.2405549917007256e-05, + "loss": 5.5011, + "loss/crossentropy": 1.8893938288092613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703520081937313, + "step": 11032 + }, + { + "epoch": 0.9195, + "grad_norm": 4.78125, + "grad_norm_var": 0.0822265625, + "learning_rate": 1.2388244909182714e-05, + "loss": 4.652, + "loss/crossentropy": 2.1600342392921448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012832432985306, + "step": 11034 + }, + { + "epoch": 0.9196666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.06418863932291667, + "learning_rate": 1.2370968552209557e-05, + "loss": 5.0321, + "loss/crossentropy": 2.060860723257065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1751319319009781, + "step": 11036 + }, + { + "epoch": 0.9198333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.05611572265625, + "learning_rate": 1.2353720888715498e-05, + "loss": 4.6567, + "loss/crossentropy": 2.5102124214172363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21535085886716843, + "step": 11038 + }, + { + "epoch": 0.92, + "grad_norm": 4.5, + "grad_norm_var": 0.0546875, + "learning_rate": 1.2336501961257421e-05, + "loss": 5.4442, + "loss/crossentropy": 2.1127854585647583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202357716858387, + "step": 11040 + }, + { + "epoch": 0.9201666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.06477457682291667, + "learning_rate": 1.231931181232132e-05, + "loss": 5.2808, + "loss/crossentropy": 2.1201717257499695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18782678991556168, + "step": 11042 + }, + { + "epoch": 0.9203333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.049853515625, + "learning_rate": 1.2302150484322178e-05, + "loss": 5.2121, + "loss/crossentropy": 2.13012208789587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18352052569389343, + "step": 11044 + }, + { + "epoch": 0.9205, + "grad_norm": 4.34375, + "grad_norm_var": 0.04674072265625, + "learning_rate": 1.2285018019603867e-05, + "loss": 4.8306, + "loss/crossentropy": 2.0504641234874725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21500540524721146, + "step": 11046 + }, + { + "epoch": 0.9206666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.06425374348958333, + "learning_rate": 1.2267914460439046e-05, + "loss": 4.9593, + "loss/crossentropy": 1.9509310349822044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244015671312809, + "step": 11048 + }, + { + "epoch": 0.9208333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.05028889973958333, + "learning_rate": 1.2250839849029038e-05, + "loss": 4.9527, + "loss/crossentropy": 1.923073947429657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1771315522491932, + "step": 11050 + }, + { + "epoch": 0.921, + "grad_norm": 4.65625, + "grad_norm_var": 0.04788004557291667, + "learning_rate": 1.2233794227503747e-05, + "loss": 5.1758, + "loss/crossentropy": 2.4496266841888428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912783220410347, + "step": 11052 + }, + { + "epoch": 0.9211666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.05807291666666667, + "learning_rate": 1.2216777637921565e-05, + "loss": 4.3485, + "loss/crossentropy": 1.2497363984584808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13991161063313484, + "step": 11054 + }, + { + "epoch": 0.9213333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.060009765625, + "learning_rate": 1.2199790122269222e-05, + "loss": 4.95, + "loss/crossentropy": 2.5281606912612915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20351628586649895, + "step": 11056 + }, + { + "epoch": 0.9215, + "grad_norm": 4.78125, + "grad_norm_var": 0.060791015625, + "learning_rate": 1.2182831722461727e-05, + "loss": 4.9759, + "loss/crossentropy": 1.8820656910538673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970411352813244, + "step": 11058 + }, + { + "epoch": 0.9216666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.065625, + "learning_rate": 1.2165902480342244e-05, + "loss": 5.4734, + "loss/crossentropy": 2.3079889118671417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18381556309759617, + "step": 11060 + }, + { + "epoch": 0.9218333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.06405843098958333, + "learning_rate": 1.2149002437682004e-05, + "loss": 5.1037, + "loss/crossentropy": 2.392060697078705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238174125552177, + "step": 11062 + }, + { + "epoch": 0.922, + "grad_norm": 4.46875, + "grad_norm_var": 0.04625244140625, + "learning_rate": 1.2132131636180175e-05, + "loss": 5.283, + "loss/crossentropy": 2.3549709618091583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19442760944366455, + "step": 11064 + }, + { + "epoch": 0.9221666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.03883056640625, + "learning_rate": 1.2115290117463785e-05, + "loss": 5.2563, + "loss/crossentropy": 1.8912615105509758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17870598286390305, + "step": 11066 + }, + { + "epoch": 0.9223333333333333, + "grad_norm": 4.1875, + "grad_norm_var": 0.04599202473958333, + "learning_rate": 1.20984779230876e-05, + "loss": 5.0817, + "loss/crossentropy": 1.5166109129786491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1375791635364294, + "step": 11068 + }, + { + "epoch": 0.9225, + "grad_norm": 4.5625, + "grad_norm_var": 0.039872233072916666, + "learning_rate": 1.2081695094534054e-05, + "loss": 4.9119, + "loss/crossentropy": 2.3969703912734985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224704012274742, + "step": 11070 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.03229166666666667, + "learning_rate": 1.2064941673213088e-05, + "loss": 4.7744, + "loss/crossentropy": 2.287917584180832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23141101002693176, + "step": 11072 + }, + { + "epoch": 0.9228333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.026102701822916668, + "learning_rate": 1.204821770046212e-05, + "loss": 5.204, + "loss/crossentropy": 2.38395032286644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20762236416339874, + "step": 11074 + }, + { + "epoch": 0.923, + "grad_norm": 4.75, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 1.2031523217545887e-05, + "loss": 4.7377, + "loss/crossentropy": 2.1217075884342194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20826097577810287, + "step": 11076 + }, + { + "epoch": 0.9231666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.024995930989583335, + "learning_rate": 1.2014858265656357e-05, + "loss": 5.0618, + "loss/crossentropy": 1.8859133496880531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17268339358270168, + "step": 11078 + }, + { + "epoch": 0.9233333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.02662353515625, + "learning_rate": 1.1998222885912649e-05, + "loss": 4.695, + "loss/crossentropy": 1.691146194934845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018352426588535, + "step": 11080 + }, + { + "epoch": 0.9235, + "grad_norm": 4.5, + "grad_norm_var": 41.13619384765625, + "learning_rate": 1.1981617119360914e-05, + "loss": 4.4921, + "loss/crossentropy": 1.6973706856369972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1616340707987547, + "step": 11082 + }, + { + "epoch": 0.9236666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 41.20631103515625, + "learning_rate": 1.196504100697422e-05, + "loss": 4.9428, + "loss/crossentropy": 1.8388321250677109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19290312752127647, + "step": 11084 + }, + { + "epoch": 0.9238333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 41.232906087239584, + "learning_rate": 1.1948494589652487e-05, + "loss": 4.5823, + "loss/crossentropy": 1.5040778517723083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14187092706561089, + "step": 11086 + }, + { + "epoch": 0.924, + "grad_norm": 4.5, + "grad_norm_var": 41.27118733723958, + "learning_rate": 1.193197790822234e-05, + "loss": 4.8349, + "loss/crossentropy": 1.6005319356918335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17666389420628548, + "step": 11088 + }, + { + "epoch": 0.9241666666666667, + "grad_norm": 4.75, + "grad_norm_var": 41.18232014973958, + "learning_rate": 1.1915491003437065e-05, + "loss": 4.9295, + "loss/crossentropy": 1.6699941158294678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17156944423913956, + "step": 11090 + }, + { + "epoch": 0.9243333333333333, + "grad_norm": 4.5, + "grad_norm_var": 41.20974934895833, + "learning_rate": 1.1899033915976453e-05, + "loss": 4.8379, + "loss/crossentropy": 1.6831908822059631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17644242197275162, + "step": 11092 + }, + { + "epoch": 0.9245, + "grad_norm": 4.46875, + "grad_norm_var": 41.141011555989586, + "learning_rate": 1.1882606686446732e-05, + "loss": 5.266, + "loss/crossentropy": 2.5757681727409363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20049363002181053, + "step": 11094 + }, + { + "epoch": 0.9246666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 40.965458170572916, + "learning_rate": 1.1866209355380452e-05, + "loss": 4.6894, + "loss/crossentropy": 1.46444021910429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18953786976635456, + "step": 11096 + }, + { + "epoch": 0.9248333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.03970947265625, + "learning_rate": 1.1849841963236408e-05, + "loss": 5.0015, + "loss/crossentropy": 2.0792530477046967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22961734235286713, + "step": 11098 + }, + { + "epoch": 0.925, + "grad_norm": 4.5625, + "grad_norm_var": 0.03365478515625, + "learning_rate": 1.1833504550399506e-05, + "loss": 4.8575, + "loss/crossentropy": 2.336266815662384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21120865643024445, + "step": 11100 + }, + { + "epoch": 0.9251666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.04254150390625, + "learning_rate": 1.1817197157180693e-05, + "loss": 5.5143, + "loss/crossentropy": 2.3953791558742523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296001985669136, + "step": 11102 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.05139567057291667, + "learning_rate": 1.1800919823816834e-05, + "loss": 5.3345, + "loss/crossentropy": 2.2379717230796814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23791631683707237, + "step": 11104 + }, + { + "epoch": 0.9255, + "grad_norm": 4.375, + "grad_norm_var": 0.056624348958333334, + "learning_rate": 1.1784672590470643e-05, + "loss": 4.544, + "loss/crossentropy": 1.2902886420488358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16380742564797401, + "step": 11106 + }, + { + "epoch": 0.9256666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.06513264973958334, + "learning_rate": 1.1768455497230537e-05, + "loss": 4.456, + "loss/crossentropy": 1.441624328494072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13727031461894512, + "step": 11108 + }, + { + "epoch": 0.9258333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.060868326822916666, + "learning_rate": 1.1752268584110593e-05, + "loss": 4.7432, + "loss/crossentropy": 1.9650721102952957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18655399791896343, + "step": 11110 + }, + { + "epoch": 0.926, + "grad_norm": 4.78125, + "grad_norm_var": 0.051102701822916666, + "learning_rate": 1.1736111891050406e-05, + "loss": 5.304, + "loss/crossentropy": 2.145664870738983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.239684097468853, + "step": 11112 + }, + { + "epoch": 0.9261666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.04827067057291667, + "learning_rate": 1.1719985457915014e-05, + "loss": 4.5032, + "loss/crossentropy": 2.4814305305480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20958179607987404, + "step": 11114 + }, + { + "epoch": 0.9263333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.04413655598958333, + "learning_rate": 1.1703889324494778e-05, + "loss": 4.7755, + "loss/crossentropy": 2.2378440499305725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20146576315164566, + "step": 11116 + }, + { + "epoch": 0.9265, + "grad_norm": 4.34375, + "grad_norm_var": 0.03313395182291667, + "learning_rate": 1.1687823530505315e-05, + "loss": 5.0817, + "loss/crossentropy": 2.245323598384857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19192436337471008, + "step": 11118 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 4.34375, + "grad_norm_var": 0.03518473307291667, + "learning_rate": 1.1671788115587374e-05, + "loss": 4.7758, + "loss/crossentropy": 1.3894713819026947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16750122606754303, + "step": 11120 + }, + { + "epoch": 0.9268333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.03463541666666667, + "learning_rate": 1.1655783119306752e-05, + "loss": 5.3347, + "loss/crossentropy": 2.481057107448578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20578979700803757, + "step": 11122 + }, + { + "epoch": 0.927, + "grad_norm": 4.15625, + "grad_norm_var": 0.04648030598958333, + "learning_rate": 1.163980858115417e-05, + "loss": 4.632, + "loss/crossentropy": 1.7022991552948952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15510082617402077, + "step": 11124 + }, + { + "epoch": 0.9271666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.04983317057291667, + "learning_rate": 1.1623864540545231e-05, + "loss": 4.7536, + "loss/crossentropy": 2.399439185857773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043217159807682, + "step": 11126 + }, + { + "epoch": 0.9273333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.05299072265625, + "learning_rate": 1.1607951036820262e-05, + "loss": 4.3418, + "loss/crossentropy": 1.9050696045160294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17808566614985466, + "step": 11128 + }, + { + "epoch": 0.9275, + "grad_norm": 4.8125, + "grad_norm_var": 0.06236979166666667, + "learning_rate": 1.1592068109244253e-05, + "loss": 5.3348, + "loss/crossentropy": 2.106477528810501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24340755119919777, + "step": 11130 + }, + { + "epoch": 0.9276666666666666, + "grad_norm": 4.75, + "grad_norm_var": 0.06741129557291667, + "learning_rate": 1.1576215797006743e-05, + "loss": 5.5603, + "loss/crossentropy": 2.5341862440109253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22335410490632057, + "step": 11132 + }, + { + "epoch": 0.9278333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.07589518229166667, + "learning_rate": 1.1560394139221746e-05, + "loss": 4.3389, + "loss/crossentropy": 1.561987891793251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1611312460154295, + "step": 11134 + }, + { + "epoch": 0.928, + "grad_norm": 4.84375, + "grad_norm_var": 0.075634765625, + "learning_rate": 1.154460317492763e-05, + "loss": 5.1787, + "loss/crossentropy": 1.756756342947483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17010868340730667, + "step": 11136 + }, + { + "epoch": 0.9281666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.07965087890625, + "learning_rate": 1.152884294308702e-05, + "loss": 5.1386, + "loss/crossentropy": 2.078896164894104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20521379634737968, + "step": 11138 + }, + { + "epoch": 0.9283333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.06614176432291667, + "learning_rate": 1.1513113482586724e-05, + "loss": 5.0978, + "loss/crossentropy": 2.103352040052414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20386765152215958, + "step": 11140 + }, + { + "epoch": 0.9285, + "grad_norm": 4.375, + "grad_norm_var": 0.06272379557291667, + "learning_rate": 1.1497414832237634e-05, + "loss": 5.1341, + "loss/crossentropy": 2.3629302382469177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23492885753512383, + "step": 11142 + }, + { + "epoch": 0.9286666666666666, + "grad_norm": 4.625, + "grad_norm_var": 0.05846354166666667, + "learning_rate": 1.1481747030774593e-05, + "loss": 4.9212, + "loss/crossentropy": 1.937618963420391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17890803515911102, + "step": 11144 + }, + { + "epoch": 0.9288333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.04869384765625, + "learning_rate": 1.1466110116856353e-05, + "loss": 4.8033, + "loss/crossentropy": 1.8597223535180092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16911944560706615, + "step": 11146 + }, + { + "epoch": 0.929, + "grad_norm": 4.65625, + "grad_norm_var": 0.04894205729166667, + "learning_rate": 1.1450504129065438e-05, + "loss": 4.7281, + "loss/crossentropy": 2.203222244977951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975608691573143, + "step": 11148 + }, + { + "epoch": 0.9291666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.037886555989583334, + "learning_rate": 1.1434929105908086e-05, + "loss": 4.8444, + "loss/crossentropy": 1.9725009500980377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20708056166768074, + "step": 11150 + }, + { + "epoch": 0.9293333333333333, + "grad_norm": 4.1875, + "grad_norm_var": 0.03948160807291667, + "learning_rate": 1.1419385085814099e-05, + "loss": 4.9513, + "loss/crossentropy": 2.3870702385902405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217625379562378, + "step": 11152 + }, + { + "epoch": 0.9295, + "grad_norm": 4.8125, + "grad_norm_var": 0.04568684895833333, + "learning_rate": 1.1403872107136816e-05, + "loss": 4.6376, + "loss/crossentropy": 1.5546553134918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16211243718862534, + "step": 11154 + }, + { + "epoch": 0.9296666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.046223958333333336, + "learning_rate": 1.1388390208152962e-05, + "loss": 5.1388, + "loss/crossentropy": 2.185354083776474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2254449725151062, + "step": 11156 + }, + { + "epoch": 0.9298333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.048567708333333334, + "learning_rate": 1.1372939427062588e-05, + "loss": 4.4744, + "loss/crossentropy": 1.2566208392381668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1643849052488804, + "step": 11158 + }, + { + "epoch": 0.93, + "grad_norm": 4.28125, + "grad_norm_var": 0.054671223958333334, + "learning_rate": 1.1357519801988954e-05, + "loss": 4.1238, + "loss/crossentropy": 1.4119196981191635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16916809789836407, + "step": 11160 + }, + { + "epoch": 0.9301666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.06249593098958333, + "learning_rate": 1.1342131370978461e-05, + "loss": 5.0425, + "loss/crossentropy": 2.1853462755680084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916177235543728, + "step": 11162 + }, + { + "epoch": 0.9303333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.05829671223958333, + "learning_rate": 1.132677417200053e-05, + "loss": 4.6284, + "loss/crossentropy": 1.6912791430950165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18075689859688282, + "step": 11164 + }, + { + "epoch": 0.9305, + "grad_norm": 5.71875, + "grad_norm_var": 0.15859375, + "learning_rate": 1.131144824294752e-05, + "loss": 4.5363, + "loss/crossentropy": 1.6092994064092636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17647656612098217, + "step": 11166 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 4.6875, + "grad_norm_var": 0.14659830729166667, + "learning_rate": 1.1296153621634636e-05, + "loss": 5.163, + "loss/crossentropy": 1.3289310112595558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1501878947019577, + "step": 11168 + }, + { + "epoch": 0.9308333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.14178059895833334, + "learning_rate": 1.1280890345799842e-05, + "loss": 4.7383, + "loss/crossentropy": 2.2952709197998047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19613110646605492, + "step": 11170 + }, + { + "epoch": 0.931, + "grad_norm": 4.15625, + "grad_norm_var": 0.151806640625, + "learning_rate": 1.126565845310375e-05, + "loss": 4.2535, + "loss/crossentropy": 2.18592032790184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18933845311403275, + "step": 11172 + }, + { + "epoch": 0.9311666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.15920817057291667, + "learning_rate": 1.125045798112954e-05, + "loss": 5.3535, + "loss/crossentropy": 1.5739598274230957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222409188747406, + "step": 11174 + }, + { + "epoch": 0.9313333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.14685872395833333, + "learning_rate": 1.1235288967382864e-05, + "loss": 5.3601, + "loss/crossentropy": 2.36248779296875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22431249916553497, + "step": 11176 + }, + { + "epoch": 0.9315, + "grad_norm": 4.625, + "grad_norm_var": 0.15898030598958332, + "learning_rate": 1.1220151449291767e-05, + "loss": 5.0139, + "loss/crossentropy": 1.7432967498898506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1720932126045227, + "step": 11178 + }, + { + "epoch": 0.9316666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.14892171223958334, + "learning_rate": 1.1205045464206552e-05, + "loss": 5.3784, + "loss/crossentropy": 2.0632041543722153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18448470905423164, + "step": 11180 + }, + { + "epoch": 0.9318333333333333, + "grad_norm": 4.15625, + "grad_norm_var": 0.08092447916666666, + "learning_rate": 1.1189971049399753e-05, + "loss": 4.7851, + "loss/crossentropy": 2.0081071704626083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17401228658854961, + "step": 11182 + }, + { + "epoch": 0.932, + "grad_norm": 4.28125, + "grad_norm_var": 0.08815104166666667, + "learning_rate": 1.1174928242065974e-05, + "loss": 4.8777, + "loss/crossentropy": 1.90491384267807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091764397919178, + "step": 11184 + }, + { + "epoch": 0.9321666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.0841796875, + "learning_rate": 1.1159917079321865e-05, + "loss": 5.2098, + "loss/crossentropy": 1.6461158990859985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16235189884901047, + "step": 11186 + }, + { + "epoch": 0.9323333333333333, + "grad_norm": 4.1875, + "grad_norm_var": 0.07303059895833333, + "learning_rate": 1.114493759820596e-05, + "loss": 4.622, + "loss/crossentropy": 1.760587900876999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17077770829200745, + "step": 11188 + }, + { + "epoch": 0.9325, + "grad_norm": 4.53125, + "grad_norm_var": 0.065478515625, + "learning_rate": 1.112998983567865e-05, + "loss": 4.8809, + "loss/crossentropy": 2.216802418231964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19440119341015816, + "step": 11190 + }, + { + "epoch": 0.9326666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.0654296875, + "learning_rate": 1.1115073828622052e-05, + "loss": 5.1407, + "loss/crossentropy": 1.1693921089172363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12160374782979488, + "step": 11192 + }, + { + "epoch": 0.9328333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.04021809895833333, + "learning_rate": 1.110018961383993e-05, + "loss": 5.0442, + "loss/crossentropy": 2.37707781791687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22920748591423035, + "step": 11194 + }, + { + "epoch": 0.933, + "grad_norm": 5.1875, + "grad_norm_var": 0.06210530598958333, + "learning_rate": 1.1085337228057597e-05, + "loss": 5.733, + "loss/crossentropy": 1.9281913936138153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2165273167192936, + "step": 11196 + }, + { + "epoch": 0.9331666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.0515625, + "learning_rate": 1.1070516707921849e-05, + "loss": 4.6594, + "loss/crossentropy": 1.7963752299547195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17561664432287216, + "step": 11198 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.04371337890625, + "learning_rate": 1.1055728090000844e-05, + "loss": 4.6236, + "loss/crossentropy": 2.2825274989008904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17308339476585388, + "step": 11200 + }, + { + "epoch": 0.9335, + "grad_norm": 5.125, + "grad_norm_var": 0.06678059895833334, + "learning_rate": 1.1040971410784026e-05, + "loss": 4.8975, + "loss/crossentropy": 1.9707480520009995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18031662702560425, + "step": 11202 + }, + { + "epoch": 0.9336666666666666, + "grad_norm": 4.25, + "grad_norm_var": 0.06982014973958334, + "learning_rate": 1.1026246706682024e-05, + "loss": 4.7378, + "loss/crossentropy": 1.832257367670536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1671583391726017, + "step": 11204 + }, + { + "epoch": 0.9338333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.0720703125, + "learning_rate": 1.10115540140266e-05, + "loss": 5.1746, + "loss/crossentropy": 1.7907705903053284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.184514869004488, + "step": 11206 + }, + { + "epoch": 0.934, + "grad_norm": 4.59375, + "grad_norm_var": 0.07584228515625, + "learning_rate": 1.0996893369070497e-05, + "loss": 4.9469, + "loss/crossentropy": 1.8899082094430923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16880904138088226, + "step": 11208 + }, + { + "epoch": 0.9341666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.07815348307291667, + "learning_rate": 1.098226480798741e-05, + "loss": 5.3, + "loss/crossentropy": 2.023133747279644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2057199329137802, + "step": 11210 + }, + { + "epoch": 0.9343333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.052018229166666666, + "learning_rate": 1.0967668366871851e-05, + "loss": 4.7523, + "loss/crossentropy": 1.6161313951015472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110748626291752, + "step": 11212 + }, + { + "epoch": 0.9345, + "grad_norm": 4.09375, + "grad_norm_var": 0.0712890625, + "learning_rate": 1.0953104081739094e-05, + "loss": 4.1247, + "loss/crossentropy": 1.3108344376087189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14841301552951336, + "step": 11214 + }, + { + "epoch": 0.9346666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.07255452473958333, + "learning_rate": 1.0938571988525059e-05, + "loss": 5.3319, + "loss/crossentropy": 2.4737696051597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20922718197107315, + "step": 11216 + }, + { + "epoch": 0.9348333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.045572916666666664, + "learning_rate": 1.0924072123086247e-05, + "loss": 5.1227, + "loss/crossentropy": 2.4679543375968933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21668322756886482, + "step": 11218 + }, + { + "epoch": 0.935, + "grad_norm": 4.59375, + "grad_norm_var": 0.04117431640625, + "learning_rate": 1.0909604521199624e-05, + "loss": 4.2241, + "loss/crossentropy": 1.8565114438533783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19333305954933167, + "step": 11220 + }, + { + "epoch": 0.9351666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.04308268229166667, + "learning_rate": 1.0895169218562578e-05, + "loss": 4.6922, + "loss/crossentropy": 1.6779407858848572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1518637202680111, + "step": 11222 + }, + { + "epoch": 0.9353333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.04244791666666667, + "learning_rate": 1.0880766250792765e-05, + "loss": 4.6942, + "loss/crossentropy": 1.5626015737652779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16061452589929104, + "step": 11224 + }, + { + "epoch": 0.9355, + "grad_norm": 4.5, + "grad_norm_var": 0.04058837890625, + "learning_rate": 1.0866395653428086e-05, + "loss": 4.4, + "loss/crossentropy": 2.4633554816246033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20592839643359184, + "step": 11226 + }, + { + "epoch": 0.9356666666666666, + "grad_norm": 4.09375, + "grad_norm_var": 0.05310872395833333, + "learning_rate": 1.085205746192656e-05, + "loss": 4.9874, + "loss/crossentropy": 2.4507370591163635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22082506865262985, + "step": 11228 + }, + { + "epoch": 0.9358333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.04401041666666667, + "learning_rate": 1.0837751711666246e-05, + "loss": 4.5429, + "loss/crossentropy": 1.2794091627001762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1429294366389513, + "step": 11230 + }, + { + "epoch": 0.936, + "grad_norm": 4.78125, + "grad_norm_var": 0.04869384765625, + "learning_rate": 1.0823478437945164e-05, + "loss": 5.0331, + "loss/crossentropy": 2.070686124265194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17334336414933205, + "step": 11232 + }, + { + "epoch": 0.9361666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.060872395833333336, + "learning_rate": 1.0809237675981197e-05, + "loss": 4.8727, + "loss/crossentropy": 1.6955928951501846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17060757614672184, + "step": 11234 + }, + { + "epoch": 0.9363333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.06170247395833333, + "learning_rate": 1.0795029460912008e-05, + "loss": 4.7234, + "loss/crossentropy": 1.8138331472873688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17841831222176552, + "step": 11236 + }, + { + "epoch": 0.9365, + "grad_norm": 4.65625, + "grad_norm_var": 0.05716145833333333, + "learning_rate": 1.0780853827794959e-05, + "loss": 4.884, + "loss/crossentropy": 2.296046257019043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997358687222004, + "step": 11238 + }, + { + "epoch": 0.9366666666666666, + "grad_norm": 4.84375, + "grad_norm_var": 0.06222330729166667, + "learning_rate": 1.0766710811607011e-05, + "loss": 5.2366, + "loss/crossentropy": 1.3264843076467514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1398827638477087, + "step": 11240 + }, + { + "epoch": 0.9368333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.051656087239583336, + "learning_rate": 1.0752600447244654e-05, + "loss": 5.2335, + "loss/crossentropy": 1.9065971076488495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1831565536558628, + "step": 11242 + }, + { + "epoch": 0.937, + "grad_norm": 4.40625, + "grad_norm_var": 0.03863525390625, + "learning_rate": 1.073852276952381e-05, + "loss": 4.6856, + "loss/crossentropy": 2.1032577455043793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24391169473528862, + "step": 11244 + }, + { + "epoch": 0.9371666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.024202473958333335, + "learning_rate": 1.072447781317975e-05, + "loss": 4.7772, + "loss/crossentropy": 2.3768675327301025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24659275636076927, + "step": 11246 + }, + { + "epoch": 0.9373333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.020572916666666666, + "learning_rate": 1.0710465612866999e-05, + "loss": 4.7231, + "loss/crossentropy": 1.6266694143414497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16489019989967346, + "step": 11248 + }, + { + "epoch": 0.9375, + "grad_norm": 4.0625, + "grad_norm_var": 0.035868326822916664, + "learning_rate": 1.0696486203159275e-05, + "loss": 4.5313, + "loss/crossentropy": 1.6675259098410606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16724395845085382, + "step": 11250 + }, + { + "epoch": 0.9376666666666666, + "grad_norm": 4.65625, + "grad_norm_var": 0.038798014322916664, + "learning_rate": 1.068253961854939e-05, + "loss": 5.0235, + "loss/crossentropy": 1.755689986050129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17844800278544426, + "step": 11252 + }, + { + "epoch": 0.9378333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.0421875, + "learning_rate": 1.0668625893449138e-05, + "loss": 4.6551, + "loss/crossentropy": 1.8777173906564713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17272545769810677, + "step": 11254 + }, + { + "epoch": 0.938, + "grad_norm": 4.375, + "grad_norm_var": 0.04452718098958333, + "learning_rate": 1.0654745062189265e-05, + "loss": 4.3983, + "loss/crossentropy": 2.261226326227188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18433188647031784, + "step": 11256 + }, + { + "epoch": 0.9381666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.055562337239583336, + "learning_rate": 1.0640897159019337e-05, + "loss": 5.332, + "loss/crossentropy": 1.8724040985107422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859893724322319, + "step": 11258 + }, + { + "epoch": 0.9383333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.0578125, + "learning_rate": 1.062708221810768e-05, + "loss": 4.9082, + "loss/crossentropy": 1.9240873903036118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816778089851141, + "step": 11260 + }, + { + "epoch": 0.9385, + "grad_norm": 4.53125, + "grad_norm_var": 0.05546875, + "learning_rate": 1.0613300273541285e-05, + "loss": 4.7688, + "loss/crossentropy": 2.6359651684761047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23091797903180122, + "step": 11262 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.0478515625, + "learning_rate": 1.0599551359325735e-05, + "loss": 5.138, + "loss/crossentropy": 2.3837802410125732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20433859154582024, + "step": 11264 + }, + { + "epoch": 0.9388333333333333, + "grad_norm": 5.65625, + "grad_norm_var": 0.11451416015625, + "learning_rate": 1.0585835509385108e-05, + "loss": 4.6649, + "loss/crossentropy": 1.5584058910608292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1602993868291378, + "step": 11266 + }, + { + "epoch": 0.939, + "grad_norm": 4.28125, + "grad_norm_var": 0.117822265625, + "learning_rate": 1.0572152757561898e-05, + "loss": 4.6806, + "loss/crossentropy": 2.2199689149856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136174440383911, + "step": 11268 + }, + { + "epoch": 0.9391666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.11357014973958333, + "learning_rate": 1.0558503137616932e-05, + "loss": 4.7357, + "loss/crossentropy": 1.70195122808218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203688949346542, + "step": 11270 + }, + { + "epoch": 0.9393333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.10624593098958333, + "learning_rate": 1.0544886683229296e-05, + "loss": 4.5968, + "loss/crossentropy": 2.2238181829452515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21144040301442146, + "step": 11272 + }, + { + "epoch": 0.9395, + "grad_norm": 4.625, + "grad_norm_var": 0.09920247395833333, + "learning_rate": 1.0531303427996238e-05, + "loss": 4.9468, + "loss/crossentropy": 2.153789669275284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122886162251234, + "step": 11274 + }, + { + "epoch": 0.9396666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.092041015625, + "learning_rate": 1.0517753405433089e-05, + "loss": 5.4184, + "loss/crossentropy": 2.0947333574295044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19803571701049805, + "step": 11276 + }, + { + "epoch": 0.9398333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.09853108723958333, + "learning_rate": 1.0504236648973173e-05, + "loss": 4.4953, + "loss/crossentropy": 2.064661145210266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201131172478199, + "step": 11278 + }, + { + "epoch": 0.94, + "grad_norm": 4.4375, + "grad_norm_var": 1.8308430989583333, + "learning_rate": 1.0490753191967764e-05, + "loss": 5.4182, + "loss/crossentropy": 2.5684571266174316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23840726912021637, + "step": 11280 + }, + { + "epoch": 0.9401666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 1.8093587239583333, + "learning_rate": 1.047730306768593e-05, + "loss": 4.9324, + "loss/crossentropy": 2.354119837284088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058768942952156, + "step": 11282 + }, + { + "epoch": 0.9403333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 1.820166015625, + "learning_rate": 1.0463886309314528e-05, + "loss": 4.8346, + "loss/crossentropy": 1.460967630147934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15813573449850082, + "step": 11284 + }, + { + "epoch": 0.9405, + "grad_norm": 4.25, + "grad_norm_var": 1.8402303059895833, + "learning_rate": 1.045050294995807e-05, + "loss": 5.3441, + "loss/crossentropy": 1.9262563213706017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17652270942926407, + "step": 11286 + }, + { + "epoch": 0.9406666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 1.8167805989583334, + "learning_rate": 1.0437153022638674e-05, + "loss": 4.8955, + "loss/crossentropy": 1.821229636669159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17122468166053295, + "step": 11288 + }, + { + "epoch": 0.9408333333333333, + "grad_norm": 4.75, + "grad_norm_var": 1.8264322916666667, + "learning_rate": 1.0423836560295944e-05, + "loss": 4.8418, + "loss/crossentropy": 2.3553628623485565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22077826783061028, + "step": 11290 + }, + { + "epoch": 0.941, + "grad_norm": 4.3125, + "grad_norm_var": 1.84986572265625, + "learning_rate": 1.0410553595786939e-05, + "loss": 4.656, + "loss/crossentropy": 1.8085390403866768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14958495646715164, + "step": 11292 + }, + { + "epoch": 0.9411666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 1.840087890625, + "learning_rate": 1.0397304161886049e-05, + "loss": 5.1937, + "loss/crossentropy": 2.1406570374965668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25243185088038445, + "step": 11294 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 4.4375, + "grad_norm_var": 0.03472900390625, + "learning_rate": 1.0384088291284935e-05, + "loss": 5.2409, + "loss/crossentropy": 2.0899730026721954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21066424995660782, + "step": 11296 + }, + { + "epoch": 0.9415, + "grad_norm": 4.40625, + "grad_norm_var": 0.04459228515625, + "learning_rate": 1.0370906016592441e-05, + "loss": 5.0729, + "loss/crossentropy": 2.3987383246421814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20506568253040314, + "step": 11298 + }, + { + "epoch": 0.9416666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.042561848958333336, + "learning_rate": 1.0357757370334528e-05, + "loss": 4.9799, + "loss/crossentropy": 2.0026203393936157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17385689727962017, + "step": 11300 + }, + { + "epoch": 0.9418333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.03883056640625, + "learning_rate": 1.0344642384954166e-05, + "loss": 4.2991, + "loss/crossentropy": 1.580312892794609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19087418541312218, + "step": 11302 + }, + { + "epoch": 0.942, + "grad_norm": 4.25, + "grad_norm_var": 0.0435546875, + "learning_rate": 1.0331561092811282e-05, + "loss": 4.7055, + "loss/crossentropy": 1.8026033341884613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18064633011817932, + "step": 11304 + }, + { + "epoch": 0.9421666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.040087890625, + "learning_rate": 1.0318513526182659e-05, + "loss": 5.2248, + "loss/crossentropy": 1.9698734879493713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878417208790779, + "step": 11306 + }, + { + "epoch": 0.9423333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.03860677083333333, + "learning_rate": 1.0305499717261872e-05, + "loss": 5.1038, + "loss/crossentropy": 2.082389175891876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19220246747136116, + "step": 11308 + }, + { + "epoch": 0.9425, + "grad_norm": 4.875, + "grad_norm_var": 0.04412434895833333, + "learning_rate": 1.029251969815921e-05, + "loss": 5.6204, + "loss/crossentropy": 1.8455578163266182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17056034691631794, + "step": 11310 + }, + { + "epoch": 0.9426666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.04192301432291667, + "learning_rate": 1.0279573500901568e-05, + "loss": 4.7678, + "loss/crossentropy": 1.603665716946125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16744246520102024, + "step": 11312 + }, + { + "epoch": 0.9428333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.03917643229166667, + "learning_rate": 1.0266661157432403e-05, + "loss": 4.763, + "loss/crossentropy": 2.118795096874237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045084908604622, + "step": 11314 + }, + { + "epoch": 0.943, + "grad_norm": 4.28125, + "grad_norm_var": 0.04192301432291667, + "learning_rate": 1.0253782699611648e-05, + "loss": 4.7973, + "loss/crossentropy": 2.4567037224769592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456681564450264, + "step": 11316 + }, + { + "epoch": 0.9431666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.03599853515625, + "learning_rate": 1.0240938159215603e-05, + "loss": 4.6769, + "loss/crossentropy": 1.9589063227176666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025170996785164, + "step": 11318 + }, + { + "epoch": 0.9433333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.044820149739583336, + "learning_rate": 1.0228127567936906e-05, + "loss": 4.4701, + "loss/crossentropy": 1.1538459286093712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13562462851405144, + "step": 11320 + }, + { + "epoch": 0.9435, + "grad_norm": 4.375, + "grad_norm_var": 0.045182291666666666, + "learning_rate": 1.0215350957384408e-05, + "loss": 4.4514, + "loss/crossentropy": 2.2808018624782562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20572230219841003, + "step": 11322 + }, + { + "epoch": 0.9436666666666667, + "grad_norm": 4.96875, + "grad_norm_var": 0.05520833333333333, + "learning_rate": 1.0202608359083141e-05, + "loss": 4.7319, + "loss/crossentropy": 2.2184520065784454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21957562491297722, + "step": 11324 + }, + { + "epoch": 0.9438333333333333, + "grad_norm": 4.09375, + "grad_norm_var": 0.057450358072916666, + "learning_rate": 1.0189899804474181e-05, + "loss": 4.4743, + "loss/crossentropy": 1.53329998254776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14534585922956467, + "step": 11326 + }, + { + "epoch": 0.944, + "grad_norm": 4.875, + "grad_norm_var": 0.06506754557291666, + "learning_rate": 1.0177225324914637e-05, + "loss": 5.4519, + "loss/crossentropy": 2.229980379343033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18609286099672318, + "step": 11328 + }, + { + "epoch": 0.9441666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.06656494140625, + "learning_rate": 1.0164584951677522e-05, + "loss": 5.1793, + "loss/crossentropy": 1.5389113202691078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17611819133162498, + "step": 11330 + }, + { + "epoch": 0.9443333333333334, + "grad_norm": 4.9375, + "grad_norm_var": 0.07356770833333333, + "learning_rate": 1.01519787159517e-05, + "loss": 4.5413, + "loss/crossentropy": 1.4153800904750824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14668962359428406, + "step": 11332 + }, + { + "epoch": 0.9445, + "grad_norm": 4.71875, + "grad_norm_var": 0.08761393229166667, + "learning_rate": 1.0139406648841803e-05, + "loss": 4.8911, + "loss/crossentropy": 1.660580761730671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15345065668225288, + "step": 11334 + }, + { + "epoch": 0.9446666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.07526041666666666, + "learning_rate": 1.0126868781368162e-05, + "loss": 4.4777, + "loss/crossentropy": 1.9693571105599403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19380193203687668, + "step": 11336 + }, + { + "epoch": 0.9448333333333333, + "grad_norm": 4.8125, + "grad_norm_var": 0.07727864583333334, + "learning_rate": 1.0114365144466716e-05, + "loss": 5.1901, + "loss/crossentropy": 2.2114559710025787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22473587468266487, + "step": 11338 + }, + { + "epoch": 0.945, + "grad_norm": 4.9375, + "grad_norm_var": 0.07615559895833333, + "learning_rate": 1.0101895768988945e-05, + "loss": 4.8001, + "loss/crossentropy": 1.65935680270195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22352956980466843, + "step": 11340 + }, + { + "epoch": 0.9451666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.062890625, + "learning_rate": 1.0089460685701788e-05, + "loss": 4.7855, + "loss/crossentropy": 1.4414317682385445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1585723366588354, + "step": 11342 + }, + { + "epoch": 0.9453333333333334, + "grad_norm": 4.125, + "grad_norm_var": 0.069384765625, + "learning_rate": 1.007705992528759e-05, + "loss": 4.2898, + "loss/crossentropy": 1.8804664388298988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19146862998604774, + "step": 11344 + }, + { + "epoch": 0.9455, + "grad_norm": 4.59375, + "grad_norm_var": 0.07424723307291667, + "learning_rate": 1.0064693518343989e-05, + "loss": 4.4927, + "loss/crossentropy": 0.4384430721402168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.0893248300999403, + "step": 11346 + }, + { + "epoch": 0.9456666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.07740885416666667, + "learning_rate": 1.0052361495383862e-05, + "loss": 5.2412, + "loss/crossentropy": 1.9607902467250824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19718700274825096, + "step": 11348 + }, + { + "epoch": 0.9458333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.06324462890625, + "learning_rate": 1.0040063886835247e-05, + "loss": 5.3429, + "loss/crossentropy": 2.4686543345451355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21067270264029503, + "step": 11350 + }, + { + "epoch": 0.946, + "grad_norm": 4.65625, + "grad_norm_var": 0.067578125, + "learning_rate": 1.0027800723041284e-05, + "loss": 4.4446, + "loss/crossentropy": 2.339235484600067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19029224291443825, + "step": 11352 + }, + { + "epoch": 0.9461666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.07734375, + "learning_rate": 1.0015572034260092e-05, + "loss": 4.3474, + "loss/crossentropy": 0.8812186121940613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1112806610763073, + "step": 11354 + }, + { + "epoch": 0.9463333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.06470947265625, + "learning_rate": 1.0003377850664759e-05, + "loss": 5.0562, + "loss/crossentropy": 1.98075682669878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17196648381650448, + "step": 11356 + }, + { + "epoch": 0.9465, + "grad_norm": 4.46875, + "grad_norm_var": 0.07298177083333333, + "learning_rate": 9.991218202343211e-06, + "loss": 4.6137, + "loss/crossentropy": 2.13834910094738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17433346435427666, + "step": 11358 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.07316080729166667, + "learning_rate": 9.979093119298187e-06, + "loss": 5.0153, + "loss/crossentropy": 1.312939204275608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14639427699148655, + "step": 11360 + }, + { + "epoch": 0.9468333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.06573893229166666, + "learning_rate": 9.967002631447104e-06, + "loss": 4.6521, + "loss/crossentropy": 1.6801223307847977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17429677583277225, + "step": 11362 + }, + { + "epoch": 0.947, + "grad_norm": 4.5, + "grad_norm_var": 0.04957275390625, + "learning_rate": 9.954946768622056e-06, + "loss": 4.7481, + "loss/crossentropy": 1.2624147459864616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15043756738305092, + "step": 11364 + }, + { + "epoch": 0.9471666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.04495035807291667, + "learning_rate": 9.942925560569677e-06, + "loss": 4.8214, + "loss/crossentropy": 1.251896284520626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13010199926793575, + "step": 11366 + }, + { + "epoch": 0.9473333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.040087890625, + "learning_rate": 9.930939036951104e-06, + "loss": 4.4365, + "loss/crossentropy": 1.8336703404784203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19744067266583443, + "step": 11368 + }, + { + "epoch": 0.9475, + "grad_norm": 4.5, + "grad_norm_var": 0.03287353515625, + "learning_rate": 9.91898722734189e-06, + "loss": 5.1877, + "loss/crossentropy": 2.477368474006653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20827669277787209, + "step": 11370 + }, + { + "epoch": 0.9476666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.03606363932291667, + "learning_rate": 9.907070161231944e-06, + "loss": 4.8895, + "loss/crossentropy": 2.3553980588912964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20754562690854073, + "step": 11372 + }, + { + "epoch": 0.9478333333333333, + "grad_norm": 4.9375, + "grad_norm_var": 0.04159749348958333, + "learning_rate": 9.89518786802544e-06, + "loss": 5.143, + "loss/crossentropy": 2.1513184905052185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19551891088485718, + "step": 11374 + }, + { + "epoch": 0.948, + "grad_norm": 4.4375, + "grad_norm_var": 0.03570556640625, + "learning_rate": 9.883340377040752e-06, + "loss": 5.258, + "loss/crossentropy": 1.9037449285387993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17840158566832542, + "step": 11376 + }, + { + "epoch": 0.9481666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.03644205729166667, + "learning_rate": 9.871527717510382e-06, + "loss": 4.9532, + "loss/crossentropy": 2.183581203222275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22390246018767357, + "step": 11378 + }, + { + "epoch": 0.9483333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.03284098307291667, + "learning_rate": 9.859749918580906e-06, + "loss": 4.7762, + "loss/crossentropy": 2.2573187053203583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075618952512741, + "step": 11380 + }, + { + "epoch": 0.9485, + "grad_norm": 4.34375, + "grad_norm_var": 0.04192301432291667, + "learning_rate": 9.848007009312865e-06, + "loss": 4.7298, + "loss/crossentropy": 1.5737459063529968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046054407954216, + "step": 11382 + }, + { + "epoch": 0.9486666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.052144368489583336, + "learning_rate": 9.836299018680719e-06, + "loss": 4.4557, + "loss/crossentropy": 1.3081732392311096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14434547536075115, + "step": 11384 + }, + { + "epoch": 0.9488333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.05185139973958333, + "learning_rate": 9.82462597557277e-06, + "loss": 4.8961, + "loss/crossentropy": 2.0935000479221344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17151028290390968, + "step": 11386 + }, + { + "epoch": 0.949, + "grad_norm": 4.71875, + "grad_norm_var": 0.051806640625, + "learning_rate": 9.812987908791095e-06, + "loss": 5.3142, + "loss/crossentropy": 2.3687087893486023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20503158122301102, + "step": 11388 + }, + { + "epoch": 0.9491666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.03186442057291667, + "learning_rate": 9.80138484705147e-06, + "loss": 5.0832, + "loss/crossentropy": 2.0034788250923157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18592121824622154, + "step": 11390 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 4.15625, + "grad_norm_var": 0.04034830729166667, + "learning_rate": 9.78981681898329e-06, + "loss": 4.6754, + "loss/crossentropy": 2.1087347492575645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17885474488139153, + "step": 11392 + }, + { + "epoch": 0.9495, + "grad_norm": 4.53125, + "grad_norm_var": 0.058203125, + "learning_rate": 9.778283853129514e-06, + "loss": 4.678, + "loss/crossentropy": 1.7178971394896507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1700619626790285, + "step": 11394 + }, + { + "epoch": 0.9496666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.06832275390625, + "learning_rate": 9.766785977946597e-06, + "loss": 4.9998, + "loss/crossentropy": 1.9785993993282318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2348547950387001, + "step": 11396 + }, + { + "epoch": 0.9498333333333333, + "grad_norm": 5.03125, + "grad_norm_var": 0.07381184895833333, + "learning_rate": 9.75532322180439e-06, + "loss": 4.783, + "loss/crossentropy": 2.074269473552704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21151044219732285, + "step": 11398 + }, + { + "epoch": 0.95, + "grad_norm": 4.0625, + "grad_norm_var": 0.07394205729166667, + "learning_rate": 9.743895612986116e-06, + "loss": 4.217, + "loss/crossentropy": 1.7610245794057846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.170846126973629, + "step": 11400 + }, + { + "epoch": 0.9501666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.08157145182291667, + "learning_rate": 9.73250317968826e-06, + "loss": 4.9011, + "loss/crossentropy": 1.725033387541771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19924123957753181, + "step": 11402 + }, + { + "epoch": 0.9503333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.08974202473958333, + "learning_rate": 9.721145950020516e-06, + "loss": 4.3106, + "loss/crossentropy": 0.96659966558218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1814847458153963, + "step": 11404 + }, + { + "epoch": 0.9505, + "grad_norm": 4.34375, + "grad_norm_var": 0.10403645833333333, + "learning_rate": 9.70982395200572e-06, + "loss": 4.8238, + "loss/crossentropy": 2.1273521780967712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20532144233584404, + "step": 11406 + }, + { + "epoch": 0.9506666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.08800455729166666, + "learning_rate": 9.698537213579781e-06, + "loss": 4.8639, + "loss/crossentropy": 2.0131620913743973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16216029226779938, + "step": 11408 + }, + { + "epoch": 0.9508333333333333, + "grad_norm": 3.828125, + "grad_norm_var": 0.11118062337239583, + "learning_rate": 9.687285762591601e-06, + "loss": 3.844, + "loss/crossentropy": 1.404318891465664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15305296704173088, + "step": 11410 + }, + { + "epoch": 0.951, + "grad_norm": 4.5625, + "grad_norm_var": 0.11960347493489583, + "learning_rate": 9.676069626803016e-06, + "loss": 5.4016, + "loss/crossentropy": 2.029324918985367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23561448231339455, + "step": 11412 + }, + { + "epoch": 0.9511666666666667, + "grad_norm": 4.125, + "grad_norm_var": 0.1167144775390625, + "learning_rate": 9.664888833888724e-06, + "loss": 4.7174, + "loss/crossentropy": 2.380665957927704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15510449546854943, + "step": 11414 + }, + { + "epoch": 0.9513333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.10854390462239584, + "learning_rate": 9.653743411436227e-06, + "loss": 5.0389, + "loss/crossentropy": 2.1694408655166626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21396595984697342, + "step": 11416 + }, + { + "epoch": 0.9515, + "grad_norm": 4.25, + "grad_norm_var": 0.10562235514322917, + "learning_rate": 9.642633386945742e-06, + "loss": 4.4986, + "loss/crossentropy": 2.210035800933838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2272140271961689, + "step": 11418 + }, + { + "epoch": 0.9516666666666667, + "grad_norm": 5.125, + "grad_norm_var": 0.12150777180989583, + "learning_rate": 9.631558787830153e-06, + "loss": 4.8095, + "loss/crossentropy": 2.4195462465286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20426450297236443, + "step": 11420 + }, + { + "epoch": 0.9518333333333333, + "grad_norm": 4.1875, + "grad_norm_var": 0.1145416259765625, + "learning_rate": 9.620519641414926e-06, + "loss": 4.7143, + "loss/crossentropy": 2.499524176120758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20367558673024178, + "step": 11422 + }, + { + "epoch": 0.952, + "grad_norm": 4.53125, + "grad_norm_var": 0.1120513916015625, + "learning_rate": 9.609515974938064e-06, + "loss": 5.2156, + "loss/crossentropy": 2.4850030541419983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20476289093494415, + "step": 11424 + }, + { + "epoch": 0.9521666666666667, + "grad_norm": 4.8125, + "grad_norm_var": 0.08566080729166667, + "learning_rate": 9.59854781555002e-06, + "loss": 4.6327, + "loss/crossentropy": 1.6624226868152618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1814777236431837, + "step": 11426 + }, + { + "epoch": 0.9523333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 0.07615559895833333, + "learning_rate": 9.587615190313633e-06, + "loss": 4.4029, + "loss/crossentropy": 1.5843016058206558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17348378524184227, + "step": 11428 + }, + { + "epoch": 0.9525, + "grad_norm": 4.78125, + "grad_norm_var": 0.06573893229166666, + "learning_rate": 9.576718126204069e-06, + "loss": 4.5239, + "loss/crossentropy": 2.3472007513046265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20952258631587029, + "step": 11430 + }, + { + "epoch": 0.9526666666666667, + "grad_norm": 4.875, + "grad_norm_var": 0.06991780598958333, + "learning_rate": 9.565856650108758e-06, + "loss": 5.1015, + "loss/crossentropy": 1.7776538357138634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009373940527439, + "step": 11432 + }, + { + "epoch": 0.9528333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.06300455729166667, + "learning_rate": 9.555030788827302e-06, + "loss": 5.2163, + "loss/crossentropy": 1.7030307799577713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17652593553066254, + "step": 11434 + }, + { + "epoch": 0.953, + "grad_norm": 4.71875, + "grad_norm_var": 0.04342447916666667, + "learning_rate": 9.544240569071444e-06, + "loss": 4.8409, + "loss/crossentropy": 1.8213524222373962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17334122583270073, + "step": 11436 + }, + { + "epoch": 0.9531666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.034956868489583334, + "learning_rate": 9.533486017464979e-06, + "loss": 5.0472, + "loss/crossentropy": 2.128684014081955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20218900591135025, + "step": 11438 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 9.522767160543692e-06, + "loss": 5.3667, + "loss/crossentropy": 2.5350887775421143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2353416383266449, + "step": 11440 + }, + { + "epoch": 0.9535, + "grad_norm": 4.53125, + "grad_norm_var": 0.03162434895833333, + "learning_rate": 9.512084024755293e-06, + "loss": 4.6945, + "loss/crossentropy": 1.500010333955288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1599786952137947, + "step": 11442 + }, + { + "epoch": 0.9536666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.027046712239583333, + "learning_rate": 9.501436636459364e-06, + "loss": 4.7603, + "loss/crossentropy": 2.3232173323631287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2208048515021801, + "step": 11444 + }, + { + "epoch": 0.9538333333333333, + "grad_norm": 4.25, + "grad_norm_var": 0.03137613932291667, + "learning_rate": 9.490825021927276e-06, + "loss": 4.4929, + "loss/crossentropy": 1.404862955212593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14171510562300682, + "step": 11446 + }, + { + "epoch": 0.954, + "grad_norm": 5.03125, + "grad_norm_var": 0.0388671875, + "learning_rate": 9.48024920734213e-06, + "loss": 5.2693, + "loss/crossentropy": 1.9854433834552765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20603374019265175, + "step": 11448 + }, + { + "epoch": 0.9541666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.047900390625, + "learning_rate": 9.4697092187987e-06, + "loss": 4.7512, + "loss/crossentropy": 2.452913224697113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20292839780449867, + "step": 11450 + }, + { + "epoch": 0.9543333333333334, + "grad_norm": 4.59375, + "grad_norm_var": 0.04527587890625, + "learning_rate": 9.459205082303359e-06, + "loss": 4.8945, + "loss/crossentropy": 2.0005833134055138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19060774892568588, + "step": 11452 + }, + { + "epoch": 0.9545, + "grad_norm": 4.6875, + "grad_norm_var": 0.05230712890625, + "learning_rate": 9.44873682377402e-06, + "loss": 4.4555, + "loss/crossentropy": 1.4971669167280197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15944399684667587, + "step": 11454 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.04869384765625, + "learning_rate": 9.43830446904007e-06, + "loss": 5.0422, + "loss/crossentropy": 2.2370823323726654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956566758453846, + "step": 11456 + }, + { + "epoch": 0.9548333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.047119140625, + "learning_rate": 9.427908043842305e-06, + "loss": 4.706, + "loss/crossentropy": 2.28358057141304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20159728825092316, + "step": 11458 + }, + { + "epoch": 0.955, + "grad_norm": 4.40625, + "grad_norm_var": 0.04745686848958333, + "learning_rate": 9.417547573832876e-06, + "loss": 5.2039, + "loss/crossentropy": 2.2083005011081696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086111195385456, + "step": 11460 + }, + { + "epoch": 0.9551666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.048628743489583334, + "learning_rate": 9.40722308457521e-06, + "loss": 4.5374, + "loss/crossentropy": 2.4701362252235413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123345211148262, + "step": 11462 + }, + { + "epoch": 0.9553333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.02965087890625, + "learning_rate": 9.396934601543957e-06, + "loss": 4.9871, + "loss/crossentropy": 1.1636821255087852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1338321641087532, + "step": 11464 + }, + { + "epoch": 0.9555, + "grad_norm": 4.21875, + "grad_norm_var": 0.023893229166666665, + "learning_rate": 9.386682150124923e-06, + "loss": 4.9635, + "loss/crossentropy": 2.139076389372349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16577439568936825, + "step": 11466 + }, + { + "epoch": 0.9556666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.028999837239583333, + "learning_rate": 9.376465755615024e-06, + "loss": 4.4825, + "loss/crossentropy": 1.589639350771904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894889585673809, + "step": 11468 + }, + { + "epoch": 0.9558333333333333, + "grad_norm": 5.0, + "grad_norm_var": 0.043863932291666664, + "learning_rate": 9.366285443222183e-06, + "loss": 4.8079, + "loss/crossentropy": 1.9420047849416733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19465864449739456, + "step": 11470 + }, + { + "epoch": 0.956, + "grad_norm": 4.21875, + "grad_norm_var": 0.04607747395833333, + "learning_rate": 9.35614123806532e-06, + "loss": 4.5104, + "loss/crossentropy": 2.0238417387008667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953933723270893, + "step": 11472 + }, + { + "epoch": 0.9561666666666667, + "grad_norm": 6.25, + "grad_norm_var": 0.24599202473958334, + "learning_rate": 9.346033165174249e-06, + "loss": 5.0515, + "loss/crossentropy": 2.287261486053467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523414179682732, + "step": 11474 + }, + { + "epoch": 0.9563333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.24566650390625, + "learning_rate": 9.335961249489635e-06, + "loss": 4.8473, + "loss/crossentropy": 2.182397872209549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18900325149297714, + "step": 11476 + }, + { + "epoch": 0.9565, + "grad_norm": 4.5, + "grad_norm_var": 0.24120686848958334, + "learning_rate": 9.325925515862926e-06, + "loss": 5.0944, + "loss/crossentropy": 1.7978277504444122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19625483080744743, + "step": 11478 + }, + { + "epoch": 0.9566666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.24230143229166667, + "learning_rate": 9.315925989056303e-06, + "loss": 4.9761, + "loss/crossentropy": 2.6512285470962524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22339925542473793, + "step": 11480 + }, + { + "epoch": 0.9568333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.23248291015625, + "learning_rate": 9.305962693742601e-06, + "loss": 5.4635, + "loss/crossentropy": 2.1777456402778625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22406087815761566, + "step": 11482 + }, + { + "epoch": 0.957, + "grad_norm": 4.5625, + "grad_norm_var": 0.21929931640625, + "learning_rate": 9.296035654505261e-06, + "loss": 5.3445, + "loss/crossentropy": 1.466300867497921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1500567179173231, + "step": 11484 + }, + { + "epoch": 0.9571666666666667, + "grad_norm": 4.71875, + "grad_norm_var": 0.213134765625, + "learning_rate": 9.286144895838262e-06, + "loss": 5.4844, + "loss/crossentropy": 2.4218156337738037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662846207618713, + "step": 11486 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 4.34375, + "grad_norm_var": 0.20597330729166666, + "learning_rate": 9.276290442146075e-06, + "loss": 4.6464, + "loss/crossentropy": 1.5975105240941048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16215290501713753, + "step": 11488 + }, + { + "epoch": 0.9575, + "grad_norm": 4.625, + "grad_norm_var": 0.029427083333333333, + "learning_rate": 9.266472317743582e-06, + "loss": 4.6487, + "loss/crossentropy": 1.2481160312891006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13352661207318306, + "step": 11490 + }, + { + "epoch": 0.9576666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.027718098958333333, + "learning_rate": 9.256690546856028e-06, + "loss": 4.895, + "loss/crossentropy": 1.5689271241426468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847672201693058, + "step": 11492 + }, + { + "epoch": 0.9578333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.029231770833333334, + "learning_rate": 9.246945153618955e-06, + "loss": 5.2113, + "loss/crossentropy": 2.406019926071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23838234320282936, + "step": 11494 + }, + { + "epoch": 0.958, + "grad_norm": 4.65625, + "grad_norm_var": 0.032938639322916664, + "learning_rate": 9.237236162078162e-06, + "loss": 4.6259, + "loss/crossentropy": 2.4825395345687866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015017382800579, + "step": 11496 + }, + { + "epoch": 0.9581666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.033056640625, + "learning_rate": 9.227563596189619e-06, + "loss": 5.034, + "loss/crossentropy": 1.7402432262897491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.176887271925807, + "step": 11498 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.03631184895833333, + "learning_rate": 9.217927479819413e-06, + "loss": 4.3762, + "loss/crossentropy": 1.77406807243824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18127808719873428, + "step": 11500 + }, + { + "epoch": 0.9585, + "grad_norm": 4.0, + "grad_norm_var": 0.04517822265625, + "learning_rate": 9.208327836743711e-06, + "loss": 4.5285, + "loss/crossentropy": 1.7723116129636765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1636607013642788, + "step": 11502 + }, + { + "epoch": 0.9586666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.048502604166666664, + "learning_rate": 9.198764690648673e-06, + "loss": 4.4688, + "loss/crossentropy": 1.9188175573945045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1713660228997469, + "step": 11504 + }, + { + "epoch": 0.9588333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.04698893229166667, + "learning_rate": 9.189238065130415e-06, + "loss": 4.9859, + "loss/crossentropy": 2.25072905421257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18884330987930298, + "step": 11506 + }, + { + "epoch": 0.959, + "grad_norm": 4.4375, + "grad_norm_var": 0.046858723958333334, + "learning_rate": 9.179747983694935e-06, + "loss": 4.478, + "loss/crossentropy": 2.0359173715114594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1767810508608818, + "step": 11508 + }, + { + "epoch": 0.9591666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 9.170294469758068e-06, + "loss": 4.9137, + "loss/crossentropy": 1.9276671707630157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19128254242241383, + "step": 11510 + }, + { + "epoch": 0.9593333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.04195556640625, + "learning_rate": 9.16087754664542e-06, + "loss": 5.1605, + "loss/crossentropy": 2.04610376060009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933000460267067, + "step": 11512 + }, + { + "epoch": 0.9595, + "grad_norm": 4.75, + "grad_norm_var": 0.04224853515625, + "learning_rate": 9.151497237592314e-06, + "loss": 5.3041, + "loss/crossentropy": 2.4091951847076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2099679596722126, + "step": 11514 + }, + { + "epoch": 0.9596666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.047261555989583336, + "learning_rate": 9.142153565743724e-06, + "loss": 4.9227, + "loss/crossentropy": 1.4057952463626862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13871552795171738, + "step": 11516 + }, + { + "epoch": 0.9598333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.031119791666666667, + "learning_rate": 9.132846554154239e-06, + "loss": 4.6217, + "loss/crossentropy": 2.13298699259758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115863598883152, + "step": 11518 + }, + { + "epoch": 0.96, + "grad_norm": 4.375, + "grad_norm_var": 0.036702473958333336, + "learning_rate": 9.12357622578798e-06, + "loss": 4.3296, + "loss/crossentropy": 2.031085819005966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061699703335762, + "step": 11520 + }, + { + "epoch": 0.9601666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.060380045572916666, + "learning_rate": 9.114342603518563e-06, + "loss": 4.9346, + "loss/crossentropy": 1.6019954681396484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1720646657049656, + "step": 11522 + }, + { + "epoch": 0.9603333333333334, + "grad_norm": 5.0625, + "grad_norm_var": 0.077978515625, + "learning_rate": 9.105145710129028e-06, + "loss": 4.7582, + "loss/crossentropy": 0.9307103678584099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.11513753607869148, + "step": 11524 + }, + { + "epoch": 0.9605, + "grad_norm": 4.5625, + "grad_norm_var": 0.07886962890625, + "learning_rate": 9.095985568311806e-06, + "loss": 5.6277, + "loss/crossentropy": 2.263214409351349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19302452355623245, + "step": 11526 + }, + { + "epoch": 0.9606666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.0796875, + "learning_rate": 9.086862200668626e-06, + "loss": 4.5774, + "loss/crossentropy": 1.8522422462701797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1796627752482891, + "step": 11528 + }, + { + "epoch": 0.9608333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.07847900390625, + "learning_rate": 9.077775629710496e-06, + "loss": 4.8885, + "loss/crossentropy": 1.6978005468845367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16846343874931335, + "step": 11530 + }, + { + "epoch": 0.961, + "grad_norm": 4.96875, + "grad_norm_var": 0.0826171875, + "learning_rate": 9.068725877857623e-06, + "loss": 5.4411, + "loss/crossentropy": 2.5143747329711914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2164710983633995, + "step": 11532 + }, + { + "epoch": 0.9611666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.0802734375, + "learning_rate": 9.059712967439377e-06, + "loss": 4.7038, + "loss/crossentropy": 1.7296985238790512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15602321550250053, + "step": 11534 + }, + { + "epoch": 0.9613333333333334, + "grad_norm": 4.5, + "grad_norm_var": 0.07636311848958334, + "learning_rate": 9.050736920694208e-06, + "loss": 5.0484, + "loss/crossentropy": 2.0967109203338623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19041990116238594, + "step": 11536 + }, + { + "epoch": 0.9615, + "grad_norm": 4.21875, + "grad_norm_var": 0.06404622395833333, + "learning_rate": 9.041797759769628e-06, + "loss": 4.5043, + "loss/crossentropy": 2.400757282972336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213943500071764, + "step": 11538 + }, + { + "epoch": 0.9616666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.09295247395833334, + "learning_rate": 9.032895506722125e-06, + "loss": 5.113, + "loss/crossentropy": 2.5202205777168274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162243202328682, + "step": 11540 + }, + { + "epoch": 0.9618333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.11480712890625, + "learning_rate": 9.024030183517124e-06, + "loss": 4.9387, + "loss/crossentropy": 2.189281791448593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20729172229766846, + "step": 11542 + }, + { + "epoch": 0.962, + "grad_norm": 4.28125, + "grad_norm_var": 0.12498372395833333, + "learning_rate": 9.015201812028924e-06, + "loss": 4.2009, + "loss/crossentropy": 2.25010347366333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20368079841136932, + "step": 11544 + }, + { + "epoch": 0.9621666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.134375, + "learning_rate": 9.006410414040662e-06, + "loss": 4.8032, + "loss/crossentropy": 1.898626983165741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16156602837145329, + "step": 11546 + }, + { + "epoch": 0.9623333333333334, + "grad_norm": 4.28125, + "grad_norm_var": 0.13006184895833334, + "learning_rate": 8.997656011244232e-06, + "loss": 5.0192, + "loss/crossentropy": 2.262854278087616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19358738511800766, + "step": 11548 + }, + { + "epoch": 0.9625, + "grad_norm": 4.46875, + "grad_norm_var": 0.13123372395833333, + "learning_rate": 8.988938625240257e-06, + "loss": 4.6301, + "loss/crossentropy": 2.4176487922668457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21823088079690933, + "step": 11550 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.109619140625, + "learning_rate": 8.980258277538017e-06, + "loss": 5.0695, + "loss/crossentropy": 2.1023247241973877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19244016706943512, + "step": 11552 + }, + { + "epoch": 0.9628333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.10989176432291667, + "learning_rate": 8.971614989555408e-06, + "loss": 4.7315, + "loss/crossentropy": 1.242757223546505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13383663445711136, + "step": 11554 + }, + { + "epoch": 0.963, + "grad_norm": 4.3125, + "grad_norm_var": 0.04986979166666667, + "learning_rate": 8.963008782618887e-06, + "loss": 5.2009, + "loss/crossentropy": 1.6220801323652267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1612663622945547, + "step": 11556 + }, + { + "epoch": 0.9631666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.015169270833333333, + "learning_rate": 8.954439677963411e-06, + "loss": 4.6598, + "loss/crossentropy": 1.3346295356750488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1350972019135952, + "step": 11558 + }, + { + "epoch": 0.9633333333333334, + "grad_norm": 4.09375, + "grad_norm_var": 0.034228515625, + "learning_rate": 8.945907696732395e-06, + "loss": 4.761, + "loss/crossentropy": 1.9910719692707062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22182273492217064, + "step": 11560 + }, + { + "epoch": 0.9635, + "grad_norm": 4.6875, + "grad_norm_var": 0.03553059895833333, + "learning_rate": 8.937412859977653e-06, + "loss": 5.1157, + "loss/crossentropy": 2.031724736094475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.179793706163764, + "step": 11562 + }, + { + "epoch": 0.9636666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.04021809895833333, + "learning_rate": 8.928955188659353e-06, + "loss": 5.2434, + "loss/crossentropy": 1.5864408761262894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1709186527878046, + "step": 11564 + }, + { + "epoch": 0.9638333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.03974202473958333, + "learning_rate": 8.920534703645955e-06, + "loss": 4.3488, + "loss/crossentropy": 1.995065838098526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19275368005037308, + "step": 11566 + }, + { + "epoch": 0.964, + "grad_norm": 4.71875, + "grad_norm_var": 0.04778238932291667, + "learning_rate": 8.912151425714168e-06, + "loss": 4.2521, + "loss/crossentropy": 1.6531179696321487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17187139578163624, + "step": 11568 + }, + { + "epoch": 0.9641666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.07861226399739583, + "learning_rate": 8.903805375548904e-06, + "loss": 4.1241, + "loss/crossentropy": 1.9064756259322166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16551323048770428, + "step": 11570 + }, + { + "epoch": 0.9643333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.0802398681640625, + "learning_rate": 8.895496573743207e-06, + "loss": 5.0267, + "loss/crossentropy": 2.3756193816661835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20273981615900993, + "step": 11572 + }, + { + "epoch": 0.9645, + "grad_norm": 4.1875, + "grad_norm_var": 0.0854400634765625, + "learning_rate": 8.887225040798218e-06, + "loss": 4.7173, + "loss/crossentropy": 2.5865076184272766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206777635961771, + "step": 11574 + }, + { + "epoch": 0.9646666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.06680399576822917, + "learning_rate": 8.878990797123125e-06, + "loss": 5.0553, + "loss/crossentropy": 2.5730031728744507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20398882031440735, + "step": 11576 + }, + { + "epoch": 0.9648333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.10822652180989584, + "learning_rate": 8.870793863035105e-06, + "loss": 5.0463, + "loss/crossentropy": 1.767984189093113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17187254317104816, + "step": 11578 + }, + { + "epoch": 0.965, + "grad_norm": 4.46875, + "grad_norm_var": 0.11066792805989584, + "learning_rate": 8.862634258759277e-06, + "loss": 4.7057, + "loss/crossentropy": 1.9222623482346535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19420848414301872, + "step": 11580 + }, + { + "epoch": 0.9651666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.11227925618489583, + "learning_rate": 8.854512004428653e-06, + "loss": 4.6709, + "loss/crossentropy": 1.7106484100222588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16590185649693012, + "step": 11582 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.10446675618489583, + "learning_rate": 8.846427120084094e-06, + "loss": 4.9918, + "loss/crossentropy": 2.2791011333465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21460538357496262, + "step": 11584 + }, + { + "epoch": 0.9655, + "grad_norm": 4.25, + "grad_norm_var": 0.07740478515625, + "learning_rate": 8.838379625674243e-06, + "loss": 5.0301, + "loss/crossentropy": 1.9393450617790222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23692452535033226, + "step": 11586 + }, + { + "epoch": 0.9656666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.08001302083333334, + "learning_rate": 8.83036954105549e-06, + "loss": 4.935, + "loss/crossentropy": 1.323600873351097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1447231750935316, + "step": 11588 + }, + { + "epoch": 0.9658333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.08019205729166666, + "learning_rate": 8.822396885991927e-06, + "loss": 5.1718, + "loss/crossentropy": 2.269530236721039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282923273742199, + "step": 11590 + }, + { + "epoch": 0.966, + "grad_norm": 4.65625, + "grad_norm_var": 0.08072916666666667, + "learning_rate": 8.81446168015529e-06, + "loss": 5.4426, + "loss/crossentropy": 2.427491843700409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319641038775444, + "step": 11592 + }, + { + "epoch": 0.9661666666666666, + "grad_norm": 4.5625, + "grad_norm_var": 0.038895670572916666, + "learning_rate": 8.806563943124903e-06, + "loss": 4.935, + "loss/crossentropy": 1.817050889134407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16897720657289028, + "step": 11594 + }, + { + "epoch": 0.9663333333333334, + "grad_norm": 4.65625, + "grad_norm_var": 0.03253580729166667, + "learning_rate": 8.798703694387653e-06, + "loss": 4.9714, + "loss/crossentropy": 2.0594170689582825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18061714619398117, + "step": 11596 + }, + { + "epoch": 0.9665, + "grad_norm": 4.28125, + "grad_norm_var": 0.03372395833333333, + "learning_rate": 8.790880953337921e-06, + "loss": 4.8757, + "loss/crossentropy": 2.0173055678606033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18694764375686646, + "step": 11598 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.036572265625, + "learning_rate": 8.783095739277544e-06, + "loss": 4.7308, + "loss/crossentropy": 2.0797626599669456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18865588307380676, + "step": 11600 + }, + { + "epoch": 0.9668333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.03502604166666667, + "learning_rate": 8.775348071415762e-06, + "loss": 4.8921, + "loss/crossentropy": 1.6193000376224518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17745701409876347, + "step": 11602 + }, + { + "epoch": 0.967, + "grad_norm": 4.5625, + "grad_norm_var": 0.0333984375, + "learning_rate": 8.767637968869175e-06, + "loss": 5.0154, + "loss/crossentropy": 2.262195646762848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19456271454691887, + "step": 11604 + }, + { + "epoch": 0.9671666666666666, + "grad_norm": 4.09375, + "grad_norm_var": 0.043603515625, + "learning_rate": 8.759965450661698e-06, + "loss": 4.8588, + "loss/crossentropy": 1.9921872094273567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18706602230668068, + "step": 11606 + }, + { + "epoch": 0.9673333333333334, + "grad_norm": 4.5625, + "grad_norm_var": 0.04299723307291667, + "learning_rate": 8.752330535724502e-06, + "loss": 5.1061, + "loss/crossentropy": 1.6814751327037811, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17119668051600456, + "step": 11608 + }, + { + "epoch": 0.9675, + "grad_norm": 4.46875, + "grad_norm_var": 0.043603515625, + "learning_rate": 8.744733242895983e-06, + "loss": 5.0926, + "loss/crossentropy": 1.2483708560466766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13527445308864117, + "step": 11610 + }, + { + "epoch": 0.9676666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.042578125, + "learning_rate": 8.737173590921707e-06, + "loss": 4.8308, + "loss/crossentropy": 1.6758858039975166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1577359363436699, + "step": 11612 + }, + { + "epoch": 0.9678333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.03917643229166667, + "learning_rate": 8.729651598454359e-06, + "loss": 5.4152, + "loss/crossentropy": 1.8468017801642418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17877262830734253, + "step": 11614 + }, + { + "epoch": 0.968, + "grad_norm": 4.46875, + "grad_norm_var": 0.03763020833333333, + "learning_rate": 8.722167284053714e-06, + "loss": 4.6742, + "loss/crossentropy": 2.1274545565247536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16794894263148308, + "step": 11616 + }, + { + "epoch": 0.9681666666666666, + "grad_norm": 4.21875, + "grad_norm_var": 0.03189697265625, + "learning_rate": 8.71472066618657e-06, + "loss": 4.6825, + "loss/crossentropy": 2.20508149266243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075282223522663, + "step": 11618 + }, + { + "epoch": 0.9683333333333334, + "grad_norm": 4.75, + "grad_norm_var": 0.027274576822916667, + "learning_rate": 8.707311763226719e-06, + "loss": 4.2036, + "loss/crossentropy": 1.425516776740551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791730523109436, + "step": 11620 + }, + { + "epoch": 0.9685, + "grad_norm": 4.21875, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 8.699940593454892e-06, + "loss": 4.9838, + "loss/crossentropy": 2.060584656894207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1716004889458418, + "step": 11622 + }, + { + "epoch": 0.9686666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.027632649739583334, + "learning_rate": 8.692607175058713e-06, + "loss": 4.9354, + "loss/crossentropy": 1.3096114546060562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1387592125684023, + "step": 11624 + }, + { + "epoch": 0.9688333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.029280598958333334, + "learning_rate": 8.685311526132668e-06, + "loss": 4.7851, + "loss/crossentropy": 1.965927578508854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17507245764136314, + "step": 11626 + }, + { + "epoch": 0.969, + "grad_norm": 4.0625, + "grad_norm_var": 0.03958333333333333, + "learning_rate": 8.678053664678045e-06, + "loss": 4.3549, + "loss/crossentropy": 1.4416181147098541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15535733103752136, + "step": 11628 + }, + { + "epoch": 0.9691666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 8.670833608602895e-06, + "loss": 4.8076, + "loss/crossentropy": 1.90502218157053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777823492884636, + "step": 11630 + }, + { + "epoch": 0.9693333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.060835774739583334, + "learning_rate": 8.663651375721986e-06, + "loss": 4.7508, + "loss/crossentropy": 1.7098061069846153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836803499609232, + "step": 11632 + }, + { + "epoch": 0.9695, + "grad_norm": 4.71875, + "grad_norm_var": 0.05601806640625, + "learning_rate": 8.656506983756768e-06, + "loss": 5.2306, + "loss/crossentropy": 1.4646401852369308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14851868897676468, + "step": 11634 + }, + { + "epoch": 0.9696666666666667, + "grad_norm": 4.6875, + "grad_norm_var": 0.051590983072916666, + "learning_rate": 8.649400450335316e-06, + "loss": 5.0876, + "loss/crossentropy": 2.236992657184601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19888271763920784, + "step": 11636 + }, + { + "epoch": 0.9698333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 8.642331792992293e-06, + "loss": 5.0013, + "loss/crossentropy": 2.04328054189682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21399332210421562, + "step": 11638 + }, + { + "epoch": 0.97, + "grad_norm": 5.03125, + "grad_norm_var": 0.05803629557291667, + "learning_rate": 8.635301029168912e-06, + "loss": 4.7815, + "loss/crossentropy": 1.3681641593575478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13712685741484165, + "step": 11640 + }, + { + "epoch": 0.9701666666666666, + "grad_norm": 4.875, + "grad_norm_var": 0.06041259765625, + "learning_rate": 8.628308176212882e-06, + "loss": 5.1699, + "loss/crossentropy": 2.167069435119629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925915852189064, + "step": 11642 + }, + { + "epoch": 0.9703333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.050764973958333334, + "learning_rate": 8.62135325137837e-06, + "loss": 4.2311, + "loss/crossentropy": 1.7462330013513565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1746540553867817, + "step": 11644 + }, + { + "epoch": 0.9705, + "grad_norm": 4.1875, + "grad_norm_var": 0.0578125, + "learning_rate": 8.614436271825966e-06, + "loss": 4.8348, + "loss/crossentropy": 2.1338234543800354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19719984009861946, + "step": 11646 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.05416259765625, + "learning_rate": 8.607557254622627e-06, + "loss": 4.6722, + "loss/crossentropy": 2.0075062438845634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16331494599580765, + "step": 11648 + }, + { + "epoch": 0.9708333333333333, + "grad_norm": 4.75, + "grad_norm_var": 0.0587890625, + "learning_rate": 8.600716216741648e-06, + "loss": 4.8223, + "loss/crossentropy": 1.6360519081354141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850583702325821, + "step": 11650 + }, + { + "epoch": 0.971, + "grad_norm": 4.21875, + "grad_norm_var": 0.06848551432291666, + "learning_rate": 8.5939131750626e-06, + "loss": 4.8729, + "loss/crossentropy": 2.094428636133671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17381912097334862, + "step": 11652 + }, + { + "epoch": 0.9711666666666666, + "grad_norm": 4.125, + "grad_norm_var": 0.08971354166666666, + "learning_rate": 8.587148146371323e-06, + "loss": 4.4635, + "loss/crossentropy": 1.7716087624430656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17506087198853493, + "step": 11654 + }, + { + "epoch": 0.9713333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.07463785807291666, + "learning_rate": 8.580421147359846e-06, + "loss": 5.0966, + "loss/crossentropy": 2.5120007693767548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275298647582531, + "step": 11656 + }, + { + "epoch": 0.9715, + "grad_norm": 4.25, + "grad_norm_var": 0.07042643229166666, + "learning_rate": 8.573732194626374e-06, + "loss": 4.815, + "loss/crossentropy": 2.480922281742096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106558196246624, + "step": 11658 + }, + { + "epoch": 0.9716666666666667, + "grad_norm": 4.5625, + "grad_norm_var": 0.06243489583333333, + "learning_rate": 8.567081304675231e-06, + "loss": 5.0735, + "loss/crossentropy": 2.1009537279605865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20914742723107338, + "step": 11660 + }, + { + "epoch": 0.9718333333333333, + "grad_norm": 4.59375, + "grad_norm_var": 0.05310872395833333, + "learning_rate": 8.560468493916829e-06, + "loss": 5.1147, + "loss/crossentropy": 2.174714207649231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22449615225195885, + "step": 11662 + }, + { + "epoch": 0.972, + "grad_norm": 4.25, + "grad_norm_var": 0.05829671223958333, + "learning_rate": 8.553893778667619e-06, + "loss": 4.4634, + "loss/crossentropy": 1.4345918074250221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13629802502691746, + "step": 11664 + }, + { + "epoch": 0.9721666666666666, + "grad_norm": 4.09375, + "grad_norm_var": 0.059891764322916666, + "learning_rate": 8.54735717515006e-06, + "loss": 4.6073, + "loss/crossentropy": 1.4699408039450645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17171907797455788, + "step": 11666 + }, + { + "epoch": 0.9723333333333334, + "grad_norm": 4.8125, + "grad_norm_var": 0.05836181640625, + "learning_rate": 8.540858699492564e-06, + "loss": 5.0484, + "loss/crossentropy": 1.4010847359895706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19796455651521683, + "step": 11668 + }, + { + "epoch": 0.9725, + "grad_norm": 4.40625, + "grad_norm_var": 0.04322509765625, + "learning_rate": 8.534398367729485e-06, + "loss": 5.1173, + "loss/crossentropy": 1.8564397096633911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20212604105472565, + "step": 11670 + }, + { + "epoch": 0.9726666666666667, + "grad_norm": 4.375, + "grad_norm_var": 0.04351806640625, + "learning_rate": 8.52797619580104e-06, + "loss": 5.2406, + "loss/crossentropy": 1.4024172648787498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14744648337364197, + "step": 11672 + }, + { + "epoch": 0.9728333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.04501546223958333, + "learning_rate": 8.521592199553305e-06, + "loss": 4.743, + "loss/crossentropy": 1.6213370859622955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17050425335764885, + "step": 11674 + }, + { + "epoch": 0.973, + "grad_norm": 4.6875, + "grad_norm_var": 0.05325113932291667, + "learning_rate": 8.515246394738153e-06, + "loss": 4.9606, + "loss/crossentropy": 1.9720600247383118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22468455135822296, + "step": 11676 + }, + { + "epoch": 0.9731666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.059895833333333336, + "learning_rate": 8.50893879701323e-06, + "loss": 5.2049, + "loss/crossentropy": 2.5113608837127686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483315154910088, + "step": 11678 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 4.46875, + "grad_norm_var": 0.074853515625, + "learning_rate": 8.502669421941903e-06, + "loss": 4.406, + "loss/crossentropy": 1.8661476969718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17148572579026222, + "step": 11680 + }, + { + "epoch": 0.9735, + "grad_norm": 4.34375, + "grad_norm_var": 0.06451416015625, + "learning_rate": 8.496438284993235e-06, + "loss": 4.8, + "loss/crossentropy": 2.2413404658436775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972121000289917, + "step": 11682 + }, + { + "epoch": 0.9736666666666667, + "grad_norm": 4.0625, + "grad_norm_var": 0.07109375, + "learning_rate": 8.49024540154193e-06, + "loss": 4.3588, + "loss/crossentropy": 1.1721899956464767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14115168899297714, + "step": 11684 + }, + { + "epoch": 0.9738333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.056103515625, + "learning_rate": 8.484090786868324e-06, + "loss": 5.1475, + "loss/crossentropy": 2.327622562646866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22277278453111649, + "step": 11686 + }, + { + "epoch": 0.974, + "grad_norm": 4.375, + "grad_norm_var": 0.05767822265625, + "learning_rate": 8.47797445615831e-06, + "loss": 4.9333, + "loss/crossentropy": 1.6267950534820557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1809178777039051, + "step": 11688 + }, + { + "epoch": 0.9741666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.05972900390625, + "learning_rate": 8.471896424503321e-06, + "loss": 5.206, + "loss/crossentropy": 1.8035884648561478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15355466306209564, + "step": 11690 + }, + { + "epoch": 0.9743333333333334, + "grad_norm": 4.40625, + "grad_norm_var": 0.05240478515625, + "learning_rate": 8.465856706900305e-06, + "loss": 4.7138, + "loss/crossentropy": 2.031545266509056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16774150729179382, + "step": 11692 + }, + { + "epoch": 0.9745, + "grad_norm": 4.5, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 8.459855318251661e-06, + "loss": 5.1921, + "loss/crossentropy": 2.504208981990814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22938504815101624, + "step": 11694 + }, + { + "epoch": 0.9746666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.029227701822916667, + "learning_rate": 8.453892273365217e-06, + "loss": 4.7644, + "loss/crossentropy": 1.6472094357013702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16006597690284252, + "step": 11696 + }, + { + "epoch": 0.9748333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.03290608723958333, + "learning_rate": 8.447967586954199e-06, + "loss": 4.7048, + "loss/crossentropy": 1.7583764493465424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18461447581648827, + "step": 11698 + }, + { + "epoch": 0.975, + "grad_norm": 4.375, + "grad_norm_var": 0.02076416015625, + "learning_rate": 8.442081273637176e-06, + "loss": 5.0457, + "loss/crossentropy": 1.7519859299063683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17579527385532856, + "step": 11700 + }, + { + "epoch": 0.9751666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.023111979166666668, + "learning_rate": 8.436233347938044e-06, + "loss": 4.6854, + "loss/crossentropy": 1.455995261669159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16757206618785858, + "step": 11702 + }, + { + "epoch": 0.9753333333333334, + "grad_norm": 4.53125, + "grad_norm_var": 0.021317545572916666, + "learning_rate": 8.430423824285975e-06, + "loss": 4.6304, + "loss/crossentropy": 2.621677041053772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436899363994598, + "step": 11704 + }, + { + "epoch": 0.9755, + "grad_norm": 4.25, + "grad_norm_var": 0.030171712239583332, + "learning_rate": 8.424652717015399e-06, + "loss": 4.9472, + "loss/crossentropy": 2.6113321185112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2325909100472927, + "step": 11706 + }, + { + "epoch": 0.9756666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.026546223958333334, + "learning_rate": 8.41892004036594e-06, + "loss": 4.5195, + "loss/crossentropy": 1.959231823682785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18092519976198673, + "step": 11708 + }, + { + "epoch": 0.9758333333333333, + "grad_norm": 4.875, + "grad_norm_var": 0.03778889973958333, + "learning_rate": 8.413225808482412e-06, + "loss": 5.284, + "loss/crossentropy": 2.698577105998993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22032983228564262, + "step": 11710 + }, + { + "epoch": 0.976, + "grad_norm": 4.46875, + "grad_norm_var": 0.03720296223958333, + "learning_rate": 8.407570035414765e-06, + "loss": 4.8336, + "loss/crossentropy": 0.9326044321060181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12788059562444687, + "step": 11712 + }, + { + "epoch": 0.9761666666666666, + "grad_norm": 4.03125, + "grad_norm_var": 0.04550374348958333, + "learning_rate": 8.401952735118062e-06, + "loss": 4.3629, + "loss/crossentropy": 2.06018128991127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17738444730639458, + "step": 11714 + }, + { + "epoch": 0.9763333333333334, + "grad_norm": 4.625, + "grad_norm_var": 0.045182291666666666, + "learning_rate": 8.396373921452428e-06, + "loss": 5.0716, + "loss/crossentropy": 1.9189767017960548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18726221565157175, + "step": 11716 + }, + { + "epoch": 0.9765, + "grad_norm": 4.28125, + "grad_norm_var": 0.047379557291666666, + "learning_rate": 8.390833608183029e-06, + "loss": 5.023, + "loss/crossentropy": 2.3186798691749573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21441612765192986, + "step": 11718 + }, + { + "epoch": 0.9766666666666667, + "grad_norm": 5.5625, + "grad_norm_var": 0.12237955729166666, + "learning_rate": 8.385331808980042e-06, + "loss": 4.8558, + "loss/crossentropy": 1.8235585540533066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17181962355971336, + "step": 11720 + }, + { + "epoch": 0.9768333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.10935872395833333, + "learning_rate": 8.37986853741861e-06, + "loss": 4.9082, + "loss/crossentropy": 1.8868702054023743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18395415879786015, + "step": 11722 + }, + { + "epoch": 0.977, + "grad_norm": 4.09375, + "grad_norm_var": 0.12198893229166667, + "learning_rate": 8.374443806978809e-06, + "loss": 4.6495, + "loss/crossentropy": 2.3554630279541016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20914624631404877, + "step": 11724 + }, + { + "epoch": 0.9771666666666666, + "grad_norm": 4.34375, + "grad_norm_var": 0.14693603515625, + "learning_rate": 8.369057631045622e-06, + "loss": 4.9514, + "loss/crossentropy": 1.8347747921943665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17518590204417706, + "step": 11726 + }, + { + "epoch": 0.9773333333333334, + "grad_norm": 4.71875, + "grad_norm_var": 0.15188802083333333, + "learning_rate": 8.363710022908906e-06, + "loss": 5.1507, + "loss/crossentropy": 2.025658816099167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20820339396595955, + "step": 11728 + }, + { + "epoch": 0.9775, + "grad_norm": 4.6875, + "grad_norm_var": 0.13352864583333332, + "learning_rate": 8.358400995763352e-06, + "loss": 4.9741, + "loss/crossentropy": 2.234781265258789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21136652678251266, + "step": 11730 + }, + { + "epoch": 0.9776666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.13313802083333334, + "learning_rate": 8.353130562708451e-06, + "loss": 4.4836, + "loss/crossentropy": 2.3086537420749664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21127301827073097, + "step": 11732 + }, + { + "epoch": 0.9778333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.12784830729166666, + "learning_rate": 8.347898736748481e-06, + "loss": 4.8491, + "loss/crossentropy": 1.5660332068800926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19694342091679573, + "step": 11734 + }, + { + "epoch": 0.978, + "grad_norm": 4.78125, + "grad_norm_var": 0.06549072265625, + "learning_rate": 8.342705530792447e-06, + "loss": 4.9907, + "loss/crossentropy": 2.612446963787079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22936904057860374, + "step": 11736 + }, + { + "epoch": 0.9781666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.06923421223958333, + "learning_rate": 8.33755095765407e-06, + "loss": 5.0026, + "loss/crossentropy": 2.260433554649353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20796746760606766, + "step": 11738 + }, + { + "epoch": 0.9783333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.15533447265625, + "learning_rate": 8.332435030051747e-06, + "loss": 4.7093, + "loss/crossentropy": 2.0627638399600983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280513308942318, + "step": 11740 + }, + { + "epoch": 0.9785, + "grad_norm": 4.375, + "grad_norm_var": 0.13065999348958332, + "learning_rate": 8.327357760608522e-06, + "loss": 5.2809, + "loss/crossentropy": 1.2782742008566856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16142114251852036, + "step": 11742 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 4.46875, + "grad_norm_var": 0.12525634765625, + "learning_rate": 8.322319161852052e-06, + "loss": 5.4681, + "loss/crossentropy": 2.36602121591568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090701460838318, + "step": 11744 + }, + { + "epoch": 0.9788333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.12467447916666667, + "learning_rate": 8.317319246214578e-06, + "loss": 5.566, + "loss/crossentropy": 1.765766218304634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20317689701914787, + "step": 11746 + }, + { + "epoch": 0.979, + "grad_norm": 4.53125, + "grad_norm_var": 0.14269205729166667, + "learning_rate": 8.31235802603289e-06, + "loss": 4.831, + "loss/crossentropy": 2.3671552538871765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855028927326202, + "step": 11748 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.14560139973958333, + "learning_rate": 8.307435513548314e-06, + "loss": 4.8928, + "loss/crossentropy": 2.134007513523102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18522262014448643, + "step": 11750 + }, + { + "epoch": 0.9793333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.140087890625, + "learning_rate": 8.302551720906658e-06, + "loss": 5.2723, + "loss/crossentropy": 2.4626063108444214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22433257102966309, + "step": 11752 + }, + { + "epoch": 0.9795, + "grad_norm": 4.9375, + "grad_norm_var": 0.139697265625, + "learning_rate": 8.297706660158189e-06, + "loss": 4.5625, + "loss/crossentropy": 1.8229105174541473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18363786488771439, + "step": 11754 + }, + { + "epoch": 0.9796666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.04208577473958333, + "learning_rate": 8.29290034325762e-06, + "loss": 4.7565, + "loss/crossentropy": 2.0099611580371857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18299407325685024, + "step": 11756 + }, + { + "epoch": 0.9798333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.03990885416666667, + "learning_rate": 8.288132782064057e-06, + "loss": 4.8898, + "loss/crossentropy": 1.8129331469535828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15079554915428162, + "step": 11758 + }, + { + "epoch": 0.98, + "grad_norm": 4.59375, + "grad_norm_var": 0.039453125, + "learning_rate": 8.283403988340983e-06, + "loss": 5.0303, + "loss/crossentropy": 2.317029356956482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21258693933486938, + "step": 11760 + }, + { + "epoch": 0.9801666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.04296468098958333, + "learning_rate": 8.278713973756227e-06, + "loss": 5.0208, + "loss/crossentropy": 2.253770500421524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18931128084659576, + "step": 11762 + }, + { + "epoch": 0.9803333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.027197265625, + "learning_rate": 8.274062749881934e-06, + "loss": 5.3963, + "loss/crossentropy": 2.4488985538482666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19674066081643105, + "step": 11764 + }, + { + "epoch": 0.9805, + "grad_norm": 5.3125, + "grad_norm_var": 0.06139322916666667, + "learning_rate": 8.269450328194538e-06, + "loss": 4.1609, + "loss/crossentropy": 1.3078017458319664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14418474957346916, + "step": 11766 + }, + { + "epoch": 0.9806666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.07688395182291667, + "learning_rate": 8.264876720074727e-06, + "loss": 5.0368, + "loss/crossentropy": 2.1359574496746063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973453313112259, + "step": 11768 + }, + { + "epoch": 0.9808333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.06926676432291666, + "learning_rate": 8.260341936807425e-06, + "loss": 4.7631, + "loss/crossentropy": 2.2247210144996643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20966701954603195, + "step": 11770 + }, + { + "epoch": 0.981, + "grad_norm": 4.75, + "grad_norm_var": 0.07185872395833333, + "learning_rate": 8.255845989581765e-06, + "loss": 4.5269, + "loss/crossentropy": 1.9601154178380966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16586436331272125, + "step": 11772 + }, + { + "epoch": 0.9811666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.07239583333333334, + "learning_rate": 8.251388889491044e-06, + "loss": 4.6656, + "loss/crossentropy": 1.9562703296542168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190062141045928, + "step": 11774 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.07121988932291666, + "learning_rate": 8.246970647532716e-06, + "loss": 5.3309, + "loss/crossentropy": 2.1067320704460144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069613099098206, + "step": 11776 + }, + { + "epoch": 0.9815, + "grad_norm": 4.6875, + "grad_norm_var": 0.06979166666666667, + "learning_rate": 8.242591274608351e-06, + "loss": 4.9249, + "loss/crossentropy": 2.6109946966171265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355751365423203, + "step": 11778 + }, + { + "epoch": 0.9816666666666667, + "grad_norm": 4.75, + "grad_norm_var": 0.068212890625, + "learning_rate": 8.23825078152362e-06, + "loss": 5.3303, + "loss/crossentropy": 2.306352376937866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22947991266846657, + "step": 11780 + }, + { + "epoch": 0.9818333333333333, + "grad_norm": 4.46875, + "grad_norm_var": 0.03203125, + "learning_rate": 8.233949178988255e-06, + "loss": 4.8106, + "loss/crossentropy": 2.0892684012651443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19318658113479614, + "step": 11782 + }, + { + "epoch": 0.982, + "grad_norm": 4.46875, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 8.229686477616033e-06, + "loss": 5.1217, + "loss/crossentropy": 2.2837354838848114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23053519800305367, + "step": 11784 + }, + { + "epoch": 0.9821666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.022456868489583334, + "learning_rate": 8.225462687924748e-06, + "loss": 4.5143, + "loss/crossentropy": 1.8091852068901062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16203287802636623, + "step": 11786 + }, + { + "epoch": 0.9823333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 8.22127782033618e-06, + "loss": 5.1684, + "loss/crossentropy": 1.8546411916613579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17528345808386803, + "step": 11788 + }, + { + "epoch": 0.9825, + "grad_norm": 4.4375, + "grad_norm_var": 0.032938639322916664, + "learning_rate": 8.217131885176074e-06, + "loss": 4.6615, + "loss/crossentropy": 1.4845838844776154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1486959345638752, + "step": 11790 + }, + { + "epoch": 0.9826666666666667, + "grad_norm": 4.4375, + "grad_norm_var": 0.033003743489583334, + "learning_rate": 8.213024892674113e-06, + "loss": 5.1357, + "loss/crossentropy": 1.778126172721386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207093708217144, + "step": 11792 + }, + { + "epoch": 0.9828333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.034895833333333334, + "learning_rate": 8.208956852963892e-06, + "loss": 4.4221, + "loss/crossentropy": 2.1073838770389557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19880036637187004, + "step": 11794 + }, + { + "epoch": 0.983, + "grad_norm": 4.21875, + "grad_norm_var": 0.039778645833333334, + "learning_rate": 8.204927776082895e-06, + "loss": 4.1364, + "loss/crossentropy": 1.9823874160647392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16754209622740746, + "step": 11796 + }, + { + "epoch": 0.9831666666666666, + "grad_norm": 5.0625, + "grad_norm_var": 0.07884114583333333, + "learning_rate": 8.200937671972468e-06, + "loss": 4.7407, + "loss/crossentropy": 0.9149458408355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16483060829341412, + "step": 11798 + }, + { + "epoch": 0.9833333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.07862955729166667, + "learning_rate": 8.1969865504778e-06, + "loss": 4.5806, + "loss/crossentropy": 1.7888997569680214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17346928641200066, + "step": 11800 + }, + { + "epoch": 0.9835, + "grad_norm": 4.21875, + "grad_norm_var": 0.08186442057291667, + "learning_rate": 8.193074421347883e-06, + "loss": 4.7757, + "loss/crossentropy": 1.3748513013124466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13545435294508934, + "step": 11802 + }, + { + "epoch": 0.9836666666666667, + "grad_norm": 4.3125, + "grad_norm_var": 0.087744140625, + "learning_rate": 8.189201294235514e-06, + "loss": 4.6596, + "loss/crossentropy": 1.8794011771678925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18594608083367348, + "step": 11804 + }, + { + "epoch": 0.9838333333333333, + "grad_norm": 4.625, + "grad_norm_var": 0.07935791015625, + "learning_rate": 8.185367178697244e-06, + "loss": 4.6955, + "loss/crossentropy": 2.4637969732284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970806047320366, + "step": 11806 + }, + { + "epoch": 0.984, + "grad_norm": 4.375, + "grad_norm_var": 0.085009765625, + "learning_rate": 8.181572084193377e-06, + "loss": 5.6585, + "loss/crossentropy": 2.1041803061962128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20276985503733158, + "step": 11808 + }, + { + "epoch": 0.9841666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.08043212890625, + "learning_rate": 8.177816020087929e-06, + "loss": 4.4479, + "loss/crossentropy": 1.4653847217559814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1403233241289854, + "step": 11810 + }, + { + "epoch": 0.9843333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.070556640625, + "learning_rate": 8.174098995648613e-06, + "loss": 4.712, + "loss/crossentropy": 1.5452167689800262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.12613629549741745, + "step": 11812 + }, + { + "epoch": 0.9845, + "grad_norm": 4.5, + "grad_norm_var": 0.030192057291666668, + "learning_rate": 8.170421020046818e-06, + "loss": 5.3324, + "loss/crossentropy": 2.5659364461898804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20733560249209404, + "step": 11814 + }, + { + "epoch": 0.9846666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.030582682291666666, + "learning_rate": 8.166782102357586e-06, + "loss": 4.9242, + "loss/crossentropy": 2.5664992928504944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21779290586709976, + "step": 11816 + }, + { + "epoch": 0.9848333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.02447509765625, + "learning_rate": 8.163182251559582e-06, + "loss": 4.7698, + "loss/crossentropy": 1.7134768441319466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17407608777284622, + "step": 11818 + }, + { + "epoch": 0.985, + "grad_norm": 4.21875, + "grad_norm_var": 0.021419270833333334, + "learning_rate": 8.15962147653508e-06, + "loss": 4.6578, + "loss/crossentropy": 1.5287619307637215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948755346238613, + "step": 11820 + }, + { + "epoch": 0.9851666666666666, + "grad_norm": 5.09375, + "grad_norm_var": 0.04247639973958333, + "learning_rate": 8.15609978606994e-06, + "loss": 4.8483, + "loss/crossentropy": 1.825168825685978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16270102560520172, + "step": 11822 + }, + { + "epoch": 0.9853333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.044596354166666664, + "learning_rate": 8.152617188853582e-06, + "loss": 5.1059, + "loss/crossentropy": 2.337290108203888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2274259254336357, + "step": 11824 + }, + { + "epoch": 0.9855, + "grad_norm": 4.4375, + "grad_norm_var": 0.049702962239583336, + "learning_rate": 8.149173693478968e-06, + "loss": 4.7688, + "loss/crossentropy": 2.4057921767234802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579170435667038, + "step": 11826 + }, + { + "epoch": 0.9856666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.055859375, + "learning_rate": 8.145769308442583e-06, + "loss": 4.8138, + "loss/crossentropy": 2.0226185023784637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22264505177736282, + "step": 11828 + }, + { + "epoch": 0.9858333333333333, + "grad_norm": 4.15625, + "grad_norm_var": 0.06705729166666667, + "learning_rate": 8.142404042144405e-06, + "loss": 4.9318, + "loss/crossentropy": 2.267892837524414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211035192012787, + "step": 11830 + }, + { + "epoch": 0.986, + "grad_norm": 4.15625, + "grad_norm_var": 0.07336832682291666, + "learning_rate": 8.139077902887897e-06, + "loss": 4.3544, + "loss/crossentropy": 1.4533291533589363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16291098482906818, + "step": 11832 + }, + { + "epoch": 0.9861666666666666, + "grad_norm": 4.5, + "grad_norm_var": 0.12278645833333333, + "learning_rate": 8.135790898879973e-06, + "loss": 5.3159, + "loss/crossentropy": 1.5565712675452232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15048514120280743, + "step": 11834 + }, + { + "epoch": 0.9863333333333333, + "grad_norm": 5.9375, + "grad_norm_var": 0.22107747395833333, + "learning_rate": 8.132543038230996e-06, + "loss": 5.0178, + "loss/crossentropy": 2.1774487793445587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21005262434482574, + "step": 11836 + }, + { + "epoch": 0.9865, + "grad_norm": 4.84375, + "grad_norm_var": 0.22655843098958334, + "learning_rate": 8.129334328954733e-06, + "loss": 4.978, + "loss/crossentropy": 2.3862339854240417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916748955845833, + "step": 11838 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.232666015625, + "learning_rate": 8.126164778968358e-06, + "loss": 4.9298, + "loss/crossentropy": 2.2145788967609406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891205869615078, + "step": 11840 + }, + { + "epoch": 0.9868333333333333, + "grad_norm": 4.3125, + "grad_norm_var": 0.22823893229166667, + "learning_rate": 8.123034396092415e-06, + "loss": 4.7015, + "loss/crossentropy": 2.1209593415260315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21985666453838348, + "step": 11842 + }, + { + "epoch": 0.987, + "grad_norm": 4.5, + "grad_norm_var": 0.22784830729166666, + "learning_rate": 8.119943188050822e-06, + "loss": 4.8956, + "loss/crossentropy": 2.3089587688446045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974850781261921, + "step": 11844 + }, + { + "epoch": 0.9871666666666666, + "grad_norm": 4.125, + "grad_norm_var": 0.235400390625, + "learning_rate": 8.116891162470822e-06, + "loss": 4.3814, + "loss/crossentropy": 2.284092366695404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19774354994297028, + "step": 11846 + }, + { + "epoch": 0.9873333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.22551676432291667, + "learning_rate": 8.113878326882984e-06, + "loss": 4.7244, + "loss/crossentropy": 2.0912757217884064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20107288658618927, + "step": 11848 + }, + { + "epoch": 0.9875, + "grad_norm": 4.3125, + "grad_norm_var": 0.18136393229166667, + "learning_rate": 8.110904688721181e-06, + "loss": 5.111, + "loss/crossentropy": 1.7536583244800568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13841561134904623, + "step": 11850 + }, + { + "epoch": 0.9876666666666667, + "grad_norm": 4.78125, + "grad_norm_var": 0.04576416015625, + "learning_rate": 8.107970255322572e-06, + "loss": 5.3386, + "loss/crossentropy": 2.4679291248321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21784574910998344, + "step": 11852 + }, + { + "epoch": 0.9878333333333333, + "grad_norm": 4.0625, + "grad_norm_var": 0.04595947265625, + "learning_rate": 8.105075033927576e-06, + "loss": 4.7901, + "loss/crossentropy": 2.679027020931244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104400470852852, + "step": 11854 + }, + { + "epoch": 0.988, + "grad_norm": 4.5, + "grad_norm_var": 0.044755045572916666, + "learning_rate": 8.102219031679866e-06, + "loss": 4.7311, + "loss/crossentropy": 2.548813045024872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21962737292051315, + "step": 11856 + }, + { + "epoch": 0.9881666666666666, + "grad_norm": 4.8125, + "grad_norm_var": 0.051041666666666666, + "learning_rate": 8.099402255626345e-06, + "loss": 4.8849, + "loss/crossentropy": 2.0113202035427094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012788474559784, + "step": 11858 + }, + { + "epoch": 0.9883333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.06373291015625, + "learning_rate": 8.096624712717127e-06, + "loss": 5.0429, + "loss/crossentropy": 2.3392894864082336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20352452620863914, + "step": 11860 + }, + { + "epoch": 0.9885, + "grad_norm": 4.65625, + "grad_norm_var": 0.0501953125, + "learning_rate": 8.09388640980552e-06, + "loss": 5.4598, + "loss/crossentropy": 2.566970646381378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093185856938362, + "step": 11862 + }, + { + "epoch": 0.9886666666666667, + "grad_norm": 4.53125, + "grad_norm_var": 0.04742431640625, + "learning_rate": 8.091187353648018e-06, + "loss": 5.002, + "loss/crossentropy": 1.5040106773376465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845961958169937, + "step": 11864 + }, + { + "epoch": 0.9888333333333333, + "grad_norm": 4.5625, + "grad_norm_var": 0.04479166666666667, + "learning_rate": 8.088527550904274e-06, + "loss": 4.7993, + "loss/crossentropy": 1.3132527843117714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15961330942809582, + "step": 11866 + }, + { + "epoch": 0.989, + "grad_norm": 4.53125, + "grad_norm_var": 0.042801920572916666, + "learning_rate": 8.085907008137084e-06, + "loss": 4.6958, + "loss/crossentropy": 1.2898187711834908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15488006733357906, + "step": 11868 + }, + { + "epoch": 0.9891666666666666, + "grad_norm": 4.40625, + "grad_norm_var": 0.04114583333333333, + "learning_rate": 8.083325731812376e-06, + "loss": 5.077, + "loss/crossentropy": 2.087516203522682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222130000591278, + "step": 11870 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 4.84375, + "grad_norm_var": 0.04973551432291667, + "learning_rate": 8.080783728299198e-06, + "loss": 4.488, + "loss/crossentropy": 1.3223537430167198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1429068874567747, + "step": 11872 + }, + { + "epoch": 0.9895, + "grad_norm": 4.65625, + "grad_norm_var": 0.044775390625, + "learning_rate": 8.078281003869689e-06, + "loss": 5.2082, + "loss/crossentropy": 2.4864302277565002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21986323222517967, + "step": 11874 + }, + { + "epoch": 0.9896666666666667, + "grad_norm": 4.28125, + "grad_norm_var": 0.04149983723958333, + "learning_rate": 8.075817564699068e-06, + "loss": 4.9471, + "loss/crossentropy": 2.1714051365852356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23419787362217903, + "step": 11876 + }, + { + "epoch": 0.9898333333333333, + "grad_norm": 4.375, + "grad_norm_var": 0.043192545572916664, + "learning_rate": 8.07339341686563e-06, + "loss": 4.5392, + "loss/crossentropy": 1.5454725325107574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15994950011372566, + "step": 11878 + }, + { + "epoch": 0.99, + "grad_norm": 4.5, + "grad_norm_var": 0.04659830729166667, + "learning_rate": 8.071008566350721e-06, + "loss": 4.6596, + "loss/crossentropy": 1.6750903725624084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18367098458111286, + "step": 11880 + }, + { + "epoch": 0.9901666666666666, + "grad_norm": 4.375, + "grad_norm_var": 0.0462890625, + "learning_rate": 8.068663019038719e-06, + "loss": 4.7149, + "loss/crossentropy": 1.3502802401781082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.13940864615142345, + "step": 11882 + }, + { + "epoch": 0.9903333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.047379557291666666, + "learning_rate": 8.066356780717031e-06, + "loss": 4.8176, + "loss/crossentropy": 1.3339405804872513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1357364384457469, + "step": 11884 + }, + { + "epoch": 0.9905, + "grad_norm": 4.75, + "grad_norm_var": 0.03388264973958333, + "learning_rate": 8.064089857076067e-06, + "loss": 5.0944, + "loss/crossentropy": 2.2128437161445618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21685703843832016, + "step": 11886 + }, + { + "epoch": 0.9906666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.024723307291666666, + "learning_rate": 8.06186225370924e-06, + "loss": 4.9089, + "loss/crossentropy": 2.279437929391861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22451356425881386, + "step": 11888 + }, + { + "epoch": 0.9908333333333333, + "grad_norm": 4.53125, + "grad_norm_var": 0.024723307291666666, + "learning_rate": 8.059673976112941e-06, + "loss": 5.402, + "loss/crossentropy": 2.281008332967758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916458159685135, + "step": 11890 + }, + { + "epoch": 0.991, + "grad_norm": 4.34375, + "grad_norm_var": 0.025113932291666665, + "learning_rate": 8.057525029686523e-06, + "loss": 5.0482, + "loss/crossentropy": 2.2194367945194244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23771441355347633, + "step": 11892 + }, + { + "epoch": 0.9911666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.030192057291666668, + "learning_rate": 8.055415419732298e-06, + "loss": 4.617, + "loss/crossentropy": 2.204625815153122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21681509166955948, + "step": 11894 + }, + { + "epoch": 0.9913333333333333, + "grad_norm": 4.5, + "grad_norm_var": 0.026493326822916666, + "learning_rate": 8.053345151455523e-06, + "loss": 5.319, + "loss/crossentropy": 2.290549635887146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23177609220147133, + "step": 11896 + }, + { + "epoch": 0.9915, + "grad_norm": 4.3125, + "grad_norm_var": 0.029520670572916668, + "learning_rate": 8.051314229964375e-06, + "loss": 4.6101, + "loss/crossentropy": 1.993983969092369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18598743341863155, + "step": 11898 + }, + { + "epoch": 0.9916666666666667, + "grad_norm": 4.0, + "grad_norm_var": 0.04659830729166667, + "learning_rate": 8.049322660269954e-06, + "loss": 4.5727, + "loss/crossentropy": 1.6651684641838074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17348604835569859, + "step": 11900 + }, + { + "epoch": 0.9918333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.063134765625, + "learning_rate": 8.047370447286258e-06, + "loss": 4.925, + "loss/crossentropy": 1.5045694410800934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14359690621495247, + "step": 11902 + }, + { + "epoch": 0.992, + "grad_norm": 4.4375, + "grad_norm_var": 0.05917561848958333, + "learning_rate": 8.045457595830179e-06, + "loss": 4.8642, + "loss/crossentropy": 2.3525500893592834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19554975256323814, + "step": 11904 + }, + { + "epoch": 0.9921666666666666, + "grad_norm": 4.4375, + "grad_norm_var": 0.05792643229166667, + "learning_rate": 8.043584110621488e-06, + "loss": 5.0682, + "loss/crossentropy": 2.599298894405365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20872588828206062, + "step": 11906 + }, + { + "epoch": 0.9923333333333333, + "grad_norm": 4.71875, + "grad_norm_var": 0.055985514322916666, + "learning_rate": 8.041749996282821e-06, + "loss": 4.9446, + "loss/crossentropy": 1.7182498648762703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21588006988167763, + "step": 11908 + }, + { + "epoch": 0.9925, + "grad_norm": 4.6875, + "grad_norm_var": 0.05523681640625, + "learning_rate": 8.03995525733968e-06, + "loss": 4.5029, + "loss/crossentropy": 2.0742194950580597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20126686617732048, + "step": 11910 + }, + { + "epoch": 0.9926666666666667, + "grad_norm": 4.25, + "grad_norm_var": 0.09256184895833333, + "learning_rate": 8.038199898220398e-06, + "loss": 4.5741, + "loss/crossentropy": 1.3627977594733238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1341603621840477, + "step": 11912 + }, + { + "epoch": 0.9928333333333333, + "grad_norm": 4.125, + "grad_norm_var": 0.10745035807291667, + "learning_rate": 8.036483923256152e-06, + "loss": 4.5946, + "loss/crossentropy": 2.560191512107849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274262875318527, + "step": 11914 + }, + { + "epoch": 0.993, + "grad_norm": 4.34375, + "grad_norm_var": 0.08928629557291666, + "learning_rate": 8.034807336680938e-06, + "loss": 4.7074, + "loss/crossentropy": 1.5181104466319084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1560316327959299, + "step": 11916 + }, + { + "epoch": 0.9931666666666666, + "grad_norm": 4.46875, + "grad_norm_var": 0.07154541015625, + "learning_rate": 8.033170142631567e-06, + "loss": 5.451, + "loss/crossentropy": 1.7161678597331047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1688497867435217, + "step": 11918 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.07255452473958333, + "learning_rate": 8.031572345147655e-06, + "loss": 4.7851, + "loss/crossentropy": 1.6516497433185577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17710182815790176, + "step": 11920 + }, + { + "epoch": 0.9935, + "grad_norm": 4.46875, + "grad_norm_var": 0.07125244140625, + "learning_rate": 8.030013948171608e-06, + "loss": 5.3578, + "loss/crossentropy": 2.3381210267543793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103869989514351, + "step": 11922 + }, + { + "epoch": 0.9936666666666667, + "grad_norm": 4.65625, + "grad_norm_var": 0.06998291015625, + "learning_rate": 8.028494955548613e-06, + "loss": 4.996, + "loss/crossentropy": 2.2223448157310486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23018431290984154, + "step": 11924 + }, + { + "epoch": 0.9938333333333333, + "grad_norm": 4.65625, + "grad_norm_var": 0.06783447265625, + "learning_rate": 8.027015371026635e-06, + "loss": 5.0702, + "loss/crossentropy": 2.454653322696686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2261667139828205, + "step": 11926 + }, + { + "epoch": 0.994, + "grad_norm": 4.46875, + "grad_norm_var": 0.031966145833333334, + "learning_rate": 8.025575198256401e-06, + "loss": 4.3612, + "loss/crossentropy": 1.864028476178646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18934139795601368, + "step": 11928 + }, + { + "epoch": 0.9941666666666666, + "grad_norm": 4.78125, + "grad_norm_var": 0.016341145833333334, + "learning_rate": 8.024174440791395e-06, + "loss": 4.9677, + "loss/crossentropy": 2.5450727939605713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21843239665031433, + "step": 11930 + }, + { + "epoch": 0.9943333333333333, + "grad_norm": 4.34375, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 8.022813102087846e-06, + "loss": 5.0552, + "loss/crossentropy": 2.4372522234916687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21206426993012428, + "step": 11932 + }, + { + "epoch": 0.9945, + "grad_norm": 4.75, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 8.021491185504721e-06, + "loss": 5.1754, + "loss/crossentropy": 2.269777476787567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21210772916674614, + "step": 11934 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 4.5, + "grad_norm_var": 0.017118326822916665, + "learning_rate": 8.020208694303722e-06, + "loss": 5.0094, + "loss/crossentropy": 1.7897434085607529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16236883774399757, + "step": 11936 + }, + { + "epoch": 0.9948333333333333, + "grad_norm": 4.28125, + "grad_norm_var": 0.030171712239583332, + "learning_rate": 8.018965631649264e-06, + "loss": 3.9853, + "loss/crossentropy": 1.755036287009716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18948844075202942, + "step": 11938 + }, + { + "epoch": 0.995, + "grad_norm": 4.125, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 8.017762000608482e-06, + "loss": 4.078, + "loss/crossentropy": 1.7440339028835297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1710295006632805, + "step": 11940 + }, + { + "epoch": 0.9951666666666666, + "grad_norm": 4.53125, + "grad_norm_var": 0.0333984375, + "learning_rate": 8.016597804151215e-06, + "loss": 5.1789, + "loss/crossentropy": 2.2095680236816406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18934720009565353, + "step": 11942 + }, + { + "epoch": 0.9953333333333333, + "grad_norm": 5.09375, + "grad_norm_var": 0.060009765625, + "learning_rate": 8.015473045150006e-06, + "loss": 5.272, + "loss/crossentropy": 2.004868745803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18382899090647697, + "step": 11944 + }, + { + "epoch": 0.9955, + "grad_norm": 4.84375, + "grad_norm_var": 0.06287434895833334, + "learning_rate": 8.014387726380082e-06, + "loss": 5.0277, + "loss/crossentropy": 1.9194257259368896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17790965735912323, + "step": 11946 + }, + { + "epoch": 0.9956666666666667, + "grad_norm": 4.59375, + "grad_norm_var": 0.06617431640625, + "learning_rate": 8.013341850519359e-06, + "loss": 4.9612, + "loss/crossentropy": 1.963321976363659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1870459709316492, + "step": 11948 + }, + { + "epoch": 0.9958333333333333, + "grad_norm": 4.6875, + "grad_norm_var": 0.06884358723958334, + "learning_rate": 8.012335420148435e-06, + "loss": 4.6321, + "loss/crossentropy": 1.8418525904417038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703290119767189, + "step": 11950 + }, + { + "epoch": 0.996, + "grad_norm": 4.53125, + "grad_norm_var": 0.07405192057291667, + "learning_rate": 8.011368437750574e-06, + "loss": 5.0424, + "loss/crossentropy": 2.372679352760315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18234703317284584, + "step": 11952 + }, + { + "epoch": 0.9961666666666666, + "grad_norm": 4.3125, + "grad_norm_var": 0.07980143229166667, + "learning_rate": 8.010440905711708e-06, + "loss": 5.4315, + "loss/crossentropy": 1.4641002044081688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15950197726488113, + "step": 11954 + }, + { + "epoch": 0.9963333333333333, + "grad_norm": 4.78125, + "grad_norm_var": 0.065869140625, + "learning_rate": 8.009552826320434e-06, + "loss": 5.1694, + "loss/crossentropy": 1.9262694045901299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20404083095490932, + "step": 11956 + }, + { + "epoch": 0.9965, + "grad_norm": 4.4375, + "grad_norm_var": 0.06754150390625, + "learning_rate": 8.008704201767998e-06, + "loss": 5.0181, + "loss/crossentropy": 1.6573146134614944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728476718068123, + "step": 11958 + }, + { + "epoch": 0.9966666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.06236572265625, + "learning_rate": 8.007895034148296e-06, + "loss": 5.3437, + "loss/crossentropy": 1.4944916442036629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17235364392399788, + "step": 11960 + }, + { + "epoch": 0.9968333333333333, + "grad_norm": 4.1875, + "grad_norm_var": 0.06978759765625, + "learning_rate": 8.007125325457868e-06, + "loss": 4.9422, + "loss/crossentropy": 1.9228725656867027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1831230465322733, + "step": 11962 + }, + { + "epoch": 0.997, + "grad_norm": 4.5625, + "grad_norm_var": 0.06789957682291667, + "learning_rate": 8.006395077595897e-06, + "loss": 4.6737, + "loss/crossentropy": 2.0940150320529938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19881772994995117, + "step": 11964 + }, + { + "epoch": 0.9971666666666666, + "grad_norm": 4.71875, + "grad_norm_var": 0.06066080729166667, + "learning_rate": 8.005704292364192e-06, + "loss": 4.8361, + "loss/crossentropy": 2.0688324570655823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190332543104887, + "step": 11966 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 4.0625, + "grad_norm_var": 0.09060872395833333, + "learning_rate": 8.005052971467203e-06, + "loss": 3.5708, + "loss/crossentropy": 1.320427618920803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1498071327805519, + "step": 11968 + }, + { + "epoch": 0.9975, + "grad_norm": 4.625, + "grad_norm_var": 0.06432291666666666, + "learning_rate": 8.004441116511992e-06, + "loss": 5.0302, + "loss/crossentropy": 1.9641523733735085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834118254482746, + "step": 11970 + }, + { + "epoch": 0.9976666666666667, + "grad_norm": 4.34375, + "grad_norm_var": 0.05987955729166667, + "learning_rate": 8.003868729008256e-06, + "loss": 4.1466, + "loss/crossentropy": 1.9224779605865479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22309407964348793, + "step": 11972 + }, + { + "epoch": 0.9978333333333333, + "grad_norm": 4.40625, + "grad_norm_var": 0.058447265625, + "learning_rate": 8.003335810368304e-06, + "loss": 5.0203, + "loss/crossentropy": 1.661292903125286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18765877932310104, + "step": 11974 + }, + { + "epoch": 0.998, + "grad_norm": 8.25, + "grad_norm_var": 0.94605712890625, + "learning_rate": 8.002842361907057e-06, + "loss": 4.5332, + "loss/crossentropy": 1.523488275706768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16733020916581154, + "step": 11976 + }, + { + "epoch": 0.9981666666666666, + "grad_norm": 4.96875, + "grad_norm_var": 0.9265625, + "learning_rate": 8.002388384842052e-06, + "loss": 5.0479, + "loss/crossentropy": 2.3596551716327667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128112018108368, + "step": 11978 + }, + { + "epoch": 0.9983333333333333, + "grad_norm": 4.90625, + "grad_norm_var": 0.9274739583333333, + "learning_rate": 8.001973880293432e-06, + "loss": 5.511, + "loss/crossentropy": 1.8143180459737778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1826395783573389, + "step": 11980 + }, + { + "epoch": 0.9985, + "grad_norm": 4.40625, + "grad_norm_var": 0.9441365559895833, + "learning_rate": 8.001598849283945e-06, + "loss": 4.6422, + "loss/crossentropy": 1.7479843944311142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20489901304244995, + "step": 11982 + }, + { + "epoch": 0.9986666666666667, + "grad_norm": 4.40625, + "grad_norm_var": 0.8844889322916667, + "learning_rate": 8.001263292738943e-06, + "loss": 5.034, + "loss/crossentropy": 2.108782708644867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19143879041075706, + "step": 11984 + }, + { + "epoch": 0.9988333333333334, + "grad_norm": 4.6875, + "grad_norm_var": 0.9028483072916667, + "learning_rate": 8.00096721148638e-06, + "loss": 4.9638, + "loss/crossentropy": 2.291710913181305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.227816391736269, + "step": 11986 + }, + { + "epoch": 0.999, + "grad_norm": 4.28125, + "grad_norm_var": 0.9287068684895833, + "learning_rate": 8.000710606256803e-06, + "loss": 4.6671, + "loss/crossentropy": 1.8156883418560028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17330962419509888, + "step": 11988 + }, + { + "epoch": 0.9991666666666666, + "grad_norm": 4.59375, + "grad_norm_var": 0.9245930989583333, + "learning_rate": 8.000493477683367e-06, + "loss": 4.8698, + "loss/crossentropy": 2.213321268558502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21897965297102928, + "step": 11990 + }, + { + "epoch": 0.9993333333333333, + "grad_norm": 4.4375, + "grad_norm_var": 0.05319010416666667, + "learning_rate": 8.000315826301807e-06, + "loss": 4.7729, + "loss/crossentropy": 2.0580232441425323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1888243965804577, + "step": 11992 + }, + { + "epoch": 0.9995, + "grad_norm": 4.4375, + "grad_norm_var": 0.037495930989583336, + "learning_rate": 8.000177652550465e-06, + "loss": 4.8099, + "loss/crossentropy": 1.9432259127497673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17133133299648762, + "step": 11994 + }, + { + "epoch": 0.9996666666666667, + "grad_norm": 4.625, + "grad_norm_var": 0.028645833333333332, + "learning_rate": 8.00007895677027e-06, + "loss": 4.947, + "loss/crossentropy": 1.5523300170898438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16255545988678932, + "step": 11996 + }, + { + "epoch": 0.9998333333333334, + "grad_norm": 4.375, + "grad_norm_var": 0.03136393229166667, + "learning_rate": 8.000019739204745e-06, + "loss": 4.9184, + "loss/crossentropy": 1.881621241569519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999022245407104, + "step": 11998 + }, + { + "epoch": 1.0, + "grad_norm": 5.0, + "grad_norm_var": 0.04724934895833333, + "learning_rate": 8.000000000000001e-06, + "loss": 5.0732, + "loss/crossentropy": 1.9301600456237793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24359508231282234, + "step": 12000 + } + ], + "logging_steps": 2, + "max_steps": 12000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 6000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5200982329720832e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}