{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016666666666666666, "grad_norm": 9.4375, "learning_rate": 8.640000000000002e-06, "loss": 5.2912, "loss/crossentropy": 2.164160817861557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059289701282978, "step": 2 }, { "epoch": 0.0003333333333333333, "grad_norm": 7.90625, "learning_rate": 9.280000000000001e-06, "loss": 4.7345, "loss/crossentropy": 1.9222038090229034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22200099378824234, "step": 4 }, { "epoch": 0.0005, "grad_norm": 6.625, "learning_rate": 9.920000000000002e-06, "loss": 5.159, "loss/crossentropy": 2.4256778359413147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2173849642276764, "step": 6 }, { "epoch": 0.0006666666666666666, "grad_norm": 6.6875, "learning_rate": 1.056e-05, "loss": 4.2586, "loss/crossentropy": 1.0981817543506622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15661142766475677, "step": 8 }, { "epoch": 0.0008333333333333334, "grad_norm": 5.3125, "learning_rate": 1.1200000000000001e-05, "loss": 4.9503, "loss/crossentropy": 2.302097499370575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299620732665062, "step": 10 }, { "epoch": 0.001, "grad_norm": 5.34375, "learning_rate": 1.1840000000000002e-05, "loss": 5.5701, "loss/crossentropy": 1.7413269132375717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19975881464779377, "step": 12 }, { "epoch": 0.0011666666666666668, "grad_norm": 5.4375, "learning_rate": 1.2480000000000002e-05, "loss": 5.0073, "loss/crossentropy": 1.2278007790446281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15528732910752296, "step": 14 }, { "epoch": 0.0013333333333333333, "grad_norm": 5.34375, "grad_norm_var": 2.45025634765625, "learning_rate": 1.3120000000000001e-05, "loss": 5.1973, "loss/crossentropy": 2.5199625492095947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22803819924592972, "step": 16 }, { "epoch": 0.0015, "grad_norm": 5.0, "grad_norm_var": 0.64693603515625, "learning_rate": 1.376e-05, "loss": 4.7183, "loss/crossentropy": 2.4793767035007477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2310861274600029, "step": 18 }, { "epoch": 0.0016666666666666668, "grad_norm": 5.34375, "grad_norm_var": 0.34068603515625, "learning_rate": 1.4400000000000003e-05, "loss": 4.5878, "loss/crossentropy": 1.9572802186012268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875070109963417, "step": 20 }, { "epoch": 0.0018333333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.20558268229166668, "learning_rate": 1.5040000000000002e-05, "loss": 5.5266, "loss/crossentropy": 1.6191904172301292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16165770776569843, "step": 22 }, { "epoch": 0.002, "grad_norm": 5.875, "grad_norm_var": 0.10188395182291667, "learning_rate": 1.5680000000000002e-05, "loss": 5.32, "loss/crossentropy": 2.563029944896698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23509466275572777, "step": 24 }, { "epoch": 0.0021666666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.10467122395833334, "learning_rate": 1.6320000000000003e-05, "loss": 4.6407, "loss/crossentropy": 1.9466444551944733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20470014959573746, "step": 26 }, { "epoch": 0.0023333333333333335, "grad_norm": 5.25, "grad_norm_var": 0.03609619140625, "learning_rate": 1.6960000000000004e-05, "loss": 4.7149, "loss/crossentropy": 1.9283565133810043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20196671783924103, "step": 28 }, { "epoch": 0.0025, "grad_norm": 5.34375, "grad_norm_var": 0.03609619140625, "learning_rate": 1.76e-05, "loss": 5.4057, "loss/crossentropy": 1.890766903758049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21231163665652275, "step": 30 }, { "epoch": 0.0026666666666666666, "grad_norm": 6.0, "grad_norm_var": 0.0775390625, "learning_rate": 1.824e-05, "loss": 4.5287, "loss/crossentropy": 2.2417571544647217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24475030601024628, "step": 32 }, { "epoch": 0.0028333333333333335, "grad_norm": 5.25, "grad_norm_var": 0.07095947265625, "learning_rate": 1.8880000000000002e-05, "loss": 5.5623, "loss/crossentropy": 1.8421208187937737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875022854655981, "step": 34 }, { "epoch": 0.003, "grad_norm": 5.1875, "grad_norm_var": 0.07320556640625, "learning_rate": 1.9520000000000003e-05, "loss": 5.5626, "loss/crossentropy": 2.560234487056732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24960926175117493, "step": 36 }, { "epoch": 0.0031666666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.06907552083333333, "learning_rate": 2.016e-05, "loss": 5.2685, "loss/crossentropy": 2.3100323379039764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2332722283899784, "step": 38 }, { "epoch": 0.0033333333333333335, "grad_norm": 5.28125, "grad_norm_var": 0.0484375, "learning_rate": 2.08e-05, "loss": 5.0457, "loss/crossentropy": 1.8883708715438843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20036867633461952, "step": 40 }, { "epoch": 0.0035, "grad_norm": 5.5625, "grad_norm_var": 0.052994791666666666, "learning_rate": 2.144e-05, "loss": 5.085, "loss/crossentropy": 1.181441307067871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15967545099556446, "step": 42 }, { "epoch": 0.0036666666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.052587890625, "learning_rate": 2.2080000000000002e-05, "loss": 4.9071, "loss/crossentropy": 2.132170617580414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24000748619437218, "step": 44 }, { "epoch": 0.003833333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.06109619140625, "learning_rate": 2.2720000000000003e-05, "loss": 5.0678, "loss/crossentropy": 2.0978946685791016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20172996446490288, "step": 46 }, { "epoch": 0.004, "grad_norm": 6.59375, "grad_norm_var": 0.12342122395833334, "learning_rate": 2.3360000000000003e-05, "loss": 5.212, "loss/crossentropy": 2.2711612582206726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26810943335294724, "step": 48 }, { "epoch": 0.004166666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.12473958333333333, "learning_rate": 2.4000000000000004e-05, "loss": 4.8046, "loss/crossentropy": 1.5270142555236816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17906202748417854, "step": 50 }, { "epoch": 0.004333333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.15995686848958332, "learning_rate": 2.4640000000000005e-05, "loss": 4.8139, "loss/crossentropy": 2.6848429441452026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2273760698735714, "step": 52 }, { "epoch": 0.0045, "grad_norm": 5.40625, "grad_norm_var": 0.16112874348958334, "learning_rate": 2.5280000000000005e-05, "loss": 5.3508, "loss/crossentropy": 2.5355905294418335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24536684900522232, "step": 54 }, { "epoch": 0.004666666666666667, "grad_norm": 5.75, "grad_norm_var": 0.163671875, "learning_rate": 2.5920000000000006e-05, "loss": 5.5182, "loss/crossentropy": 2.480812221765518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21574737504124641, "step": 56 }, { "epoch": 0.004833333333333334, "grad_norm": 5.125, "grad_norm_var": 0.17935791015625, "learning_rate": 2.656e-05, "loss": 4.919, "loss/crossentropy": 1.4815584272146225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1718614138662815, "step": 58 }, { "epoch": 0.005, "grad_norm": 5.3125, "grad_norm_var": 0.194921875, "learning_rate": 2.72e-05, "loss": 4.1787, "loss/crossentropy": 0.5379917472600937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11368012242019176, "step": 60 }, { "epoch": 0.005166666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.18131103515625, "learning_rate": 2.784e-05, "loss": 4.7688, "loss/crossentropy": 2.2010596245527267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20431457087397575, "step": 62 }, { "epoch": 0.005333333333333333, "grad_norm": 6.0, "grad_norm_var": 0.135009765625, "learning_rate": 2.8480000000000002e-05, "loss": 5.4329, "loss/crossentropy": 2.047866404056549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21845614910125732, "step": 64 }, { "epoch": 0.0055, "grad_norm": 6.28125, "grad_norm_var": 0.17209879557291666, "learning_rate": 2.9120000000000002e-05, "loss": 4.8769, "loss/crossentropy": 2.302406132221222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2274726703763008, "step": 66 }, { "epoch": 0.005666666666666667, "grad_norm": 6.0625, "grad_norm_var": 0.15818684895833332, "learning_rate": 2.9760000000000003e-05, "loss": 5.4183, "loss/crossentropy": 2.040872871875763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20815075933933258, "step": 68 }, { "epoch": 0.005833333333333334, "grad_norm": 5.71875, "grad_norm_var": 0.15071614583333334, "learning_rate": 3.0400000000000004e-05, "loss": 5.3612, "loss/crossentropy": 1.6699720919132233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23643635585904121, "step": 70 }, { "epoch": 0.006, "grad_norm": 5.46875, "grad_norm_var": 0.14836832682291667, "learning_rate": 3.104e-05, "loss": 4.6081, "loss/crossentropy": 2.111388862133026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21453475579619408, "step": 72 }, { "epoch": 0.006166666666666667, "grad_norm": 5.125, "grad_norm_var": 0.13775634765625, "learning_rate": 3.168e-05, "loss": 5.28, "loss/crossentropy": 2.7730491161346436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24475040659308434, "step": 74 }, { "epoch": 0.006333333333333333, "grad_norm": 4.875, "grad_norm_var": 0.14843343098958334, "learning_rate": 3.232e-05, "loss": 4.9434, "loss/crossentropy": 1.8236006125807762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18496105633676052, "step": 76 }, { "epoch": 0.0065, "grad_norm": 5.46875, "grad_norm_var": 0.14843343098958334, "learning_rate": 3.296e-05, "loss": 4.8855, "loss/crossentropy": 2.105473317205906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216831523925066, "step": 78 }, { "epoch": 0.006666666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.12511393229166667, "learning_rate": 3.3600000000000004e-05, "loss": 4.8352, "loss/crossentropy": 2.1026684939861298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20776227116584778, "step": 80 }, { "epoch": 0.006833333333333334, "grad_norm": 5.125, "grad_norm_var": 0.10536702473958333, "learning_rate": 3.4240000000000004e-05, "loss": 4.8734, "loss/crossentropy": 1.8813765197992325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019767053425312, "step": 82 }, { "epoch": 0.007, "grad_norm": 5.21875, "grad_norm_var": 0.08444010416666667, "learning_rate": 3.4880000000000005e-05, "loss": 4.9091, "loss/crossentropy": 1.3020039498806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16160101629793644, "step": 84 }, { "epoch": 0.007166666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.07952067057291666, "learning_rate": 3.5520000000000006e-05, "loss": 5.2581, "loss/crossentropy": 1.912569299340248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20463180169463158, "step": 86 }, { "epoch": 0.007333333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.07932535807291667, "learning_rate": 3.6160000000000006e-05, "loss": 5.2699, "loss/crossentropy": 2.6311103105545044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24578388035297394, "step": 88 }, { "epoch": 0.0075, "grad_norm": 6.21875, "grad_norm_var": 0.11334228515625, "learning_rate": 3.680000000000001e-05, "loss": 4.6852, "loss/crossentropy": 2.1940360069274902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21145440265536308, "step": 90 }, { "epoch": 0.007666666666666666, "grad_norm": 6.03125, "grad_norm_var": 0.0927734375, "learning_rate": 3.744000000000001e-05, "loss": 5.1184, "loss/crossentropy": 1.5804511904716492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18777066841721535, "step": 92 }, { "epoch": 0.007833333333333333, "grad_norm": 5.125, "grad_norm_var": 0.11366780598958333, "learning_rate": 3.808e-05, "loss": 4.5045, "loss/crossentropy": 1.8188975527882576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19868333637714386, "step": 94 }, { "epoch": 0.008, "grad_norm": 5.59375, "grad_norm_var": 0.10859375, "learning_rate": 3.872e-05, "loss": 5.5254, "loss/crossentropy": 2.3987780809402466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2095976211130619, "step": 96 }, { "epoch": 0.008166666666666666, "grad_norm": 6.21875, "grad_norm_var": 0.10833333333333334, "learning_rate": 3.936e-05, "loss": 5.129, "loss/crossentropy": 2.186008095741272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24592407420277596, "step": 98 }, { "epoch": 0.008333333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.09739583333333333, "learning_rate": 4e-05, "loss": 4.9801, "loss/crossentropy": 2.0446798354387283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20239055342972279, "step": 100 }, { "epoch": 0.0085, "grad_norm": 5.40625, "grad_norm_var": 0.09557291666666666, "learning_rate": 4e-05, "loss": 5.3834, "loss/crossentropy": 2.29119148850441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22458446770906448, "step": 102 }, { "epoch": 0.008666666666666666, "grad_norm": 5.8125, "grad_norm_var": 0.08826497395833334, "learning_rate": 4e-05, "loss": 4.7358, "loss/crossentropy": 2.1947161257267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2436535656452179, "step": 104 }, { "epoch": 0.008833333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.08043212890625, "learning_rate": 4e-05, "loss": 5.1568, "loss/crossentropy": 2.04066064953804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21565158292651176, "step": 106 }, { "epoch": 0.009, "grad_norm": 5.84375, "grad_norm_var": 0.06982014973958334, "learning_rate": 4e-05, "loss": 4.875, "loss/crossentropy": 1.8622316792607307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19797790050506592, "step": 108 }, { "epoch": 0.009166666666666667, "grad_norm": 5.9375, "grad_norm_var": 0.05045166015625, "learning_rate": 4e-05, "loss": 5.05, "loss/crossentropy": 1.489914320409298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17602229118347168, "step": 110 }, { "epoch": 0.009333333333333334, "grad_norm": 5.71875, "grad_norm_var": 0.16669514973958333, "learning_rate": 4e-05, "loss": 5.4823, "loss/crossentropy": 1.7484403923153877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20575151592493057, "step": 112 }, { "epoch": 0.0095, "grad_norm": 5.96875, "grad_norm_var": 0.15703125, "learning_rate": 4e-05, "loss": 5.0073, "loss/crossentropy": 1.7794604748487473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22764279693365097, "step": 114 }, { "epoch": 0.009666666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.14993082682291667, "learning_rate": 4e-05, "loss": 5.352, "loss/crossentropy": 2.6334983110427856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23335204645991325, "step": 116 }, { "epoch": 0.009833333333333333, "grad_norm": 5.6875, "grad_norm_var": 0.14498291015625, "learning_rate": 4e-05, "loss": 5.3831, "loss/crossentropy": 1.6918310597538948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19874560460448265, "step": 118 }, { "epoch": 0.01, "grad_norm": 5.71875, "grad_norm_var": 0.1791015625, "learning_rate": 4e-05, "loss": 5.0705, "loss/crossentropy": 2.277990937232971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2351701594889164, "step": 120 }, { "epoch": 0.010166666666666666, "grad_norm": 5.6875, "grad_norm_var": 0.19599202473958333, "learning_rate": 4e-05, "loss": 4.8238, "loss/crossentropy": 1.9308300465345383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19367161393165588, "step": 122 }, { "epoch": 0.010333333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.19933268229166667, "learning_rate": 4e-05, "loss": 5.1309, "loss/crossentropy": 1.2643241733312607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15139093436300755, "step": 124 }, { "epoch": 0.0105, "grad_norm": 5.8125, "grad_norm_var": 0.19879150390625, "learning_rate": 4e-05, "loss": 5.0878, "loss/crossentropy": 1.5644885823130608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17594983614981174, "step": 126 }, { "epoch": 0.010666666666666666, "grad_norm": 5.25, "grad_norm_var": 0.08401285807291667, "learning_rate": 4e-05, "loss": 4.9866, "loss/crossentropy": 2.0537383928894997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19487846456468105, "step": 128 }, { "epoch": 0.010833333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.07447509765625, "learning_rate": 4e-05, "loss": 5.0095, "loss/crossentropy": 1.8626472651958466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2333734817802906, "step": 130 }, { "epoch": 0.011, "grad_norm": 5.625, "grad_norm_var": 0.06751302083333334, "learning_rate": 4e-05, "loss": 5.305, "loss/crossentropy": 1.4333342388272285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17128831893205643, "step": 132 }, { "epoch": 0.011166666666666667, "grad_norm": 6.625, "grad_norm_var": 0.14511311848958333, "learning_rate": 4e-05, "loss": 4.8194, "loss/crossentropy": 1.4352454990148544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869527231901884, "step": 134 }, { "epoch": 0.011333333333333334, "grad_norm": 5.46875, "grad_norm_var": 0.12706705729166667, "learning_rate": 4e-05, "loss": 5.1539, "loss/crossentropy": 2.2443730235099792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240537665784359, "step": 136 }, { "epoch": 0.0115, "grad_norm": 5.21875, "grad_norm_var": 0.12381184895833333, "learning_rate": 4e-05, "loss": 4.7903, "loss/crossentropy": 2.265403002500534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24458804354071617, "step": 138 }, { "epoch": 0.011666666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.12317708333333334, "learning_rate": 4e-05, "loss": 5.3068, "loss/crossentropy": 1.260722041130066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1643795594573021, "step": 140 }, { "epoch": 0.011833333333333333, "grad_norm": 6.1875, "grad_norm_var": 0.14358317057291667, "learning_rate": 4e-05, "loss": 5.394, "loss/crossentropy": 2.1383322402834892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20691991224884987, "step": 142 }, { "epoch": 0.012, "grad_norm": 5.75, "grad_norm_var": 0.14034830729166667, "learning_rate": 4e-05, "loss": 5.1448, "loss/crossentropy": 2.40448135137558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21756655722856522, "step": 144 }, { "epoch": 0.012166666666666666, "grad_norm": 5.8125, "grad_norm_var": 0.15500895182291666, "learning_rate": 4e-05, "loss": 5.3001, "loss/crossentropy": 1.8169787228107452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20706401392817497, "step": 146 }, { "epoch": 0.012333333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.15266520182291668, "learning_rate": 4e-05, "loss": 5.2623, "loss/crossentropy": 1.8481503129005432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19028427824378014, "step": 148 }, { "epoch": 0.0125, "grad_norm": 5.53125, "grad_norm_var": 0.08918863932291667, "learning_rate": 4e-05, "loss": 4.7276, "loss/crossentropy": 1.4998832270503044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18570281565189362, "step": 150 }, { "epoch": 0.012666666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.09029947916666667, "learning_rate": 4e-05, "loss": 4.9088, "loss/crossentropy": 2.361013948917389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21920835599303246, "step": 152 }, { "epoch": 0.012833333333333334, "grad_norm": 5.375, "grad_norm_var": 0.0796875, "learning_rate": 4e-05, "loss": 5.4666, "loss/crossentropy": 2.4189918637275696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187294214963913, "step": 154 }, { "epoch": 0.013, "grad_norm": 5.53125, "grad_norm_var": 0.07991129557291667, "learning_rate": 4e-05, "loss": 4.9728, "loss/crossentropy": 1.3893551230430603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17549285665154457, "step": 156 }, { "epoch": 0.013166666666666667, "grad_norm": 5.78125, "grad_norm_var": 0.05859375, "learning_rate": 4e-05, "loss": 4.6976, "loss/crossentropy": 1.219208374619484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18519950285553932, "step": 158 }, { "epoch": 0.013333333333333334, "grad_norm": 6.25, "grad_norm_var": 0.08166910807291666, "learning_rate": 4e-05, "loss": 5.2135, "loss/crossentropy": 2.5400354266166687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2633819431066513, "step": 160 }, { "epoch": 0.0135, "grad_norm": 5.8125, "grad_norm_var": 0.08655192057291666, "learning_rate": 4e-05, "loss": 5.1126, "loss/crossentropy": 2.380533277988434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545565590262413, "step": 162 }, { "epoch": 0.013666666666666667, "grad_norm": 5.25, "grad_norm_var": 0.09368082682291666, "learning_rate": 4e-05, "loss": 5.279, "loss/crossentropy": 2.279165208339691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159963957965374, "step": 164 }, { "epoch": 0.013833333333333333, "grad_norm": 5.25, "grad_norm_var": 0.106103515625, "learning_rate": 4e-05, "loss": 4.4199, "loss/crossentropy": 1.5300931632518768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16407620534300804, "step": 166 }, { "epoch": 0.014, "grad_norm": 5.1875, "grad_norm_var": 0.137744140625, "learning_rate": 4e-05, "loss": 4.6505, "loss/crossentropy": 1.1790905147790909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16002923622727394, "step": 168 }, { "epoch": 0.014166666666666666, "grad_norm": 5.6875, "grad_norm_var": 0.14759114583333333, "learning_rate": 4e-05, "loss": 5.5463, "loss/crossentropy": 2.186621367931366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2605951316654682, "step": 170 }, { "epoch": 0.014333333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.15513916015625, "learning_rate": 4e-05, "loss": 5.2762, "loss/crossentropy": 2.4367510974407196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.221878033131361, "step": 172 }, { "epoch": 0.0145, "grad_norm": 5.8125, "grad_norm_var": 0.16365559895833334, "learning_rate": 4e-05, "loss": 5.2094, "loss/crossentropy": 1.7971658408641815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21865063533186913, "step": 174 }, { "epoch": 0.014666666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.15614827473958334, "learning_rate": 4e-05, "loss": 4.3297, "loss/crossentropy": 2.00938368588686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059515416622162, "step": 176 }, { "epoch": 0.014833333333333334, "grad_norm": 5.5, "grad_norm_var": 0.12600504557291667, "learning_rate": 4e-05, "loss": 5.7069, "loss/crossentropy": 2.2138592898845673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146594636142254, "step": 178 }, { "epoch": 0.015, "grad_norm": 5.6875, "grad_norm_var": 0.13190104166666666, "learning_rate": 4e-05, "loss": 5.2236, "loss/crossentropy": 2.3264683187007904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24893302470445633, "step": 180 }, { "epoch": 0.015166666666666667, "grad_norm": 7.65625, "grad_norm_var": 0.41073811848958336, "learning_rate": 4e-05, "loss": 5.0168, "loss/crossentropy": 2.421372711658478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2704160064458847, "step": 182 }, { "epoch": 0.015333333333333332, "grad_norm": 5.4375, "grad_norm_var": 0.370556640625, "learning_rate": 4e-05, "loss": 4.8129, "loss/crossentropy": 2.6394213438034058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23327884078025818, "step": 184 }, { "epoch": 0.0155, "grad_norm": 6.03125, "grad_norm_var": 0.37688802083333334, "learning_rate": 4e-05, "loss": 4.9848, "loss/crossentropy": 2.0492628812789917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2619275823235512, "step": 186 }, { "epoch": 0.015666666666666666, "grad_norm": 5.75, "grad_norm_var": 0.36477457682291664, "learning_rate": 4e-05, "loss": 5.2399, "loss/crossentropy": 2.671754002571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2431931532919407, "step": 188 }, { "epoch": 0.015833333333333335, "grad_norm": 5.65625, "grad_norm_var": 0.36471354166666664, "learning_rate": 4e-05, "loss": 5.4273, "loss/crossentropy": 1.8667291477322578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19933542981743813, "step": 190 }, { "epoch": 0.016, "grad_norm": 5.59375, "grad_norm_var": 0.3186848958333333, "learning_rate": 4e-05, "loss": 5.2553, "loss/crossentropy": 2.3034614622592926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2405945621430874, "step": 192 }, { "epoch": 0.016166666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.39088134765625, "learning_rate": 4e-05, "loss": 4.7129, "loss/crossentropy": 1.9020505920052528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275901511311531, "step": 194 }, { "epoch": 0.01633333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.4091796875, "learning_rate": 4e-05, "loss": 4.6671, "loss/crossentropy": 1.9580153226852417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23201943933963776, "step": 196 }, { "epoch": 0.0165, "grad_norm": 5.15625, "grad_norm_var": 0.18052978515625, "learning_rate": 4e-05, "loss": 4.7744, "loss/crossentropy": 2.0856711715459824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20316448248922825, "step": 198 }, { "epoch": 0.016666666666666666, "grad_norm": 5.46875, "grad_norm_var": 0.18019205729166668, "learning_rate": 4e-05, "loss": 5.5373, "loss/crossentropy": 2.463364541530609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22138278931379318, "step": 200 }, { "epoch": 0.016833333333333332, "grad_norm": 5.53125, "grad_norm_var": 0.13873291015625, "learning_rate": 4e-05, "loss": 4.8611, "loss/crossentropy": 2.0712440609931946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275029793381691, "step": 202 }, { "epoch": 0.017, "grad_norm": 5.34375, "grad_norm_var": 0.133837890625, "learning_rate": 4e-05, "loss": 5.2617, "loss/crossentropy": 2.640321433544159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2553870268166065, "step": 204 }, { "epoch": 0.017166666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.14345296223958334, "learning_rate": 4e-05, "loss": 5.1737, "loss/crossentropy": 2.339095562696457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2486417517066002, "step": 206 }, { "epoch": 0.017333333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.139306640625, "learning_rate": 4e-05, "loss": 5.2609, "loss/crossentropy": 1.7947577238082886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953553967177868, "step": 208 }, { "epoch": 0.0175, "grad_norm": 5.3125, "grad_norm_var": 0.056538899739583336, "learning_rate": 4e-05, "loss": 4.9067, "loss/crossentropy": 1.9185269623994827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18810669146478176, "step": 210 }, { "epoch": 0.017666666666666667, "grad_norm": 5.59375, "grad_norm_var": 0.05703125, "learning_rate": 4e-05, "loss": 5.0059, "loss/crossentropy": 1.9670357257127762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22664643824100494, "step": 212 }, { "epoch": 0.017833333333333333, "grad_norm": 5.6875, "grad_norm_var": 0.04908447265625, "learning_rate": 4e-05, "loss": 5.2811, "loss/crossentropy": 1.1792488172650337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16042216308414936, "step": 214 }, { "epoch": 0.018, "grad_norm": 5.1875, "grad_norm_var": 0.057535807291666664, "learning_rate": 4e-05, "loss": 4.6516, "loss/crossentropy": 1.189962238073349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15354885905981064, "step": 216 }, { "epoch": 0.018166666666666668, "grad_norm": 5.53125, "grad_norm_var": 0.12376302083333333, "learning_rate": 4e-05, "loss": 5.5928, "loss/crossentropy": 2.4891774654388428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23457545787096024, "step": 218 }, { "epoch": 0.018333333333333333, "grad_norm": 5.59375, "grad_norm_var": 0.11832275390625, "learning_rate": 4e-05, "loss": 5.0127, "loss/crossentropy": 1.6934428215026855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18142644688487053, "step": 220 }, { "epoch": 0.0185, "grad_norm": 5.5, "grad_norm_var": 0.11252848307291667, "learning_rate": 4e-05, "loss": 5.3514, "loss/crossentropy": 2.0071809887886047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19888557493686676, "step": 222 }, { "epoch": 0.018666666666666668, "grad_norm": 5.40625, "grad_norm_var": 0.11529947916666666, "learning_rate": 4e-05, "loss": 5.2169, "loss/crossentropy": 2.1441567465662956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21664733067154884, "step": 224 }, { "epoch": 0.018833333333333334, "grad_norm": 5.375, "grad_norm_var": 0.10338541666666666, "learning_rate": 4e-05, "loss": 5.5327, "loss/crossentropy": 2.479779541492462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23695838451385498, "step": 226 }, { "epoch": 0.019, "grad_norm": 5.5625, "grad_norm_var": 0.11315104166666666, "learning_rate": 4e-05, "loss": 5.1857, "loss/crossentropy": 1.8668599054217339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20227666944265366, "step": 228 }, { "epoch": 0.019166666666666665, "grad_norm": 5.71875, "grad_norm_var": 0.11767171223958334, "learning_rate": 4e-05, "loss": 4.3191, "loss/crossentropy": 1.2977168932557106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17029542475938797, "step": 230 }, { "epoch": 0.019333333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.12414957682291666, "learning_rate": 4e-05, "loss": 4.5462, "loss/crossentropy": 1.5032763928174973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17685853876173496, "step": 232 }, { "epoch": 0.0195, "grad_norm": 5.53125, "grad_norm_var": 0.06717122395833333, "learning_rate": 4e-05, "loss": 4.9859, "loss/crossentropy": 1.3485910668969154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15323390066623688, "step": 234 }, { "epoch": 0.019666666666666666, "grad_norm": 6.1875, "grad_norm_var": 0.09693603515625, "learning_rate": 4e-05, "loss": 5.187, "loss/crossentropy": 2.1716194823384285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2310977354645729, "step": 236 }, { "epoch": 0.019833333333333335, "grad_norm": 5.96875, "grad_norm_var": 0.10621337890625, "learning_rate": 4e-05, "loss": 4.9196, "loss/crossentropy": 1.9589915871620178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23559781908988953, "step": 238 }, { "epoch": 0.02, "grad_norm": 5.21875, "grad_norm_var": 0.11256103515625, "learning_rate": 4e-05, "loss": 5.2263, "loss/crossentropy": 2.0510232746601105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26163269206881523, "step": 240 }, { "epoch": 0.020166666666666666, "grad_norm": 7.0625, "grad_norm_var": 0.24869384765625, "learning_rate": 4e-05, "loss": 4.9284, "loss/crossentropy": 1.565013274550438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.3279726207256317, "step": 242 }, { "epoch": 0.02033333333333333, "grad_norm": 5.5, "grad_norm_var": 0.2613118489583333, "learning_rate": 4e-05, "loss": 4.9741, "loss/crossentropy": 1.447442576289177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1801074482500553, "step": 244 }, { "epoch": 0.0205, "grad_norm": 5.34375, "grad_norm_var": 0.258056640625, "learning_rate": 4e-05, "loss": 5.115, "loss/crossentropy": 1.7351520657539368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19535227306187153, "step": 246 }, { "epoch": 0.020666666666666667, "grad_norm": 5.46875, "grad_norm_var": 0.22069905598958334, "learning_rate": 4e-05, "loss": 5.1927, "loss/crossentropy": 1.2356021031737328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1455559842288494, "step": 248 }, { "epoch": 0.020833333333333332, "grad_norm": 5.4375, "grad_norm_var": 0.22005208333333334, "learning_rate": 4e-05, "loss": 5.1384, "loss/crossentropy": 2.3360126316547394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24712038040161133, "step": 250 }, { "epoch": 0.021, "grad_norm": 5.90625, "grad_norm_var": 0.20193684895833333, "learning_rate": 4e-05, "loss": 4.8796, "loss/crossentropy": 2.037757635116577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274589955806732, "step": 252 }, { "epoch": 0.021166666666666667, "grad_norm": 5.75, "grad_norm_var": 0.21686197916666666, "learning_rate": 4e-05, "loss": 5.4491, "loss/crossentropy": 2.517900228500366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23455920815467834, "step": 254 }, { "epoch": 0.021333333333333333, "grad_norm": 5.53125, "grad_norm_var": 0.20636393229166666, "learning_rate": 4e-05, "loss": 5.5084, "loss/crossentropy": 2.3689188957214355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23140762373805046, "step": 256 }, { "epoch": 0.0215, "grad_norm": 5.375, "grad_norm_var": 0.06953125, "learning_rate": 4e-05, "loss": 5.1052, "loss/crossentropy": 2.45695823431015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22622350975871086, "step": 258 }, { "epoch": 0.021666666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.047587076822916664, "learning_rate": 4e-05, "loss": 5.1092, "loss/crossentropy": 2.481264054775238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26096219196915627, "step": 260 }, { "epoch": 0.021833333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.05701497395833333, "learning_rate": 4e-05, "loss": 4.9228, "loss/crossentropy": 2.266570031642914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2023736946284771, "step": 262 }, { "epoch": 0.022, "grad_norm": 5.6875, "grad_norm_var": 0.07498372395833333, "learning_rate": 4e-05, "loss": 5.3689, "loss/crossentropy": 2.311848521232605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21398617699742317, "step": 264 }, { "epoch": 0.022166666666666668, "grad_norm": 5.1875, "grad_norm_var": 0.08192952473958333, "learning_rate": 4e-05, "loss": 5.2604, "loss/crossentropy": 1.5015419125556946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18505272828042507, "step": 266 }, { "epoch": 0.022333333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.07519124348958334, "learning_rate": 4e-05, "loss": 4.8222, "loss/crossentropy": 1.9860661998391151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171510737389326, "step": 268 }, { "epoch": 0.0225, "grad_norm": 5.4375, "grad_norm_var": 0.05950520833333333, "learning_rate": 4e-05, "loss": 5.7417, "loss/crossentropy": 2.4434638023376465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2453981228172779, "step": 270 }, { "epoch": 0.02266666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.06282145182291667, "learning_rate": 4e-05, "loss": 4.462, "loss/crossentropy": 1.597841739654541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16011325269937515, "step": 272 }, { "epoch": 0.022833333333333334, "grad_norm": 5.5625, "grad_norm_var": 0.06526285807291667, "learning_rate": 4e-05, "loss": 4.8157, "loss/crossentropy": 1.8127425089478493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005634494125843, "step": 274 }, { "epoch": 0.023, "grad_norm": 5.875, "grad_norm_var": 0.07187093098958333, "learning_rate": 4e-05, "loss": 5.1641, "loss/crossentropy": 2.327672451734543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2233283594250679, "step": 276 }, { "epoch": 0.023166666666666665, "grad_norm": 5.46875, "grad_norm_var": 0.06812744140625, "learning_rate": 4e-05, "loss": 5.0835, "loss/crossentropy": 1.4483234286308289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19945433549582958, "step": 278 }, { "epoch": 0.023333333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.07877197265625, "learning_rate": 4e-05, "loss": 4.9559, "loss/crossentropy": 1.4878328368067741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17448855936527252, "step": 280 }, { "epoch": 0.0235, "grad_norm": 5.71875, "grad_norm_var": 0.07849934895833334, "learning_rate": 4e-05, "loss": 4.9091, "loss/crossentropy": 1.396668791770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1872088983654976, "step": 282 }, { "epoch": 0.023666666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.0837890625, "learning_rate": 4e-05, "loss": 4.6245, "loss/crossentropy": 2.4077460169792175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21915850043296814, "step": 284 }, { "epoch": 0.023833333333333335, "grad_norm": 5.125, "grad_norm_var": 0.09215087890625, "learning_rate": 4e-05, "loss": 4.1122, "loss/crossentropy": 1.5891410186886787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18681494891643524, "step": 286 }, { "epoch": 0.024, "grad_norm": 5.59375, "grad_norm_var": 0.087353515625, "learning_rate": 4e-05, "loss": 5.0425, "loss/crossentropy": 2.184404134750366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1966254562139511, "step": 288 }, { "epoch": 0.024166666666666666, "grad_norm": 5.8125, "grad_norm_var": 0.11027018229166667, "learning_rate": 4e-05, "loss": 5.1775, "loss/crossentropy": 2.3604514598846436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20175722613930702, "step": 290 }, { "epoch": 0.024333333333333332, "grad_norm": 5.5625, "grad_norm_var": 0.10240478515625, "learning_rate": 4e-05, "loss": 5.0769, "loss/crossentropy": 1.402433268725872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16730003245174885, "step": 292 }, { "epoch": 0.0245, "grad_norm": 5.25, "grad_norm_var": 0.09696858723958333, "learning_rate": 4e-05, "loss": 4.9796, "loss/crossentropy": 1.5468462631106377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18110387213528156, "step": 294 }, { "epoch": 0.024666666666666667, "grad_norm": 5.625, "grad_norm_var": 0.07235921223958333, "learning_rate": 4e-05, "loss": 4.6309, "loss/crossentropy": 2.3594585359096527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23140858113765717, "step": 296 }, { "epoch": 0.024833333333333332, "grad_norm": 5.09375, "grad_norm_var": 0.074072265625, "learning_rate": 4e-05, "loss": 5.1319, "loss/crossentropy": 1.3274840712547302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16749969869852066, "step": 298 }, { "epoch": 0.025, "grad_norm": 5.4375, "grad_norm_var": 0.06261393229166666, "learning_rate": 4e-05, "loss": 4.7546, "loss/crossentropy": 0.9479904547333717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1345222145318985, "step": 300 }, { "epoch": 0.025166666666666667, "grad_norm": 5.78125, "grad_norm_var": 5.926005045572917, "learning_rate": 4e-05, "loss": 4.1966, "loss/crossentropy": 1.8578788191080093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18595384806394577, "step": 302 }, { "epoch": 0.025333333333333333, "grad_norm": 5.3125, "grad_norm_var": 5.941337076822917, "learning_rate": 4e-05, "loss": 4.5656, "loss/crossentropy": 1.192492350935936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15102743916213512, "step": 304 }, { "epoch": 0.0255, "grad_norm": 5.21875, "grad_norm_var": 6.0068359375, "learning_rate": 4e-05, "loss": 4.7877, "loss/crossentropy": 2.530356705188751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2265690192580223, "step": 306 }, { "epoch": 0.025666666666666667, "grad_norm": 5.34375, "grad_norm_var": 5.976558430989583, "learning_rate": 4e-05, "loss": 5.002, "loss/crossentropy": 1.917500764131546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19061551988124847, "step": 308 }, { "epoch": 0.025833333333333333, "grad_norm": 5.71875, "grad_norm_var": 5.966988118489583, "learning_rate": 4e-05, "loss": 5.3558, "loss/crossentropy": 2.382882058620453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21255266666412354, "step": 310 }, { "epoch": 0.026, "grad_norm": 5.15625, "grad_norm_var": 6.049019368489583, "learning_rate": 4e-05, "loss": 5.0068, "loss/crossentropy": 2.380540519952774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22027641534805298, "step": 312 }, { "epoch": 0.026166666666666668, "grad_norm": 5.6875, "grad_norm_var": 6.014827473958333, "learning_rate": 4e-05, "loss": 4.8984, "loss/crossentropy": 1.7228035554289818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18263637647032738, "step": 314 }, { "epoch": 0.026333333333333334, "grad_norm": 5.5625, "grad_norm_var": 5.969254557291666, "learning_rate": 4e-05, "loss": 5.3086, "loss/crossentropy": 1.3391352742910385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15971971489489079, "step": 316 }, { "epoch": 0.0265, "grad_norm": 5.4375, "grad_norm_var": 0.04724934895833333, "learning_rate": 4e-05, "loss": 4.879, "loss/crossentropy": 2.5128698348999023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24337675794959068, "step": 318 }, { "epoch": 0.02666666666666667, "grad_norm": 5.375, "grad_norm_var": 0.056441243489583334, "learning_rate": 4e-05, "loss": 4.6445, "loss/crossentropy": 2.6085115671157837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2379378005862236, "step": 320 }, { "epoch": 0.026833333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.07005208333333333, "learning_rate": 4e-05, "loss": 5.0695, "loss/crossentropy": 2.1165069714188576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19540811702609062, "step": 322 }, { "epoch": 0.027, "grad_norm": 5.46875, "grad_norm_var": 0.059228515625, "learning_rate": 4e-05, "loss": 5.5129, "loss/crossentropy": 1.4290212765336037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1564047373831272, "step": 324 }, { "epoch": 0.027166666666666665, "grad_norm": 5.78125, "grad_norm_var": 0.0625, "learning_rate": 4e-05, "loss": 4.941, "loss/crossentropy": 2.273834705352783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24747185036540031, "step": 326 }, { "epoch": 0.027333333333333334, "grad_norm": 5.25, "grad_norm_var": 0.05660400390625, "learning_rate": 4e-05, "loss": 4.486, "loss/crossentropy": 2.2860072553157806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299199439585209, "step": 328 }, { "epoch": 0.0275, "grad_norm": 5.46875, "grad_norm_var": 0.057906087239583334, "learning_rate": 4e-05, "loss": 5.4171, "loss/crossentropy": 2.138988643884659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22434020042419434, "step": 330 }, { "epoch": 0.027666666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.049072265625, "learning_rate": 4e-05, "loss": 5.4328, "loss/crossentropy": 2.5559749603271484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22616703063249588, "step": 332 }, { "epoch": 0.027833333333333335, "grad_norm": 5.34375, "grad_norm_var": 0.04973551432291667, "learning_rate": 4e-05, "loss": 5.1085, "loss/crossentropy": 2.2669193148612976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23874261230230331, "step": 334 }, { "epoch": 0.028, "grad_norm": 5.65625, "grad_norm_var": 0.050374348958333336, "learning_rate": 4e-05, "loss": 5.1428, "loss/crossentropy": 1.1794737800955772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15420645847916603, "step": 336 }, { "epoch": 0.028166666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.04049072265625, "learning_rate": 4e-05, "loss": 5.3906, "loss/crossentropy": 2.6066287755966187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180948294699192, "step": 338 }, { "epoch": 0.028333333333333332, "grad_norm": 5.28125, "grad_norm_var": 0.044140625, "learning_rate": 4e-05, "loss": 5.0314, "loss/crossentropy": 2.4064601063728333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222812995314598, "step": 340 }, { "epoch": 0.0285, "grad_norm": 6.28125, "grad_norm_var": 0.138671875, "learning_rate": 4e-05, "loss": 5.3436, "loss/crossentropy": 1.8959501832723618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23883518390357494, "step": 342 }, { "epoch": 0.028666666666666667, "grad_norm": 5.625, "grad_norm_var": 0.13948160807291668, "learning_rate": 4e-05, "loss": 5.1752, "loss/crossentropy": 1.8244957998394966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19024837389588356, "step": 344 }, { "epoch": 0.028833333333333332, "grad_norm": 5.0625, "grad_norm_var": 0.18053385416666667, "learning_rate": 4e-05, "loss": 4.8808, "loss/crossentropy": 1.635428212583065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17805110290646553, "step": 346 }, { "epoch": 0.029, "grad_norm": 5.75, "grad_norm_var": 0.18841145833333334, "learning_rate": 4e-05, "loss": 5.7496, "loss/crossentropy": 2.2286045253276825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23897172883152962, "step": 348 }, { "epoch": 0.029166666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.238916015625, "learning_rate": 4e-05, "loss": 4.0875, "loss/crossentropy": 1.5485807359218597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19915905967354774, "step": 350 }, { "epoch": 0.029333333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.230078125, "learning_rate": 4e-05, "loss": 5.3479, "loss/crossentropy": 2.485140085220337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23857445269823074, "step": 352 }, { "epoch": 0.0295, "grad_norm": 5.53125, "grad_norm_var": 0.225634765625, "learning_rate": 4e-05, "loss": 4.8827, "loss/crossentropy": 2.4410774409770966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23277483880519867, "step": 354 }, { "epoch": 0.029666666666666668, "grad_norm": 5.15625, "grad_norm_var": 0.22734375, "learning_rate": 4e-05, "loss": 4.3879, "loss/crossentropy": 1.7819544896483421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17286515049636364, "step": 356 }, { "epoch": 0.029833333333333333, "grad_norm": 5.90625, "grad_norm_var": 0.12259114583333333, "learning_rate": 4e-05, "loss": 5.1247, "loss/crossentropy": 2.2199259996414185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22783676907420158, "step": 358 }, { "epoch": 0.03, "grad_norm": 5.6875, "grad_norm_var": 0.12567952473958333, "learning_rate": 4e-05, "loss": 4.9228, "loss/crossentropy": 2.2036180198192596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20686358213424683, "step": 360 }, { "epoch": 0.030166666666666668, "grad_norm": 5.6875, "grad_norm_var": 0.11106770833333333, "learning_rate": 4e-05, "loss": 4.9717, "loss/crossentropy": 1.7007370814681053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126607969403267, "step": 362 }, { "epoch": 0.030333333333333334, "grad_norm": 5.25, "grad_norm_var": 0.09641520182291667, "learning_rate": 4e-05, "loss": 5.1573, "loss/crossentropy": 1.4196887761354446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16805725172162056, "step": 364 }, { "epoch": 0.0305, "grad_norm": 6.625, "grad_norm_var": 0.16404622395833332, "learning_rate": 4e-05, "loss": 5.3914, "loss/crossentropy": 2.2769704461097717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20909231901168823, "step": 366 }, { "epoch": 0.030666666666666665, "grad_norm": 5.3125, "grad_norm_var": 0.167578125, "learning_rate": 4e-05, "loss": 5.3227, "loss/crossentropy": 2.4747599363327026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2311922200024128, "step": 368 }, { "epoch": 0.030833333333333334, "grad_norm": 5.25, "grad_norm_var": 0.18045247395833333, "learning_rate": 4e-05, "loss": 5.2762, "loss/crossentropy": 1.7561465799808502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1812431439757347, "step": 370 }, { "epoch": 0.031, "grad_norm": 5.40625, "grad_norm_var": 0.20201822916666667, "learning_rate": 4e-05, "loss": 4.6249, "loss/crossentropy": 1.8143546804785728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970304287970066, "step": 372 }, { "epoch": 0.031166666666666665, "grad_norm": 5.9375, "grad_norm_var": 0.20995686848958334, "learning_rate": 4e-05, "loss": 4.9844, "loss/crossentropy": 1.9378879070281982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20872123166918755, "step": 374 }, { "epoch": 0.03133333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.38843994140625, "learning_rate": 4e-05, "loss": 5.0347, "loss/crossentropy": 1.7512712702155113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19000215269625187, "step": 376 }, { "epoch": 0.0315, "grad_norm": 6.46875, "grad_norm_var": 0.3963826497395833, "learning_rate": 4e-05, "loss": 5.8688, "loss/crossentropy": 1.9144393801689148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19313014298677444, "step": 378 }, { "epoch": 0.03166666666666667, "grad_norm": 6.0625, "grad_norm_var": 0.39254150390625, "learning_rate": 4e-05, "loss": 4.7452, "loss/crossentropy": 1.5328343883156776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17478107661008835, "step": 380 }, { "epoch": 0.03183333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.35426025390625, "learning_rate": 4e-05, "loss": 4.9958, "loss/crossentropy": 2.0120982453227043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19707869738340378, "step": 382 }, { "epoch": 0.032, "grad_norm": 6.125, "grad_norm_var": 0.34947509765625, "learning_rate": 4e-05, "loss": 5.3694, "loss/crossentropy": 2.1418115496635437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2317424789071083, "step": 384 }, { "epoch": 0.03216666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.3590494791666667, "learning_rate": 4e-05, "loss": 5.0396, "loss/crossentropy": 2.2628641948103905, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19536038115620613, "step": 386 }, { "epoch": 0.03233333333333333, "grad_norm": 5.625, "grad_norm_var": 0.32274983723958334, "learning_rate": 4e-05, "loss": 5.1977, "loss/crossentropy": 2.708618402481079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24159733951091766, "step": 388 }, { "epoch": 0.0325, "grad_norm": 5.0, "grad_norm_var": 0.3513671875, "learning_rate": 4e-05, "loss": 5.1688, "loss/crossentropy": 1.427762784063816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16233957186341286, "step": 390 }, { "epoch": 0.03266666666666666, "grad_norm": 5.59375, "grad_norm_var": 0.15084635416666667, "learning_rate": 4e-05, "loss": 5.0311, "loss/crossentropy": 1.1995328813791275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16477875411510468, "step": 392 }, { "epoch": 0.03283333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.09377848307291667, "learning_rate": 4e-05, "loss": 5.1179, "loss/crossentropy": 1.8344684839248657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19066274911165237, "step": 394 }, { "epoch": 0.033, "grad_norm": 5.53125, "grad_norm_var": 0.075634765625, "learning_rate": 4e-05, "loss": 5.0157, "loss/crossentropy": 2.1210782676935196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18409163504838943, "step": 396 }, { "epoch": 0.033166666666666664, "grad_norm": 5.90625, "grad_norm_var": 0.08240559895833334, "learning_rate": 4e-05, "loss": 4.8918, "loss/crossentropy": 2.5905413031578064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24663139507174492, "step": 398 }, { "epoch": 0.03333333333333333, "grad_norm": 5.96875, "grad_norm_var": 0.08899332682291666, "learning_rate": 4e-05, "loss": 4.8675, "loss/crossentropy": 1.6443525850772858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19166382029652596, "step": 400 }, { "epoch": 0.0335, "grad_norm": 5.875, "grad_norm_var": 0.09504801432291667, "learning_rate": 4e-05, "loss": 5.0497, "loss/crossentropy": 1.82669086009264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19115986675024033, "step": 402 }, { "epoch": 0.033666666666666664, "grad_norm": 5.34375, "grad_norm_var": 0.09576822916666666, "learning_rate": 4e-05, "loss": 5.2452, "loss/crossentropy": 2.424153983592987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266378439962864, "step": 404 }, { "epoch": 0.03383333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.08782145182291666, "learning_rate": 4e-05, "loss": 5.556, "loss/crossentropy": 1.9700486361980438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20585943385958672, "step": 406 }, { "epoch": 0.034, "grad_norm": 5.625, "grad_norm_var": 0.095166015625, "learning_rate": 4e-05, "loss": 5.3139, "loss/crossentropy": 2.2227725982666016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2445167973637581, "step": 408 }, { "epoch": 0.034166666666666665, "grad_norm": 5.0625, "grad_norm_var": 0.11912434895833333, "learning_rate": 4e-05, "loss": 5.1151, "loss/crossentropy": 2.4945799708366394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22168324142694473, "step": 410 }, { "epoch": 0.034333333333333334, "grad_norm": 5.5625, "grad_norm_var": 0.12263997395833333, "learning_rate": 4e-05, "loss": 4.6133, "loss/crossentropy": 2.288731187582016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22380968183279037, "step": 412 }, { "epoch": 0.0345, "grad_norm": 5.0625, "grad_norm_var": 0.12537434895833333, "learning_rate": 4e-05, "loss": 5.1334, "loss/crossentropy": 2.1317990124225616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19617579877376556, "step": 414 }, { "epoch": 0.034666666666666665, "grad_norm": 13.3125, "grad_norm_var": 3.95738525390625, "learning_rate": 4e-05, "loss": 4.456, "loss/crossentropy": 1.048334889113903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13727323338389397, "step": 416 }, { "epoch": 0.034833333333333334, "grad_norm": 5.15625, "grad_norm_var": 3.9932291666666666, "learning_rate": 4e-05, "loss": 5.0826, "loss/crossentropy": 1.9634416326880455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20289554446935654, "step": 418 }, { "epoch": 0.035, "grad_norm": 5.0625, "grad_norm_var": 4.047330729166666, "learning_rate": 4e-05, "loss": 4.7622, "loss/crossentropy": 1.7749098986387253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1768258735537529, "step": 420 }, { "epoch": 0.035166666666666666, "grad_norm": 5.59375, "grad_norm_var": 4.063570149739584, "learning_rate": 4e-05, "loss": 5.2613, "loss/crossentropy": 2.7100062370300293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460428662598133, "step": 422 }, { "epoch": 0.035333333333333335, "grad_norm": 5.375, "grad_norm_var": 4.056966145833333, "learning_rate": 4e-05, "loss": 4.2625, "loss/crossentropy": 2.2796683609485626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166573479771614, "step": 424 }, { "epoch": 0.0355, "grad_norm": 41.0, "grad_norm_var": 80.93147379557291, "learning_rate": 4e-05, "loss": 4.7386, "loss/crossentropy": 2.4747623205184937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24762186035513878, "step": 426 }, { "epoch": 0.035666666666666666, "grad_norm": 4.96875, "grad_norm_var": 81.11119384765625, "learning_rate": 4e-05, "loss": 4.984, "loss/crossentropy": 2.4123693108558655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23126976937055588, "step": 428 }, { "epoch": 0.035833333333333335, "grad_norm": 6.4375, "grad_norm_var": 80.71106770833333, "learning_rate": 4e-05, "loss": 5.3447, "loss/crossentropy": 2.5928608775138855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24646497890353203, "step": 430 }, { "epoch": 0.036, "grad_norm": 5.40625, "grad_norm_var": 79.07750244140625, "learning_rate": 4e-05, "loss": 4.6058, "loss/crossentropy": 1.2497084438800812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15603481978178024, "step": 432 }, { "epoch": 0.036166666666666666, "grad_norm": 5.375, "grad_norm_var": 79.04388020833333, "learning_rate": 4e-05, "loss": 4.5617, "loss/crossentropy": 2.0195882841944695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020249329507351, "step": 434 }, { "epoch": 0.036333333333333336, "grad_norm": 5.4375, "grad_norm_var": 78.90089518229166, "learning_rate": 4e-05, "loss": 5.3138, "loss/crossentropy": 2.254369556903839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22006989270448685, "step": 436 }, { "epoch": 0.0365, "grad_norm": 7.65625, "grad_norm_var": 78.45159098307292, "learning_rate": 4e-05, "loss": 4.723, "loss/crossentropy": 1.7706375047564507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2226849813014269, "step": 438 }, { "epoch": 0.03666666666666667, "grad_norm": 6.8125, "grad_norm_var": 77.962744140625, "learning_rate": 4e-05, "loss": 5.7765, "loss/crossentropy": 2.3849419355392456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24738119542598724, "step": 440 }, { "epoch": 0.036833333333333336, "grad_norm": 4.8125, "grad_norm_var": 0.5287760416666667, "learning_rate": 4e-05, "loss": 4.8364, "loss/crossentropy": 1.9102841913700104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137163415551186, "step": 442 }, { "epoch": 0.037, "grad_norm": 5.75, "grad_norm_var": 0.48355712890625, "learning_rate": 4e-05, "loss": 5.4136, "loss/crossentropy": 1.9578236639499664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841808743774891, "step": 444 }, { "epoch": 0.03716666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.46092122395833335, "learning_rate": 4e-05, "loss": 4.6981, "loss/crossentropy": 1.9191040992736816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19365688413381577, "step": 446 }, { "epoch": 0.037333333333333336, "grad_norm": 5.15625, "grad_norm_var": 0.46985270182291666, "learning_rate": 4e-05, "loss": 5.2009, "loss/crossentropy": 2.8165441155433655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2253844402730465, "step": 448 }, { "epoch": 0.0375, "grad_norm": 5.15625, "grad_norm_var": 0.4892578125, "learning_rate": 4e-05, "loss": 4.6916, "loss/crossentropy": 1.6579081416130066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2267564721405506, "step": 450 }, { "epoch": 0.03766666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.505322265625, "learning_rate": 4e-05, "loss": 4.8117, "loss/crossentropy": 1.8108457028865814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969534195959568, "step": 452 }, { "epoch": 0.03783333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.21812744140625, "learning_rate": 4e-05, "loss": 4.9635, "loss/crossentropy": 1.8094572871923447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18736277520656586, "step": 454 }, { "epoch": 0.038, "grad_norm": 5.875, "grad_norm_var": 0.10299479166666667, "learning_rate": 4e-05, "loss": 5.2644, "loss/crossentropy": 2.125039577484131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20303135737776756, "step": 456 }, { "epoch": 0.03816666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.07994791666666666, "learning_rate": 4e-05, "loss": 5.0687, "loss/crossentropy": 1.938138335943222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19551260024309158, "step": 458 }, { "epoch": 0.03833333333333333, "grad_norm": 5.65625, "grad_norm_var": 0.07825113932291666, "learning_rate": 4e-05, "loss": 5.5359, "loss/crossentropy": 1.8810575380921364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18603547476232052, "step": 460 }, { "epoch": 0.0385, "grad_norm": 6.1875, "grad_norm_var": 0.11744791666666667, "learning_rate": 4e-05, "loss": 4.9456, "loss/crossentropy": 2.3643300533294678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21903591975569725, "step": 462 }, { "epoch": 0.03866666666666667, "grad_norm": 5.59375, "grad_norm_var": 0.13967692057291667, "learning_rate": 4e-05, "loss": 4.5044, "loss/crossentropy": 2.0795028433203697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20525045320391655, "step": 464 }, { "epoch": 0.03883333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.14654541015625, "learning_rate": 4e-05, "loss": 5.2285, "loss/crossentropy": 2.670228064060211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22724847868084908, "step": 466 }, { "epoch": 0.039, "grad_norm": 5.375, "grad_norm_var": 0.13710530598958334, "learning_rate": 4e-05, "loss": 5.0264, "loss/crossentropy": 1.9535819217562675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19420474022626877, "step": 468 }, { "epoch": 0.03916666666666667, "grad_norm": 5.75, "grad_norm_var": 0.12994791666666666, "learning_rate": 4e-05, "loss": 5.1026, "loss/crossentropy": 2.0033904761075974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19617502018809319, "step": 470 }, { "epoch": 0.03933333333333333, "grad_norm": 6.1875, "grad_norm_var": 0.13538004557291666, "learning_rate": 4e-05, "loss": 4.9723, "loss/crossentropy": 2.530248761177063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24241825193166733, "step": 472 }, { "epoch": 0.0395, "grad_norm": 5.125, "grad_norm_var": 0.14451497395833332, "learning_rate": 4e-05, "loss": 4.6632, "loss/crossentropy": 1.8372912853956223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19772254303097725, "step": 474 }, { "epoch": 0.03966666666666667, "grad_norm": 5.46875, "grad_norm_var": 0.16470947265625, "learning_rate": 4e-05, "loss": 5.2215, "loss/crossentropy": 1.9594649076461792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21060075610876083, "step": 476 }, { "epoch": 0.03983333333333333, "grad_norm": 5.625, "grad_norm_var": 0.12909749348958333, "learning_rate": 4e-05, "loss": 5.2237, "loss/crossentropy": 1.7402563989162445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18676879815757275, "step": 478 }, { "epoch": 0.04, "grad_norm": 5.0625, "grad_norm_var": 0.111962890625, "learning_rate": 4e-05, "loss": 4.852, "loss/crossentropy": 2.213899254798889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23049471899867058, "step": 480 }, { "epoch": 0.04016666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.11487223307291666, "learning_rate": 4e-05, "loss": 4.6514, "loss/crossentropy": 1.3909804075956345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1692242305725813, "step": 482 }, { "epoch": 0.04033333333333333, "grad_norm": 5.84375, "grad_norm_var": 0.12057291666666667, "learning_rate": 4e-05, "loss": 5.5403, "loss/crossentropy": 2.5903589129447937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23884601891040802, "step": 484 }, { "epoch": 0.0405, "grad_norm": 5.4375, "grad_norm_var": 0.12375895182291667, "learning_rate": 4e-05, "loss": 5.015, "loss/crossentropy": 2.107970714569092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21329589560627937, "step": 486 }, { "epoch": 0.04066666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.093359375, "learning_rate": 4e-05, "loss": 5.075, "loss/crossentropy": 1.8551287949085236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2251221276819706, "step": 488 }, { "epoch": 0.04083333333333333, "grad_norm": 5.53125, "grad_norm_var": 0.08202718098958334, "learning_rate": 4e-05, "loss": 5.5261, "loss/crossentropy": 2.0091424509882927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18690266273915768, "step": 490 }, { "epoch": 0.041, "grad_norm": 4.9375, "grad_norm_var": 0.11715087890625, "learning_rate": 4e-05, "loss": 5.1047, "loss/crossentropy": 1.6703919917345047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19888342171907425, "step": 492 }, { "epoch": 0.041166666666666664, "grad_norm": 5.625, "grad_norm_var": 0.14237874348958332, "learning_rate": 4e-05, "loss": 5.674, "loss/crossentropy": 2.249885469675064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21771755814552307, "step": 494 }, { "epoch": 0.04133333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.13059895833333332, "learning_rate": 4e-05, "loss": 5.2914, "loss/crossentropy": 2.4916725754737854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22457458823919296, "step": 496 }, { "epoch": 0.0415, "grad_norm": 5.03125, "grad_norm_var": 0.14097900390625, "learning_rate": 4e-05, "loss": 4.8649, "loss/crossentropy": 1.9688801318407059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2109392024576664, "step": 498 }, { "epoch": 0.041666666666666664, "grad_norm": 5.5625, "grad_norm_var": 0.15689697265625, "learning_rate": 4e-05, "loss": 5.0342, "loss/crossentropy": 2.300499051809311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113119661808014, "step": 500 }, { "epoch": 0.041833333333333333, "grad_norm": 5.25, "grad_norm_var": 0.200390625, "learning_rate": 4e-05, "loss": 5.4782, "loss/crossentropy": 2.1189796030521393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2590511702001095, "step": 502 }, { "epoch": 0.042, "grad_norm": 5.5625, "grad_norm_var": 0.20634358723958332, "learning_rate": 4e-05, "loss": 5.5632, "loss/crossentropy": 2.599315345287323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2375834807753563, "step": 504 }, { "epoch": 0.042166666666666665, "grad_norm": 5.46875, "grad_norm_var": 0.20716145833333333, "learning_rate": 4e-05, "loss": 5.3239, "loss/crossentropy": 2.3774854242801666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033194676041603, "step": 506 }, { "epoch": 0.042333333333333334, "grad_norm": 5.9375, "grad_norm_var": 0.17721354166666667, "learning_rate": 4e-05, "loss": 5.4254, "loss/crossentropy": 1.6486110389232635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900431402027607, "step": 508 }, { "epoch": 0.0425, "grad_norm": 5.78125, "grad_norm_var": 0.18352864583333334, "learning_rate": 4e-05, "loss": 4.6721, "loss/crossentropy": 2.2632896304130554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22721751406788826, "step": 510 }, { "epoch": 0.042666666666666665, "grad_norm": 4.96875, "grad_norm_var": 0.20403238932291667, "learning_rate": 4e-05, "loss": 4.5325, "loss/crossentropy": 1.248498149216175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.166516724973917, "step": 512 }, { "epoch": 0.042833333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.20436197916666668, "learning_rate": 4e-05, "loss": 4.6529, "loss/crossentropy": 2.0403945446014404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20415923185646534, "step": 514 }, { "epoch": 0.043, "grad_norm": 5.5, "grad_norm_var": 0.18642171223958334, "learning_rate": 4e-05, "loss": 4.8357, "loss/crossentropy": 1.902937613427639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22126169875264168, "step": 516 }, { "epoch": 0.043166666666666666, "grad_norm": 6.1875, "grad_norm_var": 0.16643473307291667, "learning_rate": 4e-05, "loss": 5.2757, "loss/crossentropy": 2.0559470653533936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19494493678212166, "step": 518 }, { "epoch": 0.043333333333333335, "grad_norm": 5.5, "grad_norm_var": 0.16109619140625, "learning_rate": 4e-05, "loss": 5.2423, "loss/crossentropy": 2.4924589097499847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24120581522583961, "step": 520 }, { "epoch": 0.0435, "grad_norm": 5.4375, "grad_norm_var": 0.16646728515625, "learning_rate": 4e-05, "loss": 5.1833, "loss/crossentropy": 1.681239552795887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20514622330665588, "step": 522 }, { "epoch": 0.043666666666666666, "grad_norm": 5.125, "grad_norm_var": 0.13847249348958332, "learning_rate": 4e-05, "loss": 5.142, "loss/crossentropy": 2.186009407043457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21601705998182297, "step": 524 }, { "epoch": 0.043833333333333335, "grad_norm": 5.5, "grad_norm_var": 0.11901041666666666, "learning_rate": 4e-05, "loss": 5.941, "loss/crossentropy": 2.4676918387413025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23410123214125633, "step": 526 }, { "epoch": 0.044, "grad_norm": 5.84375, "grad_norm_var": 0.13984375, "learning_rate": 4e-05, "loss": 5.0339, "loss/crossentropy": 2.1201189160346985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19850488752126694, "step": 528 }, { "epoch": 0.04416666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.12274983723958334, "learning_rate": 4e-05, "loss": 5.2603, "loss/crossentropy": 1.8619603216648102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19815063290297985, "step": 530 }, { "epoch": 0.044333333333333336, "grad_norm": 5.46875, "grad_norm_var": 0.11757405598958333, "learning_rate": 4e-05, "loss": 5.2116, "loss/crossentropy": 2.5051605105400085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23532362654805183, "step": 532 }, { "epoch": 0.0445, "grad_norm": 5.53125, "grad_norm_var": 0.10702718098958333, "learning_rate": 4e-05, "loss": 5.0707, "loss/crossentropy": 1.956557109951973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19309379532933235, "step": 534 }, { "epoch": 0.04466666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.111572265625, "learning_rate": 4e-05, "loss": 5.253, "loss/crossentropy": 1.8768843710422516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20720825903117657, "step": 536 }, { "epoch": 0.044833333333333336, "grad_norm": 5.09375, "grad_norm_var": 0.11599934895833333, "learning_rate": 4e-05, "loss": 5.2099, "loss/crossentropy": 2.205892413854599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255897894501686, "step": 538 }, { "epoch": 0.045, "grad_norm": 5.0, "grad_norm_var": 0.12721354166666668, "learning_rate": 4e-05, "loss": 4.6425, "loss/crossentropy": 0.9755007773637772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13529013842344284, "step": 540 }, { "epoch": 0.04516666666666667, "grad_norm": 5.5, "grad_norm_var": 0.11718343098958334, "learning_rate": 4e-05, "loss": 5.0623, "loss/crossentropy": 2.016813486814499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20038552209734917, "step": 542 }, { "epoch": 0.04533333333333334, "grad_norm": 6.1875, "grad_norm_var": 0.10767822265625, "learning_rate": 4e-05, "loss": 5.7204, "loss/crossentropy": 2.6434133052825928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23127877712249756, "step": 544 }, { "epoch": 0.0455, "grad_norm": 5.125, "grad_norm_var": 0.11239827473958333, "learning_rate": 4e-05, "loss": 5.2036, "loss/crossentropy": 1.9537419080734253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20113248005509377, "step": 546 }, { "epoch": 0.04566666666666667, "grad_norm": 6.1875, "grad_norm_var": 0.14345296223958334, "learning_rate": 4e-05, "loss": 5.5505, "loss/crossentropy": 2.5237995982170105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211185209453106, "step": 548 }, { "epoch": 0.04583333333333333, "grad_norm": 6.28125, "grad_norm_var": 0.16265869140625, "learning_rate": 4e-05, "loss": 4.7336, "loss/crossentropy": 1.9351204261183739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19682549498975277, "step": 550 }, { "epoch": 0.046, "grad_norm": 5.15625, "grad_norm_var": 0.16594645182291667, "learning_rate": 4e-05, "loss": 5.0406, "loss/crossentropy": 2.4625622630119324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21886296197772026, "step": 552 }, { "epoch": 0.04616666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.16692301432291667, "learning_rate": 4e-05, "loss": 4.7676, "loss/crossentropy": 1.4909594282507896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1696036048233509, "step": 554 }, { "epoch": 0.04633333333333333, "grad_norm": 5.84375, "grad_norm_var": 0.1544921875, "learning_rate": 4e-05, "loss": 5.1545, "loss/crossentropy": 1.6347006186842918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19013791903853416, "step": 556 }, { "epoch": 0.0465, "grad_norm": 5.96875, "grad_norm_var": 0.16282145182291666, "learning_rate": 4e-05, "loss": 4.7549, "loss/crossentropy": 2.16485732793808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2486794888973236, "step": 558 }, { "epoch": 0.04666666666666667, "grad_norm": 4.75, "grad_norm_var": 0.17987874348958333, "learning_rate": 4e-05, "loss": 4.5371, "loss/crossentropy": 2.0290512144565582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2468886598944664, "step": 560 }, { "epoch": 0.04683333333333333, "grad_norm": 5.53125, "grad_norm_var": 0.16985270182291667, "learning_rate": 4e-05, "loss": 5.4622, "loss/crossentropy": 1.8024419024586678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2226398829370737, "step": 562 }, { "epoch": 0.047, "grad_norm": 6.5625, "grad_norm_var": 0.22628580729166667, "learning_rate": 4e-05, "loss": 5.7771, "loss/crossentropy": 2.0841223895549774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2839187905192375, "step": 564 }, { "epoch": 0.04716666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.19843343098958333, "learning_rate": 4e-05, "loss": 5.2361, "loss/crossentropy": 1.9585634768009186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198926955461502, "step": 566 }, { "epoch": 0.04733333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.20552978515625, "learning_rate": 4e-05, "loss": 5.2611, "loss/crossentropy": 2.0971501171588898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21288679540157318, "step": 568 }, { "epoch": 0.0475, "grad_norm": 5.4375, "grad_norm_var": 0.18889567057291667, "learning_rate": 4e-05, "loss": 5.1852, "loss/crossentropy": 2.444584846496582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2557501494884491, "step": 570 }, { "epoch": 0.04766666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.18349202473958334, "learning_rate": 4e-05, "loss": 5.0379, "loss/crossentropy": 2.0450302958488464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.205327320843935, "step": 572 }, { "epoch": 0.04783333333333333, "grad_norm": 5.71875, "grad_norm_var": 0.17245686848958333, "learning_rate": 4e-05, "loss": 5.8995, "loss/crossentropy": 2.5409964323043823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22698857262730598, "step": 574 }, { "epoch": 0.048, "grad_norm": 5.71875, "grad_norm_var": 0.11966145833333333, "learning_rate": 4e-05, "loss": 5.0287, "loss/crossentropy": 1.696646198630333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20716996863484383, "step": 576 }, { "epoch": 0.04816666666666667, "grad_norm": 5.8125, "grad_norm_var": 0.12636311848958334, "learning_rate": 4e-05, "loss": 5.3317, "loss/crossentropy": 2.2942482829093933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25501084327697754, "step": 578 }, { "epoch": 0.04833333333333333, "grad_norm": 5.59375, "grad_norm_var": 0.05279541015625, "learning_rate": 4e-05, "loss": 5.248, "loss/crossentropy": 2.4969963431358337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23663923889398575, "step": 580 }, { "epoch": 0.0485, "grad_norm": 5.40625, "grad_norm_var": 0.055322265625, "learning_rate": 4e-05, "loss": 5.0694, "loss/crossentropy": 2.179100275039673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20842677354812622, "step": 582 }, { "epoch": 0.048666666666666664, "grad_norm": 5.71875, "grad_norm_var": 0.05797119140625, "learning_rate": 4e-05, "loss": 5.0973, "loss/crossentropy": 2.542878270149231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23901895433664322, "step": 584 }, { "epoch": 0.04883333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.104296875, "learning_rate": 4e-05, "loss": 4.4501, "loss/crossentropy": 1.3374748602509499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15280469879508018, "step": 586 }, { "epoch": 0.049, "grad_norm": 5.09375, "grad_norm_var": 0.11021728515625, "learning_rate": 4e-05, "loss": 4.2479, "loss/crossentropy": 2.1031662821769714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21286172419786453, "step": 588 }, { "epoch": 0.049166666666666664, "grad_norm": 5.75, "grad_norm_var": 0.11578369140625, "learning_rate": 4e-05, "loss": 4.4553, "loss/crossentropy": 2.226746082305908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2602303549647331, "step": 590 }, { "epoch": 0.04933333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.10282796223958333, "learning_rate": 4e-05, "loss": 5.1448, "loss/crossentropy": 1.9679524078965187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005962673574686, "step": 592 }, { "epoch": 0.0495, "grad_norm": 5.125, "grad_norm_var": 0.08931884765625, "learning_rate": 4e-05, "loss": 4.7614, "loss/crossentropy": 1.159188948571682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14164014346897602, "step": 594 }, { "epoch": 0.049666666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.09078369140625, "learning_rate": 4e-05, "loss": 5.0281, "loss/crossentropy": 1.8265404999256134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891588643193245, "step": 596 }, { "epoch": 0.049833333333333334, "grad_norm": 5.4375, "grad_norm_var": 0.12486979166666666, "learning_rate": 4e-05, "loss": 5.2755, "loss/crossentropy": 2.3386247754096985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26255329325795174, "step": 598 }, { "epoch": 0.05, "grad_norm": 5.09375, "grad_norm_var": 0.11562093098958333, "learning_rate": 4e-05, "loss": 5.0808, "loss/crossentropy": 1.9218714386224747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169225849211216, "step": 600 }, { "epoch": 0.050166666666666665, "grad_norm": 5.21875, "grad_norm_var": 0.11552327473958333, "learning_rate": 4e-05, "loss": 5.2712, "loss/crossentropy": 1.5146638751029968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18410832434892654, "step": 602 }, { "epoch": 0.050333333333333334, "grad_norm": 5.59375, "grad_norm_var": 0.12724202473958332, "learning_rate": 4e-05, "loss": 4.7125, "loss/crossentropy": 2.293552666902542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22562003880739212, "step": 604 }, { "epoch": 0.0505, "grad_norm": 5.34375, "grad_norm_var": 0.11326497395833333, "learning_rate": 4e-05, "loss": 5.1169, "loss/crossentropy": 1.7014083191752434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18919607624411583, "step": 606 }, { "epoch": 0.050666666666666665, "grad_norm": 5.5, "grad_norm_var": 0.11340738932291666, "learning_rate": 4e-05, "loss": 5.322, "loss/crossentropy": 1.4259334281086922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16573997400701046, "step": 608 }, { "epoch": 0.050833333333333335, "grad_norm": 6.09375, "grad_norm_var": 0.14000244140625, "learning_rate": 4e-05, "loss": 4.7041, "loss/crossentropy": 1.3645060807466507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17446519620716572, "step": 610 }, { "epoch": 0.051, "grad_norm": 5.84375, "grad_norm_var": 0.11724853515625, "learning_rate": 4e-05, "loss": 5.0354, "loss/crossentropy": 1.4357607513666153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1666635200381279, "step": 612 }, { "epoch": 0.051166666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.09254150390625, "learning_rate": 4e-05, "loss": 4.8718, "loss/crossentropy": 2.40578031539917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22677217051386833, "step": 614 }, { "epoch": 0.051333333333333335, "grad_norm": 5.34375, "grad_norm_var": 0.06404622395833333, "learning_rate": 4e-05, "loss": 5.0725, "loss/crossentropy": 1.8017898797988892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18605670891702175, "step": 616 }, { "epoch": 0.0515, "grad_norm": 6.8125, "grad_norm_var": 0.16373697916666666, "learning_rate": 4e-05, "loss": 5.0028, "loss/crossentropy": 1.283976010978222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15850860998034477, "step": 618 }, { "epoch": 0.051666666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.21100260416666666, "learning_rate": 4e-05, "loss": 4.1235, "loss/crossentropy": 1.4461367800831795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16198919713497162, "step": 620 }, { "epoch": 0.051833333333333335, "grad_norm": 5.25, "grad_norm_var": 0.222509765625, "learning_rate": 4e-05, "loss": 5.0553, "loss/crossentropy": 2.7310924530029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21842283383011818, "step": 622 }, { "epoch": 0.052, "grad_norm": 5.34375, "grad_norm_var": 0.23632405598958334, "learning_rate": 4e-05, "loss": 4.602, "loss/crossentropy": 2.3664903938770294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23367810249328613, "step": 624 }, { "epoch": 0.05216666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.20592447916666667, "learning_rate": 4e-05, "loss": 4.9899, "loss/crossentropy": 1.2108296155929565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14673249796032906, "step": 626 }, { "epoch": 0.052333333333333336, "grad_norm": 5.03125, "grad_norm_var": 0.19685872395833334, "learning_rate": 4e-05, "loss": 4.4967, "loss/crossentropy": 2.283060073852539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23095425590872765, "step": 628 }, { "epoch": 0.0525, "grad_norm": 5.28125, "grad_norm_var": 0.19582926432291667, "learning_rate": 4e-05, "loss": 5.1891, "loss/crossentropy": 2.3244327008724213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20680969208478928, "step": 630 }, { "epoch": 0.05266666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.19827067057291667, "learning_rate": 4e-05, "loss": 4.7258, "loss/crossentropy": 1.5194010734558105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2393566593527794, "step": 632 }, { "epoch": 0.052833333333333336, "grad_norm": 5.53125, "grad_norm_var": 0.04147135416666667, "learning_rate": 4e-05, "loss": 5.3842, "loss/crossentropy": 2.275718003511429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21031928807497025, "step": 634 }, { "epoch": 0.053, "grad_norm": 5.375, "grad_norm_var": 0.04000244140625, "learning_rate": 4e-05, "loss": 4.8783, "loss/crossentropy": 2.351560056209564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495262697339058, "step": 636 }, { "epoch": 0.05316666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.0392578125, "learning_rate": 4e-05, "loss": 4.9231, "loss/crossentropy": 2.415817618370056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23617206886410713, "step": 638 }, { "epoch": 0.05333333333333334, "grad_norm": 5.53125, "grad_norm_var": 0.041259765625, "learning_rate": 4e-05, "loss": 5.1447, "loss/crossentropy": 2.2912066876888275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22658982872962952, "step": 640 }, { "epoch": 0.0535, "grad_norm": 4.8125, "grad_norm_var": 0.060530598958333334, "learning_rate": 4e-05, "loss": 4.1259, "loss/crossentropy": 1.3093429505825043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1687719877809286, "step": 642 }, { "epoch": 0.05366666666666667, "grad_norm": 5.375, "grad_norm_var": 0.06966145833333333, "learning_rate": 4e-05, "loss": 4.796, "loss/crossentropy": 1.8905025273561478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039121687412262, "step": 644 }, { "epoch": 0.05383333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08878580729166667, "learning_rate": 4e-05, "loss": 4.0684, "loss/crossentropy": 1.2859214022755623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1498262993991375, "step": 646 }, { "epoch": 0.054, "grad_norm": 5.34375, "grad_norm_var": 0.08448893229166667, "learning_rate": 4e-05, "loss": 5.1231, "loss/crossentropy": 1.3642852455377579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15733711794018745, "step": 648 }, { "epoch": 0.05416666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.07732747395833334, "learning_rate": 4e-05, "loss": 5.221, "loss/crossentropy": 1.8772388100624084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18816200830042362, "step": 650 }, { "epoch": 0.05433333333333333, "grad_norm": 5.375, "grad_norm_var": 0.08318684895833334, "learning_rate": 4e-05, "loss": 4.9661, "loss/crossentropy": 2.054552912712097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23715369030833244, "step": 652 }, { "epoch": 0.0545, "grad_norm": 5.28125, "grad_norm_var": 0.08352457682291667, "learning_rate": 4e-05, "loss": 5.0736, "loss/crossentropy": 2.033175766468048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20897972956299782, "step": 654 }, { "epoch": 0.05466666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.07459309895833334, "learning_rate": 4e-05, "loss": 5.2757, "loss/crossentropy": 1.4109216630458832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16402552276849747, "step": 656 }, { "epoch": 0.05483333333333333, "grad_norm": 5.53125, "grad_norm_var": 0.063525390625, "learning_rate": 4e-05, "loss": 4.4992, "loss/crossentropy": 1.7298256531357765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20376906543970108, "step": 658 }, { "epoch": 0.055, "grad_norm": 5.6875, "grad_norm_var": 0.07955322265625, "learning_rate": 4e-05, "loss": 5.2287, "loss/crossentropy": 2.059411734342575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22227568551898003, "step": 660 }, { "epoch": 0.05516666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.05319010416666667, "learning_rate": 4e-05, "loss": 5.1509, "loss/crossentropy": 2.0033040791749954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19995213858783245, "step": 662 }, { "epoch": 0.05533333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.059794108072916664, "learning_rate": 4e-05, "loss": 5.3328, "loss/crossentropy": 2.360860764980316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22881463915109634, "step": 664 }, { "epoch": 0.0555, "grad_norm": 4.9375, "grad_norm_var": 0.07548421223958333, "learning_rate": 4e-05, "loss": 4.7419, "loss/crossentropy": 1.968344509601593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20371564105153084, "step": 666 }, { "epoch": 0.05566666666666667, "grad_norm": 5.5625, "grad_norm_var": 0.07323811848958334, "learning_rate": 4e-05, "loss": 4.8958, "loss/crossentropy": 2.8875539898872375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2515605129301548, "step": 668 }, { "epoch": 0.05583333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.15545247395833334, "learning_rate": 4e-05, "loss": 5.5544, "loss/crossentropy": 1.9591965079307556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150409147143364, "step": 670 }, { "epoch": 0.056, "grad_norm": 5.65625, "grad_norm_var": 0.1416015625, "learning_rate": 4e-05, "loss": 5.1414, "loss/crossentropy": 1.8480764627456665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19637847691774368, "step": 672 }, { "epoch": 0.05616666666666666, "grad_norm": 5.46875, "grad_norm_var": 0.1478515625, "learning_rate": 4e-05, "loss": 5.4557, "loss/crossentropy": 1.8511382415890694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19494801573455334, "step": 674 }, { "epoch": 0.05633333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.14837239583333334, "learning_rate": 4e-05, "loss": 5.1891, "loss/crossentropy": 2.2563489973545074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20653247460722923, "step": 676 }, { "epoch": 0.0565, "grad_norm": 4.75, "grad_norm_var": 0.20735270182291668, "learning_rate": 4e-05, "loss": 5.0675, "loss/crossentropy": 2.1194111332297325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18754742667078972, "step": 678 }, { "epoch": 0.056666666666666664, "grad_norm": 5.03125, "grad_norm_var": 0.20435791015625, "learning_rate": 4e-05, "loss": 4.8841, "loss/crossentropy": 2.106343001127243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2169276624917984, "step": 680 }, { "epoch": 0.05683333333333333, "grad_norm": 5.125, "grad_norm_var": 0.1974609375, "learning_rate": 4e-05, "loss": 4.692, "loss/crossentropy": 2.522699236869812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24442313238978386, "step": 682 }, { "epoch": 0.057, "grad_norm": 5.0625, "grad_norm_var": 0.22665608723958333, "learning_rate": 4e-05, "loss": 4.0882, "loss/crossentropy": 2.3042386770248413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26410669833421707, "step": 684 }, { "epoch": 0.057166666666666664, "grad_norm": 5.34375, "grad_norm_var": 0.13101806640625, "learning_rate": 4e-05, "loss": 4.6953, "loss/crossentropy": 2.2301080226898193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25920237600803375, "step": 686 }, { "epoch": 0.05733333333333333, "grad_norm": 5.6875, "grad_norm_var": 0.14117431640625, "learning_rate": 4e-05, "loss": 4.9626, "loss/crossentropy": 1.1905392110347748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1609304826706648, "step": 688 }, { "epoch": 0.0575, "grad_norm": 5.0625, "grad_norm_var": 0.14351806640625, "learning_rate": 4e-05, "loss": 4.9173, "loss/crossentropy": 2.519528329372406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2300243265926838, "step": 690 }, { "epoch": 0.057666666666666665, "grad_norm": 5.25, "grad_norm_var": 0.145947265625, "learning_rate": 4e-05, "loss": 4.8009, "loss/crossentropy": 2.0772966742515564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19900832697749138, "step": 692 }, { "epoch": 0.057833333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.08697509765625, "learning_rate": 4e-05, "loss": 5.1798, "loss/crossentropy": 1.7827163264155388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19675160013139248, "step": 694 }, { "epoch": 0.058, "grad_norm": 5.1875, "grad_norm_var": 0.08430989583333333, "learning_rate": 4e-05, "loss": 4.8942, "loss/crossentropy": 1.373624011874199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1578064877539873, "step": 696 }, { "epoch": 0.058166666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.0779296875, "learning_rate": 4e-05, "loss": 5.0116, "loss/crossentropy": 2.012943536043167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20599104836583138, "step": 698 }, { "epoch": 0.058333333333333334, "grad_norm": 5.5625, "grad_norm_var": 0.051025390625, "learning_rate": 4e-05, "loss": 5.2032, "loss/crossentropy": 2.2552223205566406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2347894161939621, "step": 700 }, { "epoch": 0.0585, "grad_norm": 5.1875, "grad_norm_var": 0.05080973307291667, "learning_rate": 4e-05, "loss": 4.7641, "loss/crossentropy": 1.840671882033348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20984739437699318, "step": 702 }, { "epoch": 0.058666666666666666, "grad_norm": 5.5, "grad_norm_var": 0.04462483723958333, "learning_rate": 4e-05, "loss": 4.653, "loss/crossentropy": 1.0027276128530502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13207154348492622, "step": 704 }, { "epoch": 0.058833333333333335, "grad_norm": 5.84375, "grad_norm_var": 0.06643473307291667, "learning_rate": 4e-05, "loss": 5.1619, "loss/crossentropy": 2.2675763964653015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21700335666537285, "step": 706 }, { "epoch": 0.059, "grad_norm": 4.9375, "grad_norm_var": 0.07395426432291667, "learning_rate": 4e-05, "loss": 5.1794, "loss/crossentropy": 2.1361162662506104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916163712739944, "step": 708 }, { "epoch": 0.059166666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.08739827473958334, "learning_rate": 4e-05, "loss": 4.8597, "loss/crossentropy": 1.6593957543373108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18479419499635696, "step": 710 }, { "epoch": 0.059333333333333335, "grad_norm": 4.8125, "grad_norm_var": 0.10388997395833334, "learning_rate": 4e-05, "loss": 4.3248, "loss/crossentropy": 1.333509661257267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22560935281217098, "step": 712 }, { "epoch": 0.0595, "grad_norm": 5.125, "grad_norm_var": 0.10705973307291666, "learning_rate": 4e-05, "loss": 4.9656, "loss/crossentropy": 2.230701059103012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20880433917045593, "step": 714 }, { "epoch": 0.059666666666666666, "grad_norm": 5.375, "grad_norm_var": 0.09498291015625, "learning_rate": 4e-05, "loss": 5.2067, "loss/crossentropy": 2.4413784742355347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24665574729442596, "step": 716 }, { "epoch": 0.059833333333333336, "grad_norm": 5.71875, "grad_norm_var": 0.111181640625, "learning_rate": 4e-05, "loss": 4.8928, "loss/crossentropy": 2.3639025390148163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20420604944229126, "step": 718 }, { "epoch": 0.06, "grad_norm": 5.625, "grad_norm_var": 0.10507405598958333, "learning_rate": 4e-05, "loss": 4.6914, "loss/crossentropy": 2.329402983188629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23727432638406754, "step": 720 }, { "epoch": 0.06016666666666667, "grad_norm": 5.25, "grad_norm_var": 0.08019205729166666, "learning_rate": 4e-05, "loss": 5.5923, "loss/crossentropy": 2.584353506565094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22987722977995872, "step": 722 }, { "epoch": 0.060333333333333336, "grad_norm": 5.375, "grad_norm_var": 0.06521809895833333, "learning_rate": 4e-05, "loss": 4.9543, "loss/crossentropy": 1.8400915935635567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17739013954997063, "step": 724 }, { "epoch": 0.0605, "grad_norm": 5.375, "grad_norm_var": 0.04706624348958333, "learning_rate": 4e-05, "loss": 5.5607, "loss/crossentropy": 2.403924733400345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308141142129898, "step": 726 }, { "epoch": 0.06066666666666667, "grad_norm": 5.5625, "grad_norm_var": 0.029488118489583333, "learning_rate": 4e-05, "loss": 5.0417, "loss/crossentropy": 1.4918632730841637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17629101499915123, "step": 728 }, { "epoch": 0.060833333333333336, "grad_norm": 5.8125, "grad_norm_var": 0.038916015625, "learning_rate": 4e-05, "loss": 5.2212, "loss/crossentropy": 2.5051349997520447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24874291196465492, "step": 730 }, { "epoch": 0.061, "grad_norm": 5.15625, "grad_norm_var": 0.03756510416666667, "learning_rate": 4e-05, "loss": 4.7708, "loss/crossentropy": 1.9273648858070374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22050853073596954, "step": 732 }, { "epoch": 0.06116666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.046223958333333336, "learning_rate": 4e-05, "loss": 4.8641, "loss/crossentropy": 2.55005943775177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2286563366651535, "step": 734 }, { "epoch": 0.06133333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.06383056640625, "learning_rate": 4e-05, "loss": 4.474, "loss/crossentropy": 0.9446901753544807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12489008717238903, "step": 736 }, { "epoch": 0.0615, "grad_norm": 4.90625, "grad_norm_var": 0.08017171223958333, "learning_rate": 4e-05, "loss": 5.0295, "loss/crossentropy": 2.087219849228859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19661586359143257, "step": 738 }, { "epoch": 0.06166666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.08253580729166667, "learning_rate": 4e-05, "loss": 5.4304, "loss/crossentropy": 2.1698725819587708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202838696539402, "step": 740 }, { "epoch": 0.06183333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.08683268229166667, "learning_rate": 4e-05, "loss": 4.4203, "loss/crossentropy": 1.530371643602848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16617339104413986, "step": 742 }, { "epoch": 0.062, "grad_norm": 5.0, "grad_norm_var": 0.08704020182291666, "learning_rate": 4e-05, "loss": 4.9379, "loss/crossentropy": 2.3629955649375916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2189258188009262, "step": 744 }, { "epoch": 0.06216666666666667, "grad_norm": 5.9375, "grad_norm_var": 0.10247395833333334, "learning_rate": 4e-05, "loss": 4.9963, "loss/crossentropy": 2.0986749082803726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2822576127946377, "step": 746 }, { "epoch": 0.06233333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.11256510416666667, "learning_rate": 4e-05, "loss": 4.6594, "loss/crossentropy": 2.3157600462436676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23773421347141266, "step": 748 }, { "epoch": 0.0625, "grad_norm": 4.78125, "grad_norm_var": 0.10819905598958333, "learning_rate": 4e-05, "loss": 4.7568, "loss/crossentropy": 1.8990642204880714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18217475526034832, "step": 750 }, { "epoch": 0.06266666666666666, "grad_norm": 5.0, "grad_norm_var": 0.14478759765625, "learning_rate": 4e-05, "loss": 4.5526, "loss/crossentropy": 2.3672779500484467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21740539371967316, "step": 752 }, { "epoch": 0.06283333333333334, "grad_norm": 5.53125, "grad_norm_var": 0.136181640625, "learning_rate": 4e-05, "loss": 5.4863, "loss/crossentropy": 2.2541432678699493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22684243321418762, "step": 754 }, { "epoch": 0.063, "grad_norm": 5.59375, "grad_norm_var": 0.14283447265625, "learning_rate": 4e-05, "loss": 4.582, "loss/crossentropy": 1.5108322128653526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16953840106725693, "step": 756 }, { "epoch": 0.06316666666666666, "grad_norm": 6.21875, "grad_norm_var": 0.19959309895833333, "learning_rate": 4e-05, "loss": 5.1294, "loss/crossentropy": 1.2453164830803871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14559439942240715, "step": 758 }, { "epoch": 0.06333333333333334, "grad_norm": 5.0, "grad_norm_var": 0.19706624348958332, "learning_rate": 4e-05, "loss": 4.524, "loss/crossentropy": 2.1696812510490417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20982523635029793, "step": 760 }, { "epoch": 0.0635, "grad_norm": 4.9375, "grad_norm_var": 0.172509765625, "learning_rate": 4e-05, "loss": 5.2126, "loss/crossentropy": 2.0176029577851295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048779744654894, "step": 762 }, { "epoch": 0.06366666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.17336832682291667, "learning_rate": 4e-05, "loss": 5.1471, "loss/crossentropy": 2.5985326170921326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24666956812143326, "step": 764 }, { "epoch": 0.06383333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.15237223307291667, "learning_rate": 4e-05, "loss": 4.9156, "loss/crossentropy": 1.7296672835946083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16684475913643837, "step": 766 }, { "epoch": 0.064, "grad_norm": 5.0625, "grad_norm_var": 0.11751302083333333, "learning_rate": 4e-05, "loss": 5.1446, "loss/crossentropy": 1.9281784817576408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19502447918057442, "step": 768 }, { "epoch": 0.06416666666666666, "grad_norm": 5.0, "grad_norm_var": 0.11482747395833333, "learning_rate": 4e-05, "loss": 4.8494, "loss/crossentropy": 1.8453112244606018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19226614944636822, "step": 770 }, { "epoch": 0.06433333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.10813395182291667, "learning_rate": 4e-05, "loss": 4.4943, "loss/crossentropy": 1.4456355720758438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1608578786253929, "step": 772 }, { "epoch": 0.0645, "grad_norm": 5.125, "grad_norm_var": 0.030171712239583332, "learning_rate": 4e-05, "loss": 4.7407, "loss/crossentropy": 1.93793186545372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842864453792572, "step": 774 }, { "epoch": 0.06466666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.02496337890625, "learning_rate": 4e-05, "loss": 5.1067, "loss/crossentropy": 2.04061222076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23408568277955055, "step": 776 }, { "epoch": 0.06483333333333334, "grad_norm": 5.65625, "grad_norm_var": 0.04267171223958333, "learning_rate": 4e-05, "loss": 5.3864, "loss/crossentropy": 2.418355941772461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2686479911208153, "step": 778 }, { "epoch": 0.065, "grad_norm": 5.8125, "grad_norm_var": 0.05896809895833333, "learning_rate": 4e-05, "loss": 5.8333, "loss/crossentropy": 1.8340658321976662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828531064093113, "step": 780 }, { "epoch": 0.06516666666666666, "grad_norm": 5.375, "grad_norm_var": 0.06099853515625, "learning_rate": 4e-05, "loss": 5.004, "loss/crossentropy": 1.4804940819740295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16691730543971062, "step": 782 }, { "epoch": 0.06533333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.07131754557291667, "learning_rate": 4e-05, "loss": 5.1147, "loss/crossentropy": 2.2358897924423218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22555414587259293, "step": 784 }, { "epoch": 0.0655, "grad_norm": 5.15625, "grad_norm_var": 0.06549479166666666, "learning_rate": 4e-05, "loss": 5.1507, "loss/crossentropy": 2.0352462232112885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2000010460615158, "step": 786 }, { "epoch": 0.06566666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.06360677083333334, "learning_rate": 4e-05, "loss": 5.1654, "loss/crossentropy": 2.098189502954483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18786033987998962, "step": 788 }, { "epoch": 0.06583333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.09657796223958333, "learning_rate": 4e-05, "loss": 4.6436, "loss/crossentropy": 2.23826864361763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015607375651598, "step": 790 }, { "epoch": 0.066, "grad_norm": 5.71875, "grad_norm_var": 0.1150390625, "learning_rate": 4e-05, "loss": 5.1651, "loss/crossentropy": 1.690386563539505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726808026432991, "step": 792 }, { "epoch": 0.06616666666666667, "grad_norm": 5.75, "grad_norm_var": 0.12457275390625, "learning_rate": 4e-05, "loss": 5.9104, "loss/crossentropy": 2.5857779383659363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23685766011476517, "step": 794 }, { "epoch": 0.06633333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.10857747395833334, "learning_rate": 4e-05, "loss": 5.2371, "loss/crossentropy": 1.768482819199562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18570737540721893, "step": 796 }, { "epoch": 0.0665, "grad_norm": 5.46875, "grad_norm_var": 0.10833333333333334, "learning_rate": 4e-05, "loss": 5.4012, "loss/crossentropy": 1.9267660677433014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040906585752964, "step": 798 }, { "epoch": 0.06666666666666667, "grad_norm": 5.8125, "grad_norm_var": 0.1189453125, "learning_rate": 4e-05, "loss": 4.9879, "loss/crossentropy": 1.2167518213391304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16967863216996193, "step": 800 }, { "epoch": 0.06683333333333333, "grad_norm": 6.59375, "grad_norm_var": 0.22284749348958333, "learning_rate": 4e-05, "loss": 5.419, "loss/crossentropy": 2.580377459526062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2212601639330387, "step": 802 }, { "epoch": 0.067, "grad_norm": 5.28125, "grad_norm_var": 0.21803385416666668, "learning_rate": 4e-05, "loss": 5.1168, "loss/crossentropy": 1.6435775309801102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17711565271019936, "step": 804 }, { "epoch": 0.06716666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.177197265625, "learning_rate": 4e-05, "loss": 4.8107, "loss/crossentropy": 1.8337387293577194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18238316662609577, "step": 806 }, { "epoch": 0.06733333333333333, "grad_norm": 5.375, "grad_norm_var": 0.16861979166666666, "learning_rate": 4e-05, "loss": 4.6552, "loss/crossentropy": 0.9152474626898766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1301976777613163, "step": 808 }, { "epoch": 0.0675, "grad_norm": 5.25, "grad_norm_var": 0.15790608723958333, "learning_rate": 4e-05, "loss": 5.2066, "loss/crossentropy": 2.6412184834480286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262093760073185, "step": 810 }, { "epoch": 0.06766666666666667, "grad_norm": 5.625, "grad_norm_var": 0.16119384765625, "learning_rate": 4e-05, "loss": 5.1448, "loss/crossentropy": 2.0506534948945045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1795985009521246, "step": 812 }, { "epoch": 0.06783333333333333, "grad_norm": 5.78125, "grad_norm_var": 0.17060139973958333, "learning_rate": 4e-05, "loss": 5.3269, "loss/crossentropy": 1.978536695241928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082153595983982, "step": 814 }, { "epoch": 0.068, "grad_norm": 5.125, "grad_norm_var": 0.18036702473958333, "learning_rate": 4e-05, "loss": 4.9719, "loss/crossentropy": 2.047212928533554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22651727497577667, "step": 816 }, { "epoch": 0.06816666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.07636311848958334, "learning_rate": 4e-05, "loss": 4.6999, "loss/crossentropy": 2.19197478890419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21344346180558205, "step": 818 }, { "epoch": 0.06833333333333333, "grad_norm": 5.6875, "grad_norm_var": 0.08183186848958333, "learning_rate": 4e-05, "loss": 4.8976, "loss/crossentropy": 1.737916611135006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20098767057061195, "step": 820 }, { "epoch": 0.0685, "grad_norm": 5.3125, "grad_norm_var": 0.08097330729166667, "learning_rate": 4e-05, "loss": 4.8823, "loss/crossentropy": 2.3596703112125397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2249809466302395, "step": 822 }, { "epoch": 0.06866666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.07472330729166667, "learning_rate": 4e-05, "loss": 4.9886, "loss/crossentropy": 1.005987472832203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1333060059696436, "step": 824 }, { "epoch": 0.06883333333333333, "grad_norm": 5.75, "grad_norm_var": 0.11842447916666667, "learning_rate": 4e-05, "loss": 4.2789, "loss/crossentropy": 1.277719035744667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.184982817620039, "step": 826 }, { "epoch": 0.069, "grad_norm": 5.46875, "grad_norm_var": 0.12177327473958334, "learning_rate": 4e-05, "loss": 5.2915, "loss/crossentropy": 2.51472669839859, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24901026487350464, "step": 828 }, { "epoch": 0.06916666666666667, "grad_norm": 4.875, "grad_norm_var": 0.12773030598958332, "learning_rate": 4e-05, "loss": 4.5775, "loss/crossentropy": 1.783848948776722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1819527931511402, "step": 830 }, { "epoch": 0.06933333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.08843994140625, "learning_rate": 4e-05, "loss": 4.9177, "loss/crossentropy": 1.785573087632656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884115468710661, "step": 832 }, { "epoch": 0.0695, "grad_norm": 5.59375, "grad_norm_var": 0.10284830729166666, "learning_rate": 4e-05, "loss": 5.365, "loss/crossentropy": 2.576684892177582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24840709939599037, "step": 834 }, { "epoch": 0.06966666666666667, "grad_norm": 5.5, "grad_norm_var": 0.10690104166666667, "learning_rate": 4e-05, "loss": 4.9888, "loss/crossentropy": 1.324866883456707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16928620636463165, "step": 836 }, { "epoch": 0.06983333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.10640869140625, "learning_rate": 4e-05, "loss": 4.7214, "loss/crossentropy": 1.2735597863793373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14277022145688534, "step": 838 }, { "epoch": 0.07, "grad_norm": 5.65625, "grad_norm_var": 0.10634358723958333, "learning_rate": 4e-05, "loss": 5.2884, "loss/crossentropy": 2.3253634870052338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503334864974022, "step": 840 }, { "epoch": 0.07016666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.06145833333333333, "learning_rate": 4e-05, "loss": 5.0755, "loss/crossentropy": 2.1621678471565247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23412977531552315, "step": 842 }, { "epoch": 0.07033333333333333, "grad_norm": 5.5, "grad_norm_var": 0.05701497395833333, "learning_rate": 4e-05, "loss": 4.9175, "loss/crossentropy": 1.3182961717247963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1545308232307434, "step": 844 }, { "epoch": 0.0705, "grad_norm": 5.5625, "grad_norm_var": 0.04241129557291667, "learning_rate": 4e-05, "loss": 4.7885, "loss/crossentropy": 1.8972595036029816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1849633026868105, "step": 846 }, { "epoch": 0.07066666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.05201416015625, "learning_rate": 4e-05, "loss": 5.4305, "loss/crossentropy": 1.8853968381881714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18923737294971943, "step": 848 }, { "epoch": 0.07083333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.03893229166666667, "learning_rate": 4e-05, "loss": 4.8156, "loss/crossentropy": 2.07659313082695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22809230163693428, "step": 850 }, { "epoch": 0.071, "grad_norm": 5.8125, "grad_norm_var": 0.042252604166666666, "learning_rate": 4e-05, "loss": 4.8969, "loss/crossentropy": 1.9546649530529976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20475487783551216, "step": 852 }, { "epoch": 0.07116666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.04049479166666667, "learning_rate": 4e-05, "loss": 4.9251, "loss/crossentropy": 1.9904922246932983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2668594792485237, "step": 854 }, { "epoch": 0.07133333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.040755208333333334, "learning_rate": 4e-05, "loss": 4.869, "loss/crossentropy": 1.5018320679664612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16269180364906788, "step": 856 }, { "epoch": 0.0715, "grad_norm": 5.40625, "grad_norm_var": 0.04816080729166667, "learning_rate": 4e-05, "loss": 5.0008, "loss/crossentropy": 1.5820802375674248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1965535432100296, "step": 858 }, { "epoch": 0.07166666666666667, "grad_norm": 5.5, "grad_norm_var": 0.049544270833333334, "learning_rate": 4e-05, "loss": 5.2759, "loss/crossentropy": 2.0585487335920334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19966792315244675, "step": 860 }, { "epoch": 0.07183333333333333, "grad_norm": 5.375, "grad_norm_var": 0.10250244140625, "learning_rate": 4e-05, "loss": 5.383, "loss/crossentropy": 1.9323284551501274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18302668258547783, "step": 862 }, { "epoch": 0.072, "grad_norm": 5.28125, "grad_norm_var": 0.09348958333333333, "learning_rate": 4e-05, "loss": 5.2908, "loss/crossentropy": 2.5324109196662903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370966337621212, "step": 864 }, { "epoch": 0.07216666666666667, "grad_norm": 5.75, "grad_norm_var": 0.09452718098958333, "learning_rate": 4e-05, "loss": 4.9592, "loss/crossentropy": 1.6841916590929031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897643618285656, "step": 866 }, { "epoch": 0.07233333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.09479166666666666, "learning_rate": 4e-05, "loss": 5.0178, "loss/crossentropy": 2.266584038734436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22946883738040924, "step": 868 }, { "epoch": 0.0725, "grad_norm": 5.65625, "grad_norm_var": 0.10764567057291667, "learning_rate": 4e-05, "loss": 5.0285, "loss/crossentropy": 2.6628386974334717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22233451902866364, "step": 870 }, { "epoch": 0.07266666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.15286051432291667, "learning_rate": 4e-05, "loss": 5.2623, "loss/crossentropy": 2.4069382548332214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2290056124329567, "step": 872 }, { "epoch": 0.07283333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.14179280598958333, "learning_rate": 4e-05, "loss": 5.2452, "loss/crossentropy": 1.9541796445846558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18380443193018436, "step": 874 }, { "epoch": 0.073, "grad_norm": 5.28125, "grad_norm_var": 0.139306640625, "learning_rate": 4e-05, "loss": 5.198, "loss/crossentropy": 1.4988721013069153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17175763100385666, "step": 876 }, { "epoch": 0.07316666666666667, "grad_norm": 5.0, "grad_norm_var": 0.10328369140625, "learning_rate": 4e-05, "loss": 4.9121, "loss/crossentropy": 2.0601812303066254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19267475232481956, "step": 878 }, { "epoch": 0.07333333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.13046875, "learning_rate": 4e-05, "loss": 4.3653, "loss/crossentropy": 1.9160602986812592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2581901587545872, "step": 880 }, { "epoch": 0.0735, "grad_norm": 5.59375, "grad_norm_var": 0.14569905598958333, "learning_rate": 4e-05, "loss": 4.7792, "loss/crossentropy": 2.4729442596435547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2501170076429844, "step": 882 }, { "epoch": 0.07366666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.14511311848958333, "learning_rate": 4e-05, "loss": 5.2312, "loss/crossentropy": 2.5770280361175537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22895906120538712, "step": 884 }, { "epoch": 0.07383333333333333, "grad_norm": 5.5, "grad_norm_var": 0.15028889973958334, "learning_rate": 4e-05, "loss": 4.6341, "loss/crossentropy": 1.9654364585876465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19148295372724533, "step": 886 }, { "epoch": 0.074, "grad_norm": 5.1875, "grad_norm_var": 0.09972330729166666, "learning_rate": 4e-05, "loss": 4.9945, "loss/crossentropy": 2.1374219059944153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22470850497484207, "step": 888 }, { "epoch": 0.07416666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.09351806640625, "learning_rate": 4e-05, "loss": 5.1861, "loss/crossentropy": 2.381446748971939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19887309148907661, "step": 890 }, { "epoch": 0.07433333333333333, "grad_norm": 5.375, "grad_norm_var": 0.09345296223958334, "learning_rate": 4e-05, "loss": 4.8414, "loss/crossentropy": 1.1452403292059898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17032541893422604, "step": 892 }, { "epoch": 0.0745, "grad_norm": 5.78125, "grad_norm_var": 0.17688802083333333, "learning_rate": 4e-05, "loss": 5.0405, "loss/crossentropy": 2.2590576112270355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102867066860199, "step": 894 }, { "epoch": 0.07466666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.13743082682291666, "learning_rate": 4e-05, "loss": 5.2191, "loss/crossentropy": 1.7941122353076935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186736473813653, "step": 896 }, { "epoch": 0.07483333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.12220052083333334, "learning_rate": 4e-05, "loss": 5.1269, "loss/crossentropy": 2.2678189873695374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22983458638191223, "step": 898 }, { "epoch": 0.075, "grad_norm": 5.375, "grad_norm_var": 0.13631184895833334, "learning_rate": 4e-05, "loss": 4.8294, "loss/crossentropy": 2.2537818551063538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2183692753314972, "step": 900 }, { "epoch": 0.07516666666666667, "grad_norm": 5.5, "grad_norm_var": 0.12668863932291666, "learning_rate": 4e-05, "loss": 4.7788, "loss/crossentropy": 2.1383658349514008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21728335320949554, "step": 902 }, { "epoch": 0.07533333333333334, "grad_norm": 5.71875, "grad_norm_var": 0.13644205729166667, "learning_rate": 4e-05, "loss": 5.0809, "loss/crossentropy": 1.899217240512371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22059489786624908, "step": 904 }, { "epoch": 0.0755, "grad_norm": 5.125, "grad_norm_var": 0.14010416666666667, "learning_rate": 4e-05, "loss": 5.072, "loss/crossentropy": 2.597853124141693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22190867736935616, "step": 906 }, { "epoch": 0.07566666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.14192708333333334, "learning_rate": 4e-05, "loss": 5.0169, "loss/crossentropy": 2.3640182316303253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234844408929348, "step": 908 }, { "epoch": 0.07583333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.04889322916666667, "learning_rate": 4e-05, "loss": 4.3463, "loss/crossentropy": 1.7232627272605896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17844930291175842, "step": 910 }, { "epoch": 0.076, "grad_norm": 5.40625, "grad_norm_var": 0.07498372395833333, "learning_rate": 4e-05, "loss": 4.2796, "loss/crossentropy": 1.3739222288131714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15678460523486137, "step": 912 }, { "epoch": 0.07616666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.096728515625, "learning_rate": 4e-05, "loss": 4.9171, "loss/crossentropy": 2.0244703590869904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22794625163078308, "step": 914 }, { "epoch": 0.07633333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.10305989583333333, "learning_rate": 4e-05, "loss": 4.9391, "loss/crossentropy": 2.0325954258441925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.203802440315485, "step": 916 }, { "epoch": 0.0765, "grad_norm": 5.125, "grad_norm_var": 0.1103515625, "learning_rate": 4e-05, "loss": 4.8463, "loss/crossentropy": 2.536973237991333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22789884731173515, "step": 918 }, { "epoch": 0.07666666666666666, "grad_norm": 5.5, "grad_norm_var": 0.10354410807291667, "learning_rate": 4e-05, "loss": 4.956, "loss/crossentropy": 2.464049816131592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22606119513511658, "step": 920 }, { "epoch": 0.07683333333333334, "grad_norm": 5.625, "grad_norm_var": 0.10116780598958333, "learning_rate": 4e-05, "loss": 5.1667, "loss/crossentropy": 1.8012469932436943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2026966456323862, "step": 922 }, { "epoch": 0.077, "grad_norm": 4.875, "grad_norm_var": 0.12096354166666666, "learning_rate": 4e-05, "loss": 4.3867, "loss/crossentropy": 0.7537075951695442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11622426472604275, "step": 924 }, { "epoch": 0.07716666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.14560139973958333, "learning_rate": 4e-05, "loss": 4.9043, "loss/crossentropy": 2.0230683609843254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19815506786108017, "step": 926 }, { "epoch": 0.07733333333333334, "grad_norm": 5.8125, "grad_norm_var": 0.11731363932291666, "learning_rate": 4e-05, "loss": 5.4114, "loss/crossentropy": 2.449110984802246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23653070628643036, "step": 928 }, { "epoch": 0.0775, "grad_norm": 5.21875, "grad_norm_var": 0.11243489583333334, "learning_rate": 4e-05, "loss": 4.9905, "loss/crossentropy": 1.4106080010533333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20390398800373077, "step": 930 }, { "epoch": 0.07766666666666666, "grad_norm": 5.125, "grad_norm_var": 0.10862223307291667, "learning_rate": 4e-05, "loss": 5.1747, "loss/crossentropy": 2.168779395520687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18856257386505604, "step": 932 }, { "epoch": 0.07783333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.09322916666666667, "learning_rate": 4e-05, "loss": 5.3753, "loss/crossentropy": 1.9819502532482147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20197083055973053, "step": 934 }, { "epoch": 0.078, "grad_norm": 6.0, "grad_norm_var": 0.12967122395833333, "learning_rate": 4e-05, "loss": 4.586, "loss/crossentropy": 2.2913994789123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21365177258849144, "step": 936 }, { "epoch": 0.07816666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.1279296875, "learning_rate": 4e-05, "loss": 5.4079, "loss/crossentropy": 2.3492658138275146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2506399601697922, "step": 938 }, { "epoch": 0.07833333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.10709228515625, "learning_rate": 4e-05, "loss": 5.1427, "loss/crossentropy": 2.3426185250282288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21920472010970116, "step": 940 }, { "epoch": 0.0785, "grad_norm": 5.5, "grad_norm_var": 0.09029947916666667, "learning_rate": 4e-05, "loss": 5.7286, "loss/crossentropy": 1.9868685603141785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1850258819758892, "step": 942 }, { "epoch": 0.07866666666666666, "grad_norm": 5.65625, "grad_norm_var": 0.080859375, "learning_rate": 4e-05, "loss": 5.1068, "loss/crossentropy": 2.3053890466690063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2049488164484501, "step": 944 }, { "epoch": 0.07883333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.09524739583333333, "learning_rate": 4e-05, "loss": 5.1353, "loss/crossentropy": 2.455892562866211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22708288580179214, "step": 946 }, { "epoch": 0.079, "grad_norm": 5.625, "grad_norm_var": 0.09212239583333333, "learning_rate": 4e-05, "loss": 5.5379, "loss/crossentropy": 2.7193942070007324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23220528662204742, "step": 948 }, { "epoch": 0.07916666666666666, "grad_norm": 5.375, "grad_norm_var": 0.08515218098958334, "learning_rate": 4e-05, "loss": 5.3782, "loss/crossentropy": 2.350933760404587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21525833755731583, "step": 950 }, { "epoch": 0.07933333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.046708170572916666, "learning_rate": 4e-05, "loss": 5.1386, "loss/crossentropy": 1.689025953412056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20566146075725555, "step": 952 }, { "epoch": 0.0795, "grad_norm": 5.40625, "grad_norm_var": 0.04062093098958333, "learning_rate": 4e-05, "loss": 4.5698, "loss/crossentropy": 1.6841574162244797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17244276031851768, "step": 954 }, { "epoch": 0.07966666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.040755208333333334, "learning_rate": 4e-05, "loss": 5.2786, "loss/crossentropy": 1.983876220881939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17574166506528854, "step": 956 }, { "epoch": 0.07983333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.04439697265625, "learning_rate": 4e-05, "loss": 4.7437, "loss/crossentropy": 1.4942948892712593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328424882143736, "step": 958 }, { "epoch": 0.08, "grad_norm": 5.25, "grad_norm_var": 0.03677978515625, "learning_rate": 4e-05, "loss": 5.1834, "loss/crossentropy": 1.8319725766777992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1919633485376835, "step": 960 }, { "epoch": 0.08016666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.021480305989583334, "learning_rate": 4e-05, "loss": 5.5035, "loss/crossentropy": 1.7946438565850258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19422894716262817, "step": 962 }, { "epoch": 0.08033333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.040262858072916664, "learning_rate": 4e-05, "loss": 5.0136, "loss/crossentropy": 1.359002597630024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14611583948135376, "step": 964 }, { "epoch": 0.0805, "grad_norm": 5.34375, "grad_norm_var": 0.040848795572916666, "learning_rate": 4e-05, "loss": 5.1417, "loss/crossentropy": 1.8300999030470848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20316387340426445, "step": 966 }, { "epoch": 0.08066666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.04348551432291667, "learning_rate": 4e-05, "loss": 5.4677, "loss/crossentropy": 2.5086329579353333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2248428463935852, "step": 968 }, { "epoch": 0.08083333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.04659830729166667, "learning_rate": 4e-05, "loss": 4.9775, "loss/crossentropy": 1.6595203876495361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1784625742584467, "step": 970 }, { "epoch": 0.081, "grad_norm": 5.21875, "grad_norm_var": 0.04006754557291667, "learning_rate": 4e-05, "loss": 4.5601, "loss/crossentropy": 2.0515496730804443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251333761960268, "step": 972 }, { "epoch": 0.08116666666666666, "grad_norm": 5.9375, "grad_norm_var": 0.07532145182291666, "learning_rate": 4e-05, "loss": 4.9809, "loss/crossentropy": 2.0578393265604973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044786848127842, "step": 974 }, { "epoch": 0.08133333333333333, "grad_norm": 5.5, "grad_norm_var": 0.07902018229166667, "learning_rate": 4e-05, "loss": 5.3295, "loss/crossentropy": 2.3250069618225098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22813353687524796, "step": 976 }, { "epoch": 0.0815, "grad_norm": 5.46875, "grad_norm_var": 0.10435791015625, "learning_rate": 4e-05, "loss": 4.2614, "loss/crossentropy": 1.668112076818943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1645719800144434, "step": 978 }, { "epoch": 0.08166666666666667, "grad_norm": 5.0, "grad_norm_var": 0.09332275390625, "learning_rate": 4e-05, "loss": 4.5831, "loss/crossentropy": 2.0174410790205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.194509482011199, "step": 980 }, { "epoch": 0.08183333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.10445556640625, "learning_rate": 4e-05, "loss": 4.487, "loss/crossentropy": 1.5212369486689568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15324652940034866, "step": 982 }, { "epoch": 0.082, "grad_norm": 5.625, "grad_norm_var": 0.11470947265625, "learning_rate": 4e-05, "loss": 5.4141, "loss/crossentropy": 2.1497460901737213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21780480071902275, "step": 984 }, { "epoch": 0.08216666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.12024332682291666, "learning_rate": 4e-05, "loss": 4.9362, "loss/crossentropy": 2.4396926164627075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23286250606179237, "step": 986 }, { "epoch": 0.08233333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.12483317057291667, "learning_rate": 4e-05, "loss": 5.0999, "loss/crossentropy": 1.889777421951294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20137444324791431, "step": 988 }, { "epoch": 0.0825, "grad_norm": 5.4375, "grad_norm_var": 0.09685872395833334, "learning_rate": 4e-05, "loss": 5.0512, "loss/crossentropy": 1.6567106246948242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837901659309864, "step": 990 }, { "epoch": 0.08266666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.12862955729166667, "learning_rate": 4e-05, "loss": 5.1472, "loss/crossentropy": 2.333681643009186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2757877744734287, "step": 992 }, { "epoch": 0.08283333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.09720052083333333, "learning_rate": 4e-05, "loss": 4.9003, "loss/crossentropy": 2.455785632133484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.235845647752285, "step": 994 }, { "epoch": 0.083, "grad_norm": 5.59375, "grad_norm_var": 0.10006103515625, "learning_rate": 4e-05, "loss": 4.7869, "loss/crossentropy": 1.6923584789037704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18443119525909424, "step": 996 }, { "epoch": 0.08316666666666667, "grad_norm": 5.59375, "grad_norm_var": 0.12935791015625, "learning_rate": 4e-05, "loss": 5.2978, "loss/crossentropy": 2.250712603330612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22323672845959663, "step": 998 }, { "epoch": 0.08333333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.14816080729166667, "learning_rate": 4e-05, "loss": 4.8683, "loss/crossentropy": 1.629243291914463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17515265941619873, "step": 1000 }, { "epoch": 0.0835, "grad_norm": 4.90625, "grad_norm_var": 0.163134765625, "learning_rate": 4e-05, "loss": 4.8394, "loss/crossentropy": 1.5451477617025375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16557494178414345, "step": 1002 }, { "epoch": 0.08366666666666667, "grad_norm": 5.5, "grad_norm_var": 0.16617431640625, "learning_rate": 4e-05, "loss": 5.3454, "loss/crossentropy": 2.027478814125061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930145062506199, "step": 1004 }, { "epoch": 0.08383333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.16291910807291668, "learning_rate": 4e-05, "loss": 5.4164, "loss/crossentropy": 1.7522178888320923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17787319794297218, "step": 1006 }, { "epoch": 0.084, "grad_norm": 5.375, "grad_norm_var": 0.12831624348958334, "learning_rate": 4e-05, "loss": 5.1326, "loss/crossentropy": 2.2606292963027954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22609979286789894, "step": 1008 }, { "epoch": 0.08416666666666667, "grad_norm": 5.375, "grad_norm_var": 0.12669270833333332, "learning_rate": 4e-05, "loss": 4.6972, "loss/crossentropy": 1.3283646404743195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16521522216498852, "step": 1010 }, { "epoch": 0.08433333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.12975260416666667, "learning_rate": 4e-05, "loss": 4.4952, "loss/crossentropy": 1.5593970566987991, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17167419753968716, "step": 1012 }, { "epoch": 0.0845, "grad_norm": 6.46875, "grad_norm_var": 0.16975504557291668, "learning_rate": 4e-05, "loss": 4.9853, "loss/crossentropy": 1.7582807093858719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954742781817913, "step": 1014 }, { "epoch": 0.08466666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.14810791015625, "learning_rate": 4e-05, "loss": 5.5712, "loss/crossentropy": 1.4745187312364578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.175180334597826, "step": 1016 }, { "epoch": 0.08483333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.1140625, "learning_rate": 4e-05, "loss": 5.02, "loss/crossentropy": 1.4914978370070457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.153540201485157, "step": 1018 }, { "epoch": 0.085, "grad_norm": 5.21875, "grad_norm_var": 0.11285400390625, "learning_rate": 4e-05, "loss": 5.0581, "loss/crossentropy": 2.0033098682761192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19129659608006477, "step": 1020 }, { "epoch": 0.08516666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.11636962890625, "learning_rate": 4e-05, "loss": 4.9019, "loss/crossentropy": 1.3103836476802826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14399930834770203, "step": 1022 }, { "epoch": 0.08533333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.11998291015625, "learning_rate": 4e-05, "loss": 5.3554, "loss/crossentropy": 2.316011965274811, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24070628732442856, "step": 1024 }, { "epoch": 0.0855, "grad_norm": 5.3125, "grad_norm_var": 0.135400390625, "learning_rate": 4e-05, "loss": 5.0308, "loss/crossentropy": 1.4706613272428513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16036737896502018, "step": 1026 }, { "epoch": 0.08566666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.12779541015625, "learning_rate": 4e-05, "loss": 4.9992, "loss/crossentropy": 2.273362785577774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22011291980743408, "step": 1028 }, { "epoch": 0.08583333333333333, "grad_norm": 5.75, "grad_norm_var": 0.05761311848958333, "learning_rate": 4e-05, "loss": 4.8171, "loss/crossentropy": 1.5396167114377022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16673108749091625, "step": 1030 }, { "epoch": 0.086, "grad_norm": 5.34375, "grad_norm_var": 0.20703125, "learning_rate": 4e-05, "loss": 5.0549, "loss/crossentropy": 2.5265402793884277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24674354121088982, "step": 1032 }, { "epoch": 0.08616666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.23644205729166667, "learning_rate": 4e-05, "loss": 4.7003, "loss/crossentropy": 1.2078011631965637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17747630923986435, "step": 1034 }, { "epoch": 0.08633333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.23554280598958333, "learning_rate": 4e-05, "loss": 5.3133, "loss/crossentropy": 2.0871264040470123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24350010231137276, "step": 1036 }, { "epoch": 0.0865, "grad_norm": 5.40625, "grad_norm_var": 0.22498372395833333, "learning_rate": 4e-05, "loss": 4.7851, "loss/crossentropy": 1.9023667722940445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18097983859479427, "step": 1038 }, { "epoch": 0.08666666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.25533854166666664, "learning_rate": 4e-05, "loss": 4.3374, "loss/crossentropy": 1.3959245532751083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1558135487139225, "step": 1040 }, { "epoch": 0.08683333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.24257405598958334, "learning_rate": 4e-05, "loss": 5.2833, "loss/crossentropy": 2.175060898065567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21963898465037346, "step": 1042 }, { "epoch": 0.087, "grad_norm": 4.78125, "grad_norm_var": 0.27828369140625, "learning_rate": 4e-05, "loss": 5.4538, "loss/crossentropy": 2.4693975150585175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25492599606513977, "step": 1044 }, { "epoch": 0.08716666666666667, "grad_norm": 7.6875, "grad_norm_var": 0.59791259765625, "learning_rate": 4e-05, "loss": 4.7527, "loss/crossentropy": 1.8814911097288132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21333661675453186, "step": 1046 }, { "epoch": 0.08733333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.4727701822916667, "learning_rate": 4e-05, "loss": 4.6145, "loss/crossentropy": 1.8344381749629974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21359984576702118, "step": 1048 }, { "epoch": 0.0875, "grad_norm": 4.78125, "grad_norm_var": 0.48202718098958336, "learning_rate": 4e-05, "loss": 5.2823, "loss/crossentropy": 2.050938367843628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1791608016937971, "step": 1050 }, { "epoch": 0.08766666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.48342692057291664, "learning_rate": 4e-05, "loss": 5.0677, "loss/crossentropy": 2.4595237970352173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22237907722592354, "step": 1052 }, { "epoch": 0.08783333333333333, "grad_norm": 5.375, "grad_norm_var": 0.4832967122395833, "learning_rate": 4e-05, "loss": 5.6448, "loss/crossentropy": 2.6399565935134888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271853476762772, "step": 1054 }, { "epoch": 0.088, "grad_norm": 5.25, "grad_norm_var": 0.4495930989583333, "learning_rate": 4e-05, "loss": 4.7361, "loss/crossentropy": 1.2053988501429558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15466507151722908, "step": 1056 }, { "epoch": 0.08816666666666667, "grad_norm": 5.46875, "grad_norm_var": 0.461181640625, "learning_rate": 4e-05, "loss": 4.9132, "loss/crossentropy": 1.488468736410141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14729905128479004, "step": 1058 }, { "epoch": 0.08833333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.4491170247395833, "learning_rate": 4e-05, "loss": 4.8853, "loss/crossentropy": 1.5609579607844353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16526676714420319, "step": 1060 }, { "epoch": 0.0885, "grad_norm": 5.46875, "grad_norm_var": 0.07265218098958333, "learning_rate": 4e-05, "loss": 4.6105, "loss/crossentropy": 1.6217800825834274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17694772593677044, "step": 1062 }, { "epoch": 0.08866666666666667, "grad_norm": 8.0625, "grad_norm_var": 0.61412353515625, "learning_rate": 4e-05, "loss": 5.2193, "loss/crossentropy": 1.8461291044950485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21077187731862068, "step": 1064 }, { "epoch": 0.08883333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.60703125, "learning_rate": 4e-05, "loss": 5.1246, "loss/crossentropy": 2.6400803327560425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22616837173700333, "step": 1066 }, { "epoch": 0.089, "grad_norm": 5.46875, "grad_norm_var": 0.6093587239583333, "learning_rate": 4e-05, "loss": 5.0405, "loss/crossentropy": 2.375422090291977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20153706148266792, "step": 1068 }, { "epoch": 0.08916666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.6176432291666667, "learning_rate": 4e-05, "loss": 4.6603, "loss/crossentropy": 2.2146050930023193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24322227016091347, "step": 1070 }, { "epoch": 0.08933333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.6146769205729167, "learning_rate": 4e-05, "loss": 4.8999, "loss/crossentropy": 1.8436658903956413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19220684841275215, "step": 1072 }, { "epoch": 0.0895, "grad_norm": 5.15625, "grad_norm_var": 0.6200358072916666, "learning_rate": 4e-05, "loss": 5.1488, "loss/crossentropy": 1.8162973299622536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20228558778762817, "step": 1074 }, { "epoch": 0.08966666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.6080729166666666, "learning_rate": 4e-05, "loss": 3.9845, "loss/crossentropy": 2.2521041929721832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21630793064832687, "step": 1076 }, { "epoch": 0.08983333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.5907389322916666, "learning_rate": 4e-05, "loss": 4.899, "loss/crossentropy": 2.8008521795272827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2185939960181713, "step": 1078 }, { "epoch": 0.09, "grad_norm": 5.25, "grad_norm_var": 0.04781494140625, "learning_rate": 4e-05, "loss": 5.1706, "loss/crossentropy": 2.006529211997986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23423625528812408, "step": 1080 }, { "epoch": 0.09016666666666667, "grad_norm": 4.875, "grad_norm_var": 0.043359375, "learning_rate": 4e-05, "loss": 4.5149, "loss/crossentropy": 1.5252627283334732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16670545563101768, "step": 1082 }, { "epoch": 0.09033333333333333, "grad_norm": 5.0, "grad_norm_var": 0.029801432291666666, "learning_rate": 4e-05, "loss": 5.1486, "loss/crossentropy": 1.7940563037991524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2000715285539627, "step": 1084 }, { "epoch": 0.0905, "grad_norm": 6.03125, "grad_norm_var": 0.07154541015625, "learning_rate": 4e-05, "loss": 5.0886, "loss/crossentropy": 2.1735753268003464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20067806914448738, "step": 1086 }, { "epoch": 0.09066666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.07135009765625, "learning_rate": 4e-05, "loss": 4.8474, "loss/crossentropy": 2.2438295483589172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555602967739105, "step": 1088 }, { "epoch": 0.09083333333333334, "grad_norm": 5.75, "grad_norm_var": 0.09073893229166667, "learning_rate": 4e-05, "loss": 5.2058, "loss/crossentropy": 1.4616017490625381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21333470940589905, "step": 1090 }, { "epoch": 0.091, "grad_norm": 5.90625, "grad_norm_var": 0.11666259765625, "learning_rate": 4e-05, "loss": 4.6305, "loss/crossentropy": 2.167073041200638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22877944260835648, "step": 1092 }, { "epoch": 0.09116666666666666, "grad_norm": 6.0, "grad_norm_var": 0.14568684895833334, "learning_rate": 4e-05, "loss": 5.1547, "loss/crossentropy": 2.292715698480606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23625801876187325, "step": 1094 }, { "epoch": 0.09133333333333334, "grad_norm": 5.375, "grad_norm_var": 0.1615234375, "learning_rate": 4e-05, "loss": 4.6894, "loss/crossentropy": 1.8984995782375336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20002290606498718, "step": 1096 }, { "epoch": 0.0915, "grad_norm": 5.53125, "grad_norm_var": 0.15767822265625, "learning_rate": 4e-05, "loss": 5.3258, "loss/crossentropy": 2.3215838074684143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.239714615046978, "step": 1098 }, { "epoch": 0.09166666666666666, "grad_norm": 5.28125, "grad_norm_var": 0.153759765625, "learning_rate": 4e-05, "loss": 5.659, "loss/crossentropy": 2.0826582312583923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21085572242736816, "step": 1100 }, { "epoch": 0.09183333333333334, "grad_norm": 5.75, "grad_norm_var": 0.14205322265625, "learning_rate": 4e-05, "loss": 5.2229, "loss/crossentropy": 1.4214537590742111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1674542874097824, "step": 1102 }, { "epoch": 0.092, "grad_norm": 5.125, "grad_norm_var": 0.14312744140625, "learning_rate": 4e-05, "loss": 4.8583, "loss/crossentropy": 1.8252490535378456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999596320092678, "step": 1104 }, { "epoch": 0.09216666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.9711873372395833, "learning_rate": 4e-05, "loss": 4.6134, "loss/crossentropy": 1.295023687183857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15225711092352867, "step": 1106 }, { "epoch": 0.09233333333333334, "grad_norm": 5.8125, "grad_norm_var": 0.9461873372395834, "learning_rate": 4e-05, "loss": 4.8484, "loss/crossentropy": 1.2159418240189552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1386381033807993, "step": 1108 }, { "epoch": 0.0925, "grad_norm": 5.0625, "grad_norm_var": 0.9702962239583334, "learning_rate": 4e-05, "loss": 5.0996, "loss/crossentropy": 2.4133604764938354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145095020532608, "step": 1110 }, { "epoch": 0.09266666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.9756510416666667, "learning_rate": 4e-05, "loss": 4.8344, "loss/crossentropy": 1.7319612950086594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1731079574674368, "step": 1112 }, { "epoch": 0.09283333333333334, "grad_norm": 5.0, "grad_norm_var": 0.9765625, "learning_rate": 4e-05, "loss": 4.8517, "loss/crossentropy": 1.6233867853879929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842523030936718, "step": 1114 }, { "epoch": 0.093, "grad_norm": 5.40625, "grad_norm_var": 0.9680826822916667, "learning_rate": 4e-05, "loss": 4.8547, "loss/crossentropy": 2.48843851685524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22499863803386688, "step": 1116 }, { "epoch": 0.09316666666666666, "grad_norm": 5.0, "grad_norm_var": 0.98170166015625, "learning_rate": 4e-05, "loss": 5.0257, "loss/crossentropy": 1.9700958281755447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18995188921689987, "step": 1118 }, { "epoch": 0.09333333333333334, "grad_norm": 5.5, "grad_norm_var": 0.9780232747395833, "learning_rate": 4e-05, "loss": 5.2743, "loss/crossentropy": 2.393950343132019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21254169195890427, "step": 1120 }, { "epoch": 0.0935, "grad_norm": 4.875, "grad_norm_var": 0.07667643229166667, "learning_rate": 4e-05, "loss": 4.3726, "loss/crossentropy": 2.0441563352942467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.176620876416564, "step": 1122 }, { "epoch": 0.09366666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.05546875, "learning_rate": 4e-05, "loss": 4.7267, "loss/crossentropy": 1.8536287397146225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18368082493543625, "step": 1124 }, { "epoch": 0.09383333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.0544921875, "learning_rate": 4e-05, "loss": 5.0803, "loss/crossentropy": 0.8758844807744026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12684419937431812, "step": 1126 }, { "epoch": 0.094, "grad_norm": 5.5, "grad_norm_var": 0.048726399739583336, "learning_rate": 4e-05, "loss": 5.1615, "loss/crossentropy": 2.5835047364234924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21840446069836617, "step": 1128 }, { "epoch": 0.09416666666666666, "grad_norm": 5.25, "grad_norm_var": 0.04986979166666667, "learning_rate": 4e-05, "loss": 5.5816, "loss/crossentropy": 2.6384198665618896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2301637977361679, "step": 1130 }, { "epoch": 0.09433333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.056103515625, "learning_rate": 4e-05, "loss": 5.4229, "loss/crossentropy": 1.1250766292214394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18534447066485882, "step": 1132 }, { "epoch": 0.0945, "grad_norm": 5.125, "grad_norm_var": 0.04959309895833333, "learning_rate": 4e-05, "loss": 5.3554, "loss/crossentropy": 2.3953994810581207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2530173920094967, "step": 1134 }, { "epoch": 0.09466666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.048828125, "learning_rate": 4e-05, "loss": 4.88, "loss/crossentropy": 1.7053054720163345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1791569832712412, "step": 1136 }, { "epoch": 0.09483333333333334, "grad_norm": 6.03125, "grad_norm_var": 0.06383056640625, "learning_rate": 4e-05, "loss": 5.3785, "loss/crossentropy": 2.3179805874824524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2322893813252449, "step": 1138 }, { "epoch": 0.095, "grad_norm": 5.15625, "grad_norm_var": 0.07327067057291667, "learning_rate": 4e-05, "loss": 4.8176, "loss/crossentropy": 1.420469008386135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16073052026331425, "step": 1140 }, { "epoch": 0.09516666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.08088785807291667, "learning_rate": 4e-05, "loss": 4.3205, "loss/crossentropy": 2.4501261115074158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22779357805848122, "step": 1142 }, { "epoch": 0.09533333333333334, "grad_norm": 5.3125, "grad_norm_var": 0.07838541666666667, "learning_rate": 4e-05, "loss": 5.4399, "loss/crossentropy": 1.6310898885130882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19184290245175362, "step": 1144 }, { "epoch": 0.0955, "grad_norm": 5.5, "grad_norm_var": 0.09289957682291666, "learning_rate": 4e-05, "loss": 4.874, "loss/crossentropy": 2.2742528915405273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111048549413681, "step": 1146 }, { "epoch": 0.09566666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.08596598307291667, "learning_rate": 4e-05, "loss": 4.5276, "loss/crossentropy": 2.360960155725479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2365327812731266, "step": 1148 }, { "epoch": 0.09583333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.08917643229166666, "learning_rate": 4e-05, "loss": 5.2565, "loss/crossentropy": 2.354514867067337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22909708321094513, "step": 1150 }, { "epoch": 0.096, "grad_norm": 5.15625, "grad_norm_var": 0.08723958333333333, "learning_rate": 4e-05, "loss": 4.9971, "loss/crossentropy": 1.930092841386795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069223504513502, "step": 1152 }, { "epoch": 0.09616666666666666, "grad_norm": 5.71875, "grad_norm_var": 0.083056640625, "learning_rate": 4e-05, "loss": 4.5473, "loss/crossentropy": 2.1483106315135956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045624665915966, "step": 1154 }, { "epoch": 0.09633333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.5146484375, "learning_rate": 4e-05, "loss": 5.0278, "loss/crossentropy": 1.9653250426054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19056643545627594, "step": 1156 }, { "epoch": 0.0965, "grad_norm": 5.0, "grad_norm_var": 0.53082275390625, "learning_rate": 4e-05, "loss": 4.2204, "loss/crossentropy": 1.59404868632555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19967875257134438, "step": 1158 }, { "epoch": 0.09666666666666666, "grad_norm": 5.65625, "grad_norm_var": 0.5387003580729167, "learning_rate": 4e-05, "loss": 4.9826, "loss/crossentropy": 2.1325821727514267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22309079766273499, "step": 1160 }, { "epoch": 0.09683333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.5381510416666667, "learning_rate": 4e-05, "loss": 4.3877, "loss/crossentropy": 1.4681326597929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22608055919408798, "step": 1162 }, { "epoch": 0.097, "grad_norm": 5.59375, "grad_norm_var": 0.5563639322916667, "learning_rate": 4e-05, "loss": 4.3296, "loss/crossentropy": 1.7761568650603294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17438078857958317, "step": 1164 }, { "epoch": 0.09716666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.5484659830729167, "learning_rate": 4e-05, "loss": 4.6415, "loss/crossentropy": 2.402270257472992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308974713087082, "step": 1166 }, { "epoch": 0.09733333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.5458333333333333, "learning_rate": 4e-05, "loss": 5.0955, "loss/crossentropy": 2.493393361568451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2291024811565876, "step": 1168 }, { "epoch": 0.0975, "grad_norm": 5.3125, "grad_norm_var": 0.5023396809895834, "learning_rate": 4e-05, "loss": 5.3826, "loss/crossentropy": 2.0367672443389893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1950318068265915, "step": 1170 }, { "epoch": 0.09766666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.09302978515625, "learning_rate": 4e-05, "loss": 4.8491, "loss/crossentropy": 1.9876883029937744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22394466400146484, "step": 1172 }, { "epoch": 0.09783333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.08215738932291666, "learning_rate": 4e-05, "loss": 5.3828, "loss/crossentropy": 3.0178449749946594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20590073242783546, "step": 1174 }, { "epoch": 0.098, "grad_norm": 5.25, "grad_norm_var": 0.07343343098958334, "learning_rate": 4e-05, "loss": 4.7851, "loss/crossentropy": 1.9217498302459717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19339833036065102, "step": 1176 }, { "epoch": 0.09816666666666667, "grad_norm": 6.375, "grad_norm_var": 0.13677978515625, "learning_rate": 4e-05, "loss": 5.4871, "loss/crossentropy": 2.4020891785621643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366964928805828, "step": 1178 }, { "epoch": 0.09833333333333333, "grad_norm": 5.125, "grad_norm_var": 0.12213134765625, "learning_rate": 4e-05, "loss": 4.8593, "loss/crossentropy": 2.4006099104881287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22088930010795593, "step": 1180 }, { "epoch": 0.0985, "grad_norm": 6.5, "grad_norm_var": 0.21571858723958334, "learning_rate": 4e-05, "loss": 5.0819, "loss/crossentropy": 2.0782680213451385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24928846955299377, "step": 1182 }, { "epoch": 0.09866666666666667, "grad_norm": 5.5, "grad_norm_var": 0.21457926432291666, "learning_rate": 4e-05, "loss": 5.0726, "loss/crossentropy": 2.0695590674877167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18602534383535385, "step": 1184 }, { "epoch": 0.09883333333333333, "grad_norm": 5.78125, "grad_norm_var": 0.22304280598958334, "learning_rate": 4e-05, "loss": 4.7877, "loss/crossentropy": 1.6022805571556091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.210852961987257, "step": 1186 }, { "epoch": 0.099, "grad_norm": 5.375, "grad_norm_var": 0.18235677083333332, "learning_rate": 4e-05, "loss": 5.1089, "loss/crossentropy": 2.142100676894188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20843049511313438, "step": 1188 }, { "epoch": 0.09916666666666667, "grad_norm": 5.59375, "grad_norm_var": 0.23609619140625, "learning_rate": 4e-05, "loss": 4.787, "loss/crossentropy": 1.4725098609924316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18635604158043861, "step": 1190 }, { "epoch": 0.09933333333333333, "grad_norm": 5.625, "grad_norm_var": 0.245166015625, "learning_rate": 4e-05, "loss": 5.169, "loss/crossentropy": 2.237101376056671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352110929787159, "step": 1192 }, { "epoch": 0.0995, "grad_norm": 5.46875, "grad_norm_var": 0.18853759765625, "learning_rate": 4e-05, "loss": 5.5126, "loss/crossentropy": 2.0248168110847473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19281497597694397, "step": 1194 }, { "epoch": 0.09966666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.19179280598958334, "learning_rate": 4e-05, "loss": 5.1818, "loss/crossentropy": 1.6968555450439453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19379430450499058, "step": 1196 }, { "epoch": 0.09983333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.10631103515625, "learning_rate": 4e-05, "loss": 4.8209, "loss/crossentropy": 1.5468868017196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16440916433930397, "step": 1198 }, { "epoch": 0.1, "grad_norm": 5.125, "grad_norm_var": 0.10435791015625, "learning_rate": 4e-05, "loss": 5.0316, "loss/crossentropy": 1.8919531255960464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1950981542468071, "step": 1200 }, { "epoch": 0.10016666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.0984375, "learning_rate": 4e-05, "loss": 5.147, "loss/crossentropy": 1.6899343207478523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17296983301639557, "step": 1202 }, { "epoch": 0.10033333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.10318603515625, "learning_rate": 4e-05, "loss": 4.8684, "loss/crossentropy": 2.129749298095703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22954006493091583, "step": 1204 }, { "epoch": 0.1005, "grad_norm": 6.0, "grad_norm_var": 0.08917643229166666, "learning_rate": 4e-05, "loss": 5.0732, "loss/crossentropy": 1.873815581202507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18716946989297867, "step": 1206 }, { "epoch": 0.10066666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.07224934895833333, "learning_rate": 4e-05, "loss": 5.2633, "loss/crossentropy": 2.32140251994133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23857327923178673, "step": 1208 }, { "epoch": 0.10083333333333333, "grad_norm": 5.0, "grad_norm_var": 0.07857666015625, "learning_rate": 4e-05, "loss": 5.4185, "loss/crossentropy": 2.4386764764785767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22701247781515121, "step": 1210 }, { "epoch": 0.101, "grad_norm": 5.0, "grad_norm_var": 0.06886393229166667, "learning_rate": 4e-05, "loss": 5.0207, "loss/crossentropy": 1.033334881067276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1509057618677616, "step": 1212 }, { "epoch": 0.10116666666666667, "grad_norm": 5.125, "grad_norm_var": 0.07232666015625, "learning_rate": 4e-05, "loss": 5.3547, "loss/crossentropy": 2.575928032398224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22841667383909225, "step": 1214 }, { "epoch": 0.10133333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.08534749348958333, "learning_rate": 4e-05, "loss": 4.6306, "loss/crossentropy": 1.8462003320455551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828253660351038, "step": 1216 }, { "epoch": 0.1015, "grad_norm": 5.5625, "grad_norm_var": 0.07909749348958334, "learning_rate": 4e-05, "loss": 4.7978, "loss/crossentropy": 2.3569419384002686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24894802644848824, "step": 1218 }, { "epoch": 0.10166666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.08370768229166667, "learning_rate": 4e-05, "loss": 5.6661, "loss/crossentropy": 2.0236599445343018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20601807162165642, "step": 1220 }, { "epoch": 0.10183333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.1169921875, "learning_rate": 4e-05, "loss": 4.9545, "loss/crossentropy": 2.3474625945091248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21253760159015656, "step": 1222 }, { "epoch": 0.102, "grad_norm": 5.09375, "grad_norm_var": 0.12795817057291667, "learning_rate": 4e-05, "loss": 5.1116, "loss/crossentropy": 2.0291855931282043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004396729171276, "step": 1224 }, { "epoch": 0.10216666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.13644205729166667, "learning_rate": 4e-05, "loss": 4.839, "loss/crossentropy": 1.786637932062149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793370395898819, "step": 1226 }, { "epoch": 0.10233333333333333, "grad_norm": 5.0, "grad_norm_var": 0.19013264973958333, "learning_rate": 4e-05, "loss": 4.4245, "loss/crossentropy": 1.5033576264977455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1820121891796589, "step": 1228 }, { "epoch": 0.1025, "grad_norm": 5.78125, "grad_norm_var": 0.20201822916666667, "learning_rate": 4e-05, "loss": 5.0492, "loss/crossentropy": 1.7019300237298012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1789207085967064, "step": 1230 }, { "epoch": 0.10266666666666667, "grad_norm": 4.875, "grad_norm_var": 0.21126302083333334, "learning_rate": 4e-05, "loss": 4.788, "loss/crossentropy": 1.907809428870678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18188271671533585, "step": 1232 }, { "epoch": 0.10283333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.21638997395833334, "learning_rate": 4e-05, "loss": 5.1517, "loss/crossentropy": 2.4552002549171448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21234703436493874, "step": 1234 }, { "epoch": 0.103, "grad_norm": 4.96875, "grad_norm_var": 0.20930582682291668, "learning_rate": 4e-05, "loss": 4.9628, "loss/crossentropy": 2.644266128540039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23479052260518074, "step": 1236 }, { "epoch": 0.10316666666666667, "grad_norm": 5.5, "grad_norm_var": 0.12864176432291666, "learning_rate": 4e-05, "loss": 5.3084, "loss/crossentropy": 2.67303067445755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22366882488131523, "step": 1238 }, { "epoch": 0.10333333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.111181640625, "learning_rate": 4e-05, "loss": 4.9053, "loss/crossentropy": 1.5672541037201881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16596291214227676, "step": 1240 }, { "epoch": 0.1035, "grad_norm": 4.625, "grad_norm_var": 0.10232747395833333, "learning_rate": 4e-05, "loss": 4.5866, "loss/crossentropy": 2.6629343032836914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24874060973525047, "step": 1242 }, { "epoch": 0.10366666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.08292643229166667, "learning_rate": 4e-05, "loss": 4.4504, "loss/crossentropy": 2.156973510980606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21799809858202934, "step": 1244 }, { "epoch": 0.10383333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.05780843098958333, "learning_rate": 4e-05, "loss": 5.2903, "loss/crossentropy": 1.837260901927948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21180275082588196, "step": 1246 }, { "epoch": 0.104, "grad_norm": 5.28125, "grad_norm_var": 0.0615234375, "learning_rate": 4e-05, "loss": 4.5459, "loss/crossentropy": 1.2645907923579216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14384145848453045, "step": 1248 }, { "epoch": 0.10416666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.06304931640625, "learning_rate": 4e-05, "loss": 5.0964, "loss/crossentropy": 2.430781602859497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22693831473588943, "step": 1250 }, { "epoch": 0.10433333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.3664021809895833, "learning_rate": 4e-05, "loss": 4.4491, "loss/crossentropy": 1.7102079764008522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991841085255146, "step": 1252 }, { "epoch": 0.1045, "grad_norm": 5.03125, "grad_norm_var": 0.36282145182291664, "learning_rate": 4e-05, "loss": 4.9358, "loss/crossentropy": 2.073123261332512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17315717414021492, "step": 1254 }, { "epoch": 0.10466666666666667, "grad_norm": 5.0, "grad_norm_var": 0.37418212890625, "learning_rate": 4e-05, "loss": 5.5336, "loss/crossentropy": 2.1373045444488525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219538614153862, "step": 1256 }, { "epoch": 0.10483333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.37675374348958335, "learning_rate": 4e-05, "loss": 4.912, "loss/crossentropy": 1.0041880533099174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13228822499513626, "step": 1258 }, { "epoch": 0.105, "grad_norm": 4.875, "grad_norm_var": 0.35985921223958334, "learning_rate": 4e-05, "loss": 5.418, "loss/crossentropy": 2.5306063890457153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2324695996940136, "step": 1260 }, { "epoch": 0.10516666666666667, "grad_norm": 5.125, "grad_norm_var": 0.352197265625, "learning_rate": 4e-05, "loss": 4.8752, "loss/crossentropy": 1.99101173132658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20560059323906898, "step": 1262 }, { "epoch": 0.10533333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.3563639322916667, "learning_rate": 4e-05, "loss": 4.5795, "loss/crossentropy": 1.2876827344298363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13764869794249535, "step": 1264 }, { "epoch": 0.1055, "grad_norm": 5.5625, "grad_norm_var": 0.3610310872395833, "learning_rate": 4e-05, "loss": 4.9981, "loss/crossentropy": 2.373332917690277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24393631890416145, "step": 1266 }, { "epoch": 0.10566666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.06151936848958333, "learning_rate": 4e-05, "loss": 4.7999, "loss/crossentropy": 2.0270435735583305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20645768009126186, "step": 1268 }, { "epoch": 0.10583333333333333, "grad_norm": 5.125, "grad_norm_var": 0.06073811848958333, "learning_rate": 4e-05, "loss": 4.6323, "loss/crossentropy": 1.5579545721411705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1702574621886015, "step": 1270 }, { "epoch": 0.106, "grad_norm": 5.15625, "grad_norm_var": 0.047265625, "learning_rate": 4e-05, "loss": 4.4562, "loss/crossentropy": 2.4982908964157104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2260519601404667, "step": 1272 }, { "epoch": 0.10616666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.03284098307291667, "learning_rate": 4e-05, "loss": 4.9248, "loss/crossentropy": 1.9325073957443237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19312690198421478, "step": 1274 }, { "epoch": 0.10633333333333334, "grad_norm": 5.375, "grad_norm_var": 0.031884765625, "learning_rate": 4e-05, "loss": 4.9711, "loss/crossentropy": 1.769273281097412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21776899322867393, "step": 1276 }, { "epoch": 0.1065, "grad_norm": 5.28125, "grad_norm_var": 0.028999837239583333, "learning_rate": 4e-05, "loss": 5.5337, "loss/crossentropy": 2.284360885620117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22272326424717903, "step": 1278 }, { "epoch": 0.10666666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.028499348958333334, "learning_rate": 4e-05, "loss": 5.2305, "loss/crossentropy": 1.8736866936087608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111079953610897, "step": 1280 }, { "epoch": 0.10683333333333334, "grad_norm": 5.125, "grad_norm_var": 0.04016927083333333, "learning_rate": 4e-05, "loss": 4.5859, "loss/crossentropy": 1.6894002929329872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19435017928481102, "step": 1282 }, { "epoch": 0.107, "grad_norm": 5.0625, "grad_norm_var": 0.03518473307291667, "learning_rate": 4e-05, "loss": 4.6698, "loss/crossentropy": 1.5436028242111206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1630682311952114, "step": 1284 }, { "epoch": 0.10716666666666666, "grad_norm": 5.84375, "grad_norm_var": 0.06803385416666667, "learning_rate": 4e-05, "loss": 5.3232, "loss/crossentropy": 2.6317964792251587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24785209447145462, "step": 1286 }, { "epoch": 0.10733333333333334, "grad_norm": 5.53125, "grad_norm_var": 0.10422770182291667, "learning_rate": 4e-05, "loss": 5.1804, "loss/crossentropy": 1.5235177874565125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16595890559256077, "step": 1288 }, { "epoch": 0.1075, "grad_norm": 5.0625, "grad_norm_var": 0.11223958333333334, "learning_rate": 4e-05, "loss": 5.0182, "loss/crossentropy": 2.0896430388092995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19103838317096233, "step": 1290 }, { "epoch": 0.10766666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.13593343098958333, "learning_rate": 4e-05, "loss": 4.7246, "loss/crossentropy": 2.059629112482071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21217556670308113, "step": 1292 }, { "epoch": 0.10783333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.15558268229166666, "learning_rate": 4e-05, "loss": 4.3552, "loss/crossentropy": 1.9174365252256393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18914584256708622, "step": 1294 }, { "epoch": 0.108, "grad_norm": 5.25, "grad_norm_var": 0.15556233723958332, "learning_rate": 4e-05, "loss": 4.8855, "loss/crossentropy": 2.258558452129364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21850135922431946, "step": 1296 }, { "epoch": 0.10816666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.14166666666666666, "learning_rate": 4e-05, "loss": 4.8919, "loss/crossentropy": 2.0549103915691376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19344163686037064, "step": 1298 }, { "epoch": 0.10833333333333334, "grad_norm": 4.875, "grad_norm_var": 0.14542643229166666, "learning_rate": 4e-05, "loss": 4.614, "loss/crossentropy": 2.098690018057823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2107415907084942, "step": 1300 }, { "epoch": 0.1085, "grad_norm": 5.375, "grad_norm_var": 0.11389567057291666, "learning_rate": 4e-05, "loss": 4.8096, "loss/crossentropy": 1.3393612429499626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17516088113188744, "step": 1302 }, { "epoch": 0.10866666666666666, "grad_norm": 5.125, "grad_norm_var": 0.04724934895833333, "learning_rate": 4e-05, "loss": 5.2506, "loss/crossentropy": 1.7870676293969154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19672731682658195, "step": 1304 }, { "epoch": 0.10883333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.04983317057291667, "learning_rate": 4e-05, "loss": 5.2597, "loss/crossentropy": 2.245271325111389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21325793489813805, "step": 1306 }, { "epoch": 0.109, "grad_norm": 6.0, "grad_norm_var": 0.09648030598958333, "learning_rate": 4e-05, "loss": 5.0179, "loss/crossentropy": 2.556147426366806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22807640954852104, "step": 1308 }, { "epoch": 0.10916666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.078759765625, "learning_rate": 4e-05, "loss": 4.9811, "loss/crossentropy": 2.244424045085907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22903723642230034, "step": 1310 }, { "epoch": 0.10933333333333334, "grad_norm": 5.125, "grad_norm_var": 0.0791015625, "learning_rate": 4e-05, "loss": 4.3534, "loss/crossentropy": 1.5312049500644207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16121600940823555, "step": 1312 }, { "epoch": 0.1095, "grad_norm": 6.0, "grad_norm_var": 0.10878499348958333, "learning_rate": 4e-05, "loss": 5.0244, "loss/crossentropy": 1.7539609968662262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039298675954342, "step": 1314 }, { "epoch": 0.10966666666666666, "grad_norm": 5.25, "grad_norm_var": 0.10193684895833334, "learning_rate": 4e-05, "loss": 5.0756, "loss/crossentropy": 2.2906831800937653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22330694645643234, "step": 1316 }, { "epoch": 0.10983333333333334, "grad_norm": 5.375, "grad_norm_var": 0.10165608723958333, "learning_rate": 4e-05, "loss": 5.4659, "loss/crossentropy": 2.422152817249298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22418388351798058, "step": 1318 }, { "epoch": 0.11, "grad_norm": 4.78125, "grad_norm_var": 0.12274983723958334, "learning_rate": 4e-05, "loss": 5.0434, "loss/crossentropy": 2.335483729839325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262851968407631, "step": 1320 }, { "epoch": 0.11016666666666666, "grad_norm": 5.0, "grad_norm_var": 0.11308186848958333, "learning_rate": 4e-05, "loss": 4.7446, "loss/crossentropy": 1.9849571883678436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22970320284366608, "step": 1322 }, { "epoch": 0.11033333333333334, "grad_norm": 5.0, "grad_norm_var": 0.08787434895833333, "learning_rate": 4e-05, "loss": 4.9389, "loss/crossentropy": 1.5644195303320885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21707364916801453, "step": 1324 }, { "epoch": 0.1105, "grad_norm": 4.90625, "grad_norm_var": 0.09641927083333333, "learning_rate": 4e-05, "loss": 5.1959, "loss/crossentropy": 2.2511331140995026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20980435609817505, "step": 1326 }, { "epoch": 0.11066666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.20533854166666668, "learning_rate": 4e-05, "loss": 5.3261, "loss/crossentropy": 2.4248663187026978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21466375887393951, "step": 1328 }, { "epoch": 0.11083333333333334, "grad_norm": 5.125, "grad_norm_var": 0.17899983723958332, "learning_rate": 4e-05, "loss": 4.4923, "loss/crossentropy": 2.2259855568408966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21654033288359642, "step": 1330 }, { "epoch": 0.111, "grad_norm": 5.78125, "grad_norm_var": 0.19322916666666667, "learning_rate": 4e-05, "loss": 5.5515, "loss/crossentropy": 2.337790846824646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23069706931710243, "step": 1332 }, { "epoch": 0.11116666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.21103108723958333, "learning_rate": 4e-05, "loss": 4.8015, "loss/crossentropy": 1.8960464149713516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18768173828721046, "step": 1334 }, { "epoch": 0.11133333333333334, "grad_norm": 5.25, "grad_norm_var": 0.19947916666666668, "learning_rate": 4e-05, "loss": 4.9181, "loss/crossentropy": 2.5201704502105713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23990615457296371, "step": 1336 }, { "epoch": 0.1115, "grad_norm": 5.125, "grad_norm_var": 0.20764567057291666, "learning_rate": 4e-05, "loss": 4.9746, "loss/crossentropy": 2.4421244263648987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21953672170639038, "step": 1338 }, { "epoch": 0.11166666666666666, "grad_norm": 4.875, "grad_norm_var": 0.230712890625, "learning_rate": 4e-05, "loss": 4.5514, "loss/crossentropy": 1.0926872938871384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1308863628655672, "step": 1340 }, { "epoch": 0.11183333333333334, "grad_norm": 4.875, "grad_norm_var": 0.23811442057291668, "learning_rate": 4e-05, "loss": 4.5832, "loss/crossentropy": 1.838907465338707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18402666971087456, "step": 1342 }, { "epoch": 0.112, "grad_norm": 4.84375, "grad_norm_var": 0.14998372395833334, "learning_rate": 4e-05, "loss": 4.5698, "loss/crossentropy": 0.5912249013781548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10033663548529148, "step": 1344 }, { "epoch": 0.11216666666666666, "grad_norm": 6.125, "grad_norm_var": 0.30123291015625, "learning_rate": 4e-05, "loss": 5.5643, "loss/crossentropy": 2.446168899536133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21937239170074463, "step": 1346 }, { "epoch": 0.11233333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.2791015625, "learning_rate": 4e-05, "loss": 5.3916, "loss/crossentropy": 2.0473891273140907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195414362475276, "step": 1348 }, { "epoch": 0.1125, "grad_norm": 5.09375, "grad_norm_var": 0.26171468098958334, "learning_rate": 4e-05, "loss": 4.8752, "loss/crossentropy": 2.4043519496917725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23139266297221184, "step": 1350 }, { "epoch": 0.11266666666666666, "grad_norm": 6.28125, "grad_norm_var": 0.3319295247395833, "learning_rate": 4e-05, "loss": 5.2582, "loss/crossentropy": 2.082743376493454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21164586022496223, "step": 1352 }, { "epoch": 0.11283333333333333, "grad_norm": 5.0, "grad_norm_var": 0.324609375, "learning_rate": 4e-05, "loss": 4.8166, "loss/crossentropy": 2.4302121698856354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206825278699398, "step": 1354 }, { "epoch": 0.113, "grad_norm": 4.84375, "grad_norm_var": 0.3073201497395833, "learning_rate": 4e-05, "loss": 4.5695, "loss/crossentropy": 1.4399050921201706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16213641315698624, "step": 1356 }, { "epoch": 0.11316666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.2970052083333333, "learning_rate": 4e-05, "loss": 5.0332, "loss/crossentropy": 1.3866655454039574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17396113276481628, "step": 1358 }, { "epoch": 0.11333333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.24954020182291667, "learning_rate": 4e-05, "loss": 4.807, "loss/crossentropy": 2.120422273874283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19081920012831688, "step": 1360 }, { "epoch": 0.1135, "grad_norm": 5.46875, "grad_norm_var": 0.1865234375, "learning_rate": 4e-05, "loss": 4.4229, "loss/crossentropy": 2.40205454826355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23431218788027763, "step": 1362 }, { "epoch": 0.11366666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.18375244140625, "learning_rate": 4e-05, "loss": 5.2446, "loss/crossentropy": 2.0221628546714783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2017949502915144, "step": 1364 }, { "epoch": 0.11383333333333333, "grad_norm": 5.25, "grad_norm_var": 0.18479410807291666, "learning_rate": 4e-05, "loss": 4.7709, "loss/crossentropy": 1.6139603182673454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16068048775196075, "step": 1366 }, { "epoch": 0.114, "grad_norm": 5.375, "grad_norm_var": 0.10015869140625, "learning_rate": 4e-05, "loss": 5.212, "loss/crossentropy": 2.3433795869350433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126586139202118, "step": 1368 }, { "epoch": 0.11416666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.09189046223958333, "learning_rate": 4e-05, "loss": 4.8335, "loss/crossentropy": 1.349827267229557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15882672742009163, "step": 1370 }, { "epoch": 0.11433333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.09112955729166666, "learning_rate": 4e-05, "loss": 5.3731, "loss/crossentropy": 2.245560199022293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21613704785704613, "step": 1372 }, { "epoch": 0.1145, "grad_norm": 5.1875, "grad_norm_var": 0.09959309895833333, "learning_rate": 4e-05, "loss": 5.1417, "loss/crossentropy": 2.2078827619552612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2603098005056381, "step": 1374 }, { "epoch": 0.11466666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.09361572265625, "learning_rate": 4e-05, "loss": 4.8827, "loss/crossentropy": 1.0163473710417747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13514012470841408, "step": 1376 }, { "epoch": 0.11483333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.04693603515625, "learning_rate": 4e-05, "loss": 5.586, "loss/crossentropy": 2.3435881435871124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20985861867666245, "step": 1378 }, { "epoch": 0.115, "grad_norm": 5.53125, "grad_norm_var": 0.060009765625, "learning_rate": 4e-05, "loss": 5.4013, "loss/crossentropy": 2.3194149136543274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22041558474302292, "step": 1380 }, { "epoch": 0.11516666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05862223307291667, "learning_rate": 4e-05, "loss": 4.7929, "loss/crossentropy": 1.5186148211359978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15534107387065887, "step": 1382 }, { "epoch": 0.11533333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.04973551432291667, "learning_rate": 4e-05, "loss": 5.4201, "loss/crossentropy": 2.5172139406204224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23081282153725624, "step": 1384 }, { "epoch": 0.1155, "grad_norm": 5.15625, "grad_norm_var": 0.05025634765625, "learning_rate": 4e-05, "loss": 4.9595, "loss/crossentropy": 1.8386990427970886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22093252837657928, "step": 1386 }, { "epoch": 0.11566666666666667, "grad_norm": 5.8125, "grad_norm_var": 0.07411702473958333, "learning_rate": 4e-05, "loss": 5.8714, "loss/crossentropy": 2.4365760684013367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22801436483860016, "step": 1388 }, { "epoch": 0.11583333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.08166910807291666, "learning_rate": 4e-05, "loss": 5.5638, "loss/crossentropy": 2.097976215183735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022643592208624, "step": 1390 }, { "epoch": 0.116, "grad_norm": 5.3125, "grad_norm_var": 0.07825113932291666, "learning_rate": 4e-05, "loss": 4.4936, "loss/crossentropy": 1.5444388389587402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18695401214063168, "step": 1392 }, { "epoch": 0.11616666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.09191080729166666, "learning_rate": 4e-05, "loss": 5.3733, "loss/crossentropy": 2.4442446529865265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22987202182412148, "step": 1394 }, { "epoch": 0.11633333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.08596598307291667, "learning_rate": 4e-05, "loss": 4.4895, "loss/crossentropy": 2.2781380712985992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2173830196261406, "step": 1396 }, { "epoch": 0.1165, "grad_norm": 5.28125, "grad_norm_var": 0.07771809895833333, "learning_rate": 4e-05, "loss": 5.594, "loss/crossentropy": 1.9904277324676514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20960525795817375, "step": 1398 }, { "epoch": 0.11666666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.07701416015625, "learning_rate": 4e-05, "loss": 4.8685, "loss/crossentropy": 1.894834741950035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187057975679636, "step": 1400 }, { "epoch": 0.11683333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.075244140625, "learning_rate": 4e-05, "loss": 4.952, "loss/crossentropy": 2.408271312713623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21414168551564217, "step": 1402 }, { "epoch": 0.117, "grad_norm": 5.1875, "grad_norm_var": 0.046728515625, "learning_rate": 4e-05, "loss": 4.8514, "loss/crossentropy": 1.8350339084863663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26108158379793167, "step": 1404 }, { "epoch": 0.11716666666666667, "grad_norm": 5.25, "grad_norm_var": 0.033056640625, "learning_rate": 4e-05, "loss": 4.9133, "loss/crossentropy": 2.0125522017478943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20184185728430748, "step": 1406 }, { "epoch": 0.11733333333333333, "grad_norm": 5.0, "grad_norm_var": 0.038525390625, "learning_rate": 4e-05, "loss": 4.7645, "loss/crossentropy": 1.840851441025734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736996527761221, "step": 1408 }, { "epoch": 0.1175, "grad_norm": 5.0625, "grad_norm_var": 0.02086181640625, "learning_rate": 4e-05, "loss": 5.0723, "loss/crossentropy": 1.8003590106964111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19614629819989204, "step": 1410 }, { "epoch": 0.11766666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.049544270833333334, "learning_rate": 4e-05, "loss": 4.4699, "loss/crossentropy": 1.66546168923378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1754789799451828, "step": 1412 }, { "epoch": 0.11783333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.05738525390625, "learning_rate": 4e-05, "loss": 5.0322, "loss/crossentropy": 2.2698044180870056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20972862467169762, "step": 1414 }, { "epoch": 0.118, "grad_norm": 4.875, "grad_norm_var": 0.05716145833333333, "learning_rate": 4e-05, "loss": 5.4831, "loss/crossentropy": 2.5912956595420837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23193010687828064, "step": 1416 }, { "epoch": 0.11816666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.05792643229166667, "learning_rate": 4e-05, "loss": 5.418, "loss/crossentropy": 2.321280747652054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22956770285964012, "step": 1418 }, { "epoch": 0.11833333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.22893473307291667, "learning_rate": 4e-05, "loss": 6.2402, "loss/crossentropy": 2.3734790682792664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2287864312529564, "step": 1420 }, { "epoch": 0.1185, "grad_norm": 5.21875, "grad_norm_var": 0.23326822916666667, "learning_rate": 4e-05, "loss": 5.1958, "loss/crossentropy": 2.5214961767196655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23864174634218216, "step": 1422 }, { "epoch": 0.11866666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.22447916666666667, "learning_rate": 4e-05, "loss": 5.4997, "loss/crossentropy": 2.4628164768218994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24057137593626976, "step": 1424 }, { "epoch": 0.11883333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.23722330729166666, "learning_rate": 4e-05, "loss": 4.7698, "loss/crossentropy": 1.5074485763907433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1644449681043625, "step": 1426 }, { "epoch": 0.119, "grad_norm": 4.96875, "grad_norm_var": 0.229150390625, "learning_rate": 4e-05, "loss": 5.1593, "loss/crossentropy": 1.9605501666665077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836913451552391, "step": 1428 }, { "epoch": 0.11916666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.22888997395833333, "learning_rate": 4e-05, "loss": 4.5879, "loss/crossentropy": 0.9429236724972725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12995466589927673, "step": 1430 }, { "epoch": 0.11933333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.229541015625, "learning_rate": 4e-05, "loss": 4.85, "loss/crossentropy": 2.3147625029087067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474900841712952, "step": 1432 }, { "epoch": 0.1195, "grad_norm": 5.78125, "grad_norm_var": 0.24944254557291667, "learning_rate": 4e-05, "loss": 5.2804, "loss/crossentropy": 2.4392059445381165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091044746339321, "step": 1434 }, { "epoch": 0.11966666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.10818684895833333, "learning_rate": 4e-05, "loss": 4.7755, "loss/crossentropy": 2.5641788244247437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216048277914524, "step": 1436 }, { "epoch": 0.11983333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.10390218098958333, "learning_rate": 4e-05, "loss": 4.5948, "loss/crossentropy": 1.3560013100504875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16775010898709297, "step": 1438 }, { "epoch": 0.12, "grad_norm": 5.1875, "grad_norm_var": 0.10220947265625, "learning_rate": 4e-05, "loss": 5.0291, "loss/crossentropy": 1.731564313173294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2059439942240715, "step": 1440 }, { "epoch": 0.12016666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.09065348307291667, "learning_rate": 4e-05, "loss": 4.8764, "loss/crossentropy": 1.6669957488775253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2174536045640707, "step": 1442 }, { "epoch": 0.12033333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.05572509765625, "learning_rate": 4e-05, "loss": 4.1789, "loss/crossentropy": 1.694481611251831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21173863485455513, "step": 1444 }, { "epoch": 0.1205, "grad_norm": 5.90625, "grad_norm_var": 0.09667561848958334, "learning_rate": 4e-05, "loss": 5.8553, "loss/crossentropy": 2.4365376234054565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22777355462312698, "step": 1446 }, { "epoch": 0.12066666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.0845703125, "learning_rate": 4e-05, "loss": 4.9573, "loss/crossentropy": 1.7493075802922249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954033151268959, "step": 1448 }, { "epoch": 0.12083333333333333, "grad_norm": 5.75, "grad_norm_var": 0.07862955729166667, "learning_rate": 4e-05, "loss": 5.2187, "loss/crossentropy": 2.0391087383031845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20556922629475594, "step": 1450 }, { "epoch": 0.121, "grad_norm": 5.15625, "grad_norm_var": 0.079150390625, "learning_rate": 4e-05, "loss": 5.0911, "loss/crossentropy": 2.0242467522621155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108009122312069, "step": 1452 }, { "epoch": 0.12116666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.07587483723958334, "learning_rate": 4e-05, "loss": 4.9383, "loss/crossentropy": 2.3943995237350464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22050346434116364, "step": 1454 }, { "epoch": 0.12133333333333333, "grad_norm": 5.0, "grad_norm_var": 0.07675374348958333, "learning_rate": 4e-05, "loss": 4.6839, "loss/crossentropy": 1.3471101224422455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15038909018039703, "step": 1456 }, { "epoch": 0.1215, "grad_norm": 5.46875, "grad_norm_var": 0.07496337890625, "learning_rate": 4e-05, "loss": 4.5277, "loss/crossentropy": 1.43942092359066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15480425581336021, "step": 1458 }, { "epoch": 0.12166666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.08983968098958334, "learning_rate": 4e-05, "loss": 4.7697, "loss/crossentropy": 1.4175751879811287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17789405956864357, "step": 1460 }, { "epoch": 0.12183333333333334, "grad_norm": 4.875, "grad_norm_var": 0.0681640625, "learning_rate": 4e-05, "loss": 4.8328, "loss/crossentropy": 1.766202375292778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20591039210557938, "step": 1462 }, { "epoch": 0.122, "grad_norm": 5.125, "grad_norm_var": 0.06751302083333334, "learning_rate": 4e-05, "loss": 5.0332, "loss/crossentropy": 1.4572946727275848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17859814316034317, "step": 1464 }, { "epoch": 0.12216666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.052718098958333334, "learning_rate": 4e-05, "loss": 4.6043, "loss/crossentropy": 1.657370686531067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17122356966137886, "step": 1466 }, { "epoch": 0.12233333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.055952962239583334, "learning_rate": 4e-05, "loss": 4.3136, "loss/crossentropy": 1.8274584114551544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.194772370159626, "step": 1468 }, { "epoch": 0.1225, "grad_norm": 4.875, "grad_norm_var": 0.058056640625, "learning_rate": 4e-05, "loss": 4.4468, "loss/crossentropy": 2.0565488040447235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20730895921587944, "step": 1470 }, { "epoch": 0.12266666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.05943603515625, "learning_rate": 4e-05, "loss": 4.9933, "loss/crossentropy": 2.241286873817444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22634489834308624, "step": 1472 }, { "epoch": 0.12283333333333334, "grad_norm": 6.53125, "grad_norm_var": 0.18683268229166666, "learning_rate": 4e-05, "loss": 4.3905, "loss/crossentropy": 1.1198562234640121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1370545905083418, "step": 1474 }, { "epoch": 0.123, "grad_norm": 4.84375, "grad_norm_var": 0.17771809895833332, "learning_rate": 4e-05, "loss": 4.7302, "loss/crossentropy": 1.3211162611842155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15940341539680958, "step": 1476 }, { "epoch": 0.12316666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.17636311848958333, "learning_rate": 4e-05, "loss": 4.942, "loss/crossentropy": 1.9065639302134514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158641517162323, "step": 1478 }, { "epoch": 0.12333333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.19544270833333333, "learning_rate": 4e-05, "loss": 4.5968, "loss/crossentropy": 1.284117877483368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15446274541318417, "step": 1480 }, { "epoch": 0.1235, "grad_norm": 5.1875, "grad_norm_var": 0.21178385416666667, "learning_rate": 4e-05, "loss": 5.2084, "loss/crossentropy": 2.334298014640808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24484197422862053, "step": 1482 }, { "epoch": 0.12366666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.20715738932291666, "learning_rate": 4e-05, "loss": 4.9366, "loss/crossentropy": 2.0920065343379974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23238061368465424, "step": 1484 }, { "epoch": 0.12383333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.216259765625, "learning_rate": 4e-05, "loss": 4.8202, "loss/crossentropy": 1.1066526547074318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14913895167410374, "step": 1486 }, { "epoch": 0.124, "grad_norm": 4.5625, "grad_norm_var": 0.23922119140625, "learning_rate": 4e-05, "loss": 4.0508, "loss/crossentropy": 1.4457052126526833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.148674588650465, "step": 1488 }, { "epoch": 0.12416666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.12756754557291666, "learning_rate": 4e-05, "loss": 4.7907, "loss/crossentropy": 2.526742786169052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22900137305259705, "step": 1490 }, { "epoch": 0.12433333333333334, "grad_norm": 5.6875, "grad_norm_var": 0.13847249348958332, "learning_rate": 4e-05, "loss": 5.2301, "loss/crossentropy": 3.177564024925232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22343944758176804, "step": 1492 }, { "epoch": 0.1245, "grad_norm": 4.75, "grad_norm_var": 0.14231770833333332, "learning_rate": 4e-05, "loss": 4.9742, "loss/crossentropy": 2.5111494660377502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2281503677368164, "step": 1494 }, { "epoch": 0.12466666666666666, "grad_norm": 5.0, "grad_norm_var": 0.12511393229166667, "learning_rate": 4e-05, "loss": 5.148, "loss/crossentropy": 1.6791961714625359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17439424619078636, "step": 1496 }, { "epoch": 0.12483333333333334, "grad_norm": 5.125, "grad_norm_var": 0.09153238932291667, "learning_rate": 4e-05, "loss": 5.0941, "loss/crossentropy": 2.4359480142593384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23082533478736877, "step": 1498 }, { "epoch": 0.125, "grad_norm": 5.46875, "grad_norm_var": 0.098681640625, "learning_rate": 4e-05, "loss": 5.4316, "loss/crossentropy": 2.4162683486938477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21909157186746597, "step": 1500 }, { "epoch": 0.12516666666666668, "grad_norm": 5.53125, "grad_norm_var": 0.09726155598958333, "learning_rate": 4e-05, "loss": 4.6494, "loss/crossentropy": 1.3783812075853348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16549547761678696, "step": 1502 }, { "epoch": 0.12533333333333332, "grad_norm": 5.3125, "grad_norm_var": 0.06874593098958333, "learning_rate": 4e-05, "loss": 5.1572, "loss/crossentropy": 2.214748978614807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22452621906995773, "step": 1504 }, { "epoch": 0.1255, "grad_norm": 5.0625, "grad_norm_var": 0.06366780598958334, "learning_rate": 4e-05, "loss": 4.6462, "loss/crossentropy": 1.724791169166565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19539067894220352, "step": 1506 }, { "epoch": 0.12566666666666668, "grad_norm": 5.125, "grad_norm_var": 0.06627604166666666, "learning_rate": 4e-05, "loss": 4.7258, "loss/crossentropy": 1.3971355706453323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1541696861386299, "step": 1508 }, { "epoch": 0.12583333333333332, "grad_norm": 4.9375, "grad_norm_var": 0.057275390625, "learning_rate": 4e-05, "loss": 4.8271, "loss/crossentropy": 1.845518447458744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17741846106946468, "step": 1510 }, { "epoch": 0.126, "grad_norm": 5.0, "grad_norm_var": 0.07776285807291666, "learning_rate": 4e-05, "loss": 4.0287, "loss/crossentropy": 2.070504516363144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981128826737404, "step": 1512 }, { "epoch": 0.12616666666666668, "grad_norm": 6.3125, "grad_norm_var": 0.17545166015625, "learning_rate": 4e-05, "loss": 4.5925, "loss/crossentropy": 2.5709685683250427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24376041069626808, "step": 1514 }, { "epoch": 0.12633333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.17541910807291666, "learning_rate": 4e-05, "loss": 5.0738, "loss/crossentropy": 2.435946464538574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22766336053609848, "step": 1516 }, { "epoch": 0.1265, "grad_norm": 5.75, "grad_norm_var": 0.18553059895833332, "learning_rate": 4e-05, "loss": 5.0032, "loss/crossentropy": 1.9362272024154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20861081406474113, "step": 1518 }, { "epoch": 0.12666666666666668, "grad_norm": 5.375, "grad_norm_var": 0.18470052083333333, "learning_rate": 4e-05, "loss": 5.0846, "loss/crossentropy": 2.1167075484991074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20843248441815376, "step": 1520 }, { "epoch": 0.12683333333333333, "grad_norm": 5.90625, "grad_norm_var": 0.3120930989583333, "learning_rate": 4e-05, "loss": 4.5747, "loss/crossentropy": 1.9277404323220253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18708796799182892, "step": 1522 }, { "epoch": 0.127, "grad_norm": 4.96875, "grad_norm_var": 0.309375, "learning_rate": 4e-05, "loss": 4.5207, "loss/crossentropy": 2.263314723968506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160516083240509, "step": 1524 }, { "epoch": 0.12716666666666668, "grad_norm": 5.09375, "grad_norm_var": 0.30276285807291664, "learning_rate": 4e-05, "loss": 4.5577, "loss/crossentropy": 2.3493450582027435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21893595904111862, "step": 1526 }, { "epoch": 0.12733333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.4759724934895833, "learning_rate": 4e-05, "loss": 4.8924, "loss/crossentropy": 1.4031277000904083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15571350418031216, "step": 1528 }, { "epoch": 0.1275, "grad_norm": 5.46875, "grad_norm_var": 0.4598592122395833, "learning_rate": 4e-05, "loss": 4.8127, "loss/crossentropy": 1.026276372373104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14767338708043098, "step": 1530 }, { "epoch": 0.12766666666666668, "grad_norm": 5.03125, "grad_norm_var": 0.4837849934895833, "learning_rate": 4e-05, "loss": 4.8847, "loss/crossentropy": 1.8962939083576202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18200470879673958, "step": 1532 }, { "epoch": 0.12783333333333333, "grad_norm": 5.125, "grad_norm_var": 0.4787068684895833, "learning_rate": 4e-05, "loss": 5.0413, "loss/crossentropy": 2.3904325664043427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21139219775795937, "step": 1534 }, { "epoch": 0.128, "grad_norm": 5.28125, "grad_norm_var": 0.5261067708333333, "learning_rate": 4e-05, "loss": 4.6779, "loss/crossentropy": 2.3674957752227783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22268419340252876, "step": 1536 }, { "epoch": 0.12816666666666668, "grad_norm": 4.6875, "grad_norm_var": 0.4163411458333333, "learning_rate": 4e-05, "loss": 4.7978, "loss/crossentropy": 1.4360255599021912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14481048472225666, "step": 1538 }, { "epoch": 0.12833333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.41028238932291666, "learning_rate": 4e-05, "loss": 5.2889, "loss/crossentropy": 1.9318012371659279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18672936409711838, "step": 1540 }, { "epoch": 0.1285, "grad_norm": 4.96875, "grad_norm_var": 0.4143880208333333, "learning_rate": 4e-05, "loss": 5.3868, "loss/crossentropy": 2.4637942910194397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21543822437524796, "step": 1542 }, { "epoch": 0.12866666666666668, "grad_norm": 5.78125, "grad_norm_var": 0.12024739583333334, "learning_rate": 4e-05, "loss": 5.0045, "loss/crossentropy": 1.7794182002544403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21857787482440472, "step": 1544 }, { "epoch": 0.12883333333333333, "grad_norm": 5.25, "grad_norm_var": 0.09724934895833333, "learning_rate": 4e-05, "loss": 5.0813, "loss/crossentropy": 1.905199073255062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17909251898527145, "step": 1546 }, { "epoch": 0.129, "grad_norm": 5.03125, "grad_norm_var": 0.097900390625, "learning_rate": 4e-05, "loss": 4.7055, "loss/crossentropy": 2.350240021944046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20850903540849686, "step": 1548 }, { "epoch": 0.12916666666666668, "grad_norm": 4.8125, "grad_norm_var": 0.09957275390625, "learning_rate": 4e-05, "loss": 4.666, "loss/crossentropy": 1.4298752844333649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15599924698472023, "step": 1550 }, { "epoch": 0.12933333333333333, "grad_norm": 5.90625, "grad_norm_var": 0.12213134765625, "learning_rate": 4e-05, "loss": 4.5669, "loss/crossentropy": 1.468435674905777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16555116325616837, "step": 1552 }, { "epoch": 0.1295, "grad_norm": 5.21875, "grad_norm_var": 0.10188802083333333, "learning_rate": 4e-05, "loss": 5.1023, "loss/crossentropy": 2.4195556640625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22506968677043915, "step": 1554 }, { "epoch": 0.12966666666666668, "grad_norm": 5.125, "grad_norm_var": 0.10370686848958334, "learning_rate": 4e-05, "loss": 5.1492, "loss/crossentropy": 1.5128118842840195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15466869808733463, "step": 1556 }, { "epoch": 0.12983333333333333, "grad_norm": 14.1875, "grad_norm_var": 5.136393229166667, "learning_rate": 4e-05, "loss": 5.2507, "loss/crossentropy": 1.1354478374123573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13475182093679905, "step": 1558 }, { "epoch": 0.13, "grad_norm": 5.03125, "grad_norm_var": 5.169254557291667, "learning_rate": 4e-05, "loss": 4.7625, "loss/crossentropy": 1.5475751757621765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16386966034770012, "step": 1560 }, { "epoch": 0.13016666666666668, "grad_norm": 4.9375, "grad_norm_var": 5.164351399739584, "learning_rate": 4e-05, "loss": 5.0927, "loss/crossentropy": 2.1593563556671143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24635591357946396, "step": 1562 }, { "epoch": 0.13033333333333333, "grad_norm": 5.03125, "grad_norm_var": 5.16011962890625, "learning_rate": 4e-05, "loss": 5.1048, "loss/crossentropy": 2.4482282400131226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22428294271230698, "step": 1564 }, { "epoch": 0.1305, "grad_norm": 4.96875, "grad_norm_var": 5.107157389322917, "learning_rate": 4e-05, "loss": 4.5934, "loss/crossentropy": 1.9403712749481201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931654028594494, "step": 1566 }, { "epoch": 0.13066666666666665, "grad_norm": 5.34375, "grad_norm_var": 5.100223795572917, "learning_rate": 4e-05, "loss": 5.2486, "loss/crossentropy": 1.9268745481967926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18993456289172173, "step": 1568 }, { "epoch": 0.13083333333333333, "grad_norm": 5.1875, "grad_norm_var": 5.137565104166667, "learning_rate": 4e-05, "loss": 4.8246, "loss/crossentropy": 2.275236487388611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21554533019661903, "step": 1570 }, { "epoch": 0.131, "grad_norm": 4.84375, "grad_norm_var": 5.121484375, "learning_rate": 4e-05, "loss": 4.8314, "loss/crossentropy": 2.044840008020401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845100037753582, "step": 1572 }, { "epoch": 0.13116666666666665, "grad_norm": 5.25, "grad_norm_var": 0.04846598307291667, "learning_rate": 4e-05, "loss": 4.5061, "loss/crossentropy": 1.3665557280182838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1586722508072853, "step": 1574 }, { "epoch": 0.13133333333333333, "grad_norm": 5.0, "grad_norm_var": 0.051285807291666666, "learning_rate": 4e-05, "loss": 4.9826, "loss/crossentropy": 1.770714707672596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17428990453481674, "step": 1576 }, { "epoch": 0.1315, "grad_norm": 5.40625, "grad_norm_var": 0.053385416666666664, "learning_rate": 4e-05, "loss": 5.4026, "loss/crossentropy": 1.7916882634162903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20171686075627804, "step": 1578 }, { "epoch": 0.13166666666666665, "grad_norm": 5.1875, "grad_norm_var": 0.059305826822916664, "learning_rate": 4e-05, "loss": 4.938, "loss/crossentropy": 1.495934583246708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16308372281491756, "step": 1580 }, { "epoch": 0.13183333333333333, "grad_norm": 5.25, "grad_norm_var": 0.054488118489583334, "learning_rate": 4e-05, "loss": 4.8879, "loss/crossentropy": 2.5814104676246643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21871698647737503, "step": 1582 }, { "epoch": 0.132, "grad_norm": 5.25, "grad_norm_var": 0.06337483723958333, "learning_rate": 4e-05, "loss": 4.9582, "loss/crossentropy": 2.209455542266369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19592761620879173, "step": 1584 }, { "epoch": 0.13216666666666665, "grad_norm": 5.25, "grad_norm_var": 0.057145182291666666, "learning_rate": 4e-05, "loss": 4.7327, "loss/crossentropy": 1.3624914586544037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1524915173649788, "step": 1586 }, { "epoch": 0.13233333333333333, "grad_norm": 5.125, "grad_norm_var": 0.057145182291666666, "learning_rate": 4e-05, "loss": 5.075, "loss/crossentropy": 2.308533728122711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21155225485563278, "step": 1588 }, { "epoch": 0.1325, "grad_norm": 5.03125, "grad_norm_var": 0.06763916015625, "learning_rate": 4e-05, "loss": 5.7472, "loss/crossentropy": 1.77063799649477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19738946482539177, "step": 1590 }, { "epoch": 0.13266666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.06131184895833333, "learning_rate": 4e-05, "loss": 5.0464, "loss/crossentropy": 1.8190487623214722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1730487048625946, "step": 1592 }, { "epoch": 0.13283333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.062093098958333336, "learning_rate": 4e-05, "loss": 4.6534, "loss/crossentropy": 1.6317052841186523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1610642485320568, "step": 1594 }, { "epoch": 0.133, "grad_norm": 5.15625, "grad_norm_var": 0.05699462890625, "learning_rate": 4e-05, "loss": 5.2636, "loss/crossentropy": 2.040685288608074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18768882751464844, "step": 1596 }, { "epoch": 0.13316666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.06627197265625, "learning_rate": 4e-05, "loss": 4.99, "loss/crossentropy": 2.288883000612259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22606860101222992, "step": 1598 }, { "epoch": 0.13333333333333333, "grad_norm": 5.0, "grad_norm_var": 0.07382405598958333, "learning_rate": 4e-05, "loss": 4.4231, "loss/crossentropy": 1.997236281633377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19324356690049171, "step": 1600 }, { "epoch": 0.1335, "grad_norm": 5.84375, "grad_norm_var": 0.10868733723958333, "learning_rate": 4e-05, "loss": 4.8902, "loss/crossentropy": 1.9118523299694061, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18948077410459518, "step": 1602 }, { "epoch": 0.13366666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.13033447265625, "learning_rate": 4e-05, "loss": 4.5752, "loss/crossentropy": 1.4698756337165833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17594150640070438, "step": 1604 }, { "epoch": 0.13383333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.11119384765625, "learning_rate": 4e-05, "loss": 5.0654, "loss/crossentropy": 1.5521681532263756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.167510736733675, "step": 1606 }, { "epoch": 0.134, "grad_norm": 5.40625, "grad_norm_var": 0.12392171223958333, "learning_rate": 4e-05, "loss": 5.4244, "loss/crossentropy": 2.3366805016994476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125181294977665, "step": 1608 }, { "epoch": 0.13416666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.12021077473958333, "learning_rate": 4e-05, "loss": 4.8163, "loss/crossentropy": 2.4447622895240784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22324832528829575, "step": 1610 }, { "epoch": 0.13433333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.11339518229166666, "learning_rate": 4e-05, "loss": 5.0853, "loss/crossentropy": 1.8828533068299294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18855641968548298, "step": 1612 }, { "epoch": 0.1345, "grad_norm": 8.5625, "grad_norm_var": 0.7942545572916667, "learning_rate": 4e-05, "loss": 5.023, "loss/crossentropy": 2.5013024508953094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21763851121068, "step": 1614 }, { "epoch": 0.13466666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.7565104166666666, "learning_rate": 4e-05, "loss": 4.9341, "loss/crossentropy": 2.0742052495479584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010239139199257, "step": 1616 }, { "epoch": 0.13483333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.7486328125, "learning_rate": 4e-05, "loss": 5.2218, "loss/crossentropy": 2.068577319383621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23265555873513222, "step": 1618 }, { "epoch": 0.135, "grad_norm": 5.34375, "grad_norm_var": 0.74644775390625, "learning_rate": 4e-05, "loss": 5.3211, "loss/crossentropy": 2.0565109848976135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19935489632189274, "step": 1620 }, { "epoch": 0.13516666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.7264322916666667, "learning_rate": 4e-05, "loss": 4.5425, "loss/crossentropy": 1.4980078116059303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15502066165208817, "step": 1622 }, { "epoch": 0.13533333333333333, "grad_norm": 5.25, "grad_norm_var": 0.7397745768229167, "learning_rate": 4e-05, "loss": 4.9548, "loss/crossentropy": 2.127755284309387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20423029735684395, "step": 1624 }, { "epoch": 0.1355, "grad_norm": 5.21875, "grad_norm_var": 0.741650390625, "learning_rate": 4e-05, "loss": 4.9034, "loss/crossentropy": 2.0613020807504654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2056688815355301, "step": 1626 }, { "epoch": 0.13566666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.73834228515625, "learning_rate": 4e-05, "loss": 4.7378, "loss/crossentropy": 2.2652209401130676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20464074611663818, "step": 1628 }, { "epoch": 0.13583333333333333, "grad_norm": 5.5, "grad_norm_var": 0.07753499348958333, "learning_rate": 4e-05, "loss": 4.7986, "loss/crossentropy": 1.7409793213009834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20771316066384315, "step": 1630 }, { "epoch": 0.136, "grad_norm": 5.125, "grad_norm_var": 0.06633707682291666, "learning_rate": 4e-05, "loss": 5.4698, "loss/crossentropy": 2.0719391107559204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20104419440031052, "step": 1632 }, { "epoch": 0.13616666666666666, "grad_norm": 4.875, "grad_norm_var": 0.05220947265625, "learning_rate": 4e-05, "loss": 4.8056, "loss/crossentropy": 2.4364999532699585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327173836529255, "step": 1634 }, { "epoch": 0.13633333333333333, "grad_norm": 5.71875, "grad_norm_var": 0.1138671875, "learning_rate": 4e-05, "loss": 5.0928, "loss/crossentropy": 1.8130161613225937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1989502925425768, "step": 1636 }, { "epoch": 0.1365, "grad_norm": 5.53125, "grad_norm_var": 0.12629801432291668, "learning_rate": 4e-05, "loss": 4.4983, "loss/crossentropy": 1.8206287994980812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003721445798874, "step": 1638 }, { "epoch": 0.13666666666666666, "grad_norm": 5.875, "grad_norm_var": 0.15676676432291667, "learning_rate": 4e-05, "loss": 4.6903, "loss/crossentropy": 1.9255887940526009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2551080249249935, "step": 1640 }, { "epoch": 0.13683333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.164306640625, "learning_rate": 4e-05, "loss": 4.6729, "loss/crossentropy": 1.737329825758934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21636349894106388, "step": 1642 }, { "epoch": 0.137, "grad_norm": 5.0625, "grad_norm_var": 0.16217041015625, "learning_rate": 4e-05, "loss": 5.31, "loss/crossentropy": 2.59636914730072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2293633446097374, "step": 1644 }, { "epoch": 0.13716666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.14149983723958334, "learning_rate": 4e-05, "loss": 5.5565, "loss/crossentropy": 2.477478504180908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.232582226395607, "step": 1646 }, { "epoch": 0.13733333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.13826497395833334, "learning_rate": 4e-05, "loss": 4.979, "loss/crossentropy": 1.4540888145565987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14884299412369728, "step": 1648 }, { "epoch": 0.1375, "grad_norm": 5.28125, "grad_norm_var": 0.12708333333333333, "learning_rate": 4e-05, "loss": 4.9889, "loss/crossentropy": 1.8568930253386497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040102779865265, "step": 1650 }, { "epoch": 0.13766666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.0666015625, "learning_rate": 4e-05, "loss": 4.7083, "loss/crossentropy": 1.5726129412651062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16069914400577545, "step": 1652 }, { "epoch": 0.13783333333333334, "grad_norm": 5.40625, "grad_norm_var": 0.059098307291666666, "learning_rate": 4e-05, "loss": 5.2683, "loss/crossentropy": 1.7672999277710915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18627581745386124, "step": 1654 }, { "epoch": 0.138, "grad_norm": 5.53125, "grad_norm_var": 0.03515218098958333, "learning_rate": 4e-05, "loss": 5.4057, "loss/crossentropy": 2.0668781399726868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19123420864343643, "step": 1656 }, { "epoch": 0.13816666666666666, "grad_norm": 5.78125, "grad_norm_var": 0.046610514322916664, "learning_rate": 4e-05, "loss": 4.9816, "loss/crossentropy": 1.6409248635172844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18773740530014038, "step": 1658 }, { "epoch": 0.13833333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.052632649739583336, "learning_rate": 4e-05, "loss": 5.1193, "loss/crossentropy": 1.2817718982696533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17280958406627178, "step": 1660 }, { "epoch": 0.1385, "grad_norm": 5.0625, "grad_norm_var": 0.05597330729166667, "learning_rate": 4e-05, "loss": 4.7329, "loss/crossentropy": 2.2702360451221466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21893694251775742, "step": 1662 }, { "epoch": 0.13866666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.0603515625, "learning_rate": 4e-05, "loss": 5.0972, "loss/crossentropy": 2.0147531405091286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18305204808712006, "step": 1664 }, { "epoch": 0.13883333333333334, "grad_norm": 5.25, "grad_norm_var": 0.06276041666666667, "learning_rate": 4e-05, "loss": 5.1082, "loss/crossentropy": 2.198255777359009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22349874302744865, "step": 1666 }, { "epoch": 0.139, "grad_norm": 5.0625, "grad_norm_var": 0.06510416666666667, "learning_rate": 4e-05, "loss": 4.676, "loss/crossentropy": 1.8043780699372292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21263182908296585, "step": 1668 }, { "epoch": 0.13916666666666666, "grad_norm": 5.25, "grad_norm_var": 0.06417643229166667, "learning_rate": 4e-05, "loss": 5.7267, "loss/crossentropy": 2.2814477682113647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21907350048422813, "step": 1670 }, { "epoch": 0.13933333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.06222330729166667, "learning_rate": 4e-05, "loss": 5.3567, "loss/crossentropy": 2.4507880806922913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21009447425603867, "step": 1672 }, { "epoch": 0.1395, "grad_norm": 4.625, "grad_norm_var": 0.05358072916666667, "learning_rate": 4e-05, "loss": 5.083, "loss/crossentropy": 2.072799079120159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938009299337864, "step": 1674 }, { "epoch": 0.13966666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.07610270182291666, "learning_rate": 4e-05, "loss": 4.23, "loss/crossentropy": 1.8492163196206093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20542608201503754, "step": 1676 }, { "epoch": 0.13983333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.06927083333333334, "learning_rate": 4e-05, "loss": 4.8043, "loss/crossentropy": 1.972286880016327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20922426879405975, "step": 1678 }, { "epoch": 0.14, "grad_norm": 5.4375, "grad_norm_var": 0.07862955729166667, "learning_rate": 4e-05, "loss": 4.5887, "loss/crossentropy": 1.6558887809515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17858467251062393, "step": 1680 }, { "epoch": 0.14016666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.087744140625, "learning_rate": 4e-05, "loss": 4.761, "loss/crossentropy": 1.950936883687973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1931697092950344, "step": 1682 }, { "epoch": 0.14033333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.102978515625, "learning_rate": 4e-05, "loss": 4.8183, "loss/crossentropy": 1.3800019018817693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13202445802744478, "step": 1684 }, { "epoch": 0.1405, "grad_norm": 5.0, "grad_norm_var": 0.083447265625, "learning_rate": 4e-05, "loss": 4.4808, "loss/crossentropy": 2.337316393852234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25428661331534386, "step": 1686 }, { "epoch": 0.14066666666666666, "grad_norm": 4.5, "grad_norm_var": 0.10367431640625, "learning_rate": 4e-05, "loss": 5.0679, "loss/crossentropy": 2.343565195798874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149459458887577, "step": 1688 }, { "epoch": 0.14083333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.09651285807291667, "learning_rate": 4e-05, "loss": 5.114, "loss/crossentropy": 2.6408793926239014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21381603553891182, "step": 1690 }, { "epoch": 0.141, "grad_norm": 5.25, "grad_norm_var": 0.07955322265625, "learning_rate": 4e-05, "loss": 5.163, "loss/crossentropy": 2.383645087480545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23237445205450058, "step": 1692 }, { "epoch": 0.14116666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.08216145833333334, "learning_rate": 4e-05, "loss": 4.9111, "loss/crossentropy": 1.9802673906087875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19009397737681866, "step": 1694 }, { "epoch": 0.14133333333333334, "grad_norm": 5.3125, "grad_norm_var": 0.07603759765625, "learning_rate": 4e-05, "loss": 5.0697, "loss/crossentropy": 2.302330046892166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208226066082716, "step": 1696 }, { "epoch": 0.1415, "grad_norm": 4.84375, "grad_norm_var": 0.06842447916666666, "learning_rate": 4e-05, "loss": 5.205, "loss/crossentropy": 1.977390617132187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20193704962730408, "step": 1698 }, { "epoch": 0.14166666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.0548828125, "learning_rate": 4e-05, "loss": 5.1493, "loss/crossentropy": 2.0484447479248047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19997265562415123, "step": 1700 }, { "epoch": 0.14183333333333334, "grad_norm": 5.25, "grad_norm_var": 0.07564697265625, "learning_rate": 4e-05, "loss": 4.9987, "loss/crossentropy": 1.799696996808052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18479579128324986, "step": 1702 }, { "epoch": 0.142, "grad_norm": 5.21875, "grad_norm_var": 0.049723307291666664, "learning_rate": 4e-05, "loss": 4.781, "loss/crossentropy": 1.1656968668103218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15266522020101547, "step": 1704 }, { "epoch": 0.14216666666666666, "grad_norm": 4.875, "grad_norm_var": 0.058854166666666666, "learning_rate": 4e-05, "loss": 4.1525, "loss/crossentropy": 1.4563089236617088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14857494831085205, "step": 1706 }, { "epoch": 0.14233333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.07823893229166666, "learning_rate": 4e-05, "loss": 4.1668, "loss/crossentropy": 1.807879388332367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19141371175646782, "step": 1708 }, { "epoch": 0.1425, "grad_norm": 5.8125, "grad_norm_var": 0.10091145833333333, "learning_rate": 4e-05, "loss": 4.9732, "loss/crossentropy": 2.9230883717536926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23849774524569511, "step": 1710 }, { "epoch": 0.14266666666666666, "grad_norm": 5.375, "grad_norm_var": 0.16031494140625, "learning_rate": 4e-05, "loss": 5.1238, "loss/crossentropy": 1.3809229135513306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20320942997932434, "step": 1712 }, { "epoch": 0.14283333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.15139567057291667, "learning_rate": 4e-05, "loss": 4.9135, "loss/crossentropy": 2.339945375919342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23271268606185913, "step": 1714 }, { "epoch": 0.143, "grad_norm": 6.09375, "grad_norm_var": 0.17459309895833333, "learning_rate": 4e-05, "loss": 5.1534, "loss/crossentropy": 1.5341841503977776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16835985332727432, "step": 1716 }, { "epoch": 0.14316666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.16959228515625, "learning_rate": 4e-05, "loss": 4.572, "loss/crossentropy": 2.6712507009506226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24255206063389778, "step": 1718 }, { "epoch": 0.14333333333333334, "grad_norm": 5.3125, "grad_norm_var": 0.16822509765625, "learning_rate": 4e-05, "loss": 5.2017, "loss/crossentropy": 2.1750669479370117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266225516796112, "step": 1720 }, { "epoch": 0.1435, "grad_norm": 4.875, "grad_norm_var": 0.15441080729166667, "learning_rate": 4e-05, "loss": 4.8313, "loss/crossentropy": 1.5534632056951523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15007868967950344, "step": 1722 }, { "epoch": 0.14366666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.14334309895833333, "learning_rate": 4e-05, "loss": 5.5259, "loss/crossentropy": 2.7301476895809174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24438628181815147, "step": 1724 }, { "epoch": 0.14383333333333334, "grad_norm": 5.71875, "grad_norm_var": 0.15627848307291667, "learning_rate": 4e-05, "loss": 4.7724, "loss/crossentropy": 1.191504381597042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1475481502711773, "step": 1726 }, { "epoch": 0.144, "grad_norm": 5.1875, "grad_norm_var": 0.14524739583333332, "learning_rate": 4e-05, "loss": 4.8071, "loss/crossentropy": 2.629876434803009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133907452225685, "step": 1728 }, { "epoch": 0.14416666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.17906494140625, "learning_rate": 4e-05, "loss": 4.3475, "loss/crossentropy": 2.0846259891986847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23153432458639145, "step": 1730 }, { "epoch": 0.14433333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.111328125, "learning_rate": 4e-05, "loss": 5.3218, "loss/crossentropy": 2.3203621357679367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2167310230433941, "step": 1732 }, { "epoch": 0.1445, "grad_norm": 4.9375, "grad_norm_var": 0.10002848307291666, "learning_rate": 4e-05, "loss": 5.3713, "loss/crossentropy": 2.592851758003235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2294953651726246, "step": 1734 }, { "epoch": 0.14466666666666667, "grad_norm": 5.375, "grad_norm_var": 0.107666015625, "learning_rate": 4e-05, "loss": 4.4483, "loss/crossentropy": 0.9307321533560753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13924349658191204, "step": 1736 }, { "epoch": 0.14483333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.10692952473958334, "learning_rate": 4e-05, "loss": 4.9398, "loss/crossentropy": 2.3195015490055084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2094910740852356, "step": 1738 }, { "epoch": 0.145, "grad_norm": 4.875, "grad_norm_var": 0.10390625, "learning_rate": 4e-05, "loss": 4.3281, "loss/crossentropy": 2.1040413677692413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23669259622693062, "step": 1740 }, { "epoch": 0.14516666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.07890625, "learning_rate": 4e-05, "loss": 4.7054, "loss/crossentropy": 1.9919825196266174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1985725536942482, "step": 1742 }, { "epoch": 0.14533333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.07511393229166667, "learning_rate": 4e-05, "loss": 5.1666, "loss/crossentropy": 2.397672086954117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19976507499814034, "step": 1744 }, { "epoch": 0.1455, "grad_norm": 4.84375, "grad_norm_var": 0.07239176432291666, "learning_rate": 4e-05, "loss": 4.7493, "loss/crossentropy": 1.2222031652927399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14994047954678535, "step": 1746 }, { "epoch": 0.14566666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.07967122395833333, "learning_rate": 4e-05, "loss": 4.8289, "loss/crossentropy": 2.458436369895935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22444933280348778, "step": 1748 }, { "epoch": 0.14583333333333334, "grad_norm": 5.0, "grad_norm_var": 0.07899983723958333, "learning_rate": 4e-05, "loss": 4.6862, "loss/crossentropy": 1.9638415053486824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19102539867162704, "step": 1750 }, { "epoch": 0.146, "grad_norm": 4.875, "grad_norm_var": 0.05963134765625, "learning_rate": 4e-05, "loss": 4.6494, "loss/crossentropy": 2.514769494533539, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22001716122031212, "step": 1752 }, { "epoch": 0.14616666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.202734375, "learning_rate": 4e-05, "loss": 4.8056, "loss/crossentropy": 1.6116134598851204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18631109967827797, "step": 1754 }, { "epoch": 0.14633333333333334, "grad_norm": 5.46875, "grad_norm_var": 0.18538004557291668, "learning_rate": 4e-05, "loss": 4.6867, "loss/crossentropy": 1.6297817006707191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18314477056264877, "step": 1756 }, { "epoch": 0.1465, "grad_norm": 5.0, "grad_norm_var": 0.15367431640625, "learning_rate": 4e-05, "loss": 5.3406, "loss/crossentropy": 2.525915801525116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21952301263809204, "step": 1758 }, { "epoch": 0.14666666666666667, "grad_norm": 5.25, "grad_norm_var": 0.15338134765625, "learning_rate": 4e-05, "loss": 5.3172, "loss/crossentropy": 2.1865801215171814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21111416071653366, "step": 1760 }, { "epoch": 0.14683333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.16013997395833332, "learning_rate": 4e-05, "loss": 4.4209, "loss/crossentropy": 1.7157298550009727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20164168626070023, "step": 1762 }, { "epoch": 0.147, "grad_norm": 5.0, "grad_norm_var": 0.1662109375, "learning_rate": 4e-05, "loss": 4.7885, "loss/crossentropy": 1.6044250950217247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19163594394922256, "step": 1764 }, { "epoch": 0.14716666666666667, "grad_norm": 5.25, "grad_norm_var": 0.17174479166666667, "learning_rate": 4e-05, "loss": 5.5379, "loss/crossentropy": 2.4521120488643646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20806816592812538, "step": 1766 }, { "epoch": 0.14733333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.16829020182291668, "learning_rate": 4e-05, "loss": 4.4508, "loss/crossentropy": 2.1604004204273224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1913926713168621, "step": 1768 }, { "epoch": 0.1475, "grad_norm": 5.5, "grad_norm_var": 0.05987955729166667, "learning_rate": 4e-05, "loss": 5.3149, "loss/crossentropy": 2.428584039211273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24361254274845123, "step": 1770 }, { "epoch": 0.14766666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.05725504557291667, "learning_rate": 4e-05, "loss": 5.5396, "loss/crossentropy": 2.1453306525945663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18525381945073605, "step": 1772 }, { "epoch": 0.14783333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.06269124348958334, "learning_rate": 4e-05, "loss": 4.9702, "loss/crossentropy": 2.037685215473175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19330525025725365, "step": 1774 }, { "epoch": 0.148, "grad_norm": 4.8125, "grad_norm_var": 0.07906494140625, "learning_rate": 4e-05, "loss": 4.8811, "loss/crossentropy": 1.9805008471012115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17861885949969292, "step": 1776 }, { "epoch": 0.14816666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.09000244140625, "learning_rate": 4e-05, "loss": 5.0579, "loss/crossentropy": 2.3986242413520813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22519202157855034, "step": 1778 }, { "epoch": 0.14833333333333334, "grad_norm": 4.875, "grad_norm_var": 0.08487955729166667, "learning_rate": 4e-05, "loss": 4.3706, "loss/crossentropy": 1.5709987133741379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20712516456842422, "step": 1780 }, { "epoch": 0.1485, "grad_norm": 5.0625, "grad_norm_var": 0.08948160807291666, "learning_rate": 4e-05, "loss": 5.2491, "loss/crossentropy": 2.725129246711731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22343413904309273, "step": 1782 }, { "epoch": 0.14866666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.09230143229166667, "learning_rate": 4e-05, "loss": 5.1373, "loss/crossentropy": 1.8716261237859726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893265787512064, "step": 1784 }, { "epoch": 0.14883333333333335, "grad_norm": 5.34375, "grad_norm_var": 0.08098958333333334, "learning_rate": 4e-05, "loss": 5.4504, "loss/crossentropy": 2.106525592505932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19252919033169746, "step": 1786 }, { "epoch": 0.149, "grad_norm": 5.21875, "grad_norm_var": 0.08274739583333333, "learning_rate": 4e-05, "loss": 5.155, "loss/crossentropy": 2.140032261610031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22491325810551643, "step": 1788 }, { "epoch": 0.14916666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.07890625, "learning_rate": 4e-05, "loss": 4.6521, "loss/crossentropy": 1.9066472426056862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19813631661236286, "step": 1790 }, { "epoch": 0.14933333333333335, "grad_norm": 5.375, "grad_norm_var": 0.07017822265625, "learning_rate": 4e-05, "loss": 4.9176, "loss/crossentropy": 1.9560877978801727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19545845314860344, "step": 1792 }, { "epoch": 0.1495, "grad_norm": 5.21875, "grad_norm_var": 0.046187337239583334, "learning_rate": 4e-05, "loss": 5.0363, "loss/crossentropy": 1.8670417666435242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26671649515628815, "step": 1794 }, { "epoch": 0.14966666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.040913899739583336, "learning_rate": 4e-05, "loss": 4.7842, "loss/crossentropy": 1.4337139576673508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16944348998367786, "step": 1796 }, { "epoch": 0.14983333333333335, "grad_norm": 4.9375, "grad_norm_var": 0.04244384765625, "learning_rate": 4e-05, "loss": 4.8675, "loss/crossentropy": 2.3128662705421448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2631648778915405, "step": 1798 }, { "epoch": 0.15, "grad_norm": 5.5, "grad_norm_var": 0.034098307291666664, "learning_rate": 4e-05, "loss": 5.4269, "loss/crossentropy": 2.659923791885376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22622456029057503, "step": 1800 }, { "epoch": 0.15016666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.0412109375, "learning_rate": 4e-05, "loss": 4.7557, "loss/crossentropy": 1.2567346766591072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16816434264183044, "step": 1802 }, { "epoch": 0.15033333333333335, "grad_norm": 5.21875, "grad_norm_var": 0.04107666015625, "learning_rate": 4e-05, "loss": 4.9006, "loss/crossentropy": 1.7615478411316872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22866688668727875, "step": 1804 }, { "epoch": 0.1505, "grad_norm": 5.0, "grad_norm_var": 0.06204427083333333, "learning_rate": 4e-05, "loss": 4.8503, "loss/crossentropy": 1.9991141185164452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18227214738726616, "step": 1806 }, { "epoch": 0.15066666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.05517171223958333, "learning_rate": 4e-05, "loss": 4.7189, "loss/crossentropy": 2.0811602771282196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848256252706051, "step": 1808 }, { "epoch": 0.15083333333333335, "grad_norm": 5.5, "grad_norm_var": 0.10266927083333334, "learning_rate": 4e-05, "loss": 4.8733, "loss/crossentropy": 2.1010901927948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20911147445440292, "step": 1810 }, { "epoch": 0.151, "grad_norm": 5.03125, "grad_norm_var": 0.09928385416666667, "learning_rate": 4e-05, "loss": 5.1623, "loss/crossentropy": 1.4801330715417862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16037312522530556, "step": 1812 }, { "epoch": 0.15116666666666667, "grad_norm": 5.84375, "grad_norm_var": 0.13240559895833334, "learning_rate": 4e-05, "loss": 4.9778, "loss/crossentropy": 1.7996556013822556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23594452254474163, "step": 1814 }, { "epoch": 0.15133333333333332, "grad_norm": 5.09375, "grad_norm_var": 0.13013916015625, "learning_rate": 4e-05, "loss": 5.1219, "loss/crossentropy": 1.7359198927879333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1817542277276516, "step": 1816 }, { "epoch": 0.1515, "grad_norm": 5.03125, "grad_norm_var": 0.13631184895833334, "learning_rate": 4e-05, "loss": 4.7057, "loss/crossentropy": 2.527916431427002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22807636111974716, "step": 1818 }, { "epoch": 0.15166666666666667, "grad_norm": 5.71875, "grad_norm_var": 0.15128580729166666, "learning_rate": 4e-05, "loss": 4.7407, "loss/crossentropy": 1.3911296725273132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24339628778398037, "step": 1820 }, { "epoch": 0.15183333333333332, "grad_norm": 4.96875, "grad_norm_var": 0.124462890625, "learning_rate": 4e-05, "loss": 5.0528, "loss/crossentropy": 1.8240971639752388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18753194250166416, "step": 1822 }, { "epoch": 0.152, "grad_norm": 4.96875, "grad_norm_var": 0.1193359375, "learning_rate": 4e-05, "loss": 4.4, "loss/crossentropy": 1.6177871525287628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020118124783039, "step": 1824 }, { "epoch": 0.15216666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.09401041666666667, "learning_rate": 4e-05, "loss": 4.8499, "loss/crossentropy": 2.3816640377044678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21570777148008347, "step": 1826 }, { "epoch": 0.15233333333333332, "grad_norm": 8.1875, "grad_norm_var": 0.6413899739583333, "learning_rate": 4e-05, "loss": 5.6275, "loss/crossentropy": 2.400721490383148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21228517964482307, "step": 1828 }, { "epoch": 0.1525, "grad_norm": 5.4375, "grad_norm_var": 0.6223917643229167, "learning_rate": 4e-05, "loss": 4.5715, "loss/crossentropy": 1.3201181143522263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16993128694593906, "step": 1830 }, { "epoch": 0.15266666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.6361328125, "learning_rate": 4e-05, "loss": 5.4743, "loss/crossentropy": 2.2901048958301544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23060648143291473, "step": 1832 }, { "epoch": 0.15283333333333332, "grad_norm": 4.90625, "grad_norm_var": 0.630322265625, "learning_rate": 4e-05, "loss": 4.8528, "loss/crossentropy": 1.1291920691728592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13143914192914963, "step": 1834 }, { "epoch": 0.153, "grad_norm": 4.6875, "grad_norm_var": 0.6576171875, "learning_rate": 4e-05, "loss": 5.0234, "loss/crossentropy": 1.9712878987193108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17850744351744652, "step": 1836 }, { "epoch": 0.15316666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.7058553059895833, "learning_rate": 4e-05, "loss": 4.779, "loss/crossentropy": 2.4524222016334534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22154907137155533, "step": 1838 }, { "epoch": 0.15333333333333332, "grad_norm": 4.90625, "grad_norm_var": 0.70953369140625, "learning_rate": 4e-05, "loss": 5.1157, "loss/crossentropy": 2.52553254365921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21638255938887596, "step": 1840 }, { "epoch": 0.1535, "grad_norm": 5.28125, "grad_norm_var": 0.6929036458333333, "learning_rate": 4e-05, "loss": 4.9487, "loss/crossentropy": 2.4182560443878174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22422944754362106, "step": 1842 }, { "epoch": 0.15366666666666667, "grad_norm": 4.875, "grad_norm_var": 0.13957926432291667, "learning_rate": 4e-05, "loss": 4.9538, "loss/crossentropy": 2.422487258911133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22509846463799477, "step": 1844 }, { "epoch": 0.15383333333333332, "grad_norm": 4.9375, "grad_norm_var": 0.13420817057291667, "learning_rate": 4e-05, "loss": 5.3952, "loss/crossentropy": 2.584647834300995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22367503494024277, "step": 1846 }, { "epoch": 0.154, "grad_norm": 5.21875, "grad_norm_var": 0.10582275390625, "learning_rate": 4e-05, "loss": 5.0143, "loss/crossentropy": 2.4110784828662872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831950172781944, "step": 1848 }, { "epoch": 0.15416666666666667, "grad_norm": 5.5, "grad_norm_var": 0.11378580729166667, "learning_rate": 4e-05, "loss": 5.1977, "loss/crossentropy": 2.0590811669826508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20148218050599098, "step": 1850 }, { "epoch": 0.15433333333333332, "grad_norm": 5.21875, "grad_norm_var": 0.10273030598958334, "learning_rate": 4e-05, "loss": 4.6938, "loss/crossentropy": 1.8851531371474266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17905624769628048, "step": 1852 }, { "epoch": 0.1545, "grad_norm": 4.9375, "grad_norm_var": 0.048177083333333336, "learning_rate": 4e-05, "loss": 4.8345, "loss/crossentropy": 1.723634012043476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19837238639593124, "step": 1854 }, { "epoch": 0.15466666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.03463541666666667, "learning_rate": 4e-05, "loss": 5.229, "loss/crossentropy": 1.8247249498963356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20431892201304436, "step": 1856 }, { "epoch": 0.15483333333333332, "grad_norm": 5.0, "grad_norm_var": 0.042643229166666664, "learning_rate": 4e-05, "loss": 4.8978, "loss/crossentropy": 2.5018930435180664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22337466478347778, "step": 1858 }, { "epoch": 0.155, "grad_norm": 5.09375, "grad_norm_var": 0.04010009765625, "learning_rate": 4e-05, "loss": 4.932, "loss/crossentropy": 1.747809186577797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17998847924172878, "step": 1860 }, { "epoch": 0.15516666666666667, "grad_norm": 5.25, "grad_norm_var": 0.05349934895833333, "learning_rate": 4e-05, "loss": 4.4732, "loss/crossentropy": 1.3985635191202164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15633606724441051, "step": 1862 }, { "epoch": 0.15533333333333332, "grad_norm": 4.5625, "grad_norm_var": 0.08147379557291666, "learning_rate": 4e-05, "loss": 4.3033, "loss/crossentropy": 1.821037471294403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18188106641173363, "step": 1864 }, { "epoch": 0.1555, "grad_norm": 5.3125, "grad_norm_var": 0.075244140625, "learning_rate": 4e-05, "loss": 5.4331, "loss/crossentropy": 2.3469000458717346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21943211928009987, "step": 1866 }, { "epoch": 0.15566666666666668, "grad_norm": 7.59375, "grad_norm_var": 0.48971354166666664, "learning_rate": 4e-05, "loss": 5.1285, "loss/crossentropy": 2.018453985452652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18594501912593842, "step": 1868 }, { "epoch": 0.15583333333333332, "grad_norm": 5.1875, "grad_norm_var": 0.49491780598958335, "learning_rate": 4e-05, "loss": 4.6245, "loss/crossentropy": 1.441122718155384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1634649857878685, "step": 1870 }, { "epoch": 0.156, "grad_norm": 4.84375, "grad_norm_var": 0.5040201822916667, "learning_rate": 4e-05, "loss": 4.7244, "loss/crossentropy": 2.1576380729675293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19306758418679237, "step": 1872 }, { "epoch": 0.15616666666666668, "grad_norm": 5.125, "grad_norm_var": 0.47877197265625, "learning_rate": 4e-05, "loss": 5.286, "loss/crossentropy": 2.008717902004719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18943626806139946, "step": 1874 }, { "epoch": 0.15633333333333332, "grad_norm": 5.1875, "grad_norm_var": 0.48489176432291664, "learning_rate": 4e-05, "loss": 4.9675, "loss/crossentropy": 2.4755281805992126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2329132817685604, "step": 1876 }, { "epoch": 0.1565, "grad_norm": 4.78125, "grad_norm_var": 0.4714152018229167, "learning_rate": 4e-05, "loss": 5.249, "loss/crossentropy": 2.4515629410743713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22444384172558784, "step": 1878 }, { "epoch": 0.15666666666666668, "grad_norm": 6.53125, "grad_norm_var": 0.5702473958333333, "learning_rate": 4e-05, "loss": 4.9503, "loss/crossentropy": 1.8355879187583923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18035876750946045, "step": 1880 }, { "epoch": 0.15683333333333332, "grad_norm": 5.5, "grad_norm_var": 0.5729166666666666, "learning_rate": 4e-05, "loss": 5.7065, "loss/crossentropy": 1.975680448114872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19011396169662476, "step": 1882 }, { "epoch": 0.157, "grad_norm": 4.78125, "grad_norm_var": 0.25494791666666666, "learning_rate": 4e-05, "loss": 5.0065, "loss/crossentropy": 2.348602294921875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2531758286058903, "step": 1884 }, { "epoch": 0.15716666666666668, "grad_norm": 5.15625, "grad_norm_var": 0.249853515625, "learning_rate": 4e-05, "loss": 4.9532, "loss/crossentropy": 1.5291048362851143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16095010749995708, "step": 1886 }, { "epoch": 0.15733333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.25266927083333335, "learning_rate": 4e-05, "loss": 4.4705, "loss/crossentropy": 1.795276552438736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22460604459047318, "step": 1888 }, { "epoch": 0.1575, "grad_norm": 4.4375, "grad_norm_var": 0.29225260416666665, "learning_rate": 4e-05, "loss": 4.5639, "loss/crossentropy": 1.6168242916464806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1597994789481163, "step": 1890 }, { "epoch": 0.15766666666666668, "grad_norm": 5.625, "grad_norm_var": 0.27740478515625, "learning_rate": 4e-05, "loss": 5.4135, "loss/crossentropy": 2.2597386240959167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24958740919828415, "step": 1892 }, { "epoch": 0.15783333333333333, "grad_norm": 4.875, "grad_norm_var": 0.28808186848958334, "learning_rate": 4e-05, "loss": 4.6908, "loss/crossentropy": 1.3280053436756134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1516177151352167, "step": 1894 }, { "epoch": 0.158, "grad_norm": 4.96875, "grad_norm_var": 0.13253580729166667, "learning_rate": 4e-05, "loss": 4.8624, "loss/crossentropy": 1.9109614789485931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17931961454451084, "step": 1896 }, { "epoch": 0.15816666666666668, "grad_norm": 4.875, "grad_norm_var": 0.10818684895833333, "learning_rate": 4e-05, "loss": 4.9535, "loss/crossentropy": 1.4697567075490952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16392095386981964, "step": 1898 }, { "epoch": 0.15833333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.10221354166666667, "learning_rate": 4e-05, "loss": 5.0563, "loss/crossentropy": 1.7749396488070488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17989159747958183, "step": 1900 }, { "epoch": 0.1585, "grad_norm": 5.21875, "grad_norm_var": 0.13990885416666668, "learning_rate": 4e-05, "loss": 5.369, "loss/crossentropy": 2.3539693355560303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21434181556105614, "step": 1902 }, { "epoch": 0.15866666666666668, "grad_norm": 5.78125, "grad_norm_var": 0.15950520833333334, "learning_rate": 4e-05, "loss": 5.3091, "loss/crossentropy": 2.474464476108551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21791671961545944, "step": 1904 }, { "epoch": 0.15883333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.18865559895833334, "learning_rate": 4e-05, "loss": 4.7584, "loss/crossentropy": 1.7396632134914398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19973601773381233, "step": 1906 }, { "epoch": 0.159, "grad_norm": 5.125, "grad_norm_var": 0.18391520182291668, "learning_rate": 4e-05, "loss": 5.0961, "loss/crossentropy": 2.441837340593338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207939263433218, "step": 1908 }, { "epoch": 0.15916666666666668, "grad_norm": 5.46875, "grad_norm_var": 0.15128580729166666, "learning_rate": 4e-05, "loss": 5.4184, "loss/crossentropy": 2.0546700954437256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21630385518074036, "step": 1910 }, { "epoch": 0.15933333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.1484375, "learning_rate": 4e-05, "loss": 5.432, "loss/crossentropy": 2.465700089931488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23406285047531128, "step": 1912 }, { "epoch": 0.1595, "grad_norm": 5.15625, "grad_norm_var": 0.17476806640625, "learning_rate": 4e-05, "loss": 4.7457, "loss/crossentropy": 1.1225157380104065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15988787077367306, "step": 1914 }, { "epoch": 0.15966666666666668, "grad_norm": 5.125, "grad_norm_var": 0.20487874348958332, "learning_rate": 4e-05, "loss": 4.8988, "loss/crossentropy": 1.849706619977951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1671137586236, "step": 1916 }, { "epoch": 0.15983333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.18251546223958334, "learning_rate": 4e-05, "loss": 5.6658, "loss/crossentropy": 2.5173650979995728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2206265851855278, "step": 1918 }, { "epoch": 0.16, "grad_norm": 5.0, "grad_norm_var": 0.15904541015625, "learning_rate": 4e-05, "loss": 5.4365, "loss/crossentropy": 2.3284026384353638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2226024679839611, "step": 1920 }, { "epoch": 0.16016666666666668, "grad_norm": 5.28125, "grad_norm_var": 0.07131754557291667, "learning_rate": 4e-05, "loss": 5.0088, "loss/crossentropy": 0.9736118018627167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12116567231714725, "step": 1922 }, { "epoch": 0.16033333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.09021809895833334, "learning_rate": 4e-05, "loss": 4.7493, "loss/crossentropy": 2.010709524154663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19896573573350906, "step": 1924 }, { "epoch": 0.1605, "grad_norm": 5.15625, "grad_norm_var": 0.10076497395833334, "learning_rate": 4e-05, "loss": 5.3031, "loss/crossentropy": 1.8335441946983337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24130244553089142, "step": 1926 }, { "epoch": 0.16066666666666668, "grad_norm": 4.78125, "grad_norm_var": 0.13570556640625, "learning_rate": 4e-05, "loss": 5.1219, "loss/crossentropy": 1.9984001368284225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19060589745640755, "step": 1928 }, { "epoch": 0.16083333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.11756184895833334, "learning_rate": 4e-05, "loss": 5.0371, "loss/crossentropy": 1.7571651637554169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17572467029094696, "step": 1930 }, { "epoch": 0.161, "grad_norm": 5.5, "grad_norm_var": 0.10657145182291666, "learning_rate": 4e-05, "loss": 4.8262, "loss/crossentropy": 2.319933772087097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20385072752833366, "step": 1932 }, { "epoch": 0.16116666666666668, "grad_norm": 5.28125, "grad_norm_var": 0.10193684895833334, "learning_rate": 4e-05, "loss": 4.7018, "loss/crossentropy": 2.2420734465122223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2276177629828453, "step": 1934 }, { "epoch": 0.16133333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.114697265625, "learning_rate": 4e-05, "loss": 4.8164, "loss/crossentropy": 2.019700661301613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1761186197400093, "step": 1936 }, { "epoch": 0.1615, "grad_norm": 5.0625, "grad_norm_var": 0.11516927083333334, "learning_rate": 4e-05, "loss": 5.228, "loss/crossentropy": 2.334056079387665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20008408278226852, "step": 1938 }, { "epoch": 0.16166666666666665, "grad_norm": 5.28125, "grad_norm_var": 0.08961181640625, "learning_rate": 4e-05, "loss": 4.815, "loss/crossentropy": 1.8608058020472527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20045208930969238, "step": 1940 }, { "epoch": 0.16183333333333333, "grad_norm": 5.75, "grad_norm_var": 0.10299072265625, "learning_rate": 4e-05, "loss": 4.8539, "loss/crossentropy": 1.859613299369812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18477841094136238, "step": 1942 }, { "epoch": 0.162, "grad_norm": 4.84375, "grad_norm_var": 0.0744140625, "learning_rate": 4e-05, "loss": 4.4145, "loss/crossentropy": 2.1982096135616302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2072415016591549, "step": 1944 }, { "epoch": 0.16216666666666665, "grad_norm": 4.71875, "grad_norm_var": 0.0837890625, "learning_rate": 4e-05, "loss": 5.1867, "loss/crossentropy": 2.53378689289093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22257087007164955, "step": 1946 }, { "epoch": 0.16233333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.071875, "learning_rate": 4e-05, "loss": 4.7655, "loss/crossentropy": 1.2198933511972427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14932805858552456, "step": 1948 }, { "epoch": 0.1625, "grad_norm": 5.15625, "grad_norm_var": 0.073681640625, "learning_rate": 4e-05, "loss": 5.2139, "loss/crossentropy": 2.3506920337677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21482441201806068, "step": 1950 }, { "epoch": 0.16266666666666665, "grad_norm": 5.28125, "grad_norm_var": 0.06640625, "learning_rate": 4e-05, "loss": 4.2194, "loss/crossentropy": 0.9886042326688766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13658013939857483, "step": 1952 }, { "epoch": 0.16283333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.06549072265625, "learning_rate": 4e-05, "loss": 4.6741, "loss/crossentropy": 2.5076074600219727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21432289853692055, "step": 1954 }, { "epoch": 0.163, "grad_norm": 5.5625, "grad_norm_var": 0.07711181640625, "learning_rate": 4e-05, "loss": 5.4932, "loss/crossentropy": 2.4833337664604187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21723904460668564, "step": 1956 }, { "epoch": 0.16316666666666665, "grad_norm": 4.8125, "grad_norm_var": 0.06767171223958333, "learning_rate": 4e-05, "loss": 4.6413, "loss/crossentropy": 1.8598149567842484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18153854832053185, "step": 1958 }, { "epoch": 0.16333333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.07623291015625, "learning_rate": 4e-05, "loss": 4.8104, "loss/crossentropy": 2.2267325818538666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370566502213478, "step": 1960 }, { "epoch": 0.1635, "grad_norm": 4.90625, "grad_norm_var": 0.08196614583333334, "learning_rate": 4e-05, "loss": 5.6272, "loss/crossentropy": 2.7236159443855286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216436043381691, "step": 1962 }, { "epoch": 0.16366666666666665, "grad_norm": 4.84375, "grad_norm_var": 0.08528645833333333, "learning_rate": 4e-05, "loss": 4.5921, "loss/crossentropy": 1.4360825419425964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15567945316433907, "step": 1964 }, { "epoch": 0.16383333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.08876546223958333, "learning_rate": 4e-05, "loss": 4.8939, "loss/crossentropy": 1.9993894025683403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18757018819451332, "step": 1966 }, { "epoch": 0.164, "grad_norm": 5.125, "grad_norm_var": 0.09542643229166667, "learning_rate": 4e-05, "loss": 5.2536, "loss/crossentropy": 2.448215901851654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2187819555401802, "step": 1968 }, { "epoch": 0.16416666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.09426676432291667, "learning_rate": 4e-05, "loss": 5.3415, "loss/crossentropy": 2.382662773132324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23334766179323196, "step": 1970 }, { "epoch": 0.16433333333333333, "grad_norm": 4.875, "grad_norm_var": 0.08240559895833334, "learning_rate": 4e-05, "loss": 5.1642, "loss/crossentropy": 1.7083993628621101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22550025209784508, "step": 1972 }, { "epoch": 0.1645, "grad_norm": 5.3125, "grad_norm_var": 0.070556640625, "learning_rate": 4e-05, "loss": 4.8266, "loss/crossentropy": 2.033088594675064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23395147919654846, "step": 1974 }, { "epoch": 0.16466666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.06925455729166667, "learning_rate": 4e-05, "loss": 5.4875, "loss/crossentropy": 2.3617620170116425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22937371581792831, "step": 1976 }, { "epoch": 0.16483333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.05813395182291667, "learning_rate": 4e-05, "loss": 5.0183, "loss/crossentropy": 2.381405919790268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20841734111309052, "step": 1978 }, { "epoch": 0.165, "grad_norm": 4.9375, "grad_norm_var": 0.056494140625, "learning_rate": 4e-05, "loss": 4.9318, "loss/crossentropy": 1.984310194849968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19360090605914593, "step": 1980 }, { "epoch": 0.16516666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.09052327473958334, "learning_rate": 4e-05, "loss": 4.7867, "loss/crossentropy": 1.1787148118019104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13031774573028088, "step": 1982 }, { "epoch": 0.16533333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.08163655598958333, "learning_rate": 4e-05, "loss": 5.1374, "loss/crossentropy": 2.660287320613861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21815495565533638, "step": 1984 }, { "epoch": 0.1655, "grad_norm": 5.40625, "grad_norm_var": 0.092822265625, "learning_rate": 4e-05, "loss": 4.5718, "loss/crossentropy": 1.7051584795117378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1718977987766266, "step": 1986 }, { "epoch": 0.16566666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.086572265625, "learning_rate": 4e-05, "loss": 5.3293, "loss/crossentropy": 1.5497848987579346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15701960772275925, "step": 1988 }, { "epoch": 0.16583333333333333, "grad_norm": 5.625, "grad_norm_var": 0.09440104166666667, "learning_rate": 4e-05, "loss": 5.0158, "loss/crossentropy": 2.073232203722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22496861219406128, "step": 1990 }, { "epoch": 0.166, "grad_norm": 5.0, "grad_norm_var": 0.11298421223958334, "learning_rate": 4e-05, "loss": 4.6335, "loss/crossentropy": 1.3899268805980682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14914513006806374, "step": 1992 }, { "epoch": 0.16616666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.10818684895833333, "learning_rate": 4e-05, "loss": 5.4054, "loss/crossentropy": 2.4187216758728027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222491316497326, "step": 1994 }, { "epoch": 0.16633333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.11230061848958334, "learning_rate": 4e-05, "loss": 4.887, "loss/crossentropy": 1.6145347505807877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16213739477097988, "step": 1996 }, { "epoch": 0.1665, "grad_norm": 4.90625, "grad_norm_var": 0.07278238932291667, "learning_rate": 4e-05, "loss": 4.5128, "loss/crossentropy": 2.2953919768333435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2423577792942524, "step": 1998 }, { "epoch": 0.16666666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.08388264973958333, "learning_rate": 4e-05, "loss": 5.0227, "loss/crossentropy": 2.7778520584106445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22993957251310349, "step": 2000 }, { "epoch": 0.16683333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.09149983723958334, "learning_rate": 4e-05, "loss": 4.5579, "loss/crossentropy": 1.120415337383747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14762288890779018, "step": 2002 }, { "epoch": 0.167, "grad_norm": 5.125, "grad_norm_var": 0.08527018229166666, "learning_rate": 4e-05, "loss": 4.5543, "loss/crossentropy": 1.6726857349276543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18800584971904755, "step": 2004 }, { "epoch": 0.16716666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.06734619140625, "learning_rate": 4e-05, "loss": 4.6986, "loss/crossentropy": 1.9222635477781296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19062339887022972, "step": 2006 }, { "epoch": 0.16733333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.05347900390625, "learning_rate": 4e-05, "loss": 4.8448, "loss/crossentropy": 2.118234932422638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22146522253751755, "step": 2008 }, { "epoch": 0.1675, "grad_norm": 5.0, "grad_norm_var": 0.05175374348958333, "learning_rate": 4e-05, "loss": 4.8215, "loss/crossentropy": 2.252037912607193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157456949353218, "step": 2010 }, { "epoch": 0.16766666666666666, "grad_norm": 4.875, "grad_norm_var": 0.06926676432291666, "learning_rate": 4e-05, "loss": 4.3806, "loss/crossentropy": 0.9033909440040588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1250423714518547, "step": 2012 }, { "epoch": 0.16783333333333333, "grad_norm": 5.0, "grad_norm_var": 0.07434488932291666, "learning_rate": 4e-05, "loss": 4.7516, "loss/crossentropy": 1.9430923759937286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18675028160214424, "step": 2014 }, { "epoch": 0.168, "grad_norm": 5.15625, "grad_norm_var": 0.06448160807291667, "learning_rate": 4e-05, "loss": 4.7063, "loss/crossentropy": 1.634926363825798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1670081913471222, "step": 2016 }, { "epoch": 0.16816666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.04830729166666667, "learning_rate": 4e-05, "loss": 4.857, "loss/crossentropy": 1.694766104221344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19327403232455254, "step": 2018 }, { "epoch": 0.16833333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.05015869140625, "learning_rate": 4e-05, "loss": 5.5487, "loss/crossentropy": 2.1367068588733673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21962807700037956, "step": 2020 }, { "epoch": 0.1685, "grad_norm": 4.96875, "grad_norm_var": 0.07320556640625, "learning_rate": 4e-05, "loss": 4.0799, "loss/crossentropy": 1.671954207122326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1811060570180416, "step": 2022 }, { "epoch": 0.16866666666666666, "grad_norm": 5.28125, "grad_norm_var": 0.09479166666666666, "learning_rate": 4e-05, "loss": 5.2455, "loss/crossentropy": 1.6374276280403137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16752460598945618, "step": 2024 }, { "epoch": 0.16883333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.09312744140625, "learning_rate": 4e-05, "loss": 5.3886, "loss/crossentropy": 2.3457963168621063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288379706442356, "step": 2026 }, { "epoch": 0.169, "grad_norm": 4.75, "grad_norm_var": 0.06985270182291667, "learning_rate": 4e-05, "loss": 4.6938, "loss/crossentropy": 2.563704550266266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21900682151317596, "step": 2028 }, { "epoch": 0.16916666666666666, "grad_norm": 5.71875, "grad_norm_var": 0.09034830729166667, "learning_rate": 4e-05, "loss": 4.6583, "loss/crossentropy": 2.2988042533397675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2221379354596138, "step": 2030 }, { "epoch": 0.16933333333333334, "grad_norm": 5.5, "grad_norm_var": 0.10146077473958333, "learning_rate": 4e-05, "loss": 4.9018, "loss/crossentropy": 2.246855854988098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2294277399778366, "step": 2032 }, { "epoch": 0.1695, "grad_norm": 4.71875, "grad_norm_var": 0.1115234375, "learning_rate": 4e-05, "loss": 4.2763, "loss/crossentropy": 1.483270302414894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20540903508663177, "step": 2034 }, { "epoch": 0.16966666666666666, "grad_norm": 6.25, "grad_norm_var": 0.20623372395833334, "learning_rate": 4e-05, "loss": 4.6115, "loss/crossentropy": 0.7337675020098686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12142804265022278, "step": 2036 }, { "epoch": 0.16983333333333334, "grad_norm": 5.125, "grad_norm_var": 0.1724609375, "learning_rate": 4e-05, "loss": 5.2868, "loss/crossentropy": 2.4326335787773132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2261388711631298, "step": 2038 }, { "epoch": 0.17, "grad_norm": 5.25, "grad_norm_var": 0.16256510416666667, "learning_rate": 4e-05, "loss": 4.6802, "loss/crossentropy": 1.8480440527200699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009208109229803, "step": 2040 }, { "epoch": 0.17016666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.17239176432291667, "learning_rate": 4e-05, "loss": 4.8374, "loss/crossentropy": 1.7596202716231346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18629582971334457, "step": 2042 }, { "epoch": 0.17033333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.15969645182291667, "learning_rate": 4e-05, "loss": 5.0063, "loss/crossentropy": 1.6326167657971382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21383372321724892, "step": 2044 }, { "epoch": 0.1705, "grad_norm": 5.25, "grad_norm_var": 0.14698893229166668, "learning_rate": 4e-05, "loss": 4.9583, "loss/crossentropy": 1.3872774839401245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19810861721634865, "step": 2046 }, { "epoch": 0.17066666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.14521077473958333, "learning_rate": 4e-05, "loss": 4.2737, "loss/crossentropy": 1.2238230854272842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741249691694975, "step": 2048 }, { "epoch": 0.17083333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.14703369140625, "learning_rate": 4e-05, "loss": 3.9872, "loss/crossentropy": 1.1708182319998741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17563006840646267, "step": 2050 }, { "epoch": 0.171, "grad_norm": 4.8125, "grad_norm_var": 0.04694010416666667, "learning_rate": 4e-05, "loss": 4.826, "loss/crossentropy": 1.4472626447677612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18563862517476082, "step": 2052 }, { "epoch": 0.17116666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.04892171223958333, "learning_rate": 4e-05, "loss": 4.5356, "loss/crossentropy": 1.3219031170010567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.136514600366354, "step": 2054 }, { "epoch": 0.17133333333333334, "grad_norm": 5.0, "grad_norm_var": 0.0365234375, "learning_rate": 4e-05, "loss": 4.9406, "loss/crossentropy": 2.4237805008888245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22321195155382156, "step": 2056 }, { "epoch": 0.1715, "grad_norm": 5.375, "grad_norm_var": 0.05188802083333333, "learning_rate": 4e-05, "loss": 5.5955, "loss/crossentropy": 1.8901968151330948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951073817908764, "step": 2058 }, { "epoch": 0.17166666666666666, "grad_norm": 5.125, "grad_norm_var": 0.04957275390625, "learning_rate": 4e-05, "loss": 5.2047, "loss/crossentropy": 2.6344847083091736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2257305197417736, "step": 2060 }, { "epoch": 0.17183333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.048140462239583334, "learning_rate": 4e-05, "loss": 5.1074, "loss/crossentropy": 2.4700939655303955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20783070847392082, "step": 2062 }, { "epoch": 0.172, "grad_norm": 5.1875, "grad_norm_var": 0.044384765625, "learning_rate": 4e-05, "loss": 4.9033, "loss/crossentropy": 2.0393999814987183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21963367983698845, "step": 2064 }, { "epoch": 0.17216666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.046468098958333336, "learning_rate": 4e-05, "loss": 5.1977, "loss/crossentropy": 2.3000362515449524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22355759143829346, "step": 2066 }, { "epoch": 0.17233333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.04269205729166667, "learning_rate": 4e-05, "loss": 5.0526, "loss/crossentropy": 1.692564770579338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17172588407993317, "step": 2068 }, { "epoch": 0.1725, "grad_norm": 4.65625, "grad_norm_var": 0.05230712890625, "learning_rate": 4e-05, "loss": 4.6657, "loss/crossentropy": 2.475119471549988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216409109532833, "step": 2070 }, { "epoch": 0.17266666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.05286051432291667, "learning_rate": 4e-05, "loss": 4.8245, "loss/crossentropy": 2.1208333671092987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088773548603058, "step": 2072 }, { "epoch": 0.17283333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.05090738932291667, "learning_rate": 4e-05, "loss": 4.0344, "loss/crossentropy": 1.5833616331219673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1833811979740858, "step": 2074 }, { "epoch": 0.173, "grad_norm": 5.6875, "grad_norm_var": 0.09172770182291666, "learning_rate": 4e-05, "loss": 5.3868, "loss/crossentropy": 2.053065747022629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20190668106079102, "step": 2076 }, { "epoch": 0.17316666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.10670572916666667, "learning_rate": 4e-05, "loss": 4.4207, "loss/crossentropy": 1.4702235013246536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15409984439611435, "step": 2078 }, { "epoch": 0.17333333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.10556233723958333, "learning_rate": 4e-05, "loss": 4.8621, "loss/crossentropy": 1.6585796177387238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16805803030729294, "step": 2080 }, { "epoch": 0.1735, "grad_norm": 5.84375, "grad_norm_var": 0.1333984375, "learning_rate": 4e-05, "loss": 5.2264, "loss/crossentropy": 1.9839501976966858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20378449745476246, "step": 2082 }, { "epoch": 0.17366666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.13909098307291667, "learning_rate": 4e-05, "loss": 4.7717, "loss/crossentropy": 1.573902688920498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1820588782429695, "step": 2084 }, { "epoch": 0.17383333333333334, "grad_norm": 5.625, "grad_norm_var": 0.13918863932291667, "learning_rate": 4e-05, "loss": 5.1317, "loss/crossentropy": 2.24695548415184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23131686076521873, "step": 2086 }, { "epoch": 0.174, "grad_norm": 4.96875, "grad_norm_var": 0.14078369140625, "learning_rate": 4e-05, "loss": 5.0558, "loss/crossentropy": 2.0588990449905396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19663485884666443, "step": 2088 }, { "epoch": 0.17416666666666666, "grad_norm": 5.53125, "grad_norm_var": 2.6382120768229167, "learning_rate": 4e-05, "loss": 5.2796, "loss/crossentropy": 3.171107590198517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2164313718676567, "step": 2090 }, { "epoch": 0.17433333333333334, "grad_norm": 4.78125, "grad_norm_var": 2.699609375, "learning_rate": 4e-05, "loss": 4.3192, "loss/crossentropy": 1.8898755833506584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18126432970166206, "step": 2092 }, { "epoch": 0.1745, "grad_norm": 5.1875, "grad_norm_var": 2.6998697916666665, "learning_rate": 4e-05, "loss": 5.3395, "loss/crossentropy": 2.3107918202877045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22003484517335892, "step": 2094 }, { "epoch": 0.17466666666666666, "grad_norm": 4.71875, "grad_norm_var": 2.71519775390625, "learning_rate": 4e-05, "loss": 5.0895, "loss/crossentropy": 2.438191533088684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299581542611122, "step": 2096 }, { "epoch": 0.17483333333333334, "grad_norm": 4.625, "grad_norm_var": 2.741471354166667, "learning_rate": 4e-05, "loss": 5.0573, "loss/crossentropy": 1.8615416586399078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17831822112202644, "step": 2098 }, { "epoch": 0.175, "grad_norm": 5.09375, "grad_norm_var": 2.72359619140625, "learning_rate": 4e-05, "loss": 4.7074, "loss/crossentropy": 1.6599205955863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18141921050846577, "step": 2100 }, { "epoch": 0.17516666666666666, "grad_norm": 5.71875, "grad_norm_var": 2.745556640625, "learning_rate": 4e-05, "loss": 4.9259, "loss/crossentropy": 2.2989392578601837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21219024062156677, "step": 2102 }, { "epoch": 0.17533333333333334, "grad_norm": 5.0, "grad_norm_var": 2.73033447265625, "learning_rate": 4e-05, "loss": 4.7961, "loss/crossentropy": 1.7167327478528023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22489573061466217, "step": 2104 }, { "epoch": 0.1755, "grad_norm": 5.09375, "grad_norm_var": 0.09425455729166667, "learning_rate": 4e-05, "loss": 5.0813, "loss/crossentropy": 1.840710736811161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18269490636885166, "step": 2106 }, { "epoch": 0.17566666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.08704427083333334, "learning_rate": 4e-05, "loss": 5.1087, "loss/crossentropy": 1.8294510319828987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20847078040242195, "step": 2108 }, { "epoch": 0.17583333333333334, "grad_norm": 5.625, "grad_norm_var": 0.11080322265625, "learning_rate": 4e-05, "loss": 5.5059, "loss/crossentropy": 1.6975836902856827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17947252094745636, "step": 2110 }, { "epoch": 0.176, "grad_norm": 5.25, "grad_norm_var": 0.08876546223958333, "learning_rate": 4e-05, "loss": 5.5898, "loss/crossentropy": 1.7362675666809082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20171913504600525, "step": 2112 }, { "epoch": 0.17616666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.07919514973958333, "learning_rate": 4e-05, "loss": 5.3091, "loss/crossentropy": 1.721466027200222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18343711271882057, "step": 2114 }, { "epoch": 0.17633333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.088134765625, "learning_rate": 4e-05, "loss": 5.4335, "loss/crossentropy": 2.5684576630592346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20511827990412712, "step": 2116 }, { "epoch": 0.1765, "grad_norm": 5.4375, "grad_norm_var": 0.111962890625, "learning_rate": 4e-05, "loss": 5.4175, "loss/crossentropy": 2.235967993736267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22469640895724297, "step": 2118 }, { "epoch": 0.17666666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.10972900390625, "learning_rate": 4e-05, "loss": 4.8452, "loss/crossentropy": 1.9292369186878204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18822060897946358, "step": 2120 }, { "epoch": 0.17683333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.12224934895833334, "learning_rate": 4e-05, "loss": 5.119, "loss/crossentropy": 1.9760426580905914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19950895011425018, "step": 2122 }, { "epoch": 0.177, "grad_norm": 5.25, "grad_norm_var": 0.17069905598958332, "learning_rate": 4e-05, "loss": 5.0708, "loss/crossentropy": 2.346391201019287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23865826427936554, "step": 2124 }, { "epoch": 0.17716666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.16614583333333333, "learning_rate": 4e-05, "loss": 4.943, "loss/crossentropy": 2.2712226808071136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997787281870842, "step": 2126 }, { "epoch": 0.17733333333333334, "grad_norm": 5.0, "grad_norm_var": 0.17420247395833333, "learning_rate": 4e-05, "loss": 5.0841, "loss/crossentropy": 2.0525820776820183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887969095259905, "step": 2128 }, { "epoch": 0.1775, "grad_norm": 6.71875, "grad_norm_var": 0.34625244140625, "learning_rate": 4e-05, "loss": 4.298, "loss/crossentropy": 1.7505680918693542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16385265067219734, "step": 2130 }, { "epoch": 0.17766666666666667, "grad_norm": 5.25, "grad_norm_var": 0.33577067057291665, "learning_rate": 4e-05, "loss": 5.0106, "loss/crossentropy": 2.127786874771118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24591680243611336, "step": 2132 }, { "epoch": 0.17783333333333334, "grad_norm": 4.875, "grad_norm_var": 0.2997233072916667, "learning_rate": 4e-05, "loss": 5.2174, "loss/crossentropy": 1.796779453754425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18160532787442207, "step": 2134 }, { "epoch": 0.178, "grad_norm": 5.09375, "grad_norm_var": 0.29931233723958334, "learning_rate": 4e-05, "loss": 4.7535, "loss/crossentropy": 0.8597075119614601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11564531922340393, "step": 2136 }, { "epoch": 0.17816666666666667, "grad_norm": 5.125, "grad_norm_var": 0.2877888997395833, "learning_rate": 4e-05, "loss": 5.0003, "loss/crossentropy": 2.004398114979267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18805610574781895, "step": 2138 }, { "epoch": 0.17833333333333334, "grad_norm": 5.3125, "grad_norm_var": 0.23642171223958333, "learning_rate": 4e-05, "loss": 4.8822, "loss/crossentropy": 2.124794065952301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21644308045506477, "step": 2140 }, { "epoch": 0.1785, "grad_norm": 5.28125, "grad_norm_var": 0.22561442057291667, "learning_rate": 4e-05, "loss": 5.2293, "loss/crossentropy": 1.2877550274133682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14798221364617348, "step": 2142 }, { "epoch": 0.17866666666666667, "grad_norm": 5.65625, "grad_norm_var": 0.22821858723958333, "learning_rate": 4e-05, "loss": 4.7824, "loss/crossentropy": 1.6831488832831383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15735942497849464, "step": 2144 }, { "epoch": 0.17883333333333334, "grad_norm": 5.59375, "grad_norm_var": 0.05546875, "learning_rate": 4e-05, "loss": 5.2751, "loss/crossentropy": 2.193062275648117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21409276500344276, "step": 2146 }, { "epoch": 0.179, "grad_norm": 4.78125, "grad_norm_var": 0.06773681640625, "learning_rate": 4e-05, "loss": 4.6843, "loss/crossentropy": 2.02949271351099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2134292870759964, "step": 2148 }, { "epoch": 0.17916666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.0623046875, "learning_rate": 4e-05, "loss": 5.2284, "loss/crossentropy": 2.0511502772569656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925040427595377, "step": 2150 }, { "epoch": 0.17933333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.05582275390625, "learning_rate": 4e-05, "loss": 4.8028, "loss/crossentropy": 2.178094059228897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19704850018024445, "step": 2152 }, { "epoch": 0.1795, "grad_norm": 5.78125, "grad_norm_var": 0.090478515625, "learning_rate": 4e-05, "loss": 5.1375, "loss/crossentropy": 2.071473777294159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016850858926773, "step": 2154 }, { "epoch": 0.17966666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.08433837890625, "learning_rate": 4e-05, "loss": 4.4847, "loss/crossentropy": 1.7026320695877075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854725144803524, "step": 2156 }, { "epoch": 0.17983333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.08508707682291666, "learning_rate": 4e-05, "loss": 4.8903, "loss/crossentropy": 1.9909127950668335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21856766939163208, "step": 2158 }, { "epoch": 0.18, "grad_norm": 5.15625, "grad_norm_var": 0.06851806640625, "learning_rate": 4e-05, "loss": 4.8267, "loss/crossentropy": 1.6173651814460754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17569880932569504, "step": 2160 }, { "epoch": 0.18016666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.061747233072916664, "learning_rate": 4e-05, "loss": 4.6537, "loss/crossentropy": 1.753201201558113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20511113107204437, "step": 2162 }, { "epoch": 0.18033333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.050764973958333334, "learning_rate": 4e-05, "loss": 5.3518, "loss/crossentropy": 2.5091399550437927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.232530165463686, "step": 2164 }, { "epoch": 0.1805, "grad_norm": 8.75, "grad_norm_var": 0.8775349934895833, "learning_rate": 4e-05, "loss": 4.9893, "loss/crossentropy": 2.5543057322502136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21447621285915375, "step": 2166 }, { "epoch": 0.18066666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.867041015625, "learning_rate": 4e-05, "loss": 5.153, "loss/crossentropy": 1.5453489795327187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1768591459840536, "step": 2168 }, { "epoch": 0.18083333333333335, "grad_norm": 5.5, "grad_norm_var": 0.86197509765625, "learning_rate": 4e-05, "loss": 4.941, "loss/crossentropy": 1.713011920452118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18514833971858025, "step": 2170 }, { "epoch": 0.181, "grad_norm": 5.0625, "grad_norm_var": 0.86636962890625, "learning_rate": 4e-05, "loss": 5.2454, "loss/crossentropy": 2.531073272228241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22826341539621353, "step": 2172 }, { "epoch": 0.18116666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.8854817708333333, "learning_rate": 4e-05, "loss": 4.6659, "loss/crossentropy": 1.2854736521840096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15183403715491295, "step": 2174 }, { "epoch": 0.18133333333333335, "grad_norm": 5.1875, "grad_norm_var": 0.8856119791666667, "learning_rate": 4e-05, "loss": 5.1651, "loss/crossentropy": 1.829525165259838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17748862504959106, "step": 2176 }, { "epoch": 0.1815, "grad_norm": 5.1875, "grad_norm_var": 0.8767578125, "learning_rate": 4e-05, "loss": 5.3755, "loss/crossentropy": 2.63100802898407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24154625460505486, "step": 2178 }, { "epoch": 0.18166666666666667, "grad_norm": 5.125, "grad_norm_var": 0.9402994791666667, "learning_rate": 4e-05, "loss": 4.9287, "loss/crossentropy": 1.4673430994153023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15381817147135735, "step": 2180 }, { "epoch": 0.18183333333333335, "grad_norm": 5.25, "grad_norm_var": 0.12263997395833333, "learning_rate": 4e-05, "loss": 4.7876, "loss/crossentropy": 2.181483656167984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17778964713215828, "step": 2182 }, { "epoch": 0.182, "grad_norm": 5.09375, "grad_norm_var": 0.12118733723958333, "learning_rate": 4e-05, "loss": 4.9908, "loss/crossentropy": 2.0452851057052612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21750615909695625, "step": 2184 }, { "epoch": 0.18216666666666667, "grad_norm": 5.125, "grad_norm_var": 0.10130208333333333, "learning_rate": 4e-05, "loss": 5.2839, "loss/crossentropy": 1.343794122338295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16215426102280617, "step": 2186 }, { "epoch": 0.18233333333333332, "grad_norm": 5.34375, "grad_norm_var": 0.10143229166666666, "learning_rate": 4e-05, "loss": 4.9467, "loss/crossentropy": 1.4881090819835663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15420327335596085, "step": 2188 }, { "epoch": 0.1825, "grad_norm": 4.9375, "grad_norm_var": 0.09563395182291666, "learning_rate": 4e-05, "loss": 4.7357, "loss/crossentropy": 2.0466759502887726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20744113996624947, "step": 2190 }, { "epoch": 0.18266666666666667, "grad_norm": 6.0, "grad_norm_var": 0.15123291015625, "learning_rate": 4e-05, "loss": 4.8545, "loss/crossentropy": 2.399523586034775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22361257672309875, "step": 2192 }, { "epoch": 0.18283333333333332, "grad_norm": 4.75, "grad_norm_var": 0.13795572916666668, "learning_rate": 4e-05, "loss": 4.3254, "loss/crossentropy": 2.5588160157203674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2560463845729828, "step": 2194 }, { "epoch": 0.183, "grad_norm": 5.0, "grad_norm_var": 0.10146077473958333, "learning_rate": 4e-05, "loss": 4.3573, "loss/crossentropy": 1.7524387538433075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18234056793153286, "step": 2196 }, { "epoch": 0.18316666666666667, "grad_norm": 6.40625, "grad_norm_var": 0.21132405598958334, "learning_rate": 4e-05, "loss": 4.23, "loss/crossentropy": 1.385098822414875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15484962239861488, "step": 2198 }, { "epoch": 0.18333333333333332, "grad_norm": 5.40625, "grad_norm_var": 0.21073811848958332, "learning_rate": 4e-05, "loss": 5.2157, "loss/crossentropy": 2.572742462158203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22486525401473045, "step": 2200 }, { "epoch": 0.1835, "grad_norm": 4.625, "grad_norm_var": 0.23424072265625, "learning_rate": 4e-05, "loss": 4.0647, "loss/crossentropy": 1.7396223545074463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16994404792785645, "step": 2202 }, { "epoch": 0.18366666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.22974853515625, "learning_rate": 4e-05, "loss": 5.3349, "loss/crossentropy": 2.492325782775879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23226865381002426, "step": 2204 }, { "epoch": 0.18383333333333332, "grad_norm": 5.4375, "grad_norm_var": 0.23821614583333334, "learning_rate": 4e-05, "loss": 4.6774, "loss/crossentropy": 1.0675053745508194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1418643444776535, "step": 2206 }, { "epoch": 0.184, "grad_norm": 4.875, "grad_norm_var": 0.18370768229166667, "learning_rate": 4e-05, "loss": 4.6962, "loss/crossentropy": 1.62255859375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18710698559880257, "step": 2208 }, { "epoch": 0.18416666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.19257405598958333, "learning_rate": 4e-05, "loss": 4.572, "loss/crossentropy": 1.9560261443257332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18211353570222855, "step": 2210 }, { "epoch": 0.18433333333333332, "grad_norm": 5.40625, "grad_norm_var": 0.19798177083333332, "learning_rate": 4e-05, "loss": 4.9418, "loss/crossentropy": 2.48140287399292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21697937697172165, "step": 2212 }, { "epoch": 0.1845, "grad_norm": 5.21875, "grad_norm_var": 0.079150390625, "learning_rate": 4e-05, "loss": 5.364, "loss/crossentropy": 2.5937938690185547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2184484452009201, "step": 2214 }, { "epoch": 0.18466666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.07825520833333334, "learning_rate": 4e-05, "loss": 4.9239, "loss/crossentropy": 1.6036360636353493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17884932085871696, "step": 2216 }, { "epoch": 0.18483333333333332, "grad_norm": 4.65625, "grad_norm_var": 0.077587890625, "learning_rate": 4e-05, "loss": 4.6433, "loss/crossentropy": 1.7372171953320503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18615025654435158, "step": 2218 }, { "epoch": 0.185, "grad_norm": 4.90625, "grad_norm_var": 0.06953125, "learning_rate": 4e-05, "loss": 4.3449, "loss/crossentropy": 1.685012899339199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18126941099762917, "step": 2220 }, { "epoch": 0.18516666666666667, "grad_norm": 5.0, "grad_norm_var": 0.07590738932291667, "learning_rate": 4e-05, "loss": 5.6169, "loss/crossentropy": 1.9879830479621887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21872147917747498, "step": 2222 }, { "epoch": 0.18533333333333332, "grad_norm": 4.65625, "grad_norm_var": 0.09312744140625, "learning_rate": 4e-05, "loss": 4.1555, "loss/crossentropy": 0.8916665241122246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11059846729040146, "step": 2224 }, { "epoch": 0.1855, "grad_norm": 4.90625, "grad_norm_var": 0.08787434895833333, "learning_rate": 4e-05, "loss": 5.4427, "loss/crossentropy": 2.078547030687332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19260921701788902, "step": 2226 }, { "epoch": 0.18566666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.08349202473958334, "learning_rate": 4e-05, "loss": 4.473, "loss/crossentropy": 1.3787953928112984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1644715555012226, "step": 2228 }, { "epoch": 0.18583333333333332, "grad_norm": 5.3125, "grad_norm_var": 0.08700764973958333, "learning_rate": 4e-05, "loss": 5.3315, "loss/crossentropy": 2.1148226857185364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21458249166607857, "step": 2230 }, { "epoch": 0.186, "grad_norm": 5.15625, "grad_norm_var": 0.07706705729166667, "learning_rate": 4e-05, "loss": 5.247, "loss/crossentropy": 2.0855464041233063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19903532043099403, "step": 2232 }, { "epoch": 0.18616666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.06721598307291667, "learning_rate": 4e-05, "loss": 4.5324, "loss/crossentropy": 2.331184357404709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067573331296444, "step": 2234 }, { "epoch": 0.18633333333333332, "grad_norm": 5.03125, "grad_norm_var": 0.06634114583333334, "learning_rate": 4e-05, "loss": 4.5482, "loss/crossentropy": 1.4255691543221474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15853366628289223, "step": 2236 }, { "epoch": 0.1865, "grad_norm": 5.1875, "grad_norm_var": 0.04989827473958333, "learning_rate": 4e-05, "loss": 4.9923, "loss/crossentropy": 1.6706126257777214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20294161699712276, "step": 2238 }, { "epoch": 0.18666666666666668, "grad_norm": 4.9375, "grad_norm_var": 0.03319905598958333, "learning_rate": 4e-05, "loss": 5.0886, "loss/crossentropy": 2.1218108534812927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21483079716563225, "step": 2240 }, { "epoch": 0.18683333333333332, "grad_norm": 4.96875, "grad_norm_var": 0.02584228515625, "learning_rate": 4e-05, "loss": 4.8981, "loss/crossentropy": 1.091003268957138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18091485276818275, "step": 2242 }, { "epoch": 0.187, "grad_norm": 5.09375, "grad_norm_var": 0.022977701822916665, "learning_rate": 4e-05, "loss": 4.9307, "loss/crossentropy": 1.5533147603273392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16520674899220467, "step": 2244 }, { "epoch": 0.18716666666666668, "grad_norm": 5.15625, "grad_norm_var": 0.023177083333333334, "learning_rate": 4e-05, "loss": 5.1086, "loss/crossentropy": 1.7440512776374817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17713807709515095, "step": 2246 }, { "epoch": 0.18733333333333332, "grad_norm": 4.84375, "grad_norm_var": 0.027958170572916666, "learning_rate": 4e-05, "loss": 4.9532, "loss/crossentropy": 1.5148718804121017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19824461452662945, "step": 2248 }, { "epoch": 0.1875, "grad_norm": 5.0625, "grad_norm_var": 0.02974853515625, "learning_rate": 4e-05, "loss": 5.4161, "loss/crossentropy": 2.427815794944763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23496362939476967, "step": 2250 }, { "epoch": 0.18766666666666668, "grad_norm": 5.0, "grad_norm_var": 0.03557535807291667, "learning_rate": 4e-05, "loss": 5.3724, "loss/crossentropy": 2.0604121685028076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21071171015501022, "step": 2252 }, { "epoch": 0.18783333333333332, "grad_norm": 4.78125, "grad_norm_var": 0.06519775390625, "learning_rate": 4e-05, "loss": 4.9961, "loss/crossentropy": 2.419283837080002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21685468032956123, "step": 2254 }, { "epoch": 0.188, "grad_norm": 5.25, "grad_norm_var": 0.05738525390625, "learning_rate": 4e-05, "loss": 5.7778, "loss/crossentropy": 2.7884849309921265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20676440745592117, "step": 2256 }, { "epoch": 0.18816666666666668, "grad_norm": 5.5, "grad_norm_var": 0.06968994140625, "learning_rate": 4e-05, "loss": 5.0168, "loss/crossentropy": 1.3894076570868492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17267927527427673, "step": 2258 }, { "epoch": 0.18833333333333332, "grad_norm": 4.71875, "grad_norm_var": 0.079150390625, "learning_rate": 4e-05, "loss": 5.0644, "loss/crossentropy": 1.7085940018296242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17589740082621574, "step": 2260 }, { "epoch": 0.1885, "grad_norm": 5.125, "grad_norm_var": 0.07649332682291667, "learning_rate": 4e-05, "loss": 4.6788, "loss/crossentropy": 2.5399693846702576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22661524266004562, "step": 2262 }, { "epoch": 0.18866666666666668, "grad_norm": 4.375, "grad_norm_var": 0.10227457682291667, "learning_rate": 4e-05, "loss": 4.3746, "loss/crossentropy": 1.6717079058289528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17274832166731358, "step": 2264 }, { "epoch": 0.18883333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.104296875, "learning_rate": 4e-05, "loss": 5.2109, "loss/crossentropy": 2.150670550763607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020272258669138, "step": 2266 }, { "epoch": 0.189, "grad_norm": 5.625, "grad_norm_var": 0.1154296875, "learning_rate": 4e-05, "loss": 5.4183, "loss/crossentropy": 1.5393069609999657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19283157959580421, "step": 2268 }, { "epoch": 0.18916666666666668, "grad_norm": 5.28125, "grad_norm_var": 0.103369140625, "learning_rate": 4e-05, "loss": 5.1102, "loss/crossentropy": 1.8512317463755608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18179438635706902, "step": 2270 }, { "epoch": 0.18933333333333333, "grad_norm": 5.375, "grad_norm_var": 0.11417643229166667, "learning_rate": 4e-05, "loss": 4.9907, "loss/crossentropy": 1.8801306560635567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18935791589319706, "step": 2272 }, { "epoch": 0.1895, "grad_norm": 5.0, "grad_norm_var": 0.09908447265625, "learning_rate": 4e-05, "loss": 5.1938, "loss/crossentropy": 1.6166588142514229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16408336907625198, "step": 2274 }, { "epoch": 0.18966666666666668, "grad_norm": 4.875, "grad_norm_var": 0.098291015625, "learning_rate": 4e-05, "loss": 4.4079, "loss/crossentropy": 1.650967113673687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15877833776175976, "step": 2276 }, { "epoch": 0.18983333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.101953125, "learning_rate": 4e-05, "loss": 5.4296, "loss/crossentropy": 2.6282625794410706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21001841127872467, "step": 2278 }, { "epoch": 0.19, "grad_norm": 5.6875, "grad_norm_var": 0.10572509765625, "learning_rate": 4e-05, "loss": 5.0226, "loss/crossentropy": 1.8326758667826653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20088370144367218, "step": 2280 }, { "epoch": 0.19016666666666668, "grad_norm": 5.4375, "grad_norm_var": 0.10113525390625, "learning_rate": 4e-05, "loss": 5.1215, "loss/crossentropy": 2.404378890991211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352002039551735, "step": 2282 }, { "epoch": 0.19033333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.09342447916666667, "learning_rate": 4e-05, "loss": 4.2729, "loss/crossentropy": 1.60519190877676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19760267436504364, "step": 2284 }, { "epoch": 0.1905, "grad_norm": 5.1875, "grad_norm_var": 0.08205973307291667, "learning_rate": 4e-05, "loss": 4.9924, "loss/crossentropy": 1.188548594713211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14038443379104137, "step": 2286 }, { "epoch": 0.19066666666666668, "grad_norm": 5.46875, "grad_norm_var": 0.14068603515625, "learning_rate": 4e-05, "loss": 5.083, "loss/crossentropy": 2.4681698083877563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23486436530947685, "step": 2288 }, { "epoch": 0.19083333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.13886311848958333, "learning_rate": 4e-05, "loss": 5.2423, "loss/crossentropy": 2.2314860820770264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19190067425370216, "step": 2290 }, { "epoch": 0.191, "grad_norm": 5.40625, "grad_norm_var": 0.12511393229166667, "learning_rate": 4e-05, "loss": 5.0995, "loss/crossentropy": 1.8339603021740913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18039636500179768, "step": 2292 }, { "epoch": 0.19116666666666668, "grad_norm": 5.4375, "grad_norm_var": 0.13111572265625, "learning_rate": 4e-05, "loss": 4.3892, "loss/crossentropy": 1.6122345626354218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16903558000922203, "step": 2294 }, { "epoch": 0.19133333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.13097330729166667, "learning_rate": 4e-05, "loss": 5.1152, "loss/crossentropy": 1.9905153512954712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18592159822583199, "step": 2296 }, { "epoch": 0.1915, "grad_norm": 5.15625, "grad_norm_var": 0.13268229166666667, "learning_rate": 4e-05, "loss": 5.1794, "loss/crossentropy": 2.347501277923584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232624776661396, "step": 2298 }, { "epoch": 0.19166666666666668, "grad_norm": 5.28125, "grad_norm_var": 0.14368489583333333, "learning_rate": 4e-05, "loss": 4.485, "loss/crossentropy": 1.5246716812252998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17598333582282066, "step": 2300 }, { "epoch": 0.19183333333333333, "grad_norm": 4.875, "grad_norm_var": 0.15271809895833333, "learning_rate": 4e-05, "loss": 4.8725, "loss/crossentropy": 2.488103985786438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2659139633178711, "step": 2302 }, { "epoch": 0.192, "grad_norm": 5.125, "grad_norm_var": 0.09537760416666667, "learning_rate": 4e-05, "loss": 4.7235, "loss/crossentropy": 2.192271262407303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22700363397598267, "step": 2304 }, { "epoch": 0.19216666666666668, "grad_norm": 5.0625, "grad_norm_var": 0.11910400390625, "learning_rate": 4e-05, "loss": 4.8076, "loss/crossentropy": 2.3486633598804474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21487021818757057, "step": 2306 }, { "epoch": 0.19233333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.7286458333333333, "learning_rate": 4e-05, "loss": 4.8531, "loss/crossentropy": 1.9830282926559448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20326359942555428, "step": 2308 }, { "epoch": 0.1925, "grad_norm": 4.59375, "grad_norm_var": 0.7384765625, "learning_rate": 4e-05, "loss": 4.583, "loss/crossentropy": 1.994844913482666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19658523239195347, "step": 2310 }, { "epoch": 0.19266666666666668, "grad_norm": 5.03125, "grad_norm_var": 0.7403605143229167, "learning_rate": 4e-05, "loss": 4.6763, "loss/crossentropy": 2.076766610145569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21167897433042526, "step": 2312 }, { "epoch": 0.19283333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.7406209309895834, "learning_rate": 4e-05, "loss": 5.2687, "loss/crossentropy": 1.9793154150247574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1830280590802431, "step": 2314 }, { "epoch": 0.193, "grad_norm": 5.125, "grad_norm_var": 0.7391764322916666, "learning_rate": 4e-05, "loss": 5.642, "loss/crossentropy": 2.6030354499816895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964925155043602, "step": 2316 }, { "epoch": 0.19316666666666665, "grad_norm": 5.0, "grad_norm_var": 0.7379191080729167, "learning_rate": 4e-05, "loss": 5.1509, "loss/crossentropy": 1.6609367281198502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19058697298169136, "step": 2318 }, { "epoch": 0.19333333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.7442545572916667, "learning_rate": 4e-05, "loss": 4.8977, "loss/crossentropy": 1.2792168036103249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.160741800442338, "step": 2320 }, { "epoch": 0.1935, "grad_norm": 4.78125, "grad_norm_var": 0.7247355143229167, "learning_rate": 4e-05, "loss": 4.5242, "loss/crossentropy": 1.762831062078476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17957177013158798, "step": 2322 }, { "epoch": 0.19366666666666665, "grad_norm": 5.09375, "grad_norm_var": 0.06565348307291667, "learning_rate": 4e-05, "loss": 5.2638, "loss/crossentropy": 2.090299479663372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19620474614202976, "step": 2324 }, { "epoch": 0.19383333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.06474202473958333, "learning_rate": 4e-05, "loss": 5.1114, "loss/crossentropy": 2.4732211232185364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23878782615065575, "step": 2326 }, { "epoch": 0.194, "grad_norm": 5.03125, "grad_norm_var": 0.06474202473958333, "learning_rate": 4e-05, "loss": 5.012, "loss/crossentropy": 1.8278373926877975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19272247329354286, "step": 2328 }, { "epoch": 0.19416666666666665, "grad_norm": 4.65625, "grad_norm_var": 0.07177327473958334, "learning_rate": 4e-05, "loss": 5.0843, "loss/crossentropy": 2.1059842854738235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18917707912623882, "step": 2330 }, { "epoch": 0.19433333333333333, "grad_norm": 4.625, "grad_norm_var": 0.05779622395833333, "learning_rate": 4e-05, "loss": 4.6124, "loss/crossentropy": 2.4292266964912415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2285252884030342, "step": 2332 }, { "epoch": 0.1945, "grad_norm": 4.96875, "grad_norm_var": 0.05810139973958333, "learning_rate": 4e-05, "loss": 4.9482, "loss/crossentropy": 2.253578156232834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20206546410918236, "step": 2334 }, { "epoch": 0.19466666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.04045817057291667, "learning_rate": 4e-05, "loss": 4.3528, "loss/crossentropy": 2.476481080055237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20762532204389572, "step": 2336 }, { "epoch": 0.19483333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.24381510416666666, "learning_rate": 4e-05, "loss": 5.4346, "loss/crossentropy": 2.1690665781497955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23577401787042618, "step": 2338 }, { "epoch": 0.195, "grad_norm": 5.25, "grad_norm_var": 0.24620768229166667, "learning_rate": 4e-05, "loss": 5.4965, "loss/crossentropy": 2.1100385785102844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158173955976963, "step": 2340 }, { "epoch": 0.19516666666666665, "grad_norm": 5.3125, "grad_norm_var": 0.2534993489583333, "learning_rate": 4e-05, "loss": 4.904, "loss/crossentropy": 1.9950718879699707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088406071066856, "step": 2342 }, { "epoch": 0.19533333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.25813395182291665, "learning_rate": 4e-05, "loss": 5.2952, "loss/crossentropy": 2.032729558646679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18892317451536655, "step": 2344 }, { "epoch": 0.1955, "grad_norm": 4.96875, "grad_norm_var": 0.24455973307291667, "learning_rate": 4e-05, "loss": 4.843, "loss/crossentropy": 2.043039858341217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19413789920508862, "step": 2346 }, { "epoch": 0.19566666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.21181233723958334, "learning_rate": 4e-05, "loss": 5.0166, "loss/crossentropy": 2.136802703142166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2046213485300541, "step": 2348 }, { "epoch": 0.19583333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.20038655598958333, "learning_rate": 4e-05, "loss": 5.1836, "loss/crossentropy": 1.9726791083812714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19684670493006706, "step": 2350 }, { "epoch": 0.196, "grad_norm": 5.0, "grad_norm_var": 0.18697916666666667, "learning_rate": 4e-05, "loss": 4.5345, "loss/crossentropy": 1.5553816556930542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23889531195163727, "step": 2352 }, { "epoch": 0.19616666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.09179280598958334, "learning_rate": 4e-05, "loss": 4.2414, "loss/crossentropy": 1.5767273381352425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16629118844866753, "step": 2354 }, { "epoch": 0.19633333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.105859375, "learning_rate": 4e-05, "loss": 4.545, "loss/crossentropy": 1.3365295231342316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1607996355742216, "step": 2356 }, { "epoch": 0.1965, "grad_norm": 5.125, "grad_norm_var": 0.08943684895833333, "learning_rate": 4e-05, "loss": 5.5573, "loss/crossentropy": 1.968780405819416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766455788165331, "step": 2358 }, { "epoch": 0.19666666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.07890218098958333, "learning_rate": 4e-05, "loss": 5.5475, "loss/crossentropy": 1.7819099575281143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18220307305455208, "step": 2360 }, { "epoch": 0.19683333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.09217122395833334, "learning_rate": 4e-05, "loss": 5.383, "loss/crossentropy": 2.179343730211258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25340667366981506, "step": 2362 }, { "epoch": 0.197, "grad_norm": 4.96875, "grad_norm_var": 0.089697265625, "learning_rate": 4e-05, "loss": 5.4171, "loss/crossentropy": 2.6181305050849915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063765563070774, "step": 2364 }, { "epoch": 0.19716666666666666, "grad_norm": 5.5, "grad_norm_var": 0.4177734375, "learning_rate": 4e-05, "loss": 5.3798, "loss/crossentropy": 2.5250502824783325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21604949235916138, "step": 2366 }, { "epoch": 0.19733333333333333, "grad_norm": 5.0, "grad_norm_var": 0.413525390625, "learning_rate": 4e-05, "loss": 4.6161, "loss/crossentropy": 1.679262101650238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18522610887885094, "step": 2368 }, { "epoch": 0.1975, "grad_norm": 5.0, "grad_norm_var": 0.35937093098958334, "learning_rate": 4e-05, "loss": 4.7427, "loss/crossentropy": 1.8730647563934326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18919868022203445, "step": 2370 }, { "epoch": 0.19766666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.32896728515625, "learning_rate": 4e-05, "loss": 4.9464, "loss/crossentropy": 1.8721271231770515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19726279936730862, "step": 2372 }, { "epoch": 0.19783333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.3389973958333333, "learning_rate": 4e-05, "loss": 4.9454, "loss/crossentropy": 2.39280566573143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2158326916396618, "step": 2374 }, { "epoch": 0.198, "grad_norm": 5.0, "grad_norm_var": 0.3389973958333333, "learning_rate": 4e-05, "loss": 5.2534, "loss/crossentropy": 2.6808955669403076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230102077126503, "step": 2376 }, { "epoch": 0.19816666666666666, "grad_norm": 4.625, "grad_norm_var": 0.3664347330729167, "learning_rate": 4e-05, "loss": 4.9032, "loss/crossentropy": 2.062427654862404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18818776682019234, "step": 2378 }, { "epoch": 0.19833333333333333, "grad_norm": 5.0, "grad_norm_var": 0.36503499348958335, "learning_rate": 4e-05, "loss": 4.4601, "loss/crossentropy": 0.9793087244033813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11906297132372856, "step": 2380 }, { "epoch": 0.1985, "grad_norm": 5.4375, "grad_norm_var": 0.05455729166666667, "learning_rate": 4e-05, "loss": 4.8264, "loss/crossentropy": 2.2848470509052277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223106287419796, "step": 2382 }, { "epoch": 0.19866666666666666, "grad_norm": 5.65625, "grad_norm_var": 0.07232666015625, "learning_rate": 4e-05, "loss": 4.9411, "loss/crossentropy": 2.03727525472641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002503089606762, "step": 2384 }, { "epoch": 0.19883333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.06990559895833333, "learning_rate": 4e-05, "loss": 4.7408, "loss/crossentropy": 2.0407300665974617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17866826429963112, "step": 2386 }, { "epoch": 0.199, "grad_norm": 4.875, "grad_norm_var": 0.06881510416666667, "learning_rate": 4e-05, "loss": 4.6838, "loss/crossentropy": 1.9522857144474983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1880640648305416, "step": 2388 }, { "epoch": 0.19916666666666666, "grad_norm": 5.34375, "grad_norm_var": 0.07489827473958334, "learning_rate": 4e-05, "loss": 4.1322, "loss/crossentropy": 1.9232835546135902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843096949160099, "step": 2390 }, { "epoch": 0.19933333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.08131103515625, "learning_rate": 4e-05, "loss": 4.9668, "loss/crossentropy": 2.4705487489700317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21942654252052307, "step": 2392 }, { "epoch": 0.1995, "grad_norm": 5.78125, "grad_norm_var": 0.11480712890625, "learning_rate": 4e-05, "loss": 4.046, "loss/crossentropy": 1.8307190835475922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106981799006462, "step": 2394 }, { "epoch": 0.19966666666666666, "grad_norm": 5.375, "grad_norm_var": 0.116650390625, "learning_rate": 4e-05, "loss": 5.0327, "loss/crossentropy": 1.923123762011528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18329189345240593, "step": 2396 }, { "epoch": 0.19983333333333334, "grad_norm": 5.0, "grad_norm_var": 0.11988525390625, "learning_rate": 4e-05, "loss": 4.2767, "loss/crossentropy": 1.2369297593832016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16844778135418892, "step": 2398 }, { "epoch": 0.2, "grad_norm": 5.1875, "grad_norm_var": 0.094140625, "learning_rate": 4e-05, "loss": 5.1696, "loss/crossentropy": 2.8806328773498535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21086610108613968, "step": 2400 }, { "epoch": 0.20016666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.10504150390625, "learning_rate": 4e-05, "loss": 5.0176, "loss/crossentropy": 1.6765633448958397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1639060154557228, "step": 2402 }, { "epoch": 0.20033333333333334, "grad_norm": 5.53125, "grad_norm_var": 0.14058837890625, "learning_rate": 4e-05, "loss": 4.9152, "loss/crossentropy": 1.8263401091098785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19616793654859066, "step": 2404 }, { "epoch": 0.2005, "grad_norm": 5.34375, "grad_norm_var": 0.142041015625, "learning_rate": 4e-05, "loss": 5.5261, "loss/crossentropy": 2.289825439453125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2241990529000759, "step": 2406 }, { "epoch": 0.20066666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.14482014973958332, "learning_rate": 4e-05, "loss": 5.0313, "loss/crossentropy": 2.223125606775284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748021453619003, "step": 2408 }, { "epoch": 0.20083333333333334, "grad_norm": 5.625, "grad_norm_var": 0.120166015625, "learning_rate": 4e-05, "loss": 4.7346, "loss/crossentropy": 2.1327845007181168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936967596411705, "step": 2410 }, { "epoch": 0.201, "grad_norm": 5.1875, "grad_norm_var": 0.11864827473958334, "learning_rate": 4e-05, "loss": 5.4438, "loss/crossentropy": 2.612125277519226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22732067853212357, "step": 2412 }, { "epoch": 0.20116666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.10035400390625, "learning_rate": 4e-05, "loss": 4.7912, "loss/crossentropy": 1.7989770472049713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17794376984238625, "step": 2414 }, { "epoch": 0.20133333333333334, "grad_norm": 5.3125, "grad_norm_var": 0.10250244140625, "learning_rate": 4e-05, "loss": 5.2793, "loss/crossentropy": 1.7428071647882462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1840539537370205, "step": 2416 }, { "epoch": 0.2015, "grad_norm": 5.1875, "grad_norm_var": 0.08040364583333333, "learning_rate": 4e-05, "loss": 5.137, "loss/crossentropy": 2.491989552974701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21401550620794296, "step": 2418 }, { "epoch": 0.20166666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.07121988932291666, "learning_rate": 4e-05, "loss": 5.1646, "loss/crossentropy": 1.4126268327236176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16175073012709618, "step": 2420 }, { "epoch": 0.20183333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.07980143229166667, "learning_rate": 4e-05, "loss": 4.6839, "loss/crossentropy": 2.006320595741272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155579999089241, "step": 2422 }, { "epoch": 0.202, "grad_norm": 4.96875, "grad_norm_var": 0.075634765625, "learning_rate": 4e-05, "loss": 5.0565, "loss/crossentropy": 2.3960747718811035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1939377263188362, "step": 2424 }, { "epoch": 0.20216666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.055924479166666666, "learning_rate": 4e-05, "loss": 4.5787, "loss/crossentropy": 1.9523345828056335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18938801437616348, "step": 2426 }, { "epoch": 0.20233333333333334, "grad_norm": 4.75, "grad_norm_var": 0.05636393229166667, "learning_rate": 4e-05, "loss": 4.8534, "loss/crossentropy": 2.0117806047201157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1813819408416748, "step": 2428 }, { "epoch": 0.2025, "grad_norm": 5.1875, "grad_norm_var": 0.07122395833333334, "learning_rate": 4e-05, "loss": 4.7015, "loss/crossentropy": 1.265286423265934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14998789504170418, "step": 2430 }, { "epoch": 0.20266666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.059403483072916666, "learning_rate": 4e-05, "loss": 5.1934, "loss/crossentropy": 2.120694190263748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23458874225616455, "step": 2432 }, { "epoch": 0.20283333333333334, "grad_norm": 5.625, "grad_norm_var": 0.08866780598958333, "learning_rate": 4e-05, "loss": 4.2152, "loss/crossentropy": 0.7457961067557335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11005957797169685, "step": 2434 }, { "epoch": 0.203, "grad_norm": 5.03125, "grad_norm_var": 0.074462890625, "learning_rate": 4e-05, "loss": 5.2149, "loss/crossentropy": 1.8204586580395699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17762837558984756, "step": 2436 }, { "epoch": 0.20316666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.08463134765625, "learning_rate": 4e-05, "loss": 5.2604, "loss/crossentropy": 2.4990166425704956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22241423279047012, "step": 2438 }, { "epoch": 0.20333333333333334, "grad_norm": 4.875, "grad_norm_var": 0.08203125, "learning_rate": 4e-05, "loss": 4.7264, "loss/crossentropy": 1.5507011637091637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16992885246872902, "step": 2440 }, { "epoch": 0.2035, "grad_norm": 5.0, "grad_norm_var": 0.07978108723958334, "learning_rate": 4e-05, "loss": 4.5444, "loss/crossentropy": 1.4435075148940086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15607544034719467, "step": 2442 }, { "epoch": 0.20366666666666666, "grad_norm": 5.5, "grad_norm_var": 0.08411458333333334, "learning_rate": 4e-05, "loss": 4.8557, "loss/crossentropy": 2.127426564693451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19914470985531807, "step": 2444 }, { "epoch": 0.20383333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.06495768229166667, "learning_rate": 4e-05, "loss": 5.2658, "loss/crossentropy": 2.2032350599765778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21544507518410683, "step": 2446 }, { "epoch": 0.204, "grad_norm": 5.0625, "grad_norm_var": 0.06483968098958333, "learning_rate": 4e-05, "loss": 4.762, "loss/crossentropy": 1.4311346858739853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1594764105975628, "step": 2448 }, { "epoch": 0.20416666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.037093098958333334, "learning_rate": 4e-05, "loss": 5.1773, "loss/crossentropy": 2.3031201362609863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23431189358234406, "step": 2450 }, { "epoch": 0.20433333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.047163899739583334, "learning_rate": 4e-05, "loss": 4.7683, "loss/crossentropy": 1.0590153932571411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14155251905322075, "step": 2452 }, { "epoch": 0.2045, "grad_norm": 4.78125, "grad_norm_var": 0.03553059895833333, "learning_rate": 4e-05, "loss": 4.7823, "loss/crossentropy": 2.6489208340644836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22386395931243896, "step": 2454 }, { "epoch": 0.20466666666666666, "grad_norm": 5.625, "grad_norm_var": 0.056233723958333336, "learning_rate": 4e-05, "loss": 4.9762, "loss/crossentropy": 1.5068995282053947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19015095010399818, "step": 2456 }, { "epoch": 0.20483333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.05579020182291667, "learning_rate": 4e-05, "loss": 4.8357, "loss/crossentropy": 2.2568003833293915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21340489014983177, "step": 2458 }, { "epoch": 0.205, "grad_norm": 4.90625, "grad_norm_var": 0.045426432291666666, "learning_rate": 4e-05, "loss": 4.8352, "loss/crossentropy": 2.2082974314689636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2341243252158165, "step": 2460 }, { "epoch": 0.20516666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.04690348307291667, "learning_rate": 4e-05, "loss": 5.3281, "loss/crossentropy": 2.6525495648384094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21573406457901, "step": 2462 }, { "epoch": 0.20533333333333334, "grad_norm": 4.875, "grad_norm_var": 0.049051920572916664, "learning_rate": 4e-05, "loss": 4.479, "loss/crossentropy": 1.88506181538105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18165216967463493, "step": 2464 }, { "epoch": 0.2055, "grad_norm": 4.96875, "grad_norm_var": 0.05089518229166667, "learning_rate": 4e-05, "loss": 4.6123, "loss/crossentropy": 0.9983108341693878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15218045935034752, "step": 2466 }, { "epoch": 0.20566666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.0470703125, "learning_rate": 4e-05, "loss": 5.4004, "loss/crossentropy": 2.5370147228240967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2264598123729229, "step": 2468 }, { "epoch": 0.20583333333333334, "grad_norm": 5.25, "grad_norm_var": 0.092822265625, "learning_rate": 4e-05, "loss": 5.4403, "loss/crossentropy": 2.3860780596733093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223365642130375, "step": 2470 }, { "epoch": 0.206, "grad_norm": 5.0, "grad_norm_var": 0.07654622395833334, "learning_rate": 4e-05, "loss": 5.4358, "loss/crossentropy": 2.091231919825077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.188719617202878, "step": 2472 }, { "epoch": 0.20616666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.08943684895833333, "learning_rate": 4e-05, "loss": 4.1961, "loss/crossentropy": 2.3965645730495453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22580226883292198, "step": 2474 }, { "epoch": 0.20633333333333334, "grad_norm": 4.625, "grad_norm_var": 0.11350504557291667, "learning_rate": 4e-05, "loss": 4.7299, "loss/crossentropy": 1.1824834942817688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1517321616411209, "step": 2476 }, { "epoch": 0.2065, "grad_norm": 5.09375, "grad_norm_var": 0.12706705729166667, "learning_rate": 4e-05, "loss": 4.9486, "loss/crossentropy": 2.443815290927887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23438604921102524, "step": 2478 }, { "epoch": 0.20666666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.14342041015625, "learning_rate": 4e-05, "loss": 4.6524, "loss/crossentropy": 1.9140185862779617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21477340161800385, "step": 2480 }, { "epoch": 0.20683333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.14384358723958332, "learning_rate": 4e-05, "loss": 5.1093, "loss/crossentropy": 2.3452938199043274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23434526473283768, "step": 2482 }, { "epoch": 0.207, "grad_norm": 4.875, "grad_norm_var": 0.14306233723958334, "learning_rate": 4e-05, "loss": 4.3748, "loss/crossentropy": 1.414752148091793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15633751079440117, "step": 2484 }, { "epoch": 0.20716666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.07537434895833334, "learning_rate": 4e-05, "loss": 5.2316, "loss/crossentropy": 2.1838470697402954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045600563287735, "step": 2486 }, { "epoch": 0.20733333333333334, "grad_norm": 5.0, "grad_norm_var": 0.07502848307291667, "learning_rate": 4e-05, "loss": 4.6684, "loss/crossentropy": 1.1734877079725266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1591203808784485, "step": 2488 }, { "epoch": 0.2075, "grad_norm": 5.0, "grad_norm_var": 0.08381754557291667, "learning_rate": 4e-05, "loss": 4.6216, "loss/crossentropy": 2.521151304244995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22322241961956024, "step": 2490 }, { "epoch": 0.20766666666666667, "grad_norm": 4.875, "grad_norm_var": 0.056494140625, "learning_rate": 4e-05, "loss": 5.1962, "loss/crossentropy": 1.9911609292030334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19882928766310215, "step": 2492 }, { "epoch": 0.20783333333333334, "grad_norm": 5.40625, "grad_norm_var": 0.05728759765625, "learning_rate": 4e-05, "loss": 4.8796, "loss/crossentropy": 1.5489679425954819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.168511301279068, "step": 2494 }, { "epoch": 0.208, "grad_norm": 5.15625, "grad_norm_var": 0.03863525390625, "learning_rate": 4e-05, "loss": 4.9423, "loss/crossentropy": 2.0905182361602783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21954334527254105, "step": 2496 }, { "epoch": 0.20816666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.037430826822916666, "learning_rate": 4e-05, "loss": 4.897, "loss/crossentropy": 2.207546591758728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22128864005208015, "step": 2498 }, { "epoch": 0.20833333333333334, "grad_norm": 6.0625, "grad_norm_var": 0.10032552083333333, "learning_rate": 4e-05, "loss": 5.2691, "loss/crossentropy": 1.9401999711990356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22317113354802132, "step": 2500 }, { "epoch": 0.2085, "grad_norm": 4.78125, "grad_norm_var": 0.104150390625, "learning_rate": 4e-05, "loss": 4.6666, "loss/crossentropy": 1.600251205265522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15894647873938084, "step": 2502 }, { "epoch": 0.20866666666666667, "grad_norm": 4.75, "grad_norm_var": 0.10896809895833333, "learning_rate": 4e-05, "loss": 4.9524, "loss/crossentropy": 1.9892296642065048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17438082955777645, "step": 2504 }, { "epoch": 0.20883333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.09537760416666667, "learning_rate": 4e-05, "loss": 4.8822, "loss/crossentropy": 1.401974692940712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15046437829732895, "step": 2506 }, { "epoch": 0.209, "grad_norm": 5.15625, "grad_norm_var": 0.102197265625, "learning_rate": 4e-05, "loss": 5.2628, "loss/crossentropy": 1.6273729652166367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18275013752281666, "step": 2508 }, { "epoch": 0.20916666666666667, "grad_norm": 5.59375, "grad_norm_var": 0.11470947265625, "learning_rate": 4e-05, "loss": 5.2575, "loss/crossentropy": 2.275018572807312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2178800217807293, "step": 2510 }, { "epoch": 0.20933333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.12200113932291666, "learning_rate": 4e-05, "loss": 5.1247, "loss/crossentropy": 1.5231431126594543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15373231656849384, "step": 2512 }, { "epoch": 0.2095, "grad_norm": 5.46875, "grad_norm_var": 0.14397379557291667, "learning_rate": 4e-05, "loss": 5.4726, "loss/crossentropy": 2.2370805740356445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20047016441822052, "step": 2514 }, { "epoch": 0.20966666666666667, "grad_norm": 4.875, "grad_norm_var": 0.09107666015625, "learning_rate": 4e-05, "loss": 4.7663, "loss/crossentropy": 1.8152910470962524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18139103800058365, "step": 2516 }, { "epoch": 0.20983333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.10071614583333334, "learning_rate": 4e-05, "loss": 4.5476, "loss/crossentropy": 0.983028382062912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11749066784977913, "step": 2518 }, { "epoch": 0.21, "grad_norm": 5.03125, "grad_norm_var": 0.09244384765625, "learning_rate": 4e-05, "loss": 5.1911, "loss/crossentropy": 2.3584609627723694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22418444603681564, "step": 2520 }, { "epoch": 0.21016666666666667, "grad_norm": 5.25, "grad_norm_var": 0.09595947265625, "learning_rate": 4e-05, "loss": 4.8606, "loss/crossentropy": 2.3720744848251343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20562008023262024, "step": 2522 }, { "epoch": 0.21033333333333334, "grad_norm": 5.0, "grad_norm_var": 0.08292643229166667, "learning_rate": 4e-05, "loss": 4.9864, "loss/crossentropy": 0.9131257832050323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11408952996134758, "step": 2524 }, { "epoch": 0.2105, "grad_norm": 5.28125, "grad_norm_var": 0.06822916666666666, "learning_rate": 4e-05, "loss": 4.9971, "loss/crossentropy": 1.5935637727379799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20797242410480976, "step": 2526 }, { "epoch": 0.21066666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.06370035807291667, "learning_rate": 4e-05, "loss": 5.2704, "loss/crossentropy": 2.1786339581012726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19719019532203674, "step": 2528 }, { "epoch": 0.21083333333333334, "grad_norm": 5.4375, "grad_norm_var": 0.0521484375, "learning_rate": 4e-05, "loss": 5.4812, "loss/crossentropy": 2.0704859495162964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22412089258432388, "step": 2530 }, { "epoch": 0.211, "grad_norm": 5.09375, "grad_norm_var": 0.04208577473958333, "learning_rate": 4e-05, "loss": 5.3409, "loss/crossentropy": 2.5442384481430054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23552602529525757, "step": 2532 }, { "epoch": 0.21116666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.025386555989583334, "learning_rate": 4e-05, "loss": 5.168, "loss/crossentropy": 2.3277163207530975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21133242174983025, "step": 2534 }, { "epoch": 0.21133333333333335, "grad_norm": 5.09375, "grad_norm_var": 0.026786295572916667, "learning_rate": 4e-05, "loss": 4.81, "loss/crossentropy": 2.286676347255707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255946770310402, "step": 2536 }, { "epoch": 0.2115, "grad_norm": 5.09375, "grad_norm_var": 0.023111979166666668, "learning_rate": 4e-05, "loss": 4.725, "loss/crossentropy": 1.7959834411740303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1922011747956276, "step": 2538 }, { "epoch": 0.21166666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.024149576822916668, "learning_rate": 4e-05, "loss": 5.055, "loss/crossentropy": 2.4185322523117065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21328793838620186, "step": 2540 }, { "epoch": 0.21183333333333335, "grad_norm": 4.96875, "grad_norm_var": 0.019254557291666665, "learning_rate": 4e-05, "loss": 5.0846, "loss/crossentropy": 2.1555165350437164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2327948994934559, "step": 2542 }, { "epoch": 0.212, "grad_norm": 5.5, "grad_norm_var": 0.04269205729166667, "learning_rate": 4e-05, "loss": 5.2851, "loss/crossentropy": 1.4759873449802399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15248359367251396, "step": 2544 }, { "epoch": 0.21216666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.031884765625, "learning_rate": 4e-05, "loss": 4.7554, "loss/crossentropy": 1.4834392219781876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17983658611774445, "step": 2546 }, { "epoch": 0.21233333333333335, "grad_norm": 5.46875, "grad_norm_var": 0.043603515625, "learning_rate": 4e-05, "loss": 5.007, "loss/crossentropy": 1.708244226872921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18391894362866879, "step": 2548 }, { "epoch": 0.2125, "grad_norm": 5.0625, "grad_norm_var": 0.04400634765625, "learning_rate": 4e-05, "loss": 4.9281, "loss/crossentropy": 1.6128144562244415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1744004562497139, "step": 2550 }, { "epoch": 0.21266666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.059098307291666666, "learning_rate": 4e-05, "loss": 4.9651, "loss/crossentropy": 1.640898883342743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18997144512832165, "step": 2552 }, { "epoch": 0.21283333333333335, "grad_norm": 5.4375, "grad_norm_var": 0.08271077473958334, "learning_rate": 4e-05, "loss": 5.2758, "loss/crossentropy": 1.6650393679738045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1907278336584568, "step": 2554 }, { "epoch": 0.213, "grad_norm": 5.46875, "grad_norm_var": 0.09267171223958333, "learning_rate": 4e-05, "loss": 5.3275, "loss/crossentropy": 2.447467267513275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21921542286872864, "step": 2556 }, { "epoch": 0.21316666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.09659830729166667, "learning_rate": 4e-05, "loss": 5.2095, "loss/crossentropy": 2.4506001472473145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22498662024736404, "step": 2558 }, { "epoch": 0.21333333333333335, "grad_norm": 5.34375, "grad_norm_var": 0.09029947916666667, "learning_rate": 4e-05, "loss": 5.4234, "loss/crossentropy": 2.3685405254364014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21306321397423744, "step": 2560 }, { "epoch": 0.2135, "grad_norm": 5.21875, "grad_norm_var": 0.10598958333333333, "learning_rate": 4e-05, "loss": 4.6514, "loss/crossentropy": 2.0201190412044525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936206892132759, "step": 2562 }, { "epoch": 0.21366666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.09833577473958334, "learning_rate": 4e-05, "loss": 5.7124, "loss/crossentropy": 2.7159610986709595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22015909105539322, "step": 2564 }, { "epoch": 0.21383333333333332, "grad_norm": 4.84375, "grad_norm_var": 0.10123697916666667, "learning_rate": 4e-05, "loss": 4.8923, "loss/crossentropy": 2.1644165217876434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22002530470490456, "step": 2566 }, { "epoch": 0.214, "grad_norm": 5.34375, "grad_norm_var": 0.0859375, "learning_rate": 4e-05, "loss": 4.6498, "loss/crossentropy": 1.9540190249681473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15238827653229237, "step": 2568 }, { "epoch": 0.21416666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.22779947916666668, "learning_rate": 4e-05, "loss": 5.609, "loss/crossentropy": 2.4001490473747253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24488025903701782, "step": 2570 }, { "epoch": 0.21433333333333332, "grad_norm": 5.03125, "grad_norm_var": 0.23202718098958333, "learning_rate": 4e-05, "loss": 5.2706, "loss/crossentropy": 2.3217179775238037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21956229209899902, "step": 2572 }, { "epoch": 0.2145, "grad_norm": 4.90625, "grad_norm_var": 0.23212483723958333, "learning_rate": 4e-05, "loss": 4.9006, "loss/crossentropy": 1.3002420365810394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17373565584421158, "step": 2574 }, { "epoch": 0.21466666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.24062093098958334, "learning_rate": 4e-05, "loss": 4.8351, "loss/crossentropy": 2.2719730138778687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139144465327263, "step": 2576 }, { "epoch": 0.21483333333333332, "grad_norm": 4.9375, "grad_norm_var": 0.21861572265625, "learning_rate": 4e-05, "loss": 4.6618, "loss/crossentropy": 1.901275411248207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19540333189070225, "step": 2578 }, { "epoch": 0.215, "grad_norm": 5.3125, "grad_norm_var": 0.23121337890625, "learning_rate": 4e-05, "loss": 5.1308, "loss/crossentropy": 2.380274325609207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21984224021434784, "step": 2580 }, { "epoch": 0.21516666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.22320556640625, "learning_rate": 4e-05, "loss": 4.8584, "loss/crossentropy": 1.8006494864821434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16921357065439224, "step": 2582 }, { "epoch": 0.21533333333333332, "grad_norm": 5.3125, "grad_norm_var": 0.22681884765625, "learning_rate": 4e-05, "loss": 4.9215, "loss/crossentropy": 2.525661528110504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218607347458601, "step": 2584 }, { "epoch": 0.2155, "grad_norm": 5.3125, "grad_norm_var": 0.06099853515625, "learning_rate": 4e-05, "loss": 4.1835, "loss/crossentropy": 2.3541648387908936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21644888073205948, "step": 2586 }, { "epoch": 0.21566666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.05845947265625, "learning_rate": 4e-05, "loss": 5.0687, "loss/crossentropy": 2.440452992916107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21727336198091507, "step": 2588 }, { "epoch": 0.21583333333333332, "grad_norm": 5.0, "grad_norm_var": 0.05611572265625, "learning_rate": 4e-05, "loss": 4.4566, "loss/crossentropy": 1.6151638180017471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18537553399801254, "step": 2590 }, { "epoch": 0.216, "grad_norm": 4.84375, "grad_norm_var": 0.17750244140625, "learning_rate": 4e-05, "loss": 4.9577, "loss/crossentropy": 2.416458487510681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24135924503207207, "step": 2592 }, { "epoch": 0.21616666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.18006184895833333, "learning_rate": 4e-05, "loss": 5.2044, "loss/crossentropy": 2.1265391409397125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17638667300343513, "step": 2594 }, { "epoch": 0.21633333333333332, "grad_norm": 5.03125, "grad_norm_var": 0.17928059895833334, "learning_rate": 4e-05, "loss": 4.6906, "loss/crossentropy": 2.0304845348000526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004259079694748, "step": 2596 }, { "epoch": 0.2165, "grad_norm": 5.09375, "grad_norm_var": 0.19010416666666666, "learning_rate": 4e-05, "loss": 4.858, "loss/crossentropy": 1.6524630934000015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17721411027014256, "step": 2598 }, { "epoch": 0.21666666666666667, "grad_norm": 5.5, "grad_norm_var": 0.19853108723958332, "learning_rate": 4e-05, "loss": 5.0746, "loss/crossentropy": 2.5842694640159607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22306865453720093, "step": 2600 }, { "epoch": 0.21683333333333332, "grad_norm": 5.125, "grad_norm_var": 0.17987874348958333, "learning_rate": 4e-05, "loss": 5.3043, "loss/crossentropy": 2.285725235939026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159329690039158, "step": 2602 }, { "epoch": 0.217, "grad_norm": 4.84375, "grad_norm_var": 0.7171834309895834, "learning_rate": 4e-05, "loss": 4.7946, "loss/crossentropy": 1.577958881855011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998040471225977, "step": 2604 }, { "epoch": 0.21716666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.6925130208333333, "learning_rate": 4e-05, "loss": 5.1966, "loss/crossentropy": 1.9956328868865967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1921469122171402, "step": 2606 }, { "epoch": 0.21733333333333332, "grad_norm": 5.0, "grad_norm_var": 0.6096638997395833, "learning_rate": 4e-05, "loss": 4.9423, "loss/crossentropy": 1.4313837885856628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15414538234472275, "step": 2608 }, { "epoch": 0.2175, "grad_norm": 4.90625, "grad_norm_var": 0.5982706705729167, "learning_rate": 4e-05, "loss": 4.4947, "loss/crossentropy": 1.1389791369438171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13487368822097778, "step": 2610 }, { "epoch": 0.21766666666666667, "grad_norm": 5.71875, "grad_norm_var": 0.5936848958333333, "learning_rate": 4e-05, "loss": 5.0737, "loss/crossentropy": 2.404743731021881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2210671752691269, "step": 2612 }, { "epoch": 0.21783333333333332, "grad_norm": 5.28125, "grad_norm_var": 0.5944661458333333, "learning_rate": 4e-05, "loss": 5.5376, "loss/crossentropy": 2.3369793593883514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20953010395169258, "step": 2614 }, { "epoch": 0.218, "grad_norm": 5.0, "grad_norm_var": 0.63160400390625, "learning_rate": 4e-05, "loss": 4.953, "loss/crossentropy": 1.8187780529260635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17888355813920498, "step": 2616 }, { "epoch": 0.21816666666666668, "grad_norm": 4.4375, "grad_norm_var": 0.67525634765625, "learning_rate": 4e-05, "loss": 4.6163, "loss/crossentropy": 2.0286522433161736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18695184588432312, "step": 2618 }, { "epoch": 0.21833333333333332, "grad_norm": 4.96875, "grad_norm_var": 0.13238525390625, "learning_rate": 4e-05, "loss": 4.3147, "loss/crossentropy": 1.8871822357177734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17654790729284286, "step": 2620 }, { "epoch": 0.2185, "grad_norm": 5.09375, "grad_norm_var": 0.13385416666666666, "learning_rate": 4e-05, "loss": 4.956, "loss/crossentropy": 2.6005072593688965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20175550132989883, "step": 2622 }, { "epoch": 0.21866666666666668, "grad_norm": 5.125, "grad_norm_var": 0.138134765625, "learning_rate": 4e-05, "loss": 5.2147, "loss/crossentropy": 2.5134531259536743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22386842593550682, "step": 2624 }, { "epoch": 0.21883333333333332, "grad_norm": 5.3125, "grad_norm_var": 0.14290364583333334, "learning_rate": 4e-05, "loss": 5.3094, "loss/crossentropy": 1.8400039002299309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17169796116650105, "step": 2626 }, { "epoch": 0.219, "grad_norm": 5.0625, "grad_norm_var": 0.06952718098958334, "learning_rate": 4e-05, "loss": 5.0435, "loss/crossentropy": 1.4271916523575783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1459579486399889, "step": 2628 }, { "epoch": 0.21916666666666668, "grad_norm": 4.84375, "grad_norm_var": 0.05868733723958333, "learning_rate": 4e-05, "loss": 4.6969, "loss/crossentropy": 2.2238671481609344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19480058550834656, "step": 2630 }, { "epoch": 0.21933333333333332, "grad_norm": 5.59375, "grad_norm_var": 0.08567708333333333, "learning_rate": 4e-05, "loss": 5.0919, "loss/crossentropy": 2.6155874729156494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124779336154461, "step": 2632 }, { "epoch": 0.2195, "grad_norm": 5.15625, "grad_norm_var": 0.08435872395833334, "learning_rate": 4e-05, "loss": 4.6738, "loss/crossentropy": 2.1278350353240967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25172296166419983, "step": 2634 }, { "epoch": 0.21966666666666668, "grad_norm": 5.1875, "grad_norm_var": 0.0876953125, "learning_rate": 4e-05, "loss": 4.7663, "loss/crossentropy": 2.2914819419384003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232300266623497, "step": 2636 }, { "epoch": 0.21983333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.092822265625, "learning_rate": 4e-05, "loss": 5.2515, "loss/crossentropy": 1.9743507206439972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19434590265154839, "step": 2638 }, { "epoch": 0.22, "grad_norm": 5.3125, "grad_norm_var": 0.090478515625, "learning_rate": 4e-05, "loss": 5.0155, "loss/crossentropy": 2.3763028979301453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22711258754134178, "step": 2640 }, { "epoch": 0.22016666666666668, "grad_norm": 5.15625, "grad_norm_var": 0.09208577473958333, "learning_rate": 4e-05, "loss": 5.0988, "loss/crossentropy": 1.600294180214405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16233623772859573, "step": 2642 }, { "epoch": 0.22033333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.09388020833333334, "learning_rate": 4e-05, "loss": 4.9415, "loss/crossentropy": 2.3785403072834015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217193104326725, "step": 2644 }, { "epoch": 0.2205, "grad_norm": 4.78125, "grad_norm_var": 0.09466145833333334, "learning_rate": 4e-05, "loss": 4.4769, "loss/crossentropy": 1.0602325424551964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12778138555586338, "step": 2646 }, { "epoch": 0.22066666666666668, "grad_norm": 5.46875, "grad_norm_var": 0.08785400390625, "learning_rate": 4e-05, "loss": 4.9776, "loss/crossentropy": 2.049244850873947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20482390746474266, "step": 2648 }, { "epoch": 0.22083333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.08487955729166667, "learning_rate": 4e-05, "loss": 4.9487, "loss/crossentropy": 2.3347797989845276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21474630758166313, "step": 2650 }, { "epoch": 0.221, "grad_norm": 5.28125, "grad_norm_var": 0.06261393229166666, "learning_rate": 4e-05, "loss": 4.755, "loss/crossentropy": 1.834708720445633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18110329657793045, "step": 2652 }, { "epoch": 0.22116666666666668, "grad_norm": 4.96875, "grad_norm_var": 0.06884358723958334, "learning_rate": 4e-05, "loss": 5.009, "loss/crossentropy": 2.164376437664032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20554454252123833, "step": 2654 }, { "epoch": 0.22133333333333333, "grad_norm": 4.625, "grad_norm_var": 0.08476155598958333, "learning_rate": 4e-05, "loss": 4.673, "loss/crossentropy": 2.358445018529892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216878242790699, "step": 2656 }, { "epoch": 0.2215, "grad_norm": 5.03125, "grad_norm_var": 0.08046875, "learning_rate": 4e-05, "loss": 5.1396, "loss/crossentropy": 1.6497721672058105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18405374325811863, "step": 2658 }, { "epoch": 0.22166666666666668, "grad_norm": 5.15625, "grad_norm_var": 0.0720703125, "learning_rate": 4e-05, "loss": 5.5726, "loss/crossentropy": 2.38449564576149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24445926398038864, "step": 2660 }, { "epoch": 0.22183333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.062483723958333334, "learning_rate": 4e-05, "loss": 5.2137, "loss/crossentropy": 1.9726266413927078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19294766709208488, "step": 2662 }, { "epoch": 0.222, "grad_norm": 4.71875, "grad_norm_var": 0.0697265625, "learning_rate": 4e-05, "loss": 4.8543, "loss/crossentropy": 2.184689074754715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20648836717009544, "step": 2664 }, { "epoch": 0.22216666666666668, "grad_norm": 5.28125, "grad_norm_var": 0.07967122395833333, "learning_rate": 4e-05, "loss": 4.6217, "loss/crossentropy": 2.200216382741928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21623068675398827, "step": 2666 }, { "epoch": 0.22233333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.08019205729166666, "learning_rate": 4e-05, "loss": 5.5548, "loss/crossentropy": 2.570397049188614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21973755210638046, "step": 2668 }, { "epoch": 0.2225, "grad_norm": 4.625, "grad_norm_var": 0.07486572265625, "learning_rate": 4e-05, "loss": 4.5188, "loss/crossentropy": 1.840833805501461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19348925352096558, "step": 2670 }, { "epoch": 0.22266666666666668, "grad_norm": 4.90625, "grad_norm_var": 0.064306640625, "learning_rate": 4e-05, "loss": 4.9382, "loss/crossentropy": 2.048027887940407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18247253075242043, "step": 2672 }, { "epoch": 0.22283333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.07825520833333334, "learning_rate": 4e-05, "loss": 5.2389, "loss/crossentropy": 1.7717494443058968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19412222132086754, "step": 2674 }, { "epoch": 0.223, "grad_norm": 5.0625, "grad_norm_var": 0.06483968098958333, "learning_rate": 4e-05, "loss": 5.3882, "loss/crossentropy": 2.4813308119773865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152557373046875, "step": 2676 }, { "epoch": 0.22316666666666668, "grad_norm": 4.875, "grad_norm_var": 0.05845947265625, "learning_rate": 4e-05, "loss": 4.686, "loss/crossentropy": 2.3142955899238586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20804548263549805, "step": 2678 }, { "epoch": 0.22333333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.05245768229166667, "learning_rate": 4e-05, "loss": 4.7857, "loss/crossentropy": 2.190678149461746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21705422922968864, "step": 2680 }, { "epoch": 0.2235, "grad_norm": 5.21875, "grad_norm_var": 0.042643229166666664, "learning_rate": 4e-05, "loss": 5.3474, "loss/crossentropy": 1.5699248164892197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278335839509964, "step": 2682 }, { "epoch": 0.22366666666666668, "grad_norm": 4.84375, "grad_norm_var": 0.038309733072916664, "learning_rate": 4e-05, "loss": 5.0322, "loss/crossentropy": 1.916561797261238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19276293367147446, "step": 2684 }, { "epoch": 0.22383333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.030301920572916665, "learning_rate": 4e-05, "loss": 4.9872, "loss/crossentropy": 2.2632661163806915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21685384586453438, "step": 2686 }, { "epoch": 0.224, "grad_norm": 4.6875, "grad_norm_var": 0.03487955729166667, "learning_rate": 4e-05, "loss": 4.5296, "loss/crossentropy": 1.44465272128582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1623321734368801, "step": 2688 }, { "epoch": 0.22416666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.02545166015625, "learning_rate": 4e-05, "loss": 4.4794, "loss/crossentropy": 2.4449245929718018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22241264954209328, "step": 2690 }, { "epoch": 0.22433333333333333, "grad_norm": 5.5, "grad_norm_var": 0.04412434895833333, "learning_rate": 4e-05, "loss": 5.2167, "loss/crossentropy": 1.7128597050905228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19268352910876274, "step": 2692 }, { "epoch": 0.2245, "grad_norm": 5.875, "grad_norm_var": 0.10227864583333333, "learning_rate": 4e-05, "loss": 5.5377, "loss/crossentropy": 2.5913642048835754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.265250276774168, "step": 2694 }, { "epoch": 0.22466666666666665, "grad_norm": 4.84375, "grad_norm_var": 0.10377197265625, "learning_rate": 4e-05, "loss": 4.514, "loss/crossentropy": 2.124376595020294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20373161137104034, "step": 2696 }, { "epoch": 0.22483333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.101025390625, "learning_rate": 4e-05, "loss": 5.056, "loss/crossentropy": 1.651014804840088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17066487483680248, "step": 2698 }, { "epoch": 0.225, "grad_norm": 4.5625, "grad_norm_var": 0.11314697265625, "learning_rate": 4e-05, "loss": 4.2127, "loss/crossentropy": 2.065021328628063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1880048643797636, "step": 2700 }, { "epoch": 0.22516666666666665, "grad_norm": 5.125, "grad_norm_var": 0.11829020182291666, "learning_rate": 4e-05, "loss": 5.2311, "loss/crossentropy": 2.4089654088020325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25183702632784843, "step": 2702 }, { "epoch": 0.22533333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.10618082682291667, "learning_rate": 4e-05, "loss": 4.624, "loss/crossentropy": 1.7160001248121262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726439669728279, "step": 2704 }, { "epoch": 0.2255, "grad_norm": 4.65625, "grad_norm_var": 0.10435791015625, "learning_rate": 4e-05, "loss": 4.7344, "loss/crossentropy": 1.2806991934776306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15048817172646523, "step": 2706 }, { "epoch": 0.22566666666666665, "grad_norm": 10.375, "grad_norm_var": 1.8723307291666667, "learning_rate": 4e-05, "loss": 5.1457, "loss/crossentropy": 1.30407252907753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14570009522140026, "step": 2708 }, { "epoch": 0.22583333333333333, "grad_norm": 5.09375, "grad_norm_var": 1.8645182291666667, "learning_rate": 4e-05, "loss": 5.4753, "loss/crossentropy": 2.5484912395477295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21789904311299324, "step": 2710 }, { "epoch": 0.226, "grad_norm": 4.90625, "grad_norm_var": 1.8605305989583334, "learning_rate": 4e-05, "loss": 5.1614, "loss/crossentropy": 1.9052416235208511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19455315172672272, "step": 2712 }, { "epoch": 0.22616666666666665, "grad_norm": 5.28125, "grad_norm_var": 1.8538899739583334, "learning_rate": 4e-05, "loss": 4.8978, "loss/crossentropy": 2.0532439947128296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112819291651249, "step": 2714 }, { "epoch": 0.22633333333333333, "grad_norm": 4.71875, "grad_norm_var": 1.8289021809895833, "learning_rate": 4e-05, "loss": 4.7618, "loss/crossentropy": 2.1942814588546753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2094101719558239, "step": 2716 }, { "epoch": 0.2265, "grad_norm": 4.5625, "grad_norm_var": 1.8766560872395834, "learning_rate": 4e-05, "loss": 4.2254, "loss/crossentropy": 2.525265157222748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2201676107943058, "step": 2718 }, { "epoch": 0.22666666666666666, "grad_norm": 5.03125, "grad_norm_var": 1.8827962239583333, "learning_rate": 4e-05, "loss": 4.8646, "loss/crossentropy": 0.7136424034833908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1026750449091196, "step": 2720 }, { "epoch": 0.22683333333333333, "grad_norm": 4.875, "grad_norm_var": 1.8532389322916667, "learning_rate": 4e-05, "loss": 5.2678, "loss/crossentropy": 2.462954103946686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22561665251851082, "step": 2722 }, { "epoch": 0.227, "grad_norm": 5.15625, "grad_norm_var": 0.05969645182291667, "learning_rate": 4e-05, "loss": 5.0547, "loss/crossentropy": 2.5214640498161316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22827640548348427, "step": 2724 }, { "epoch": 0.22716666666666666, "grad_norm": 4.84375, "grad_norm_var": 9.401981608072917, "learning_rate": 4e-05, "loss": 4.972, "loss/crossentropy": 2.2954089641571045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23702961578965187, "step": 2726 }, { "epoch": 0.22733333333333333, "grad_norm": 5.21875, "grad_norm_var": 9.377437337239583, "learning_rate": 4e-05, "loss": 5.5277, "loss/crossentropy": 1.4282821118831635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1785445362329483, "step": 2728 }, { "epoch": 0.2275, "grad_norm": 5.3125, "grad_norm_var": 9.397847493489584, "learning_rate": 4e-05, "loss": 5.0704, "loss/crossentropy": 1.5387426540255547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17435546405613422, "step": 2730 }, { "epoch": 0.22766666666666666, "grad_norm": 4.78125, "grad_norm_var": 9.3921875, "learning_rate": 4e-05, "loss": 4.5527, "loss/crossentropy": 1.2322108745574951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1475350596010685, "step": 2732 }, { "epoch": 0.22783333333333333, "grad_norm": 4.90625, "grad_norm_var": 9.334228515625, "learning_rate": 4e-05, "loss": 4.6543, "loss/crossentropy": 1.1598545908927917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1438927836716175, "step": 2734 }, { "epoch": 0.228, "grad_norm": 5.40625, "grad_norm_var": 9.274283854166667, "learning_rate": 4e-05, "loss": 4.9029, "loss/crossentropy": 1.6407746598124504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1901111900806427, "step": 2736 }, { "epoch": 0.22816666666666666, "grad_norm": 5.1875, "grad_norm_var": 9.262223307291666, "learning_rate": 4e-05, "loss": 5.0487, "loss/crossentropy": 1.8306042179465294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21500974148511887, "step": 2738 }, { "epoch": 0.22833333333333333, "grad_norm": 5.4375, "grad_norm_var": 9.212333170572917, "learning_rate": 4e-05, "loss": 5.2418, "loss/crossentropy": 2.3094605207443237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21840552985668182, "step": 2740 }, { "epoch": 0.2285, "grad_norm": 5.3125, "grad_norm_var": 0.0423828125, "learning_rate": 4e-05, "loss": 5.6929, "loss/crossentropy": 1.8620459735393524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19147185236215591, "step": 2742 }, { "epoch": 0.22866666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.057938639322916666, "learning_rate": 4e-05, "loss": 4.5089, "loss/crossentropy": 1.920153945684433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18584110215306282, "step": 2744 }, { "epoch": 0.22883333333333333, "grad_norm": 5.125, "grad_norm_var": 0.0654296875, "learning_rate": 4e-05, "loss": 4.6279, "loss/crossentropy": 1.5992632433772087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17130916193127632, "step": 2746 }, { "epoch": 0.229, "grad_norm": 5.0, "grad_norm_var": 0.07082926432291667, "learning_rate": 4e-05, "loss": 4.4131, "loss/crossentropy": 1.882856197655201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18213853426277637, "step": 2748 }, { "epoch": 0.22916666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.07899983723958333, "learning_rate": 4e-05, "loss": 4.8627, "loss/crossentropy": 2.1046335101127625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19893395900726318, "step": 2750 }, { "epoch": 0.22933333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.07330322265625, "learning_rate": 4e-05, "loss": 4.9209, "loss/crossentropy": 2.348418891429901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22180702164769173, "step": 2752 }, { "epoch": 0.2295, "grad_norm": 5.09375, "grad_norm_var": 0.073828125, "learning_rate": 4e-05, "loss": 5.0366, "loss/crossentropy": 1.4402420744299889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1656176634132862, "step": 2754 }, { "epoch": 0.22966666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.06373697916666667, "learning_rate": 4e-05, "loss": 4.9903, "loss/crossentropy": 1.663852408528328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19472996331751347, "step": 2756 }, { "epoch": 0.22983333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.05071614583333333, "learning_rate": 4e-05, "loss": 4.8286, "loss/crossentropy": 2.364410251379013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22333412244915962, "step": 2758 }, { "epoch": 0.23, "grad_norm": 4.71875, "grad_norm_var": 0.059305826822916664, "learning_rate": 4e-05, "loss": 4.9322, "loss/crossentropy": 2.570330262184143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22481412440538406, "step": 2760 }, { "epoch": 0.23016666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.05406494140625, "learning_rate": 4e-05, "loss": 5.1796, "loss/crossentropy": 2.4776630997657776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199436090886593, "step": 2762 }, { "epoch": 0.23033333333333333, "grad_norm": 5.125, "grad_norm_var": 0.05169270833333333, "learning_rate": 4e-05, "loss": 5.3964, "loss/crossentropy": 2.457890510559082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21888697147369385, "step": 2764 }, { "epoch": 0.2305, "grad_norm": 4.71875, "grad_norm_var": 0.059468587239583336, "learning_rate": 4e-05, "loss": 4.9103, "loss/crossentropy": 2.312767207622528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19433093816041946, "step": 2766 }, { "epoch": 0.23066666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.06252848307291667, "learning_rate": 4e-05, "loss": 4.5758, "loss/crossentropy": 1.472007542848587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1721639148890972, "step": 2768 }, { "epoch": 0.23083333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.06825764973958333, "learning_rate": 4e-05, "loss": 5.0607, "loss/crossentropy": 2.117499329149723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20281216129660606, "step": 2770 }, { "epoch": 0.231, "grad_norm": 5.0625, "grad_norm_var": 0.049637858072916666, "learning_rate": 4e-05, "loss": 4.5248, "loss/crossentropy": 2.032151460647583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22201980277895927, "step": 2772 }, { "epoch": 0.23116666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.04595947265625, "learning_rate": 4e-05, "loss": 4.8957, "loss/crossentropy": 2.0667436867952347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20032967068254948, "step": 2774 }, { "epoch": 0.23133333333333334, "grad_norm": 4.75, "grad_norm_var": 0.04348551432291667, "learning_rate": 4e-05, "loss": 4.8943, "loss/crossentropy": 1.8684946075081825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16954820603132248, "step": 2776 }, { "epoch": 0.2315, "grad_norm": 5.15625, "grad_norm_var": 0.28596598307291665, "learning_rate": 4e-05, "loss": 5.2042, "loss/crossentropy": 1.9076000452041626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20960036292672157, "step": 2778 }, { "epoch": 0.23166666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.29817301432291665, "learning_rate": 4e-05, "loss": 4.6055, "loss/crossentropy": 2.335671216249466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20992399007081985, "step": 2780 }, { "epoch": 0.23183333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.2762858072916667, "learning_rate": 4e-05, "loss": 5.1989, "loss/crossentropy": 2.675258159637451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23351893201470375, "step": 2782 }, { "epoch": 0.232, "grad_norm": 4.78125, "grad_norm_var": 0.2702433268229167, "learning_rate": 4e-05, "loss": 5.0575, "loss/crossentropy": 1.3609646335244179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15972201712429523, "step": 2784 }, { "epoch": 0.23216666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.26500244140625, "learning_rate": 4e-05, "loss": 5.123, "loss/crossentropy": 2.211142838001251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24086641520261765, "step": 2786 }, { "epoch": 0.23233333333333334, "grad_norm": 4.625, "grad_norm_var": 0.27975260416666664, "learning_rate": 4e-05, "loss": 5.0675, "loss/crossentropy": 2.1681629419326782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20870641618967056, "step": 2788 }, { "epoch": 0.2325, "grad_norm": 4.90625, "grad_norm_var": 0.28085530598958336, "learning_rate": 4e-05, "loss": 5.1701, "loss/crossentropy": 2.3621520698070526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21960894763469696, "step": 2790 }, { "epoch": 0.23266666666666666, "grad_norm": 4.625, "grad_norm_var": 0.3078084309895833, "learning_rate": 4e-05, "loss": 4.1998, "loss/crossentropy": 1.8437300026416779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18039709888398647, "step": 2792 }, { "epoch": 0.23283333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.03883056640625, "learning_rate": 4e-05, "loss": 4.6421, "loss/crossentropy": 1.9587142765522003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116855699568987, "step": 2794 }, { "epoch": 0.233, "grad_norm": 5.0625, "grad_norm_var": 0.17548421223958333, "learning_rate": 4e-05, "loss": 5.0527, "loss/crossentropy": 2.1764910221099854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081896774470806, "step": 2796 }, { "epoch": 0.23316666666666666, "grad_norm": 4.875, "grad_norm_var": 0.190234375, "learning_rate": 4e-05, "loss": 4.5767, "loss/crossentropy": 1.5604673027992249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18230070546269417, "step": 2798 }, { "epoch": 0.23333333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.19625244140625, "learning_rate": 4e-05, "loss": 5.6337, "loss/crossentropy": 2.379370391368866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22885718196630478, "step": 2800 }, { "epoch": 0.2335, "grad_norm": 5.84375, "grad_norm_var": 0.24107666015625, "learning_rate": 4e-05, "loss": 4.9887, "loss/crossentropy": 1.4264894649386406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15079515427350998, "step": 2802 }, { "epoch": 0.23366666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.24075113932291667, "learning_rate": 4e-05, "loss": 5.1173, "loss/crossentropy": 2.43440181016922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21988075599074364, "step": 2804 }, { "epoch": 0.23383333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.24049072265625, "learning_rate": 4e-05, "loss": 5.1611, "loss/crossentropy": 1.7003138586878777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18909470550715923, "step": 2806 }, { "epoch": 0.234, "grad_norm": 5.46875, "grad_norm_var": 0.19302978515625, "learning_rate": 4e-05, "loss": 4.9761, "loss/crossentropy": 1.9690175727009773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17825855500996113, "step": 2808 }, { "epoch": 0.23416666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.20549723307291667, "learning_rate": 4e-05, "loss": 5.146, "loss/crossentropy": 2.0230807662010193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24265416339039803, "step": 2810 }, { "epoch": 0.23433333333333334, "grad_norm": 5.65625, "grad_norm_var": 0.13414306640625, "learning_rate": 4e-05, "loss": 4.7241, "loss/crossentropy": 1.9726131781935692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17903245240449905, "step": 2812 }, { "epoch": 0.2345, "grad_norm": 5.21875, "grad_norm_var": 0.13177083333333334, "learning_rate": 4e-05, "loss": 4.8185, "loss/crossentropy": 1.9392977207899094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18167594075202942, "step": 2814 }, { "epoch": 0.23466666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.13352864583333332, "learning_rate": 4e-05, "loss": 5.2522, "loss/crossentropy": 2.45490038394928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22192463278770447, "step": 2816 }, { "epoch": 0.23483333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.11972249348958333, "learning_rate": 4e-05, "loss": 4.731, "loss/crossentropy": 2.515716075897217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21755141019821167, "step": 2818 }, { "epoch": 0.235, "grad_norm": 5.28125, "grad_norm_var": 0.10526936848958333, "learning_rate": 4e-05, "loss": 5.2301, "loss/crossentropy": 2.4293786883354187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21814614161849022, "step": 2820 }, { "epoch": 0.23516666666666666, "grad_norm": 4.75, "grad_norm_var": 0.11220296223958333, "learning_rate": 4e-05, "loss": 4.5221, "loss/crossentropy": 1.4309967905282974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15550539828836918, "step": 2822 }, { "epoch": 0.23533333333333334, "grad_norm": 5.40625, "grad_norm_var": 0.13733317057291666, "learning_rate": 4e-05, "loss": 4.8982, "loss/crossentropy": 2.560435712337494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20439713820815086, "step": 2824 }, { "epoch": 0.2355, "grad_norm": 5.0625, "grad_norm_var": 0.114306640625, "learning_rate": 4e-05, "loss": 5.4189, "loss/crossentropy": 2.490498185157776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24041643366217613, "step": 2826 }, { "epoch": 0.23566666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.10898030598958333, "learning_rate": 4e-05, "loss": 5.0196, "loss/crossentropy": 2.0801108181476593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159007117152214, "step": 2828 }, { "epoch": 0.23583333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.10545247395833333, "learning_rate": 4e-05, "loss": 5.0843, "loss/crossentropy": 2.540014386177063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21403054893016815, "step": 2830 }, { "epoch": 0.236, "grad_norm": 5.28125, "grad_norm_var": 0.105712890625, "learning_rate": 4e-05, "loss": 5.3198, "loss/crossentropy": 2.0823977291584015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19510109908878803, "step": 2832 }, { "epoch": 0.23616666666666666, "grad_norm": 5.46875, "grad_norm_var": 0.07984619140625, "learning_rate": 4e-05, "loss": 5.3449, "loss/crossentropy": 2.335395246744156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23566577583551407, "step": 2834 }, { "epoch": 0.23633333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.08772379557291667, "learning_rate": 4e-05, "loss": 4.9142, "loss/crossentropy": 2.067429706454277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775259472429752, "step": 2836 }, { "epoch": 0.2365, "grad_norm": 5.375, "grad_norm_var": 0.09342041015625, "learning_rate": 4e-05, "loss": 5.0618, "loss/crossentropy": 1.6095528677105904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1610132586210966, "step": 2838 }, { "epoch": 0.23666666666666666, "grad_norm": 5.375, "grad_norm_var": 0.06728108723958333, "learning_rate": 4e-05, "loss": 4.9791, "loss/crossentropy": 1.1937666982412338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16449680365622044, "step": 2840 }, { "epoch": 0.23683333333333334, "grad_norm": 4.5, "grad_norm_var": 0.09244791666666667, "learning_rate": 4e-05, "loss": 4.6665, "loss/crossentropy": 2.3993532061576843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20165612176060677, "step": 2842 }, { "epoch": 0.237, "grad_norm": 4.65625, "grad_norm_var": 0.09495035807291667, "learning_rate": 4e-05, "loss": 5.0856, "loss/crossentropy": 1.9005714282393456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18283692002296448, "step": 2844 }, { "epoch": 0.23716666666666666, "grad_norm": 5.125, "grad_norm_var": 0.10019124348958333, "learning_rate": 4e-05, "loss": 4.728, "loss/crossentropy": 2.730278193950653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370842583477497, "step": 2846 }, { "epoch": 0.23733333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.0990234375, "learning_rate": 4e-05, "loss": 5.1465, "loss/crossentropy": 2.6805002689361572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24382244795560837, "step": 2848 }, { "epoch": 0.2375, "grad_norm": 5.3125, "grad_norm_var": 0.09869384765625, "learning_rate": 4e-05, "loss": 4.9877, "loss/crossentropy": 2.367294877767563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22220474854111671, "step": 2850 }, { "epoch": 0.23766666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.097509765625, "learning_rate": 4e-05, "loss": 5.356, "loss/crossentropy": 2.0009628012776375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20864171534776688, "step": 2852 }, { "epoch": 0.23783333333333334, "grad_norm": 5.375, "grad_norm_var": 0.09065348307291667, "learning_rate": 4e-05, "loss": 4.5579, "loss/crossentropy": 1.845793679356575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19666944071650505, "step": 2854 }, { "epoch": 0.238, "grad_norm": 4.6875, "grad_norm_var": 0.09095052083333334, "learning_rate": 4e-05, "loss": 5.1909, "loss/crossentropy": 2.2471812665462494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2083476521074772, "step": 2856 }, { "epoch": 0.23816666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.07459309895833334, "learning_rate": 4e-05, "loss": 5.174, "loss/crossentropy": 2.458581805229187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21159670129418373, "step": 2858 }, { "epoch": 0.23833333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.07265625, "learning_rate": 4e-05, "loss": 4.5802, "loss/crossentropy": 1.3199757784605026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1531830132007599, "step": 2860 }, { "epoch": 0.2385, "grad_norm": 4.78125, "grad_norm_var": 0.07107747395833333, "learning_rate": 4e-05, "loss": 4.7332, "loss/crossentropy": 1.2995287701487541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14581317454576492, "step": 2862 }, { "epoch": 0.23866666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.06717122395833333, "learning_rate": 4e-05, "loss": 4.8991, "loss/crossentropy": 1.44887076318264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18584540858864784, "step": 2864 }, { "epoch": 0.23883333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.05944010416666667, "learning_rate": 4e-05, "loss": 5.1643, "loss/crossentropy": 2.570837378501892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20513088256120682, "step": 2866 }, { "epoch": 0.239, "grad_norm": 4.875, "grad_norm_var": 0.056929524739583334, "learning_rate": 4e-05, "loss": 4.6723, "loss/crossentropy": 2.399744689464569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21387247368693352, "step": 2868 }, { "epoch": 0.23916666666666667, "grad_norm": 6.375, "grad_norm_var": 0.16711832682291666, "learning_rate": 4e-05, "loss": 5.237, "loss/crossentropy": 1.9510470181703568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18807476945221424, "step": 2870 }, { "epoch": 0.23933333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.16842041015625, "learning_rate": 4e-05, "loss": 4.6487, "loss/crossentropy": 1.1220924705266953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1417934186756611, "step": 2872 }, { "epoch": 0.2395, "grad_norm": 5.125, "grad_norm_var": 0.16691080729166666, "learning_rate": 4e-05, "loss": 5.2557, "loss/crossentropy": 2.146589756011963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22325074672698975, "step": 2874 }, { "epoch": 0.23966666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.15556233723958332, "learning_rate": 4e-05, "loss": 4.945, "loss/crossentropy": 1.7336683943867683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16801801696419716, "step": 2876 }, { "epoch": 0.23983333333333334, "grad_norm": 4.625, "grad_norm_var": 0.16125895182291666, "learning_rate": 4e-05, "loss": 4.9316, "loss/crossentropy": 1.9470653384923935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18395037576556206, "step": 2878 }, { "epoch": 0.24, "grad_norm": 4.625, "grad_norm_var": 0.16829427083333334, "learning_rate": 4e-05, "loss": 4.5489, "loss/crossentropy": 1.708813153207302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18478217348456383, "step": 2880 }, { "epoch": 0.24016666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.16689046223958334, "learning_rate": 4e-05, "loss": 4.3889, "loss/crossentropy": 1.398942418396473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16694530472159386, "step": 2882 }, { "epoch": 0.24033333333333334, "grad_norm": 5.375, "grad_norm_var": 0.17603759765625, "learning_rate": 4e-05, "loss": 4.7529, "loss/crossentropy": 2.43044650554657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21148262172937393, "step": 2884 }, { "epoch": 0.2405, "grad_norm": 4.875, "grad_norm_var": 0.047900390625, "learning_rate": 4e-05, "loss": 4.5462, "loss/crossentropy": 2.5115047097206116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20838917791843414, "step": 2886 }, { "epoch": 0.24066666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.051953125, "learning_rate": 4e-05, "loss": 5.2548, "loss/crossentropy": 2.7271772623062134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21330803260207176, "step": 2888 }, { "epoch": 0.24083333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.05816650390625, "learning_rate": 4e-05, "loss": 5.792, "loss/crossentropy": 2.412293791770935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21458745002746582, "step": 2890 }, { "epoch": 0.241, "grad_norm": 4.71875, "grad_norm_var": 0.059619140625, "learning_rate": 4e-05, "loss": 4.9378, "loss/crossentropy": 1.7731594443321228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1800384782254696, "step": 2892 }, { "epoch": 0.24116666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.0548828125, "learning_rate": 4e-05, "loss": 4.4171, "loss/crossentropy": 2.4972161054611206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21959226578474045, "step": 2894 }, { "epoch": 0.24133333333333334, "grad_norm": 5.65625, "grad_norm_var": 0.073046875, "learning_rate": 4e-05, "loss": 5.5958, "loss/crossentropy": 2.3957661390304565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111573964357376, "step": 2896 }, { "epoch": 0.2415, "grad_norm": 5.125, "grad_norm_var": 0.08391927083333334, "learning_rate": 4e-05, "loss": 5.1817, "loss/crossentropy": 1.2799173444509506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18157821521162987, "step": 2898 }, { "epoch": 0.24166666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.07545166015625, "learning_rate": 4e-05, "loss": 5.0338, "loss/crossentropy": 2.3033843338489532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21752947941422462, "step": 2900 }, { "epoch": 0.24183333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.06261393229166666, "learning_rate": 4e-05, "loss": 5.3801, "loss/crossentropy": 1.950482338666916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18952583149075508, "step": 2902 }, { "epoch": 0.242, "grad_norm": 4.96875, "grad_norm_var": 0.06597900390625, "learning_rate": 4e-05, "loss": 4.7877, "loss/crossentropy": 1.822025142610073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20692705176770687, "step": 2904 }, { "epoch": 0.24216666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.060791015625, "learning_rate": 4e-05, "loss": 4.5748, "loss/crossentropy": 2.2172627449035645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20654083043336868, "step": 2906 }, { "epoch": 0.24233333333333335, "grad_norm": 4.5, "grad_norm_var": 0.07350260416666667, "learning_rate": 4e-05, "loss": 4.5271, "loss/crossentropy": 1.8431589156389236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18556870892643929, "step": 2908 }, { "epoch": 0.2425, "grad_norm": 5.6875, "grad_norm_var": 0.10172119140625, "learning_rate": 4e-05, "loss": 4.9521, "loss/crossentropy": 1.194792091846466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20142144337296486, "step": 2910 }, { "epoch": 0.24266666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.11009114583333333, "learning_rate": 4e-05, "loss": 4.9352, "loss/crossentropy": 1.2394988313317299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14485261403024197, "step": 2912 }, { "epoch": 0.24283333333333335, "grad_norm": 5.1875, "grad_norm_var": 0.10601806640625, "learning_rate": 4e-05, "loss": 4.505, "loss/crossentropy": 2.194884717464447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23719510436058044, "step": 2914 }, { "epoch": 0.243, "grad_norm": 5.21875, "grad_norm_var": 0.10779622395833334, "learning_rate": 4e-05, "loss": 4.5226, "loss/crossentropy": 2.356331080198288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.221795491874218, "step": 2916 }, { "epoch": 0.24316666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.108837890625, "learning_rate": 4e-05, "loss": 4.9338, "loss/crossentropy": 2.3776062428951263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22553601115942, "step": 2918 }, { "epoch": 0.24333333333333335, "grad_norm": 5.03125, "grad_norm_var": 0.10677083333333333, "learning_rate": 4e-05, "loss": 4.83, "loss/crossentropy": 1.745754636824131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18756554648280144, "step": 2920 }, { "epoch": 0.2435, "grad_norm": 5.15625, "grad_norm_var": 0.11197916666666667, "learning_rate": 4e-05, "loss": 4.8526, "loss/crossentropy": 2.37031289935112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172992266714573, "step": 2922 }, { "epoch": 0.24366666666666667, "grad_norm": 5.125, "grad_norm_var": 0.09511311848958333, "learning_rate": 4e-05, "loss": 4.6856, "loss/crossentropy": 2.4921224117279053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21399492397904396, "step": 2924 }, { "epoch": 0.24383333333333335, "grad_norm": 5.5, "grad_norm_var": 0.07069905598958333, "learning_rate": 4e-05, "loss": 5.342, "loss/crossentropy": 2.2873608469963074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2288646660745144, "step": 2926 }, { "epoch": 0.244, "grad_norm": 4.53125, "grad_norm_var": 0.05777587890625, "learning_rate": 4e-05, "loss": 4.4681, "loss/crossentropy": 0.7939189150929451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1262371763586998, "step": 2928 }, { "epoch": 0.24416666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.04724934895833333, "learning_rate": 4e-05, "loss": 4.5896, "loss/crossentropy": 2.084025114774704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19710740074515343, "step": 2930 }, { "epoch": 0.24433333333333335, "grad_norm": 4.78125, "grad_norm_var": 0.04582926432291667, "learning_rate": 4e-05, "loss": 5.1147, "loss/crossentropy": 1.9766810834407806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1635744720697403, "step": 2932 }, { "epoch": 0.2445, "grad_norm": 4.78125, "grad_norm_var": 0.054911295572916664, "learning_rate": 4e-05, "loss": 4.5908, "loss/crossentropy": 2.331399142742157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101854346692562, "step": 2934 }, { "epoch": 0.24466666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05533447265625, "learning_rate": 4e-05, "loss": 5.0402, "loss/crossentropy": 2.1609912514686584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20745176821947098, "step": 2936 }, { "epoch": 0.24483333333333332, "grad_norm": 5.1875, "grad_norm_var": 0.05715738932291667, "learning_rate": 4e-05, "loss": 4.7067, "loss/crossentropy": 1.9947044774889946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18351775035262108, "step": 2938 }, { "epoch": 0.245, "grad_norm": 4.9375, "grad_norm_var": 0.053971354166666666, "learning_rate": 4e-05, "loss": 4.5758, "loss/crossentropy": 1.7316635847091675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17024549469351768, "step": 2940 }, { "epoch": 0.24516666666666667, "grad_norm": 5.125, "grad_norm_var": 0.14542643229166666, "learning_rate": 4e-05, "loss": 5.457, "loss/crossentropy": 2.462052643299103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24193332344293594, "step": 2942 }, { "epoch": 0.24533333333333332, "grad_norm": 4.875, "grad_norm_var": 0.13683268229166667, "learning_rate": 4e-05, "loss": 4.7638, "loss/crossentropy": 2.3683615624904633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22581441700458527, "step": 2944 }, { "epoch": 0.2455, "grad_norm": 4.59375, "grad_norm_var": 0.14928385416666667, "learning_rate": 4e-05, "loss": 4.9376, "loss/crossentropy": 2.416099488735199, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069566398859024, "step": 2946 }, { "epoch": 0.24566666666666667, "grad_norm": 4.875, "grad_norm_var": 0.14763997395833334, "learning_rate": 4e-05, "loss": 4.9896, "loss/crossentropy": 2.327180027961731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20854448154568672, "step": 2948 }, { "epoch": 0.24583333333333332, "grad_norm": 4.90625, "grad_norm_var": 0.13697916666666668, "learning_rate": 4e-05, "loss": 4.8112, "loss/crossentropy": 2.242485076189041, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2354438677430153, "step": 2950 }, { "epoch": 0.246, "grad_norm": 4.96875, "grad_norm_var": 0.1345703125, "learning_rate": 4e-05, "loss": 4.7682, "loss/crossentropy": 1.564257226884365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19489132426679134, "step": 2952 }, { "epoch": 0.24616666666666667, "grad_norm": 5.0, "grad_norm_var": 0.13258056640625, "learning_rate": 4e-05, "loss": 4.9509, "loss/crossentropy": 2.032680094242096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20923538878560066, "step": 2954 }, { "epoch": 0.24633333333333332, "grad_norm": 5.28125, "grad_norm_var": 0.138916015625, "learning_rate": 4e-05, "loss": 5.0157, "loss/crossentropy": 1.8767257183790207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18023189157247543, "step": 2956 }, { "epoch": 0.2465, "grad_norm": 5.40625, "grad_norm_var": 0.04752197265625, "learning_rate": 4e-05, "loss": 5.2726, "loss/crossentropy": 2.728525757789612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22680773586034775, "step": 2958 }, { "epoch": 0.24666666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.562744140625, "learning_rate": 4e-05, "loss": 5.1125, "loss/crossentropy": 2.137243375182152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2695735916495323, "step": 2960 }, { "epoch": 0.24683333333333332, "grad_norm": 4.6875, "grad_norm_var": 0.563916015625, "learning_rate": 4e-05, "loss": 4.4757, "loss/crossentropy": 2.077547214925289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17666416242718697, "step": 2962 }, { "epoch": 0.247, "grad_norm": 4.78125, "grad_norm_var": 0.5870402018229167, "learning_rate": 4e-05, "loss": 4.5436, "loss/crossentropy": 2.0804325118660927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18956683948636055, "step": 2964 }, { "epoch": 0.24716666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.593603515625, "learning_rate": 4e-05, "loss": 5.3799, "loss/crossentropy": 2.3817147612571716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155500017106533, "step": 2966 }, { "epoch": 0.24733333333333332, "grad_norm": 5.25, "grad_norm_var": 0.6034464518229167, "learning_rate": 4e-05, "loss": 4.6662, "loss/crossentropy": 1.789809986948967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085736319422722, "step": 2968 }, { "epoch": 0.2475, "grad_norm": 5.0625, "grad_norm_var": 0.602734375, "learning_rate": 4e-05, "loss": 5.3509, "loss/crossentropy": 1.5908312797546387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741067972034216, "step": 2970 }, { "epoch": 0.24766666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.5999348958333334, "learning_rate": 4e-05, "loss": 4.859, "loss/crossentropy": 1.097387008368969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1571547295898199, "step": 2972 }, { "epoch": 0.24783333333333332, "grad_norm": 5.1875, "grad_norm_var": 0.6059733072916667, "learning_rate": 4e-05, "loss": 5.2508, "loss/crossentropy": 1.552187517285347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1697844583541155, "step": 2974 }, { "epoch": 0.248, "grad_norm": 5.59375, "grad_norm_var": 0.09501546223958333, "learning_rate": 4e-05, "loss": 4.5019, "loss/crossentropy": 2.224805660545826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20385221019387245, "step": 2976 }, { "epoch": 0.24816666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08841145833333333, "learning_rate": 4e-05, "loss": 5.1168, "loss/crossentropy": 2.5963165760040283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21954534575343132, "step": 2978 }, { "epoch": 0.24833333333333332, "grad_norm": 4.875, "grad_norm_var": 0.076953125, "learning_rate": 4e-05, "loss": 4.5793, "loss/crossentropy": 1.7659974992275238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1737888753414154, "step": 2980 }, { "epoch": 0.2485, "grad_norm": 5.4375, "grad_norm_var": 0.07746988932291667, "learning_rate": 4e-05, "loss": 5.2939, "loss/crossentropy": 2.5869803428649902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22545062378048897, "step": 2982 }, { "epoch": 0.24866666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.08444010416666667, "learning_rate": 4e-05, "loss": 4.9712, "loss/crossentropy": 2.442513942718506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22997939586639404, "step": 2984 }, { "epoch": 0.24883333333333332, "grad_norm": 5.15625, "grad_norm_var": 0.11669514973958334, "learning_rate": 4e-05, "loss": 4.8956, "loss/crossentropy": 2.3541765213012695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21100665628910065, "step": 2986 }, { "epoch": 0.249, "grad_norm": 5.15625, "grad_norm_var": 0.11220296223958333, "learning_rate": 4e-05, "loss": 5.2216, "loss/crossentropy": 1.5780949518084526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18446550890803337, "step": 2988 }, { "epoch": 0.24916666666666668, "grad_norm": 5.75, "grad_norm_var": 0.12652587890625, "learning_rate": 4e-05, "loss": 5.1027, "loss/crossentropy": 1.2989030554890633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17299162782728672, "step": 2990 }, { "epoch": 0.24933333333333332, "grad_norm": 4.625, "grad_norm_var": 0.13007405598958333, "learning_rate": 4e-05, "loss": 4.0987, "loss/crossentropy": 1.86490598320961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18420669436454773, "step": 2992 }, { "epoch": 0.2495, "grad_norm": 4.75, "grad_norm_var": 0.14172770182291666, "learning_rate": 4e-05, "loss": 4.6842, "loss/crossentropy": 1.3324758186936378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16431885957717896, "step": 2994 }, { "epoch": 0.24966666666666668, "grad_norm": 5.03125, "grad_norm_var": 0.13136393229166668, "learning_rate": 4e-05, "loss": 4.9357, "loss/crossentropy": 2.010372966527939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944860704243183, "step": 2996 }, { "epoch": 0.24983333333333332, "grad_norm": 5.34375, "grad_norm_var": 0.13075764973958334, "learning_rate": 4e-05, "loss": 5.4454, "loss/crossentropy": 2.445209562778473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20995644852519035, "step": 2998 }, { "epoch": 0.25, "grad_norm": 4.875, "grad_norm_var": 0.13524983723958334, "learning_rate": 4e-05, "loss": 4.1599, "loss/crossentropy": 2.438918113708496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217733934521675, "step": 3000 }, { "epoch": 0.25016666666666665, "grad_norm": 5.4375, "grad_norm_var": 0.15467122395833333, "learning_rate": 4e-05, "loss": 4.6514, "loss/crossentropy": 1.777455359697342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.164383664727211, "step": 3002 }, { "epoch": 0.25033333333333335, "grad_norm": 5.28125, "grad_norm_var": 0.15601806640625, "learning_rate": 4e-05, "loss": 4.8879, "loss/crossentropy": 1.5648024901747704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1650273408740759, "step": 3004 }, { "epoch": 0.2505, "grad_norm": 5.25, "grad_norm_var": 0.1333984375, "learning_rate": 4e-05, "loss": 4.8656, "loss/crossentropy": 1.9242472425103188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20352683775126934, "step": 3006 }, { "epoch": 0.25066666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.11575520833333333, "learning_rate": 4e-05, "loss": 5.4478, "loss/crossentropy": 2.404434084892273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24195265397429466, "step": 3008 }, { "epoch": 0.25083333333333335, "grad_norm": 4.6875, "grad_norm_var": 0.11451416015625, "learning_rate": 4e-05, "loss": 4.4537, "loss/crossentropy": 1.8876915350556374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17346367239952087, "step": 3010 }, { "epoch": 0.251, "grad_norm": 5.90625, "grad_norm_var": 0.148291015625, "learning_rate": 4e-05, "loss": 4.7405, "loss/crossentropy": 1.463827095925808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1612564455717802, "step": 3012 }, { "epoch": 0.25116666666666665, "grad_norm": 4.71875, "grad_norm_var": 0.16005452473958334, "learning_rate": 4e-05, "loss": 4.3048, "loss/crossentropy": 1.8139414489269257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981374379247427, "step": 3014 }, { "epoch": 0.25133333333333335, "grad_norm": 4.9375, "grad_norm_var": 0.13645833333333332, "learning_rate": 4e-05, "loss": 5.2113, "loss/crossentropy": 2.3456265330314636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157314494252205, "step": 3016 }, { "epoch": 0.2515, "grad_norm": 5.28125, "grad_norm_var": 0.13212483723958332, "learning_rate": 4e-05, "loss": 5.2592, "loss/crossentropy": 1.983870379626751, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20767552591860294, "step": 3018 }, { "epoch": 0.25166666666666665, "grad_norm": 5.125, "grad_norm_var": 0.15126546223958334, "learning_rate": 4e-05, "loss": 5.5097, "loss/crossentropy": 1.9750956296920776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23682751134037971, "step": 3020 }, { "epoch": 0.25183333333333335, "grad_norm": 4.90625, "grad_norm_var": 0.16835530598958334, "learning_rate": 4e-05, "loss": 4.7177, "loss/crossentropy": 1.952264316380024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19494295865297318, "step": 3022 }, { "epoch": 0.252, "grad_norm": 7.9375, "grad_norm_var": 0.65435791015625, "learning_rate": 4e-05, "loss": 4.9452, "loss/crossentropy": 1.7668914496898651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20950108766555786, "step": 3024 }, { "epoch": 0.25216666666666665, "grad_norm": 5.375, "grad_norm_var": 0.6429646809895834, "learning_rate": 4e-05, "loss": 5.0399, "loss/crossentropy": 2.3525235652923584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22499406710267067, "step": 3026 }, { "epoch": 0.25233333333333335, "grad_norm": 4.84375, "grad_norm_var": 0.6343587239583334, "learning_rate": 4e-05, "loss": 4.7009, "loss/crossentropy": 2.423838883638382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22652239352464676, "step": 3028 }, { "epoch": 0.2525, "grad_norm": 5.6875, "grad_norm_var": 0.6251139322916667, "learning_rate": 4e-05, "loss": 4.9118, "loss/crossentropy": 2.315009117126465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25304970145225525, "step": 3030 }, { "epoch": 0.25266666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.6044921875, "learning_rate": 4e-05, "loss": 5.3209, "loss/crossentropy": 1.1788423582911491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18229475058615208, "step": 3032 }, { "epoch": 0.25283333333333335, "grad_norm": 4.90625, "grad_norm_var": 0.59713134765625, "learning_rate": 4e-05, "loss": 4.8294, "loss/crossentropy": 2.1284771263599396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20650208741426468, "step": 3034 }, { "epoch": 0.253, "grad_norm": 5.25, "grad_norm_var": 0.946728515625, "learning_rate": 4e-05, "loss": 4.833, "loss/crossentropy": 1.8748324885964394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19778919033706188, "step": 3036 }, { "epoch": 0.25316666666666665, "grad_norm": 5.6875, "grad_norm_var": 0.8805989583333333, "learning_rate": 4e-05, "loss": 5.103, "loss/crossentropy": 1.374475210905075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24287692829966545, "step": 3038 }, { "epoch": 0.25333333333333335, "grad_norm": 4.90625, "grad_norm_var": 0.49957275390625, "learning_rate": 4e-05, "loss": 4.6131, "loss/crossentropy": 2.303386151790619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1943720206618309, "step": 3040 }, { "epoch": 0.2535, "grad_norm": 4.84375, "grad_norm_var": 0.5026652018229166, "learning_rate": 4e-05, "loss": 5.2035, "loss/crossentropy": 2.6998149752616882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21584071591496468, "step": 3042 }, { "epoch": 0.25366666666666665, "grad_norm": 4.5625, "grad_norm_var": 0.5387003580729167, "learning_rate": 4e-05, "loss": 4.652, "loss/crossentropy": 1.8677115961909294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18451419100165367, "step": 3044 }, { "epoch": 0.25383333333333336, "grad_norm": 5.3125, "grad_norm_var": 0.5251302083333333, "learning_rate": 4e-05, "loss": 5.0967, "loss/crossentropy": 2.085889607667923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815444529056549, "step": 3046 }, { "epoch": 0.254, "grad_norm": 4.78125, "grad_norm_var": 0.536572265625, "learning_rate": 4e-05, "loss": 5.1203, "loss/crossentropy": 1.9297250807285309, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21572398394346237, "step": 3048 }, { "epoch": 0.25416666666666665, "grad_norm": 4.59375, "grad_norm_var": 0.5628255208333334, "learning_rate": 4e-05, "loss": 5.0028, "loss/crossentropy": 2.18538436293602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21398789063096046, "step": 3050 }, { "epoch": 0.25433333333333336, "grad_norm": 5.0, "grad_norm_var": 0.08681233723958333, "learning_rate": 4e-05, "loss": 4.7436, "loss/crossentropy": 1.881318211555481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17555838078260422, "step": 3052 }, { "epoch": 0.2545, "grad_norm": 5.53125, "grad_norm_var": 0.06534830729166667, "learning_rate": 4e-05, "loss": 5.5268, "loss/crossentropy": 2.593364179134369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22329099476337433, "step": 3054 }, { "epoch": 0.25466666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.06262613932291666, "learning_rate": 4e-05, "loss": 5.3541, "loss/crossentropy": 1.8421437069773674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1585595551878214, "step": 3056 }, { "epoch": 0.25483333333333336, "grad_norm": 8.75, "grad_norm_var": 0.9886678059895834, "learning_rate": 4e-05, "loss": 4.2504, "loss/crossentropy": 1.5335690155625343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15620680898427963, "step": 3058 }, { "epoch": 0.255, "grad_norm": 5.21875, "grad_norm_var": 0.9575358072916667, "learning_rate": 4e-05, "loss": 5.0432, "loss/crossentropy": 2.136391341686249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2133907824754715, "step": 3060 }, { "epoch": 0.25516666666666665, "grad_norm": 4.34375, "grad_norm_var": 0.994775390625, "learning_rate": 4e-05, "loss": 4.5259, "loss/crossentropy": 2.214922845363617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20154564455151558, "step": 3062 }, { "epoch": 0.25533333333333336, "grad_norm": 4.9375, "grad_norm_var": 0.9885701497395833, "learning_rate": 4e-05, "loss": 5.2165, "loss/crossentropy": 1.6571815237402916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795507915318012, "step": 3064 }, { "epoch": 0.2555, "grad_norm": 4.96875, "grad_norm_var": 0.9559529622395834, "learning_rate": 4e-05, "loss": 4.9805, "loss/crossentropy": 2.20686411857605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20064949244260788, "step": 3066 }, { "epoch": 0.25566666666666665, "grad_norm": 5.25, "grad_norm_var": 0.9485636393229167, "learning_rate": 4e-05, "loss": 5.049, "loss/crossentropy": 1.8665556535124779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17407236993312836, "step": 3068 }, { "epoch": 0.25583333333333336, "grad_norm": 5.1875, "grad_norm_var": 0.9401652018229166, "learning_rate": 4e-05, "loss": 5.3568, "loss/crossentropy": 2.4138555824756622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21526311710476875, "step": 3070 }, { "epoch": 0.256, "grad_norm": 4.71875, "grad_norm_var": 0.9445149739583333, "learning_rate": 4e-05, "loss": 5.3499, "loss/crossentropy": 1.4635539650917053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14331906288862228, "step": 3072 }, { "epoch": 0.25616666666666665, "grad_norm": 5.625, "grad_norm_var": 0.07545572916666667, "learning_rate": 4e-05, "loss": 5.6253, "loss/crossentropy": 2.168128550052643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2246512994170189, "step": 3074 }, { "epoch": 0.25633333333333336, "grad_norm": 5.40625, "grad_norm_var": 0.0783203125, "learning_rate": 4e-05, "loss": 5.3584, "loss/crossentropy": 2.067095883190632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18999614752829075, "step": 3076 }, { "epoch": 0.2565, "grad_norm": 4.625, "grad_norm_var": 0.05533447265625, "learning_rate": 4e-05, "loss": 4.4266, "loss/crossentropy": 1.4931324049830437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15274662151932716, "step": 3078 }, { "epoch": 0.25666666666666665, "grad_norm": 5.0625, "grad_norm_var": 0.07102457682291667, "learning_rate": 4e-05, "loss": 4.7741, "loss/crossentropy": 1.9350454285740852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.177174037322402, "step": 3080 }, { "epoch": 0.25683333333333336, "grad_norm": 5.3125, "grad_norm_var": 0.08879801432291666, "learning_rate": 4e-05, "loss": 5.2018, "loss/crossentropy": 2.3658514618873596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21689199656248093, "step": 3082 }, { "epoch": 0.257, "grad_norm": 5.5, "grad_norm_var": 0.10442301432291666, "learning_rate": 4e-05, "loss": 5.0723, "loss/crossentropy": 1.9613457173109055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923742052167654, "step": 3084 }, { "epoch": 0.25716666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.10349934895833333, "learning_rate": 4e-05, "loss": 5.1772, "loss/crossentropy": 2.0767830312252045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23773017153143883, "step": 3086 }, { "epoch": 0.25733333333333336, "grad_norm": 5.375, "grad_norm_var": 0.10396728515625, "learning_rate": 4e-05, "loss": 4.8215, "loss/crossentropy": 1.7117633819580078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1797526590526104, "step": 3088 }, { "epoch": 0.2575, "grad_norm": 5.28125, "grad_norm_var": 0.114306640625, "learning_rate": 4e-05, "loss": 5.2554, "loss/crossentropy": 1.9202795922756195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22795704752206802, "step": 3090 }, { "epoch": 0.25766666666666665, "grad_norm": 4.5625, "grad_norm_var": 0.12177327473958334, "learning_rate": 4e-05, "loss": 4.9953, "loss/crossentropy": 1.3964777737855911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15975043550133705, "step": 3092 }, { "epoch": 0.25783333333333336, "grad_norm": 5.125, "grad_norm_var": 0.117822265625, "learning_rate": 4e-05, "loss": 4.9921, "loss/crossentropy": 2.2678469121456146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2159191109240055, "step": 3094 }, { "epoch": 0.258, "grad_norm": 5.0625, "grad_norm_var": 0.10702718098958333, "learning_rate": 4e-05, "loss": 5.1833, "loss/crossentropy": 1.8077596053481102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17754389718174934, "step": 3096 }, { "epoch": 0.25816666666666666, "grad_norm": 5.46875, "grad_norm_var": 0.12102457682291666, "learning_rate": 4e-05, "loss": 5.5425, "loss/crossentropy": 2.327247679233551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090022973716259, "step": 3098 }, { "epoch": 0.25833333333333336, "grad_norm": 5.125, "grad_norm_var": 0.11256510416666667, "learning_rate": 4e-05, "loss": 4.4231, "loss/crossentropy": 1.0582982525229454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14690488018095493, "step": 3100 }, { "epoch": 0.2585, "grad_norm": 4.78125, "grad_norm_var": 0.12447509765625, "learning_rate": 4e-05, "loss": 5.1684, "loss/crossentropy": 2.24881511926651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19497083500027657, "step": 3102 }, { "epoch": 0.25866666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.12945556640625, "learning_rate": 4e-05, "loss": 4.5281, "loss/crossentropy": 2.5441195368766785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22488819062709808, "step": 3104 }, { "epoch": 0.25883333333333336, "grad_norm": 5.0, "grad_norm_var": 0.09163004557291667, "learning_rate": 4e-05, "loss": 4.4234, "loss/crossentropy": 1.352687880396843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15268324688076973, "step": 3106 }, { "epoch": 0.259, "grad_norm": 4.625, "grad_norm_var": 0.10302327473958334, "learning_rate": 4e-05, "loss": 5.2155, "loss/crossentropy": 2.357410877943039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21430985257029533, "step": 3108 }, { "epoch": 0.25916666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.10701497395833333, "learning_rate": 4e-05, "loss": 4.9418, "loss/crossentropy": 1.0604775324463844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14432695135474205, "step": 3110 }, { "epoch": 0.25933333333333336, "grad_norm": 5.28125, "grad_norm_var": 0.1150390625, "learning_rate": 4e-05, "loss": 5.6923, "loss/crossentropy": 2.180578827857971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20271052047610283, "step": 3112 }, { "epoch": 0.2595, "grad_norm": 6.125, "grad_norm_var": 0.15416259765625, "learning_rate": 4e-05, "loss": 5.4177, "loss/crossentropy": 2.2036134004592896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19966942071914673, "step": 3114 }, { "epoch": 0.25966666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.15191650390625, "learning_rate": 4e-05, "loss": 4.8192, "loss/crossentropy": 2.486625075340271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21220136806368828, "step": 3116 }, { "epoch": 0.25983333333333336, "grad_norm": 5.0, "grad_norm_var": 0.14612223307291666, "learning_rate": 4e-05, "loss": 5.0516, "loss/crossentropy": 2.187554508447647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20102859288454056, "step": 3118 }, { "epoch": 0.26, "grad_norm": 4.375, "grad_norm_var": 0.16428629557291666, "learning_rate": 4e-05, "loss": 4.7012, "loss/crossentropy": 2.2847339808940887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20380695909261703, "step": 3120 }, { "epoch": 0.26016666666666666, "grad_norm": 4.625, "grad_norm_var": 0.17362874348958332, "learning_rate": 4e-05, "loss": 4.6752, "loss/crossentropy": 2.6803387999534607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214877150952816, "step": 3122 }, { "epoch": 0.26033333333333336, "grad_norm": 5.09375, "grad_norm_var": 0.15247395833333333, "learning_rate": 4e-05, "loss": 5.0271, "loss/crossentropy": 1.861951231956482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054112609475851, "step": 3124 }, { "epoch": 0.2605, "grad_norm": 4.84375, "grad_norm_var": 0.17760416666666667, "learning_rate": 4e-05, "loss": 5.052, "loss/crossentropy": 1.845504753291607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22759747132658958, "step": 3126 }, { "epoch": 0.26066666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.168603515625, "learning_rate": 4e-05, "loss": 4.966, "loss/crossentropy": 2.169025242328644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220594372600317, "step": 3128 }, { "epoch": 0.2608333333333333, "grad_norm": 4.75, "grad_norm_var": 0.0830078125, "learning_rate": 4e-05, "loss": 4.4052, "loss/crossentropy": 1.50277678668499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1798977069556713, "step": 3130 }, { "epoch": 0.261, "grad_norm": 4.875, "grad_norm_var": 0.09996337890625, "learning_rate": 4e-05, "loss": 5.3784, "loss/crossentropy": 2.4498740434646606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22047830745577812, "step": 3132 }, { "epoch": 0.26116666666666666, "grad_norm": 4.875, "grad_norm_var": 0.10013020833333333, "learning_rate": 4e-05, "loss": 4.8054, "loss/crossentropy": 1.9772669970989227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18800954148173332, "step": 3134 }, { "epoch": 0.2613333333333333, "grad_norm": 4.875, "grad_norm_var": 0.09488525390625, "learning_rate": 4e-05, "loss": 5.2146, "loss/crossentropy": 1.8985195010900497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1984998844563961, "step": 3136 }, { "epoch": 0.2615, "grad_norm": 5.125, "grad_norm_var": 0.086572265625, "learning_rate": 4e-05, "loss": 4.9154, "loss/crossentropy": 1.8307873159646988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17886247113347054, "step": 3138 }, { "epoch": 0.26166666666666666, "grad_norm": 5.125, "grad_norm_var": 0.08778889973958333, "learning_rate": 4e-05, "loss": 4.2008, "loss/crossentropy": 1.0259275138378143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13531352765858173, "step": 3140 }, { "epoch": 0.2618333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.07936197916666667, "learning_rate": 4e-05, "loss": 4.8041, "loss/crossentropy": 1.9614720344543457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22988587990403175, "step": 3142 }, { "epoch": 0.262, "grad_norm": 4.96875, "grad_norm_var": 0.07545572916666667, "learning_rate": 4e-05, "loss": 4.651, "loss/crossentropy": 2.0952285528182983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139127030968666, "step": 3144 }, { "epoch": 0.26216666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.07194010416666667, "learning_rate": 4e-05, "loss": 4.8872, "loss/crossentropy": 2.3387314677238464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2338441088795662, "step": 3146 }, { "epoch": 0.2623333333333333, "grad_norm": 7.0625, "grad_norm_var": 0.3492838541666667, "learning_rate": 4e-05, "loss": 4.8195, "loss/crossentropy": 1.864521287381649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1690395548939705, "step": 3148 }, { "epoch": 0.2625, "grad_norm": 4.96875, "grad_norm_var": 0.34215087890625, "learning_rate": 4e-05, "loss": 4.9518, "loss/crossentropy": 2.1711268723011017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20523667708039284, "step": 3150 }, { "epoch": 0.26266666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.33498942057291664, "learning_rate": 4e-05, "loss": 5.4427, "loss/crossentropy": 2.0541456565260887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1990213841199875, "step": 3152 }, { "epoch": 0.2628333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.33255208333333336, "learning_rate": 4e-05, "loss": 4.4446, "loss/crossentropy": 2.2430761456489563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24454378709197044, "step": 3154 }, { "epoch": 0.263, "grad_norm": 5.5, "grad_norm_var": 0.3441365559895833, "learning_rate": 4e-05, "loss": 4.6601, "loss/crossentropy": 1.9410057738423347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1790495365858078, "step": 3156 }, { "epoch": 0.26316666666666666, "grad_norm": 4.625, "grad_norm_var": 0.34894205729166666, "learning_rate": 4e-05, "loss": 4.716, "loss/crossentropy": 2.0159209072589874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19673524983227253, "step": 3158 }, { "epoch": 0.2633333333333333, "grad_norm": 5.375, "grad_norm_var": 0.3651041666666667, "learning_rate": 4e-05, "loss": 5.6148, "loss/crossentropy": 2.481500566005707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217854592949152, "step": 3160 }, { "epoch": 0.2635, "grad_norm": 5.875, "grad_norm_var": 0.3628743489583333, "learning_rate": 4e-05, "loss": 4.9497, "loss/crossentropy": 2.3142440021038055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22278093919157982, "step": 3162 }, { "epoch": 0.26366666666666666, "grad_norm": 4.75, "grad_norm_var": 0.11575520833333333, "learning_rate": 4e-05, "loss": 5.2122, "loss/crossentropy": 1.4542016088962555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14941613376140594, "step": 3164 }, { "epoch": 0.2638333333333333, "grad_norm": 5.25, "grad_norm_var": 0.13199462890625, "learning_rate": 4e-05, "loss": 4.6475, "loss/crossentropy": 2.320178806781769, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20817406103014946, "step": 3166 }, { "epoch": 0.264, "grad_norm": 5.09375, "grad_norm_var": 0.13214518229166666, "learning_rate": 4e-05, "loss": 5.1067, "loss/crossentropy": 1.9453425705432892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127118781208992, "step": 3168 }, { "epoch": 0.26416666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.15015869140625, "learning_rate": 4e-05, "loss": 4.8359, "loss/crossentropy": 1.9130103662610054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1991387903690338, "step": 3170 }, { "epoch": 0.2643333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.14706624348958333, "learning_rate": 4e-05, "loss": 5.1114, "loss/crossentropy": 1.981778234243393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1902402602136135, "step": 3172 }, { "epoch": 0.2645, "grad_norm": 5.4375, "grad_norm_var": 0.13782145182291666, "learning_rate": 4e-05, "loss": 5.281, "loss/crossentropy": 2.231064334511757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19765946455299854, "step": 3174 }, { "epoch": 0.26466666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.119384765625, "learning_rate": 4e-05, "loss": 4.3434, "loss/crossentropy": 1.753200277686119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18743771128356457, "step": 3176 }, { "epoch": 0.2648333333333333, "grad_norm": 5.125, "grad_norm_var": 0.10230712890625, "learning_rate": 4e-05, "loss": 5.3008, "loss/crossentropy": 2.396784156560898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2302083782851696, "step": 3178 }, { "epoch": 0.265, "grad_norm": 5.0, "grad_norm_var": 0.097509765625, "learning_rate": 4e-05, "loss": 5.6565, "loss/crossentropy": 2.6294925808906555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2336473949253559, "step": 3180 }, { "epoch": 0.26516666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.09308268229166666, "learning_rate": 4e-05, "loss": 4.9291, "loss/crossentropy": 1.6210493966937065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16944964230060577, "step": 3182 }, { "epoch": 0.2653333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.09791259765625, "learning_rate": 4e-05, "loss": 4.6843, "loss/crossentropy": 1.8240682110190392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20642348006367683, "step": 3184 }, { "epoch": 0.2655, "grad_norm": 5.125, "grad_norm_var": 0.08756510416666667, "learning_rate": 4e-05, "loss": 4.7397, "loss/crossentropy": 2.4440919160842896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144545502960682, "step": 3186 }, { "epoch": 0.26566666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.07662760416666667, "learning_rate": 4e-05, "loss": 5.3081, "loss/crossentropy": 2.5918545722961426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22979921475052834, "step": 3188 }, { "epoch": 0.2658333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.07615559895833333, "learning_rate": 4e-05, "loss": 4.3005, "loss/crossentropy": 1.7671714574098587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187911469489336, "step": 3190 }, { "epoch": 0.266, "grad_norm": 5.375, "grad_norm_var": 0.084375, "learning_rate": 4e-05, "loss": 5.0428, "loss/crossentropy": 1.6388864442706108, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955713890492916, "step": 3192 }, { "epoch": 0.26616666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.04895426432291667, "learning_rate": 4e-05, "loss": 5.1676, "loss/crossentropy": 1.9798070192337036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20462529733777046, "step": 3194 }, { "epoch": 0.2663333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.04947509765625, "learning_rate": 4e-05, "loss": 4.5824, "loss/crossentropy": 2.0957940965890884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1770927645266056, "step": 3196 }, { "epoch": 0.2665, "grad_norm": 4.90625, "grad_norm_var": 0.04244384765625, "learning_rate": 4e-05, "loss": 5.1135, "loss/crossentropy": 1.977191299200058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1755628753453493, "step": 3198 }, { "epoch": 0.26666666666666666, "grad_norm": 5.3125, "grad_norm_var": 0.043863932291666664, "learning_rate": 4e-05, "loss": 5.1938, "loss/crossentropy": 1.5719657689332962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15234808064997196, "step": 3200 }, { "epoch": 0.2668333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.09797770182291667, "learning_rate": 4e-05, "loss": 5.1316, "loss/crossentropy": 1.9265673011541367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19334514439105988, "step": 3202 }, { "epoch": 0.267, "grad_norm": 5.4375, "grad_norm_var": 0.11991780598958333, "learning_rate": 4e-05, "loss": 5.4066, "loss/crossentropy": 1.7487748563289642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16883439384400845, "step": 3204 }, { "epoch": 0.26716666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.10393473307291666, "learning_rate": 4e-05, "loss": 5.3232, "loss/crossentropy": 1.8918979242444038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1814283076673746, "step": 3206 }, { "epoch": 0.2673333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.10279947916666667, "learning_rate": 4e-05, "loss": 4.5995, "loss/crossentropy": 2.1503345668315887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2536822408437729, "step": 3208 }, { "epoch": 0.2675, "grad_norm": 4.90625, "grad_norm_var": 0.10256754557291667, "learning_rate": 4e-05, "loss": 4.8599, "loss/crossentropy": 1.4438120797276497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15272528119385242, "step": 3210 }, { "epoch": 0.26766666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.10813395182291667, "learning_rate": 4e-05, "loss": 4.9177, "loss/crossentropy": 1.8605887293815613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19962546601891518, "step": 3212 }, { "epoch": 0.2678333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.11399332682291667, "learning_rate": 4e-05, "loss": 5.6567, "loss/crossentropy": 2.586755871772766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24593067169189453, "step": 3214 }, { "epoch": 0.268, "grad_norm": 10.3125, "grad_norm_var": 1.7810506184895833, "learning_rate": 4e-05, "loss": 5.4459, "loss/crossentropy": 1.9296107813715935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18788691982626915, "step": 3216 }, { "epoch": 0.26816666666666666, "grad_norm": 4.8125, "grad_norm_var": 1.7888631184895833, "learning_rate": 4e-05, "loss": 4.6947, "loss/crossentropy": 2.048318862915039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1871740221977234, "step": 3218 }, { "epoch": 0.2683333333333333, "grad_norm": 4.65625, "grad_norm_var": 1.8235514322916666, "learning_rate": 4e-05, "loss": 4.4589, "loss/crossentropy": 1.732799842953682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16609930619597435, "step": 3220 }, { "epoch": 0.2685, "grad_norm": 5.34375, "grad_norm_var": 1.8171712239583333, "learning_rate": 4e-05, "loss": 5.099, "loss/crossentropy": 2.048336148262024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951584815979004, "step": 3222 }, { "epoch": 0.26866666666666666, "grad_norm": 4.9375, "grad_norm_var": 1.83570556640625, "learning_rate": 4e-05, "loss": 4.5305, "loss/crossentropy": 2.4394567012786865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21930208802223206, "step": 3224 }, { "epoch": 0.2688333333333333, "grad_norm": 4.71875, "grad_norm_var": 1.851416015625, "learning_rate": 4e-05, "loss": 4.3228, "loss/crossentropy": 1.6342605128884315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1711755134165287, "step": 3226 }, { "epoch": 0.269, "grad_norm": 4.9375, "grad_norm_var": 1.8655558268229167, "learning_rate": 4e-05, "loss": 4.9384, "loss/crossentropy": 2.0174030661582947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106718048453331, "step": 3228 }, { "epoch": 0.26916666666666667, "grad_norm": 5.03125, "grad_norm_var": 1.8641886393229166, "learning_rate": 4e-05, "loss": 5.324, "loss/crossentropy": 1.9064742401242256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16172141209244728, "step": 3230 }, { "epoch": 0.2693333333333333, "grad_norm": 4.75, "grad_norm_var": 0.06669514973958333, "learning_rate": 4e-05, "loss": 4.881, "loss/crossentropy": 2.1121154129505157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355704799294472, "step": 3232 }, { "epoch": 0.2695, "grad_norm": 5.1875, "grad_norm_var": 0.34280192057291664, "learning_rate": 4e-05, "loss": 5.3211, "loss/crossentropy": 1.989701747894287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1762289609760046, "step": 3234 }, { "epoch": 0.26966666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.33136393229166666, "learning_rate": 4e-05, "loss": 4.7003, "loss/crossentropy": 2.457001119852066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2369070202112198, "step": 3236 }, { "epoch": 0.2698333333333333, "grad_norm": 5.0, "grad_norm_var": 0.32483317057291666, "learning_rate": 4e-05, "loss": 4.9221, "loss/crossentropy": 0.9388425797224045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12862342968583107, "step": 3238 }, { "epoch": 0.27, "grad_norm": 5.03125, "grad_norm_var": 0.318994140625, "learning_rate": 4e-05, "loss": 4.5479, "loss/crossentropy": 0.9154981449246407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.126858439296484, "step": 3240 }, { "epoch": 0.27016666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.32135009765625, "learning_rate": 4e-05, "loss": 4.9938, "loss/crossentropy": 1.5279822647571564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15195055678486824, "step": 3242 }, { "epoch": 0.2703333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.30831705729166664, "learning_rate": 4e-05, "loss": 5.1522, "loss/crossentropy": 2.0843097865581512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19562271982431412, "step": 3244 }, { "epoch": 0.2705, "grad_norm": 5.09375, "grad_norm_var": 0.3346964518229167, "learning_rate": 4e-05, "loss": 5.0055, "loss/crossentropy": 1.8548680245876312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1670110784471035, "step": 3246 }, { "epoch": 0.27066666666666667, "grad_norm": 5.125, "grad_norm_var": 0.32146809895833334, "learning_rate": 4e-05, "loss": 4.913, "loss/crossentropy": 2.6331475973129272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23913651704788208, "step": 3248 }, { "epoch": 0.2708333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.05701497395833333, "learning_rate": 4e-05, "loss": 4.9626, "loss/crossentropy": 2.254679262638092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20621953532099724, "step": 3250 }, { "epoch": 0.271, "grad_norm": 5.0, "grad_norm_var": 0.048291015625, "learning_rate": 4e-05, "loss": 4.4947, "loss/crossentropy": 2.1386323794722557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19046901538968086, "step": 3252 }, { "epoch": 0.27116666666666667, "grad_norm": 4.875, "grad_norm_var": 0.04820556640625, "learning_rate": 4e-05, "loss": 4.8927, "loss/crossentropy": 1.6399564519524574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18125244602560997, "step": 3254 }, { "epoch": 0.2713333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.05284830729166667, "learning_rate": 4e-05, "loss": 5.1166, "loss/crossentropy": 1.8724690079689026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21212750673294067, "step": 3256 }, { "epoch": 0.2715, "grad_norm": 4.875, "grad_norm_var": 0.03268229166666667, "learning_rate": 4e-05, "loss": 4.6229, "loss/crossentropy": 1.315964438021183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17530952394008636, "step": 3258 }, { "epoch": 0.27166666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.03658854166666667, "learning_rate": 4e-05, "loss": 5.214, "loss/crossentropy": 1.7634995728731155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21435541100800037, "step": 3260 }, { "epoch": 0.2718333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.15556233723958332, "learning_rate": 4e-05, "loss": 4.5934, "loss/crossentropy": 2.2641907036304474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23568279296159744, "step": 3262 }, { "epoch": 0.272, "grad_norm": 4.40625, "grad_norm_var": 0.18863525390625, "learning_rate": 4e-05, "loss": 4.6097, "loss/crossentropy": 1.5739614740014076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1697534527629614, "step": 3264 }, { "epoch": 0.27216666666666667, "grad_norm": 5.65625, "grad_norm_var": 0.21256510416666666, "learning_rate": 4e-05, "loss": 4.8451, "loss/crossentropy": 2.052751898765564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18519877456128597, "step": 3266 }, { "epoch": 0.2723333333333333, "grad_norm": 4.75, "grad_norm_var": 0.23088785807291667, "learning_rate": 4e-05, "loss": 4.6083, "loss/crossentropy": 1.1079089492559433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11845254711806774, "step": 3268 }, { "epoch": 0.2725, "grad_norm": 4.9375, "grad_norm_var": 0.2380859375, "learning_rate": 4e-05, "loss": 4.9116, "loss/crossentropy": 1.9221205562353134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930643692612648, "step": 3270 }, { "epoch": 0.27266666666666667, "grad_norm": 5.125, "grad_norm_var": 0.2255859375, "learning_rate": 4e-05, "loss": 5.3047, "loss/crossentropy": 2.6353172063827515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22324591875076294, "step": 3272 }, { "epoch": 0.2728333333333333, "grad_norm": 6.5625, "grad_norm_var": 0.3466796875, "learning_rate": 4e-05, "loss": 4.5763, "loss/crossentropy": 2.372876226902008, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2210596241056919, "step": 3274 }, { "epoch": 0.273, "grad_norm": 4.5, "grad_norm_var": 0.37942708333333336, "learning_rate": 4e-05, "loss": 4.5375, "loss/crossentropy": 1.4885797277092934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15314335376024246, "step": 3276 }, { "epoch": 0.27316666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.27278238932291665, "learning_rate": 4e-05, "loss": 4.698, "loss/crossentropy": 1.7347316294908524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17671633325517178, "step": 3278 }, { "epoch": 0.2733333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.239697265625, "learning_rate": 4e-05, "loss": 5.1095, "loss/crossentropy": 1.9635898768901825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21393844485282898, "step": 3280 }, { "epoch": 0.2735, "grad_norm": 4.78125, "grad_norm_var": 0.23287760416666667, "learning_rate": 4e-05, "loss": 4.4866, "loss/crossentropy": 1.3102534040808678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16433557122945786, "step": 3282 }, { "epoch": 0.27366666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.21649983723958333, "learning_rate": 4e-05, "loss": 4.8568, "loss/crossentropy": 2.0422130823135376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17257808148860931, "step": 3284 }, { "epoch": 0.2738333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.207275390625, "learning_rate": 4e-05, "loss": 5.3481, "loss/crossentropy": 1.767970271408558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1866742093116045, "step": 3286 }, { "epoch": 0.274, "grad_norm": 5.34375, "grad_norm_var": 0.21874593098958334, "learning_rate": 4e-05, "loss": 4.8972, "loss/crossentropy": 2.234264552593231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20818063244223595, "step": 3288 }, { "epoch": 0.27416666666666667, "grad_norm": 4.875, "grad_norm_var": 0.07056884765625, "learning_rate": 4e-05, "loss": 4.5813, "loss/crossentropy": 1.398232415318489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15509275533258915, "step": 3290 }, { "epoch": 0.2743333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.054541015625, "learning_rate": 4e-05, "loss": 4.8906, "loss/crossentropy": 1.7504291385412216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19099892303347588, "step": 3292 }, { "epoch": 0.2745, "grad_norm": 5.28125, "grad_norm_var": 0.04859619140625, "learning_rate": 4e-05, "loss": 4.9809, "loss/crossentropy": 2.1250991821289062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20637407153844833, "step": 3294 }, { "epoch": 0.27466666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.045426432291666666, "learning_rate": 4e-05, "loss": 5.0804, "loss/crossentropy": 2.4322333335876465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22350069507956505, "step": 3296 }, { "epoch": 0.2748333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.03292643229166667, "learning_rate": 4e-05, "loss": 5.1094, "loss/crossentropy": 2.3606340885162354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20735639706254005, "step": 3298 }, { "epoch": 0.275, "grad_norm": 4.875, "grad_norm_var": 0.030582682291666666, "learning_rate": 4e-05, "loss": 4.8605, "loss/crossentropy": 1.2059366628527641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15578687004745007, "step": 3300 }, { "epoch": 0.27516666666666667, "grad_norm": 5.375, "grad_norm_var": 0.04980061848958333, "learning_rate": 4e-05, "loss": 4.461, "loss/crossentropy": 1.335912600159645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15207264013588428, "step": 3302 }, { "epoch": 0.2753333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.042867024739583336, "learning_rate": 4e-05, "loss": 4.2409, "loss/crossentropy": 0.9111597612500191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1322095450013876, "step": 3304 }, { "epoch": 0.2755, "grad_norm": 4.59375, "grad_norm_var": 0.05423177083333333, "learning_rate": 4e-05, "loss": 5.0508, "loss/crossentropy": 2.276846766471863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20194971933960915, "step": 3306 }, { "epoch": 0.27566666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.05865885416666667, "learning_rate": 4e-05, "loss": 3.8198, "loss/crossentropy": 1.5635306984186172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16944929771125317, "step": 3308 }, { "epoch": 0.2758333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.060282389322916664, "learning_rate": 4e-05, "loss": 4.7673, "loss/crossentropy": 2.0001417845487595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25854466669261456, "step": 3310 }, { "epoch": 0.276, "grad_norm": 5.28125, "grad_norm_var": 0.06731770833333334, "learning_rate": 4e-05, "loss": 5.1252, "loss/crossentropy": 2.004520893096924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436689630150795, "step": 3312 }, { "epoch": 0.27616666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.0599609375, "learning_rate": 4e-05, "loss": 4.1743, "loss/crossentropy": 1.7104368656873703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18997495248913765, "step": 3314 }, { "epoch": 0.2763333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.067431640625, "learning_rate": 4e-05, "loss": 4.3987, "loss/crossentropy": 1.6688388288021088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20947271212935448, "step": 3316 }, { "epoch": 0.2765, "grad_norm": 4.59375, "grad_norm_var": 0.04918212890625, "learning_rate": 4e-05, "loss": 4.6297, "loss/crossentropy": 1.7906979843974113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1828641202300787, "step": 3318 }, { "epoch": 0.27666666666666667, "grad_norm": 5.375, "grad_norm_var": 0.13917643229166668, "learning_rate": 4e-05, "loss": 5.2282, "loss/crossentropy": 1.8596151024103165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19812500476837158, "step": 3320 }, { "epoch": 0.2768333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.13097330729166667, "learning_rate": 4e-05, "loss": 4.6779, "loss/crossentropy": 1.829872913658619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17703045904636383, "step": 3322 }, { "epoch": 0.277, "grad_norm": 5.5625, "grad_norm_var": 0.14312744140625, "learning_rate": 4e-05, "loss": 5.2029, "loss/crossentropy": 2.1334268152713776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22605855762958527, "step": 3324 }, { "epoch": 0.2771666666666667, "grad_norm": 5.25, "grad_norm_var": 0.13534749348958333, "learning_rate": 4e-05, "loss": 5.2196, "loss/crossentropy": 2.1470797285437584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1935490034520626, "step": 3326 }, { "epoch": 0.2773333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.14254150390625, "learning_rate": 4e-05, "loss": 4.7942, "loss/crossentropy": 1.2008096277713776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13764559477567673, "step": 3328 }, { "epoch": 0.2775, "grad_norm": 5.15625, "grad_norm_var": 0.14039306640625, "learning_rate": 4e-05, "loss": 4.7285, "loss/crossentropy": 0.860782727599144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11173893883824348, "step": 3330 }, { "epoch": 0.2776666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.12434488932291667, "learning_rate": 4e-05, "loss": 5.1798, "loss/crossentropy": 1.5026598796248436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16885693185031414, "step": 3332 }, { "epoch": 0.2778333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.11148681640625, "learning_rate": 4e-05, "loss": 5.1526, "loss/crossentropy": 1.838165283203125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18797642178833485, "step": 3334 }, { "epoch": 0.278, "grad_norm": 5.25, "grad_norm_var": 0.06652018229166666, "learning_rate": 4e-05, "loss": 4.8834, "loss/crossentropy": 1.5069977939128876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18563106283545494, "step": 3336 }, { "epoch": 0.2781666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.07502848307291667, "learning_rate": 4e-05, "loss": 4.7134, "loss/crossentropy": 2.5508508384227753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22817543521523476, "step": 3338 }, { "epoch": 0.2783333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.04722900390625, "learning_rate": 4e-05, "loss": 4.7194, "loss/crossentropy": 2.5258346498012543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2168513759970665, "step": 3340 }, { "epoch": 0.2785, "grad_norm": 5.125, "grad_norm_var": 0.04166259765625, "learning_rate": 4e-05, "loss": 4.8206, "loss/crossentropy": 1.079502247273922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16153132170438766, "step": 3342 }, { "epoch": 0.2786666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.04251302083333333, "learning_rate": 4e-05, "loss": 5.8333, "loss/crossentropy": 1.930338904261589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19220566004514694, "step": 3344 }, { "epoch": 0.2788333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.05429280598958333, "learning_rate": 4e-05, "loss": 4.6864, "loss/crossentropy": 2.3099615573883057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20308882370591164, "step": 3346 }, { "epoch": 0.279, "grad_norm": 4.78125, "grad_norm_var": 0.05310872395833333, "learning_rate": 4e-05, "loss": 4.0611, "loss/crossentropy": 1.3752945065498352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14378135465085506, "step": 3348 }, { "epoch": 0.2791666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.053450520833333334, "learning_rate": 4e-05, "loss": 5.0266, "loss/crossentropy": 2.265912115573883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20121736079454422, "step": 3350 }, { "epoch": 0.2793333333333333, "grad_norm": 4.875, "grad_norm_var": 0.036909993489583334, "learning_rate": 4e-05, "loss": 4.7789, "loss/crossentropy": 2.411749243736267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24077193066477776, "step": 3352 }, { "epoch": 0.2795, "grad_norm": 5.1875, "grad_norm_var": 0.049214680989583336, "learning_rate": 4e-05, "loss": 5.0433, "loss/crossentropy": 1.2377412021160126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1717513632029295, "step": 3354 }, { "epoch": 0.2796666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.05455729166666667, "learning_rate": 4e-05, "loss": 5.0701, "loss/crossentropy": 1.9626172259449959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19007350504398346, "step": 3356 }, { "epoch": 0.2798333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.06783447265625, "learning_rate": 4e-05, "loss": 4.8856, "loss/crossentropy": 1.9040052741765976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1902400143444538, "step": 3358 }, { "epoch": 0.28, "grad_norm": 4.96875, "grad_norm_var": 0.061909993489583336, "learning_rate": 4e-05, "loss": 4.5763, "loss/crossentropy": 2.0596228316426277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19010531157255173, "step": 3360 }, { "epoch": 0.2801666666666667, "grad_norm": 5.875, "grad_norm_var": 0.10143229166666666, "learning_rate": 4e-05, "loss": 4.9529, "loss/crossentropy": 1.5693950355052948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19930481910705566, "step": 3362 }, { "epoch": 0.2803333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.09498697916666667, "learning_rate": 4e-05, "loss": 5.1627, "loss/crossentropy": 1.9942995011806488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22873876243829727, "step": 3364 }, { "epoch": 0.2805, "grad_norm": 5.28125, "grad_norm_var": 0.09830322265625, "learning_rate": 4e-05, "loss": 4.52, "loss/crossentropy": 0.7917919382452965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12067169696092606, "step": 3366 }, { "epoch": 0.2806666666666667, "grad_norm": 4.875, "grad_norm_var": 0.10076497395833334, "learning_rate": 4e-05, "loss": 4.3528, "loss/crossentropy": 1.2874325066804886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14191594906151295, "step": 3368 }, { "epoch": 0.2808333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.07919514973958333, "learning_rate": 4e-05, "loss": 4.8554, "loss/crossentropy": 1.3948650658130646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20387698709964752, "step": 3370 }, { "epoch": 0.281, "grad_norm": 4.9375, "grad_norm_var": 0.07200113932291667, "learning_rate": 4e-05, "loss": 5.2899, "loss/crossentropy": 1.7417053952813148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18501684069633484, "step": 3372 }, { "epoch": 0.2811666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.08860270182291667, "learning_rate": 4e-05, "loss": 3.8689, "loss/crossentropy": 1.5205266997218132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15771334990859032, "step": 3374 }, { "epoch": 0.2813333333333333, "grad_norm": 5.25, "grad_norm_var": 0.11339518229166666, "learning_rate": 4e-05, "loss": 5.5822, "loss/crossentropy": 2.372869312763214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21234075352549553, "step": 3376 }, { "epoch": 0.2815, "grad_norm": 5.46875, "grad_norm_var": 0.08644205729166667, "learning_rate": 4e-05, "loss": 4.5008, "loss/crossentropy": 1.7059477791190147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1649102047085762, "step": 3378 }, { "epoch": 0.2816666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.0859375, "learning_rate": 4e-05, "loss": 5.1233, "loss/crossentropy": 2.4804338812828064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21589474752545357, "step": 3380 }, { "epoch": 0.2818333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.08381754557291667, "learning_rate": 4e-05, "loss": 4.868, "loss/crossentropy": 1.9849779903888702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23055313155055046, "step": 3382 }, { "epoch": 0.282, "grad_norm": 4.3125, "grad_norm_var": 0.11340738932291666, "learning_rate": 4e-05, "loss": 4.6629, "loss/crossentropy": 2.002195544540882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18036412820219994, "step": 3384 }, { "epoch": 0.2821666666666667, "grad_norm": 4.875, "grad_norm_var": 0.11770426432291667, "learning_rate": 4e-05, "loss": 5.5229, "loss/crossentropy": 2.409064471721649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20758257806301117, "step": 3386 }, { "epoch": 0.2823333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.12473551432291667, "learning_rate": 4e-05, "loss": 4.3955, "loss/crossentropy": 1.6170982271432877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2150630559772253, "step": 3388 }, { "epoch": 0.2825, "grad_norm": 5.0, "grad_norm_var": 0.208203125, "learning_rate": 4e-05, "loss": 4.9565, "loss/crossentropy": 2.2994788140058517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032534722238779, "step": 3390 }, { "epoch": 0.2826666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.18736572265625, "learning_rate": 4e-05, "loss": 4.9765, "loss/crossentropy": 1.881778173148632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19996779970824718, "step": 3392 }, { "epoch": 0.2828333333333333, "grad_norm": 4.625, "grad_norm_var": 0.18381754557291666, "learning_rate": 4e-05, "loss": 5.0977, "loss/crossentropy": 1.3992729112505913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14480482786893845, "step": 3394 }, { "epoch": 0.283, "grad_norm": 4.96875, "grad_norm_var": 0.18495686848958334, "learning_rate": 4e-05, "loss": 4.6631, "loss/crossentropy": 2.6132925748825073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21732110902667046, "step": 3396 }, { "epoch": 0.2831666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.19269205729166666, "learning_rate": 4e-05, "loss": 4.7744, "loss/crossentropy": 1.9939734041690826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19406055100262165, "step": 3398 }, { "epoch": 0.2833333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.16276041666666666, "learning_rate": 4e-05, "loss": 4.9806, "loss/crossentropy": 2.3022571206092834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050349861383438, "step": 3400 }, { "epoch": 0.2835, "grad_norm": 4.75, "grad_norm_var": 0.15948893229166666, "learning_rate": 4e-05, "loss": 4.7427, "loss/crossentropy": 1.8530340567231178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16540405713021755, "step": 3402 }, { "epoch": 0.2836666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.15429280598958334, "learning_rate": 4e-05, "loss": 4.7811, "loss/crossentropy": 2.265649139881134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2381511926651001, "step": 3404 }, { "epoch": 0.2838333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.04602864583333333, "learning_rate": 4e-05, "loss": 5.1025, "loss/crossentropy": 2.4468571543693542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2114931084215641, "step": 3406 }, { "epoch": 0.284, "grad_norm": 4.59375, "grad_norm_var": 0.05552978515625, "learning_rate": 4e-05, "loss": 4.8916, "loss/crossentropy": 1.9908486604690552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20576748996973038, "step": 3408 }, { "epoch": 0.2841666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.048173014322916666, "learning_rate": 4e-05, "loss": 4.3453, "loss/crossentropy": 1.8514483124017715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18919625878334045, "step": 3410 }, { "epoch": 0.2843333333333333, "grad_norm": 5.25, "grad_norm_var": 0.056494140625, "learning_rate": 4e-05, "loss": 5.3242, "loss/crossentropy": 1.6181135475635529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1568435113877058, "step": 3412 }, { "epoch": 0.2845, "grad_norm": 4.78125, "grad_norm_var": 0.06448160807291667, "learning_rate": 4e-05, "loss": 4.917, "loss/crossentropy": 1.7242164313793182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18557476624846458, "step": 3414 }, { "epoch": 0.2846666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.06378580729166666, "learning_rate": 4e-05, "loss": 4.9142, "loss/crossentropy": 2.7115097641944885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2387315072119236, "step": 3416 }, { "epoch": 0.2848333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.06672770182291667, "learning_rate": 4e-05, "loss": 4.5766, "loss/crossentropy": 1.780228778719902, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1914806980639696, "step": 3418 }, { "epoch": 0.285, "grad_norm": 4.78125, "grad_norm_var": 0.06864827473958333, "learning_rate": 4e-05, "loss": 4.7589, "loss/crossentropy": 1.842021107673645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18138272687792778, "step": 3420 }, { "epoch": 0.2851666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.05806884765625, "learning_rate": 4e-05, "loss": 5.3465, "loss/crossentropy": 2.4019596874713898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22154908254742622, "step": 3422 }, { "epoch": 0.2853333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.7114420572916667, "learning_rate": 4e-05, "loss": 4.7384, "loss/crossentropy": 2.0549621507525444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19369005970656872, "step": 3424 }, { "epoch": 0.2855, "grad_norm": 5.65625, "grad_norm_var": 0.6972615559895833, "learning_rate": 4e-05, "loss": 4.7996, "loss/crossentropy": 2.2193926870822906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2191501259803772, "step": 3426 }, { "epoch": 0.2856666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.7040974934895833, "learning_rate": 4e-05, "loss": 5.373, "loss/crossentropy": 2.4476476907730103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305923536419868, "step": 3428 }, { "epoch": 0.28583333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.73140869140625, "learning_rate": 4e-05, "loss": 4.2742, "loss/crossentropy": 2.048733487725258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17609414085745811, "step": 3430 }, { "epoch": 0.286, "grad_norm": 4.78125, "grad_norm_var": 0.733203125, "learning_rate": 4e-05, "loss": 4.9869, "loss/crossentropy": 2.569149136543274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2256152704358101, "step": 3432 }, { "epoch": 0.2861666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.72750244140625, "learning_rate": 4e-05, "loss": 5.0844, "loss/crossentropy": 1.9300435483455658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18370277993381023, "step": 3434 }, { "epoch": 0.28633333333333333, "grad_norm": 4.875, "grad_norm_var": 0.7176920572916666, "learning_rate": 4e-05, "loss": 5.0577, "loss/crossentropy": 2.043847441673279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19139155372977257, "step": 3436 }, { "epoch": 0.2865, "grad_norm": 5.15625, "grad_norm_var": 0.7329264322916667, "learning_rate": 4e-05, "loss": 4.9035, "loss/crossentropy": 1.8298492655158043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17233120650053024, "step": 3438 }, { "epoch": 0.2866666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.11510416666666666, "learning_rate": 4e-05, "loss": 4.9975, "loss/crossentropy": 2.417716324329376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20786355063319206, "step": 3440 }, { "epoch": 0.28683333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.08084309895833333, "learning_rate": 4e-05, "loss": 4.853, "loss/crossentropy": 1.0041667819023132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14175090938806534, "step": 3442 }, { "epoch": 0.287, "grad_norm": 4.9375, "grad_norm_var": 0.09130452473958334, "learning_rate": 4e-05, "loss": 4.6887, "loss/crossentropy": 2.447651743888855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20354222133755684, "step": 3444 }, { "epoch": 0.2871666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.09529622395833333, "learning_rate": 4e-05, "loss": 4.4465, "loss/crossentropy": 1.1983712315559387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18250321969389915, "step": 3446 }, { "epoch": 0.28733333333333333, "grad_norm": 5.125, "grad_norm_var": 0.12408854166666666, "learning_rate": 4e-05, "loss": 5.1078, "loss/crossentropy": 2.3835307359695435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20711689442396164, "step": 3448 }, { "epoch": 0.2875, "grad_norm": 4.6875, "grad_norm_var": 0.13255208333333332, "learning_rate": 4e-05, "loss": 4.579, "loss/crossentropy": 1.925173059105873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21384599804878235, "step": 3450 }, { "epoch": 0.2876666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.15198160807291666, "learning_rate": 4e-05, "loss": 4.6881, "loss/crossentropy": 2.016170620918274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19685513153672218, "step": 3452 }, { "epoch": 0.28783333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.10729166666666666, "learning_rate": 4e-05, "loss": 4.8844, "loss/crossentropy": 1.8985230028629303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19102028012275696, "step": 3454 }, { "epoch": 0.288, "grad_norm": 5.15625, "grad_norm_var": 0.10963134765625, "learning_rate": 4e-05, "loss": 5.2529, "loss/crossentropy": 2.064757615327835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18660160526633263, "step": 3456 }, { "epoch": 0.2881666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.11500244140625, "learning_rate": 4e-05, "loss": 4.5073, "loss/crossentropy": 2.4254125356674194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18952980637550354, "step": 3458 }, { "epoch": 0.28833333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.10279947916666667, "learning_rate": 4e-05, "loss": 4.7031, "loss/crossentropy": 2.1310142278671265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081185169517994, "step": 3460 }, { "epoch": 0.2885, "grad_norm": 4.875, "grad_norm_var": 0.09498291015625, "learning_rate": 4e-05, "loss": 5.0103, "loss/crossentropy": 1.6102216243743896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1663418561220169, "step": 3462 }, { "epoch": 0.2886666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.21265869140625, "learning_rate": 4e-05, "loss": 5.3356, "loss/crossentropy": 2.331478774547577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20860225334763527, "step": 3464 }, { "epoch": 0.28883333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.20702718098958334, "learning_rate": 4e-05, "loss": 4.9703, "loss/crossentropy": 1.7024380043148994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17736496776342392, "step": 3466 }, { "epoch": 0.289, "grad_norm": 4.75, "grad_norm_var": 0.1900390625, "learning_rate": 4e-05, "loss": 4.9114, "loss/crossentropy": 2.388719826936722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1955118253827095, "step": 3468 }, { "epoch": 0.2891666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.20167643229166668, "learning_rate": 4e-05, "loss": 4.9545, "loss/crossentropy": 2.093321107327938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18176774308085442, "step": 3470 }, { "epoch": 0.28933333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.20956624348958333, "learning_rate": 4e-05, "loss": 4.7973, "loss/crossentropy": 2.4865227341651917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278795726597309, "step": 3472 }, { "epoch": 0.2895, "grad_norm": 4.78125, "grad_norm_var": 0.21008707682291666, "learning_rate": 4e-05, "loss": 5.1973, "loss/crossentropy": 1.8062629103660583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775445736944675, "step": 3474 }, { "epoch": 0.2896666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.213525390625, "learning_rate": 4e-05, "loss": 4.5915, "loss/crossentropy": 2.489815413951874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21216319501399994, "step": 3476 }, { "epoch": 0.28983333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.20618082682291666, "learning_rate": 4e-05, "loss": 4.7876, "loss/crossentropy": 2.5675423741340637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23500743508338928, "step": 3478 }, { "epoch": 0.29, "grad_norm": 5.1875, "grad_norm_var": 0.04607747395833333, "learning_rate": 4e-05, "loss": 5.3603, "loss/crossentropy": 2.3670946955680847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2199428491294384, "step": 3480 }, { "epoch": 0.2901666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.03524983723958333, "learning_rate": 4e-05, "loss": 4.9564, "loss/crossentropy": 1.8418959975242615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22825615853071213, "step": 3482 }, { "epoch": 0.29033333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.036051432291666664, "learning_rate": 4e-05, "loss": 4.4244, "loss/crossentropy": 1.8724389523267746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19478942267596722, "step": 3484 }, { "epoch": 0.2905, "grad_norm": 5.09375, "grad_norm_var": 0.04407145182291667, "learning_rate": 4e-05, "loss": 4.4165, "loss/crossentropy": 2.111702561378479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20953534543514252, "step": 3486 }, { "epoch": 0.2906666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.711181640625, "learning_rate": 4e-05, "loss": 4.3107, "loss/crossentropy": 2.3148096799850464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190384529531002, "step": 3488 }, { "epoch": 0.29083333333333333, "grad_norm": 7.6875, "grad_norm_var": 1.09732666015625, "learning_rate": 4e-05, "loss": 4.9802, "loss/crossentropy": 1.6215331330895424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20154114998877048, "step": 3490 }, { "epoch": 0.291, "grad_norm": 5.0625, "grad_norm_var": 1.06236572265625, "learning_rate": 4e-05, "loss": 4.7553, "loss/crossentropy": 1.8760625272989273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19243442453444004, "step": 3492 }, { "epoch": 0.2911666666666667, "grad_norm": 5.15625, "grad_norm_var": 1.0556925455729167, "learning_rate": 4e-05, "loss": 5.3884, "loss/crossentropy": 2.2172908782958984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23113702610135078, "step": 3494 }, { "epoch": 0.29133333333333333, "grad_norm": 4.90625, "grad_norm_var": 1.0721964518229166, "learning_rate": 4e-05, "loss": 5.1201, "loss/crossentropy": 2.3262163400650024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20330826565623283, "step": 3496 }, { "epoch": 0.2915, "grad_norm": 4.53125, "grad_norm_var": 1.1188639322916667, "learning_rate": 4e-05, "loss": 4.4723, "loss/crossentropy": 2.01950091868639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16761692985892296, "step": 3498 }, { "epoch": 0.2916666666666667, "grad_norm": 4.96875, "grad_norm_var": 1.0830037434895834, "learning_rate": 4e-05, "loss": 4.668, "loss/crossentropy": 1.7684204503893852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895194798707962, "step": 3500 }, { "epoch": 0.29183333333333333, "grad_norm": 4.96875, "grad_norm_var": 1.0822224934895834, "learning_rate": 4e-05, "loss": 4.8008, "loss/crossentropy": 2.3736203610897064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22592481598258018, "step": 3502 }, { "epoch": 0.292, "grad_norm": 4.96875, "grad_norm_var": 0.6147135416666667, "learning_rate": 4e-05, "loss": 5.3605, "loss/crossentropy": 2.3942826986312866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2380472868680954, "step": 3504 }, { "epoch": 0.2921666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.19387613932291667, "learning_rate": 4e-05, "loss": 4.4185, "loss/crossentropy": 2.3212435841560364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20279332622885704, "step": 3506 }, { "epoch": 0.29233333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.20012613932291667, "learning_rate": 4e-05, "loss": 5.1964, "loss/crossentropy": 2.09537735581398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19826416298747063, "step": 3508 }, { "epoch": 0.2925, "grad_norm": 4.75, "grad_norm_var": 0.211572265625, "learning_rate": 4e-05, "loss": 5.1824, "loss/crossentropy": 2.1464912593364716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115538753569126, "step": 3510 }, { "epoch": 0.2926666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.22095947265625, "learning_rate": 4e-05, "loss": 4.8959, "loss/crossentropy": 0.9554274380207062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12400326877832413, "step": 3512 }, { "epoch": 0.29283333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.20963541666666666, "learning_rate": 4e-05, "loss": 4.5966, "loss/crossentropy": 1.952194757759571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851113010197878, "step": 3514 }, { "epoch": 0.293, "grad_norm": 4.75, "grad_norm_var": 0.20462239583333333, "learning_rate": 4e-05, "loss": 4.9804, "loss/crossentropy": 2.3729577362537384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21630284935235977, "step": 3516 }, { "epoch": 0.2931666666666667, "grad_norm": 5.0, "grad_norm_var": 0.19927978515625, "learning_rate": 4e-05, "loss": 4.7558, "loss/crossentropy": 2.4642003178596497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22600191831588745, "step": 3518 }, { "epoch": 0.29333333333333333, "grad_norm": 6.4375, "grad_norm_var": 0.20206705729166666, "learning_rate": 4e-05, "loss": 4.6977, "loss/crossentropy": 1.8401424586772919, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26591188833117485, "step": 3520 }, { "epoch": 0.2935, "grad_norm": 5.1875, "grad_norm_var": 0.19804280598958332, "learning_rate": 4e-05, "loss": 4.8546, "loss/crossentropy": 1.9076418355107307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19173485785722733, "step": 3522 }, { "epoch": 0.2936666666666667, "grad_norm": 5.0, "grad_norm_var": 0.20474853515625, "learning_rate": 4e-05, "loss": 5.6459, "loss/crossentropy": 2.5764644145965576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21692033484578133, "step": 3524 }, { "epoch": 0.29383333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.191015625, "learning_rate": 4e-05, "loss": 5.1832, "loss/crossentropy": 2.5007553696632385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23094122856855392, "step": 3526 }, { "epoch": 0.294, "grad_norm": 5.03125, "grad_norm_var": 0.1990234375, "learning_rate": 4e-05, "loss": 4.7084, "loss/crossentropy": 1.4607620611786842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1543761007487774, "step": 3528 }, { "epoch": 0.2941666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.20777587890625, "learning_rate": 4e-05, "loss": 4.6505, "loss/crossentropy": 2.0909395068883896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16600767709314823, "step": 3530 }, { "epoch": 0.29433333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.20909830729166667, "learning_rate": 4e-05, "loss": 4.6819, "loss/crossentropy": 0.9055970907211304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1309139858931303, "step": 3532 }, { "epoch": 0.2945, "grad_norm": 4.59375, "grad_norm_var": 0.23097330729166668, "learning_rate": 4e-05, "loss": 4.5222, "loss/crossentropy": 1.9780186116695404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18702267482876778, "step": 3534 }, { "epoch": 0.2946666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.08114827473958333, "learning_rate": 4e-05, "loss": 4.4649, "loss/crossentropy": 0.9073627293109894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10845192894339561, "step": 3536 }, { "epoch": 0.29483333333333334, "grad_norm": 4.375, "grad_norm_var": 0.10050455729166667, "learning_rate": 4e-05, "loss": 4.7735, "loss/crossentropy": 1.6618280410766602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16298267990350723, "step": 3538 }, { "epoch": 0.295, "grad_norm": 5.15625, "grad_norm_var": 0.0806640625, "learning_rate": 4e-05, "loss": 5.0086, "loss/crossentropy": 1.9391166269779205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18532704934477806, "step": 3540 }, { "epoch": 0.2951666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.09185791015625, "learning_rate": 4e-05, "loss": 5.1276, "loss/crossentropy": 2.1296051144599915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987944282591343, "step": 3542 }, { "epoch": 0.29533333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.09579671223958333, "learning_rate": 4e-05, "loss": 4.5496, "loss/crossentropy": 1.3152644261717796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15342802554368973, "step": 3544 }, { "epoch": 0.2955, "grad_norm": 4.71875, "grad_norm_var": 0.092041015625, "learning_rate": 4e-05, "loss": 4.6793, "loss/crossentropy": 2.123941093683243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20897839963436127, "step": 3546 }, { "epoch": 0.2956666666666667, "grad_norm": 4.875, "grad_norm_var": 0.08857014973958334, "learning_rate": 4e-05, "loss": 4.8814, "loss/crossentropy": 2.487266719341278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22740302234888077, "step": 3548 }, { "epoch": 0.29583333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.07704671223958333, "learning_rate": 4e-05, "loss": 5.4725, "loss/crossentropy": 2.515300452709198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22232673317193985, "step": 3550 }, { "epoch": 0.296, "grad_norm": 5.0, "grad_norm_var": 0.07159830729166666, "learning_rate": 4e-05, "loss": 5.225, "loss/crossentropy": 2.5012297928333282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24186847358942032, "step": 3552 }, { "epoch": 0.2961666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.055403645833333334, "learning_rate": 4e-05, "loss": 4.4848, "loss/crossentropy": 1.829747200012207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132105603814125, "step": 3554 }, { "epoch": 0.29633333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.05221354166666667, "learning_rate": 4e-05, "loss": 4.7871, "loss/crossentropy": 2.078303784132004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992538534104824, "step": 3556 }, { "epoch": 0.2965, "grad_norm": 4.6875, "grad_norm_var": 0.05115559895833333, "learning_rate": 4e-05, "loss": 4.1948, "loss/crossentropy": 1.9181992933154106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17587333172559738, "step": 3558 }, { "epoch": 0.2966666666666667, "grad_norm": 5.46875, "grad_norm_var": 0.05836181640625, "learning_rate": 4e-05, "loss": 5.1335, "loss/crossentropy": 2.469767451286316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22005610167980194, "step": 3560 }, { "epoch": 0.29683333333333334, "grad_norm": 5.125, "grad_norm_var": 0.06483968098958333, "learning_rate": 4e-05, "loss": 5.0746, "loss/crossentropy": 1.754760041832924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1840778887271881, "step": 3562 }, { "epoch": 0.297, "grad_norm": 4.875, "grad_norm_var": 0.06282552083333333, "learning_rate": 4e-05, "loss": 4.6255, "loss/crossentropy": 1.6421096697449684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16191892698407173, "step": 3564 }, { "epoch": 0.2971666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.05679931640625, "learning_rate": 4e-05, "loss": 4.8003, "loss/crossentropy": 1.5874068588018417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16946525312960148, "step": 3566 }, { "epoch": 0.29733333333333334, "grad_norm": 5.125, "grad_norm_var": 0.06601155598958333, "learning_rate": 4e-05, "loss": 4.9971, "loss/crossentropy": 2.3962987661361694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20658328384160995, "step": 3568 }, { "epoch": 0.2975, "grad_norm": 4.875, "grad_norm_var": 0.06623942057291667, "learning_rate": 4e-05, "loss": 5.5287, "loss/crossentropy": 2.5732553601264954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2334974780678749, "step": 3570 }, { "epoch": 0.2976666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.06647135416666666, "learning_rate": 4e-05, "loss": 5.0897, "loss/crossentropy": 2.3147547245025635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20426007360219955, "step": 3572 }, { "epoch": 0.29783333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.07278645833333333, "learning_rate": 4e-05, "loss": 4.5379, "loss/crossentropy": 1.3760528713464737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13978317752480507, "step": 3574 }, { "epoch": 0.298, "grad_norm": 5.8125, "grad_norm_var": 0.27421468098958335, "learning_rate": 4e-05, "loss": 5.0086, "loss/crossentropy": 1.6505027040839195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21621382609009743, "step": 3576 }, { "epoch": 0.2981666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.27554931640625, "learning_rate": 4e-05, "loss": 5.0922, "loss/crossentropy": 1.40341117978096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16107412800192833, "step": 3578 }, { "epoch": 0.29833333333333334, "grad_norm": 5.25, "grad_norm_var": 0.279931640625, "learning_rate": 4e-05, "loss": 4.7093, "loss/crossentropy": 1.147512048482895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15361671149730682, "step": 3580 }, { "epoch": 0.2985, "grad_norm": 4.75, "grad_norm_var": 0.28746337890625, "learning_rate": 4e-05, "loss": 5.0159, "loss/crossentropy": 1.4668299034237862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17344230972230434, "step": 3582 }, { "epoch": 0.2986666666666667, "grad_norm": 4.75, "grad_norm_var": 0.2789713541666667, "learning_rate": 4e-05, "loss": 4.8091, "loss/crossentropy": 1.2716411352157593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1398077104240656, "step": 3584 }, { "epoch": 0.29883333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.2873331705729167, "learning_rate": 4e-05, "loss": 4.7028, "loss/crossentropy": 2.2946461737155914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183996111154556, "step": 3586 }, { "epoch": 0.299, "grad_norm": 4.90625, "grad_norm_var": 0.291650390625, "learning_rate": 4e-05, "loss": 5.2464, "loss/crossentropy": 1.7846654728055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16545471921563148, "step": 3588 }, { "epoch": 0.2991666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.26832275390625, "learning_rate": 4e-05, "loss": 4.6774, "loss/crossentropy": 1.7224418818950653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17625931091606617, "step": 3590 }, { "epoch": 0.29933333333333334, "grad_norm": 5.25, "grad_norm_var": 0.05462239583333333, "learning_rate": 4e-05, "loss": 5.2031, "loss/crossentropy": 2.0435468032956123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18544995225965977, "step": 3592 }, { "epoch": 0.2995, "grad_norm": 4.78125, "grad_norm_var": 0.055924479166666666, "learning_rate": 4e-05, "loss": 5.073, "loss/crossentropy": 2.0027381628751755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18559293076395988, "step": 3594 }, { "epoch": 0.2996666666666667, "grad_norm": 4.9375, "grad_norm_var": 1.8641560872395833, "learning_rate": 4e-05, "loss": 4.5997, "loss/crossentropy": 1.6425791680812836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16363847069442272, "step": 3596 }, { "epoch": 0.29983333333333334, "grad_norm": 5.34375, "grad_norm_var": 1.8575480143229166, "learning_rate": 4e-05, "loss": 4.984, "loss/crossentropy": 2.020586669445038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21033752337098122, "step": 3598 }, { "epoch": 0.3, "grad_norm": 5.34375, "grad_norm_var": 1.85592041015625, "learning_rate": 4e-05, "loss": 5.0516, "loss/crossentropy": 2.270563930273056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22760266438126564, "step": 3600 }, { "epoch": 0.3001666666666667, "grad_norm": 4.65625, "grad_norm_var": 1.8462076822916667, "learning_rate": 4e-05, "loss": 5.1508, "loss/crossentropy": 2.489174962043762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22328739613294601, "step": 3602 }, { "epoch": 0.30033333333333334, "grad_norm": 5.0, "grad_norm_var": 1.8499959309895833, "learning_rate": 4e-05, "loss": 4.8976, "loss/crossentropy": 1.9335657581686974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18224405869841576, "step": 3604 }, { "epoch": 0.3005, "grad_norm": 5.0, "grad_norm_var": 1.8414021809895833, "learning_rate": 4e-05, "loss": 5.4268, "loss/crossentropy": 1.7507117837667465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16100936010479927, "step": 3606 }, { "epoch": 0.3006666666666667, "grad_norm": 4.96875, "grad_norm_var": 1.9059244791666667, "learning_rate": 4e-05, "loss": 4.8097, "loss/crossentropy": 2.112537205219269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001986764371395, "step": 3608 }, { "epoch": 0.30083333333333334, "grad_norm": 5.21875, "grad_norm_var": 1.8837076822916667, "learning_rate": 4e-05, "loss": 5.5574, "loss/crossentropy": 2.74730384349823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2495119497179985, "step": 3610 }, { "epoch": 0.301, "grad_norm": 4.96875, "grad_norm_var": 0.12821858723958332, "learning_rate": 4e-05, "loss": 5.0715, "loss/crossentropy": 2.563708484172821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22458581998944283, "step": 3612 }, { "epoch": 0.3011666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.0716796875, "learning_rate": 4e-05, "loss": 4.237, "loss/crossentropy": 2.0116556510329247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19622116163372993, "step": 3614 }, { "epoch": 0.30133333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.05546875, "learning_rate": 4e-05, "loss": 4.3814, "loss/crossentropy": 2.005406975746155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17633453384041786, "step": 3616 }, { "epoch": 0.3015, "grad_norm": 4.65625, "grad_norm_var": 0.042801920572916666, "learning_rate": 4e-05, "loss": 4.6612, "loss/crossentropy": 1.9902734458446503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19928394444286823, "step": 3618 }, { "epoch": 0.3016666666666667, "grad_norm": 5.0, "grad_norm_var": 0.04230143229166667, "learning_rate": 4e-05, "loss": 4.388, "loss/crossentropy": 1.580359660089016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15102678537368774, "step": 3620 }, { "epoch": 0.30183333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.04609375, "learning_rate": 4e-05, "loss": 4.5381, "loss/crossentropy": 0.9725519716739655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12142913416028023, "step": 3622 }, { "epoch": 0.302, "grad_norm": 5.5, "grad_norm_var": 0.054671223958333334, "learning_rate": 4e-05, "loss": 5.0478, "loss/crossentropy": 2.1247295141220093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23397767543792725, "step": 3624 }, { "epoch": 0.30216666666666664, "grad_norm": 4.53125, "grad_norm_var": 0.06005452473958333, "learning_rate": 4e-05, "loss": 4.3377, "loss/crossentropy": 0.9640841111540794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19951742701232433, "step": 3626 }, { "epoch": 0.30233333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.06534830729166667, "learning_rate": 4e-05, "loss": 4.5658, "loss/crossentropy": 2.2950040102005005, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21112338826060295, "step": 3628 }, { "epoch": 0.3025, "grad_norm": 5.125, "grad_norm_var": 0.08375244140625, "learning_rate": 4e-05, "loss": 5.3449, "loss/crossentropy": 2.4528151154518127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23017141222953796, "step": 3630 }, { "epoch": 0.30266666666666664, "grad_norm": 5.0625, "grad_norm_var": 0.08409830729166666, "learning_rate": 4e-05, "loss": 5.032, "loss/crossentropy": 1.951070874929428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18713463097810745, "step": 3632 }, { "epoch": 0.30283333333333334, "grad_norm": 5.0, "grad_norm_var": 0.07734375, "learning_rate": 4e-05, "loss": 4.9745, "loss/crossentropy": 1.9134965389966965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16628370434045792, "step": 3634 }, { "epoch": 0.303, "grad_norm": 4.90625, "grad_norm_var": 0.07965087890625, "learning_rate": 4e-05, "loss": 4.6723, "loss/crossentropy": 2.3537526428699493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268533818423748, "step": 3636 }, { "epoch": 0.30316666666666664, "grad_norm": 4.96875, "grad_norm_var": 0.06760660807291667, "learning_rate": 4e-05, "loss": 5.6514, "loss/crossentropy": 2.3131695985794067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19909925386309624, "step": 3638 }, { "epoch": 0.30333333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.058329264322916664, "learning_rate": 4e-05, "loss": 4.8647, "loss/crossentropy": 2.0087155923247337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793438270688057, "step": 3640 }, { "epoch": 0.3035, "grad_norm": 5.375, "grad_norm_var": 0.05432535807291667, "learning_rate": 4e-05, "loss": 4.9107, "loss/crossentropy": 2.076196014881134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22751999273896217, "step": 3642 }, { "epoch": 0.30366666666666664, "grad_norm": 5.0, "grad_norm_var": 0.047526041666666664, "learning_rate": 4e-05, "loss": 5.6599, "loss/crossentropy": 2.399946093559265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21956488490104675, "step": 3644 }, { "epoch": 0.30383333333333334, "grad_norm": 4.875, "grad_norm_var": 0.03313395182291667, "learning_rate": 4e-05, "loss": 4.3474, "loss/crossentropy": 1.9796275794506073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20108654350042343, "step": 3646 }, { "epoch": 0.304, "grad_norm": 4.6875, "grad_norm_var": 0.03508707682291667, "learning_rate": 4e-05, "loss": 4.3212, "loss/crossentropy": 1.328979179263115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1541975736618042, "step": 3648 }, { "epoch": 0.30416666666666664, "grad_norm": 4.34375, "grad_norm_var": 0.07428385416666666, "learning_rate": 4e-05, "loss": 4.2876, "loss/crossentropy": 2.024112194776535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18114975281059742, "step": 3650 }, { "epoch": 0.30433333333333334, "grad_norm": 4.625, "grad_norm_var": 0.07897135416666666, "learning_rate": 4e-05, "loss": 4.8026, "loss/crossentropy": 2.082690417766571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21567688882350922, "step": 3652 }, { "epoch": 0.3045, "grad_norm": 4.625, "grad_norm_var": 0.09465738932291666, "learning_rate": 4e-05, "loss": 5.3316, "loss/crossentropy": 2.600900650024414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2172529399394989, "step": 3654 }, { "epoch": 0.30466666666666664, "grad_norm": 4.71875, "grad_norm_var": 0.09465738932291666, "learning_rate": 4e-05, "loss": 5.3408, "loss/crossentropy": 2.4749475717544556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20883005484938622, "step": 3656 }, { "epoch": 0.30483333333333335, "grad_norm": 5.65625, "grad_norm_var": 0.11627197265625, "learning_rate": 4e-05, "loss": 4.8844, "loss/crossentropy": 2.0045883879065514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1844545528292656, "step": 3658 }, { "epoch": 0.305, "grad_norm": 5.375, "grad_norm_var": 0.128369140625, "learning_rate": 4e-05, "loss": 5.303, "loss/crossentropy": 2.2967261970043182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20461099222302437, "step": 3660 }, { "epoch": 0.30516666666666664, "grad_norm": 5.25, "grad_norm_var": 0.13121337890625, "learning_rate": 4e-05, "loss": 5.1984, "loss/crossentropy": 2.034844785928726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299644947052002, "step": 3662 }, { "epoch": 0.30533333333333335, "grad_norm": 4.90625, "grad_norm_var": 0.127197265625, "learning_rate": 4e-05, "loss": 4.2718, "loss/crossentropy": 2.4984880089759827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2178654558956623, "step": 3664 }, { "epoch": 0.3055, "grad_norm": 4.90625, "grad_norm_var": 0.08873697916666666, "learning_rate": 4e-05, "loss": 5.309, "loss/crossentropy": 2.227113127708435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22134817764163017, "step": 3666 }, { "epoch": 0.30566666666666664, "grad_norm": 5.5625, "grad_norm_var": 0.09855143229166667, "learning_rate": 4e-05, "loss": 5.3596, "loss/crossentropy": 2.1710298359394073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19393081590533257, "step": 3668 }, { "epoch": 0.30583333333333335, "grad_norm": 5.25, "grad_norm_var": 0.07805582682291666, "learning_rate": 4e-05, "loss": 4.8099, "loss/crossentropy": 2.4346525073051453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2716682218015194, "step": 3670 }, { "epoch": 0.306, "grad_norm": 4.90625, "grad_norm_var": 0.07301025390625, "learning_rate": 4e-05, "loss": 4.8859, "loss/crossentropy": 1.582870475947857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15237391367554665, "step": 3672 }, { "epoch": 0.30616666666666664, "grad_norm": 4.59375, "grad_norm_var": 0.06607666015625, "learning_rate": 4e-05, "loss": 5.2553, "loss/crossentropy": 1.999775506556034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884305290877819, "step": 3674 }, { "epoch": 0.30633333333333335, "grad_norm": 5.0, "grad_norm_var": 0.05813802083333333, "learning_rate": 4e-05, "loss": 4.5563, "loss/crossentropy": 1.2396316081285477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1338381376117468, "step": 3676 }, { "epoch": 0.3065, "grad_norm": 5.0625, "grad_norm_var": 0.052718098958333334, "learning_rate": 4e-05, "loss": 4.8746, "loss/crossentropy": 2.200127214193344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.212520282715559, "step": 3678 }, { "epoch": 0.30666666666666664, "grad_norm": 4.53125, "grad_norm_var": 0.07420247395833333, "learning_rate": 4e-05, "loss": 4.7778, "loss/crossentropy": 2.107081711292267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995658278465271, "step": 3680 }, { "epoch": 0.30683333333333335, "grad_norm": 5.6875, "grad_norm_var": 0.11503499348958333, "learning_rate": 4e-05, "loss": 5.2662, "loss/crossentropy": 2.5890790224075317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21765975654125214, "step": 3682 }, { "epoch": 0.307, "grad_norm": 5.4375, "grad_norm_var": 0.10989176432291667, "learning_rate": 4e-05, "loss": 4.6385, "loss/crossentropy": 2.281311720609665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19055946916341782, "step": 3684 }, { "epoch": 0.30716666666666664, "grad_norm": 4.96875, "grad_norm_var": 0.10813802083333333, "learning_rate": 4e-05, "loss": 4.6168, "loss/crossentropy": 2.4844754934310913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2209065817296505, "step": 3686 }, { "epoch": 0.30733333333333335, "grad_norm": 4.5, "grad_norm_var": 0.12333577473958333, "learning_rate": 4e-05, "loss": 4.0221, "loss/crossentropy": 1.6593739092350006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19019902870059013, "step": 3688 }, { "epoch": 0.3075, "grad_norm": 4.90625, "grad_norm_var": 0.13010660807291666, "learning_rate": 4e-05, "loss": 4.9422, "loss/crossentropy": 2.1268528401851654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18465667217969894, "step": 3690 }, { "epoch": 0.30766666666666664, "grad_norm": 4.84375, "grad_norm_var": 0.131494140625, "learning_rate": 4e-05, "loss": 4.437, "loss/crossentropy": 1.5557752773165703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.177669333294034, "step": 3692 }, { "epoch": 0.30783333333333335, "grad_norm": 4.71875, "grad_norm_var": 0.13357747395833333, "learning_rate": 4e-05, "loss": 4.3403, "loss/crossentropy": 1.798197090625763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15706698969006538, "step": 3694 }, { "epoch": 0.308, "grad_norm": 4.46875, "grad_norm_var": 0.12081705729166667, "learning_rate": 4e-05, "loss": 4.6368, "loss/crossentropy": 1.8836499452590942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17866197228431702, "step": 3696 }, { "epoch": 0.30816666666666664, "grad_norm": 5.125, "grad_norm_var": 0.083447265625, "learning_rate": 4e-05, "loss": 5.3486, "loss/crossentropy": 2.1042481660842896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17958716675639153, "step": 3698 }, { "epoch": 0.30833333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.0623046875, "learning_rate": 4e-05, "loss": 5.0219, "loss/crossentropy": 2.4937482476234436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2245134860277176, "step": 3700 }, { "epoch": 0.3085, "grad_norm": 4.75, "grad_norm_var": 0.06243082682291667, "learning_rate": 4e-05, "loss": 4.4926, "loss/crossentropy": 1.953664094209671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048991098999977, "step": 3702 }, { "epoch": 0.30866666666666664, "grad_norm": 5.125, "grad_norm_var": 0.0521484375, "learning_rate": 4e-05, "loss": 5.2685, "loss/crossentropy": 2.191207781434059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21109728887677193, "step": 3704 }, { "epoch": 0.30883333333333335, "grad_norm": 4.90625, "grad_norm_var": 0.03722330729166667, "learning_rate": 4e-05, "loss": 4.9706, "loss/crossentropy": 2.088053673505783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20717396587133408, "step": 3706 }, { "epoch": 0.309, "grad_norm": 4.90625, "grad_norm_var": 0.038863118489583334, "learning_rate": 4e-05, "loss": 4.7178, "loss/crossentropy": 1.7166788578033447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16185236908495426, "step": 3708 }, { "epoch": 0.30916666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.040999348958333334, "learning_rate": 4e-05, "loss": 5.0539, "loss/crossentropy": 2.4472317695617676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22122575715184212, "step": 3710 }, { "epoch": 0.30933333333333335, "grad_norm": 5.6875, "grad_norm_var": 0.058333333333333334, "learning_rate": 4e-05, "loss": 5.3555, "loss/crossentropy": 1.8674227595329285, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979275792837143, "step": 3712 }, { "epoch": 0.3095, "grad_norm": 4.78125, "grad_norm_var": 0.062483723958333334, "learning_rate": 4e-05, "loss": 4.3914, "loss/crossentropy": 2.116463601589203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22198039293289185, "step": 3714 }, { "epoch": 0.30966666666666665, "grad_norm": 4.59375, "grad_norm_var": 0.09490559895833334, "learning_rate": 4e-05, "loss": 4.6182, "loss/crossentropy": 2.104882702231407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18206360936164856, "step": 3716 }, { "epoch": 0.30983333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.09440104166666667, "learning_rate": 4e-05, "loss": 4.3327, "loss/crossentropy": 1.162502907216549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16418416053056717, "step": 3718 }, { "epoch": 0.31, "grad_norm": 4.90625, "grad_norm_var": 0.09283447265625, "learning_rate": 4e-05, "loss": 4.6641, "loss/crossentropy": 1.5889663323760033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1913046780973673, "step": 3720 }, { "epoch": 0.31016666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.09950764973958333, "learning_rate": 4e-05, "loss": 4.6431, "loss/crossentropy": 2.3483422100543976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22202644869685173, "step": 3722 }, { "epoch": 0.31033333333333335, "grad_norm": 6.21875, "grad_norm_var": 0.19620768229166666, "learning_rate": 4e-05, "loss": 5.4244, "loss/crossentropy": 2.0444701313972473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22006652504205704, "step": 3724 }, { "epoch": 0.3105, "grad_norm": 4.90625, "grad_norm_var": 0.19073893229166666, "learning_rate": 4e-05, "loss": 4.9727, "loss/crossentropy": 2.4517840147018433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100413180887699, "step": 3726 }, { "epoch": 0.31066666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.16324462890625, "learning_rate": 4e-05, "loss": 5.1654, "loss/crossentropy": 1.9761425107717514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948082633316517, "step": 3728 }, { "epoch": 0.31083333333333335, "grad_norm": 4.875, "grad_norm_var": 0.153759765625, "learning_rate": 4e-05, "loss": 4.8851, "loss/crossentropy": 1.5147030353546143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16606771387159824, "step": 3730 }, { "epoch": 0.311, "grad_norm": 4.8125, "grad_norm_var": 0.12649332682291667, "learning_rate": 4e-05, "loss": 4.8831, "loss/crossentropy": 2.117761880159378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.187788724899292, "step": 3732 }, { "epoch": 0.31116666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.12493082682291666, "learning_rate": 4e-05, "loss": 4.7365, "loss/crossentropy": 2.5555994510650635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22206680104136467, "step": 3734 }, { "epoch": 0.31133333333333335, "grad_norm": 4.71875, "grad_norm_var": 0.14605712890625, "learning_rate": 4e-05, "loss": 5.1176, "loss/crossentropy": 1.7837401628494263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18287081271409988, "step": 3736 }, { "epoch": 0.3115, "grad_norm": 4.8125, "grad_norm_var": 0.1443359375, "learning_rate": 4e-05, "loss": 5.3582, "loss/crossentropy": 2.330405503511429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22329926490783691, "step": 3738 }, { "epoch": 0.31166666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.043212890625, "learning_rate": 4e-05, "loss": 4.8281, "loss/crossentropy": 1.5541361793875694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18051035329699516, "step": 3740 }, { "epoch": 0.31183333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.057352701822916664, "learning_rate": 4e-05, "loss": 4.9007, "loss/crossentropy": 2.17046582698822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19161805883049965, "step": 3742 }, { "epoch": 0.312, "grad_norm": 5.21875, "grad_norm_var": 0.0638671875, "learning_rate": 4e-05, "loss": 5.3251, "loss/crossentropy": 2.254283905029297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19508297741413116, "step": 3744 }, { "epoch": 0.31216666666666665, "grad_norm": 4.71875, "grad_norm_var": 0.06750895182291666, "learning_rate": 4e-05, "loss": 5.1207, "loss/crossentropy": 2.116343930363655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21743535064160824, "step": 3746 }, { "epoch": 0.31233333333333335, "grad_norm": 5.46875, "grad_norm_var": 0.08072916666666667, "learning_rate": 4e-05, "loss": 4.6328, "loss/crossentropy": 2.3838615715503693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21325530484318733, "step": 3748 }, { "epoch": 0.3125, "grad_norm": 5.03125, "grad_norm_var": 0.08307291666666666, "learning_rate": 4e-05, "loss": 4.9696, "loss/crossentropy": 1.8378008008003235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19234362617135048, "step": 3750 }, { "epoch": 0.31266666666666665, "grad_norm": 5.0, "grad_norm_var": 0.05933837890625, "learning_rate": 4e-05, "loss": 5.1705, "loss/crossentropy": 2.1367595493793488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20700618252158165, "step": 3752 }, { "epoch": 0.31283333333333335, "grad_norm": 5.15625, "grad_norm_var": 0.06483968098958333, "learning_rate": 4e-05, "loss": 4.9547, "loss/crossentropy": 1.8602565303444862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17942636832594872, "step": 3754 }, { "epoch": 0.313, "grad_norm": 4.90625, "grad_norm_var": 0.06610921223958334, "learning_rate": 4e-05, "loss": 4.1205, "loss/crossentropy": 1.9927891492843628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2000282108783722, "step": 3756 }, { "epoch": 0.31316666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.048140462239583334, "learning_rate": 4e-05, "loss": 4.6056, "loss/crossentropy": 2.0198487788438797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19490646198391914, "step": 3758 }, { "epoch": 0.31333333333333335, "grad_norm": 4.78125, "grad_norm_var": 0.050390625, "learning_rate": 4e-05, "loss": 5.2929, "loss/crossentropy": 2.36793053150177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22241582348942757, "step": 3760 }, { "epoch": 0.3135, "grad_norm": 4.71875, "grad_norm_var": 0.06569010416666667, "learning_rate": 4e-05, "loss": 4.9795, "loss/crossentropy": 2.4190186858177185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21430841460824013, "step": 3762 }, { "epoch": 0.31366666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.05250244140625, "learning_rate": 4e-05, "loss": 4.9606, "loss/crossentropy": 1.79479618370533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916802916675806, "step": 3764 }, { "epoch": 0.31383333333333335, "grad_norm": 5.21875, "grad_norm_var": 0.07081705729166667, "learning_rate": 4e-05, "loss": 4.373, "loss/crossentropy": 1.937475398182869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18721584975719452, "step": 3766 }, { "epoch": 0.314, "grad_norm": 4.71875, "grad_norm_var": 0.07121988932291666, "learning_rate": 4e-05, "loss": 4.286, "loss/crossentropy": 1.4481577202677727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16050837188959122, "step": 3768 }, { "epoch": 0.31416666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.06432291666666666, "learning_rate": 4e-05, "loss": 4.9644, "loss/crossentropy": 1.147791676223278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13392825238406658, "step": 3770 }, { "epoch": 0.31433333333333335, "grad_norm": 4.9375, "grad_norm_var": 0.06131184895833333, "learning_rate": 4e-05, "loss": 4.6804, "loss/crossentropy": 1.4648746028542519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1747361645102501, "step": 3772 }, { "epoch": 0.3145, "grad_norm": 4.78125, "grad_norm_var": 0.0615234375, "learning_rate": 4e-05, "loss": 4.8572, "loss/crossentropy": 2.5256667137145996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21758800372481346, "step": 3774 }, { "epoch": 0.31466666666666665, "grad_norm": 4.96875, "grad_norm_var": 0.05904541015625, "learning_rate": 4e-05, "loss": 5.6007, "loss/crossentropy": 2.4053181409835815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21975573524832726, "step": 3776 }, { "epoch": 0.31483333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.03664957682291667, "learning_rate": 4e-05, "loss": 4.6974, "loss/crossentropy": 1.6105652749538422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15871884860098362, "step": 3778 }, { "epoch": 0.315, "grad_norm": 4.46875, "grad_norm_var": 0.0544921875, "learning_rate": 4e-05, "loss": 4.3221, "loss/crossentropy": 1.7338557913899422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18251017853617668, "step": 3780 }, { "epoch": 0.31516666666666665, "grad_norm": 4.59375, "grad_norm_var": 0.045947265625, "learning_rate": 4e-05, "loss": 4.4713, "loss/crossentropy": 1.9897050261497498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19093642942607403, "step": 3782 }, { "epoch": 0.31533333333333335, "grad_norm": 5.1875, "grad_norm_var": 0.054427083333333334, "learning_rate": 4e-05, "loss": 4.9917, "loss/crossentropy": 1.9837996065616608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19374994188547134, "step": 3784 }, { "epoch": 0.3155, "grad_norm": 5.15625, "grad_norm_var": 0.0572265625, "learning_rate": 4e-05, "loss": 4.8675, "loss/crossentropy": 2.3804187774658203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22300543636083603, "step": 3786 }, { "epoch": 0.31566666666666665, "grad_norm": 5.0625, "grad_norm_var": 0.05806884765625, "learning_rate": 4e-05, "loss": 4.8823, "loss/crossentropy": 2.190940797328949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202349789440632, "step": 3788 }, { "epoch": 0.31583333333333335, "grad_norm": 4.59375, "grad_norm_var": 0.07258707682291667, "learning_rate": 4e-05, "loss": 4.7738, "loss/crossentropy": 1.6915459632873535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17686060070991516, "step": 3790 }, { "epoch": 0.316, "grad_norm": 5.53125, "grad_norm_var": 0.10279947916666667, "learning_rate": 4e-05, "loss": 5.4494, "loss/crossentropy": 1.9292872324585915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17759693413972855, "step": 3792 }, { "epoch": 0.31616666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.09698893229166666, "learning_rate": 4e-05, "loss": 5.0908, "loss/crossentropy": 1.4473036751151085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1512235328555107, "step": 3794 }, { "epoch": 0.31633333333333336, "grad_norm": 4.875, "grad_norm_var": 0.07909749348958334, "learning_rate": 4e-05, "loss": 4.9736, "loss/crossentropy": 2.1033048927783966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22066448256373405, "step": 3796 }, { "epoch": 0.3165, "grad_norm": 4.9375, "grad_norm_var": 0.08420817057291667, "learning_rate": 4e-05, "loss": 4.3264, "loss/crossentropy": 0.7586923539638519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10199865326285362, "step": 3798 }, { "epoch": 0.31666666666666665, "grad_norm": 4.4375, "grad_norm_var": 0.10089518229166666, "learning_rate": 4e-05, "loss": 4.2479, "loss/crossentropy": 1.5026521235704422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14725450798869133, "step": 3800 }, { "epoch": 0.31683333333333336, "grad_norm": 4.90625, "grad_norm_var": 0.10911458333333333, "learning_rate": 4e-05, "loss": 4.5031, "loss/crossentropy": 1.4894457682967186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17442147992551327, "step": 3802 }, { "epoch": 0.317, "grad_norm": 4.84375, "grad_norm_var": 0.10983072916666667, "learning_rate": 4e-05, "loss": 4.185, "loss/crossentropy": 1.578904926776886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1705641932785511, "step": 3804 }, { "epoch": 0.31716666666666665, "grad_norm": 5.21875, "grad_norm_var": 0.09895833333333333, "learning_rate": 4e-05, "loss": 4.8458, "loss/crossentropy": 2.099468767642975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21197058260440826, "step": 3806 }, { "epoch": 0.31733333333333336, "grad_norm": 4.6875, "grad_norm_var": 0.97633056640625, "learning_rate": 4e-05, "loss": 4.7211, "loss/crossentropy": 0.40370237082242966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.08282195776700974, "step": 3808 }, { "epoch": 0.3175, "grad_norm": 5.03125, "grad_norm_var": 0.9772745768229166, "learning_rate": 4e-05, "loss": 5.1344, "loss/crossentropy": 1.8095313608646393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19387658312916756, "step": 3810 }, { "epoch": 0.31766666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.9825358072916667, "learning_rate": 4e-05, "loss": 4.6487, "loss/crossentropy": 1.3583406507968903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14877759665250778, "step": 3812 }, { "epoch": 0.31783333333333336, "grad_norm": 4.46875, "grad_norm_var": 0.9834635416666667, "learning_rate": 4e-05, "loss": 4.9006, "loss/crossentropy": 1.9160986170172691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18941473588347435, "step": 3814 }, { "epoch": 0.318, "grad_norm": 4.75, "grad_norm_var": 0.94478759765625, "learning_rate": 4e-05, "loss": 4.9874, "loss/crossentropy": 2.3055627644062042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2155849188566208, "step": 3816 }, { "epoch": 0.31816666666666665, "grad_norm": 4.875, "grad_norm_var": 0.9505045572916667, "learning_rate": 4e-05, "loss": 4.5018, "loss/crossentropy": 1.952782303094864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18765784800052643, "step": 3818 }, { "epoch": 0.31833333333333336, "grad_norm": 4.9375, "grad_norm_var": 0.9403483072916666, "learning_rate": 4e-05, "loss": 5.0223, "loss/crossentropy": 2.0353624671697617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889826748520136, "step": 3820 }, { "epoch": 0.3185, "grad_norm": 4.90625, "grad_norm_var": 0.9404581705729167, "learning_rate": 4e-05, "loss": 4.8417, "loss/crossentropy": 2.015192322432995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17999350652098656, "step": 3822 }, { "epoch": 0.31866666666666665, "grad_norm": 4.5625, "grad_norm_var": 0.053999837239583334, "learning_rate": 4e-05, "loss": 4.6136, "loss/crossentropy": 1.6971989944577217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18851319141685963, "step": 3824 }, { "epoch": 0.31883333333333336, "grad_norm": 4.78125, "grad_norm_var": 0.04108072916666667, "learning_rate": 4e-05, "loss": 5.0423, "loss/crossentropy": 2.3695130348205566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2304910495877266, "step": 3826 }, { "epoch": 0.319, "grad_norm": 4.84375, "grad_norm_var": 0.043745930989583334, "learning_rate": 4e-05, "loss": 5.2882, "loss/crossentropy": 2.449763298034668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22548651695251465, "step": 3828 }, { "epoch": 0.31916666666666665, "grad_norm": 4.53125, "grad_norm_var": 0.039286295572916664, "learning_rate": 4e-05, "loss": 4.4039, "loss/crossentropy": 1.6869621872901917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16885101795196533, "step": 3830 }, { "epoch": 0.31933333333333336, "grad_norm": 5.4375, "grad_norm_var": 0.05597330729166667, "learning_rate": 4e-05, "loss": 5.409, "loss/crossentropy": 2.49446177482605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22949394211173058, "step": 3832 }, { "epoch": 0.3195, "grad_norm": 4.96875, "grad_norm_var": 0.05478108723958333, "learning_rate": 4e-05, "loss": 4.5315, "loss/crossentropy": 2.66249018907547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23358352109789848, "step": 3834 }, { "epoch": 0.31966666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.05045166015625, "learning_rate": 4e-05, "loss": 4.7935, "loss/crossentropy": 2.1069458723068237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21881438791751862, "step": 3836 }, { "epoch": 0.31983333333333336, "grad_norm": 4.84375, "grad_norm_var": 0.05243733723958333, "learning_rate": 4e-05, "loss": 5.1939, "loss/crossentropy": 1.933380126953125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17562244832515717, "step": 3838 }, { "epoch": 0.32, "grad_norm": 5.03125, "grad_norm_var": 0.04023030598958333, "learning_rate": 4e-05, "loss": 5.0553, "loss/crossentropy": 2.7020374536514282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23687465488910675, "step": 3840 }, { "epoch": 0.32016666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.264453125, "learning_rate": 4e-05, "loss": 4.9235, "loss/crossentropy": 1.614741176366806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20109020546078682, "step": 3842 }, { "epoch": 0.32033333333333336, "grad_norm": 5.0, "grad_norm_var": 0.26038004557291666, "learning_rate": 4e-05, "loss": 5.7655, "loss/crossentropy": 2.1124462485313416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2182532399892807, "step": 3844 }, { "epoch": 0.3205, "grad_norm": 5.03125, "grad_norm_var": 0.2306640625, "learning_rate": 4e-05, "loss": 5.5571, "loss/crossentropy": 1.813891276717186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19078289158642292, "step": 3846 }, { "epoch": 0.32066666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.2259765625, "learning_rate": 4e-05, "loss": 5.1904, "loss/crossentropy": 1.3256629407405853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13756003230810165, "step": 3848 }, { "epoch": 0.32083333333333336, "grad_norm": 4.53125, "grad_norm_var": 0.24933268229166666, "learning_rate": 4e-05, "loss": 4.5194, "loss/crossentropy": 1.691509410738945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18102239537984133, "step": 3850 }, { "epoch": 0.321, "grad_norm": 4.6875, "grad_norm_var": 0.26067708333333334, "learning_rate": 4e-05, "loss": 4.5382, "loss/crossentropy": 1.8079805970191956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17653886415064335, "step": 3852 }, { "epoch": 0.32116666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.278125, "learning_rate": 4e-05, "loss": 5.7217, "loss/crossentropy": 1.882356882095337, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1414974480867386, "step": 3854 }, { "epoch": 0.32133333333333336, "grad_norm": 4.5, "grad_norm_var": 0.31217041015625, "learning_rate": 4e-05, "loss": 4.301, "loss/crossentropy": 1.9757508039474487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19711638614535332, "step": 3856 }, { "epoch": 0.3215, "grad_norm": 5.0625, "grad_norm_var": 0.10032552083333333, "learning_rate": 4e-05, "loss": 5.2709, "loss/crossentropy": 1.9217640459537506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18658523634076118, "step": 3858 }, { "epoch": 0.32166666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.11614176432291666, "learning_rate": 4e-05, "loss": 4.824, "loss/crossentropy": 1.439366839826107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1621258743107319, "step": 3860 }, { "epoch": 0.32183333333333336, "grad_norm": 4.59375, "grad_norm_var": 0.12642822265625, "learning_rate": 4e-05, "loss": 4.8631, "loss/crossentropy": 1.9740833342075348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18762733228504658, "step": 3862 }, { "epoch": 0.322, "grad_norm": 5.03125, "grad_norm_var": 0.11793212890625, "learning_rate": 4e-05, "loss": 4.6297, "loss/crossentropy": 1.8139654770493507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1765834055840969, "step": 3864 }, { "epoch": 0.32216666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.10741780598958334, "learning_rate": 4e-05, "loss": 4.9415, "loss/crossentropy": 1.6795841604471207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075301818549633, "step": 3866 }, { "epoch": 0.32233333333333336, "grad_norm": 5.15625, "grad_norm_var": 0.10282796223958333, "learning_rate": 4e-05, "loss": 4.7924, "loss/crossentropy": 0.9799469262361526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13794360868632793, "step": 3868 }, { "epoch": 0.3225, "grad_norm": 5.53125, "grad_norm_var": 0.08677978515625, "learning_rate": 4e-05, "loss": 5.0427, "loss/crossentropy": 2.014233537018299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18055324628949165, "step": 3870 }, { "epoch": 0.32266666666666666, "grad_norm": 4.625, "grad_norm_var": 0.09534098307291666, "learning_rate": 4e-05, "loss": 5.1675, "loss/crossentropy": 2.118018291890621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19751184806227684, "step": 3872 }, { "epoch": 0.32283333333333336, "grad_norm": 4.65625, "grad_norm_var": 0.08857014973958334, "learning_rate": 4e-05, "loss": 4.8977, "loss/crossentropy": 1.732959657907486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17621224001049995, "step": 3874 }, { "epoch": 0.323, "grad_norm": 5.0625, "grad_norm_var": 0.07437744140625, "learning_rate": 4e-05, "loss": 5.006, "loss/crossentropy": 1.454321675002575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17282269150018692, "step": 3876 }, { "epoch": 0.32316666666666666, "grad_norm": 5.5625, "grad_norm_var": 0.0849609375, "learning_rate": 4e-05, "loss": 5.2047, "loss/crossentropy": 2.027598097920418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19759362563490868, "step": 3878 }, { "epoch": 0.3233333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.0951171875, "learning_rate": 4e-05, "loss": 4.8963, "loss/crossentropy": 1.6502454578876495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19640647992491722, "step": 3880 }, { "epoch": 0.3235, "grad_norm": 4.34375, "grad_norm_var": 0.12183837890625, "learning_rate": 4e-05, "loss": 4.6344, "loss/crossentropy": 2.5610092878341675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21330714598298073, "step": 3882 }, { "epoch": 0.32366666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.1318359375, "learning_rate": 4e-05, "loss": 4.9918, "loss/crossentropy": 2.5567209720611572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21084987744688988, "step": 3884 }, { "epoch": 0.3238333333333333, "grad_norm": 5.25, "grad_norm_var": 0.12971598307291668, "learning_rate": 4e-05, "loss": 4.6148, "loss/crossentropy": 2.139784097671509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20885953679680824, "step": 3886 }, { "epoch": 0.324, "grad_norm": 5.625, "grad_norm_var": 0.12698160807291667, "learning_rate": 4e-05, "loss": 5.1695, "loss/crossentropy": 2.2444785833358765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216685228049755, "step": 3888 }, { "epoch": 0.32416666666666666, "grad_norm": 4.875, "grad_norm_var": 0.12121988932291666, "learning_rate": 4e-05, "loss": 5.1188, "loss/crossentropy": 1.5884385108947754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17183389514684677, "step": 3890 }, { "epoch": 0.3243333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.12537434895833333, "learning_rate": 4e-05, "loss": 4.5442, "loss/crossentropy": 1.5150625482201576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1651450339704752, "step": 3892 }, { "epoch": 0.3245, "grad_norm": 4.8125, "grad_norm_var": 0.105859375, "learning_rate": 4e-05, "loss": 4.973, "loss/crossentropy": 2.0057149529457092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402217864990234, "step": 3894 }, { "epoch": 0.32466666666666666, "grad_norm": 5.28125, "grad_norm_var": 0.09908447265625, "learning_rate": 4e-05, "loss": 5.1988, "loss/crossentropy": 1.953499749302864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18943873792886734, "step": 3896 }, { "epoch": 0.3248333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.070166015625, "learning_rate": 4e-05, "loss": 5.2872, "loss/crossentropy": 1.3116377219557762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14449688233435154, "step": 3898 }, { "epoch": 0.325, "grad_norm": 4.6875, "grad_norm_var": 0.06925455729166667, "learning_rate": 4e-05, "loss": 5.0211, "loss/crossentropy": 0.8303311765193939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13305531814694405, "step": 3900 }, { "epoch": 0.32516666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.05601806640625, "learning_rate": 4e-05, "loss": 4.5965, "loss/crossentropy": 2.015589267015457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2238336279988289, "step": 3902 }, { "epoch": 0.3253333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.04247639973958333, "learning_rate": 4e-05, "loss": 5.0168, "loss/crossentropy": 2.3163425028324127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127598598599434, "step": 3904 }, { "epoch": 0.3255, "grad_norm": 5.0625, "grad_norm_var": 0.04058837890625, "learning_rate": 4e-05, "loss": 4.6785, "loss/crossentropy": 2.3567277789115906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23480309918522835, "step": 3906 }, { "epoch": 0.32566666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.043863932291666664, "learning_rate": 4e-05, "loss": 4.682, "loss/crossentropy": 2.10383278131485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18023706413805485, "step": 3908 }, { "epoch": 0.3258333333333333, "grad_norm": 4.75, "grad_norm_var": 0.046187337239583334, "learning_rate": 4e-05, "loss": 5.1307, "loss/crossentropy": 2.568555533885956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2189786396920681, "step": 3910 }, { "epoch": 0.326, "grad_norm": 5.0, "grad_norm_var": 0.03759358723958333, "learning_rate": 4e-05, "loss": 4.7643, "loss/crossentropy": 1.2290566712617874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13838442414999008, "step": 3912 }, { "epoch": 0.32616666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.03323160807291667, "learning_rate": 4e-05, "loss": 4.6486, "loss/crossentropy": 2.0212435722351074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013506405055523, "step": 3914 }, { "epoch": 0.3263333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.05370686848958333, "learning_rate": 4e-05, "loss": 4.7519, "loss/crossentropy": 2.114032506942749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23080236837267876, "step": 3916 }, { "epoch": 0.3265, "grad_norm": 4.84375, "grad_norm_var": 0.057356770833333334, "learning_rate": 4e-05, "loss": 4.5704, "loss/crossentropy": 1.7334122359752655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1655733287334442, "step": 3918 }, { "epoch": 0.32666666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.09133707682291667, "learning_rate": 4e-05, "loss": 4.7757, "loss/crossentropy": 1.6256769001483917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19204578548669815, "step": 3920 }, { "epoch": 0.3268333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.10172119140625, "learning_rate": 4e-05, "loss": 4.8596, "loss/crossentropy": 1.5621510818600655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21445403434336185, "step": 3922 }, { "epoch": 0.327, "grad_norm": 4.65625, "grad_norm_var": 0.10689697265625, "learning_rate": 4e-05, "loss": 5.1977, "loss/crossentropy": 2.47482892870903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19530130550265312, "step": 3924 }, { "epoch": 0.32716666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.10823160807291667, "learning_rate": 4e-05, "loss": 5.4857, "loss/crossentropy": 2.1922404766082764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200409695506096, "step": 3926 }, { "epoch": 0.3273333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.10944010416666666, "learning_rate": 4e-05, "loss": 4.9498, "loss/crossentropy": 1.8819985389709473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18507347628474236, "step": 3928 }, { "epoch": 0.3275, "grad_norm": 5.46875, "grad_norm_var": 0.12170817057291666, "learning_rate": 4e-05, "loss": 5.1485, "loss/crossentropy": 1.8625006452202797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19629210233688354, "step": 3930 }, { "epoch": 0.32766666666666666, "grad_norm": 4.75, "grad_norm_var": 0.10520833333333333, "learning_rate": 4e-05, "loss": 4.4317, "loss/crossentropy": 1.701745517551899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17160597257316113, "step": 3932 }, { "epoch": 0.3278333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.106494140625, "learning_rate": 4e-05, "loss": 5.4612, "loss/crossentropy": 2.235189765691757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071330025792122, "step": 3934 }, { "epoch": 0.328, "grad_norm": 4.65625, "grad_norm_var": 0.06888020833333333, "learning_rate": 4e-05, "loss": 4.5437, "loss/crossentropy": 1.5824126675724983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1563483476638794, "step": 3936 }, { "epoch": 0.32816666666666666, "grad_norm": 4.75, "grad_norm_var": 0.07414957682291666, "learning_rate": 4e-05, "loss": 4.5375, "loss/crossentropy": 1.6542961448431015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892082616686821, "step": 3938 }, { "epoch": 0.3283333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.06927083333333334, "learning_rate": 4e-05, "loss": 4.9733, "loss/crossentropy": 1.3052943646907806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1394607052206993, "step": 3940 }, { "epoch": 0.3285, "grad_norm": 5.03125, "grad_norm_var": 0.06451822916666666, "learning_rate": 4e-05, "loss": 4.7605, "loss/crossentropy": 2.0294989347457886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20356900244951248, "step": 3942 }, { "epoch": 0.32866666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.06417643229166667, "learning_rate": 4e-05, "loss": 4.9209, "loss/crossentropy": 1.5529565215110779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1776176542043686, "step": 3944 }, { "epoch": 0.3288333333333333, "grad_norm": 4.875, "grad_norm_var": 0.03951416015625, "learning_rate": 4e-05, "loss": 4.7867, "loss/crossentropy": 1.780011311173439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17883405461907387, "step": 3946 }, { "epoch": 0.329, "grad_norm": 5.0625, "grad_norm_var": 0.063134765625, "learning_rate": 4e-05, "loss": 4.7582, "loss/crossentropy": 1.2589640021324158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15058333426713943, "step": 3948 }, { "epoch": 0.32916666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.17603759765625, "learning_rate": 4e-05, "loss": 5.0849, "loss/crossentropy": 2.279158651828766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21800993382930756, "step": 3950 }, { "epoch": 0.3293333333333333, "grad_norm": 5.625, "grad_norm_var": 0.1841796875, "learning_rate": 4e-05, "loss": 5.1935, "loss/crossentropy": 2.0449778214097023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19124972261488438, "step": 3952 }, { "epoch": 0.3295, "grad_norm": 5.09375, "grad_norm_var": 0.151416015625, "learning_rate": 4e-05, "loss": 4.8903, "loss/crossentropy": 2.590156674385071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2216012328863144, "step": 3954 }, { "epoch": 0.32966666666666666, "grad_norm": 4.34375, "grad_norm_var": 0.19921875, "learning_rate": 4e-05, "loss": 4.3514, "loss/crossentropy": 2.0297087728977203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1800425611436367, "step": 3956 }, { "epoch": 0.3298333333333333, "grad_norm": 4.25, "grad_norm_var": 0.24986572265625, "learning_rate": 4e-05, "loss": 4.1529, "loss/crossentropy": 1.5149021744728088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15365608409047127, "step": 3958 }, { "epoch": 0.33, "grad_norm": 5.65625, "grad_norm_var": 0.32239176432291666, "learning_rate": 4e-05, "loss": 4.6866, "loss/crossentropy": 2.2006970942020416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23189735412597656, "step": 3960 }, { "epoch": 0.33016666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.3184895833333333, "learning_rate": 4e-05, "loss": 4.9903, "loss/crossentropy": 1.732621654868126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17110507935285568, "step": 3962 }, { "epoch": 0.3303333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.33411051432291666, "learning_rate": 4e-05, "loss": 5.0462, "loss/crossentropy": 2.0788157284259796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21352742239832878, "step": 3964 }, { "epoch": 0.3305, "grad_norm": 5.15625, "grad_norm_var": 0.22056884765625, "learning_rate": 4e-05, "loss": 5.4219, "loss/crossentropy": 2.5594743490219116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218813955783844, "step": 3966 }, { "epoch": 0.33066666666666666, "grad_norm": 5.5, "grad_norm_var": 0.7591756184895834, "learning_rate": 4e-05, "loss": 5.2523, "loss/crossentropy": 2.548967719078064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22973427176475525, "step": 3968 }, { "epoch": 0.3308333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.7623006184895833, "learning_rate": 4e-05, "loss": 4.8598, "loss/crossentropy": 2.1485989689826965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2088041864335537, "step": 3970 }, { "epoch": 0.331, "grad_norm": 4.84375, "grad_norm_var": 0.7180623372395833, "learning_rate": 4e-05, "loss": 4.9403, "loss/crossentropy": 1.671858310699463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029484622180462, "step": 3972 }, { "epoch": 0.33116666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.6586588541666667, "learning_rate": 4e-05, "loss": 4.9027, "loss/crossentropy": 2.0752905011177063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24618571624159813, "step": 3974 }, { "epoch": 0.3313333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.6116495768229167, "learning_rate": 4e-05, "loss": 5.267, "loss/crossentropy": 2.5116668939590454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215965948998928, "step": 3976 }, { "epoch": 0.3315, "grad_norm": 4.96875, "grad_norm_var": 0.61109619140625, "learning_rate": 4e-05, "loss": 5.0597, "loss/crossentropy": 2.00965479016304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21152685582637787, "step": 3978 }, { "epoch": 0.33166666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.5948201497395833, "learning_rate": 4e-05, "loss": 5.0523, "loss/crossentropy": 2.200817584991455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879693791270256, "step": 3980 }, { "epoch": 0.3318333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.59000244140625, "learning_rate": 4e-05, "loss": 5.1081, "loss/crossentropy": 1.3755600899457932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17068074271082878, "step": 3982 }, { "epoch": 0.332, "grad_norm": 4.5, "grad_norm_var": 0.05705973307291667, "learning_rate": 4e-05, "loss": 5.1049, "loss/crossentropy": 2.173791080713272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22593029215931892, "step": 3984 }, { "epoch": 0.33216666666666667, "grad_norm": 4.75, "grad_norm_var": 0.19778238932291667, "learning_rate": 4e-05, "loss": 5.1376, "loss/crossentropy": 2.2958777248859406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23552871868014336, "step": 3986 }, { "epoch": 0.3323333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.21233317057291667, "learning_rate": 4e-05, "loss": 5.0748, "loss/crossentropy": 1.2604089081287384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17020060494542122, "step": 3988 }, { "epoch": 0.3325, "grad_norm": 4.9375, "grad_norm_var": 0.20779622395833333, "learning_rate": 4e-05, "loss": 4.9403, "loss/crossentropy": 1.6421017423272133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15172147378325462, "step": 3990 }, { "epoch": 0.33266666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.20526936848958333, "learning_rate": 4e-05, "loss": 5.3164, "loss/crossentropy": 2.2981130182743073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21888618916273117, "step": 3992 }, { "epoch": 0.3328333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.20545247395833333, "learning_rate": 4e-05, "loss": 5.0443, "loss/crossentropy": 2.0642926692962646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18958804570138454, "step": 3994 }, { "epoch": 0.333, "grad_norm": 5.1875, "grad_norm_var": 0.22216389973958334, "learning_rate": 4e-05, "loss": 5.0989, "loss/crossentropy": 2.440661907196045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130712941288948, "step": 3996 }, { "epoch": 0.33316666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.23648681640625, "learning_rate": 4e-05, "loss": 4.2979, "loss/crossentropy": 1.7171569466590881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16440174356102943, "step": 3998 }, { "epoch": 0.3333333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.211572265625, "learning_rate": 4e-05, "loss": 5.0666, "loss/crossentropy": 2.1179882287979126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20530518516898155, "step": 4000 }, { "epoch": 0.3335, "grad_norm": 5.0, "grad_norm_var": 0.07102864583333333, "learning_rate": 4e-05, "loss": 4.9151, "loss/crossentropy": 2.2780506312847137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23622548207640648, "step": 4002 }, { "epoch": 0.33366666666666667, "grad_norm": 5.59375, "grad_norm_var": 0.08527018229166666, "learning_rate": 4e-05, "loss": 4.5568, "loss/crossentropy": 1.0578216835856438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17180327884852886, "step": 4004 }, { "epoch": 0.3338333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08487955729166667, "learning_rate": 4e-05, "loss": 4.5557, "loss/crossentropy": 2.4999157786369324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2154349498450756, "step": 4006 }, { "epoch": 0.334, "grad_norm": 4.8125, "grad_norm_var": 0.0830078125, "learning_rate": 4e-05, "loss": 4.9014, "loss/crossentropy": 2.443576067686081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23118840903043747, "step": 4008 }, { "epoch": 0.33416666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.07903645833333334, "learning_rate": 4e-05, "loss": 4.7797, "loss/crossentropy": 2.619147837162018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22277311235666275, "step": 4010 }, { "epoch": 0.3343333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.07941080729166666, "learning_rate": 4e-05, "loss": 4.6468, "loss/crossentropy": 1.4702613353729248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15802369080483913, "step": 4012 }, { "epoch": 0.3345, "grad_norm": 4.96875, "grad_norm_var": 0.06698811848958333, "learning_rate": 4e-05, "loss": 4.8489, "loss/crossentropy": 2.362972617149353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217639461159706, "step": 4014 }, { "epoch": 0.33466666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.06425374348958333, "learning_rate": 4e-05, "loss": 5.0929, "loss/crossentropy": 2.2614522576332092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22378670051693916, "step": 4016 }, { "epoch": 0.3348333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.06874593098958333, "learning_rate": 4e-05, "loss": 4.8375, "loss/crossentropy": 2.6223338842391968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232932560145855, "step": 4018 }, { "epoch": 0.335, "grad_norm": 4.90625, "grad_norm_var": 0.028238932291666668, "learning_rate": 4e-05, "loss": 4.5218, "loss/crossentropy": 1.5880136415362358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15379321202635765, "step": 4020 }, { "epoch": 0.33516666666666667, "grad_norm": 5.5625, "grad_norm_var": 0.05325113932291667, "learning_rate": 4e-05, "loss": 5.2639, "loss/crossentropy": 1.8124565333127975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19380969554185867, "step": 4022 }, { "epoch": 0.3353333333333333, "grad_norm": 4.5, "grad_norm_var": 0.083203125, "learning_rate": 4e-05, "loss": 4.8314, "loss/crossentropy": 1.6842049807310104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17753949016332626, "step": 4024 }, { "epoch": 0.3355, "grad_norm": 4.96875, "grad_norm_var": 0.08212483723958333, "learning_rate": 4e-05, "loss": 5.4194, "loss/crossentropy": 2.30204838514328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22381476312875748, "step": 4026 }, { "epoch": 0.33566666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.06549479166666666, "learning_rate": 4e-05, "loss": 5.2147, "loss/crossentropy": 2.446783661842346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124006450176239, "step": 4028 }, { "epoch": 0.3358333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.07688802083333333, "learning_rate": 4e-05, "loss": 4.4994, "loss/crossentropy": 1.6328002288937569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17847843281924725, "step": 4030 }, { "epoch": 0.336, "grad_norm": 5.0625, "grad_norm_var": 0.07667643229166667, "learning_rate": 4e-05, "loss": 4.9838, "loss/crossentropy": 1.2188917100429535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14789031259715557, "step": 4032 }, { "epoch": 0.33616666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.07649332682291667, "learning_rate": 4e-05, "loss": 4.4962, "loss/crossentropy": 2.2478115260601044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19586298614740372, "step": 4034 }, { "epoch": 0.3363333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.08118082682291666, "learning_rate": 4e-05, "loss": 4.9507, "loss/crossentropy": 2.4908804297447205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20837373286485672, "step": 4036 }, { "epoch": 0.3365, "grad_norm": 5.375, "grad_norm_var": 0.06300455729166667, "learning_rate": 4e-05, "loss": 5.0215, "loss/crossentropy": 2.0036857947707176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18870280869305134, "step": 4038 }, { "epoch": 0.33666666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.046614583333333334, "learning_rate": 4e-05, "loss": 4.7382, "loss/crossentropy": 1.7384353280067444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17724663019180298, "step": 4040 }, { "epoch": 0.3368333333333333, "grad_norm": 4.75, "grad_norm_var": 0.047770182291666664, "learning_rate": 4e-05, "loss": 4.969, "loss/crossentropy": 1.8062372133135796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18678754568099976, "step": 4042 }, { "epoch": 0.337, "grad_norm": 5.15625, "grad_norm_var": 0.05071614583333333, "learning_rate": 4e-05, "loss": 5.4893, "loss/crossentropy": 2.328328639268875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2098284848034382, "step": 4044 }, { "epoch": 0.33716666666666667, "grad_norm": 4.875, "grad_norm_var": 0.03817952473958333, "learning_rate": 4e-05, "loss": 4.7384, "loss/crossentropy": 2.3635981678962708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23770976066589355, "step": 4046 }, { "epoch": 0.3373333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.03723958333333333, "learning_rate": 4e-05, "loss": 5.3287, "loss/crossentropy": 2.3061963617801666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013118974864483, "step": 4048 }, { "epoch": 0.3375, "grad_norm": 5.1875, "grad_norm_var": 0.04104410807291667, "learning_rate": 4e-05, "loss": 5.528, "loss/crossentropy": 2.056824043393135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19357911497354507, "step": 4050 }, { "epoch": 0.33766666666666667, "grad_norm": 4.75, "grad_norm_var": 0.04153645833333333, "learning_rate": 4e-05, "loss": 4.2291, "loss/crossentropy": 1.8092492744326591, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18169726617634296, "step": 4052 }, { "epoch": 0.3378333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.03385009765625, "learning_rate": 4e-05, "loss": 4.9316, "loss/crossentropy": 1.640300840139389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19402212649583817, "step": 4054 }, { "epoch": 0.338, "grad_norm": 5.28125, "grad_norm_var": 0.0404296875, "learning_rate": 4e-05, "loss": 5.1329, "loss/crossentropy": 2.0621906220912933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21923184767365456, "step": 4056 }, { "epoch": 0.33816666666666667, "grad_norm": 5.96875, "grad_norm_var": 0.108056640625, "learning_rate": 4e-05, "loss": 5.268, "loss/crossentropy": 1.8473474383354187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038598507642746, "step": 4058 }, { "epoch": 0.3383333333333333, "grad_norm": 4.875, "grad_norm_var": 0.10699462890625, "learning_rate": 4e-05, "loss": 4.5688, "loss/crossentropy": 1.7940621376037598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18157127313315868, "step": 4060 }, { "epoch": 0.3385, "grad_norm": 4.84375, "grad_norm_var": 0.10950520833333334, "learning_rate": 4e-05, "loss": 5.1588, "loss/crossentropy": 1.2232213392853737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12759443558752537, "step": 4062 }, { "epoch": 0.33866666666666667, "grad_norm": 4.875, "grad_norm_var": 0.12467041015625, "learning_rate": 4e-05, "loss": 5.0865, "loss/crossentropy": 1.5486139208078384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.165193947032094, "step": 4064 }, { "epoch": 0.3388333333333333, "grad_norm": 4.875, "grad_norm_var": 0.11685791015625, "learning_rate": 4e-05, "loss": 4.8044, "loss/crossentropy": 1.621782824397087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1728647444397211, "step": 4066 }, { "epoch": 0.339, "grad_norm": 4.875, "grad_norm_var": 0.1095703125, "learning_rate": 4e-05, "loss": 4.9063, "loss/crossentropy": 2.137328863143921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19338041357696056, "step": 4068 }, { "epoch": 0.33916666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.09488525390625, "learning_rate": 4e-05, "loss": 4.7485, "loss/crossentropy": 2.2186881601810455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21209581568837166, "step": 4070 }, { "epoch": 0.3393333333333333, "grad_norm": 5.125, "grad_norm_var": 0.10592447916666667, "learning_rate": 4e-05, "loss": 5.1329, "loss/crossentropy": 2.485084891319275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2395104244351387, "step": 4072 }, { "epoch": 0.3395, "grad_norm": 4.84375, "grad_norm_var": 0.04698893229166667, "learning_rate": 4e-05, "loss": 4.7952, "loss/crossentropy": 2.354505956172943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22287846729159355, "step": 4074 }, { "epoch": 0.3396666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.07971598307291666, "learning_rate": 4e-05, "loss": 4.7303, "loss/crossentropy": 2.4395925402641296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21506384015083313, "step": 4076 }, { "epoch": 0.3398333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.07437744140625, "learning_rate": 4e-05, "loss": 5.1776, "loss/crossentropy": 2.2610137462615967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21337458491325378, "step": 4078 }, { "epoch": 0.34, "grad_norm": 4.875, "grad_norm_var": 0.06252848307291667, "learning_rate": 4e-05, "loss": 4.7574, "loss/crossentropy": 1.0004555508494377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10723226889967918, "step": 4080 }, { "epoch": 0.3401666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.07395426432291667, "learning_rate": 4e-05, "loss": 4.4206, "loss/crossentropy": 1.8156683892011642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18349895626306534, "step": 4082 }, { "epoch": 0.3403333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.09114583333333333, "learning_rate": 4e-05, "loss": 5.0601, "loss/crossentropy": 1.5282996445894241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19007975049316883, "step": 4084 }, { "epoch": 0.3405, "grad_norm": 4.875, "grad_norm_var": 0.09127197265625, "learning_rate": 4e-05, "loss": 5.1873, "loss/crossentropy": 1.8278708755970001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1809481494128704, "step": 4086 }, { "epoch": 0.3406666666666667, "grad_norm": 5.875, "grad_norm_var": 0.1298828125, "learning_rate": 4e-05, "loss": 5.6198, "loss/crossentropy": 1.9336512684822083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21758461371064186, "step": 4088 }, { "epoch": 0.3408333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.14052327473958334, "learning_rate": 4e-05, "loss": 4.6314, "loss/crossentropy": 1.5730094835162163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1618702970445156, "step": 4090 }, { "epoch": 0.341, "grad_norm": 5.15625, "grad_norm_var": 0.10575764973958333, "learning_rate": 4e-05, "loss": 5.2243, "loss/crossentropy": 2.555690288543701, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24880832433700562, "step": 4092 }, { "epoch": 0.3411666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.13854166666666667, "learning_rate": 4e-05, "loss": 4.7328, "loss/crossentropy": 2.2953881919384003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20687290653586388, "step": 4094 }, { "epoch": 0.3413333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.16490885416666667, "learning_rate": 4e-05, "loss": 4.4851, "loss/crossentropy": 2.585106134414673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208958238363266, "step": 4096 }, { "epoch": 0.3415, "grad_norm": 4.84375, "grad_norm_var": 0.15780843098958333, "learning_rate": 4e-05, "loss": 5.0493, "loss/crossentropy": 2.4826498925685883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24454660713672638, "step": 4098 }, { "epoch": 0.3416666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.13704427083333334, "learning_rate": 4e-05, "loss": 5.0745, "loss/crossentropy": 2.297918736934662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1883096918463707, "step": 4100 }, { "epoch": 0.3418333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.13837483723958333, "learning_rate": 4e-05, "loss": 4.8996, "loss/crossentropy": 2.1561270356178284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22525209560990334, "step": 4102 }, { "epoch": 0.342, "grad_norm": 4.9375, "grad_norm_var": 0.05943603515625, "learning_rate": 4e-05, "loss": 4.8204, "loss/crossentropy": 2.0578841269016266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22047049179673195, "step": 4104 }, { "epoch": 0.3421666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.058447265625, "learning_rate": 4e-05, "loss": 4.7867, "loss/crossentropy": 2.4789949655532837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21268969774246216, "step": 4106 }, { "epoch": 0.3423333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.05172119140625, "learning_rate": 4e-05, "loss": 4.9054, "loss/crossentropy": 2.394868552684784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20047050714492798, "step": 4108 }, { "epoch": 0.3425, "grad_norm": 4.96875, "grad_norm_var": 0.052083333333333336, "learning_rate": 4e-05, "loss": 5.0385, "loss/crossentropy": 2.43982994556427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21248744055628777, "step": 4110 }, { "epoch": 0.3426666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.03857014973958333, "learning_rate": 4e-05, "loss": 5.0087, "loss/crossentropy": 1.919486790895462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2137829028069973, "step": 4112 }, { "epoch": 0.3428333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.05504150390625, "learning_rate": 4e-05, "loss": 4.3936, "loss/crossentropy": 0.8350804150104523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11626596376299858, "step": 4114 }, { "epoch": 0.343, "grad_norm": 5.25, "grad_norm_var": 0.05771077473958333, "learning_rate": 4e-05, "loss": 4.821, "loss/crossentropy": 1.480412408709526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16480615735054016, "step": 4116 }, { "epoch": 0.3431666666666667, "grad_norm": 4.75, "grad_norm_var": 0.13240559895833334, "learning_rate": 4e-05, "loss": 4.6536, "loss/crossentropy": 1.9430910348892212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1870816107839346, "step": 4118 }, { "epoch": 0.3433333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.13743489583333332, "learning_rate": 4e-05, "loss": 5.11, "loss/crossentropy": 2.1659523844718933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21893510594964027, "step": 4120 }, { "epoch": 0.3435, "grad_norm": 4.84375, "grad_norm_var": 0.13619384765625, "learning_rate": 4e-05, "loss": 5.0083, "loss/crossentropy": 2.2735475897789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21642907708883286, "step": 4122 }, { "epoch": 0.3436666666666667, "grad_norm": 5.0, "grad_norm_var": 0.13544514973958333, "learning_rate": 4e-05, "loss": 5.1141, "loss/crossentropy": 2.3820077180862427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19136777892708778, "step": 4124 }, { "epoch": 0.3438333333333333, "grad_norm": 5.0, "grad_norm_var": 0.13720296223958334, "learning_rate": 4e-05, "loss": 4.7251, "loss/crossentropy": 1.08474662899971, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13359439186751842, "step": 4126 }, { "epoch": 0.344, "grad_norm": 5.28125, "grad_norm_var": 0.13561197916666667, "learning_rate": 4e-05, "loss": 4.8041, "loss/crossentropy": 2.264466494321823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23006341233849525, "step": 4128 }, { "epoch": 0.3441666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.13995768229166666, "learning_rate": 4e-05, "loss": 5.1237, "loss/crossentropy": 2.056764245033264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545086592435837, "step": 4130 }, { "epoch": 0.3443333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.13658854166666667, "learning_rate": 4e-05, "loss": 5.2477, "loss/crossentropy": 2.574945330619812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190633974969387, "step": 4132 }, { "epoch": 0.3445, "grad_norm": 5.15625, "grad_norm_var": 0.07923177083333334, "learning_rate": 4e-05, "loss": 5.1239, "loss/crossentropy": 2.2885874211788177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122734598815441, "step": 4134 }, { "epoch": 0.3446666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.07522379557291667, "learning_rate": 4e-05, "loss": 4.9945, "loss/crossentropy": 1.8102921470999718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18051475286483765, "step": 4136 }, { "epoch": 0.3448333333333333, "grad_norm": 19.875, "grad_norm_var": 13.979130045572917, "learning_rate": 4e-05, "loss": 5.1695, "loss/crossentropy": 2.2670177817344666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23726213350892067, "step": 4138 }, { "epoch": 0.345, "grad_norm": 4.875, "grad_norm_var": 13.991129557291666, "learning_rate": 4e-05, "loss": 5.1073, "loss/crossentropy": 1.9467437043786049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18490321934223175, "step": 4140 }, { "epoch": 0.3451666666666667, "grad_norm": 4.78125, "grad_norm_var": 13.946028645833334, "learning_rate": 4e-05, "loss": 4.7978, "loss/crossentropy": 1.795416384935379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21431923657655716, "step": 4142 }, { "epoch": 0.3453333333333333, "grad_norm": 4.96875, "grad_norm_var": 13.94879150390625, "learning_rate": 4e-05, "loss": 5.0306, "loss/crossentropy": 1.867057092487812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19479823857545853, "step": 4144 }, { "epoch": 0.3455, "grad_norm": 4.53125, "grad_norm_var": 13.983056640625, "learning_rate": 4e-05, "loss": 4.3536, "loss/crossentropy": 2.0579030513763428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20508618280291557, "step": 4146 }, { "epoch": 0.3456666666666667, "grad_norm": 4.84375, "grad_norm_var": 13.959830729166667, "learning_rate": 4e-05, "loss": 4.9017, "loss/crossentropy": 1.839685283601284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.169641749933362, "step": 4148 }, { "epoch": 0.3458333333333333, "grad_norm": 5.65625, "grad_norm_var": 13.884273274739583, "learning_rate": 4e-05, "loss": 5.2768, "loss/crossentropy": 1.167643092572689, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15179457888007164, "step": 4150 }, { "epoch": 0.346, "grad_norm": 5.03125, "grad_norm_var": 13.831624348958334, "learning_rate": 4e-05, "loss": 4.6659, "loss/crossentropy": 2.5688222646713257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23355836793780327, "step": 4152 }, { "epoch": 0.3461666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.09479166666666666, "learning_rate": 4e-05, "loss": 4.8113, "loss/crossentropy": 2.2830787897109985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20207640901207924, "step": 4154 }, { "epoch": 0.3463333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.09582926432291666, "learning_rate": 4e-05, "loss": 5.0874, "loss/crossentropy": 2.287237584590912, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2221594713628292, "step": 4156 }, { "epoch": 0.3465, "grad_norm": 4.75, "grad_norm_var": 0.10351155598958334, "learning_rate": 4e-05, "loss": 4.7523, "loss/crossentropy": 1.992442563176155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18241856060922146, "step": 4158 }, { "epoch": 0.3466666666666667, "grad_norm": 5.125, "grad_norm_var": 0.10006510416666667, "learning_rate": 4e-05, "loss": 5.429, "loss/crossentropy": 2.3137503266334534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21104448288679123, "step": 4160 }, { "epoch": 0.3468333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.10113525390625, "learning_rate": 4e-05, "loss": 5.909, "loss/crossentropy": 2.0655910670757294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179282046854496, "step": 4162 }, { "epoch": 0.347, "grad_norm": 4.75, "grad_norm_var": 0.10198160807291666, "learning_rate": 4e-05, "loss": 4.1742, "loss/crossentropy": 1.2919567823410034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13722717948257923, "step": 4164 }, { "epoch": 0.3471666666666667, "grad_norm": 5.25, "grad_norm_var": 0.074853515625, "learning_rate": 4e-05, "loss": 5.3134, "loss/crossentropy": 2.1499520242214203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992734745144844, "step": 4166 }, { "epoch": 0.3473333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.07672119140625, "learning_rate": 4e-05, "loss": 4.2854, "loss/crossentropy": 1.5794583559036255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16880195401608944, "step": 4168 }, { "epoch": 0.3475, "grad_norm": 4.78125, "grad_norm_var": 0.08603108723958333, "learning_rate": 4e-05, "loss": 4.7931, "loss/crossentropy": 1.4645969420671463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14471661485731602, "step": 4170 }, { "epoch": 0.3476666666666667, "grad_norm": 5.0, "grad_norm_var": 0.11170247395833334, "learning_rate": 4e-05, "loss": 4.9849, "loss/crossentropy": 2.362588882446289, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20443035662174225, "step": 4172 }, { "epoch": 0.3478333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.10764567057291667, "learning_rate": 4e-05, "loss": 5.4635, "loss/crossentropy": 2.2797932624816895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22322962060570717, "step": 4174 }, { "epoch": 0.348, "grad_norm": 4.40625, "grad_norm_var": 0.12890625, "learning_rate": 4e-05, "loss": 4.8326, "loss/crossentropy": 2.0911890268325806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915295533835888, "step": 4176 }, { "epoch": 0.3481666666666667, "grad_norm": 5.125, "grad_norm_var": 0.120947265625, "learning_rate": 4e-05, "loss": 5.0474, "loss/crossentropy": 2.514350116252899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21622708812355995, "step": 4178 }, { "epoch": 0.34833333333333333, "grad_norm": 5.0, "grad_norm_var": 0.0876953125, "learning_rate": 4e-05, "loss": 5.1258, "loss/crossentropy": 1.924629956483841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893673911690712, "step": 4180 }, { "epoch": 0.3485, "grad_norm": 4.71875, "grad_norm_var": 0.08136393229166666, "learning_rate": 4e-05, "loss": 5.2519, "loss/crossentropy": 1.5494660213589668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15524601563811302, "step": 4182 }, { "epoch": 0.3486666666666667, "grad_norm": 5.625, "grad_norm_var": 0.09410400390625, "learning_rate": 4e-05, "loss": 4.6197, "loss/crossentropy": 1.914233423769474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18002042546868324, "step": 4184 }, { "epoch": 0.34883333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.10506184895833333, "learning_rate": 4e-05, "loss": 4.6615, "loss/crossentropy": 1.8490305989980698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15980881080031395, "step": 4186 }, { "epoch": 0.349, "grad_norm": 5.0, "grad_norm_var": 0.08748372395833333, "learning_rate": 4e-05, "loss": 5.1398, "loss/crossentropy": 1.982210248708725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987055353820324, "step": 4188 }, { "epoch": 0.3491666666666667, "grad_norm": 5.71875, "grad_norm_var": 0.12515869140625, "learning_rate": 4e-05, "loss": 4.8987, "loss/crossentropy": 1.8878967761993408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18452826887369156, "step": 4190 }, { "epoch": 0.34933333333333333, "grad_norm": 4.625, "grad_norm_var": 0.12565104166666666, "learning_rate": 4e-05, "loss": 4.2882, "loss/crossentropy": 1.7068097814917564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1642775908112526, "step": 4192 }, { "epoch": 0.3495, "grad_norm": 5.0625, "grad_norm_var": 0.11731770833333334, "learning_rate": 4e-05, "loss": 5.2158, "loss/crossentropy": 1.33222147077322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15623358637094498, "step": 4194 }, { "epoch": 0.3496666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.12068684895833333, "learning_rate": 4e-05, "loss": 4.7641, "loss/crossentropy": 1.6291079074144363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752131376415491, "step": 4196 }, { "epoch": 0.34983333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.11979166666666667, "learning_rate": 4e-05, "loss": 4.4284, "loss/crossentropy": 1.9068839251995087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23426436632871628, "step": 4198 }, { "epoch": 0.35, "grad_norm": 5.09375, "grad_norm_var": 0.09062093098958333, "learning_rate": 4e-05, "loss": 5.1638, "loss/crossentropy": 1.8091362193226814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16963034123182297, "step": 4200 }, { "epoch": 0.3501666666666667, "grad_norm": 4.625, "grad_norm_var": 0.08717447916666667, "learning_rate": 4e-05, "loss": 5.1473, "loss/crossentropy": 2.4834869503974915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2258441038429737, "step": 4202 }, { "epoch": 0.35033333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.08977457682291666, "learning_rate": 4e-05, "loss": 5.0231, "loss/crossentropy": 1.9468242824077606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19090471975505352, "step": 4204 }, { "epoch": 0.3505, "grad_norm": 5.125, "grad_norm_var": 0.05871988932291667, "learning_rate": 4e-05, "loss": 4.3702, "loss/crossentropy": 2.568976879119873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22428393363952637, "step": 4206 }, { "epoch": 0.3506666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.03804931640625, "learning_rate": 4e-05, "loss": 5.1383, "loss/crossentropy": 2.1792095601558685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2094913348555565, "step": 4208 }, { "epoch": 0.35083333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.036942545572916666, "learning_rate": 4e-05, "loss": 4.7552, "loss/crossentropy": 1.8818095847964287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17569194920361042, "step": 4210 }, { "epoch": 0.351, "grad_norm": 5.15625, "grad_norm_var": 0.08564046223958334, "learning_rate": 4e-05, "loss": 5.2439, "loss/crossentropy": 1.962063044309616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20449374616146088, "step": 4212 }, { "epoch": 0.3511666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.08448893229166667, "learning_rate": 4e-05, "loss": 5.1916, "loss/crossentropy": 2.030742183327675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19212795421481133, "step": 4214 }, { "epoch": 0.35133333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08749593098958333, "learning_rate": 4e-05, "loss": 5.1428, "loss/crossentropy": 2.2157254815101624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048327773809433, "step": 4216 }, { "epoch": 0.3515, "grad_norm": 4.59375, "grad_norm_var": 0.08971354166666666, "learning_rate": 4e-05, "loss": 4.7645, "loss/crossentropy": 1.7414375841617584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17881914600729942, "step": 4218 }, { "epoch": 0.3516666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.09138997395833333, "learning_rate": 4e-05, "loss": 5.0248, "loss/crossentropy": 1.7276012226939201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17646119743585587, "step": 4220 }, { "epoch": 0.35183333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.08346354166666667, "learning_rate": 4e-05, "loss": 4.4493, "loss/crossentropy": 1.6815314665436745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1717877183109522, "step": 4222 }, { "epoch": 0.352, "grad_norm": 4.5, "grad_norm_var": 0.11477457682291667, "learning_rate": 4e-05, "loss": 5.1927, "loss/crossentropy": 2.431355118751526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992778554558754, "step": 4224 }, { "epoch": 0.3521666666666667, "grad_norm": 4.625, "grad_norm_var": 0.12667643229166667, "learning_rate": 4e-05, "loss": 4.4977, "loss/crossentropy": 2.3680657148361206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199774120002985, "step": 4226 }, { "epoch": 0.35233333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.07472330729166667, "learning_rate": 4e-05, "loss": 5.2512, "loss/crossentropy": 2.244264602661133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101249285042286, "step": 4228 }, { "epoch": 0.3525, "grad_norm": 5.0625, "grad_norm_var": 0.08424072265625, "learning_rate": 4e-05, "loss": 5.2336, "loss/crossentropy": 1.5435407906770706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15030980855226517, "step": 4230 }, { "epoch": 0.3526666666666667, "grad_norm": 5.5625, "grad_norm_var": 0.108056640625, "learning_rate": 4e-05, "loss": 5.0424, "loss/crossentropy": 1.6612791121006012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18562641739845276, "step": 4232 }, { "epoch": 0.35283333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.09986979166666667, "learning_rate": 4e-05, "loss": 5.0052, "loss/crossentropy": 1.051307551562786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12871519662439823, "step": 4234 }, { "epoch": 0.353, "grad_norm": 4.875, "grad_norm_var": 0.10552978515625, "learning_rate": 4e-05, "loss": 4.8386, "loss/crossentropy": 2.6155091524124146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23894379287958145, "step": 4236 }, { "epoch": 0.3531666666666667, "grad_norm": 5.0, "grad_norm_var": 0.102734375, "learning_rate": 4e-05, "loss": 5.061, "loss/crossentropy": 1.767829805612564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993457730859518, "step": 4238 }, { "epoch": 0.35333333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.09159749348958333, "learning_rate": 4e-05, "loss": 4.9406, "loss/crossentropy": 1.423234261572361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1627085842192173, "step": 4240 }, { "epoch": 0.3535, "grad_norm": 5.125, "grad_norm_var": 0.08586832682291666, "learning_rate": 4e-05, "loss": 5.243, "loss/crossentropy": 2.1509978771209717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19178292714059353, "step": 4242 }, { "epoch": 0.3536666666666667, "grad_norm": 4.75, "grad_norm_var": 0.085791015625, "learning_rate": 4e-05, "loss": 4.8738, "loss/crossentropy": 2.137165904045105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19972681626677513, "step": 4244 }, { "epoch": 0.35383333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.07597249348958333, "learning_rate": 4e-05, "loss": 4.5654, "loss/crossentropy": 1.1787279769778252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13839380256831646, "step": 4246 }, { "epoch": 0.354, "grad_norm": 5.0, "grad_norm_var": 0.052469889322916664, "learning_rate": 4e-05, "loss": 4.8617, "loss/crossentropy": 2.045122891664505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18550307303667068, "step": 4248 }, { "epoch": 0.3541666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.10305582682291667, "learning_rate": 4e-05, "loss": 5.1557, "loss/crossentropy": 1.8815576285123825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1797770895063877, "step": 4250 }, { "epoch": 0.35433333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.10064697265625, "learning_rate": 4e-05, "loss": 4.974, "loss/crossentropy": 1.4666025638580322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15171694196760654, "step": 4252 }, { "epoch": 0.3545, "grad_norm": 5.0625, "grad_norm_var": 0.15735270182291666, "learning_rate": 4e-05, "loss": 4.9926, "loss/crossentropy": 2.0984753370285034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19612977281212807, "step": 4254 }, { "epoch": 0.3546666666666667, "grad_norm": 5.0, "grad_norm_var": 0.13743082682291666, "learning_rate": 4e-05, "loss": 5.5811, "loss/crossentropy": 2.1417490541934967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23930613696575165, "step": 4256 }, { "epoch": 0.35483333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.14933268229166666, "learning_rate": 4e-05, "loss": 4.403, "loss/crossentropy": 1.3386527448892593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1588802933692932, "step": 4258 }, { "epoch": 0.355, "grad_norm": 5.34375, "grad_norm_var": 0.15167643229166666, "learning_rate": 4e-05, "loss": 5.0899, "loss/crossentropy": 1.9312639832496643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182557824999094, "step": 4260 }, { "epoch": 0.3551666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.162353515625, "learning_rate": 4e-05, "loss": 4.8298, "loss/crossentropy": 1.7975911796092987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19092736393213272, "step": 4262 }, { "epoch": 0.35533333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.16760660807291666, "learning_rate": 4e-05, "loss": 5.2307, "loss/crossentropy": 2.0434736609458923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22150762006640434, "step": 4264 }, { "epoch": 0.3555, "grad_norm": 4.75, "grad_norm_var": 0.12682291666666667, "learning_rate": 4e-05, "loss": 4.8851, "loss/crossentropy": 2.5525485277175903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2277713306248188, "step": 4266 }, { "epoch": 0.3556666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.11847330729166666, "learning_rate": 4e-05, "loss": 5.5178, "loss/crossentropy": 2.019535183906555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19332855939865112, "step": 4268 }, { "epoch": 0.35583333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.04980061848958333, "learning_rate": 4e-05, "loss": 4.0606, "loss/crossentropy": 1.5260907039046288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14622344821691513, "step": 4270 }, { "epoch": 0.356, "grad_norm": 4.9375, "grad_norm_var": 0.04390869140625, "learning_rate": 4e-05, "loss": 4.8001, "loss/crossentropy": 1.830235406756401, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18560582026839256, "step": 4272 }, { "epoch": 0.3561666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.04159749348958333, "learning_rate": 4e-05, "loss": 5.0943, "loss/crossentropy": 2.435010313987732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23003898561000824, "step": 4274 }, { "epoch": 0.35633333333333334, "grad_norm": 6.40625, "grad_norm_var": 0.17096354166666666, "learning_rate": 4e-05, "loss": 5.0332, "loss/crossentropy": 2.1487464606761932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22359532490372658, "step": 4276 }, { "epoch": 0.3565, "grad_norm": 4.90625, "grad_norm_var": 0.16985270182291667, "learning_rate": 4e-05, "loss": 5.2108, "loss/crossentropy": 3.206775486469269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20537611469626427, "step": 4278 }, { "epoch": 0.3566666666666667, "grad_norm": 5.0, "grad_norm_var": 0.16985677083333334, "learning_rate": 4e-05, "loss": 5.3511, "loss/crossentropy": 2.3853268325328827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20269788056612015, "step": 4280 }, { "epoch": 0.35683333333333334, "grad_norm": 5.4375, "grad_norm_var": 0.17174479166666667, "learning_rate": 4e-05, "loss": 5.336, "loss/crossentropy": 1.9683539271354675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21414640918374062, "step": 4282 }, { "epoch": 0.357, "grad_norm": 5.03125, "grad_norm_var": 0.18079427083333333, "learning_rate": 4e-05, "loss": 4.8088, "loss/crossentropy": 1.9340718239545822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17748939990997314, "step": 4284 }, { "epoch": 0.3571666666666667, "grad_norm": 5.8125, "grad_norm_var": 0.20422770182291666, "learning_rate": 4e-05, "loss": 5.0841, "loss/crossentropy": 2.4096251130104065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2105432227253914, "step": 4286 }, { "epoch": 0.35733333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.2512003580729167, "learning_rate": 4e-05, "loss": 5.1465, "loss/crossentropy": 1.613951414823532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16857917606830597, "step": 4288 }, { "epoch": 0.3575, "grad_norm": 5.125, "grad_norm_var": 0.23918863932291667, "learning_rate": 4e-05, "loss": 5.3099, "loss/crossentropy": 2.3350643515586853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21324753761291504, "step": 4290 }, { "epoch": 0.3576666666666667, "grad_norm": 5.0, "grad_norm_var": 0.14329427083333332, "learning_rate": 4e-05, "loss": 4.4897, "loss/crossentropy": 1.8215420618653297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1822014134377241, "step": 4292 }, { "epoch": 0.35783333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.12467041015625, "learning_rate": 4e-05, "loss": 5.1288, "loss/crossentropy": 1.9511115998029709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198295496404171, "step": 4294 }, { "epoch": 0.358, "grad_norm": 4.6875, "grad_norm_var": 0.13853759765625, "learning_rate": 4e-05, "loss": 5.0161, "loss/crossentropy": 2.398141384124756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1993892453610897, "step": 4296 }, { "epoch": 0.3581666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.13880208333333333, "learning_rate": 4e-05, "loss": 5.2304, "loss/crossentropy": 1.723719596862793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2035377323627472, "step": 4298 }, { "epoch": 0.35833333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.14390869140625, "learning_rate": 4e-05, "loss": 4.712, "loss/crossentropy": 1.977920413017273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20899861678481102, "step": 4300 }, { "epoch": 0.3585, "grad_norm": 4.96875, "grad_norm_var": 0.12115885416666666, "learning_rate": 4e-05, "loss": 5.0954, "loss/crossentropy": 2.031194046139717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2107471190392971, "step": 4302 }, { "epoch": 0.3586666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.07550455729166666, "learning_rate": 4e-05, "loss": 5.6159, "loss/crossentropy": 1.8637639060616493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20326609164476395, "step": 4304 }, { "epoch": 0.35883333333333334, "grad_norm": 5.25, "grad_norm_var": 0.07654622395833334, "learning_rate": 4e-05, "loss": 4.402, "loss/crossentropy": 2.2037427127361298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23041347414255142, "step": 4306 }, { "epoch": 0.359, "grad_norm": 4.6875, "grad_norm_var": 0.07929280598958334, "learning_rate": 4e-05, "loss": 5.0445, "loss/crossentropy": 2.564959466457367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22452329471707344, "step": 4308 }, { "epoch": 0.3591666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.08616129557291667, "learning_rate": 4e-05, "loss": 5.4649, "loss/crossentropy": 2.3605875968933105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2296753227710724, "step": 4310 }, { "epoch": 0.35933333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.07667643229166667, "learning_rate": 4e-05, "loss": 5.0013, "loss/crossentropy": 2.170982927083969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2387605495750904, "step": 4312 }, { "epoch": 0.3595, "grad_norm": 4.9375, "grad_norm_var": 0.0662109375, "learning_rate": 4e-05, "loss": 4.6528, "loss/crossentropy": 2.062200278043747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1859907377511263, "step": 4314 }, { "epoch": 0.3596666666666667, "grad_norm": 4.875, "grad_norm_var": 0.061051432291666666, "learning_rate": 4e-05, "loss": 5.1288, "loss/crossentropy": 2.2674206495285034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20846696570515633, "step": 4316 }, { "epoch": 0.35983333333333334, "grad_norm": 5.375, "grad_norm_var": 0.06187744140625, "learning_rate": 4e-05, "loss": 5.3712, "loss/crossentropy": 2.384114146232605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240867242217064, "step": 4318 }, { "epoch": 0.36, "grad_norm": 5.15625, "grad_norm_var": 0.037886555989583334, "learning_rate": 4e-05, "loss": 5.3943, "loss/crossentropy": 2.363667756319046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20440954342484474, "step": 4320 }, { "epoch": 0.3601666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.03592122395833333, "learning_rate": 4e-05, "loss": 5.0069, "loss/crossentropy": 2.1359744668006897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19151579588651657, "step": 4322 }, { "epoch": 0.36033333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.029292805989583334, "learning_rate": 4e-05, "loss": 5.3678, "loss/crossentropy": 2.1146958768367767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19935699924826622, "step": 4324 }, { "epoch": 0.3605, "grad_norm": 4.6875, "grad_norm_var": 0.026949055989583335, "learning_rate": 4e-05, "loss": 5.0466, "loss/crossentropy": 1.8558301255106926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1702909618616104, "step": 4326 }, { "epoch": 0.3606666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.024117024739583333, "learning_rate": 4e-05, "loss": 5.2054, "loss/crossentropy": 2.2706758975982666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21369409933686256, "step": 4328 }, { "epoch": 0.36083333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.02916259765625, "learning_rate": 4e-05, "loss": 4.5506, "loss/crossentropy": 2.637549102306366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21644367650151253, "step": 4330 }, { "epoch": 0.361, "grad_norm": 5.1875, "grad_norm_var": 0.033003743489583334, "learning_rate": 4e-05, "loss": 5.4144, "loss/crossentropy": 1.6384681463241577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1661134473979473, "step": 4332 }, { "epoch": 0.3611666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.021484375, "learning_rate": 4e-05, "loss": 5.3093, "loss/crossentropy": 1.6732659563422203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18927017599344254, "step": 4334 }, { "epoch": 0.36133333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.018550618489583334, "learning_rate": 4e-05, "loss": 5.518, "loss/crossentropy": 2.198245033621788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16945981606841087, "step": 4336 }, { "epoch": 0.3615, "grad_norm": 4.75, "grad_norm_var": 0.020442708333333334, "learning_rate": 4e-05, "loss": 4.5047, "loss/crossentropy": 2.4632855653762817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22415439784526825, "step": 4338 }, { "epoch": 0.3616666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.059228515625, "learning_rate": 4e-05, "loss": 5.4676, "loss/crossentropy": 2.4306774139404297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2453540340065956, "step": 4340 }, { "epoch": 0.36183333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.05826416015625, "learning_rate": 4e-05, "loss": 4.868, "loss/crossentropy": 2.255247950553894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19311066716909409, "step": 4342 }, { "epoch": 0.362, "grad_norm": 5.1875, "grad_norm_var": 0.060282389322916664, "learning_rate": 4e-05, "loss": 4.9875, "loss/crossentropy": 1.7551113069057465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17623233050107956, "step": 4344 }, { "epoch": 0.3621666666666667, "grad_norm": 5.625, "grad_norm_var": 0.07473551432291667, "learning_rate": 4e-05, "loss": 4.9721, "loss/crossentropy": 2.160099893808365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21592870354652405, "step": 4346 }, { "epoch": 0.36233333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.07928059895833334, "learning_rate": 4e-05, "loss": 4.4798, "loss/crossentropy": 2.1821701526641846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061518207192421, "step": 4348 }, { "epoch": 0.3625, "grad_norm": 4.84375, "grad_norm_var": 0.08125, "learning_rate": 4e-05, "loss": 4.5843, "loss/crossentropy": 1.9987001791596413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1731714904308319, "step": 4350 }, { "epoch": 0.3626666666666667, "grad_norm": 5.125, "grad_norm_var": 0.083447265625, "learning_rate": 4e-05, "loss": 4.9023, "loss/crossentropy": 1.9951072037220001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2395828329026699, "step": 4352 }, { "epoch": 0.36283333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.10832926432291666, "learning_rate": 4e-05, "loss": 4.1862, "loss/crossentropy": 1.293114811182022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1551448032259941, "step": 4354 }, { "epoch": 0.363, "grad_norm": 5.125, "grad_norm_var": 0.07701822916666666, "learning_rate": 4e-05, "loss": 5.3229, "loss/crossentropy": 1.7663483619689941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1607537753880024, "step": 4356 }, { "epoch": 0.3631666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.07620035807291667, "learning_rate": 4e-05, "loss": 5.3652, "loss/crossentropy": 2.1835967004299164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20054300501942635, "step": 4358 }, { "epoch": 0.36333333333333334, "grad_norm": 5.40625, "grad_norm_var": 0.08502604166666666, "learning_rate": 4e-05, "loss": 4.8332, "loss/crossentropy": 1.9410650432109833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1902483105659485, "step": 4360 }, { "epoch": 0.3635, "grad_norm": 4.78125, "grad_norm_var": 0.06321207682291667, "learning_rate": 4e-05, "loss": 4.0222, "loss/crossentropy": 2.366856336593628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30182891711592674, "step": 4362 }, { "epoch": 0.3636666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.06246337890625, "learning_rate": 4e-05, "loss": 5.0653, "loss/crossentropy": 2.190703809261322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2186608836054802, "step": 4364 }, { "epoch": 0.36383333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.0689453125, "learning_rate": 4e-05, "loss": 4.6313, "loss/crossentropy": 1.8244432508945465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1971954982727766, "step": 4366 }, { "epoch": 0.364, "grad_norm": 6.09375, "grad_norm_var": 0.14373372395833334, "learning_rate": 4e-05, "loss": 5.0425, "loss/crossentropy": 1.3379913195967674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20181451365351677, "step": 4368 }, { "epoch": 0.3641666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.22604166666666667, "learning_rate": 4e-05, "loss": 4.6713, "loss/crossentropy": 2.4003341794013977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20302216708660126, "step": 4370 }, { "epoch": 0.36433333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.22392171223958332, "learning_rate": 4e-05, "loss": 5.2939, "loss/crossentropy": 1.87662872672081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22657470777630806, "step": 4372 }, { "epoch": 0.3645, "grad_norm": 4.8125, "grad_norm_var": 0.230712890625, "learning_rate": 4e-05, "loss": 5.0056, "loss/crossentropy": 2.6067728400230408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22825849428772926, "step": 4374 }, { "epoch": 0.36466666666666664, "grad_norm": 5.1875, "grad_norm_var": 0.22245686848958332, "learning_rate": 4e-05, "loss": 5.048, "loss/crossentropy": 2.219629019498825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24455127492547035, "step": 4376 }, { "epoch": 0.36483333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.22948811848958334, "learning_rate": 4e-05, "loss": 4.9465, "loss/crossentropy": 1.511473461985588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1544812899082899, "step": 4378 }, { "epoch": 0.365, "grad_norm": 5.09375, "grad_norm_var": 0.22342122395833333, "learning_rate": 4e-05, "loss": 5.6686, "loss/crossentropy": 2.0744031071662903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21673217043280602, "step": 4380 }, { "epoch": 0.36516666666666664, "grad_norm": 5.3125, "grad_norm_var": 0.21171468098958332, "learning_rate": 4e-05, "loss": 5.515, "loss/crossentropy": 1.9527825713157654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20629049092531204, "step": 4382 }, { "epoch": 0.36533333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.144921875, "learning_rate": 4e-05, "loss": 5.0875, "loss/crossentropy": 1.833628848195076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18492337316274643, "step": 4384 }, { "epoch": 0.3655, "grad_norm": 4.78125, "grad_norm_var": 0.03218994140625, "learning_rate": 4e-05, "loss": 4.8835, "loss/crossentropy": 1.2569792494177818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13009061105549335, "step": 4386 }, { "epoch": 0.36566666666666664, "grad_norm": 5.15625, "grad_norm_var": 0.039839680989583334, "learning_rate": 4e-05, "loss": 5.0789, "loss/crossentropy": 1.895428627729416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112501822412014, "step": 4388 }, { "epoch": 0.36583333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.06669514973958333, "learning_rate": 4e-05, "loss": 4.4833, "loss/crossentropy": 1.2277986034750938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14113224111497402, "step": 4390 }, { "epoch": 0.366, "grad_norm": 4.8125, "grad_norm_var": 0.06640218098958334, "learning_rate": 4e-05, "loss": 4.8386, "loss/crossentropy": 2.208162397146225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132505401968956, "step": 4392 }, { "epoch": 0.36616666666666664, "grad_norm": 5.125, "grad_norm_var": 0.05885009765625, "learning_rate": 4e-05, "loss": 4.8422, "loss/crossentropy": 2.383415102958679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22132331132888794, "step": 4394 }, { "epoch": 0.36633333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.087109375, "learning_rate": 4e-05, "loss": 4.6894, "loss/crossentropy": 2.077396295964718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17519673332571983, "step": 4396 }, { "epoch": 0.3665, "grad_norm": 5.5, "grad_norm_var": 0.11412760416666666, "learning_rate": 4e-05, "loss": 5.3017, "loss/crossentropy": 2.461028575897217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349405586719513, "step": 4398 }, { "epoch": 0.36666666666666664, "grad_norm": 4.90625, "grad_norm_var": 0.11868489583333333, "learning_rate": 4e-05, "loss": 5.7916, "loss/crossentropy": 2.4471404552459717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2270291931927204, "step": 4400 }, { "epoch": 0.36683333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.11689046223958334, "learning_rate": 4e-05, "loss": 5.0363, "loss/crossentropy": 1.5369725078344345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969006545841694, "step": 4402 }, { "epoch": 0.367, "grad_norm": 5.0, "grad_norm_var": 0.10426025390625, "learning_rate": 4e-05, "loss": 5.6669, "loss/crossentropy": 1.8737575113773346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2562069408595562, "step": 4404 }, { "epoch": 0.36716666666666664, "grad_norm": 4.78125, "grad_norm_var": 0.08183186848958333, "learning_rate": 4e-05, "loss": 4.642, "loss/crossentropy": 1.9986793920397758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19230439141392708, "step": 4406 }, { "epoch": 0.36733333333333335, "grad_norm": 5.09375, "grad_norm_var": 0.07941080729166666, "learning_rate": 4e-05, "loss": 4.8895, "loss/crossentropy": 1.9708809554576874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21388357877731323, "step": 4408 }, { "epoch": 0.3675, "grad_norm": 5.21875, "grad_norm_var": 0.08229166666666667, "learning_rate": 4e-05, "loss": 5.0833, "loss/crossentropy": 1.5469930842518806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15567244589328766, "step": 4410 }, { "epoch": 0.36766666666666664, "grad_norm": 5.15625, "grad_norm_var": 0.05133056640625, "learning_rate": 4e-05, "loss": 5.7301, "loss/crossentropy": 2.365154951810837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022877149283886, "step": 4412 }, { "epoch": 0.36783333333333335, "grad_norm": 4.78125, "grad_norm_var": 0.028629557291666666, "learning_rate": 4e-05, "loss": 4.6753, "loss/crossentropy": 1.574171431362629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17130100168287754, "step": 4414 }, { "epoch": 0.368, "grad_norm": 4.90625, "grad_norm_var": 0.03131510416666667, "learning_rate": 4e-05, "loss": 5.2293, "loss/crossentropy": 2.4405595660209656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22489827498793602, "step": 4416 }, { "epoch": 0.36816666666666664, "grad_norm": 4.875, "grad_norm_var": 0.03043212890625, "learning_rate": 4e-05, "loss": 4.8175, "loss/crossentropy": 2.0399864241480827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18748826161026955, "step": 4418 }, { "epoch": 0.36833333333333335, "grad_norm": 5.25, "grad_norm_var": 0.039351399739583334, "learning_rate": 4e-05, "loss": 4.3589, "loss/crossentropy": 1.993159256875515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920241042971611, "step": 4420 }, { "epoch": 0.3685, "grad_norm": 4.96875, "grad_norm_var": 0.04407552083333333, "learning_rate": 4e-05, "loss": 5.0311, "loss/crossentropy": 1.9237814024090767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1813476886600256, "step": 4422 }, { "epoch": 0.36866666666666664, "grad_norm": 5.09375, "grad_norm_var": 0.04596354166666667, "learning_rate": 4e-05, "loss": 5.0933, "loss/crossentropy": 1.994587004184723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19352146796882153, "step": 4424 }, { "epoch": 0.36883333333333335, "grad_norm": 5.125, "grad_norm_var": 0.05901285807291667, "learning_rate": 4e-05, "loss": 5.3789, "loss/crossentropy": 2.0247163474559784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23301972076296806, "step": 4426 }, { "epoch": 0.369, "grad_norm": 5.03125, "grad_norm_var": 0.06249593098958333, "learning_rate": 4e-05, "loss": 5.0111, "loss/crossentropy": 2.0897902846336365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841061543673277, "step": 4428 }, { "epoch": 0.36916666666666664, "grad_norm": 4.40625, "grad_norm_var": 0.08587239583333334, "learning_rate": 4e-05, "loss": 4.759, "loss/crossentropy": 2.043010212481022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18399441055953503, "step": 4430 }, { "epoch": 0.36933333333333335, "grad_norm": 5.03125, "grad_norm_var": 0.07978108723958334, "learning_rate": 4e-05, "loss": 4.6126, "loss/crossentropy": 1.329975888133049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1455624159425497, "step": 4432 }, { "epoch": 0.3695, "grad_norm": 5.21875, "grad_norm_var": 0.0798828125, "learning_rate": 4e-05, "loss": 4.7537, "loss/crossentropy": 1.5519905239343643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16253823041915894, "step": 4434 }, { "epoch": 0.36966666666666664, "grad_norm": 5.375, "grad_norm_var": 0.11686197916666667, "learning_rate": 4e-05, "loss": 5.2753, "loss/crossentropy": 2.5547273755073547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22284472361207008, "step": 4436 }, { "epoch": 0.36983333333333335, "grad_norm": 4.5625, "grad_norm_var": 0.13437093098958333, "learning_rate": 4e-05, "loss": 4.6019, "loss/crossentropy": 1.815646231174469, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18009398505091667, "step": 4438 }, { "epoch": 0.37, "grad_norm": 5.40625, "grad_norm_var": 0.14185791015625, "learning_rate": 4e-05, "loss": 5.086, "loss/crossentropy": 1.72943264991045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15384770184755325, "step": 4440 }, { "epoch": 0.37016666666666664, "grad_norm": 5.375, "grad_norm_var": 0.13355712890625, "learning_rate": 4e-05, "loss": 4.7693, "loss/crossentropy": 2.069035105407238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1780005767941475, "step": 4442 }, { "epoch": 0.37033333333333335, "grad_norm": 4.875, "grad_norm_var": 0.13001302083333333, "learning_rate": 4e-05, "loss": 4.988, "loss/crossentropy": 1.8963488563895226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16615681909024715, "step": 4444 }, { "epoch": 0.3705, "grad_norm": 4.5, "grad_norm_var": 0.11767171223958334, "learning_rate": 4e-05, "loss": 4.5752, "loss/crossentropy": 1.6237219274044037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18100574985146523, "step": 4446 }, { "epoch": 0.37066666666666664, "grad_norm": 5.6875, "grad_norm_var": 0.17493489583333333, "learning_rate": 4e-05, "loss": 5.2411, "loss/crossentropy": 1.7035855576395988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18641939014196396, "step": 4448 }, { "epoch": 0.37083333333333335, "grad_norm": 4.4375, "grad_norm_var": 0.21393229166666666, "learning_rate": 4e-05, "loss": 5.0545, "loss/crossentropy": 2.348470985889435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21899165213108063, "step": 4450 }, { "epoch": 0.371, "grad_norm": 5.0, "grad_norm_var": 0.161181640625, "learning_rate": 4e-05, "loss": 4.6793, "loss/crossentropy": 2.0913305208086967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19459903612732887, "step": 4452 }, { "epoch": 0.37116666666666664, "grad_norm": 4.40625, "grad_norm_var": 0.17157796223958333, "learning_rate": 4e-05, "loss": 4.7403, "loss/crossentropy": 2.2749286592006683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18400948494672775, "step": 4454 }, { "epoch": 0.37133333333333335, "grad_norm": 4.53125, "grad_norm_var": 0.16903889973958333, "learning_rate": 4e-05, "loss": 4.4669, "loss/crossentropy": 2.459641069173813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20260080322623253, "step": 4456 }, { "epoch": 0.3715, "grad_norm": 5.0625, "grad_norm_var": 0.15579427083333333, "learning_rate": 4e-05, "loss": 5.0665, "loss/crossentropy": 2.2930372953414917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102019041776657, "step": 4458 }, { "epoch": 0.37166666666666665, "grad_norm": 5.0625, "grad_norm_var": 0.16861979166666666, "learning_rate": 4e-05, "loss": 4.9879, "loss/crossentropy": 2.230555236339569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22333020716905594, "step": 4460 }, { "epoch": 0.37183333333333335, "grad_norm": 5.96875, "grad_norm_var": 0.22862955729166667, "learning_rate": 4e-05, "loss": 5.0287, "loss/crossentropy": 2.245620846748352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319910265505314, "step": 4462 }, { "epoch": 0.372, "grad_norm": 4.65625, "grad_norm_var": 0.14010009765625, "learning_rate": 4e-05, "loss": 4.7234, "loss/crossentropy": 1.5217574685811996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15399678982794285, "step": 4464 }, { "epoch": 0.37216666666666665, "grad_norm": 4.8125, "grad_norm_var": 0.13970947265625, "learning_rate": 4e-05, "loss": 5.1308, "loss/crossentropy": 1.8298010528087616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942594312131405, "step": 4466 }, { "epoch": 0.37233333333333335, "grad_norm": 4.5625, "grad_norm_var": 0.149462890625, "learning_rate": 4e-05, "loss": 4.8408, "loss/crossentropy": 1.8616338968276978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17482497915625572, "step": 4468 }, { "epoch": 0.3725, "grad_norm": 5.34375, "grad_norm_var": 0.1765625, "learning_rate": 4e-05, "loss": 5.4852, "loss/crossentropy": 2.097053498029709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076597884297371, "step": 4470 }, { "epoch": 0.37266666666666665, "grad_norm": 5.125, "grad_norm_var": 0.15944010416666668, "learning_rate": 4e-05, "loss": 4.3597, "loss/crossentropy": 2.012524388730526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1794872172176838, "step": 4472 }, { "epoch": 0.37283333333333335, "grad_norm": 5.0, "grad_norm_var": 0.16184895833333332, "learning_rate": 4e-05, "loss": 4.8864, "loss/crossentropy": 1.5553888604044914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15588123723864555, "step": 4474 }, { "epoch": 0.373, "grad_norm": 5.03125, "grad_norm_var": 0.13857014973958334, "learning_rate": 4e-05, "loss": 4.9816, "loss/crossentropy": 2.187106668949127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085089460015297, "step": 4476 }, { "epoch": 0.37316666666666665, "grad_norm": 4.875, "grad_norm_var": 0.08456624348958333, "learning_rate": 4e-05, "loss": 4.8089, "loss/crossentropy": 1.8687955513596535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18639723025262356, "step": 4478 }, { "epoch": 0.37333333333333335, "grad_norm": 4.9375, "grad_norm_var": 0.07550455729166666, "learning_rate": 4e-05, "loss": 4.8763, "loss/crossentropy": 1.53606728464365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15891759283840656, "step": 4480 }, { "epoch": 0.3735, "grad_norm": 4.6875, "grad_norm_var": 0.073681640625, "learning_rate": 4e-05, "loss": 4.8269, "loss/crossentropy": 2.033777177333832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21377325057983398, "step": 4482 }, { "epoch": 0.37366666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.05810139973958333, "learning_rate": 4e-05, "loss": 5.0661, "loss/crossentropy": 2.221651792526245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20696890354156494, "step": 4484 }, { "epoch": 0.37383333333333335, "grad_norm": 4.40625, "grad_norm_var": 0.056966145833333336, "learning_rate": 4e-05, "loss": 4.9653, "loss/crossentropy": 2.0857246443629265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1875857077538967, "step": 4486 }, { "epoch": 0.374, "grad_norm": 4.84375, "grad_norm_var": 0.05701497395833333, "learning_rate": 4e-05, "loss": 4.6214, "loss/crossentropy": 1.5107336938381195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18940279260277748, "step": 4488 }, { "epoch": 0.37416666666666665, "grad_norm": 5.875, "grad_norm_var": 0.10110677083333333, "learning_rate": 4e-05, "loss": 5.4512, "loss/crossentropy": 2.483080804347992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103271558880806, "step": 4490 }, { "epoch": 0.37433333333333335, "grad_norm": 4.6875, "grad_norm_var": 0.11083577473958334, "learning_rate": 4e-05, "loss": 5.0859, "loss/crossentropy": 2.398630738258362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2321801409125328, "step": 4492 }, { "epoch": 0.3745, "grad_norm": 5.09375, "grad_norm_var": 0.112109375, "learning_rate": 4e-05, "loss": 5.1353, "loss/crossentropy": 2.4930431246757507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20315857604146004, "step": 4494 }, { "epoch": 0.37466666666666665, "grad_norm": 4.6875, "grad_norm_var": 0.12353108723958334, "learning_rate": 4e-05, "loss": 4.8157, "loss/crossentropy": 2.4085164666175842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192452736198902, "step": 4496 }, { "epoch": 0.37483333333333335, "grad_norm": 5.03125, "grad_norm_var": 0.11612955729166667, "learning_rate": 4e-05, "loss": 5.4999, "loss/crossentropy": 2.2982660233974457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21146676689386368, "step": 4498 }, { "epoch": 0.375, "grad_norm": 5.0, "grad_norm_var": 0.11431884765625, "learning_rate": 4e-05, "loss": 4.5637, "loss/crossentropy": 1.7181595116853714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1607910357415676, "step": 4500 }, { "epoch": 0.37516666666666665, "grad_norm": 4.78125, "grad_norm_var": 0.08179931640625, "learning_rate": 4e-05, "loss": 4.6966, "loss/crossentropy": 1.7414831668138504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1897329930216074, "step": 4502 }, { "epoch": 0.37533333333333335, "grad_norm": 5.21875, "grad_norm_var": 0.08476155598958333, "learning_rate": 4e-05, "loss": 4.6112, "loss/crossentropy": 1.4592458382248878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842550728470087, "step": 4504 }, { "epoch": 0.3755, "grad_norm": 4.8125, "grad_norm_var": 0.029150390625, "learning_rate": 4e-05, "loss": 4.7776, "loss/crossentropy": 2.0442886650562286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18794919177889824, "step": 4506 }, { "epoch": 0.37566666666666665, "grad_norm": 4.6875, "grad_norm_var": 0.022509765625, "learning_rate": 4e-05, "loss": 4.4669, "loss/crossentropy": 2.0219354778528214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18397306650877, "step": 4508 }, { "epoch": 0.37583333333333335, "grad_norm": 4.6875, "grad_norm_var": 0.023177083333333334, "learning_rate": 4e-05, "loss": 4.807, "loss/crossentropy": 1.6848445013165474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17057411931455135, "step": 4510 }, { "epoch": 0.376, "grad_norm": 4.875, "grad_norm_var": 0.018778483072916668, "learning_rate": 4e-05, "loss": 4.8326, "loss/crossentropy": 1.9318490028381348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17598963528871536, "step": 4512 }, { "epoch": 0.37616666666666665, "grad_norm": 4.96875, "grad_norm_var": 0.018994140625, "learning_rate": 4e-05, "loss": 4.5025, "loss/crossentropy": 2.4948436617851257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19910050928592682, "step": 4514 }, { "epoch": 0.37633333333333335, "grad_norm": 4.9375, "grad_norm_var": 0.02105712890625, "learning_rate": 4e-05, "loss": 5.4404, "loss/crossentropy": 1.8748653531074524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18252076767385006, "step": 4516 }, { "epoch": 0.3765, "grad_norm": 5.09375, "grad_norm_var": 0.024983723958333332, "learning_rate": 4e-05, "loss": 5.1306, "loss/crossentropy": 2.293698728084564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19080045446753502, "step": 4518 }, { "epoch": 0.37666666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.022721354166666666, "learning_rate": 4e-05, "loss": 5.2861, "loss/crossentropy": 2.323317229747772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21780076250433922, "step": 4520 }, { "epoch": 0.37683333333333335, "grad_norm": 4.875, "grad_norm_var": 0.022981770833333335, "learning_rate": 4e-05, "loss": 4.9991, "loss/crossentropy": 1.8763496354222298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1941455528140068, "step": 4522 }, { "epoch": 0.377, "grad_norm": 4.59375, "grad_norm_var": 0.031233723958333334, "learning_rate": 4e-05, "loss": 4.7514, "loss/crossentropy": 1.469950720667839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15184346586465836, "step": 4524 }, { "epoch": 0.37716666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.03345947265625, "learning_rate": 4e-05, "loss": 4.7452, "loss/crossentropy": 1.4754580333828926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14591082371771336, "step": 4526 }, { "epoch": 0.37733333333333335, "grad_norm": 4.84375, "grad_norm_var": 0.036962890625, "learning_rate": 4e-05, "loss": 4.8999, "loss/crossentropy": 2.164148509502411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1979655660688877, "step": 4528 }, { "epoch": 0.3775, "grad_norm": 4.875, "grad_norm_var": 0.038407389322916666, "learning_rate": 4e-05, "loss": 4.9113, "loss/crossentropy": 2.2905170917510986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23950210586190224, "step": 4530 }, { "epoch": 0.37766666666666665, "grad_norm": 4.46875, "grad_norm_var": 0.05904541015625, "learning_rate": 4e-05, "loss": 4.8391, "loss/crossentropy": 1.711202435195446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18126659281551838, "step": 4532 }, { "epoch": 0.37783333333333335, "grad_norm": 4.96875, "grad_norm_var": 0.05487874348958333, "learning_rate": 4e-05, "loss": 5.2967, "loss/crossentropy": 2.019508332014084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19983846321702003, "step": 4534 }, { "epoch": 0.378, "grad_norm": 4.875, "grad_norm_var": 0.04934895833333333, "learning_rate": 4e-05, "loss": 4.6832, "loss/crossentropy": 1.4596013128757477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1501648034900427, "step": 4536 }, { "epoch": 0.37816666666666665, "grad_norm": 5.125, "grad_norm_var": 0.053369140625, "learning_rate": 4e-05, "loss": 4.688, "loss/crossentropy": 1.7826803848147392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20163408666849136, "step": 4538 }, { "epoch": 0.37833333333333335, "grad_norm": 5.03125, "grad_norm_var": 0.06555582682291666, "learning_rate": 4e-05, "loss": 5.2718, "loss/crossentropy": 2.1941796839237213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30321093276143074, "step": 4540 }, { "epoch": 0.3785, "grad_norm": 4.625, "grad_norm_var": 0.066015625, "learning_rate": 4e-05, "loss": 4.7823, "loss/crossentropy": 1.5937781259417534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16691001877188683, "step": 4542 }, { "epoch": 0.37866666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.067041015625, "learning_rate": 4e-05, "loss": 5.1226, "loss/crossentropy": 1.317307323217392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17711128666996956, "step": 4544 }, { "epoch": 0.37883333333333336, "grad_norm": 4.71875, "grad_norm_var": 0.07118733723958333, "learning_rate": 4e-05, "loss": 5.1718, "loss/crossentropy": 2.36332568526268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21637556329369545, "step": 4546 }, { "epoch": 0.379, "grad_norm": 5.375, "grad_norm_var": 0.060807291666666666, "learning_rate": 4e-05, "loss": 4.8774, "loss/crossentropy": 1.8394339084625244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21467747539281845, "step": 4548 }, { "epoch": 0.37916666666666665, "grad_norm": 4.75, "grad_norm_var": 0.06847330729166666, "learning_rate": 4e-05, "loss": 5.0993, "loss/crossentropy": 1.8647487238049507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19115696288645267, "step": 4550 }, { "epoch": 0.37933333333333336, "grad_norm": 5.0625, "grad_norm_var": 0.09465738932291666, "learning_rate": 4e-05, "loss": 5.0467, "loss/crossentropy": 2.344729393720627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2191801331937313, "step": 4552 }, { "epoch": 0.3795, "grad_norm": 5.15625, "grad_norm_var": 0.09576416015625, "learning_rate": 4e-05, "loss": 5.0544, "loss/crossentropy": 0.9832122027873993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.126436535269022, "step": 4554 }, { "epoch": 0.37966666666666665, "grad_norm": 5.125, "grad_norm_var": 0.08508707682291666, "learning_rate": 4e-05, "loss": 4.9261, "loss/crossentropy": 2.408473551273346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232109010219574, "step": 4556 }, { "epoch": 0.37983333333333336, "grad_norm": 5.0625, "grad_norm_var": 0.06808268229166667, "learning_rate": 4e-05, "loss": 4.6928, "loss/crossentropy": 1.7633072063326836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19468731619417667, "step": 4558 }, { "epoch": 0.38, "grad_norm": 5.03125, "grad_norm_var": 0.059098307291666666, "learning_rate": 4e-05, "loss": 4.7743, "loss/crossentropy": 1.9241390004754066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015049308538437, "step": 4560 }, { "epoch": 0.38016666666666665, "grad_norm": 5.125, "grad_norm_var": 0.050374348958333336, "learning_rate": 4e-05, "loss": 4.8489, "loss/crossentropy": 1.9497752413153648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18487541005015373, "step": 4562 }, { "epoch": 0.38033333333333336, "grad_norm": 5.25, "grad_norm_var": 0.04163004557291667, "learning_rate": 4e-05, "loss": 5.0498, "loss/crossentropy": 2.279202699661255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2351878061890602, "step": 4564 }, { "epoch": 0.3805, "grad_norm": 4.8125, "grad_norm_var": 0.038525390625, "learning_rate": 4e-05, "loss": 4.9243, "loss/crossentropy": 2.036419540643692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20054011419415474, "step": 4566 }, { "epoch": 0.38066666666666665, "grad_norm": 5.09375, "grad_norm_var": 0.018880208333333332, "learning_rate": 4e-05, "loss": 5.4103, "loss/crossentropy": 2.19651135802269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140839770436287, "step": 4568 }, { "epoch": 0.38083333333333336, "grad_norm": 4.4375, "grad_norm_var": 0.033980305989583334, "learning_rate": 4e-05, "loss": 4.4335, "loss/crossentropy": 2.5283347964286804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21709014475345612, "step": 4570 }, { "epoch": 0.381, "grad_norm": 4.65625, "grad_norm_var": 0.04034830729166667, "learning_rate": 4e-05, "loss": 4.62, "loss/crossentropy": 1.9986247941851616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19273697584867477, "step": 4572 }, { "epoch": 0.38116666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.04950764973958333, "learning_rate": 4e-05, "loss": 5.1786, "loss/crossentropy": 1.9769628196954727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18022861890494823, "step": 4574 }, { "epoch": 0.38133333333333336, "grad_norm": 5.5, "grad_norm_var": 0.06734619140625, "learning_rate": 4e-05, "loss": 5.084, "loss/crossentropy": 1.9706605598330498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18343539349734783, "step": 4576 }, { "epoch": 0.3815, "grad_norm": 5.65625, "grad_norm_var": 0.15592041015625, "learning_rate": 4e-05, "loss": 4.723, "loss/crossentropy": 2.2917512953281403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.216607965528965, "step": 4578 }, { "epoch": 0.38166666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.15623372395833332, "learning_rate": 4e-05, "loss": 4.8429, "loss/crossentropy": 2.4264036417007446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21467092260718346, "step": 4580 }, { "epoch": 0.38183333333333336, "grad_norm": 5.4375, "grad_norm_var": 0.15740559895833334, "learning_rate": 4e-05, "loss": 5.2089, "loss/crossentropy": 2.411838263273239, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21722927317023277, "step": 4582 }, { "epoch": 0.382, "grad_norm": 4.75, "grad_norm_var": 0.16604410807291667, "learning_rate": 4e-05, "loss": 4.9028, "loss/crossentropy": 1.5032013952732086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.140468442812562, "step": 4584 }, { "epoch": 0.38216666666666665, "grad_norm": 4.65625, "grad_norm_var": 0.14618733723958333, "learning_rate": 4e-05, "loss": 4.6503, "loss/crossentropy": 1.8413489237427711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861533373594284, "step": 4586 }, { "epoch": 0.38233333333333336, "grad_norm": 5.375, "grad_norm_var": 0.12428385416666667, "learning_rate": 4e-05, "loss": 5.4413, "loss/crossentropy": 1.6172087043523788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1677580364048481, "step": 4588 }, { "epoch": 0.3825, "grad_norm": 5.21875, "grad_norm_var": 0.13123372395833333, "learning_rate": 4e-05, "loss": 4.3317, "loss/crossentropy": 1.9612976610660553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23077058792114258, "step": 4590 }, { "epoch": 0.38266666666666665, "grad_norm": 5.25, "grad_norm_var": 0.14269205729166667, "learning_rate": 4e-05, "loss": 4.9979, "loss/crossentropy": 1.6219684183597565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15546699054539204, "step": 4592 }, { "epoch": 0.38283333333333336, "grad_norm": 4.875, "grad_norm_var": 0.063525390625, "learning_rate": 4e-05, "loss": 4.5898, "loss/crossentropy": 1.8427765145897865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1906105801463127, "step": 4594 }, { "epoch": 0.383, "grad_norm": 4.5625, "grad_norm_var": 0.08398030598958334, "learning_rate": 4e-05, "loss": 4.5646, "loss/crossentropy": 2.483305275440216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23165292665362358, "step": 4596 }, { "epoch": 0.38316666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.07870686848958333, "learning_rate": 4e-05, "loss": 4.6242, "loss/crossentropy": 1.164937436580658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.149330236017704, "step": 4598 }, { "epoch": 0.38333333333333336, "grad_norm": 4.90625, "grad_norm_var": 0.08125, "learning_rate": 4e-05, "loss": 4.7377, "loss/crossentropy": 1.1906629279255867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14596578665077686, "step": 4600 }, { "epoch": 0.3835, "grad_norm": 5.1875, "grad_norm_var": 0.7831868489583333, "learning_rate": 4e-05, "loss": 4.8185, "loss/crossentropy": 1.5672737285494804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17864598706364632, "step": 4602 }, { "epoch": 0.38366666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.78707275390625, "learning_rate": 4e-05, "loss": 5.3415, "loss/crossentropy": 1.97494575381279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22634240612387657, "step": 4604 }, { "epoch": 0.38383333333333336, "grad_norm": 4.25, "grad_norm_var": 0.8302693684895833, "learning_rate": 4e-05, "loss": 4.8177, "loss/crossentropy": 2.1882776021957397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2004379890859127, "step": 4606 }, { "epoch": 0.384, "grad_norm": 5.0, "grad_norm_var": 0.8123697916666667, "learning_rate": 4e-05, "loss": 5.3293, "loss/crossentropy": 2.0698306038975716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1867370456457138, "step": 4608 }, { "epoch": 0.38416666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.82799072265625, "learning_rate": 4e-05, "loss": 4.5079, "loss/crossentropy": 2.5524789094924927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.229669027030468, "step": 4610 }, { "epoch": 0.38433333333333336, "grad_norm": 4.875, "grad_norm_var": 0.80015869140625, "learning_rate": 4e-05, "loss": 4.7616, "loss/crossentropy": 2.66834419965744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175808660686016, "step": 4612 }, { "epoch": 0.3845, "grad_norm": 4.71875, "grad_norm_var": 0.7801717122395834, "learning_rate": 4e-05, "loss": 5.358, "loss/crossentropy": 1.8729632422327995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17812915332615376, "step": 4614 }, { "epoch": 0.38466666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.7616495768229167, "learning_rate": 4e-05, "loss": 4.9371, "loss/crossentropy": 1.8376353681087494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18147564306855202, "step": 4616 }, { "epoch": 0.38483333333333336, "grad_norm": 4.75, "grad_norm_var": 0.12734375, "learning_rate": 4e-05, "loss": 4.8466, "loss/crossentropy": 2.1042263209819794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1988103687763214, "step": 4618 }, { "epoch": 0.385, "grad_norm": 4.9375, "grad_norm_var": 0.10914306640625, "learning_rate": 4e-05, "loss": 4.4813, "loss/crossentropy": 1.6850282698869705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18490208312869072, "step": 4620 }, { "epoch": 0.38516666666666666, "grad_norm": 5.59375, "grad_norm_var": 0.08850504557291666, "learning_rate": 4e-05, "loss": 5.1136, "loss/crossentropy": 2.3153931200504303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22859248518943787, "step": 4622 }, { "epoch": 0.38533333333333336, "grad_norm": 5.15625, "grad_norm_var": 0.09101155598958334, "learning_rate": 4e-05, "loss": 4.6581, "loss/crossentropy": 2.230543076992035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22239073365926743, "step": 4624 }, { "epoch": 0.3855, "grad_norm": 4.9375, "grad_norm_var": 0.07851155598958333, "learning_rate": 4e-05, "loss": 5.0687, "loss/crossentropy": 1.9034294560551643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17507881671190262, "step": 4626 }, { "epoch": 0.38566666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.08332926432291667, "learning_rate": 4e-05, "loss": 5.2866, "loss/crossentropy": 1.9101981818675995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21490610018372536, "step": 4628 }, { "epoch": 0.3858333333333333, "grad_norm": 5.125, "grad_norm_var": 0.072119140625, "learning_rate": 4e-05, "loss": 5.136, "loss/crossentropy": 2.074092000722885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18353088945150375, "step": 4630 }, { "epoch": 0.386, "grad_norm": 5.3125, "grad_norm_var": 0.06419270833333333, "learning_rate": 4e-05, "loss": 5.4292, "loss/crossentropy": 2.1906376481056213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22475622966885567, "step": 4632 }, { "epoch": 0.38616666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.05526936848958333, "learning_rate": 4e-05, "loss": 4.9876, "loss/crossentropy": 2.148811638355255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19886000081896782, "step": 4634 }, { "epoch": 0.3863333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.057291666666666664, "learning_rate": 4e-05, "loss": 4.5391, "loss/crossentropy": 2.1843119859695435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001435048878193, "step": 4636 }, { "epoch": 0.3865, "grad_norm": 4.46875, "grad_norm_var": 0.0609375, "learning_rate": 4e-05, "loss": 4.2873, "loss/crossentropy": 1.3234648406505585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17355255037546158, "step": 4638 }, { "epoch": 0.38666666666666666, "grad_norm": 5.25, "grad_norm_var": 0.07340087890625, "learning_rate": 4e-05, "loss": 4.7845, "loss/crossentropy": 2.5076652467250824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20514310523867607, "step": 4640 }, { "epoch": 0.3868333333333333, "grad_norm": 4.875, "grad_norm_var": 0.07643229166666667, "learning_rate": 4e-05, "loss": 5.0443, "loss/crossentropy": 1.0731484815478325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1514381840825081, "step": 4642 }, { "epoch": 0.387, "grad_norm": 5.125, "grad_norm_var": 0.07844645182291667, "learning_rate": 4e-05, "loss": 4.8815, "loss/crossentropy": 1.5910435616970062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16784923523664474, "step": 4644 }, { "epoch": 0.38716666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.08284098307291667, "learning_rate": 4e-05, "loss": 4.9735, "loss/crossentropy": 2.524174451828003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103761024773121, "step": 4646 }, { "epoch": 0.3873333333333333, "grad_norm": 5.0, "grad_norm_var": 0.048563639322916664, "learning_rate": 4e-05, "loss": 5.0155, "loss/crossentropy": 1.8493199050426483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17533520981669426, "step": 4648 }, { "epoch": 0.3875, "grad_norm": 4.59375, "grad_norm_var": 0.049605305989583334, "learning_rate": 4e-05, "loss": 5.0395, "loss/crossentropy": 2.0180707573890686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20294680073857307, "step": 4650 }, { "epoch": 0.38766666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.049605305989583334, "learning_rate": 4e-05, "loss": 5.0705, "loss/crossentropy": 2.2379818856716156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2366400510072708, "step": 4652 }, { "epoch": 0.3878333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.038736979166666664, "learning_rate": 4e-05, "loss": 4.5548, "loss/crossentropy": 2.384194016456604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23891232535243034, "step": 4654 }, { "epoch": 0.388, "grad_norm": 5.375, "grad_norm_var": 0.044775390625, "learning_rate": 4e-05, "loss": 4.8009, "loss/crossentropy": 1.7017896994948387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1889229454100132, "step": 4656 }, { "epoch": 0.38816666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.05585530598958333, "learning_rate": 4e-05, "loss": 4.4232, "loss/crossentropy": 1.9795458614826202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17039254680275917, "step": 4658 }, { "epoch": 0.3883333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.07613525390625, "learning_rate": 4e-05, "loss": 4.7452, "loss/crossentropy": 1.8883531391620636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24201414734125137, "step": 4660 }, { "epoch": 0.3885, "grad_norm": 4.875, "grad_norm_var": 0.074853515625, "learning_rate": 4e-05, "loss": 5.0235, "loss/crossentropy": 1.8122221529483795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21144957840442657, "step": 4662 }, { "epoch": 0.38866666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.08553059895833333, "learning_rate": 4e-05, "loss": 4.4121, "loss/crossentropy": 1.4162417724728584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14702197536826134, "step": 4664 }, { "epoch": 0.3888333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.11549479166666667, "learning_rate": 4e-05, "loss": 4.2224, "loss/crossentropy": 1.2500296533107758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12859608978033066, "step": 4666 }, { "epoch": 0.389, "grad_norm": 5.03125, "grad_norm_var": 0.117822265625, "learning_rate": 4e-05, "loss": 4.4014, "loss/crossentropy": 1.477287195622921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14969945698976517, "step": 4668 }, { "epoch": 0.38916666666666666, "grad_norm": 5.125, "grad_norm_var": 0.120947265625, "learning_rate": 4e-05, "loss": 4.7961, "loss/crossentropy": 2.572759300470352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2284352257847786, "step": 4670 }, { "epoch": 0.3893333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.11588541666666667, "learning_rate": 4e-05, "loss": 4.6865, "loss/crossentropy": 2.2431783378124237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18131668493151665, "step": 4672 }, { "epoch": 0.3895, "grad_norm": 4.96875, "grad_norm_var": 0.13084309895833332, "learning_rate": 4e-05, "loss": 4.7876, "loss/crossentropy": 2.0269206687808037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18316753953695297, "step": 4674 }, { "epoch": 0.38966666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.107421875, "learning_rate": 4e-05, "loss": 5.4435, "loss/crossentropy": 2.206951141357422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20157692208886147, "step": 4676 }, { "epoch": 0.3898333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.11022135416666666, "learning_rate": 4e-05, "loss": 4.936, "loss/crossentropy": 1.9816325455904007, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19688685610890388, "step": 4678 }, { "epoch": 0.39, "grad_norm": 5.15625, "grad_norm_var": 0.11027018229166667, "learning_rate": 4e-05, "loss": 5.7048, "loss/crossentropy": 2.4815438985824585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25391989201307297, "step": 4680 }, { "epoch": 0.39016666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.1109375, "learning_rate": 4e-05, "loss": 5.1386, "loss/crossentropy": 2.527748703956604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069994568824768, "step": 4682 }, { "epoch": 0.3903333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.10948893229166666, "learning_rate": 4e-05, "loss": 4.8226, "loss/crossentropy": 2.2085874676704407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19113614037632942, "step": 4684 }, { "epoch": 0.3905, "grad_norm": 5.03125, "grad_norm_var": 0.105322265625, "learning_rate": 4e-05, "loss": 5.4503, "loss/crossentropy": 2.017712041735649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1863031480461359, "step": 4686 }, { "epoch": 0.39066666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.10198160807291666, "learning_rate": 4e-05, "loss": 4.6944, "loss/crossentropy": 1.8249566927552223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17401202395558357, "step": 4688 }, { "epoch": 0.3908333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.07532145182291666, "learning_rate": 4e-05, "loss": 4.4237, "loss/crossentropy": 2.3632700443267822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909812018275261, "step": 4690 }, { "epoch": 0.391, "grad_norm": 4.5625, "grad_norm_var": 0.07483317057291666, "learning_rate": 4e-05, "loss": 4.3248, "loss/crossentropy": 0.6874695122241974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.09018013998866081, "step": 4692 }, { "epoch": 0.39116666666666666, "grad_norm": 4.5, "grad_norm_var": 0.07688395182291667, "learning_rate": 4e-05, "loss": 5.2917, "loss/crossentropy": 1.7731168419122696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1825818531215191, "step": 4694 }, { "epoch": 0.3913333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.07511393229166667, "learning_rate": 4e-05, "loss": 5.2002, "loss/crossentropy": 1.7397530004382133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17103871516883373, "step": 4696 }, { "epoch": 0.3915, "grad_norm": 4.84375, "grad_norm_var": 0.0484375, "learning_rate": 4e-05, "loss": 4.7215, "loss/crossentropy": 1.8667742162942886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19179138913750648, "step": 4698 }, { "epoch": 0.39166666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.04937744140625, "learning_rate": 4e-05, "loss": 4.8689, "loss/crossentropy": 2.058789312839508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24368786439299583, "step": 4700 }, { "epoch": 0.3918333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.048046875, "learning_rate": 4e-05, "loss": 4.7386, "loss/crossentropy": 1.992527186870575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2063431590795517, "step": 4702 }, { "epoch": 0.392, "grad_norm": 4.6875, "grad_norm_var": 0.05038655598958333, "learning_rate": 4e-05, "loss": 4.3603, "loss/crossentropy": 1.689400039613247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18329921551048756, "step": 4704 }, { "epoch": 0.39216666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.048811848958333334, "learning_rate": 4e-05, "loss": 4.6414, "loss/crossentropy": 2.0435714572668076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18891741707921028, "step": 4706 }, { "epoch": 0.3923333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.04748942057291667, "learning_rate": 4e-05, "loss": 4.8324, "loss/crossentropy": 1.8614301830530167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17250847071409225, "step": 4708 }, { "epoch": 0.3925, "grad_norm": 4.71875, "grad_norm_var": 0.03553059895833333, "learning_rate": 4e-05, "loss": 5.0679, "loss/crossentropy": 2.0357573106884956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1831020563840866, "step": 4710 }, { "epoch": 0.39266666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.036909993489583334, "learning_rate": 4e-05, "loss": 4.6938, "loss/crossentropy": 1.6935219168663025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851331926882267, "step": 4712 }, { "epoch": 0.3928333333333333, "grad_norm": 4.5, "grad_norm_var": 0.04680582682291667, "learning_rate": 4e-05, "loss": 4.9553, "loss/crossentropy": 1.4612598046660423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15830958634614944, "step": 4714 }, { "epoch": 0.393, "grad_norm": 5.15625, "grad_norm_var": 0.056233723958333336, "learning_rate": 4e-05, "loss": 5.5202, "loss/crossentropy": 2.0164549723267555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17883064597845078, "step": 4716 }, { "epoch": 0.39316666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.0845703125, "learning_rate": 4e-05, "loss": 5.1372, "loss/crossentropy": 2.097052186727524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877942606806755, "step": 4718 }, { "epoch": 0.3933333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.07454020182291667, "learning_rate": 4e-05, "loss": 4.4806, "loss/crossentropy": 2.1500546038150787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19928832724690437, "step": 4720 }, { "epoch": 0.3935, "grad_norm": 5.6875, "grad_norm_var": 0.10998942057291666, "learning_rate": 4e-05, "loss": 5.2306, "loss/crossentropy": 1.7852617651224136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16534100286662579, "step": 4722 }, { "epoch": 0.39366666666666666, "grad_norm": 4.5, "grad_norm_var": 0.11151936848958334, "learning_rate": 4e-05, "loss": 4.6223, "loss/crossentropy": 1.8593629002571106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19395272806286812, "step": 4724 }, { "epoch": 0.3938333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.10857747395833334, "learning_rate": 4e-05, "loss": 4.8391, "loss/crossentropy": 2.176317922770977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18766920641064644, "step": 4726 }, { "epoch": 0.394, "grad_norm": 4.9375, "grad_norm_var": 0.10728759765625, "learning_rate": 4e-05, "loss": 4.7861, "loss/crossentropy": 1.5290814563632011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16091121174395084, "step": 4728 }, { "epoch": 0.39416666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.07991129557291667, "learning_rate": 4e-05, "loss": 5.1133, "loss/crossentropy": 2.491378366947174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20404277369379997, "step": 4730 }, { "epoch": 0.3943333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.101416015625, "learning_rate": 4e-05, "loss": 5.0591, "loss/crossentropy": 2.1622492969036102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20248603075742722, "step": 4732 }, { "epoch": 0.3945, "grad_norm": 4.75, "grad_norm_var": 0.09254150390625, "learning_rate": 4e-05, "loss": 4.4277, "loss/crossentropy": 2.1794531047344208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22922635823488235, "step": 4734 }, { "epoch": 0.39466666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.08995768229166666, "learning_rate": 4e-05, "loss": 5.177, "loss/crossentropy": 1.7416007369756699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.181478600949049, "step": 4736 }, { "epoch": 0.3948333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.07745768229166666, "learning_rate": 4e-05, "loss": 5.3699, "loss/crossentropy": 1.9947757422924042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2080913558602333, "step": 4738 }, { "epoch": 0.395, "grad_norm": 4.5625, "grad_norm_var": 0.07239583333333334, "learning_rate": 4e-05, "loss": 4.9546, "loss/crossentropy": 1.2601947486400604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1411193422973156, "step": 4740 }, { "epoch": 0.39516666666666667, "grad_norm": 5.9375, "grad_norm_var": 0.11790364583333333, "learning_rate": 4e-05, "loss": 5.0002, "loss/crossentropy": 1.9214930534362793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973959505558014, "step": 4742 }, { "epoch": 0.3953333333333333, "grad_norm": 5.0, "grad_norm_var": 0.13326416015625, "learning_rate": 4e-05, "loss": 4.72, "loss/crossentropy": 2.6403688788414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22059182822704315, "step": 4744 }, { "epoch": 0.3955, "grad_norm": 4.9375, "grad_norm_var": 0.148291015625, "learning_rate": 4e-05, "loss": 4.8914, "loss/crossentropy": 2.0313423722982407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909470409154892, "step": 4746 }, { "epoch": 0.39566666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.14836832682291667, "learning_rate": 4e-05, "loss": 4.0028, "loss/crossentropy": 0.390984907746315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.08585469983518124, "step": 4748 }, { "epoch": 0.3958333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.147509765625, "learning_rate": 4e-05, "loss": 5.5398, "loss/crossentropy": 1.977037712931633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18233729153871536, "step": 4750 }, { "epoch": 0.396, "grad_norm": 5.40625, "grad_norm_var": 0.25857747395833336, "learning_rate": 4e-05, "loss": 4.7413, "loss/crossentropy": 2.0886579751968384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19855722039937973, "step": 4752 }, { "epoch": 0.39616666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.27277018229166666, "learning_rate": 4e-05, "loss": 4.6178, "loss/crossentropy": 2.5692251324653625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22474710270762444, "step": 4754 }, { "epoch": 0.3963333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.258837890625, "learning_rate": 4e-05, "loss": 5.2294, "loss/crossentropy": 2.397335708141327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20136011764407158, "step": 4756 }, { "epoch": 0.3965, "grad_norm": 4.78125, "grad_norm_var": 0.19625244140625, "learning_rate": 4e-05, "loss": 4.885, "loss/crossentropy": 1.47114946693182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1387049611657858, "step": 4758 }, { "epoch": 0.39666666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.18866780598958333, "learning_rate": 4e-05, "loss": 4.9398, "loss/crossentropy": 1.8775576800107956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18835894763469696, "step": 4760 }, { "epoch": 0.3968333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.18726806640625, "learning_rate": 4e-05, "loss": 5.1243, "loss/crossentropy": 1.7907000631093979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16836080700159073, "step": 4762 }, { "epoch": 0.397, "grad_norm": 4.84375, "grad_norm_var": 0.17512613932291668, "learning_rate": 4e-05, "loss": 5.0837, "loss/crossentropy": 2.0826119109988213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17814225889742374, "step": 4764 }, { "epoch": 0.39716666666666667, "grad_norm": 5.0, "grad_norm_var": 0.17003580729166667, "learning_rate": 4e-05, "loss": 5.1873, "loss/crossentropy": 2.191085457801819, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2033637948334217, "step": 4766 }, { "epoch": 0.3973333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.0205078125, "learning_rate": 4e-05, "loss": 4.4774, "loss/crossentropy": 2.2374271750450134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20148679986596107, "step": 4768 }, { "epoch": 0.3975, "grad_norm": 4.625, "grad_norm_var": 0.017411295572916666, "learning_rate": 4e-05, "loss": 4.9577, "loss/crossentropy": 1.713077962398529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17870762944221497, "step": 4770 }, { "epoch": 0.39766666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.019205729166666668, "learning_rate": 4e-05, "loss": 4.7891, "loss/crossentropy": 2.1501103043556213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18045742809772491, "step": 4772 }, { "epoch": 0.3978333333333333, "grad_norm": 6.0, "grad_norm_var": 0.10403238932291667, "learning_rate": 4e-05, "loss": 4.8548, "loss/crossentropy": 1.9973932579159737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19423209875822067, "step": 4774 }, { "epoch": 0.398, "grad_norm": 4.8125, "grad_norm_var": 0.11031494140625, "learning_rate": 4e-05, "loss": 4.9899, "loss/crossentropy": 1.975264847278595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1853540502488613, "step": 4776 }, { "epoch": 0.39816666666666667, "grad_norm": 4.75, "grad_norm_var": 0.111328125, "learning_rate": 4e-05, "loss": 4.9549, "loss/crossentropy": 1.631221704185009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1606139950454235, "step": 4778 }, { "epoch": 0.3983333333333333, "grad_norm": 4.875, "grad_norm_var": 0.11077067057291666, "learning_rate": 4e-05, "loss": 5.0013, "loss/crossentropy": 1.5004914924502373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1420083437114954, "step": 4780 }, { "epoch": 0.3985, "grad_norm": 4.90625, "grad_norm_var": 0.11002197265625, "learning_rate": 4e-05, "loss": 4.9629, "loss/crossentropy": 2.5754368901252747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215489249676466, "step": 4782 }, { "epoch": 0.39866666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.11923421223958333, "learning_rate": 4e-05, "loss": 4.7496, "loss/crossentropy": 1.8387674316763878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17710744589567184, "step": 4784 }, { "epoch": 0.3988333333333333, "grad_norm": 5.125, "grad_norm_var": 0.10998942057291666, "learning_rate": 4e-05, "loss": 5.1065, "loss/crossentropy": 1.951240062713623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17940466478466988, "step": 4786 }, { "epoch": 0.399, "grad_norm": 5.0, "grad_norm_var": 0.10175374348958334, "learning_rate": 4e-05, "loss": 5.3268, "loss/crossentropy": 2.650111675262451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2154219187796116, "step": 4788 }, { "epoch": 0.39916666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.036572265625, "learning_rate": 4e-05, "loss": 4.9731, "loss/crossentropy": 1.7309883832931519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170034870505333, "step": 4790 }, { "epoch": 0.3993333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.034012858072916666, "learning_rate": 4e-05, "loss": 4.9897, "loss/crossentropy": 1.7699553072452545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1885082796216011, "step": 4792 }, { "epoch": 0.3995, "grad_norm": 4.96875, "grad_norm_var": 0.03306884765625, "learning_rate": 4e-05, "loss": 5.3317, "loss/crossentropy": 1.8295550793409348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17074094712734222, "step": 4794 }, { "epoch": 0.39966666666666667, "grad_norm": 4.75, "grad_norm_var": 0.04583333333333333, "learning_rate": 4e-05, "loss": 5.2288, "loss/crossentropy": 2.3379410803318024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21407023817300797, "step": 4796 }, { "epoch": 0.3998333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.04892171223958333, "learning_rate": 4e-05, "loss": 4.4721, "loss/crossentropy": 1.9539672955870628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17216168344020844, "step": 4798 }, { "epoch": 0.4, "grad_norm": 4.90625, "grad_norm_var": 0.20284830729166667, "learning_rate": 4e-05, "loss": 5.193, "loss/crossentropy": 2.7041677832603455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21110525727272034, "step": 4800 }, { "epoch": 0.40016666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.2056640625, "learning_rate": 4e-05, "loss": 4.7426, "loss/crossentropy": 1.9566970467567444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17532207444310188, "step": 4802 }, { "epoch": 0.4003333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.209228515625, "learning_rate": 4e-05, "loss": 5.0816, "loss/crossentropy": 1.6096001043915749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16063898243010044, "step": 4804 }, { "epoch": 0.4005, "grad_norm": 4.78125, "grad_norm_var": 0.212109375, "learning_rate": 4e-05, "loss": 4.8431, "loss/crossentropy": 1.724914450198412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16081226454116404, "step": 4806 }, { "epoch": 0.40066666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.20445556640625, "learning_rate": 4e-05, "loss": 5.7713, "loss/crossentropy": 1.7849983498454094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18764329701662064, "step": 4808 }, { "epoch": 0.4008333333333333, "grad_norm": 5.25, "grad_norm_var": 0.20894775390625, "learning_rate": 4e-05, "loss": 5.8588, "loss/crossentropy": 2.3519081473350525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21424896642565727, "step": 4810 }, { "epoch": 0.401, "grad_norm": 5.34375, "grad_norm_var": 0.19498697916666666, "learning_rate": 4e-05, "loss": 5.2778, "loss/crossentropy": 1.5925240516662598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17957336083054543, "step": 4812 }, { "epoch": 0.40116666666666667, "grad_norm": 5.375, "grad_norm_var": 0.19231363932291667, "learning_rate": 4e-05, "loss": 5.3563, "loss/crossentropy": 2.3263401687145233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23666761443018913, "step": 4814 }, { "epoch": 0.4013333333333333, "grad_norm": 10.0, "grad_norm_var": 1.5945963541666666, "learning_rate": 4e-05, "loss": 4.823, "loss/crossentropy": 3.0336874127388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21647527068853378, "step": 4816 }, { "epoch": 0.4015, "grad_norm": 5.09375, "grad_norm_var": 1.5526692708333334, "learning_rate": 4e-05, "loss": 4.741, "loss/crossentropy": 2.064057379961014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19196297973394394, "step": 4818 }, { "epoch": 0.40166666666666667, "grad_norm": 4.75, "grad_norm_var": 1.57222900390625, "learning_rate": 4e-05, "loss": 4.8622, "loss/crossentropy": 2.474276751279831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21422725915908813, "step": 4820 }, { "epoch": 0.4018333333333333, "grad_norm": 5.0625, "grad_norm_var": 1.5520792643229167, "learning_rate": 4e-05, "loss": 5.3988, "loss/crossentropy": 2.4372578859329224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20125412195920944, "step": 4822 }, { "epoch": 0.402, "grad_norm": 5.09375, "grad_norm_var": 1.5538899739583334, "learning_rate": 4e-05, "loss": 5.0631, "loss/crossentropy": 2.34754741191864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2127290964126587, "step": 4824 }, { "epoch": 0.4021666666666667, "grad_norm": 5.0, "grad_norm_var": 1.602978515625, "learning_rate": 4e-05, "loss": 4.8608, "loss/crossentropy": 1.7592350095510483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19576997309923172, "step": 4826 }, { "epoch": 0.4023333333333333, "grad_norm": 5.125, "grad_norm_var": 1.5969685872395833, "learning_rate": 4e-05, "loss": 5.0661, "loss/crossentropy": 2.473145544528961, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2341373898088932, "step": 4828 }, { "epoch": 0.4025, "grad_norm": 5.03125, "grad_norm_var": 1.6091796875, "learning_rate": 4e-05, "loss": 4.7009, "loss/crossentropy": 0.9298921674489975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1077762320637703, "step": 4830 }, { "epoch": 0.4026666666666667, "grad_norm": 4.875, "grad_norm_var": 0.052408854166666664, "learning_rate": 4e-05, "loss": 4.5144, "loss/crossentropy": 1.3825726583600044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15989060886204243, "step": 4832 }, { "epoch": 0.4028333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.05169270833333333, "learning_rate": 4e-05, "loss": 4.9293, "loss/crossentropy": 1.9143131226301193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1857363097369671, "step": 4834 }, { "epoch": 0.403, "grad_norm": 4.875, "grad_norm_var": 0.0419921875, "learning_rate": 4e-05, "loss": 4.791, "loss/crossentropy": 1.947355903685093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18314639292657375, "step": 4836 }, { "epoch": 0.4031666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.037353515625, "learning_rate": 4e-05, "loss": 4.9574, "loss/crossentropy": 2.1336475014686584, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22201551869511604, "step": 4838 }, { "epoch": 0.4033333333333333, "grad_norm": 5.375, "grad_norm_var": 0.042431640625, "learning_rate": 4e-05, "loss": 4.5785, "loss/crossentropy": 2.036346584558487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21780623495578766, "step": 4840 }, { "epoch": 0.4035, "grad_norm": 5.03125, "grad_norm_var": 0.02633056640625, "learning_rate": 4e-05, "loss": 4.9651, "loss/crossentropy": 2.0523361265659332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20310677587985992, "step": 4842 }, { "epoch": 0.4036666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.0302734375, "learning_rate": 4e-05, "loss": 5.1269, "loss/crossentropy": 1.9599568769335747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1813591830432415, "step": 4844 }, { "epoch": 0.4038333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.03277587890625, "learning_rate": 4e-05, "loss": 4.9369, "loss/crossentropy": 2.3991090655326843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21284585446119308, "step": 4846 }, { "epoch": 0.404, "grad_norm": 4.875, "grad_norm_var": 0.034749348958333336, "learning_rate": 4e-05, "loss": 4.5958, "loss/crossentropy": 1.9806862249970436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810116320848465, "step": 4848 }, { "epoch": 0.4041666666666667, "grad_norm": 4.75, "grad_norm_var": 0.04934488932291667, "learning_rate": 4e-05, "loss": 4.6824, "loss/crossentropy": 2.1363211572170258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16956374421715736, "step": 4850 }, { "epoch": 0.4043333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.05347900390625, "learning_rate": 4e-05, "loss": 4.6704, "loss/crossentropy": 2.7161881923675537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152242660522461, "step": 4852 }, { "epoch": 0.4045, "grad_norm": 4.71875, "grad_norm_var": 0.052567545572916666, "learning_rate": 4e-05, "loss": 4.915, "loss/crossentropy": 2.2441403567790985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20361649617552757, "step": 4854 }, { "epoch": 0.4046666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.058854166666666666, "learning_rate": 4e-05, "loss": 5.3845, "loss/crossentropy": 2.0099719166755676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22300902009010315, "step": 4856 }, { "epoch": 0.4048333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.05705973307291667, "learning_rate": 4e-05, "loss": 4.9307, "loss/crossentropy": 2.7886196970939636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21208975464105606, "step": 4858 }, { "epoch": 0.405, "grad_norm": 5.0, "grad_norm_var": 0.05188802083333333, "learning_rate": 4e-05, "loss": 4.9334, "loss/crossentropy": 1.954047828912735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21413618698716164, "step": 4860 }, { "epoch": 0.4051666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05286051432291667, "learning_rate": 4e-05, "loss": 5.0889, "loss/crossentropy": 2.416406363248825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19541754946112633, "step": 4862 }, { "epoch": 0.4053333333333333, "grad_norm": 4.625, "grad_norm_var": 0.05859375, "learning_rate": 4e-05, "loss": 4.8194, "loss/crossentropy": 2.1043947488069534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21232957020401955, "step": 4864 }, { "epoch": 0.4055, "grad_norm": 4.625, "grad_norm_var": 0.0431640625, "learning_rate": 4e-05, "loss": 5.0973, "loss/crossentropy": 1.5513064786791801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18353967368602753, "step": 4866 }, { "epoch": 0.4056666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.045145670572916664, "learning_rate": 4e-05, "loss": 4.8616, "loss/crossentropy": 2.0387043803930283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1888214349746704, "step": 4868 }, { "epoch": 0.4058333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.042822265625, "learning_rate": 4e-05, "loss": 5.3838, "loss/crossentropy": 2.5968635082244873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22734135761857033, "step": 4870 }, { "epoch": 0.406, "grad_norm": 5.59375, "grad_norm_var": 0.077197265625, "learning_rate": 4e-05, "loss": 5.6008, "loss/crossentropy": 2.491960108280182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2108830250799656, "step": 4872 }, { "epoch": 0.4061666666666667, "grad_norm": 5.0, "grad_norm_var": 0.07515869140625, "learning_rate": 4e-05, "loss": 4.5119, "loss/crossentropy": 1.766836240887642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16757315024733543, "step": 4874 }, { "epoch": 0.4063333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.07476806640625, "learning_rate": 4e-05, "loss": 5.1455, "loss/crossentropy": 2.424330711364746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23356298729777336, "step": 4876 }, { "epoch": 0.4065, "grad_norm": 4.9375, "grad_norm_var": 0.07401936848958333, "learning_rate": 4e-05, "loss": 4.817, "loss/crossentropy": 1.9086792171001434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21842263266444206, "step": 4878 }, { "epoch": 0.4066666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.066015625, "learning_rate": 4e-05, "loss": 5.3566, "loss/crossentropy": 2.3481759428977966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666499763727188, "step": 4880 }, { "epoch": 0.4068333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.07565104166666667, "learning_rate": 4e-05, "loss": 4.8577, "loss/crossentropy": 1.8519111350178719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18224726244807243, "step": 4882 }, { "epoch": 0.407, "grad_norm": 4.78125, "grad_norm_var": 0.0791015625, "learning_rate": 4e-05, "loss": 5.2082, "loss/crossentropy": 2.2147536873817444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20171857625246048, "step": 4884 }, { "epoch": 0.4071666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.07926025390625, "learning_rate": 4e-05, "loss": 4.8338, "loss/crossentropy": 2.537692904472351, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117961779236794, "step": 4886 }, { "epoch": 0.4073333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.03982747395833333, "learning_rate": 4e-05, "loss": 4.9522, "loss/crossentropy": 2.4090050756931305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2254425659775734, "step": 4888 }, { "epoch": 0.4075, "grad_norm": 4.9375, "grad_norm_var": 0.03957926432291667, "learning_rate": 4e-05, "loss": 4.3431, "loss/crossentropy": 1.156929299235344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14492111094295979, "step": 4890 }, { "epoch": 0.4076666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.044661458333333334, "learning_rate": 4e-05, "loss": 5.3081, "loss/crossentropy": 2.1810811161994934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20051919296383858, "step": 4892 }, { "epoch": 0.4078333333333333, "grad_norm": 5.96875, "grad_norm_var": 0.12571207682291666, "learning_rate": 4e-05, "loss": 4.8635, "loss/crossentropy": 2.264392375946045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23612874373793602, "step": 4894 }, { "epoch": 0.408, "grad_norm": 5.25, "grad_norm_var": 0.12161458333333333, "learning_rate": 4e-05, "loss": 5.0033, "loss/crossentropy": 1.631929226219654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20521026104688644, "step": 4896 }, { "epoch": 0.4081666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.15950113932291668, "learning_rate": 4e-05, "loss": 5.1642, "loss/crossentropy": 2.431085526943207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2308473400771618, "step": 4898 }, { "epoch": 0.4083333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.20800374348958334, "learning_rate": 4e-05, "loss": 5.5606, "loss/crossentropy": 2.380655586719513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22495479881763458, "step": 4900 }, { "epoch": 0.4085, "grad_norm": 5.1875, "grad_norm_var": 0.23761393229166666, "learning_rate": 4e-05, "loss": 4.318, "loss/crossentropy": 2.0739801824092865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816551871597767, "step": 4902 }, { "epoch": 0.4086666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.24060872395833333, "learning_rate": 4e-05, "loss": 4.9015, "loss/crossentropy": 1.5711579322814941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034294791519642, "step": 4904 }, { "epoch": 0.4088333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.25816650390625, "learning_rate": 4e-05, "loss": 4.7011, "loss/crossentropy": 1.5462888479232788, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15808816254138947, "step": 4906 }, { "epoch": 0.409, "grad_norm": 4.59375, "grad_norm_var": 0.2498046875, "learning_rate": 4e-05, "loss": 4.6972, "loss/crossentropy": 1.5701627358794212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15472500771284103, "step": 4908 }, { "epoch": 0.4091666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.18088785807291666, "learning_rate": 4e-05, "loss": 4.4919, "loss/crossentropy": 2.258408010005951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2214752584695816, "step": 4910 }, { "epoch": 0.4093333333333333, "grad_norm": 6.125, "grad_norm_var": 0.26545817057291665, "learning_rate": 4e-05, "loss": 5.6132, "loss/crossentropy": 2.2918245792388916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20702063292264938, "step": 4912 }, { "epoch": 0.4095, "grad_norm": 4.46875, "grad_norm_var": 0.23922119140625, "learning_rate": 4e-05, "loss": 4.9267, "loss/crossentropy": 2.0654823556542397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17589706182479858, "step": 4914 }, { "epoch": 0.4096666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.18759358723958333, "learning_rate": 4e-05, "loss": 4.8291, "loss/crossentropy": 1.9735463857650757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1892772577702999, "step": 4916 }, { "epoch": 0.4098333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.1677734375, "learning_rate": 4e-05, "loss": 5.1753, "loss/crossentropy": 1.4942561835050583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14400891959667206, "step": 4918 }, { "epoch": 0.41, "grad_norm": 5.34375, "grad_norm_var": 0.185546875, "learning_rate": 4e-05, "loss": 5.0549, "loss/crossentropy": 1.840515322983265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930188685655594, "step": 4920 }, { "epoch": 0.4101666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.17565104166666667, "learning_rate": 4e-05, "loss": 4.8569, "loss/crossentropy": 1.399222806096077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20139185898005962, "step": 4922 }, { "epoch": 0.4103333333333333, "grad_norm": 4.875, "grad_norm_var": 0.16612955729166667, "learning_rate": 4e-05, "loss": 5.0999, "loss/crossentropy": 1.889309674501419, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1681222766637802, "step": 4924 }, { "epoch": 0.4105, "grad_norm": 4.6875, "grad_norm_var": 0.16809488932291666, "learning_rate": 4e-05, "loss": 4.6713, "loss/crossentropy": 2.352425366640091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22923307865858078, "step": 4926 }, { "epoch": 0.4106666666666667, "grad_norm": 4.875, "grad_norm_var": 0.06617431640625, "learning_rate": 4e-05, "loss": 5.1291, "loss/crossentropy": 2.288997530937195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21416041254997253, "step": 4928 }, { "epoch": 0.41083333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.05806884765625, "learning_rate": 4e-05, "loss": 4.5875, "loss/crossentropy": 2.1992684602737427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20887574926018715, "step": 4930 }, { "epoch": 0.411, "grad_norm": 5.28125, "grad_norm_var": 0.06334228515625, "learning_rate": 4e-05, "loss": 4.7654, "loss/crossentropy": 2.058065950870514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20995530486106873, "step": 4932 }, { "epoch": 0.4111666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.059619140625, "learning_rate": 4e-05, "loss": 4.8428, "loss/crossentropy": 1.7666442766785622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17294050380587578, "step": 4934 }, { "epoch": 0.41133333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.03240559895833333, "learning_rate": 4e-05, "loss": 4.9497, "loss/crossentropy": 1.412701353430748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15248983725905418, "step": 4936 }, { "epoch": 0.4115, "grad_norm": 4.71875, "grad_norm_var": 0.029911295572916666, "learning_rate": 4e-05, "loss": 4.8646, "loss/crossentropy": 1.9869374781847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19339880347251892, "step": 4938 }, { "epoch": 0.4116666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.03248291015625, "learning_rate": 4e-05, "loss": 4.4866, "loss/crossentropy": 1.2516977936029434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13892163336277008, "step": 4940 }, { "epoch": 0.41183333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.0404296875, "learning_rate": 4e-05, "loss": 5.4393, "loss/crossentropy": 2.3766159415245056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21843568980693817, "step": 4942 }, { "epoch": 0.412, "grad_norm": 4.78125, "grad_norm_var": 0.04959309895833333, "learning_rate": 4e-05, "loss": 5.1146, "loss/crossentropy": 1.8096503615379333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20148269459605217, "step": 4944 }, { "epoch": 0.4121666666666667, "grad_norm": 4.875, "grad_norm_var": 0.044661458333333334, "learning_rate": 4e-05, "loss": 4.7362, "loss/crossentropy": 2.301755279302597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2105144038796425, "step": 4946 }, { "epoch": 0.41233333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.03873291015625, "learning_rate": 4e-05, "loss": 5.2578, "loss/crossentropy": 2.1293097138404846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22269337251782417, "step": 4948 }, { "epoch": 0.4125, "grad_norm": 5.0, "grad_norm_var": 0.036458333333333336, "learning_rate": 4e-05, "loss": 4.7433, "loss/crossentropy": 0.9624597281217575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11805179342627525, "step": 4950 }, { "epoch": 0.4126666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.0375, "learning_rate": 4e-05, "loss": 5.0177, "loss/crossentropy": 2.7045233845710754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20849882066249847, "step": 4952 }, { "epoch": 0.41283333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.040478515625, "learning_rate": 4e-05, "loss": 4.8001, "loss/crossentropy": 2.376677691936493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22974925860762596, "step": 4954 }, { "epoch": 0.413, "grad_norm": 4.875, "grad_norm_var": 0.04052327473958333, "learning_rate": 4e-05, "loss": 5.1012, "loss/crossentropy": 1.7594347819685936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18802068009972572, "step": 4956 }, { "epoch": 0.4131666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.03551025390625, "learning_rate": 4e-05, "loss": 5.3854, "loss/crossentropy": 2.0615014731884003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2038946896791458, "step": 4958 }, { "epoch": 0.41333333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.03592122395833333, "learning_rate": 4e-05, "loss": 4.8311, "loss/crossentropy": 1.5626792162656784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15392129682004452, "step": 4960 }, { "epoch": 0.4135, "grad_norm": 4.90625, "grad_norm_var": 0.03958333333333333, "learning_rate": 4e-05, "loss": 5.3433, "loss/crossentropy": 2.549463391304016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21993155032396317, "step": 4962 }, { "epoch": 0.4136666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.053120930989583336, "learning_rate": 4e-05, "loss": 4.5375, "loss/crossentropy": 1.9942337423563004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19395725801587105, "step": 4964 }, { "epoch": 0.41383333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.06834309895833333, "learning_rate": 4e-05, "loss": 4.7104, "loss/crossentropy": 1.3048944622278214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14355522580444813, "step": 4966 }, { "epoch": 0.414, "grad_norm": 4.75, "grad_norm_var": 0.0759765625, "learning_rate": 4e-05, "loss": 4.9089, "loss/crossentropy": 2.365107297897339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21021969243884087, "step": 4968 }, { "epoch": 0.4141666666666667, "grad_norm": 4.625, "grad_norm_var": 0.08472900390625, "learning_rate": 4e-05, "loss": 5.2194, "loss/crossentropy": 1.9617774188518524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20810309797525406, "step": 4970 }, { "epoch": 0.41433333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.08136393229166666, "learning_rate": 4e-05, "loss": 4.9017, "loss/crossentropy": 2.406768888235092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21964864060282707, "step": 4972 }, { "epoch": 0.4145, "grad_norm": 4.65625, "grad_norm_var": 0.07849934895833334, "learning_rate": 4e-05, "loss": 5.0075, "loss/crossentropy": 1.9754514545202255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20375796221196651, "step": 4974 }, { "epoch": 0.4146666666666667, "grad_norm": 5.125, "grad_norm_var": 0.07923177083333334, "learning_rate": 4e-05, "loss": 4.6253, "loss/crossentropy": 0.8821175321936607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12686316668987274, "step": 4976 }, { "epoch": 0.41483333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.09016927083333333, "learning_rate": 4e-05, "loss": 4.4754, "loss/crossentropy": 1.8768207728862762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18468411453068256, "step": 4978 }, { "epoch": 0.415, "grad_norm": 5.15625, "grad_norm_var": 0.09446207682291667, "learning_rate": 4e-05, "loss": 4.8406, "loss/crossentropy": 1.8561868369579315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19827888533473015, "step": 4980 }, { "epoch": 0.4151666666666667, "grad_norm": 4.875, "grad_norm_var": 0.08046468098958333, "learning_rate": 4e-05, "loss": 5.1432, "loss/crossentropy": 2.181670993566513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666886448860168, "step": 4982 }, { "epoch": 0.41533333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.07532552083333334, "learning_rate": 4e-05, "loss": 4.812, "loss/crossentropy": 1.7170398011803627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21950143948197365, "step": 4984 }, { "epoch": 0.4155, "grad_norm": 5.1875, "grad_norm_var": 0.07823893229166666, "learning_rate": 4e-05, "loss": 5.0194, "loss/crossentropy": 1.7972271963953972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1704818457365036, "step": 4986 }, { "epoch": 0.4156666666666667, "grad_norm": 5.0, "grad_norm_var": 0.08424479166666667, "learning_rate": 4e-05, "loss": 5.1628, "loss/crossentropy": 2.5009243488311768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20981622114777565, "step": 4988 }, { "epoch": 0.41583333333333333, "grad_norm": 4.875, "grad_norm_var": 0.08765869140625, "learning_rate": 4e-05, "loss": 4.6857, "loss/crossentropy": 1.990450143814087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19208272732794285, "step": 4990 }, { "epoch": 0.416, "grad_norm": 5.125, "grad_norm_var": 0.087744140625, "learning_rate": 4e-05, "loss": 4.9785, "loss/crossentropy": 1.6779858320951462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16968519985675812, "step": 4992 }, { "epoch": 0.4161666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.06444905598958334, "learning_rate": 4e-05, "loss": 4.9131, "loss/crossentropy": 2.3368648886680603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20479774847626686, "step": 4994 }, { "epoch": 0.41633333333333333, "grad_norm": 5.125, "grad_norm_var": 0.06220296223958333, "learning_rate": 4e-05, "loss": 4.7284, "loss/crossentropy": 2.203623980283737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22645379230380058, "step": 4996 }, { "epoch": 0.4165, "grad_norm": 4.84375, "grad_norm_var": 0.06614583333333333, "learning_rate": 4e-05, "loss": 4.5558, "loss/crossentropy": 1.6457276046276093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14439713582396507, "step": 4998 }, { "epoch": 0.4166666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.060221354166666664, "learning_rate": 4e-05, "loss": 5.3545, "loss/crossentropy": 1.8733460828661919, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17062750086188316, "step": 5000 }, { "epoch": 0.41683333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.043192545572916664, "learning_rate": 4e-05, "loss": 4.5375, "loss/crossentropy": 1.8087796047329903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17719511315226555, "step": 5002 }, { "epoch": 0.417, "grad_norm": 4.84375, "grad_norm_var": 0.02578125, "learning_rate": 4e-05, "loss": 5.0403, "loss/crossentropy": 2.1366709172725677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200287990272045, "step": 5004 }, { "epoch": 0.4171666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.024637858072916668, "learning_rate": 4e-05, "loss": 4.5753, "loss/crossentropy": 2.0192334055900574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2032482735812664, "step": 5006 }, { "epoch": 0.41733333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.03951822916666667, "learning_rate": 4e-05, "loss": 5.5531, "loss/crossentropy": 1.7714089825749397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18016327545046806, "step": 5008 }, { "epoch": 0.4175, "grad_norm": 5.40625, "grad_norm_var": 0.051981608072916664, "learning_rate": 4e-05, "loss": 5.0002, "loss/crossentropy": 2.296230137348175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2626774534583092, "step": 5010 }, { "epoch": 0.4176666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.05383707682291667, "learning_rate": 4e-05, "loss": 5.2479, "loss/crossentropy": 2.222260892391205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999911367893219, "step": 5012 }, { "epoch": 0.41783333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.064697265625, "learning_rate": 4e-05, "loss": 4.8299, "loss/crossentropy": 1.8547895401716232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18344702012836933, "step": 5014 }, { "epoch": 0.418, "grad_norm": 4.6875, "grad_norm_var": 0.07916259765625, "learning_rate": 4e-05, "loss": 3.9861, "loss/crossentropy": 1.537365846335888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15916940197348595, "step": 5016 }, { "epoch": 0.4181666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.115478515625, "learning_rate": 4e-05, "loss": 5.4498, "loss/crossentropy": 2.237378031015396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22286979109048843, "step": 5018 }, { "epoch": 0.41833333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.12050374348958333, "learning_rate": 4e-05, "loss": 5.3501, "loss/crossentropy": 2.154662251472473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220684964209795, "step": 5020 }, { "epoch": 0.4185, "grad_norm": 5.09375, "grad_norm_var": 0.11252848307291667, "learning_rate": 4e-05, "loss": 5.2871, "loss/crossentropy": 1.9252085089683533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19202982634305954, "step": 5022 }, { "epoch": 0.4186666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.11041259765625, "learning_rate": 4e-05, "loss": 4.6119, "loss/crossentropy": 1.7248478308320045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18302454613149166, "step": 5024 }, { "epoch": 0.41883333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.09351806640625, "learning_rate": 4e-05, "loss": 4.9789, "loss/crossentropy": 2.413954019546509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19942022860050201, "step": 5026 }, { "epoch": 0.419, "grad_norm": 4.84375, "grad_norm_var": 0.09270833333333334, "learning_rate": 4e-05, "loss": 4.5155, "loss/crossentropy": 2.0690543353557587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17765720933675766, "step": 5028 }, { "epoch": 0.4191666666666667, "grad_norm": 4.875, "grad_norm_var": 0.08153889973958334, "learning_rate": 4e-05, "loss": 4.9016, "loss/crossentropy": 1.9299027398228645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17881011590361595, "step": 5030 }, { "epoch": 0.41933333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.05956624348958333, "learning_rate": 4e-05, "loss": 4.945, "loss/crossentropy": 1.9287557378411293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16480084136128426, "step": 5032 }, { "epoch": 0.4195, "grad_norm": 4.9375, "grad_norm_var": 0.026676432291666666, "learning_rate": 4e-05, "loss": 4.9642, "loss/crossentropy": 1.560925267636776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15121719613671303, "step": 5034 }, { "epoch": 0.4196666666666667, "grad_norm": 4.75, "grad_norm_var": 0.019645182291666667, "learning_rate": 4e-05, "loss": 4.846, "loss/crossentropy": 1.8814395442605019, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752840355038643, "step": 5036 }, { "epoch": 0.41983333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.020829264322916666, "learning_rate": 4e-05, "loss": 5.4615, "loss/crossentropy": 2.11568945646286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19466029852628708, "step": 5038 }, { "epoch": 0.42, "grad_norm": 5.125, "grad_norm_var": 0.019820149739583334, "learning_rate": 4e-05, "loss": 4.7966, "loss/crossentropy": 2.5833939909934998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22409406676888466, "step": 5040 }, { "epoch": 0.4201666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.022526041666666666, "learning_rate": 4e-05, "loss": 4.8328, "loss/crossentropy": 1.853715144097805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956741362810135, "step": 5042 }, { "epoch": 0.42033333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.021207682291666665, "learning_rate": 4e-05, "loss": 5.0333, "loss/crossentropy": 1.6487743258476257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1757977306842804, "step": 5044 }, { "epoch": 0.4205, "grad_norm": 4.6875, "grad_norm_var": 0.023942057291666666, "learning_rate": 4e-05, "loss": 4.7196, "loss/crossentropy": 2.2509495317935944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21562692895531654, "step": 5046 }, { "epoch": 0.4206666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.03268229166666667, "learning_rate": 4e-05, "loss": 5.0596, "loss/crossentropy": 2.125421464443207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19918570667505264, "step": 5048 }, { "epoch": 0.42083333333333334, "grad_norm": 4.625, "grad_norm_var": 0.03553059895833333, "learning_rate": 4e-05, "loss": 4.6523, "loss/crossentropy": 1.7011590600013733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1660672463476658, "step": 5050 }, { "epoch": 0.421, "grad_norm": 5.40625, "grad_norm_var": 0.054036458333333336, "learning_rate": 4e-05, "loss": 5.1967, "loss/crossentropy": 2.2367068231105804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934006493538618, "step": 5052 }, { "epoch": 0.4211666666666667, "grad_norm": 5.25, "grad_norm_var": 0.055562337239583336, "learning_rate": 4e-05, "loss": 5.5412, "loss/crossentropy": 2.018110543489456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19330649077892303, "step": 5054 }, { "epoch": 0.42133333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.062174479166666664, "learning_rate": 4e-05, "loss": 4.7395, "loss/crossentropy": 1.915867231786251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18698799423873425, "step": 5056 }, { "epoch": 0.4215, "grad_norm": 5.0625, "grad_norm_var": 0.07667643229166667, "learning_rate": 4e-05, "loss": 5.4262, "loss/crossentropy": 2.3704627454280853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22046199068427086, "step": 5058 }, { "epoch": 0.4216666666666667, "grad_norm": 4.625, "grad_norm_var": 0.08043212890625, "learning_rate": 4e-05, "loss": 4.8319, "loss/crossentropy": 2.6193134784698486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22060347348451614, "step": 5060 }, { "epoch": 0.42183333333333334, "grad_norm": 4.75, "grad_norm_var": 0.07978108723958334, "learning_rate": 4e-05, "loss": 4.5304, "loss/crossentropy": 2.0035160332918167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18313829228281975, "step": 5062 }, { "epoch": 0.422, "grad_norm": 4.6875, "grad_norm_var": 0.08489583333333334, "learning_rate": 4e-05, "loss": 5.3497, "loss/crossentropy": 2.353265404701233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23167630657553673, "step": 5064 }, { "epoch": 0.4221666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.07928059895833334, "learning_rate": 4e-05, "loss": 4.7515, "loss/crossentropy": 2.228081852197647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21101577952504158, "step": 5066 }, { "epoch": 0.42233333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.069384765625, "learning_rate": 4e-05, "loss": 5.2329, "loss/crossentropy": 2.2022290229797363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20957914367318153, "step": 5068 }, { "epoch": 0.4225, "grad_norm": 4.71875, "grad_norm_var": 0.07408854166666666, "learning_rate": 4e-05, "loss": 4.5227, "loss/crossentropy": 1.3012079074978828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1404709778726101, "step": 5070 }, { "epoch": 0.4226666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.06925455729166667, "learning_rate": 4e-05, "loss": 4.2628, "loss/crossentropy": 1.5303220078349113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17120682075619698, "step": 5072 }, { "epoch": 0.42283333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.05364176432291667, "learning_rate": 4e-05, "loss": 4.8328, "loss/crossentropy": 2.4758930802345276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22224238142371178, "step": 5074 }, { "epoch": 0.423, "grad_norm": 5.03125, "grad_norm_var": 0.05194905598958333, "learning_rate": 4e-05, "loss": 5.3414, "loss/crossentropy": 2.2223449051380157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18280233442783356, "step": 5076 }, { "epoch": 0.4231666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.054150390625, "learning_rate": 4e-05, "loss": 5.0858, "loss/crossentropy": 2.2318738102912903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102554365992546, "step": 5078 }, { "epoch": 0.42333333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.04156494140625, "learning_rate": 4e-05, "loss": 4.8605, "loss/crossentropy": 1.872434914112091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1843944452702999, "step": 5080 }, { "epoch": 0.4235, "grad_norm": 5.09375, "grad_norm_var": 0.045817057291666664, "learning_rate": 4e-05, "loss": 4.7855, "loss/crossentropy": 2.177910089492798, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981346271932125, "step": 5082 }, { "epoch": 0.4236666666666667, "grad_norm": 4.625, "grad_norm_var": 0.04592692057291667, "learning_rate": 4e-05, "loss": 4.8335, "loss/crossentropy": 1.09625893086195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17088285274803638, "step": 5084 }, { "epoch": 0.42383333333333334, "grad_norm": 4.875, "grad_norm_var": 0.03638916015625, "learning_rate": 4e-05, "loss": 5.8526, "loss/crossentropy": 1.9430483132600784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1950901709496975, "step": 5086 }, { "epoch": 0.424, "grad_norm": 4.53125, "grad_norm_var": 0.06304931640625, "learning_rate": 4e-05, "loss": 4.7281, "loss/crossentropy": 1.635485090315342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18363632634282112, "step": 5088 }, { "epoch": 0.4241666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.0609375, "learning_rate": 4e-05, "loss": 4.9117, "loss/crossentropy": 1.2679156586527824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16498573496937752, "step": 5090 }, { "epoch": 0.42433333333333334, "grad_norm": 5.84375, "grad_norm_var": 8.680452473958333, "learning_rate": 4e-05, "loss": 5.3373, "loss/crossentropy": 2.0983648747205734, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19808345846831799, "step": 5092 }, { "epoch": 0.4245, "grad_norm": 5.09375, "grad_norm_var": 8.694124348958333, "learning_rate": 4e-05, "loss": 4.9142, "loss/crossentropy": 2.0484844595193863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18929221853613853, "step": 5094 }, { "epoch": 0.4246666666666667, "grad_norm": 5.40625, "grad_norm_var": 8.717041015625, "learning_rate": 4e-05, "loss": 5.1393, "loss/crossentropy": 1.9642380774021149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21084356680512428, "step": 5096 }, { "epoch": 0.42483333333333334, "grad_norm": 4.96875, "grad_norm_var": 22.81953125, "learning_rate": 4e-05, "loss": 4.9222, "loss/crossentropy": 1.8336669728159904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18257286585867405, "step": 5098 }, { "epoch": 0.425, "grad_norm": 5.15625, "grad_norm_var": 22.73375244140625, "learning_rate": 4e-05, "loss": 4.986, "loss/crossentropy": 2.136046200990677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19740736857056618, "step": 5100 }, { "epoch": 0.4251666666666667, "grad_norm": 4.53125, "grad_norm_var": 22.855497233072917, "learning_rate": 4e-05, "loss": 5.3193, "loss/crossentropy": 1.5806643292307854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1525644864886999, "step": 5102 }, { "epoch": 0.42533333333333334, "grad_norm": 4.75, "grad_norm_var": 22.573030598958333, "learning_rate": 4e-05, "loss": 4.8681, "loss/crossentropy": 1.7995612248778343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969994120299816, "step": 5104 }, { "epoch": 0.4255, "grad_norm": 5.09375, "grad_norm_var": 22.454541015625, "learning_rate": 4e-05, "loss": 4.6807, "loss/crossentropy": 1.8234473168849945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21108638122677803, "step": 5106 }, { "epoch": 0.4256666666666667, "grad_norm": 5.125, "grad_norm_var": 15.586962890625, "learning_rate": 4e-05, "loss": 5.1477, "loss/crossentropy": 2.558901071548462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21013934910297394, "step": 5108 }, { "epoch": 0.42583333333333334, "grad_norm": 5.375, "grad_norm_var": 15.538212076822917, "learning_rate": 4e-05, "loss": 4.8511, "loss/crossentropy": 1.5620201379060745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16541289910674095, "step": 5110 }, { "epoch": 0.426, "grad_norm": 4.75, "grad_norm_var": 15.622782389322916, "learning_rate": 4e-05, "loss": 4.8213, "loss/crossentropy": 2.680659532546997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2138008326292038, "step": 5112 }, { "epoch": 0.4261666666666667, "grad_norm": 4.625, "grad_norm_var": 0.06482747395833334, "learning_rate": 4e-05, "loss": 4.7686, "loss/crossentropy": 1.9398740530014038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1988565307110548, "step": 5114 }, { "epoch": 0.42633333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.0630859375, "learning_rate": 4e-05, "loss": 5.5553, "loss/crossentropy": 2.090523838996887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20620887726545334, "step": 5116 }, { "epoch": 0.4265, "grad_norm": 6.21875, "grad_norm_var": 0.15349934895833334, "learning_rate": 4e-05, "loss": 4.8973, "loss/crossentropy": 1.4458559900522232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23421939089894295, "step": 5118 }, { "epoch": 0.4266666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.15455322265625, "learning_rate": 4e-05, "loss": 4.678, "loss/crossentropy": 1.8547161743044853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17052751407027245, "step": 5120 }, { "epoch": 0.42683333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.15501302083333332, "learning_rate": 4e-05, "loss": 4.5453, "loss/crossentropy": 1.6307990327477455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15353327803313732, "step": 5122 }, { "epoch": 0.427, "grad_norm": 4.6875, "grad_norm_var": 0.17401936848958333, "learning_rate": 4e-05, "loss": 4.6401, "loss/crossentropy": 2.0444329008460045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21640102565288544, "step": 5124 }, { "epoch": 0.42716666666666664, "grad_norm": 4.96875, "grad_norm_var": 0.1701171875, "learning_rate": 4e-05, "loss": 5.3645, "loss/crossentropy": 2.4515466690063477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20284807682037354, "step": 5126 }, { "epoch": 0.42733333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.162109375, "learning_rate": 4e-05, "loss": 4.9106, "loss/crossentropy": 1.507865995168686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1698614414781332, "step": 5128 }, { "epoch": 0.4275, "grad_norm": 5.25, "grad_norm_var": 0.16126302083333333, "learning_rate": 4e-05, "loss": 4.8938, "loss/crossentropy": 1.7518939077854156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1792406179010868, "step": 5130 }, { "epoch": 0.42766666666666664, "grad_norm": 5.40625, "grad_norm_var": 0.17263997395833333, "learning_rate": 4e-05, "loss": 4.729, "loss/crossentropy": 1.932149201631546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19870978593826294, "step": 5132 }, { "epoch": 0.42783333333333334, "grad_norm": 4.875, "grad_norm_var": 0.069921875, "learning_rate": 4e-05, "loss": 5.2444, "loss/crossentropy": 2.447461187839508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20525626838207245, "step": 5134 }, { "epoch": 0.428, "grad_norm": 4.875, "grad_norm_var": 0.06666259765625, "learning_rate": 4e-05, "loss": 4.9806, "loss/crossentropy": 2.315503776073456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193039208650589, "step": 5136 }, { "epoch": 0.42816666666666664, "grad_norm": 5.5, "grad_norm_var": 0.08899332682291666, "learning_rate": 4e-05, "loss": 4.8899, "loss/crossentropy": 2.2293468713760376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100820429623127, "step": 5138 }, { "epoch": 0.42833333333333334, "grad_norm": 4.875, "grad_norm_var": 0.06516927083333333, "learning_rate": 4e-05, "loss": 5.0528, "loss/crossentropy": 1.8010507598519325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925482451915741, "step": 5140 }, { "epoch": 0.4285, "grad_norm": 5.0, "grad_norm_var": 0.060933430989583336, "learning_rate": 4e-05, "loss": 4.9691, "loss/crossentropy": 2.4426570534706116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21383054926991463, "step": 5142 }, { "epoch": 0.42866666666666664, "grad_norm": 4.5625, "grad_norm_var": 0.06350504557291667, "learning_rate": 4e-05, "loss": 5.1129, "loss/crossentropy": 1.7086549401283264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16771548986434937, "step": 5144 }, { "epoch": 0.42883333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.06678059895833334, "learning_rate": 4e-05, "loss": 5.1706, "loss/crossentropy": 2.5270156860351562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23303020745515823, "step": 5146 }, { "epoch": 0.429, "grad_norm": 5.0625, "grad_norm_var": 0.05377197265625, "learning_rate": 4e-05, "loss": 5.3792, "loss/crossentropy": 2.2300464510917664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2331315577030182, "step": 5148 }, { "epoch": 0.42916666666666664, "grad_norm": 5.28125, "grad_norm_var": 0.060286458333333334, "learning_rate": 4e-05, "loss": 4.8926, "loss/crossentropy": 1.5624526962637901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16189817152917385, "step": 5150 }, { "epoch": 0.42933333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.06730143229166667, "learning_rate": 4e-05, "loss": 5.3266, "loss/crossentropy": 2.736234724521637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2452528215944767, "step": 5152 }, { "epoch": 0.4295, "grad_norm": 4.75, "grad_norm_var": 0.04892171223958333, "learning_rate": 4e-05, "loss": 4.0593, "loss/crossentropy": 1.1300897151231766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1562607455998659, "step": 5154 }, { "epoch": 0.42966666666666664, "grad_norm": 5.46875, "grad_norm_var": 0.06806233723958334, "learning_rate": 4e-05, "loss": 5.2706, "loss/crossentropy": 2.048402391374111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18175113759934902, "step": 5156 }, { "epoch": 0.42983333333333335, "grad_norm": 4.53125, "grad_norm_var": 0.08134358723958333, "learning_rate": 4e-05, "loss": 4.3642, "loss/crossentropy": 2.2617290019989014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256079152226448, "step": 5158 }, { "epoch": 0.43, "grad_norm": 4.75, "grad_norm_var": 0.07506103515625, "learning_rate": 4e-05, "loss": 4.7489, "loss/crossentropy": 1.670315831899643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18331779912114143, "step": 5160 }, { "epoch": 0.43016666666666664, "grad_norm": 5.1875, "grad_norm_var": 0.06717122395833333, "learning_rate": 4e-05, "loss": 5.3245, "loss/crossentropy": 1.1678467690944672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1482110098004341, "step": 5162 }, { "epoch": 0.43033333333333335, "grad_norm": 5.1875, "grad_norm_var": 0.07580973307291666, "learning_rate": 4e-05, "loss": 4.2844, "loss/crossentropy": 2.235573798418045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24548039212822914, "step": 5164 }, { "epoch": 0.4305, "grad_norm": 5.28125, "grad_norm_var": 0.092578125, "learning_rate": 4e-05, "loss": 4.5618, "loss/crossentropy": 2.2987032532691956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20700116828083992, "step": 5166 }, { "epoch": 0.43066666666666664, "grad_norm": 5.25, "grad_norm_var": 0.11443684895833334, "learning_rate": 4e-05, "loss": 5.6446, "loss/crossentropy": 2.2951321601867676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21205606684088707, "step": 5168 }, { "epoch": 0.43083333333333335, "grad_norm": 5.375, "grad_norm_var": 0.11767171223958334, "learning_rate": 4e-05, "loss": 5.4096, "loss/crossentropy": 2.568745195865631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20844319835305214, "step": 5170 }, { "epoch": 0.431, "grad_norm": 5.25, "grad_norm_var": 0.11890869140625, "learning_rate": 4e-05, "loss": 5.3175, "loss/crossentropy": 2.58900648355484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21212553977966309, "step": 5172 }, { "epoch": 0.43116666666666664, "grad_norm": 4.5625, "grad_norm_var": 0.11170247395833334, "learning_rate": 4e-05, "loss": 4.6694, "loss/crossentropy": 1.557967871427536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21451672539114952, "step": 5174 }, { "epoch": 0.43133333333333335, "grad_norm": 4.59375, "grad_norm_var": 0.11808268229166667, "learning_rate": 4e-05, "loss": 4.841, "loss/crossentropy": 2.51181161403656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21414360031485558, "step": 5176 }, { "epoch": 0.4315, "grad_norm": 5.40625, "grad_norm_var": 0.12913004557291666, "learning_rate": 4e-05, "loss": 5.4822, "loss/crossentropy": 2.557952344417572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21239551529288292, "step": 5178 }, { "epoch": 0.43166666666666664, "grad_norm": 4.71875, "grad_norm_var": 0.13228759765625, "learning_rate": 4e-05, "loss": 4.3222, "loss/crossentropy": 0.8792792856693268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10647947527468204, "step": 5180 }, { "epoch": 0.43183333333333335, "grad_norm": 5.0, "grad_norm_var": 0.10898030598958333, "learning_rate": 4e-05, "loss": 4.4222, "loss/crossentropy": 1.6791368499398232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18098345771431923, "step": 5182 }, { "epoch": 0.432, "grad_norm": 4.875, "grad_norm_var": 0.08318684895833334, "learning_rate": 4e-05, "loss": 4.7292, "loss/crossentropy": 2.429518163204193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2394704520702362, "step": 5184 }, { "epoch": 0.43216666666666664, "grad_norm": 4.8125, "grad_norm_var": 0.065625, "learning_rate": 4e-05, "loss": 5.0259, "loss/crossentropy": 1.6873462200164795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16258270666003227, "step": 5186 }, { "epoch": 0.43233333333333335, "grad_norm": 4.875, "grad_norm_var": 0.051285807291666666, "learning_rate": 4e-05, "loss": 4.8518, "loss/crossentropy": 2.167505532503128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19529220461845398, "step": 5188 }, { "epoch": 0.4325, "grad_norm": 4.84375, "grad_norm_var": 0.04334309895833333, "learning_rate": 4e-05, "loss": 5.1402, "loss/crossentropy": 1.5462488010525703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15644877590239048, "step": 5190 }, { "epoch": 0.43266666666666664, "grad_norm": 5.25, "grad_norm_var": 0.046284993489583336, "learning_rate": 4e-05, "loss": 5.1826, "loss/crossentropy": 1.9765098094940186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2178989090025425, "step": 5192 }, { "epoch": 0.43283333333333335, "grad_norm": 5.125, "grad_norm_var": 0.02320556640625, "learning_rate": 4e-05, "loss": 4.9383, "loss/crossentropy": 2.181311994791031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18956802040338516, "step": 5194 }, { "epoch": 0.433, "grad_norm": 4.75, "grad_norm_var": 0.02037353515625, "learning_rate": 4e-05, "loss": 4.545, "loss/crossentropy": 2.1313266456127167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24127675592899323, "step": 5196 }, { "epoch": 0.43316666666666664, "grad_norm": 4.78125, "grad_norm_var": 0.020052083333333335, "learning_rate": 4e-05, "loss": 4.9439, "loss/crossentropy": 1.9447228908538818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084346041083336, "step": 5198 }, { "epoch": 0.43333333333333335, "grad_norm": 4.65625, "grad_norm_var": 0.023697916666666666, "learning_rate": 4e-05, "loss": 4.7933, "loss/crossentropy": 2.6325061917304993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256360411643982, "step": 5200 }, { "epoch": 0.4335, "grad_norm": 4.875, "grad_norm_var": 0.023291015625, "learning_rate": 4e-05, "loss": 4.7973, "loss/crossentropy": 1.343225508928299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1388614997267723, "step": 5202 }, { "epoch": 0.43366666666666664, "grad_norm": 4.90625, "grad_norm_var": 0.031571451822916666, "learning_rate": 4e-05, "loss": 5.1977, "loss/crossentropy": 2.4446049332618713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23277466744184494, "step": 5204 }, { "epoch": 0.43383333333333335, "grad_norm": 5.21875, "grad_norm_var": 0.04006754557291667, "learning_rate": 4e-05, "loss": 4.991, "loss/crossentropy": 2.2790105640888214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22518282383680344, "step": 5206 }, { "epoch": 0.434, "grad_norm": 5.0, "grad_norm_var": 0.02945556640625, "learning_rate": 4e-05, "loss": 4.7089, "loss/crossentropy": 2.0995849072933197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328692339360714, "step": 5208 }, { "epoch": 0.43416666666666665, "grad_norm": 4.75, "grad_norm_var": 0.03365478515625, "learning_rate": 4e-05, "loss": 5.2833, "loss/crossentropy": 2.38425749540329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22432530671358109, "step": 5210 }, { "epoch": 0.43433333333333335, "grad_norm": 5.3125, "grad_norm_var": 0.04550374348958333, "learning_rate": 4e-05, "loss": 4.9349, "loss/crossentropy": 2.3226277828216553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2002677097916603, "step": 5212 }, { "epoch": 0.4345, "grad_norm": 4.75, "grad_norm_var": 0.045947265625, "learning_rate": 4e-05, "loss": 5.1692, "loss/crossentropy": 2.459660768508911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22898616641759872, "step": 5214 }, { "epoch": 0.43466666666666665, "grad_norm": 5.03125, "grad_norm_var": 0.052587890625, "learning_rate": 4e-05, "loss": 4.7851, "loss/crossentropy": 1.7144055142998695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18077606335282326, "step": 5216 }, { "epoch": 0.43483333333333335, "grad_norm": 5.46875, "grad_norm_var": 0.06946614583333334, "learning_rate": 4e-05, "loss": 5.4372, "loss/crossentropy": 1.9776408672332764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18408696725964546, "step": 5218 }, { "epoch": 0.435, "grad_norm": 5.53125, "grad_norm_var": 0.07146809895833334, "learning_rate": 4e-05, "loss": 4.7817, "loss/crossentropy": 1.2995164021849632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1558908000588417, "step": 5220 }, { "epoch": 0.43516666666666665, "grad_norm": 4.6875, "grad_norm_var": 0.07316080729166667, "learning_rate": 4e-05, "loss": 4.5949, "loss/crossentropy": 1.725936196744442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1701155286282301, "step": 5222 }, { "epoch": 0.43533333333333335, "grad_norm": 4.9375, "grad_norm_var": 0.09664306640625, "learning_rate": 4e-05, "loss": 5.2427, "loss/crossentropy": 2.4887034595012665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21807188540697098, "step": 5224 }, { "epoch": 0.4355, "grad_norm": 5.0, "grad_norm_var": 0.09021809895833334, "learning_rate": 4e-05, "loss": 4.3535, "loss/crossentropy": 1.597044050693512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17133367992937565, "step": 5226 }, { "epoch": 0.43566666666666665, "grad_norm": 4.75, "grad_norm_var": 0.09550374348958333, "learning_rate": 4e-05, "loss": 5.2829, "loss/crossentropy": 2.532274305820465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22373450174927711, "step": 5228 }, { "epoch": 0.43583333333333335, "grad_norm": 5.3125, "grad_norm_var": 0.10360921223958333, "learning_rate": 4e-05, "loss": 4.6998, "loss/crossentropy": 2.2400224208831787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20386741310358047, "step": 5230 }, { "epoch": 0.436, "grad_norm": 4.8125, "grad_norm_var": 0.10012613932291667, "learning_rate": 4e-05, "loss": 4.8858, "loss/crossentropy": 2.234561562538147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22216036915779114, "step": 5232 }, { "epoch": 0.43616666666666665, "grad_norm": 4.96875, "grad_norm_var": 0.09280192057291667, "learning_rate": 4e-05, "loss": 4.6863, "loss/crossentropy": 1.9974671080708504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18877053633332253, "step": 5234 }, { "epoch": 0.43633333333333335, "grad_norm": 4.8125, "grad_norm_var": 0.06643473307291667, "learning_rate": 4e-05, "loss": 4.8685, "loss/crossentropy": 1.6357809603214264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16103867627680302, "step": 5236 }, { "epoch": 0.4365, "grad_norm": 4.5625, "grad_norm_var": 0.07776285807291666, "learning_rate": 4e-05, "loss": 4.5777, "loss/crossentropy": 1.2994728162884712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1460007168352604, "step": 5238 }, { "epoch": 0.43666666666666665, "grad_norm": 4.84375, "grad_norm_var": 0.06417643229166667, "learning_rate": 4e-05, "loss": 4.9275, "loss/crossentropy": 2.199091613292694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144564837217331, "step": 5240 }, { "epoch": 0.43683333333333335, "grad_norm": 5.0625, "grad_norm_var": 0.07066650390625, "learning_rate": 4e-05, "loss": 4.7809, "loss/crossentropy": 2.668358266353607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22134612500667572, "step": 5242 }, { "epoch": 0.437, "grad_norm": 4.71875, "grad_norm_var": 0.051102701822916666, "learning_rate": 4e-05, "loss": 4.9155, "loss/crossentropy": 1.8332997113466263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1924358643591404, "step": 5244 }, { "epoch": 0.43716666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.056864420572916664, "learning_rate": 4e-05, "loss": 5.0669, "loss/crossentropy": 2.1800646483898163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20784780383110046, "step": 5246 }, { "epoch": 0.43733333333333335, "grad_norm": 5.15625, "grad_norm_var": 0.06339518229166667, "learning_rate": 4e-05, "loss": 4.6012, "loss/crossentropy": 1.463565170764923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1575961671769619, "step": 5248 }, { "epoch": 0.4375, "grad_norm": 5.53125, "grad_norm_var": 0.08502604166666666, "learning_rate": 4e-05, "loss": 5.103, "loss/crossentropy": 2.0702124536037445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20388288795948029, "step": 5250 }, { "epoch": 0.43766666666666665, "grad_norm": 5.15625, "grad_norm_var": 0.09503580729166666, "learning_rate": 4e-05, "loss": 5.2124, "loss/crossentropy": 2.0828773379325867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22234292700886726, "step": 5252 }, { "epoch": 0.43783333333333335, "grad_norm": 4.65625, "grad_norm_var": 0.08854166666666667, "learning_rate": 4e-05, "loss": 5.07, "loss/crossentropy": 2.2166521549224854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087559849023819, "step": 5254 }, { "epoch": 0.438, "grad_norm": 4.6875, "grad_norm_var": 0.09374593098958334, "learning_rate": 4e-05, "loss": 5.3975, "loss/crossentropy": 2.173314794898033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19079249911010265, "step": 5256 }, { "epoch": 0.43816666666666665, "grad_norm": 5.3125, "grad_norm_var": 0.09436442057291666, "learning_rate": 4e-05, "loss": 5.0493, "loss/crossentropy": 2.52229842543602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2559218965470791, "step": 5258 }, { "epoch": 0.43833333333333335, "grad_norm": 4.71875, "grad_norm_var": 0.08440348307291666, "learning_rate": 4e-05, "loss": 5.0227, "loss/crossentropy": 2.1676777005195618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21036133915185928, "step": 5260 }, { "epoch": 0.4385, "grad_norm": 8.5, "grad_norm_var": 0.8345011393229167, "learning_rate": 4e-05, "loss": 5.1052, "loss/crossentropy": 1.9067611992359161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087782323360443, "step": 5262 }, { "epoch": 0.43866666666666665, "grad_norm": 5.3125, "grad_norm_var": 0.82457275390625, "learning_rate": 4e-05, "loss": 5.1143, "loss/crossentropy": 2.3210455179214478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18462146818637848, "step": 5264 }, { "epoch": 0.43883333333333335, "grad_norm": 4.65625, "grad_norm_var": 0.83912353515625, "learning_rate": 4e-05, "loss": 4.959, "loss/crossentropy": 1.3365162461996078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16913891211152077, "step": 5266 }, { "epoch": 0.439, "grad_norm": 5.625, "grad_norm_var": 3.219038899739583, "learning_rate": 4e-05, "loss": 4.8483, "loss/crossentropy": 2.61382520198822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22090576961636543, "step": 5268 }, { "epoch": 0.43916666666666665, "grad_norm": 4.71875, "grad_norm_var": 3.216727701822917, "learning_rate": 4e-05, "loss": 4.9946, "loss/crossentropy": 2.3929781913757324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2272535227239132, "step": 5270 }, { "epoch": 0.43933333333333335, "grad_norm": 5.25, "grad_norm_var": 3.1981608072916665, "learning_rate": 4e-05, "loss": 4.844, "loss/crossentropy": 2.355992078781128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21010024100542068, "step": 5272 }, { "epoch": 0.4395, "grad_norm": 4.65625, "grad_norm_var": 3.239306640625, "learning_rate": 4e-05, "loss": 5.1044, "loss/crossentropy": 2.033922716975212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21508634090423584, "step": 5274 }, { "epoch": 0.43966666666666665, "grad_norm": 5.46875, "grad_norm_var": 3.19302978515625, "learning_rate": 4e-05, "loss": 5.1998, "loss/crossentropy": 2.0046669840812683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1982124038040638, "step": 5276 }, { "epoch": 0.43983333333333335, "grad_norm": 4.875, "grad_norm_var": 2.647591145833333, "learning_rate": 4e-05, "loss": 5.5324, "loss/crossentropy": 2.5933563113212585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21993929892778397, "step": 5278 }, { "epoch": 0.44, "grad_norm": 4.78125, "grad_norm_var": 2.7052042643229166, "learning_rate": 4e-05, "loss": 4.7645, "loss/crossentropy": 1.6767731830477715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17139611765742302, "step": 5280 }, { "epoch": 0.44016666666666665, "grad_norm": 5.0, "grad_norm_var": 2.71695556640625, "learning_rate": 4e-05, "loss": 5.2355, "loss/crossentropy": 1.993862234055996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19030576571822166, "step": 5282 }, { "epoch": 0.44033333333333335, "grad_norm": 5.15625, "grad_norm_var": 0.10299072265625, "learning_rate": 4e-05, "loss": 4.9784, "loss/crossentropy": 1.885690025985241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19479876197874546, "step": 5284 }, { "epoch": 0.4405, "grad_norm": 4.9375, "grad_norm_var": 0.09381510416666666, "learning_rate": 4e-05, "loss": 5.0184, "loss/crossentropy": 2.0076128244400024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18920595943927765, "step": 5286 }, { "epoch": 0.44066666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.08638916015625, "learning_rate": 4e-05, "loss": 5.2074, "loss/crossentropy": 2.4117564260959625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2313983030617237, "step": 5288 }, { "epoch": 0.44083333333333335, "grad_norm": 4.5625, "grad_norm_var": 0.110009765625, "learning_rate": 4e-05, "loss": 4.6849, "loss/crossentropy": 1.497763104736805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16671940125524998, "step": 5290 }, { "epoch": 0.441, "grad_norm": 5.21875, "grad_norm_var": 0.094775390625, "learning_rate": 4e-05, "loss": 5.2303, "loss/crossentropy": 1.4492312595248222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15011000633239746, "step": 5292 }, { "epoch": 0.44116666666666665, "grad_norm": 5.84375, "grad_norm_var": 0.13225504557291667, "learning_rate": 4e-05, "loss": 4.7391, "loss/crossentropy": 2.051460087299347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20685597881674767, "step": 5294 }, { "epoch": 0.44133333333333336, "grad_norm": 5.0, "grad_norm_var": 0.13450520833333332, "learning_rate": 4e-05, "loss": 4.9108, "loss/crossentropy": 2.753096580505371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23638440668582916, "step": 5296 }, { "epoch": 0.4415, "grad_norm": 4.6875, "grad_norm_var": 0.13847249348958332, "learning_rate": 4e-05, "loss": 4.4445, "loss/crossentropy": 2.3885613679885864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244817018508911, "step": 5298 }, { "epoch": 0.44166666666666665, "grad_norm": 4.46875, "grad_norm_var": 0.13592122395833334, "learning_rate": 4e-05, "loss": 5.1649, "loss/crossentropy": 2.4205052852630615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21577660739421844, "step": 5300 }, { "epoch": 0.44183333333333336, "grad_norm": 5.0625, "grad_norm_var": 0.15230712890625, "learning_rate": 4e-05, "loss": 5.2738, "loss/crossentropy": 2.2331049740314484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21633132547140121, "step": 5302 }, { "epoch": 0.442, "grad_norm": 4.71875, "grad_norm_var": 0.14698893229166668, "learning_rate": 4e-05, "loss": 4.7522, "loss/crossentropy": 2.1147951781749725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21170584484934807, "step": 5304 }, { "epoch": 0.44216666666666665, "grad_norm": 5.125, "grad_norm_var": 0.12317301432291666, "learning_rate": 4e-05, "loss": 5.2276, "loss/crossentropy": 2.4038414657115936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2141760066151619, "step": 5306 }, { "epoch": 0.44233333333333336, "grad_norm": 5.46875, "grad_norm_var": 0.13899739583333334, "learning_rate": 4e-05, "loss": 5.1861, "loss/crossentropy": 1.9134333208203316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17545868270099163, "step": 5308 }, { "epoch": 0.4425, "grad_norm": 5.5, "grad_norm_var": 0.10569254557291667, "learning_rate": 4e-05, "loss": 4.6515, "loss/crossentropy": 2.0091544091701508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976570524275303, "step": 5310 }, { "epoch": 0.44266666666666665, "grad_norm": 4.875, "grad_norm_var": 0.104931640625, "learning_rate": 4e-05, "loss": 5.3006, "loss/crossentropy": 1.9432563707232475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17379920929670334, "step": 5312 }, { "epoch": 0.44283333333333336, "grad_norm": 4.53125, "grad_norm_var": 0.11717122395833333, "learning_rate": 4e-05, "loss": 4.6163, "loss/crossentropy": 1.1306499615311623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12089996412396431, "step": 5314 }, { "epoch": 0.443, "grad_norm": 4.78125, "grad_norm_var": 0.094921875, "learning_rate": 4e-05, "loss": 4.8005, "loss/crossentropy": 2.579859673976898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22911198064684868, "step": 5316 }, { "epoch": 0.44316666666666665, "grad_norm": 4.65625, "grad_norm_var": 0.08336181640625, "learning_rate": 4e-05, "loss": 5.6242, "loss/crossentropy": 2.7599607706069946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21561214700341225, "step": 5318 }, { "epoch": 0.44333333333333336, "grad_norm": 4.90625, "grad_norm_var": 0.238525390625, "learning_rate": 4e-05, "loss": 5.1176, "loss/crossentropy": 2.208019971847534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21972987428307533, "step": 5320 }, { "epoch": 0.4435, "grad_norm": 5.0, "grad_norm_var": 0.24205729166666667, "learning_rate": 4e-05, "loss": 5.1073, "loss/crossentropy": 2.002477027475834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19431659020483494, "step": 5322 }, { "epoch": 0.44366666666666665, "grad_norm": 4.9375, "grad_norm_var": 0.22418212890625, "learning_rate": 4e-05, "loss": 4.8315, "loss/crossentropy": 2.3529436886310577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23537345603108406, "step": 5324 }, { "epoch": 0.44383333333333336, "grad_norm": 4.78125, "grad_norm_var": 0.21044514973958334, "learning_rate": 4e-05, "loss": 4.9157, "loss/crossentropy": 2.0489574670791626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18815740570425987, "step": 5326 }, { "epoch": 0.444, "grad_norm": 5.59375, "grad_norm_var": 0.23173421223958332, "learning_rate": 4e-05, "loss": 4.9874, "loss/crossentropy": 1.70937280356884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18881511315703392, "step": 5328 }, { "epoch": 0.44416666666666665, "grad_norm": 4.75, "grad_norm_var": 0.20753580729166668, "learning_rate": 4e-05, "loss": 4.9372, "loss/crossentropy": 2.4225960969924927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22473590821027756, "step": 5330 }, { "epoch": 0.44433333333333336, "grad_norm": 4.96875, "grad_norm_var": 0.20299479166666667, "learning_rate": 4e-05, "loss": 4.4347, "loss/crossentropy": 2.0197691321372986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18901053816080093, "step": 5332 }, { "epoch": 0.4445, "grad_norm": 5.84375, "grad_norm_var": 0.23058268229166667, "learning_rate": 4e-05, "loss": 4.9402, "loss/crossentropy": 2.1207685470581055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30563098564743996, "step": 5334 }, { "epoch": 0.44466666666666665, "grad_norm": 5.375, "grad_norm_var": 0.10266520182291666, "learning_rate": 4e-05, "loss": 5.0762, "loss/crossentropy": 2.138897955417633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23109456151723862, "step": 5336 }, { "epoch": 0.44483333333333336, "grad_norm": 5.625, "grad_norm_var": 0.120947265625, "learning_rate": 4e-05, "loss": 5.2804, "loss/crossentropy": 2.353893458843231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2192895971238613, "step": 5338 }, { "epoch": 0.445, "grad_norm": 5.21875, "grad_norm_var": 0.12118733723958333, "learning_rate": 4e-05, "loss": 5.394, "loss/crossentropy": 2.337477147579193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23079849779605865, "step": 5340 }, { "epoch": 0.44516666666666665, "grad_norm": 4.96875, "grad_norm_var": 0.11666259765625, "learning_rate": 4e-05, "loss": 4.7981, "loss/crossentropy": 1.877238281071186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18820980936288834, "step": 5342 }, { "epoch": 0.44533333333333336, "grad_norm": 5.0625, "grad_norm_var": 0.08951416015625, "learning_rate": 4e-05, "loss": 5.0076, "loss/crossentropy": 2.206613063812256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22341401129961014, "step": 5344 }, { "epoch": 0.4455, "grad_norm": 5.125, "grad_norm_var": 0.07415364583333334, "learning_rate": 4e-05, "loss": 4.6358, "loss/crossentropy": 1.4311936870217323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15251067653298378, "step": 5346 }, { "epoch": 0.44566666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.08357747395833333, "learning_rate": 4e-05, "loss": 5.0284, "loss/crossentropy": 2.017084077000618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756956409662962, "step": 5348 }, { "epoch": 0.44583333333333336, "grad_norm": 4.5, "grad_norm_var": 0.068603515625, "learning_rate": 4e-05, "loss": 4.7167, "loss/crossentropy": 1.0140413790941238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1561578195542097, "step": 5350 }, { "epoch": 0.446, "grad_norm": 5.4375, "grad_norm_var": 0.07177327473958334, "learning_rate": 4e-05, "loss": 5.2292, "loss/crossentropy": 1.7443393990397453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18483179062604904, "step": 5352 }, { "epoch": 0.44616666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.06536458333333334, "learning_rate": 4e-05, "loss": 4.7097, "loss/crossentropy": 2.484953999519348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995408609509468, "step": 5354 }, { "epoch": 0.44633333333333336, "grad_norm": 5.125, "grad_norm_var": 0.06027018229166667, "learning_rate": 4e-05, "loss": 4.8299, "loss/crossentropy": 1.8088392987847328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17983365431427956, "step": 5356 }, { "epoch": 0.4465, "grad_norm": 4.78125, "grad_norm_var": 0.06532796223958333, "learning_rate": 4e-05, "loss": 5.2104, "loss/crossentropy": 2.070494204759598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20856108516454697, "step": 5358 }, { "epoch": 0.44666666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.08186442057291667, "learning_rate": 4e-05, "loss": 5.1108, "loss/crossentropy": 1.826586052775383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19579278863966465, "step": 5360 }, { "epoch": 0.44683333333333336, "grad_norm": 4.84375, "grad_norm_var": 0.08690999348958334, "learning_rate": 4e-05, "loss": 4.6195, "loss/crossentropy": 1.4399391859769821, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14422345533967018, "step": 5362 }, { "epoch": 0.447, "grad_norm": 5.40625, "grad_norm_var": 0.10937093098958334, "learning_rate": 4e-05, "loss": 4.6871, "loss/crossentropy": 1.527749978005886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15237411856651306, "step": 5364 }, { "epoch": 0.44716666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.10432535807291667, "learning_rate": 4e-05, "loss": 4.6966, "loss/crossentropy": 2.0162869840860367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18172578141093254, "step": 5366 }, { "epoch": 0.44733333333333336, "grad_norm": 4.84375, "grad_norm_var": 0.09322916666666667, "learning_rate": 4e-05, "loss": 5.3323, "loss/crossentropy": 2.02753733843565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17405341938138008, "step": 5368 }, { "epoch": 0.4475, "grad_norm": 5.25, "grad_norm_var": 0.09101155598958334, "learning_rate": 4e-05, "loss": 4.9713, "loss/crossentropy": 1.9596833288669586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19608371704816818, "step": 5370 }, { "epoch": 0.44766666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.09021809895833334, "learning_rate": 4e-05, "loss": 4.8812, "loss/crossentropy": 1.6146743893623352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17602908238768578, "step": 5372 }, { "epoch": 0.44783333333333336, "grad_norm": 5.03125, "grad_norm_var": 0.08720296223958333, "learning_rate": 4e-05, "loss": 5.2716, "loss/crossentropy": 2.4960675835609436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2139119803905487, "step": 5374 }, { "epoch": 0.448, "grad_norm": 5.15625, "grad_norm_var": 0.07763264973958334, "learning_rate": 4e-05, "loss": 5.3561, "loss/crossentropy": 2.2325498163700104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21507864072918892, "step": 5376 }, { "epoch": 0.44816666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.09062093098958333, "learning_rate": 4e-05, "loss": 4.4405, "loss/crossentropy": 1.2764653414487839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1504039615392685, "step": 5378 }, { "epoch": 0.4483333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.086181640625, "learning_rate": 4e-05, "loss": 5.2766, "loss/crossentropy": 1.6668346375226974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16964828968048096, "step": 5380 }, { "epoch": 0.4485, "grad_norm": 5.875, "grad_norm_var": 1.3891764322916667, "learning_rate": 4e-05, "loss": 4.7435, "loss/crossentropy": 2.3967296481132507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25826695933938026, "step": 5382 }, { "epoch": 0.44866666666666666, "grad_norm": 4.71875, "grad_norm_var": 1.398291015625, "learning_rate": 4e-05, "loss": 4.6032, "loss/crossentropy": 1.7601360231637955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18853917345404625, "step": 5384 }, { "epoch": 0.4488333333333333, "grad_norm": 4.8125, "grad_norm_var": 1.4064453125, "learning_rate": 4e-05, "loss": 5.4187, "loss/crossentropy": 1.5048007890582085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15315121971070766, "step": 5386 }, { "epoch": 0.449, "grad_norm": 4.6875, "grad_norm_var": 1.4166666666666667, "learning_rate": 4e-05, "loss": 4.8313, "loss/crossentropy": 1.4428337439894676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15264881402254105, "step": 5388 }, { "epoch": 0.44916666666666666, "grad_norm": 4.53125, "grad_norm_var": 1.4527994791666667, "learning_rate": 4e-05, "loss": 4.4607, "loss/crossentropy": 1.3986869975924492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17954625561833382, "step": 5390 }, { "epoch": 0.4493333333333333, "grad_norm": 4.625, "grad_norm_var": 1.4571451822916666, "learning_rate": 4e-05, "loss": 4.9726, "loss/crossentropy": 1.787402868270874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20106610283255577, "step": 5392 }, { "epoch": 0.4495, "grad_norm": 4.8125, "grad_norm_var": 1.4185831705729166, "learning_rate": 4e-05, "loss": 4.8828, "loss/crossentropy": 1.5303082168102264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15836665406823158, "step": 5394 }, { "epoch": 0.44966666666666666, "grad_norm": 5.25, "grad_norm_var": 1.4359212239583334, "learning_rate": 4e-05, "loss": 5.115, "loss/crossentropy": 1.9863907098770142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2083004079759121, "step": 5396 }, { "epoch": 0.4498333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.058817545572916664, "learning_rate": 4e-05, "loss": 5.0986, "loss/crossentropy": 2.172573536634445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20910141617059708, "step": 5398 }, { "epoch": 0.45, "grad_norm": 5.625, "grad_norm_var": 0.09016520182291667, "learning_rate": 4e-05, "loss": 5.1335, "loss/crossentropy": 1.42410147190094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17954059317708015, "step": 5400 }, { "epoch": 0.45016666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.09472249348958334, "learning_rate": 4e-05, "loss": 5.0265, "loss/crossentropy": 2.2131763994693756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932872235774994, "step": 5402 }, { "epoch": 0.4503333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.09081624348958334, "learning_rate": 4e-05, "loss": 4.722, "loss/crossentropy": 1.675975002348423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2252166699618101, "step": 5404 }, { "epoch": 0.4505, "grad_norm": 5.46875, "grad_norm_var": 0.10115559895833333, "learning_rate": 4e-05, "loss": 5.3349, "loss/crossentropy": 2.5156899094581604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2156628593802452, "step": 5406 }, { "epoch": 0.45066666666666666, "grad_norm": 5.125, "grad_norm_var": 0.08883056640625, "learning_rate": 4e-05, "loss": 5.3603, "loss/crossentropy": 1.6027273386716843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18063210882246494, "step": 5408 }, { "epoch": 0.4508333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.1025390625, "learning_rate": 4e-05, "loss": 5.3645, "loss/crossentropy": 2.3822204172611237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17875253781676292, "step": 5410 }, { "epoch": 0.451, "grad_norm": 5.3125, "grad_norm_var": 0.11164957682291667, "learning_rate": 4e-05, "loss": 4.9495, "loss/crossentropy": 1.816266119480133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20655769482254982, "step": 5412 }, { "epoch": 0.45116666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.10904947916666667, "learning_rate": 4e-05, "loss": 4.7501, "loss/crossentropy": 2.2936626076698303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20902733877301216, "step": 5414 }, { "epoch": 0.4513333333333333, "grad_norm": 4.75, "grad_norm_var": 0.08551025390625, "learning_rate": 4e-05, "loss": 4.7972, "loss/crossentropy": 1.6542549058794975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17094655334949493, "step": 5416 }, { "epoch": 0.4515, "grad_norm": 4.65625, "grad_norm_var": 0.08479410807291667, "learning_rate": 4e-05, "loss": 4.8174, "loss/crossentropy": 2.0066296085715294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19182928279042244, "step": 5418 }, { "epoch": 0.45166666666666666, "grad_norm": 4.625, "grad_norm_var": 0.08948160807291666, "learning_rate": 4e-05, "loss": 4.8859, "loss/crossentropy": 1.921394057571888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805327869951725, "step": 5420 }, { "epoch": 0.4518333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.06936442057291667, "learning_rate": 4e-05, "loss": 4.0731, "loss/crossentropy": 2.1230402290821075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20618195086717606, "step": 5422 }, { "epoch": 0.452, "grad_norm": 4.65625, "grad_norm_var": 0.06545817057291667, "learning_rate": 4e-05, "loss": 5.0284, "loss/crossentropy": 0.9875800833106041, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1523461416363716, "step": 5424 }, { "epoch": 0.45216666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.058837890625, "learning_rate": 4e-05, "loss": 4.5052, "loss/crossentropy": 1.5282204896211624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15887302160263062, "step": 5426 }, { "epoch": 0.4523333333333333, "grad_norm": 6.5625, "grad_norm_var": 0.23505452473958333, "learning_rate": 4e-05, "loss": 5.1988, "loss/crossentropy": 2.2137202620506287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19665415585041046, "step": 5428 }, { "epoch": 0.4525, "grad_norm": 4.71875, "grad_norm_var": 0.23079427083333334, "learning_rate": 4e-05, "loss": 5.1186, "loss/crossentropy": 1.8932070061564445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16527743637561798, "step": 5430 }, { "epoch": 0.45266666666666666, "grad_norm": 4.75, "grad_norm_var": 0.22766927083333333, "learning_rate": 4e-05, "loss": 4.5121, "loss/crossentropy": 1.0359995141625404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15376388281583786, "step": 5432 }, { "epoch": 0.4528333333333333, "grad_norm": 5.25, "grad_norm_var": 0.22558186848958334, "learning_rate": 4e-05, "loss": 5.3009, "loss/crossentropy": 2.4695218205451965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21178413927555084, "step": 5434 }, { "epoch": 0.453, "grad_norm": 6.28125, "grad_norm_var": 0.31881103515625, "learning_rate": 4e-05, "loss": 5.5794, "loss/crossentropy": 2.6623180508613586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2121230624616146, "step": 5436 }, { "epoch": 0.45316666666666666, "grad_norm": 5.0, "grad_norm_var": 0.29618733723958335, "learning_rate": 4e-05, "loss": 5.2521, "loss/crossentropy": 1.2481238469481468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171453032642603, "step": 5438 }, { "epoch": 0.4533333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.2942708333333333, "learning_rate": 4e-05, "loss": 4.5362, "loss/crossentropy": 1.9702692329883575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18803063035011292, "step": 5440 }, { "epoch": 0.4535, "grad_norm": 4.625, "grad_norm_var": 0.28427327473958336, "learning_rate": 4e-05, "loss": 4.7512, "loss/crossentropy": 1.6472477465867996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19744962453842163, "step": 5442 }, { "epoch": 0.45366666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.15514322916666667, "learning_rate": 4e-05, "loss": 5.3208, "loss/crossentropy": 2.093437224626541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20477236062288284, "step": 5444 }, { "epoch": 0.4538333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.14885660807291667, "learning_rate": 4e-05, "loss": 5.3538, "loss/crossentropy": 2.378512978553772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103431336581707, "step": 5446 }, { "epoch": 0.454, "grad_norm": 4.875, "grad_norm_var": 0.15907796223958334, "learning_rate": 4e-05, "loss": 4.6674, "loss/crossentropy": 2.076447993516922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18661235831677914, "step": 5448 }, { "epoch": 0.45416666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.15637613932291666, "learning_rate": 4e-05, "loss": 5.0477, "loss/crossentropy": 2.223031312227249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23521675914525986, "step": 5450 }, { "epoch": 0.4543333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.05597330729166667, "learning_rate": 4e-05, "loss": 4.7088, "loss/crossentropy": 1.7138950303196907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18349325843155384, "step": 5452 }, { "epoch": 0.4545, "grad_norm": 5.34375, "grad_norm_var": 0.06467692057291667, "learning_rate": 4e-05, "loss": 4.9164, "loss/crossentropy": 1.6598485931754112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1732875034213066, "step": 5454 }, { "epoch": 0.45466666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.058268229166666664, "learning_rate": 4e-05, "loss": 4.7924, "loss/crossentropy": 1.519950993359089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1434491015970707, "step": 5456 }, { "epoch": 0.4548333333333333, "grad_norm": 4.75, "grad_norm_var": 0.05245768229166667, "learning_rate": 4e-05, "loss": 5.2509, "loss/crossentropy": 1.6798207014799118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1604145709425211, "step": 5458 }, { "epoch": 0.455, "grad_norm": 5.0625, "grad_norm_var": 0.04397379557291667, "learning_rate": 4e-05, "loss": 4.9022, "loss/crossentropy": 1.5287635251879692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15692395344376564, "step": 5460 }, { "epoch": 0.45516666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.03996988932291667, "learning_rate": 4e-05, "loss": 4.9569, "loss/crossentropy": 1.8003231510519981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980554684996605, "step": 5462 }, { "epoch": 0.4553333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.04309488932291667, "learning_rate": 4e-05, "loss": 5.4458, "loss/crossentropy": 1.9575251713395119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18819259479641914, "step": 5464 }, { "epoch": 0.4555, "grad_norm": 8.1875, "grad_norm_var": 0.6995930989583333, "learning_rate": 4e-05, "loss": 4.6394, "loss/crossentropy": 0.7658378258347511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11739085428416729, "step": 5466 }, { "epoch": 0.45566666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.7234659830729167, "learning_rate": 4e-05, "loss": 5.2724, "loss/crossentropy": 1.6100464090704918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208255335688591, "step": 5468 }, { "epoch": 0.4558333333333333, "grad_norm": 4.875, "grad_norm_var": 0.7325358072916667, "learning_rate": 4e-05, "loss": 4.5175, "loss/crossentropy": 1.5317303538322449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17215605825185776, "step": 5470 }, { "epoch": 0.456, "grad_norm": 4.875, "grad_norm_var": 0.7384724934895833, "learning_rate": 4e-05, "loss": 5.0997, "loss/crossentropy": 2.1992835104465485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21746817231178284, "step": 5472 }, { "epoch": 0.45616666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.72574462890625, "learning_rate": 4e-05, "loss": 4.9513, "loss/crossentropy": 2.353798985481262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22958621755242348, "step": 5474 }, { "epoch": 0.4563333333333333, "grad_norm": 4.75, "grad_norm_var": 0.7391764322916666, "learning_rate": 4e-05, "loss": 4.6827, "loss/crossentropy": 1.9713662266731262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19430748745799065, "step": 5476 }, { "epoch": 0.4565, "grad_norm": 5.21875, "grad_norm_var": 0.7601847330729167, "learning_rate": 4e-05, "loss": 5.227, "loss/crossentropy": 2.5195890069007874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20874114707112312, "step": 5478 }, { "epoch": 0.45666666666666667, "grad_norm": 4.875, "grad_norm_var": 0.7831339518229167, "learning_rate": 4e-05, "loss": 5.0046, "loss/crossentropy": 2.294678211212158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244003266096115, "step": 5480 }, { "epoch": 0.4568333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.13560791015625, "learning_rate": 4e-05, "loss": 5.0747, "loss/crossentropy": 2.0025685876607895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1901235617697239, "step": 5482 }, { "epoch": 0.457, "grad_norm": 5.0, "grad_norm_var": 0.03528645833333333, "learning_rate": 4e-05, "loss": 4.8902, "loss/crossentropy": 1.941299356520176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1734217330813408, "step": 5484 }, { "epoch": 0.45716666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.04095052083333333, "learning_rate": 4e-05, "loss": 4.7396, "loss/crossentropy": 2.118344932794571, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2606281489133835, "step": 5486 }, { "epoch": 0.4573333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.042704264322916664, "learning_rate": 4e-05, "loss": 5.2228, "loss/crossentropy": 2.236980974674225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2161729745566845, "step": 5488 }, { "epoch": 0.4575, "grad_norm": 4.59375, "grad_norm_var": 0.04644775390625, "learning_rate": 4e-05, "loss": 4.4538, "loss/crossentropy": 1.025296412408352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13174685835838318, "step": 5490 }, { "epoch": 0.45766666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.051171875, "learning_rate": 4e-05, "loss": 4.9829, "loss/crossentropy": 1.4498857855796814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1556770447641611, "step": 5492 }, { "epoch": 0.4578333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.044384765625, "learning_rate": 4e-05, "loss": 5.0496, "loss/crossentropy": 2.1010265946388245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19020748883485794, "step": 5494 }, { "epoch": 0.458, "grad_norm": 4.96875, "grad_norm_var": 0.042578125, "learning_rate": 4e-05, "loss": 4.7502, "loss/crossentropy": 1.1143567636609077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16215180046856403, "step": 5496 }, { "epoch": 0.45816666666666667, "grad_norm": 5.125, "grad_norm_var": 0.04140625, "learning_rate": 4e-05, "loss": 4.9584, "loss/crossentropy": 1.6512196511030197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1721491888165474, "step": 5498 }, { "epoch": 0.4583333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.04664306640625, "learning_rate": 4e-05, "loss": 4.8581, "loss/crossentropy": 1.7085549235343933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1505817025899887, "step": 5500 }, { "epoch": 0.4585, "grad_norm": 4.875, "grad_norm_var": 0.03931884765625, "learning_rate": 4e-05, "loss": 4.7432, "loss/crossentropy": 1.8823091089725494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18096166849136353, "step": 5502 }, { "epoch": 0.45866666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.03802083333333333, "learning_rate": 4e-05, "loss": 4.7849, "loss/crossentropy": 2.0908593386411667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18026093766093254, "step": 5504 }, { "epoch": 0.4588333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.02974853515625, "learning_rate": 4e-05, "loss": 4.611, "loss/crossentropy": 1.2987871691584587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13752495497465134, "step": 5506 }, { "epoch": 0.459, "grad_norm": 5.03125, "grad_norm_var": 0.02760009765625, "learning_rate": 4e-05, "loss": 5.1585, "loss/crossentropy": 1.8118347227573395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16236663702875376, "step": 5508 }, { "epoch": 0.45916666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.06907552083333333, "learning_rate": 4e-05, "loss": 5.3165, "loss/crossentropy": 2.391406774520874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20025787502527237, "step": 5510 }, { "epoch": 0.4593333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.06890869140625, "learning_rate": 4e-05, "loss": 5.2343, "loss/crossentropy": 2.0247348248958588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17275281623005867, "step": 5512 }, { "epoch": 0.4595, "grad_norm": 5.125, "grad_norm_var": 0.07890625, "learning_rate": 4e-05, "loss": 4.3175, "loss/crossentropy": 1.8554309457540512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19631312042474747, "step": 5514 }, { "epoch": 0.45966666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.07928059895833334, "learning_rate": 4e-05, "loss": 4.6764, "loss/crossentropy": 1.0043694823980331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1664719507098198, "step": 5516 }, { "epoch": 0.4598333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.07398681640625, "learning_rate": 4e-05, "loss": 5.132, "loss/crossentropy": 2.534608483314514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2207382507622242, "step": 5518 }, { "epoch": 0.46, "grad_norm": 4.625, "grad_norm_var": 0.0775390625, "learning_rate": 4e-05, "loss": 4.7972, "loss/crossentropy": 1.9613143354654312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2146427035331726, "step": 5520 }, { "epoch": 0.46016666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.08069254557291666, "learning_rate": 4e-05, "loss": 5.2331, "loss/crossentropy": 2.5223607420921326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20144547894597054, "step": 5522 }, { "epoch": 0.4603333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.09659830729166667, "learning_rate": 4e-05, "loss": 5.4748, "loss/crossentropy": 2.462041199207306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21255720406770706, "step": 5524 }, { "epoch": 0.4605, "grad_norm": 4.84375, "grad_norm_var": 0.06861979166666667, "learning_rate": 4e-05, "loss": 5.4283, "loss/crossentropy": 1.8817952871322632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882878541946411, "step": 5526 }, { "epoch": 0.46066666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.07229410807291667, "learning_rate": 4e-05, "loss": 5.3448, "loss/crossentropy": 2.304149329662323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20610129833221436, "step": 5528 }, { "epoch": 0.4608333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.06282145182291667, "learning_rate": 4e-05, "loss": 5.3979, "loss/crossentropy": 2.2624170780181885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21408792212605476, "step": 5530 }, { "epoch": 0.461, "grad_norm": 5.03125, "grad_norm_var": 0.05579020182291667, "learning_rate": 4e-05, "loss": 4.6877, "loss/crossentropy": 1.5460020303726196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15484758466482162, "step": 5532 }, { "epoch": 0.46116666666666667, "grad_norm": 5.25, "grad_norm_var": 0.05836181640625, "learning_rate": 4e-05, "loss": 5.0696, "loss/crossentropy": 1.4943357408046722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16306064277887344, "step": 5534 }, { "epoch": 0.4613333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.0611328125, "learning_rate": 4e-05, "loss": 5.3117, "loss/crossentropy": 2.556882083415985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21715038269758224, "step": 5536 }, { "epoch": 0.4615, "grad_norm": 5.5, "grad_norm_var": 0.08513997395833334, "learning_rate": 4e-05, "loss": 4.7835, "loss/crossentropy": 1.7690436989068985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21724151819944382, "step": 5538 }, { "epoch": 0.46166666666666667, "grad_norm": 5.0, "grad_norm_var": 0.08079427083333333, "learning_rate": 4e-05, "loss": 5.0873, "loss/crossentropy": 1.8304852917790413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18460811860859394, "step": 5540 }, { "epoch": 0.4618333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.08059895833333333, "learning_rate": 4e-05, "loss": 4.4272, "loss/crossentropy": 1.6574642956256866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16919926181435585, "step": 5542 }, { "epoch": 0.462, "grad_norm": 4.34375, "grad_norm_var": 0.10556233723958333, "learning_rate": 4e-05, "loss": 4.5421, "loss/crossentropy": 2.3935444951057434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20081859827041626, "step": 5544 }, { "epoch": 0.46216666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.10155843098958334, "learning_rate": 4e-05, "loss": 5.1131, "loss/crossentropy": 2.4130229353904724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20666294917464256, "step": 5546 }, { "epoch": 0.4623333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.10748697916666666, "learning_rate": 4e-05, "loss": 4.7261, "loss/crossentropy": 2.4770246744155884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202097252011299, "step": 5548 }, { "epoch": 0.4625, "grad_norm": 4.71875, "grad_norm_var": 0.118994140625, "learning_rate": 4e-05, "loss": 4.9954, "loss/crossentropy": 1.7167896926403046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16790836118161678, "step": 5550 }, { "epoch": 0.46266666666666667, "grad_norm": 4.75, "grad_norm_var": 0.10950520833333334, "learning_rate": 4e-05, "loss": 5.0256, "loss/crossentropy": 2.050472140312195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17495042085647583, "step": 5552 }, { "epoch": 0.4628333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.07980143229166667, "learning_rate": 4e-05, "loss": 5.3496, "loss/crossentropy": 2.4477387070655823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21591287106275558, "step": 5554 }, { "epoch": 0.463, "grad_norm": 5.25, "grad_norm_var": 0.20500895182291667, "learning_rate": 4e-05, "loss": 5.2569, "loss/crossentropy": 2.42499315738678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20981594547629356, "step": 5556 }, { "epoch": 0.46316666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.20562744140625, "learning_rate": 4e-05, "loss": 4.5267, "loss/crossentropy": 1.9163185358047485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18663611635565758, "step": 5558 }, { "epoch": 0.4633333333333333, "grad_norm": 5.375, "grad_norm_var": 0.17222900390625, "learning_rate": 4e-05, "loss": 4.9847, "loss/crossentropy": 2.0122427120804787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1882903315126896, "step": 5560 }, { "epoch": 0.4635, "grad_norm": 4.96875, "grad_norm_var": 0.169775390625, "learning_rate": 4e-05, "loss": 5.2916, "loss/crossentropy": 2.5393239855766296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2253870852291584, "step": 5562 }, { "epoch": 0.46366666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.34957275390625, "learning_rate": 4e-05, "loss": 4.9768, "loss/crossentropy": 2.6330828070640564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2185838483273983, "step": 5564 }, { "epoch": 0.4638333333333333, "grad_norm": 6.84375, "grad_norm_var": 0.54713134765625, "learning_rate": 4e-05, "loss": 4.5474, "loss/crossentropy": 1.7194873318076134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1570914275944233, "step": 5566 }, { "epoch": 0.464, "grad_norm": 4.5625, "grad_norm_var": 0.55894775390625, "learning_rate": 4e-05, "loss": 4.655, "loss/crossentropy": 2.1608819663524628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23339637368917465, "step": 5568 }, { "epoch": 0.46416666666666667, "grad_norm": 5.25, "grad_norm_var": 0.56539306640625, "learning_rate": 4e-05, "loss": 5.0446, "loss/crossentropy": 2.66201913356781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21940989419817924, "step": 5570 }, { "epoch": 0.4643333333333333, "grad_norm": 5.5, "grad_norm_var": 0.4955078125, "learning_rate": 4e-05, "loss": 5.2771, "loss/crossentropy": 2.2475315630435944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19876830279827118, "step": 5572 }, { "epoch": 0.4645, "grad_norm": 4.59375, "grad_norm_var": 0.5167805989583333, "learning_rate": 4e-05, "loss": 4.3096, "loss/crossentropy": 1.4486872777342796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1465356983244419, "step": 5574 }, { "epoch": 0.4646666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.5151652018229167, "learning_rate": 4e-05, "loss": 4.9242, "loss/crossentropy": 2.028968036174774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18292693980038166, "step": 5576 }, { "epoch": 0.4648333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.5244425455729167, "learning_rate": 4e-05, "loss": 4.9943, "loss/crossentropy": 2.321197360754013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20540225505828857, "step": 5578 }, { "epoch": 0.465, "grad_norm": 4.8125, "grad_norm_var": 0.3117472330729167, "learning_rate": 4e-05, "loss": 5.019, "loss/crossentropy": 1.9598820507526398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18344413861632347, "step": 5580 }, { "epoch": 0.4651666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.058186848958333336, "learning_rate": 4e-05, "loss": 4.6168, "loss/crossentropy": 2.1863655149936676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22168518975377083, "step": 5582 }, { "epoch": 0.4653333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.11962483723958334, "learning_rate": 4e-05, "loss": 4.7241, "loss/crossentropy": 2.301854968070984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22621898725628853, "step": 5584 }, { "epoch": 0.4655, "grad_norm": 4.96875, "grad_norm_var": 0.12932535807291667, "learning_rate": 4e-05, "loss": 5.5068, "loss/crossentropy": 2.373853385448456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21982931718230247, "step": 5586 }, { "epoch": 0.4656666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.11672770182291667, "learning_rate": 4e-05, "loss": 4.9399, "loss/crossentropy": 1.9027044028043747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18401046097278595, "step": 5588 }, { "epoch": 0.4658333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.105712890625, "learning_rate": 4e-05, "loss": 5.1164, "loss/crossentropy": 2.4487122297286987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22322659194469452, "step": 5590 }, { "epoch": 0.466, "grad_norm": 5.125, "grad_norm_var": 0.10530192057291667, "learning_rate": 4e-05, "loss": 5.1738, "loss/crossentropy": 2.392216980457306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22900298237800598, "step": 5592 }, { "epoch": 0.4661666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.11092122395833333, "learning_rate": 4e-05, "loss": 5.0379, "loss/crossentropy": 2.20323982834816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22030623257160187, "step": 5594 }, { "epoch": 0.4663333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.10611979166666667, "learning_rate": 4e-05, "loss": 5.1038, "loss/crossentropy": 1.8584736064076424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.181080624461174, "step": 5596 }, { "epoch": 0.4665, "grad_norm": 5.1875, "grad_norm_var": 0.10846354166666666, "learning_rate": 4e-05, "loss": 4.5158, "loss/crossentropy": 1.710656851530075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18867123126983643, "step": 5598 }, { "epoch": 0.4666666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.04211832682291667, "learning_rate": 4e-05, "loss": 5.3547, "loss/crossentropy": 2.108785852789879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18798251450061798, "step": 5600 }, { "epoch": 0.4668333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.04000244140625, "learning_rate": 4e-05, "loss": 4.6394, "loss/crossentropy": 2.492998719215393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24159801751375198, "step": 5602 }, { "epoch": 0.467, "grad_norm": 4.96875, "grad_norm_var": 0.03899332682291667, "learning_rate": 4e-05, "loss": 5.0782, "loss/crossentropy": 1.8815812170505524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166886031627655, "step": 5604 }, { "epoch": 0.4671666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.06315104166666667, "learning_rate": 4e-05, "loss": 4.5261, "loss/crossentropy": 1.4593137428164482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.157625250518322, "step": 5606 }, { "epoch": 0.4673333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.06783854166666667, "learning_rate": 4e-05, "loss": 4.8489, "loss/crossentropy": 2.531603217124939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19394199922680855, "step": 5608 }, { "epoch": 0.4675, "grad_norm": 4.75, "grad_norm_var": 0.06327718098958333, "learning_rate": 4e-05, "loss": 4.801, "loss/crossentropy": 1.6611417457461357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18344333954155445, "step": 5610 }, { "epoch": 0.4676666666666667, "grad_norm": 4.5, "grad_norm_var": 0.06311442057291666, "learning_rate": 4e-05, "loss": 4.9979, "loss/crossentropy": 2.3051935136318207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960429958999157, "step": 5612 }, { "epoch": 0.4678333333333333, "grad_norm": 4.875, "grad_norm_var": 0.05419514973958333, "learning_rate": 4e-05, "loss": 4.5293, "loss/crossentropy": 1.8542626649141312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17750085517764091, "step": 5614 }, { "epoch": 0.468, "grad_norm": 4.71875, "grad_norm_var": 0.03982747395833333, "learning_rate": 4e-05, "loss": 5.0748, "loss/crossentropy": 2.1633825600147247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869092732667923, "step": 5616 }, { "epoch": 0.4681666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.056380208333333334, "learning_rate": 4e-05, "loss": 4.757, "loss/crossentropy": 1.6803074106574059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20325062051415443, "step": 5618 }, { "epoch": 0.4683333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.06015218098958333, "learning_rate": 4e-05, "loss": 5.0692, "loss/crossentropy": 2.228565901517868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19540579989552498, "step": 5620 }, { "epoch": 0.4685, "grad_norm": 5.40625, "grad_norm_var": 0.06326497395833333, "learning_rate": 4e-05, "loss": 4.9955, "loss/crossentropy": 2.2330249547958374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2226766161620617, "step": 5622 }, { "epoch": 0.4686666666666667, "grad_norm": 4.875, "grad_norm_var": 0.05245768229166667, "learning_rate": 4e-05, "loss": 4.9578, "loss/crossentropy": 1.7934229224920273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1788601279258728, "step": 5624 }, { "epoch": 0.4688333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.05230712890625, "learning_rate": 4e-05, "loss": 4.8806, "loss/crossentropy": 1.8773228824138641, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17370744049549103, "step": 5626 }, { "epoch": 0.469, "grad_norm": 4.65625, "grad_norm_var": 0.04778645833333333, "learning_rate": 4e-05, "loss": 5.0076, "loss/crossentropy": 2.1669468581676483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2194295935332775, "step": 5628 }, { "epoch": 0.4691666666666667, "grad_norm": 5.0, "grad_norm_var": 0.04664306640625, "learning_rate": 4e-05, "loss": 4.7817, "loss/crossentropy": 1.9204804003238678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20061272010207176, "step": 5630 }, { "epoch": 0.4693333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.04895426432291667, "learning_rate": 4e-05, "loss": 4.5336, "loss/crossentropy": 1.5425023213028908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17089655436575413, "step": 5632 }, { "epoch": 0.4695, "grad_norm": 5.125, "grad_norm_var": 0.04635009765625, "learning_rate": 4e-05, "loss": 5.03, "loss/crossentropy": 2.249193251132965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20843612030148506, "step": 5634 }, { "epoch": 0.4696666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.04583333333333333, "learning_rate": 4e-05, "loss": 4.9591, "loss/crossentropy": 2.4124104380607605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710863336920738, "step": 5636 }, { "epoch": 0.4698333333333333, "grad_norm": 5.9375, "grad_norm_var": 0.10232747395833333, "learning_rate": 4e-05, "loss": 5.175, "loss/crossentropy": 1.8956755921244621, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18736336380243301, "step": 5638 }, { "epoch": 0.47, "grad_norm": 4.75, "grad_norm_var": 0.10364583333333334, "learning_rate": 4e-05, "loss": 4.5773, "loss/crossentropy": 2.1982105374336243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23227747902274132, "step": 5640 }, { "epoch": 0.4701666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.10295817057291666, "learning_rate": 4e-05, "loss": 4.6193, "loss/crossentropy": 1.927461177110672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18918763473629951, "step": 5642 }, { "epoch": 0.4703333333333333, "grad_norm": 4.5, "grad_norm_var": 0.14256184895833332, "learning_rate": 4e-05, "loss": 4.8908, "loss/crossentropy": 2.513116717338562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22445828840136528, "step": 5644 }, { "epoch": 0.4705, "grad_norm": 4.84375, "grad_norm_var": 0.13472900390625, "learning_rate": 4e-05, "loss": 5.4053, "loss/crossentropy": 1.8445745781064034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20689787901937962, "step": 5646 }, { "epoch": 0.4706666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.14055989583333334, "learning_rate": 4e-05, "loss": 4.5119, "loss/crossentropy": 2.787231981754303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22696134075522423, "step": 5648 }, { "epoch": 0.4708333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.14680582682291668, "learning_rate": 4e-05, "loss": 4.6691, "loss/crossentropy": 1.9298951923847198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20483119785785675, "step": 5650 }, { "epoch": 0.471, "grad_norm": 5.28125, "grad_norm_var": 0.16521809895833334, "learning_rate": 4e-05, "loss": 4.5899, "loss/crossentropy": 1.9479724541306496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18522769957780838, "step": 5652 }, { "epoch": 0.4711666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.12180989583333333, "learning_rate": 4e-05, "loss": 4.9636, "loss/crossentropy": 1.995558775961399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19092020578682423, "step": 5654 }, { "epoch": 0.4713333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.11763916015625, "learning_rate": 4e-05, "loss": 4.9866, "loss/crossentropy": 1.3711708784103394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17136229574680328, "step": 5656 }, { "epoch": 0.4715, "grad_norm": 4.84375, "grad_norm_var": 0.115087890625, "learning_rate": 4e-05, "loss": 4.7872, "loss/crossentropy": 2.1775683164596558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25280786678195, "step": 5658 }, { "epoch": 0.4716666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.06936442057291667, "learning_rate": 4e-05, "loss": 5.4841, "loss/crossentropy": 2.1796337962150574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23797861486673355, "step": 5660 }, { "epoch": 0.4718333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.09254150390625, "learning_rate": 4e-05, "loss": 4.8985, "loss/crossentropy": 1.956167332828045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071479931473732, "step": 5662 }, { "epoch": 0.472, "grad_norm": 4.9375, "grad_norm_var": 0.08229166666666667, "learning_rate": 4e-05, "loss": 4.5594, "loss/crossentropy": 1.6870819255709648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18948296085000038, "step": 5664 }, { "epoch": 0.4721666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.08292643229166667, "learning_rate": 4e-05, "loss": 4.3645, "loss/crossentropy": 1.7608007118105888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16702809929847717, "step": 5666 }, { "epoch": 0.4723333333333333, "grad_norm": 4.75, "grad_norm_var": 0.07180582682291667, "learning_rate": 4e-05, "loss": 4.3888, "loss/crossentropy": 1.1737506687641144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15226943045854568, "step": 5668 }, { "epoch": 0.4725, "grad_norm": 5.0625, "grad_norm_var": 0.050244140625, "learning_rate": 4e-05, "loss": 4.9121, "loss/crossentropy": 2.3276381492614746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23176956549286842, "step": 5670 }, { "epoch": 0.4726666666666667, "grad_norm": 8.375, "grad_norm_var": 0.77730712890625, "learning_rate": 4e-05, "loss": 5.1473, "loss/crossentropy": 1.7255630418658257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22438404709100723, "step": 5672 }, { "epoch": 0.4728333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.7973958333333333, "learning_rate": 4e-05, "loss": 4.6398, "loss/crossentropy": 1.3483816534280777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1676221415400505, "step": 5674 }, { "epoch": 0.473, "grad_norm": 5.5, "grad_norm_var": 0.8156087239583333, "learning_rate": 4e-05, "loss": 5.5553, "loss/crossentropy": 2.0339736565947533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17984124645590782, "step": 5676 }, { "epoch": 0.4731666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.7845052083333334, "learning_rate": 4e-05, "loss": 4.8993, "loss/crossentropy": 1.665101781487465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18875110894441605, "step": 5678 }, { "epoch": 0.47333333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.7824055989583333, "learning_rate": 4e-05, "loss": 4.9043, "loss/crossentropy": 1.4573597237467766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1709212139248848, "step": 5680 }, { "epoch": 0.4735, "grad_norm": 4.65625, "grad_norm_var": 0.8001261393229167, "learning_rate": 4e-05, "loss": 5.3516, "loss/crossentropy": 2.237562984228134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20515824109315872, "step": 5682 }, { "epoch": 0.4736666666666667, "grad_norm": 4.875, "grad_norm_var": 0.8140462239583334, "learning_rate": 4e-05, "loss": 4.329, "loss/crossentropy": 1.8115737065672874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17292770743370056, "step": 5684 }, { "epoch": 0.47383333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.84068603515625, "learning_rate": 4e-05, "loss": 4.4173, "loss/crossentropy": 1.8549021109938622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1680241972208023, "step": 5686 }, { "epoch": 0.474, "grad_norm": 4.8125, "grad_norm_var": 0.11907552083333334, "learning_rate": 4e-05, "loss": 4.6369, "loss/crossentropy": 1.2896523252129555, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13351555354893208, "step": 5688 }, { "epoch": 0.4741666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.10696207682291667, "learning_rate": 4e-05, "loss": 4.8859, "loss/crossentropy": 1.4676887169480324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1461086068302393, "step": 5690 }, { "epoch": 0.47433333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.09140625, "learning_rate": 4e-05, "loss": 5.577, "loss/crossentropy": 1.770861029624939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1623640414327383, "step": 5692 }, { "epoch": 0.4745, "grad_norm": 5.21875, "grad_norm_var": 0.06417643229166667, "learning_rate": 4e-05, "loss": 5.0672, "loss/crossentropy": 2.0532081723213196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19289374724030495, "step": 5694 }, { "epoch": 0.4746666666666667, "grad_norm": 5.625, "grad_norm_var": 0.08800455729166666, "learning_rate": 4e-05, "loss": 5.2478, "loss/crossentropy": 1.9786882400512695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24169421941041946, "step": 5696 }, { "epoch": 0.47483333333333333, "grad_norm": 4.875, "grad_norm_var": 0.086181640625, "learning_rate": 4e-05, "loss": 4.8398, "loss/crossentropy": 2.2744025588035583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21215546131134033, "step": 5698 }, { "epoch": 0.475, "grad_norm": 5.21875, "grad_norm_var": 0.07922770182291666, "learning_rate": 4e-05, "loss": 5.4644, "loss/crossentropy": 2.2868226170539856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20553449168801308, "step": 5700 }, { "epoch": 0.4751666666666667, "grad_norm": 5.125, "grad_norm_var": 0.09225260416666667, "learning_rate": 4e-05, "loss": 4.792, "loss/crossentropy": 2.4458898305892944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21513555943965912, "step": 5702 }, { "epoch": 0.47533333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.09256184895833333, "learning_rate": 4e-05, "loss": 4.9897, "loss/crossentropy": 1.9003973007202148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19777457788586617, "step": 5704 }, { "epoch": 0.4755, "grad_norm": 4.65625, "grad_norm_var": 0.099853515625, "learning_rate": 4e-05, "loss": 4.1787, "loss/crossentropy": 2.359318822622299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21146679669618607, "step": 5706 }, { "epoch": 0.4756666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.096337890625, "learning_rate": 4e-05, "loss": 4.6372, "loss/crossentropy": 2.3056346774101257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23220019042491913, "step": 5708 }, { "epoch": 0.47583333333333333, "grad_norm": 4.875, "grad_norm_var": 0.09208577473958333, "learning_rate": 4e-05, "loss": 5.5974, "loss/crossentropy": 2.6773802042007446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22551625221967697, "step": 5710 }, { "epoch": 0.476, "grad_norm": 5.1875, "grad_norm_var": 0.061572265625, "learning_rate": 4e-05, "loss": 4.952, "loss/crossentropy": 2.311539113521576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19972623512148857, "step": 5712 }, { "epoch": 0.4761666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05826416015625, "learning_rate": 4e-05, "loss": 4.5923, "loss/crossentropy": 1.410909503698349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15724024921655655, "step": 5714 }, { "epoch": 0.47633333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.08722330729166666, "learning_rate": 4e-05, "loss": 4.6575, "loss/crossentropy": 2.465664803981781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142348736524582, "step": 5716 }, { "epoch": 0.4765, "grad_norm": 4.6875, "grad_norm_var": 0.10777587890625, "learning_rate": 4e-05, "loss": 4.4435, "loss/crossentropy": 2.299306809902191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054593190550804, "step": 5718 }, { "epoch": 0.4766666666666667, "grad_norm": 4.5, "grad_norm_var": 0.13987223307291666, "learning_rate": 4e-05, "loss": 4.5301, "loss/crossentropy": 1.9581206738948822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18420690298080444, "step": 5720 }, { "epoch": 0.47683333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.13697916666666668, "learning_rate": 4e-05, "loss": 4.7601, "loss/crossentropy": 1.5340687707066536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17616295255720615, "step": 5722 }, { "epoch": 0.477, "grad_norm": 4.59375, "grad_norm_var": 0.17867431640625, "learning_rate": 4e-05, "loss": 3.8758, "loss/crossentropy": 0.8211743906140327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12213561870157719, "step": 5724 }, { "epoch": 0.4771666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.17720947265625, "learning_rate": 4e-05, "loss": 5.1329, "loss/crossentropy": 2.3268213868141174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2231154851615429, "step": 5726 }, { "epoch": 0.47733333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.18736979166666667, "learning_rate": 4e-05, "loss": 5.0385, "loss/crossentropy": 2.3459609746932983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21290796995162964, "step": 5728 }, { "epoch": 0.4775, "grad_norm": 5.375, "grad_norm_var": 0.21158854166666666, "learning_rate": 4e-05, "loss": 5.4045, "loss/crossentropy": 2.478249251842499, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21360737085342407, "step": 5730 }, { "epoch": 0.4776666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.1826171875, "learning_rate": 4e-05, "loss": 5.2075, "loss/crossentropy": 2.6454553604125977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2205870822072029, "step": 5732 }, { "epoch": 0.47783333333333333, "grad_norm": 5.625, "grad_norm_var": 0.18603108723958334, "learning_rate": 4e-05, "loss": 4.5918, "loss/crossentropy": 2.2463018894195557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118573747575283, "step": 5734 }, { "epoch": 0.478, "grad_norm": 4.96875, "grad_norm_var": 0.15319010416666667, "learning_rate": 4e-05, "loss": 4.457, "loss/crossentropy": 1.66935233771801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1990772783756256, "step": 5736 }, { "epoch": 0.4781666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.16799723307291667, "learning_rate": 4e-05, "loss": 5.0837, "loss/crossentropy": 2.6120508909225464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2278040051460266, "step": 5738 }, { "epoch": 0.47833333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.10572916666666667, "learning_rate": 4e-05, "loss": 5.4204, "loss/crossentropy": 2.6417598128318787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22373153269290924, "step": 5740 }, { "epoch": 0.4785, "grad_norm": 4.8125, "grad_norm_var": 0.12102457682291666, "learning_rate": 4e-05, "loss": 4.3854, "loss/crossentropy": 2.144157826900482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20237886905670166, "step": 5742 }, { "epoch": 0.4786666666666667, "grad_norm": 4.875, "grad_norm_var": 0.10663655598958334, "learning_rate": 4e-05, "loss": 4.6867, "loss/crossentropy": 2.346675455570221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053035832941532, "step": 5744 }, { "epoch": 0.47883333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.1123046875, "learning_rate": 4e-05, "loss": 5.184, "loss/crossentropy": 2.6332274079322815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2171645276248455, "step": 5746 }, { "epoch": 0.479, "grad_norm": 4.875, "grad_norm_var": 0.131103515625, "learning_rate": 4e-05, "loss": 4.0919, "loss/crossentropy": 1.4563121870160103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14949826709926128, "step": 5748 }, { "epoch": 0.4791666666666667, "grad_norm": 4.875, "grad_norm_var": 0.10089518229166666, "learning_rate": 4e-05, "loss": 5.0827, "loss/crossentropy": 2.034213662147522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20901034027338028, "step": 5750 }, { "epoch": 0.47933333333333333, "grad_norm": 4.625, "grad_norm_var": 0.10390625, "learning_rate": 4e-05, "loss": 4.4502, "loss/crossentropy": 2.4095794558525085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22046563774347305, "step": 5752 }, { "epoch": 0.4795, "grad_norm": 5.3125, "grad_norm_var": 0.09759114583333334, "learning_rate": 4e-05, "loss": 5.2505, "loss/crossentropy": 2.034136213362217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787286102771759, "step": 5754 }, { "epoch": 0.4796666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.09998372395833334, "learning_rate": 4e-05, "loss": 5.1684, "loss/crossentropy": 2.084101490676403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18571361154317856, "step": 5756 }, { "epoch": 0.47983333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.09599202473958333, "learning_rate": 4e-05, "loss": 4.8376, "loss/crossentropy": 1.6878532022237778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.179075812920928, "step": 5758 }, { "epoch": 0.48, "grad_norm": 4.84375, "grad_norm_var": 0.095947265625, "learning_rate": 4e-05, "loss": 4.6523, "loss/crossentropy": 1.9728622436523438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1716511808335781, "step": 5760 }, { "epoch": 0.4801666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.09329427083333333, "learning_rate": 4e-05, "loss": 4.7272, "loss/crossentropy": 1.4189490303397179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16327936947345734, "step": 5762 }, { "epoch": 0.48033333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.07408854166666666, "learning_rate": 4e-05, "loss": 5.0876, "loss/crossentropy": 2.339596748352051, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20580720156431198, "step": 5764 }, { "epoch": 0.4805, "grad_norm": 5.59375, "grad_norm_var": 0.09114583333333333, "learning_rate": 4e-05, "loss": 5.0123, "loss/crossentropy": 1.7912172004580498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17723418772220612, "step": 5766 }, { "epoch": 0.4806666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.089697265625, "learning_rate": 4e-05, "loss": 4.9938, "loss/crossentropy": 1.4451691582798958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14382265135645866, "step": 5768 }, { "epoch": 0.48083333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08567708333333333, "learning_rate": 4e-05, "loss": 4.8534, "loss/crossentropy": 1.743345096707344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15984099358320236, "step": 5770 }, { "epoch": 0.481, "grad_norm": 4.84375, "grad_norm_var": 0.07342122395833334, "learning_rate": 4e-05, "loss": 4.7223, "loss/crossentropy": 2.2444933652877808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19508197531104088, "step": 5772 }, { "epoch": 0.4811666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.07095947265625, "learning_rate": 4e-05, "loss": 4.6951, "loss/crossentropy": 1.3921042084693909, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14450440928339958, "step": 5774 }, { "epoch": 0.48133333333333334, "grad_norm": 5.125, "grad_norm_var": 0.07554931640625, "learning_rate": 4e-05, "loss": 5.5692, "loss/crossentropy": 2.309263288974762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22191011533141136, "step": 5776 }, { "epoch": 0.4815, "grad_norm": 5.15625, "grad_norm_var": 0.06796875, "learning_rate": 4e-05, "loss": 5.0408, "loss/crossentropy": 1.2080154567956924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14104336686432362, "step": 5778 }, { "epoch": 0.4816666666666667, "grad_norm": 4.875, "grad_norm_var": 0.06402587890625, "learning_rate": 4e-05, "loss": 4.5586, "loss/crossentropy": 2.0010958090424538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1737840212881565, "step": 5780 }, { "epoch": 0.48183333333333334, "grad_norm": 5.25, "grad_norm_var": 0.047265625, "learning_rate": 4e-05, "loss": 4.8221, "loss/crossentropy": 2.0100313425064087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19126557931303978, "step": 5782 }, { "epoch": 0.482, "grad_norm": 5.09375, "grad_norm_var": 0.04394124348958333, "learning_rate": 4e-05, "loss": 5.491, "loss/crossentropy": 1.8984070345759392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18304388225078583, "step": 5784 }, { "epoch": 0.4821666666666667, "grad_norm": 5.90625, "grad_norm_var": 0.0921875, "learning_rate": 4e-05, "loss": 5.3534, "loss/crossentropy": 1.232503592967987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13390603102743626, "step": 5786 }, { "epoch": 0.48233333333333334, "grad_norm": 5.125, "grad_norm_var": 0.09010009765625, "learning_rate": 4e-05, "loss": 4.7457, "loss/crossentropy": 1.4511554315686226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1644621528685093, "step": 5788 }, { "epoch": 0.4825, "grad_norm": 4.71875, "grad_norm_var": 0.08827718098958333, "learning_rate": 4e-05, "loss": 4.4682, "loss/crossentropy": 1.909536212682724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18038103729486465, "step": 5790 }, { "epoch": 0.4826666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.08603108723958333, "learning_rate": 4e-05, "loss": 4.6379, "loss/crossentropy": 1.01704840362072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1405244879424572, "step": 5792 }, { "epoch": 0.48283333333333334, "grad_norm": 4.875, "grad_norm_var": 0.08162434895833333, "learning_rate": 4e-05, "loss": 5.1905, "loss/crossentropy": 1.8805341720581055, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17578154057264328, "step": 5794 }, { "epoch": 0.483, "grad_norm": 4.96875, "grad_norm_var": 0.08098958333333334, "learning_rate": 4e-05, "loss": 5.0812, "loss/crossentropy": 2.587492525577545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23445945233106613, "step": 5796 }, { "epoch": 0.4831666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.07029622395833333, "learning_rate": 4e-05, "loss": 5.2722, "loss/crossentropy": 1.608366496860981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2221035696566105, "step": 5798 }, { "epoch": 0.48333333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.1158203125, "learning_rate": 4e-05, "loss": 4.206, "loss/crossentropy": 1.8016544580459595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18007242307066917, "step": 5800 }, { "epoch": 0.4835, "grad_norm": 5.0625, "grad_norm_var": 0.052197265625, "learning_rate": 4e-05, "loss": 5.1328, "loss/crossentropy": 1.5784991532564163, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1633721888065338, "step": 5802 }, { "epoch": 0.4836666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.05338134765625, "learning_rate": 4e-05, "loss": 5.2282, "loss/crossentropy": 2.152147799730301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19616486132144928, "step": 5804 }, { "epoch": 0.48383333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.053055826822916666, "learning_rate": 4e-05, "loss": 4.6495, "loss/crossentropy": 1.6643903106451035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19107349775731564, "step": 5806 }, { "epoch": 0.484, "grad_norm": 4.78125, "grad_norm_var": 0.05358072916666667, "learning_rate": 4e-05, "loss": 4.4893, "loss/crossentropy": 1.6722280532121658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17675404995679855, "step": 5808 }, { "epoch": 0.4841666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.06399739583333333, "learning_rate": 4e-05, "loss": 4.9003, "loss/crossentropy": 2.3588092923164368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2119477353990078, "step": 5810 }, { "epoch": 0.48433333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.06327718098958333, "learning_rate": 4e-05, "loss": 4.5094, "loss/crossentropy": 1.411509931087494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15218713879585266, "step": 5812 }, { "epoch": 0.4845, "grad_norm": 4.84375, "grad_norm_var": 0.18619384765625, "learning_rate": 4e-05, "loss": 4.9774, "loss/crossentropy": 1.8252098262310028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2733977921307087, "step": 5814 }, { "epoch": 0.4846666666666667, "grad_norm": 4.625, "grad_norm_var": 0.17486979166666666, "learning_rate": 4e-05, "loss": 4.4479, "loss/crossentropy": 1.9259876608848572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18517810851335526, "step": 5816 }, { "epoch": 0.48483333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.17278238932291667, "learning_rate": 4e-05, "loss": 4.8732, "loss/crossentropy": 1.6283398121595383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14427206851541996, "step": 5818 }, { "epoch": 0.485, "grad_norm": 4.9375, "grad_norm_var": 0.16829020182291668, "learning_rate": 4e-05, "loss": 5.2812, "loss/crossentropy": 1.330165982246399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16302420757710934, "step": 5820 }, { "epoch": 0.4851666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.18448893229166666, "learning_rate": 4e-05, "loss": 5.1328, "loss/crossentropy": 1.5612802058458328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18193678371608257, "step": 5822 }, { "epoch": 0.48533333333333334, "grad_norm": 5.125, "grad_norm_var": 0.18346354166666667, "learning_rate": 4e-05, "loss": 5.1239, "loss/crossentropy": 2.374065101146698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2208290696144104, "step": 5824 }, { "epoch": 0.4855, "grad_norm": 4.6875, "grad_norm_var": 0.200390625, "learning_rate": 4e-05, "loss": 4.663, "loss/crossentropy": 1.4791902005672455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15314685553312302, "step": 5826 }, { "epoch": 0.4856666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.19998372395833333, "learning_rate": 4e-05, "loss": 5.0844, "loss/crossentropy": 1.3942490443587303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17971058189868927, "step": 5828 }, { "epoch": 0.48583333333333334, "grad_norm": 5.0, "grad_norm_var": 0.09544270833333333, "learning_rate": 4e-05, "loss": 5.1706, "loss/crossentropy": 2.2101835906505585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22206401452422142, "step": 5830 }, { "epoch": 0.486, "grad_norm": 4.8125, "grad_norm_var": 0.06829427083333334, "learning_rate": 4e-05, "loss": 4.9569, "loss/crossentropy": 1.806466780602932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16677996329963207, "step": 5832 }, { "epoch": 0.4861666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.08826497395833334, "learning_rate": 4e-05, "loss": 4.8622, "loss/crossentropy": 1.182845950126648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15750311501324177, "step": 5834 }, { "epoch": 0.48633333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.09351806640625, "learning_rate": 4e-05, "loss": 4.9245, "loss/crossentropy": 1.6443369388580322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15455692261457443, "step": 5836 }, { "epoch": 0.4865, "grad_norm": 4.9375, "grad_norm_var": 0.130712890625, "learning_rate": 4e-05, "loss": 5.2779, "loss/crossentropy": 2.343976229429245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20677870512008667, "step": 5838 }, { "epoch": 0.4866666666666667, "grad_norm": 3.9375, "grad_norm_var": 0.20206705729166666, "learning_rate": 4e-05, "loss": 4.0648, "loss/crossentropy": 1.9890966042876244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19900896027684212, "step": 5840 }, { "epoch": 0.48683333333333334, "grad_norm": 5.0, "grad_norm_var": 0.17771809895833332, "learning_rate": 4e-05, "loss": 4.9045, "loss/crossentropy": 1.9553634375333786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19644594565033913, "step": 5842 }, { "epoch": 0.487, "grad_norm": 4.6875, "grad_norm_var": 0.1740234375, "learning_rate": 4e-05, "loss": 4.5715, "loss/crossentropy": 1.8074621483683586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17907745763659477, "step": 5844 }, { "epoch": 0.4871666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.16474202473958333, "learning_rate": 4e-05, "loss": 4.7783, "loss/crossentropy": 2.4416297674179077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21168984845280647, "step": 5846 }, { "epoch": 0.48733333333333334, "grad_norm": 5.46875, "grad_norm_var": 0.18603108723958334, "learning_rate": 4e-05, "loss": 4.6668, "loss/crossentropy": 2.470840334892273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23379643633961678, "step": 5848 }, { "epoch": 0.4875, "grad_norm": 5.15625, "grad_norm_var": 0.16584879557291668, "learning_rate": 4e-05, "loss": 5.2838, "loss/crossentropy": 2.0527156069874763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1883339285850525, "step": 5850 }, { "epoch": 0.4876666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.17708333333333334, "learning_rate": 4e-05, "loss": 5.0268, "loss/crossentropy": 2.331518530845642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21710406616330147, "step": 5852 }, { "epoch": 0.48783333333333334, "grad_norm": 5.875, "grad_norm_var": 0.18625895182291666, "learning_rate": 4e-05, "loss": 4.8177, "loss/crossentropy": 2.0739459693431854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22978762909770012, "step": 5854 }, { "epoch": 0.488, "grad_norm": 4.71875, "grad_norm_var": 0.11404622395833333, "learning_rate": 4e-05, "loss": 4.6849, "loss/crossentropy": 1.164167359471321, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13028892129659653, "step": 5856 }, { "epoch": 0.4881666666666667, "grad_norm": 4.75, "grad_norm_var": 0.11591389973958334, "learning_rate": 4e-05, "loss": 5.3475, "loss/crossentropy": 2.5920631885528564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20778515562415123, "step": 5858 }, { "epoch": 0.48833333333333334, "grad_norm": 4.875, "grad_norm_var": 0.10546468098958334, "learning_rate": 4e-05, "loss": 5.1718, "loss/crossentropy": 1.9089709669351578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1964315064251423, "step": 5860 }, { "epoch": 0.4885, "grad_norm": 5.0, "grad_norm_var": 0.09967041015625, "learning_rate": 4e-05, "loss": 5.2793, "loss/crossentropy": 1.874973475933075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18953284621238708, "step": 5862 }, { "epoch": 0.4886666666666667, "grad_norm": 4.875, "grad_norm_var": 0.09140218098958333, "learning_rate": 4e-05, "loss": 5.2577, "loss/crossentropy": 2.303260922431946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19869283586740494, "step": 5864 }, { "epoch": 0.48883333333333334, "grad_norm": 5.0, "grad_norm_var": 0.09527587890625, "learning_rate": 4e-05, "loss": 4.5316, "loss/crossentropy": 1.5516202598810196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2037242390215397, "step": 5866 }, { "epoch": 0.489, "grad_norm": 4.875, "grad_norm_var": 0.08443603515625, "learning_rate": 4e-05, "loss": 4.3666, "loss/crossentropy": 1.716199368238449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23071671649813652, "step": 5868 }, { "epoch": 0.4891666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.028450520833333333, "learning_rate": 4e-05, "loss": 5.1654, "loss/crossentropy": 1.994306929409504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18211017921566963, "step": 5870 }, { "epoch": 0.48933333333333334, "grad_norm": 4.875, "grad_norm_var": 0.029020182291666665, "learning_rate": 4e-05, "loss": 4.7998, "loss/crossentropy": 1.303352952003479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16013509593904018, "step": 5872 }, { "epoch": 0.4895, "grad_norm": 4.8125, "grad_norm_var": 0.02847900390625, "learning_rate": 4e-05, "loss": 5.3041, "loss/crossentropy": 1.8330368399620056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1668311320245266, "step": 5874 }, { "epoch": 0.48966666666666664, "grad_norm": 4.65625, "grad_norm_var": 0.031640625, "learning_rate": 4e-05, "loss": 4.8962, "loss/crossentropy": 2.1987491250038147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968229003250599, "step": 5876 }, { "epoch": 0.48983333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.023942057291666666, "learning_rate": 4e-05, "loss": 4.7749, "loss/crossentropy": 2.274143636226654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117280811071396, "step": 5878 }, { "epoch": 0.49, "grad_norm": 4.96875, "grad_norm_var": 0.027762858072916667, "learning_rate": 4e-05, "loss": 4.2549, "loss/crossentropy": 1.1879191398620605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13721632584929466, "step": 5880 }, { "epoch": 0.49016666666666664, "grad_norm": 4.8125, "grad_norm_var": 0.025113932291666665, "learning_rate": 4e-05, "loss": 4.8011, "loss/crossentropy": 1.788271814584732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16815048828721046, "step": 5882 }, { "epoch": 0.49033333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.03131510416666667, "learning_rate": 4e-05, "loss": 4.6725, "loss/crossentropy": 1.9695579707622528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19962314143776894, "step": 5884 }, { "epoch": 0.4905, "grad_norm": 4.5, "grad_norm_var": 0.03216145833333333, "learning_rate": 4e-05, "loss": 4.3649, "loss/crossentropy": 1.6603060066699982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18644647859036922, "step": 5886 }, { "epoch": 0.49066666666666664, "grad_norm": 4.8125, "grad_norm_var": 0.036572265625, "learning_rate": 4e-05, "loss": 5.1558, "loss/crossentropy": 1.9313186779618263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20377793721854687, "step": 5888 }, { "epoch": 0.49083333333333334, "grad_norm": 5.5, "grad_norm_var": 0.06510009765625, "learning_rate": 4e-05, "loss": 4.7934, "loss/crossentropy": 1.7631231471896172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17797018960118294, "step": 5890 }, { "epoch": 0.491, "grad_norm": 4.53125, "grad_norm_var": 0.07079671223958334, "learning_rate": 4e-05, "loss": 4.7736, "loss/crossentropy": 1.8846217468380928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16871808841824532, "step": 5892 }, { "epoch": 0.49116666666666664, "grad_norm": 5.125, "grad_norm_var": 0.071875, "learning_rate": 4e-05, "loss": 5.2073, "loss/crossentropy": 2.2401039600372314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20969317108392715, "step": 5894 }, { "epoch": 0.49133333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.07029622395833333, "learning_rate": 4e-05, "loss": 5.0011, "loss/crossentropy": 1.9672669917345047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19397153332829475, "step": 5896 }, { "epoch": 0.4915, "grad_norm": 4.59375, "grad_norm_var": 0.07460530598958333, "learning_rate": 4e-05, "loss": 4.0801, "loss/crossentropy": 0.41661109030246735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.0850940328091383, "step": 5898 }, { "epoch": 0.49166666666666664, "grad_norm": 4.96875, "grad_norm_var": 0.0677734375, "learning_rate": 4e-05, "loss": 4.9046, "loss/crossentropy": 1.5046052262187004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13949467055499554, "step": 5900 }, { "epoch": 0.49183333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.07603759765625, "learning_rate": 4e-05, "loss": 4.4854, "loss/crossentropy": 1.941825695335865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18303821794688702, "step": 5902 }, { "epoch": 0.492, "grad_norm": 4.875, "grad_norm_var": 0.080712890625, "learning_rate": 4e-05, "loss": 4.8065, "loss/crossentropy": 1.7237118035554886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17298521287739277, "step": 5904 }, { "epoch": 0.49216666666666664, "grad_norm": 4.875, "grad_norm_var": 0.06565348307291667, "learning_rate": 4e-05, "loss": 4.8444, "loss/crossentropy": 1.5659492053091526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17988939210772514, "step": 5906 }, { "epoch": 0.49233333333333335, "grad_norm": 4.78125, "grad_norm_var": 0.059794108072916664, "learning_rate": 4e-05, "loss": 4.7725, "loss/crossentropy": 2.2780866622924805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22615566104650497, "step": 5908 }, { "epoch": 0.4925, "grad_norm": 4.90625, "grad_norm_var": 0.06047770182291667, "learning_rate": 4e-05, "loss": 4.6949, "loss/crossentropy": 2.317236602306366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197988148778677, "step": 5910 }, { "epoch": 0.49266666666666664, "grad_norm": 4.875, "grad_norm_var": 0.05944010416666667, "learning_rate": 4e-05, "loss": 4.452, "loss/crossentropy": 1.369862139225006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1475118100643158, "step": 5912 }, { "epoch": 0.49283333333333335, "grad_norm": 4.8125, "grad_norm_var": 0.06951497395833334, "learning_rate": 4e-05, "loss": 4.9848, "loss/crossentropy": 1.3414551764726639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16197285428643227, "step": 5914 }, { "epoch": 0.493, "grad_norm": 4.96875, "grad_norm_var": 0.09034830729166667, "learning_rate": 4e-05, "loss": 5.1543, "loss/crossentropy": 2.490071475505829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21345427632331848, "step": 5916 }, { "epoch": 0.49316666666666664, "grad_norm": 4.75, "grad_norm_var": 0.06568603515625, "learning_rate": 4e-05, "loss": 5.0928, "loss/crossentropy": 2.5531476736068726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24787741899490356, "step": 5918 }, { "epoch": 0.49333333333333335, "grad_norm": 5.0, "grad_norm_var": 0.060139973958333336, "learning_rate": 4e-05, "loss": 5.1398, "loss/crossentropy": 1.9292317777872086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17992111667990685, "step": 5920 }, { "epoch": 0.4935, "grad_norm": 4.71875, "grad_norm_var": 0.07379150390625, "learning_rate": 4e-05, "loss": 5.0046, "loss/crossentropy": 2.194719046354294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2291640229523182, "step": 5922 }, { "epoch": 0.49366666666666664, "grad_norm": 5.09375, "grad_norm_var": 0.07433268229166666, "learning_rate": 4e-05, "loss": 5.3898, "loss/crossentropy": 2.461084246635437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223949883133173, "step": 5924 }, { "epoch": 0.49383333333333335, "grad_norm": 5.34375, "grad_norm_var": 0.07948811848958333, "learning_rate": 4e-05, "loss": 5.0161, "loss/crossentropy": 1.2005824074149132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12999506667256355, "step": 5926 }, { "epoch": 0.494, "grad_norm": 4.65625, "grad_norm_var": 0.08186442057291667, "learning_rate": 4e-05, "loss": 5.2245, "loss/crossentropy": 1.230931095778942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1690246555954218, "step": 5928 }, { "epoch": 0.49416666666666664, "grad_norm": 5.09375, "grad_norm_var": 0.07248942057291667, "learning_rate": 4e-05, "loss": 5.0945, "loss/crossentropy": 2.3888412415981293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041553296148777, "step": 5930 }, { "epoch": 0.49433333333333335, "grad_norm": 4.96875, "grad_norm_var": 0.05597330729166667, "learning_rate": 4e-05, "loss": 4.8557, "loss/crossentropy": 1.6737447902560234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22364658117294312, "step": 5932 }, { "epoch": 0.4945, "grad_norm": 4.6875, "grad_norm_var": 0.06412760416666667, "learning_rate": 4e-05, "loss": 4.6837, "loss/crossentropy": 2.177261143922806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21531370282173157, "step": 5934 }, { "epoch": 0.49466666666666664, "grad_norm": 4.5625, "grad_norm_var": 0.068359375, "learning_rate": 4e-05, "loss": 4.3736, "loss/crossentropy": 1.9087681472301483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18165162578225136, "step": 5936 }, { "epoch": 0.49483333333333335, "grad_norm": 4.6875, "grad_norm_var": 0.08919270833333333, "learning_rate": 4e-05, "loss": 4.684, "loss/crossentropy": 2.2595274448394775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23523679375648499, "step": 5938 }, { "epoch": 0.495, "grad_norm": 6.6875, "grad_norm_var": 0.2752888997395833, "learning_rate": 4e-05, "loss": 4.3276, "loss/crossentropy": 1.8927638530731201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26145458966493607, "step": 5940 }, { "epoch": 0.49516666666666664, "grad_norm": 5.15625, "grad_norm_var": 0.27274983723958335, "learning_rate": 4e-05, "loss": 5.1695, "loss/crossentropy": 2.340158134698868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244977205991745, "step": 5942 }, { "epoch": 0.49533333333333335, "grad_norm": 5.28125, "grad_norm_var": 0.27708333333333335, "learning_rate": 4e-05, "loss": 5.0179, "loss/crossentropy": 1.9942467659711838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1774813625961542, "step": 5944 }, { "epoch": 0.4955, "grad_norm": 4.96875, "grad_norm_var": 0.28964436848958336, "learning_rate": 4e-05, "loss": 5.1942, "loss/crossentropy": 1.7467531263828278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19287551939487457, "step": 5946 }, { "epoch": 0.49566666666666664, "grad_norm": 4.75, "grad_norm_var": 0.30271809895833335, "learning_rate": 4e-05, "loss": 4.9724, "loss/crossentropy": 2.288071572780609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23296159133315086, "step": 5948 }, { "epoch": 0.49583333333333335, "grad_norm": 4.625, "grad_norm_var": 0.2931925455729167, "learning_rate": 4e-05, "loss": 5.0832, "loss/crossentropy": 1.5061530321836472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16233282163739204, "step": 5950 }, { "epoch": 0.496, "grad_norm": 4.5, "grad_norm_var": 0.30162353515625, "learning_rate": 4e-05, "loss": 4.4171, "loss/crossentropy": 1.308549128472805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13862483203411102, "step": 5952 }, { "epoch": 0.49616666666666664, "grad_norm": 4.59375, "grad_norm_var": 0.294384765625, "learning_rate": 4e-05, "loss": 4.2719, "loss/crossentropy": 1.2562294602394104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.136533934623003, "step": 5954 }, { "epoch": 0.49633333333333335, "grad_norm": 4.75, "grad_norm_var": 0.07823893229166666, "learning_rate": 4e-05, "loss": 4.1481, "loss/crossentropy": 1.8677359819412231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17084606923162937, "step": 5956 }, { "epoch": 0.4965, "grad_norm": 4.78125, "grad_norm_var": 0.042952473958333334, "learning_rate": 4e-05, "loss": 5.1479, "loss/crossentropy": 2.244591236114502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2130465917289257, "step": 5958 }, { "epoch": 0.49666666666666665, "grad_norm": 4.75, "grad_norm_var": 0.022379557291666668, "learning_rate": 4e-05, "loss": 5.0567, "loss/crossentropy": 2.3259174823760986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980872005224228, "step": 5960 }, { "epoch": 0.49683333333333335, "grad_norm": 4.78125, "grad_norm_var": 0.030729166666666665, "learning_rate": 4e-05, "loss": 4.8392, "loss/crossentropy": 2.577570676803589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21869254857301712, "step": 5962 }, { "epoch": 0.497, "grad_norm": 5.0, "grad_norm_var": 0.043473307291666666, "learning_rate": 4e-05, "loss": 5.403, "loss/crossentropy": 2.4040364921092987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2173318862915039, "step": 5964 }, { "epoch": 0.49716666666666665, "grad_norm": 5.09375, "grad_norm_var": 0.047526041666666664, "learning_rate": 4e-05, "loss": 4.6977, "loss/crossentropy": 1.6452934443950653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20848311856389046, "step": 5966 }, { "epoch": 0.49733333333333335, "grad_norm": 4.6875, "grad_norm_var": 0.05041910807291667, "learning_rate": 4e-05, "loss": 4.1572, "loss/crossentropy": 2.2454931437969208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20870861783623695, "step": 5968 }, { "epoch": 0.4975, "grad_norm": 5.0, "grad_norm_var": 0.04763997395833333, "learning_rate": 4e-05, "loss": 4.9585, "loss/crossentropy": 1.4836616143584251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17538996413350105, "step": 5970 }, { "epoch": 0.49766666666666665, "grad_norm": 4.53125, "grad_norm_var": 0.051416015625, "learning_rate": 4e-05, "loss": 4.847, "loss/crossentropy": 1.3948156237602234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14281159173697233, "step": 5972 }, { "epoch": 0.49783333333333335, "grad_norm": 4.78125, "grad_norm_var": 0.05172119140625, "learning_rate": 4e-05, "loss": 5.1803, "loss/crossentropy": 2.3170337080955505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23617496713995934, "step": 5974 }, { "epoch": 0.498, "grad_norm": 4.46875, "grad_norm_var": 0.060546875, "learning_rate": 4e-05, "loss": 4.885, "loss/crossentropy": 1.7463590651750565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15852132812142372, "step": 5976 }, { "epoch": 0.49816666666666665, "grad_norm": 4.625, "grad_norm_var": 0.056233723958333336, "learning_rate": 4e-05, "loss": 4.8101, "loss/crossentropy": 2.4009880125522614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22088930383324623, "step": 5978 }, { "epoch": 0.49833333333333335, "grad_norm": 4.90625, "grad_norm_var": 0.048567708333333334, "learning_rate": 4e-05, "loss": 5.1921, "loss/crossentropy": 1.3776784762740135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.142588060349226, "step": 5980 }, { "epoch": 0.4985, "grad_norm": 4.71875, "grad_norm_var": 0.04544270833333333, "learning_rate": 4e-05, "loss": 4.3889, "loss/crossentropy": 1.7223493158817291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928764395415783, "step": 5982 }, { "epoch": 0.49866666666666665, "grad_norm": 4.8125, "grad_norm_var": 0.03860677083333333, "learning_rate": 4e-05, "loss": 5.2635, "loss/crossentropy": 1.702957108616829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879901885986328, "step": 5984 }, { "epoch": 0.49883333333333335, "grad_norm": 4.4375, "grad_norm_var": 0.03290608723958333, "learning_rate": 4e-05, "loss": 3.8626, "loss/crossentropy": 0.9118227884173393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11705210246145725, "step": 5986 }, { "epoch": 0.499, "grad_norm": 5.0625, "grad_norm_var": 0.03631184895833333, "learning_rate": 4e-05, "loss": 4.7101, "loss/crossentropy": 0.9368212670087814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14234120398759842, "step": 5988 }, { "epoch": 0.49916666666666665, "grad_norm": 4.84375, "grad_norm_var": 0.04215087890625, "learning_rate": 4e-05, "loss": 4.8971, "loss/crossentropy": 2.262044668197632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21489252150058746, "step": 5990 }, { "epoch": 0.49933333333333335, "grad_norm": 4.96875, "grad_norm_var": 0.03534749348958333, "learning_rate": 4e-05, "loss": 5.4815, "loss/crossentropy": 2.360913395881653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074732705950737, "step": 5992 }, { "epoch": 0.4995, "grad_norm": 4.78125, "grad_norm_var": 0.035445149739583334, "learning_rate": 4e-05, "loss": 5.2325, "loss/crossentropy": 2.536450207233429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22463076934218407, "step": 5994 }, { "epoch": 0.49966666666666665, "grad_norm": 4.90625, "grad_norm_var": 0.048046875, "learning_rate": 4e-05, "loss": 4.2879, "loss/crossentropy": 0.678945854306221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.0957408007234335, "step": 5996 }, { "epoch": 0.49983333333333335, "grad_norm": 5.09375, "grad_norm_var": 0.04803059895833333, "learning_rate": 4e-05, "loss": 4.6463, "loss/crossentropy": 1.7992531657218933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17922177724540234, "step": 5998 }, { "epoch": 0.5, "grad_norm": 4.96875, "grad_norm_var": 0.049072265625, "learning_rate": 4e-05, "loss": 5.3301, "loss/crossentropy": 1.8574589490890503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18118897452950478, "step": 6000 }, { "epoch": 0.5001666666666666, "grad_norm": 5.0, "grad_norm_var": 0.029488118489583333, "learning_rate": 4e-05, "loss": 5.2455, "loss/crossentropy": 1.382308728992939, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16513625532388687, "step": 6002 }, { "epoch": 0.5003333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.03919270833333333, "learning_rate": 4e-05, "loss": 4.9577, "loss/crossentropy": 1.8322591856122017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16798565536737442, "step": 6004 }, { "epoch": 0.5005, "grad_norm": 5.625, "grad_norm_var": 0.07190348307291666, "learning_rate": 4e-05, "loss": 5.3397, "loss/crossentropy": 2.5801175832748413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2178741730749607, "step": 6006 }, { "epoch": 0.5006666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.074462890625, "learning_rate": 4e-05, "loss": 4.9032, "loss/crossentropy": 2.0123944729566574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1762095633894205, "step": 6008 }, { "epoch": 0.5008333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.07369384765625, "learning_rate": 4e-05, "loss": 4.325, "loss/crossentropy": 1.4120425209403038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16691729426383972, "step": 6010 }, { "epoch": 0.501, "grad_norm": 5.03125, "grad_norm_var": 0.07083333333333333, "learning_rate": 4e-05, "loss": 5.4007, "loss/crossentropy": 2.539487302303314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22366130352020264, "step": 6012 }, { "epoch": 0.5011666666666666, "grad_norm": 5.25, "grad_norm_var": 0.07381184895833333, "learning_rate": 4e-05, "loss": 5.3621, "loss/crossentropy": 2.442401111125946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21156932041049004, "step": 6014 }, { "epoch": 0.5013333333333333, "grad_norm": 5.25, "grad_norm_var": 0.07408447265625, "learning_rate": 4e-05, "loss": 4.9838, "loss/crossentropy": 1.941146932542324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16361217759549618, "step": 6016 }, { "epoch": 0.5015, "grad_norm": 5.53125, "grad_norm_var": 0.09104410807291667, "learning_rate": 4e-05, "loss": 4.8045, "loss/crossentropy": 1.6995574682950974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17772125452756882, "step": 6018 }, { "epoch": 0.5016666666666667, "grad_norm": 4.875, "grad_norm_var": 0.07024332682291666, "learning_rate": 4e-05, "loss": 5.028, "loss/crossentropy": 1.2951749190688133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13708342798054218, "step": 6020 }, { "epoch": 0.5018333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.05862223307291667, "learning_rate": 4e-05, "loss": 5.2686, "loss/crossentropy": 1.886262372136116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18839742615818977, "step": 6022 }, { "epoch": 0.502, "grad_norm": 5.3125, "grad_norm_var": 0.06679280598958333, "learning_rate": 4e-05, "loss": 4.8845, "loss/crossentropy": 2.3439903557300568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20835384353995323, "step": 6024 }, { "epoch": 0.5021666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.09217122395833334, "learning_rate": 4e-05, "loss": 4.1525, "loss/crossentropy": 1.9972389563918114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18215650878846645, "step": 6026 }, { "epoch": 0.5023333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.09192708333333334, "learning_rate": 4e-05, "loss": 5.0969, "loss/crossentropy": 1.8251912593841553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17888565175235271, "step": 6028 }, { "epoch": 0.5025, "grad_norm": 4.75, "grad_norm_var": 0.08951416015625, "learning_rate": 4e-05, "loss": 4.6262, "loss/crossentropy": 1.6110865250229836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845040712505579, "step": 6030 }, { "epoch": 0.5026666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.115869140625, "learning_rate": 4e-05, "loss": 4.6456, "loss/crossentropy": 2.026170499622822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19897928833961487, "step": 6032 }, { "epoch": 0.5028333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.09097900390625, "learning_rate": 4e-05, "loss": 5.3882, "loss/crossentropy": 2.2215274199843407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18462110683321953, "step": 6034 }, { "epoch": 0.503, "grad_norm": 4.875, "grad_norm_var": 0.10445556640625, "learning_rate": 4e-05, "loss": 4.7315, "loss/crossentropy": 1.7266902402043343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15911870449781418, "step": 6036 }, { "epoch": 0.5031666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.09023030598958333, "learning_rate": 4e-05, "loss": 4.7497, "loss/crossentropy": 1.2634576633572578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14578200317919254, "step": 6038 }, { "epoch": 0.5033333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.07203369140625, "learning_rate": 4e-05, "loss": 4.5089, "loss/crossentropy": 1.9131877347826958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1862852443009615, "step": 6040 }, { "epoch": 0.5035, "grad_norm": 5.25, "grad_norm_var": 0.09212239583333333, "learning_rate": 4e-05, "loss": 4.9267, "loss/crossentropy": 1.783204659819603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17316385731101036, "step": 6042 }, { "epoch": 0.5036666666666667, "grad_norm": 4.875, "grad_norm_var": 0.090087890625, "learning_rate": 4e-05, "loss": 4.7129, "loss/crossentropy": 1.7621163725852966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1662435531616211, "step": 6044 }, { "epoch": 0.5038333333333334, "grad_norm": 5.375, "grad_norm_var": 0.10611979166666667, "learning_rate": 4e-05, "loss": 4.1912, "loss/crossentropy": 1.622048631310463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18177297711372375, "step": 6046 }, { "epoch": 0.504, "grad_norm": 5.15625, "grad_norm_var": 0.11750895182291667, "learning_rate": 4e-05, "loss": 5.0865, "loss/crossentropy": 1.6797449514269829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16225052624940872, "step": 6048 }, { "epoch": 0.5041666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.11949462890625, "learning_rate": 4e-05, "loss": 4.9798, "loss/crossentropy": 1.8889866098761559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17602252773940563, "step": 6050 }, { "epoch": 0.5043333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.1078125, "learning_rate": 4e-05, "loss": 4.5108, "loss/crossentropy": 2.5241169333457947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20590389892458916, "step": 6052 }, { "epoch": 0.5045, "grad_norm": 4.6875, "grad_norm_var": 0.09468994140625, "learning_rate": 4e-05, "loss": 4.8736, "loss/crossentropy": 2.277352422475815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1986519955098629, "step": 6054 }, { "epoch": 0.5046666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.08453369140625, "learning_rate": 4e-05, "loss": 4.6058, "loss/crossentropy": 2.3752284348011017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21625737473368645, "step": 6056 }, { "epoch": 0.5048333333333334, "grad_norm": 4.75, "grad_norm_var": 0.08570556640625, "learning_rate": 4e-05, "loss": 5.2332, "loss/crossentropy": 2.3863165974617004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20476922765374184, "step": 6058 }, { "epoch": 0.505, "grad_norm": 4.875, "grad_norm_var": 0.087353515625, "learning_rate": 4e-05, "loss": 4.7747, "loss/crossentropy": 2.209844708442688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20129455253481865, "step": 6060 }, { "epoch": 0.5051666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.0900390625, "learning_rate": 4e-05, "loss": 5.2351, "loss/crossentropy": 1.8845185935497284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019658386707306, "step": 6062 }, { "epoch": 0.5053333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.06399739583333333, "learning_rate": 4e-05, "loss": 4.499, "loss/crossentropy": 1.9752169027924538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18215681612491608, "step": 6064 }, { "epoch": 0.5055, "grad_norm": 5.09375, "grad_norm_var": 0.06015218098958333, "learning_rate": 4e-05, "loss": 4.6792, "loss/crossentropy": 1.0717046111822128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1789306327700615, "step": 6066 }, { "epoch": 0.5056666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.04303385416666667, "learning_rate": 4e-05, "loss": 5.2286, "loss/crossentropy": 1.7968225330114365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18911275640130043, "step": 6068 }, { "epoch": 0.5058333333333334, "grad_norm": 4.375, "grad_norm_var": 0.05865885416666667, "learning_rate": 4e-05, "loss": 4.6731, "loss/crossentropy": 2.1856305301189423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19559626653790474, "step": 6070 }, { "epoch": 0.506, "grad_norm": 4.6875, "grad_norm_var": 0.060400390625, "learning_rate": 4e-05, "loss": 4.9951, "loss/crossentropy": 1.5920969396829605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17270361259579659, "step": 6072 }, { "epoch": 0.5061666666666667, "grad_norm": 4.75, "grad_norm_var": 0.060400390625, "learning_rate": 4e-05, "loss": 4.8435, "loss/crossentropy": 1.9251913204789162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16875243186950684, "step": 6074 }, { "epoch": 0.5063333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.06226806640625, "learning_rate": 4e-05, "loss": 5.0282, "loss/crossentropy": 1.7353182584047318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15810954943299294, "step": 6076 }, { "epoch": 0.5065, "grad_norm": 4.84375, "grad_norm_var": 0.058426920572916666, "learning_rate": 4e-05, "loss": 5.5596, "loss/crossentropy": 1.6096114814281464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17892133630812168, "step": 6078 }, { "epoch": 0.5066666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.054036458333333336, "learning_rate": 4e-05, "loss": 5.0621, "loss/crossentropy": 1.56081011146307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16593561880290508, "step": 6080 }, { "epoch": 0.5068333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.06131184895833333, "learning_rate": 4e-05, "loss": 4.4695, "loss/crossentropy": 2.1837804913520813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104526273906231, "step": 6082 }, { "epoch": 0.507, "grad_norm": 4.6875, "grad_norm_var": 0.056233723958333336, "learning_rate": 4e-05, "loss": 5.0153, "loss/crossentropy": 2.1096479892730713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1877879686653614, "step": 6084 }, { "epoch": 0.5071666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.0396484375, "learning_rate": 4e-05, "loss": 4.3041, "loss/crossentropy": 1.398997388780117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14874712191522121, "step": 6086 }, { "epoch": 0.5073333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.05792643229166667, "learning_rate": 4e-05, "loss": 4.0589, "loss/crossentropy": 2.1173028349876404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18377171456813812, "step": 6088 }, { "epoch": 0.5075, "grad_norm": 5.3125, "grad_norm_var": 0.07200113932291667, "learning_rate": 4e-05, "loss": 5.1461, "loss/crossentropy": 2.301940530538559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2225518599152565, "step": 6090 }, { "epoch": 0.5076666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.07841389973958333, "learning_rate": 4e-05, "loss": 4.7475, "loss/crossentropy": 1.870749220252037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839730478823185, "step": 6092 }, { "epoch": 0.5078333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.08440348307291666, "learning_rate": 4e-05, "loss": 4.3053, "loss/crossentropy": 1.5619140639901161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14815925806760788, "step": 6094 }, { "epoch": 0.508, "grad_norm": 4.59375, "grad_norm_var": 0.088916015625, "learning_rate": 4e-05, "loss": 5.0066, "loss/crossentropy": 2.1684592068195343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090744599699974, "step": 6096 }, { "epoch": 0.5081666666666667, "grad_norm": 4.625, "grad_norm_var": 0.0810546875, "learning_rate": 4e-05, "loss": 5.1679, "loss/crossentropy": 2.247919350862503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19475699588656425, "step": 6098 }, { "epoch": 0.5083333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.08186442057291667, "learning_rate": 4e-05, "loss": 4.7223, "loss/crossentropy": 1.5078487992286682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15344743058085442, "step": 6100 }, { "epoch": 0.5085, "grad_norm": 4.84375, "grad_norm_var": 0.09134114583333333, "learning_rate": 4e-05, "loss": 4.7778, "loss/crossentropy": 2.0286532193422318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19421415030956268, "step": 6102 }, { "epoch": 0.5086666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.069775390625, "learning_rate": 4e-05, "loss": 4.9994, "loss/crossentropy": 1.5452463030815125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15388812869787216, "step": 6104 }, { "epoch": 0.5088333333333334, "grad_norm": 4.3125, "grad_norm_var": 0.06877848307291666, "learning_rate": 4e-05, "loss": 4.6816, "loss/crossentropy": 1.1183883771300316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1324480827897787, "step": 6106 }, { "epoch": 0.509, "grad_norm": 4.5, "grad_norm_var": 0.08717447916666667, "learning_rate": 4e-05, "loss": 4.3537, "loss/crossentropy": 1.9015508890151978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20949223265051842, "step": 6108 }, { "epoch": 0.5091666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.08577067057291667, "learning_rate": 4e-05, "loss": 4.794, "loss/crossentropy": 1.9852671101689339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18729106336832047, "step": 6110 }, { "epoch": 0.5093333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.0826171875, "learning_rate": 4e-05, "loss": 5.6819, "loss/crossentropy": 1.9998779222369194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17011355608701706, "step": 6112 }, { "epoch": 0.5095, "grad_norm": 5.125, "grad_norm_var": 0.08938802083333333, "learning_rate": 4e-05, "loss": 4.6003, "loss/crossentropy": 2.7957329154014587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22699139639735222, "step": 6114 }, { "epoch": 0.5096666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.10341389973958333, "learning_rate": 4e-05, "loss": 4.765, "loss/crossentropy": 2.4798883199691772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2112659588456154, "step": 6116 }, { "epoch": 0.5098333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.09123942057291666, "learning_rate": 4e-05, "loss": 4.8308, "loss/crossentropy": 1.5427830889821053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14681899175047874, "step": 6118 }, { "epoch": 0.51, "grad_norm": 4.875, "grad_norm_var": 0.091259765625, "learning_rate": 4e-05, "loss": 5.116, "loss/crossentropy": 1.948954276740551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17982318252325058, "step": 6120 }, { "epoch": 0.5101666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.095556640625, "learning_rate": 4e-05, "loss": 5.2846, "loss/crossentropy": 2.349464535713196, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22288982570171356, "step": 6122 }, { "epoch": 0.5103333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.10142822265625, "learning_rate": 4e-05, "loss": 4.8594, "loss/crossentropy": 1.2270648926496506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12373272702097893, "step": 6124 }, { "epoch": 0.5105, "grad_norm": 5.0, "grad_norm_var": 0.09576416015625, "learning_rate": 4e-05, "loss": 5.194, "loss/crossentropy": 2.047919809818268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2000446319580078, "step": 6126 }, { "epoch": 0.5106666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.094140625, "learning_rate": 4e-05, "loss": 5.3205, "loss/crossentropy": 1.957608625292778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17870837077498436, "step": 6128 }, { "epoch": 0.5108333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.0912109375, "learning_rate": 4e-05, "loss": 5.3583, "loss/crossentropy": 2.141770154237747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25270281732082367, "step": 6130 }, { "epoch": 0.511, "grad_norm": 4.8125, "grad_norm_var": 0.10217692057291666, "learning_rate": 4e-05, "loss": 4.4733, "loss/crossentropy": 1.920982912182808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17835867032408714, "step": 6132 }, { "epoch": 0.5111666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.10384114583333333, "learning_rate": 4e-05, "loss": 4.8358, "loss/crossentropy": 1.4349671453237534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14867802895605564, "step": 6134 }, { "epoch": 0.5113333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.10084635416666667, "learning_rate": 4e-05, "loss": 5.0694, "loss/crossentropy": 2.0233269333839417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19398606568574905, "step": 6136 }, { "epoch": 0.5115, "grad_norm": 4.46875, "grad_norm_var": 0.07564697265625, "learning_rate": 4e-05, "loss": 4.6461, "loss/crossentropy": 1.884231187403202, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1786806397140026, "step": 6138 }, { "epoch": 0.5116666666666667, "grad_norm": 4.875, "grad_norm_var": 0.08941650390625, "learning_rate": 4e-05, "loss": 5.2206, "loss/crossentropy": 1.6101298779249191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20260108821094036, "step": 6140 }, { "epoch": 0.5118333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.10045166015625, "learning_rate": 4e-05, "loss": 4.8658, "loss/crossentropy": 2.56222003698349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21393615752458572, "step": 6142 }, { "epoch": 0.512, "grad_norm": 5.1875, "grad_norm_var": 0.11510009765625, "learning_rate": 4e-05, "loss": 5.1734, "loss/crossentropy": 2.5355464816093445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23778853192925453, "step": 6144 }, { "epoch": 0.5121666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.12394205729166667, "learning_rate": 4e-05, "loss": 4.9488, "loss/crossentropy": 1.8643578216433525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17698352597653866, "step": 6146 }, { "epoch": 0.5123333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.10491129557291666, "learning_rate": 4e-05, "loss": 5.2033, "loss/crossentropy": 1.9535821378231049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19483724609017372, "step": 6148 }, { "epoch": 0.5125, "grad_norm": 4.78125, "grad_norm_var": 0.10349934895833333, "learning_rate": 4e-05, "loss": 4.9737, "loss/crossentropy": 1.9888366162776947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.200862318277359, "step": 6150 }, { "epoch": 0.5126666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.11552327473958333, "learning_rate": 4e-05, "loss": 4.453, "loss/crossentropy": 2.2788354754447937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21191953867673874, "step": 6152 }, { "epoch": 0.5128333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.09972330729166666, "learning_rate": 4e-05, "loss": 5.0921, "loss/crossentropy": 1.3500697389245033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16570369713008404, "step": 6154 }, { "epoch": 0.513, "grad_norm": 5.625, "grad_norm_var": 0.10514322916666667, "learning_rate": 4e-05, "loss": 4.9128, "loss/crossentropy": 1.7618460059165955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21831801161170006, "step": 6156 }, { "epoch": 0.5131666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.10064697265625, "learning_rate": 4e-05, "loss": 4.8258, "loss/crossentropy": 1.8500609695911407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215445153415203, "step": 6158 }, { "epoch": 0.5133333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.09073893229166667, "learning_rate": 4e-05, "loss": 4.9059, "loss/crossentropy": 2.0788095593452454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21428800374269485, "step": 6160 }, { "epoch": 0.5135, "grad_norm": 4.75, "grad_norm_var": 0.08290608723958333, "learning_rate": 4e-05, "loss": 5.0222, "loss/crossentropy": 2.1214606761932373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101825326681137, "step": 6162 }, { "epoch": 0.5136666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08365478515625, "learning_rate": 4e-05, "loss": 3.6792, "loss/crossentropy": 1.6810518354177475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17640621587634087, "step": 6164 }, { "epoch": 0.5138333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.08957926432291667, "learning_rate": 4e-05, "loss": 4.7143, "loss/crossentropy": 1.536653459072113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18369175493717194, "step": 6166 }, { "epoch": 0.514, "grad_norm": 4.875, "grad_norm_var": 0.07291666666666667, "learning_rate": 4e-05, "loss": 4.8028, "loss/crossentropy": 2.130642056465149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23484623059630394, "step": 6168 }, { "epoch": 0.5141666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.075244140625, "learning_rate": 4e-05, "loss": 5.0955, "loss/crossentropy": 2.1398507356643677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19409223273396492, "step": 6170 }, { "epoch": 0.5143333333333333, "grad_norm": 5.25, "grad_norm_var": 0.07330322265625, "learning_rate": 4e-05, "loss": 5.0557, "loss/crossentropy": 1.8753467500209808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19965557008981705, "step": 6172 }, { "epoch": 0.5145, "grad_norm": 4.96875, "grad_norm_var": 0.06314697265625, "learning_rate": 4e-05, "loss": 4.6537, "loss/crossentropy": 2.6288909912109375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21764767169952393, "step": 6174 }, { "epoch": 0.5146666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.069775390625, "learning_rate": 4e-05, "loss": 5.392, "loss/crossentropy": 1.5557663962244987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104162685573101, "step": 6176 }, { "epoch": 0.5148333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.07288004557291666, "learning_rate": 4e-05, "loss": 4.5591, "loss/crossentropy": 1.727570228278637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1829282995313406, "step": 6178 }, { "epoch": 0.515, "grad_norm": 4.78125, "grad_norm_var": 0.0646484375, "learning_rate": 4e-05, "loss": 4.7078, "loss/crossentropy": 1.828254558146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17748953867703676, "step": 6180 }, { "epoch": 0.5151666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.059619140625, "learning_rate": 4e-05, "loss": 4.459, "loss/crossentropy": 1.7436736971139908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18016472458839417, "step": 6182 }, { "epoch": 0.5153333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.06357014973958333, "learning_rate": 4e-05, "loss": 4.848, "loss/crossentropy": 1.2710848674178123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1631698366254568, "step": 6184 }, { "epoch": 0.5155, "grad_norm": 4.5625, "grad_norm_var": 0.06633707682291666, "learning_rate": 4e-05, "loss": 4.2305, "loss/crossentropy": 1.4865316599607468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17193517833948135, "step": 6186 }, { "epoch": 0.5156666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.03606363932291667, "learning_rate": 4e-05, "loss": 4.6158, "loss/crossentropy": 1.702145777642727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.168340552598238, "step": 6188 }, { "epoch": 0.5158333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.04273681640625, "learning_rate": 4e-05, "loss": 4.5417, "loss/crossentropy": 1.9604062549769878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17854683846235275, "step": 6190 }, { "epoch": 0.516, "grad_norm": 5.15625, "grad_norm_var": 0.0416015625, "learning_rate": 4e-05, "loss": 4.8238, "loss/crossentropy": 2.2761952579021454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19496627151966095, "step": 6192 }, { "epoch": 0.5161666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.04178059895833333, "learning_rate": 4e-05, "loss": 4.8672, "loss/crossentropy": 1.6333682388067245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18837878666818142, "step": 6194 }, { "epoch": 0.5163333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.049544270833333334, "learning_rate": 4e-05, "loss": 5.2122, "loss/crossentropy": 2.6109012365341187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22294483333826065, "step": 6196 }, { "epoch": 0.5165, "grad_norm": 4.8125, "grad_norm_var": 0.058882649739583334, "learning_rate": 4e-05, "loss": 4.9916, "loss/crossentropy": 2.4936388731002808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22632497176527977, "step": 6198 }, { "epoch": 0.5166666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.05090738932291667, "learning_rate": 4e-05, "loss": 4.752, "loss/crossentropy": 2.026665262877941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2111969329416752, "step": 6200 }, { "epoch": 0.5168333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.051953125, "learning_rate": 4e-05, "loss": 4.8235, "loss/crossentropy": 2.217519849538803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086241953074932, "step": 6202 }, { "epoch": 0.517, "grad_norm": 4.34375, "grad_norm_var": 0.073046875, "learning_rate": 4e-05, "loss": 4.9519, "loss/crossentropy": 2.1753300726413727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227613627910614, "step": 6204 }, { "epoch": 0.5171666666666667, "grad_norm": 5.125, "grad_norm_var": 0.06366780598958334, "learning_rate": 4e-05, "loss": 4.4093, "loss/crossentropy": 1.853736698627472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19453158415853977, "step": 6206 }, { "epoch": 0.5173333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.0796875, "learning_rate": 4e-05, "loss": 4.5713, "loss/crossentropy": 1.8317307531833649, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17688237130641937, "step": 6208 }, { "epoch": 0.5175, "grad_norm": 4.84375, "grad_norm_var": 0.07265625, "learning_rate": 4e-05, "loss": 4.6111, "loss/crossentropy": 1.2401080802083015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18187956511974335, "step": 6210 }, { "epoch": 0.5176666666666667, "grad_norm": 4.625, "grad_norm_var": 0.10650634765625, "learning_rate": 4e-05, "loss": 4.5749, "loss/crossentropy": 2.200599730014801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23270919546484947, "step": 6212 }, { "epoch": 0.5178333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.09986572265625, "learning_rate": 4e-05, "loss": 4.7749, "loss/crossentropy": 1.5861978828907013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17271048948168755, "step": 6214 }, { "epoch": 0.518, "grad_norm": 4.46875, "grad_norm_var": 0.11744384765625, "learning_rate": 4e-05, "loss": 4.3848, "loss/crossentropy": 1.9050172120332718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19479480385780334, "step": 6216 }, { "epoch": 0.5181666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.110009765625, "learning_rate": 4e-05, "loss": 4.8868, "loss/crossentropy": 2.2683032155036926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20415033772587776, "step": 6218 }, { "epoch": 0.5183333333333333, "grad_norm": 4.75, "grad_norm_var": 0.09374593098958334, "learning_rate": 4e-05, "loss": 4.7636, "loss/crossentropy": 1.3371253162622452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14753723703324795, "step": 6220 }, { "epoch": 0.5185, "grad_norm": 4.5625, "grad_norm_var": 0.08896077473958333, "learning_rate": 4e-05, "loss": 4.9785, "loss/crossentropy": 1.7137665003538132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17877374030649662, "step": 6222 }, { "epoch": 0.5186666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.05022379557291667, "learning_rate": 4e-05, "loss": 4.9261, "loss/crossentropy": 2.3642892837524414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21913127228617668, "step": 6224 }, { "epoch": 0.5188333333333334, "grad_norm": 5.71875, "grad_norm_var": 0.12693684895833332, "learning_rate": 4e-05, "loss": 5.1342, "loss/crossentropy": 2.3839961886405945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21350585669279099, "step": 6226 }, { "epoch": 0.519, "grad_norm": 4.84375, "grad_norm_var": 0.11490885416666667, "learning_rate": 4e-05, "loss": 4.795, "loss/crossentropy": 1.5752469822764397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1464600432664156, "step": 6228 }, { "epoch": 0.5191666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.14156494140625, "learning_rate": 4e-05, "loss": 5.2829, "loss/crossentropy": 2.2646824717521667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26518501713871956, "step": 6230 }, { "epoch": 0.5193333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.12550455729166668, "learning_rate": 4e-05, "loss": 5.1681, "loss/crossentropy": 2.501288950443268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19936766847968102, "step": 6232 }, { "epoch": 0.5195, "grad_norm": 5.0625, "grad_norm_var": 0.14386393229166666, "learning_rate": 4e-05, "loss": 5.4947, "loss/crossentropy": 2.6087101250886917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20095998793840408, "step": 6234 }, { "epoch": 0.5196666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.12096354166666666, "learning_rate": 4e-05, "loss": 5.2234, "loss/crossentropy": 2.112656258046627, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1662088967859745, "step": 6236 }, { "epoch": 0.5198333333333334, "grad_norm": 5.53125, "grad_norm_var": 0.11321207682291666, "learning_rate": 4e-05, "loss": 5.4278, "loss/crossentropy": 2.4622672498226166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20646344870328903, "step": 6238 }, { "epoch": 0.52, "grad_norm": 4.875, "grad_norm_var": 0.09568684895833333, "learning_rate": 4e-05, "loss": 5.0155, "loss/crossentropy": 2.2638790011405945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19504579156637192, "step": 6240 }, { "epoch": 0.5201666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08492431640625, "learning_rate": 4e-05, "loss": 4.8904, "loss/crossentropy": 2.4643925726413727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21222343668341637, "step": 6242 }, { "epoch": 0.5203333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.10026041666666667, "learning_rate": 4e-05, "loss": 4.8743, "loss/crossentropy": 2.4396408200263977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22357990220189095, "step": 6244 }, { "epoch": 0.5205, "grad_norm": 5.03125, "grad_norm_var": 0.09052327473958334, "learning_rate": 4e-05, "loss": 5.0041, "loss/crossentropy": 1.6959142982959747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19862833246588707, "step": 6246 }, { "epoch": 0.5206666666666667, "grad_norm": 5.65625, "grad_norm_var": 0.10611979166666667, "learning_rate": 4e-05, "loss": 5.5068, "loss/crossentropy": 2.444189488887787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21889224275946617, "step": 6248 }, { "epoch": 0.5208333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.09104410807291667, "learning_rate": 4e-05, "loss": 5.0639, "loss/crossentropy": 1.7642913609743118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18225438706576824, "step": 6250 }, { "epoch": 0.521, "grad_norm": 5.03125, "grad_norm_var": 0.09120686848958333, "learning_rate": 4e-05, "loss": 5.5272, "loss/crossentropy": 1.9865412786602974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17439224757254124, "step": 6252 }, { "epoch": 0.5211666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.078759765625, "learning_rate": 4e-05, "loss": 4.7066, "loss/crossentropy": 1.6813689842820168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18430567532777786, "step": 6254 }, { "epoch": 0.5213333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.08268229166666667, "learning_rate": 4e-05, "loss": 4.8611, "loss/crossentropy": 1.4353890866041183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14516296423971653, "step": 6256 }, { "epoch": 0.5215, "grad_norm": 5.0, "grad_norm_var": 0.076025390625, "learning_rate": 4e-05, "loss": 4.7658, "loss/crossentropy": 1.1730327010154724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.135228231549263, "step": 6258 }, { "epoch": 0.5216666666666666, "grad_norm": 4.875, "grad_norm_var": 0.058837890625, "learning_rate": 4e-05, "loss": 4.8194, "loss/crossentropy": 1.7282505184412003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.166041424497962, "step": 6260 }, { "epoch": 0.5218333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.056233723958333336, "learning_rate": 4e-05, "loss": 4.8927, "loss/crossentropy": 2.077870637178421, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21072274073958397, "step": 6262 }, { "epoch": 0.522, "grad_norm": 4.75, "grad_norm_var": 0.019596354166666666, "learning_rate": 4e-05, "loss": 4.94, "loss/crossentropy": 1.6653959900140762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1690856534987688, "step": 6264 }, { "epoch": 0.5221666666666667, "grad_norm": 5.0, "grad_norm_var": 0.017508951822916667, "learning_rate": 4e-05, "loss": 4.796, "loss/crossentropy": 2.3305214643478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2003948614001274, "step": 6266 }, { "epoch": 0.5223333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.033984375, "learning_rate": 4e-05, "loss": 4.9536, "loss/crossentropy": 1.6702621951699257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18134449422359467, "step": 6268 }, { "epoch": 0.5225, "grad_norm": 4.8125, "grad_norm_var": 0.03811442057291667, "learning_rate": 4e-05, "loss": 4.5768, "loss/crossentropy": 1.830449789762497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21275094151496887, "step": 6270 }, { "epoch": 0.5226666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.03720296223958333, "learning_rate": 4e-05, "loss": 5.1588, "loss/crossentropy": 2.4117337092757225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16365902870893478, "step": 6272 }, { "epoch": 0.5228333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.05169270833333333, "learning_rate": 4e-05, "loss": 5.2077, "loss/crossentropy": 2.003813534975052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18846618384122849, "step": 6274 }, { "epoch": 0.523, "grad_norm": 4.71875, "grad_norm_var": 0.05474853515625, "learning_rate": 4e-05, "loss": 4.8736, "loss/crossentropy": 1.146153837442398, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14847451448440552, "step": 6276 }, { "epoch": 0.5231666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.045426432291666666, "learning_rate": 4e-05, "loss": 4.9438, "loss/crossentropy": 2.475542187690735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23931154608726501, "step": 6278 }, { "epoch": 0.5233333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.07845052083333333, "learning_rate": 4e-05, "loss": 4.6778, "loss/crossentropy": 1.7261252030730247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1629408374428749, "step": 6280 }, { "epoch": 0.5235, "grad_norm": 4.8125, "grad_norm_var": 0.07971598307291666, "learning_rate": 4e-05, "loss": 5.3948, "loss/crossentropy": 2.4525701999664307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2252441942691803, "step": 6282 }, { "epoch": 0.5236666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.06601155598958333, "learning_rate": 4e-05, "loss": 5.304, "loss/crossentropy": 1.8351141512393951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17090753465890884, "step": 6284 }, { "epoch": 0.5238333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.06482747395833334, "learning_rate": 4e-05, "loss": 5.3146, "loss/crossentropy": 1.9516530483961105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1840406134724617, "step": 6286 }, { "epoch": 0.524, "grad_norm": 4.71875, "grad_norm_var": 0.07057291666666667, "learning_rate": 4e-05, "loss": 5.4645, "loss/crossentropy": 1.8888097256422043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17698637954890728, "step": 6288 }, { "epoch": 0.5241666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.058447265625, "learning_rate": 4e-05, "loss": 5.0941, "loss/crossentropy": 1.8953617364168167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1759849712252617, "step": 6290 }, { "epoch": 0.5243333333333333, "grad_norm": 4.625, "grad_norm_var": 0.05982666015625, "learning_rate": 4e-05, "loss": 4.7776, "loss/crossentropy": 1.4816829040646553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15172210335731506, "step": 6292 }, { "epoch": 0.5245, "grad_norm": 4.5625, "grad_norm_var": 0.07691650390625, "learning_rate": 4e-05, "loss": 4.3964, "loss/crossentropy": 1.4299919679760933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14875125512480736, "step": 6294 }, { "epoch": 0.5246666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.053125, "learning_rate": 4e-05, "loss": 4.6746, "loss/crossentropy": 1.0042973533272743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1491411328315735, "step": 6296 }, { "epoch": 0.5248333333333334, "grad_norm": 4.75, "grad_norm_var": 0.05129801432291667, "learning_rate": 4e-05, "loss": 4.5342, "loss/crossentropy": 2.0254055559635162, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21776164323091507, "step": 6298 }, { "epoch": 0.525, "grad_norm": 5.1875, "grad_norm_var": 0.058268229166666664, "learning_rate": 4e-05, "loss": 5.0111, "loss/crossentropy": 2.035038098692894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1758381500840187, "step": 6300 }, { "epoch": 0.5251666666666667, "grad_norm": 4.875, "grad_norm_var": 0.049332682291666666, "learning_rate": 4e-05, "loss": 5.2614, "loss/crossentropy": 2.0022889897227287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18497100099921227, "step": 6302 }, { "epoch": 0.5253333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.044270833333333336, "learning_rate": 4e-05, "loss": 4.9755, "loss/crossentropy": 2.0294620618224144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19521377608180046, "step": 6304 }, { "epoch": 0.5255, "grad_norm": 5.40625, "grad_norm_var": 0.09446614583333333, "learning_rate": 4e-05, "loss": 5.3874, "loss/crossentropy": 1.909932941198349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23021681234240532, "step": 6306 }, { "epoch": 0.5256666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.09296875, "learning_rate": 4e-05, "loss": 4.9068, "loss/crossentropy": 2.2554367184638977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20018938928842545, "step": 6308 }, { "epoch": 0.5258333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.06612955729166667, "learning_rate": 4e-05, "loss": 4.9175, "loss/crossentropy": 1.856409564614296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1912154108285904, "step": 6310 }, { "epoch": 0.526, "grad_norm": 4.75, "grad_norm_var": 0.06750895182291666, "learning_rate": 4e-05, "loss": 4.9821, "loss/crossentropy": 1.97897869348526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190910242497921, "step": 6312 }, { "epoch": 0.5261666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.0728515625, "learning_rate": 4e-05, "loss": 5.2764, "loss/crossentropy": 1.6196852624416351, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1537347286939621, "step": 6314 }, { "epoch": 0.5263333333333333, "grad_norm": 5.0, "grad_norm_var": 0.07271728515625, "learning_rate": 4e-05, "loss": 4.9833, "loss/crossentropy": 2.2614503502845764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2053113467991352, "step": 6316 }, { "epoch": 0.5265, "grad_norm": 4.71875, "grad_norm_var": 9.188346354166667, "learning_rate": 4e-05, "loss": 5.0568, "loss/crossentropy": 1.965643584728241, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18929314240813255, "step": 6318 }, { "epoch": 0.5266666666666666, "grad_norm": 5.0625, "grad_norm_var": 9.125504557291666, "learning_rate": 4e-05, "loss": 5.2748, "loss/crossentropy": 1.6978831887245178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468260884284973, "step": 6320 }, { "epoch": 0.5268333333333334, "grad_norm": 5.0, "grad_norm_var": 9.159228515625, "learning_rate": 4e-05, "loss": 5.1958, "loss/crossentropy": 1.967081904411316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20972293615341187, "step": 6322 }, { "epoch": 0.527, "grad_norm": 4.96875, "grad_norm_var": 9.119755045572917, "learning_rate": 4e-05, "loss": 4.5926, "loss/crossentropy": 1.6954425349831581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1549944244325161, "step": 6324 }, { "epoch": 0.5271666666666667, "grad_norm": 5.4375, "grad_norm_var": 9.114957682291667, "learning_rate": 4e-05, "loss": 5.5283, "loss/crossentropy": 2.016428828239441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2225477620959282, "step": 6326 }, { "epoch": 0.5273333333333333, "grad_norm": 5.28125, "grad_norm_var": 9.015478515625, "learning_rate": 4e-05, "loss": 5.3683, "loss/crossentropy": 2.1403996646404266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20416608452796936, "step": 6328 }, { "epoch": 0.5275, "grad_norm": 4.53125, "grad_norm_var": 8.976236979166666, "learning_rate": 4e-05, "loss": 4.8732, "loss/crossentropy": 1.7906945049762726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17962076887488365, "step": 6330 }, { "epoch": 0.5276666666666666, "grad_norm": 4.75, "grad_norm_var": 9.127864583333333, "learning_rate": 4e-05, "loss": 4.3215, "loss/crossentropy": 1.274952009320259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.134726008400321, "step": 6332 }, { "epoch": 0.5278333333333334, "grad_norm": 4.625, "grad_norm_var": 0.23472900390625, "learning_rate": 4e-05, "loss": 4.8964, "loss/crossentropy": 1.7225348949432373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21106019616127014, "step": 6334 }, { "epoch": 0.528, "grad_norm": 5.46875, "grad_norm_var": 0.24270833333333333, "learning_rate": 4e-05, "loss": 5.323, "loss/crossentropy": 2.2533539831638336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20605823397636414, "step": 6336 }, { "epoch": 0.5281666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.239306640625, "learning_rate": 4e-05, "loss": 5.1128, "loss/crossentropy": 2.207033485174179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21361444517970085, "step": 6338 }, { "epoch": 0.5283333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.2624308268229167, "learning_rate": 4e-05, "loss": 4.8296, "loss/crossentropy": 2.4584856629371643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20966831594705582, "step": 6340 }, { "epoch": 0.5285, "grad_norm": 5.03125, "grad_norm_var": 0.25774739583333334, "learning_rate": 4e-05, "loss": 4.717, "loss/crossentropy": 1.4203026741743088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15853681229054928, "step": 6342 }, { "epoch": 0.5286666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.21503499348958333, "learning_rate": 4e-05, "loss": 4.8863, "loss/crossentropy": 1.733842521905899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21481941640377045, "step": 6344 }, { "epoch": 0.5288333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.104931640625, "learning_rate": 4e-05, "loss": 4.6211, "loss/crossentropy": 2.055474132299423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19885016605257988, "step": 6346 }, { "epoch": 0.529, "grad_norm": 4.75, "grad_norm_var": 0.073681640625, "learning_rate": 4e-05, "loss": 4.5627, "loss/crossentropy": 1.6423608288168907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17252543941140175, "step": 6348 }, { "epoch": 0.5291666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.072509765625, "learning_rate": 4e-05, "loss": 4.8631, "loss/crossentropy": 2.213515669107437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20754803717136383, "step": 6350 }, { "epoch": 0.5293333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.0546875, "learning_rate": 4e-05, "loss": 4.9258, "loss/crossentropy": 1.778283067047596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.174639031291008, "step": 6352 }, { "epoch": 0.5295, "grad_norm": 5.4375, "grad_norm_var": 0.07261962890625, "learning_rate": 4e-05, "loss": 4.9626, "loss/crossentropy": 2.278725653886795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21602385863661766, "step": 6354 }, { "epoch": 0.5296666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.0583984375, "learning_rate": 4e-05, "loss": 4.9928, "loss/crossentropy": 1.8386949226260185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17753521539270878, "step": 6356 }, { "epoch": 0.5298333333333334, "grad_norm": 4.75, "grad_norm_var": 0.063525390625, "learning_rate": 4e-05, "loss": 4.4814, "loss/crossentropy": 1.8008562847971916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1900603324174881, "step": 6358 }, { "epoch": 0.53, "grad_norm": 4.59375, "grad_norm_var": 0.07496337890625, "learning_rate": 4e-05, "loss": 4.5678, "loss/crossentropy": 1.1889959871768951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15015914104878902, "step": 6360 }, { "epoch": 0.5301666666666667, "grad_norm": 5.0, "grad_norm_var": 0.07643229166666667, "learning_rate": 4e-05, "loss": 5.053, "loss/crossentropy": 2.290656328201294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23251566290855408, "step": 6362 }, { "epoch": 0.5303333333333333, "grad_norm": 5.84375, "grad_norm_var": 0.11545817057291667, "learning_rate": 4e-05, "loss": 5.3294, "loss/crossentropy": 2.3371450304985046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21618635579943657, "step": 6364 }, { "epoch": 0.5305, "grad_norm": 5.125, "grad_norm_var": 0.11022135416666666, "learning_rate": 4e-05, "loss": 5.2062, "loss/crossentropy": 2.769070029258728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21105346828699112, "step": 6366 }, { "epoch": 0.5306666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.10240885416666666, "learning_rate": 4e-05, "loss": 5.3696, "loss/crossentropy": 2.346166968345642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2124275527894497, "step": 6368 }, { "epoch": 0.5308333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.098046875, "learning_rate": 4e-05, "loss": 5.5516, "loss/crossentropy": 2.1462940871715546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2227632701396942, "step": 6370 }, { "epoch": 0.531, "grad_norm": 4.625, "grad_norm_var": 0.10780843098958333, "learning_rate": 4e-05, "loss": 4.9736, "loss/crossentropy": 2.0071266889572144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20992008224129677, "step": 6372 }, { "epoch": 0.5311666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.108056640625, "learning_rate": 4e-05, "loss": 4.4188, "loss/crossentropy": 1.9117163717746735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18441595882177353, "step": 6374 }, { "epoch": 0.5313333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.08993733723958333, "learning_rate": 4e-05, "loss": 4.6976, "loss/crossentropy": 1.9666685834527016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19188417494297028, "step": 6376 }, { "epoch": 0.5315, "grad_norm": 5.0625, "grad_norm_var": 0.09898681640625, "learning_rate": 4e-05, "loss": 4.5196, "loss/crossentropy": 1.652912124991417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16498231142759323, "step": 6378 }, { "epoch": 0.5316666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.05237223307291667, "learning_rate": 4e-05, "loss": 4.3164, "loss/crossentropy": 1.8182547390460968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19134299270808697, "step": 6380 }, { "epoch": 0.5318333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.05318603515625, "learning_rate": 4e-05, "loss": 5.0686, "loss/crossentropy": 2.0795591175556183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21680190041661263, "step": 6382 }, { "epoch": 0.532, "grad_norm": 5.03125, "grad_norm_var": 0.05011393229166667, "learning_rate": 4e-05, "loss": 5.5201, "loss/crossentropy": 1.865036815404892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2006813958287239, "step": 6384 }, { "epoch": 0.5321666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.07470296223958334, "learning_rate": 4e-05, "loss": 5.0107, "loss/crossentropy": 2.5886573791503906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073320634663105, "step": 6386 }, { "epoch": 0.5323333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.06744384765625, "learning_rate": 4e-05, "loss": 4.2948, "loss/crossentropy": 1.7393722236156464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19270563125610352, "step": 6388 }, { "epoch": 0.5325, "grad_norm": 4.59375, "grad_norm_var": 0.07693684895833333, "learning_rate": 4e-05, "loss": 4.8029, "loss/crossentropy": 1.8167866170406342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17310548946261406, "step": 6390 }, { "epoch": 0.5326666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.10432535807291667, "learning_rate": 4e-05, "loss": 4.0045, "loss/crossentropy": 1.742269590497017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17706291005015373, "step": 6392 }, { "epoch": 0.5328333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.10787353515625, "learning_rate": 4e-05, "loss": 5.0616, "loss/crossentropy": 1.8916596919298172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17616764828562737, "step": 6394 }, { "epoch": 0.533, "grad_norm": 4.6875, "grad_norm_var": 0.10435791015625, "learning_rate": 4e-05, "loss": 5.1281, "loss/crossentropy": 2.4375243186950684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24222580343484879, "step": 6396 }, { "epoch": 0.5331666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.10123291015625, "learning_rate": 4e-05, "loss": 4.469, "loss/crossentropy": 2.201574385166168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19397074729204178, "step": 6398 }, { "epoch": 0.5333333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.09850260416666666, "learning_rate": 4e-05, "loss": 4.7124, "loss/crossentropy": 1.9350962042808533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19658659398555756, "step": 6400 }, { "epoch": 0.5335, "grad_norm": 4.78125, "grad_norm_var": 0.06560872395833334, "learning_rate": 4e-05, "loss": 5.2431, "loss/crossentropy": 1.468439057469368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16111770644783974, "step": 6402 }, { "epoch": 0.5336666666666666, "grad_norm": 4.875, "grad_norm_var": 0.067822265625, "learning_rate": 4e-05, "loss": 5.0469, "loss/crossentropy": 2.328448623418808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2267051674425602, "step": 6404 }, { "epoch": 0.5338333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.06275634765625, "learning_rate": 4e-05, "loss": 4.628, "loss/crossentropy": 2.319155514240265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2297566793859005, "step": 6406 }, { "epoch": 0.534, "grad_norm": 5.1875, "grad_norm_var": 0.06370035807291667, "learning_rate": 4e-05, "loss": 4.4522, "loss/crossentropy": 1.6990758031606674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14935988560318947, "step": 6408 }, { "epoch": 0.5341666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.052083333333333336, "learning_rate": 4e-05, "loss": 4.8484, "loss/crossentropy": 1.4202308654785156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16745430044829845, "step": 6410 }, { "epoch": 0.5343333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.05245768229166667, "learning_rate": 4e-05, "loss": 5.2703, "loss/crossentropy": 2.6230361461639404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23828474059700966, "step": 6412 }, { "epoch": 0.5345, "grad_norm": 4.6875, "grad_norm_var": 0.05701497395833333, "learning_rate": 4e-05, "loss": 4.6883, "loss/crossentropy": 2.146642565727234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937054954469204, "step": 6414 }, { "epoch": 0.5346666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.06092122395833333, "learning_rate": 4e-05, "loss": 4.7823, "loss/crossentropy": 1.6736676394939423, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18550512567162514, "step": 6416 }, { "epoch": 0.5348333333333334, "grad_norm": 5.21875, "grad_norm_var": 0.07342122395833334, "learning_rate": 4e-05, "loss": 4.7432, "loss/crossentropy": 1.7560848370194435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17786714434623718, "step": 6418 }, { "epoch": 0.535, "grad_norm": 4.78125, "grad_norm_var": 0.07454427083333333, "learning_rate": 4e-05, "loss": 5.1035, "loss/crossentropy": 1.8662885650992393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1651218943297863, "step": 6420 }, { "epoch": 0.5351666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.072265625, "learning_rate": 4e-05, "loss": 5.3528, "loss/crossentropy": 2.4681405425071716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21770339086651802, "step": 6422 }, { "epoch": 0.5353333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.05243733723958333, "learning_rate": 4e-05, "loss": 5.2777, "loss/crossentropy": 1.8275192975997925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23562873154878616, "step": 6424 }, { "epoch": 0.5355, "grad_norm": 4.5, "grad_norm_var": 0.06552327473958333, "learning_rate": 4e-05, "loss": 4.8877, "loss/crossentropy": 2.511595845222473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2235119715332985, "step": 6426 }, { "epoch": 0.5356666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.06607666015625, "learning_rate": 4e-05, "loss": 4.6033, "loss/crossentropy": 2.1008317470550537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20949330553412437, "step": 6428 }, { "epoch": 0.5358333333333334, "grad_norm": 4.625, "grad_norm_var": 0.06357014973958333, "learning_rate": 4e-05, "loss": 4.7668, "loss/crossentropy": 1.8257400766015053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19425482116639614, "step": 6430 }, { "epoch": 0.536, "grad_norm": 5.21875, "grad_norm_var": 0.0697265625, "learning_rate": 4e-05, "loss": 4.8881, "loss/crossentropy": 2.2771336138248444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748082920908928, "step": 6432 }, { "epoch": 0.5361666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.06666666666666667, "learning_rate": 4e-05, "loss": 5.2098, "loss/crossentropy": 1.7778798043727875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17552071809768677, "step": 6434 }, { "epoch": 0.5363333333333333, "grad_norm": 5.0, "grad_norm_var": 0.06443684895833333, "learning_rate": 4e-05, "loss": 4.7605, "loss/crossentropy": 1.6425282210111618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1681121438741684, "step": 6436 }, { "epoch": 0.5365, "grad_norm": 4.75, "grad_norm_var": 0.06529541015625, "learning_rate": 4e-05, "loss": 5.4284, "loss/crossentropy": 2.383900284767151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20339232310652733, "step": 6438 }, { "epoch": 0.5366666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.060139973958333336, "learning_rate": 4e-05, "loss": 5.2453, "loss/crossentropy": 2.399388551712036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.196612898260355, "step": 6440 }, { "epoch": 0.5368333333333334, "grad_norm": 5.375, "grad_norm_var": 0.06261393229166666, "learning_rate": 4e-05, "loss": 5.3347, "loss/crossentropy": 1.2793971821665764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15150474943220615, "step": 6442 }, { "epoch": 0.537, "grad_norm": 4.65625, "grad_norm_var": 0.07916259765625, "learning_rate": 4e-05, "loss": 4.8687, "loss/crossentropy": 1.5082692801952362, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1459091752767563, "step": 6444 }, { "epoch": 0.5371666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.07336832682291666, "learning_rate": 4e-05, "loss": 4.8662, "loss/crossentropy": 2.350295513868332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22841082885861397, "step": 6446 }, { "epoch": 0.5373333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.05792643229166667, "learning_rate": 4e-05, "loss": 4.9448, "loss/crossentropy": 2.585313081741333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21179723367094994, "step": 6448 }, { "epoch": 0.5375, "grad_norm": 4.75, "grad_norm_var": 0.05243733723958333, "learning_rate": 4e-05, "loss": 4.3929, "loss/crossentropy": 2.019158661365509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110278531908989, "step": 6450 }, { "epoch": 0.5376666666666666, "grad_norm": 5.125, "grad_norm_var": 0.05935872395833333, "learning_rate": 4e-05, "loss": 4.6044, "loss/crossentropy": 1.8365314081311226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1840047501027584, "step": 6452 }, { "epoch": 0.5378333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.06471354166666667, "learning_rate": 4e-05, "loss": 4.2156, "loss/crossentropy": 2.1785257756710052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20807776227593422, "step": 6454 }, { "epoch": 0.538, "grad_norm": 4.6875, "grad_norm_var": 0.09576416015625, "learning_rate": 4e-05, "loss": 5.2141, "loss/crossentropy": 2.0053779631853104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18812064826488495, "step": 6456 }, { "epoch": 0.5381666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.0833984375, "learning_rate": 4e-05, "loss": 5.6892, "loss/crossentropy": 1.9643101394176483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20990155264735222, "step": 6458 }, { "epoch": 0.5383333333333333, "grad_norm": 6.03125, "grad_norm_var": 0.14892171223958334, "learning_rate": 4e-05, "loss": 5.1658, "loss/crossentropy": 1.853164553642273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23421235010027885, "step": 6460 }, { "epoch": 0.5385, "grad_norm": 4.625, "grad_norm_var": 0.16213785807291667, "learning_rate": 4e-05, "loss": 5.281, "loss/crossentropy": 1.398292914032936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15396402776241302, "step": 6462 }, { "epoch": 0.5386666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.16419270833333333, "learning_rate": 4e-05, "loss": 4.6955, "loss/crossentropy": 2.4291781187057495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2074362002313137, "step": 6464 }, { "epoch": 0.5388333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.161181640625, "learning_rate": 4e-05, "loss": 4.573, "loss/crossentropy": 1.8937534540891647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.178242489695549, "step": 6466 }, { "epoch": 0.539, "grad_norm": 4.875, "grad_norm_var": 0.15754801432291668, "learning_rate": 4e-05, "loss": 5.2892, "loss/crossentropy": 2.6727017164230347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22374076396226883, "step": 6468 }, { "epoch": 0.5391666666666667, "grad_norm": 4.75, "grad_norm_var": 0.14425455729166667, "learning_rate": 4e-05, "loss": 4.9342, "loss/crossentropy": 2.3451661467552185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21111416071653366, "step": 6470 }, { "epoch": 0.5393333333333333, "grad_norm": 4.875, "grad_norm_var": 0.12112223307291667, "learning_rate": 4e-05, "loss": 4.7019, "loss/crossentropy": 1.8271106332540512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1798660308122635, "step": 6472 }, { "epoch": 0.5395, "grad_norm": 4.84375, "grad_norm_var": 0.11998291015625, "learning_rate": 4e-05, "loss": 5.0453, "loss/crossentropy": 2.053663656115532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18994637206196785, "step": 6474 }, { "epoch": 0.5396666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.03534749348958333, "learning_rate": 4e-05, "loss": 5.4068, "loss/crossentropy": 2.02348530292511, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2410239391028881, "step": 6476 }, { "epoch": 0.5398333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.03681233723958333, "learning_rate": 4e-05, "loss": 4.5837, "loss/crossentropy": 1.0813293680548668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13367479853332043, "step": 6478 }, { "epoch": 0.54, "grad_norm": 5.34375, "grad_norm_var": 0.05956624348958333, "learning_rate": 4e-05, "loss": 4.6037, "loss/crossentropy": 1.643570214509964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726165246218443, "step": 6480 }, { "epoch": 0.5401666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.063134765625, "learning_rate": 4e-05, "loss": 4.9758, "loss/crossentropy": 2.1287569105625153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20907136425375938, "step": 6482 }, { "epoch": 0.5403333333333333, "grad_norm": 4.875, "grad_norm_var": 0.05318603515625, "learning_rate": 4e-05, "loss": 4.8937, "loss/crossentropy": 1.5239028334617615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893562152981758, "step": 6484 }, { "epoch": 0.5405, "grad_norm": 5.0, "grad_norm_var": 0.0537109375, "learning_rate": 4e-05, "loss": 5.1236, "loss/crossentropy": 2.2508333921432495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21869652345776558, "step": 6486 }, { "epoch": 0.5406666666666666, "grad_norm": 4.875, "grad_norm_var": 0.12961832682291666, "learning_rate": 4e-05, "loss": 5.061, "loss/crossentropy": 2.1860940158367157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25951458886265755, "step": 6488 }, { "epoch": 0.5408333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.12838541666666667, "learning_rate": 4e-05, "loss": 4.693, "loss/crossentropy": 1.9693054556846619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21282178536057472, "step": 6490 }, { "epoch": 0.541, "grad_norm": 5.09375, "grad_norm_var": 0.12858072916666666, "learning_rate": 4e-05, "loss": 4.1882, "loss/crossentropy": 1.9670451954007149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18181007727980614, "step": 6492 }, { "epoch": 0.5411666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.12255452473958334, "learning_rate": 4e-05, "loss": 4.8558, "loss/crossentropy": 2.0306060314178467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18981993943452835, "step": 6494 }, { "epoch": 0.5413333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.10234375, "learning_rate": 4e-05, "loss": 5.1075, "loss/crossentropy": 1.1169188246130943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11659727245569229, "step": 6496 }, { "epoch": 0.5415, "grad_norm": 5.0, "grad_norm_var": 0.10701497395833333, "learning_rate": 4e-05, "loss": 5.3556, "loss/crossentropy": 2.5187776684761047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22361308708786964, "step": 6498 }, { "epoch": 0.5416666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.11262613932291667, "learning_rate": 4e-05, "loss": 4.6649, "loss/crossentropy": 2.0278166234493256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21406276151537895, "step": 6500 }, { "epoch": 0.5418333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.14423421223958333, "learning_rate": 4e-05, "loss": 4.5689, "loss/crossentropy": 0.8223181739449501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11125335656106472, "step": 6502 }, { "epoch": 0.542, "grad_norm": 4.90625, "grad_norm_var": 0.0802734375, "learning_rate": 4e-05, "loss": 4.9923, "loss/crossentropy": 1.8635541796684265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24781085550785065, "step": 6504 }, { "epoch": 0.5421666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.088671875, "learning_rate": 4e-05, "loss": 4.5678, "loss/crossentropy": 1.9120320081710815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1711360290646553, "step": 6506 }, { "epoch": 0.5423333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08307291666666666, "learning_rate": 4e-05, "loss": 5.228, "loss/crossentropy": 2.482433497905731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151547260582447, "step": 6508 }, { "epoch": 0.5425, "grad_norm": 5.0, "grad_norm_var": 0.08990885416666666, "learning_rate": 4e-05, "loss": 5.2356, "loss/crossentropy": 1.492501512169838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15523040667176247, "step": 6510 }, { "epoch": 0.5426666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.10666910807291667, "learning_rate": 4e-05, "loss": 4.8739, "loss/crossentropy": 1.1665829196572304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1341555155813694, "step": 6512 }, { "epoch": 0.5428333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.09289957682291666, "learning_rate": 4e-05, "loss": 4.411, "loss/crossentropy": 1.2798718959093094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13985286466777325, "step": 6514 }, { "epoch": 0.543, "grad_norm": 4.65625, "grad_norm_var": 0.11809895833333334, "learning_rate": 4e-05, "loss": 4.8671, "loss/crossentropy": 1.679627038538456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18767642788589, "step": 6516 }, { "epoch": 0.5431666666666667, "grad_norm": 5.65625, "grad_norm_var": 0.12561442057291666, "learning_rate": 4e-05, "loss": 4.9561, "loss/crossentropy": 2.608290433883667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20927531272172928, "step": 6518 }, { "epoch": 0.5433333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.13268229166666667, "learning_rate": 4e-05, "loss": 4.8244, "loss/crossentropy": 2.111786961555481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18980854004621506, "step": 6520 }, { "epoch": 0.5435, "grad_norm": 5.09375, "grad_norm_var": 0.12102457682291666, "learning_rate": 4e-05, "loss": 5.3594, "loss/crossentropy": 2.082971006631851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20488620921969414, "step": 6522 }, { "epoch": 0.5436666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.12263997395833333, "learning_rate": 4e-05, "loss": 5.182, "loss/crossentropy": 1.7450605109333992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21161355637013912, "step": 6524 }, { "epoch": 0.5438333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.11705729166666666, "learning_rate": 4e-05, "loss": 5.3596, "loss/crossentropy": 2.4712526500225067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20881268754601479, "step": 6526 }, { "epoch": 0.544, "grad_norm": 4.71875, "grad_norm_var": 0.101025390625, "learning_rate": 4e-05, "loss": 5.2519, "loss/crossentropy": 2.10575695335865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18501529842615128, "step": 6528 }, { "epoch": 0.5441666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.11418863932291666, "learning_rate": 4e-05, "loss": 5.2655, "loss/crossentropy": 2.047022193670273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2391316220164299, "step": 6530 }, { "epoch": 0.5443333333333333, "grad_norm": 4.75, "grad_norm_var": 0.09221598307291666, "learning_rate": 4e-05, "loss": 4.3932, "loss/crossentropy": 2.1609503626823425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19315851852297783, "step": 6532 }, { "epoch": 0.5445, "grad_norm": 5.15625, "grad_norm_var": 0.05872395833333333, "learning_rate": 4e-05, "loss": 5.2578, "loss/crossentropy": 2.1153732389211655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17025521025061607, "step": 6534 }, { "epoch": 0.5446666666666666, "grad_norm": 5.0, "grad_norm_var": 0.055074055989583336, "learning_rate": 4e-05, "loss": 4.9669, "loss/crossentropy": 1.997760385274887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18793719820678234, "step": 6536 }, { "epoch": 0.5448333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.06131184895833333, "learning_rate": 4e-05, "loss": 4.9012, "loss/crossentropy": 1.9758115112781525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17915968596935272, "step": 6538 }, { "epoch": 0.545, "grad_norm": 5.28125, "grad_norm_var": 0.06653645833333334, "learning_rate": 4e-05, "loss": 5.3909, "loss/crossentropy": 1.5755042657256126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.164706502109766, "step": 6540 }, { "epoch": 0.5451666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.06952718098958334, "learning_rate": 4e-05, "loss": 4.3076, "loss/crossentropy": 0.9922100901603699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12771181762218475, "step": 6542 }, { "epoch": 0.5453333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.0744140625, "learning_rate": 4e-05, "loss": 5.0314, "loss/crossentropy": 2.3435046076774597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274561271071434, "step": 6544 }, { "epoch": 0.5455, "grad_norm": 4.65625, "grad_norm_var": 0.05074462890625, "learning_rate": 4e-05, "loss": 5.3025, "loss/crossentropy": 2.2309907376766205, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21106267720460892, "step": 6546 }, { "epoch": 0.5456666666666666, "grad_norm": 5.125, "grad_norm_var": 0.058577473958333334, "learning_rate": 4e-05, "loss": 4.8551, "loss/crossentropy": 1.9048963338136673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19614364951848984, "step": 6548 }, { "epoch": 0.5458333333333333, "grad_norm": 5.375, "grad_norm_var": 0.06712239583333333, "learning_rate": 4e-05, "loss": 5.2287, "loss/crossentropy": 2.3945577144622803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21453485265374184, "step": 6550 }, { "epoch": 0.546, "grad_norm": 4.8125, "grad_norm_var": 0.066259765625, "learning_rate": 4e-05, "loss": 4.4469, "loss/crossentropy": 1.5723232179880142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20032503828406334, "step": 6552 }, { "epoch": 0.5461666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.05904541015625, "learning_rate": 4e-05, "loss": 4.8559, "loss/crossentropy": 1.4620828106999397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15637214109301567, "step": 6554 }, { "epoch": 0.5463333333333333, "grad_norm": 4.875, "grad_norm_var": 0.057535807291666664, "learning_rate": 4e-05, "loss": 4.9627, "loss/crossentropy": 2.2152084708213806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839854922145605, "step": 6556 }, { "epoch": 0.5465, "grad_norm": 4.625, "grad_norm_var": 0.06353759765625, "learning_rate": 4e-05, "loss": 4.8659, "loss/crossentropy": 1.2578969895839691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1370962280780077, "step": 6558 }, { "epoch": 0.5466666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.11829427083333334, "learning_rate": 4e-05, "loss": 4.7406, "loss/crossentropy": 1.8498205542564392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.182185810059309, "step": 6560 }, { "epoch": 0.5468333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.11184895833333333, "learning_rate": 4e-05, "loss": 5.1587, "loss/crossentropy": 2.459185838699341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21943962574005127, "step": 6562 }, { "epoch": 0.547, "grad_norm": 4.65625, "grad_norm_var": 0.10388997395833334, "learning_rate": 4e-05, "loss": 4.7703, "loss/crossentropy": 1.976406842470169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1768235471099615, "step": 6564 }, { "epoch": 0.5471666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.10022379557291666, "learning_rate": 4e-05, "loss": 4.7495, "loss/crossentropy": 2.067137509584427, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20023474469780922, "step": 6566 }, { "epoch": 0.5473333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.101416015625, "learning_rate": 4e-05, "loss": 5.0631, "loss/crossentropy": 2.285651355981827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20043394714593887, "step": 6568 }, { "epoch": 0.5475, "grad_norm": 4.8125, "grad_norm_var": 0.09970296223958333, "learning_rate": 4e-05, "loss": 5.0448, "loss/crossentropy": 2.1361614763736725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.196052685379982, "step": 6570 }, { "epoch": 0.5476666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.09622395833333333, "learning_rate": 4e-05, "loss": 5.0062, "loss/crossentropy": 2.1231243014335632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22360917553305626, "step": 6572 }, { "epoch": 0.5478333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.08665364583333333, "learning_rate": 4e-05, "loss": 4.873, "loss/crossentropy": 1.7845403030514717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827959530055523, "step": 6574 }, { "epoch": 0.548, "grad_norm": 5.0, "grad_norm_var": 0.026302083333333334, "learning_rate": 4e-05, "loss": 4.5735, "loss/crossentropy": 1.8567826747894287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2372213713824749, "step": 6576 }, { "epoch": 0.5481666666666667, "grad_norm": 5.125, "grad_norm_var": 0.038960774739583336, "learning_rate": 4e-05, "loss": 5.2786, "loss/crossentropy": 2.3799859285354614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2384014129638672, "step": 6578 }, { "epoch": 0.5483333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.039322916666666666, "learning_rate": 4e-05, "loss": 4.8077, "loss/crossentropy": 1.8382329940795898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2353934571146965, "step": 6580 }, { "epoch": 0.5485, "grad_norm": 4.8125, "grad_norm_var": 0.032666015625, "learning_rate": 4e-05, "loss": 5.2369, "loss/crossentropy": 2.491079866886139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22125229239463806, "step": 6582 }, { "epoch": 0.5486666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.04107666015625, "learning_rate": 4e-05, "loss": 4.5376, "loss/crossentropy": 1.3073586374521255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16539104282855988, "step": 6584 }, { "epoch": 0.5488333333333333, "grad_norm": 4.875, "grad_norm_var": 0.043863932291666664, "learning_rate": 4e-05, "loss": 4.9317, "loss/crossentropy": 2.1026156544685364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20953651517629623, "step": 6586 }, { "epoch": 0.549, "grad_norm": 5.0, "grad_norm_var": 0.039957682291666664, "learning_rate": 4e-05, "loss": 4.5531, "loss/crossentropy": 2.3541979789733887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21336714923381805, "step": 6588 }, { "epoch": 0.5491666666666667, "grad_norm": 4.75, "grad_norm_var": 0.0408203125, "learning_rate": 4e-05, "loss": 4.8354, "loss/crossentropy": 1.3233697563409805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15271325409412384, "step": 6590 }, { "epoch": 0.5493333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.039713541666666664, "learning_rate": 4e-05, "loss": 4.7838, "loss/crossentropy": 1.797164410352707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19014303386211395, "step": 6592 }, { "epoch": 0.5495, "grad_norm": 5.03125, "grad_norm_var": 0.028369140625, "learning_rate": 4e-05, "loss": 4.5686, "loss/crossentropy": 2.195107936859131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21631298959255219, "step": 6594 }, { "epoch": 0.5496666666666666, "grad_norm": 5.0, "grad_norm_var": 0.028804524739583334, "learning_rate": 4e-05, "loss": 5.015, "loss/crossentropy": 1.7663453668355942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18998459354043007, "step": 6596 }, { "epoch": 0.5498333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.03534749348958333, "learning_rate": 4e-05, "loss": 5.1948, "loss/crossentropy": 1.7619916647672653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1652333326637745, "step": 6598 }, { "epoch": 0.55, "grad_norm": 5.25, "grad_norm_var": 0.03240559895833333, "learning_rate": 4e-05, "loss": 5.046, "loss/crossentropy": 2.5450315475463867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2705002650618553, "step": 6600 }, { "epoch": 0.5501666666666667, "grad_norm": 5.125, "grad_norm_var": 0.028446451822916666, "learning_rate": 4e-05, "loss": 5.451, "loss/crossentropy": 1.96433687210083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157164253294468, "step": 6602 }, { "epoch": 0.5503333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.035477701822916666, "learning_rate": 4e-05, "loss": 4.8976, "loss/crossentropy": 2.3359290957450867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21508868411183357, "step": 6604 }, { "epoch": 0.5505, "grad_norm": 4.90625, "grad_norm_var": 0.03186442057291667, "learning_rate": 4e-05, "loss": 4.457, "loss/crossentropy": 1.537950836122036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15239088609814644, "step": 6606 }, { "epoch": 0.5506666666666666, "grad_norm": 4.5, "grad_norm_var": 0.050065104166666666, "learning_rate": 4e-05, "loss": 4.447, "loss/crossentropy": 1.3219322934746742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14767685532569885, "step": 6608 }, { "epoch": 0.5508333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.05035400390625, "learning_rate": 4e-05, "loss": 5.0461, "loss/crossentropy": 1.9098602086305618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19876160100102425, "step": 6610 }, { "epoch": 0.551, "grad_norm": 5.1875, "grad_norm_var": 0.04855143229166667, "learning_rate": 4e-05, "loss": 5.2442, "loss/crossentropy": 2.309624195098877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23638612031936646, "step": 6612 }, { "epoch": 0.5511666666666667, "grad_norm": 4.75, "grad_norm_var": 0.047265625, "learning_rate": 4e-05, "loss": 4.8379, "loss/crossentropy": 1.9966806918382645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17079764790832996, "step": 6614 }, { "epoch": 0.5513333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.04394124348958333, "learning_rate": 4e-05, "loss": 4.3536, "loss/crossentropy": 2.0822777450084686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22634311020374298, "step": 6616 }, { "epoch": 0.5515, "grad_norm": 4.8125, "grad_norm_var": 0.04021809895833333, "learning_rate": 4e-05, "loss": 4.7073, "loss/crossentropy": 1.2211828529834747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18051068857312202, "step": 6618 }, { "epoch": 0.5516666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.03762613932291667, "learning_rate": 4e-05, "loss": 5.5942, "loss/crossentropy": 2.4567679166793823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20884644612669945, "step": 6620 }, { "epoch": 0.5518333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.05188395182291667, "learning_rate": 4e-05, "loss": 5.4697, "loss/crossentropy": 2.2509296238422394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2176789492368698, "step": 6622 }, { "epoch": 0.552, "grad_norm": 4.6875, "grad_norm_var": 0.042801920572916666, "learning_rate": 4e-05, "loss": 5.278, "loss/crossentropy": 1.8549513220787048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1713106855750084, "step": 6624 }, { "epoch": 0.5521666666666667, "grad_norm": 4.5, "grad_norm_var": 0.048681640625, "learning_rate": 4e-05, "loss": 4.3434, "loss/crossentropy": 1.9512775838375092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18272232450544834, "step": 6626 }, { "epoch": 0.5523333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.05758056640625, "learning_rate": 4e-05, "loss": 4.2377, "loss/crossentropy": 1.8292421698570251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19010495953261852, "step": 6628 }, { "epoch": 0.5525, "grad_norm": 5.0, "grad_norm_var": 0.06249593098958333, "learning_rate": 4e-05, "loss": 4.8372, "loss/crossentropy": 1.668544426560402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17395753040909767, "step": 6630 }, { "epoch": 0.5526666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.05657145182291667, "learning_rate": 4e-05, "loss": 4.9377, "loss/crossentropy": 1.7942739725112915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17174866795539856, "step": 6632 }, { "epoch": 0.5528333333333333, "grad_norm": 5.5, "grad_norm_var": 0.08566080729166667, "learning_rate": 4e-05, "loss": 5.575, "loss/crossentropy": 2.2777758836746216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2089482769370079, "step": 6634 }, { "epoch": 0.553, "grad_norm": 4.71875, "grad_norm_var": 0.08332926432291667, "learning_rate": 4e-05, "loss": 4.9785, "loss/crossentropy": 2.664782464504242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22308696061372757, "step": 6636 }, { "epoch": 0.5531666666666667, "grad_norm": 4.1875, "grad_norm_var": 0.1029296875, "learning_rate": 4e-05, "loss": 4.2995, "loss/crossentropy": 1.231726422905922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14293777011334896, "step": 6638 }, { "epoch": 0.5533333333333333, "grad_norm": 5.0, "grad_norm_var": 0.10354410807291667, "learning_rate": 4e-05, "loss": 4.8424, "loss/crossentropy": 1.7617796063423157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960979737341404, "step": 6640 }, { "epoch": 0.5535, "grad_norm": 4.8125, "grad_norm_var": 0.0982421875, "learning_rate": 4e-05, "loss": 4.9507, "loss/crossentropy": 2.1851932406425476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22447919100522995, "step": 6642 }, { "epoch": 0.5536666666666666, "grad_norm": 4.625, "grad_norm_var": 0.08658854166666667, "learning_rate": 4e-05, "loss": 4.4639, "loss/crossentropy": 1.8027335181832314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19817146845161915, "step": 6644 }, { "epoch": 0.5538333333333333, "grad_norm": 4.625, "grad_norm_var": 0.08967692057291667, "learning_rate": 4e-05, "loss": 5.0155, "loss/crossentropy": 1.1861390694975853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14436486922204494, "step": 6646 }, { "epoch": 0.554, "grad_norm": 4.875, "grad_norm_var": 0.09073893229166667, "learning_rate": 4e-05, "loss": 5.6074, "loss/crossentropy": 2.3672031462192535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19832666590809822, "step": 6648 }, { "epoch": 0.5541666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.07629801432291666, "learning_rate": 4e-05, "loss": 5.1871, "loss/crossentropy": 2.238451138138771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17222367227077484, "step": 6650 }, { "epoch": 0.5543333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.08801676432291666, "learning_rate": 4e-05, "loss": 4.3575, "loss/crossentropy": 1.422397181391716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17442157492041588, "step": 6652 }, { "epoch": 0.5545, "grad_norm": 4.84375, "grad_norm_var": 0.05206705729166667, "learning_rate": 4e-05, "loss": 4.9704, "loss/crossentropy": 2.4562787413597107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20161322504281998, "step": 6654 }, { "epoch": 0.5546666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.057145182291666666, "learning_rate": 4e-05, "loss": 5.1275, "loss/crossentropy": 2.188231348991394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20168805122375488, "step": 6656 }, { "epoch": 0.5548333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.055078125, "learning_rate": 4e-05, "loss": 5.0781, "loss/crossentropy": 2.32028391957283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20598405227065086, "step": 6658 }, { "epoch": 0.555, "grad_norm": 5.34375, "grad_norm_var": 0.05712483723958333, "learning_rate": 4e-05, "loss": 4.9123, "loss/crossentropy": 1.8713824450969696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19735869392752647, "step": 6660 }, { "epoch": 0.5551666666666667, "grad_norm": 5.0, "grad_norm_var": 0.05240478515625, "learning_rate": 4e-05, "loss": 4.9659, "loss/crossentropy": 1.9562078714370728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18440138176083565, "step": 6662 }, { "epoch": 0.5553333333333333, "grad_norm": 4.75, "grad_norm_var": 0.054976399739583334, "learning_rate": 4e-05, "loss": 5.1777, "loss/crossentropy": 1.6304501295089722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18326781317591667, "step": 6664 }, { "epoch": 0.5555, "grad_norm": 4.75, "grad_norm_var": 0.0427734375, "learning_rate": 4e-05, "loss": 4.0937, "loss/crossentropy": 0.6879162788391113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14016366563737392, "step": 6666 }, { "epoch": 0.5556666666666666, "grad_norm": 4.625, "grad_norm_var": 0.04429931640625, "learning_rate": 4e-05, "loss": 4.9816, "loss/crossentropy": 1.8157347962260246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18013755604624748, "step": 6668 }, { "epoch": 0.5558333333333333, "grad_norm": 5.375, "grad_norm_var": 0.059098307291666666, "learning_rate": 4e-05, "loss": 5.1179, "loss/crossentropy": 2.056758761405945, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23633424937725067, "step": 6670 }, { "epoch": 0.556, "grad_norm": 4.25, "grad_norm_var": 0.08990885416666666, "learning_rate": 4e-05, "loss": 4.5153, "loss/crossentropy": 2.417732000350952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20401347428560257, "step": 6672 }, { "epoch": 0.5561666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.09514567057291666, "learning_rate": 4e-05, "loss": 5.3679, "loss/crossentropy": 1.909975491464138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16564680822193623, "step": 6674 }, { "epoch": 0.5563333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08058268229166667, "learning_rate": 4e-05, "loss": 5.439, "loss/crossentropy": 2.06892479211092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18954100832343102, "step": 6676 }, { "epoch": 0.5565, "grad_norm": 4.4375, "grad_norm_var": 0.09250895182291667, "learning_rate": 4e-05, "loss": 4.6291, "loss/crossentropy": 2.054016627371311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1676958091557026, "step": 6678 }, { "epoch": 0.5566666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.10558268229166666, "learning_rate": 4e-05, "loss": 4.9014, "loss/crossentropy": 2.0140282213687897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17690856754779816, "step": 6680 }, { "epoch": 0.5568333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.10507405598958333, "learning_rate": 4e-05, "loss": 4.8231, "loss/crossentropy": 2.3354055285453796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20604095980525017, "step": 6682 }, { "epoch": 0.557, "grad_norm": 5.1875, "grad_norm_var": 0.10859375, "learning_rate": 4e-05, "loss": 5.232, "loss/crossentropy": 2.084495782852173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18740139529109, "step": 6684 }, { "epoch": 0.5571666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.07734375, "learning_rate": 4e-05, "loss": 4.4036, "loss/crossentropy": 1.638416811823845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15969400480389595, "step": 6686 }, { "epoch": 0.5573333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.08196614583333334, "learning_rate": 4e-05, "loss": 4.7544, "loss/crossentropy": 0.9400245994329453, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12616140954196453, "step": 6688 }, { "epoch": 0.5575, "grad_norm": 4.875, "grad_norm_var": 0.07766927083333333, "learning_rate": 4e-05, "loss": 5.2425, "loss/crossentropy": 2.1369471848011017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22332263365387917, "step": 6690 }, { "epoch": 0.5576666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.07616780598958334, "learning_rate": 4e-05, "loss": 5.1849, "loss/crossentropy": 2.384535789489746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22261762246489525, "step": 6692 }, { "epoch": 0.5578333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.06249593098958333, "learning_rate": 4e-05, "loss": 4.6685, "loss/crossentropy": 1.8818542137742043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18431300669908524, "step": 6694 }, { "epoch": 0.558, "grad_norm": 4.5625, "grad_norm_var": 0.05487874348958333, "learning_rate": 4e-05, "loss": 4.7571, "loss/crossentropy": 0.9524921476840973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11215276457369328, "step": 6696 }, { "epoch": 0.5581666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.05974934895833333, "learning_rate": 4e-05, "loss": 4.6185, "loss/crossentropy": 1.4619218409061432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1464745569974184, "step": 6698 }, { "epoch": 0.5583333333333333, "grad_norm": 4.875, "grad_norm_var": 0.04837239583333333, "learning_rate": 4e-05, "loss": 4.8486, "loss/crossentropy": 2.051357090473175, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19964271783828735, "step": 6700 }, { "epoch": 0.5585, "grad_norm": 4.71875, "grad_norm_var": 0.051102701822916666, "learning_rate": 4e-05, "loss": 4.0687, "loss/crossentropy": 1.5994284898042679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16306093521416187, "step": 6702 }, { "epoch": 0.5586666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.029817708333333335, "learning_rate": 4e-05, "loss": 4.4709, "loss/crossentropy": 2.0101925432682037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20254620909690857, "step": 6704 }, { "epoch": 0.5588333333333333, "grad_norm": 4.625, "grad_norm_var": 0.029150390625, "learning_rate": 4e-05, "loss": 4.2, "loss/crossentropy": 1.8986879587173462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18273010104894638, "step": 6706 }, { "epoch": 0.559, "grad_norm": 5.28125, "grad_norm_var": 0.056929524739583334, "learning_rate": 4e-05, "loss": 4.9344, "loss/crossentropy": 1.509821131825447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933259814977646, "step": 6708 }, { "epoch": 0.5591666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.06949462890625, "learning_rate": 4e-05, "loss": 4.5876, "loss/crossentropy": 1.5221448838710785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1912355963140726, "step": 6710 }, { "epoch": 0.5593333333333333, "grad_norm": 4.875, "grad_norm_var": 0.06962483723958333, "learning_rate": 4e-05, "loss": 4.812, "loss/crossentropy": 2.3658363819122314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21914341673254967, "step": 6712 }, { "epoch": 0.5595, "grad_norm": 4.96875, "grad_norm_var": 0.0685546875, "learning_rate": 4e-05, "loss": 5.2591, "loss/crossentropy": 2.0233709514141083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21164459735155106, "step": 6714 }, { "epoch": 0.5596666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.10091145833333333, "learning_rate": 4e-05, "loss": 5.6136, "loss/crossentropy": 2.1298616975545883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18793256022036076, "step": 6716 }, { "epoch": 0.5598333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.10846354166666666, "learning_rate": 4e-05, "loss": 5.0254, "loss/crossentropy": 1.5498792678117752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15589362382888794, "step": 6718 }, { "epoch": 0.56, "grad_norm": 5.46875, "grad_norm_var": 0.12470296223958334, "learning_rate": 4e-05, "loss": 5.0242, "loss/crossentropy": 2.247347056865692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19609899818897247, "step": 6720 }, { "epoch": 0.5601666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.09837239583333333, "learning_rate": 4e-05, "loss": 5.0665, "loss/crossentropy": 1.7242010906338692, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1808859072625637, "step": 6722 }, { "epoch": 0.5603333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.06534830729166667, "learning_rate": 4e-05, "loss": 5.3355, "loss/crossentropy": 2.5626547932624817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20478646084666252, "step": 6724 }, { "epoch": 0.5605, "grad_norm": 9.75, "grad_norm_var": 1.4786417643229166, "learning_rate": 4e-05, "loss": 5.0271, "loss/crossentropy": 2.60895174741745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22574814409017563, "step": 6726 }, { "epoch": 0.5606666666666666, "grad_norm": 4.5625, "grad_norm_var": 1.514453125, "learning_rate": 4e-05, "loss": 4.5541, "loss/crossentropy": 1.6328820586204529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18884171918034554, "step": 6728 }, { "epoch": 0.5608333333333333, "grad_norm": 4.90625, "grad_norm_var": 1.5160115559895833, "learning_rate": 4e-05, "loss": 4.5824, "loss/crossentropy": 0.8445823714137077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11832733266055584, "step": 6730 }, { "epoch": 0.561, "grad_norm": 4.875, "grad_norm_var": 1.5098917643229166, "learning_rate": 4e-05, "loss": 4.8565, "loss/crossentropy": 2.287008821964264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20950447022914886, "step": 6732 }, { "epoch": 0.5611666666666667, "grad_norm": 4.65625, "grad_norm_var": 1.5292277018229166, "learning_rate": 4e-05, "loss": 4.6073, "loss/crossentropy": 2.079039454460144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.198956910520792, "step": 6734 }, { "epoch": 0.5613333333333334, "grad_norm": 4.90625, "grad_norm_var": 1.56275634765625, "learning_rate": 4e-05, "loss": 5.5193, "loss/crossentropy": 2.555041193962097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20681289210915565, "step": 6736 }, { "epoch": 0.5615, "grad_norm": 5.0625, "grad_norm_var": 1.5798828125, "learning_rate": 4e-05, "loss": 5.5108, "loss/crossentropy": 2.1208256036043167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17366488836705685, "step": 6738 }, { "epoch": 0.5616666666666666, "grad_norm": 4.75, "grad_norm_var": 1.5755167643229167, "learning_rate": 4e-05, "loss": 5.2524, "loss/crossentropy": 2.362846553325653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2502614036202431, "step": 6740 }, { "epoch": 0.5618333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.104931640625, "learning_rate": 4e-05, "loss": 4.9545, "loss/crossentropy": 2.3334818482398987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19263581559062004, "step": 6742 }, { "epoch": 0.562, "grad_norm": 5.125, "grad_norm_var": 0.09755452473958333, "learning_rate": 4e-05, "loss": 5.4646, "loss/crossentropy": 2.254566043615341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19652917608618736, "step": 6744 }, { "epoch": 0.5621666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.10774739583333333, "learning_rate": 4e-05, "loss": 4.4787, "loss/crossentropy": 1.0434362962841988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.122630275785923, "step": 6746 }, { "epoch": 0.5623333333333334, "grad_norm": 4.875, "grad_norm_var": 0.10755208333333334, "learning_rate": 4e-05, "loss": 5.2754, "loss/crossentropy": 2.429089665412903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22747541218996048, "step": 6748 }, { "epoch": 0.5625, "grad_norm": 4.40625, "grad_norm_var": 0.11708577473958333, "learning_rate": 4e-05, "loss": 4.9665, "loss/crossentropy": 1.912009358406067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17672326415777206, "step": 6750 }, { "epoch": 0.5626666666666666, "grad_norm": 4.75, "grad_norm_var": 0.06705322265625, "learning_rate": 4e-05, "loss": 4.5696, "loss/crossentropy": 1.6353125348687172, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.179355189204216, "step": 6752 }, { "epoch": 0.5628333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.06363525390625, "learning_rate": 4e-05, "loss": 5.0427, "loss/crossentropy": 1.8525151684880257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19017129763960838, "step": 6754 }, { "epoch": 0.563, "grad_norm": 5.0625, "grad_norm_var": 0.07001546223958334, "learning_rate": 4e-05, "loss": 4.8502, "loss/crossentropy": 2.0079415440559387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22154979780316353, "step": 6756 }, { "epoch": 0.5631666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.07353108723958333, "learning_rate": 4e-05, "loss": 5.2495, "loss/crossentropy": 2.432105541229248, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2078854702413082, "step": 6758 }, { "epoch": 0.5633333333333334, "grad_norm": 5.125, "grad_norm_var": 0.07310791015625, "learning_rate": 4e-05, "loss": 4.8135, "loss/crossentropy": 2.318072497844696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19287996366620064, "step": 6760 }, { "epoch": 0.5635, "grad_norm": 5.03125, "grad_norm_var": 0.06873372395833334, "learning_rate": 4e-05, "loss": 4.9044, "loss/crossentropy": 2.040756940841675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.234269879758358, "step": 6762 }, { "epoch": 0.5636666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.31116129557291666, "learning_rate": 4e-05, "loss": 4.9236, "loss/crossentropy": 2.3868152499198914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1909792795777321, "step": 6764 }, { "epoch": 0.5638333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.30419514973958334, "learning_rate": 4e-05, "loss": 4.947, "loss/crossentropy": 2.2354209423065186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2142213024199009, "step": 6766 }, { "epoch": 0.564, "grad_norm": 5.125, "grad_norm_var": 0.29257405598958336, "learning_rate": 4e-05, "loss": 5.5168, "loss/crossentropy": 2.0357046499848366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17524536699056625, "step": 6768 }, { "epoch": 0.5641666666666667, "grad_norm": 5.0, "grad_norm_var": 0.29412434895833334, "learning_rate": 4e-05, "loss": 4.8833, "loss/crossentropy": 1.6228312253952026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1723469439893961, "step": 6770 }, { "epoch": 0.5643333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.2816243489583333, "learning_rate": 4e-05, "loss": 5.1799, "loss/crossentropy": 1.7683988213539124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16633950546383858, "step": 6772 }, { "epoch": 0.5645, "grad_norm": 5.53125, "grad_norm_var": 0.29003499348958334, "learning_rate": 4e-05, "loss": 4.7752, "loss/crossentropy": 1.9569614827632904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17459377273917198, "step": 6774 }, { "epoch": 0.5646666666666667, "grad_norm": 4.625, "grad_norm_var": 0.3126953125, "learning_rate": 4e-05, "loss": 4.7691, "loss/crossentropy": 1.7250538617372513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17075425572693348, "step": 6776 }, { "epoch": 0.5648333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.30500895182291665, "learning_rate": 4e-05, "loss": 5.0686, "loss/crossentropy": 2.08631694316864, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18906661123037338, "step": 6778 }, { "epoch": 0.565, "grad_norm": 4.84375, "grad_norm_var": 0.08043212890625, "learning_rate": 4e-05, "loss": 5.6042, "loss/crossentropy": 2.473922371864319, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215004812926054, "step": 6780 }, { "epoch": 0.5651666666666667, "grad_norm": 5.0, "grad_norm_var": 0.06443684895833333, "learning_rate": 4e-05, "loss": 5.2425, "loss/crossentropy": 2.2826786637306213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24417144805192947, "step": 6782 }, { "epoch": 0.5653333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.06868489583333333, "learning_rate": 4e-05, "loss": 4.7002, "loss/crossentropy": 2.471294343471527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2048209197819233, "step": 6784 }, { "epoch": 0.5655, "grad_norm": 5.09375, "grad_norm_var": 0.06669514973958333, "learning_rate": 4e-05, "loss": 5.3737, "loss/crossentropy": 2.359356611967087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23297657072544098, "step": 6786 }, { "epoch": 0.5656666666666667, "grad_norm": 5.0, "grad_norm_var": 0.057145182291666666, "learning_rate": 4e-05, "loss": 4.8805, "loss/crossentropy": 1.9158901870250702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1967592779546976, "step": 6788 }, { "epoch": 0.5658333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.04816080729166667, "learning_rate": 4e-05, "loss": 5.2392, "loss/crossentropy": 1.9844098091125488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17694342508912086, "step": 6790 }, { "epoch": 0.566, "grad_norm": 4.84375, "grad_norm_var": 0.05026041666666667, "learning_rate": 4e-05, "loss": 4.4574, "loss/crossentropy": 1.1681826636195183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12684665992856026, "step": 6792 }, { "epoch": 0.5661666666666667, "grad_norm": 5.65625, "grad_norm_var": 0.086181640625, "learning_rate": 4e-05, "loss": 5.2646, "loss/crossentropy": 1.9428167939186096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1700345128774643, "step": 6794 }, { "epoch": 0.5663333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.08409830729166666, "learning_rate": 4e-05, "loss": 5.1086, "loss/crossentropy": 1.7468329519033432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17568138800561428, "step": 6796 }, { "epoch": 0.5665, "grad_norm": 5.03125, "grad_norm_var": 0.086572265625, "learning_rate": 4e-05, "loss": 4.9981, "loss/crossentropy": 1.8271580636501312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1846771389245987, "step": 6798 }, { "epoch": 0.5666666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.089306640625, "learning_rate": 4e-05, "loss": 4.9397, "loss/crossentropy": 1.8120107203722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16303765773773193, "step": 6800 }, { "epoch": 0.5668333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.114306640625, "learning_rate": 4e-05, "loss": 5.0096, "loss/crossentropy": 2.0087318643927574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18550613150000572, "step": 6802 }, { "epoch": 0.567, "grad_norm": 4.90625, "grad_norm_var": 0.11334228515625, "learning_rate": 4e-05, "loss": 4.6148, "loss/crossentropy": 2.5129515528678894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20342765003442764, "step": 6804 }, { "epoch": 0.5671666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.1056640625, "learning_rate": 4e-05, "loss": 4.8834, "loss/crossentropy": 2.082836002111435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1934790275990963, "step": 6806 }, { "epoch": 0.5673333333333334, "grad_norm": 4.875, "grad_norm_var": 0.08605143229166666, "learning_rate": 4e-05, "loss": 4.6374, "loss/crossentropy": 2.0405823960900307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18986310623586178, "step": 6808 }, { "epoch": 0.5675, "grad_norm": 5.0625, "grad_norm_var": 0.04537353515625, "learning_rate": 4e-05, "loss": 5.0763, "loss/crossentropy": 2.2310905158519745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792350336909294, "step": 6810 }, { "epoch": 0.5676666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.04186197916666667, "learning_rate": 4e-05, "loss": 4.9157, "loss/crossentropy": 2.1328996419906616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22394980490207672, "step": 6812 }, { "epoch": 0.5678333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.04071858723958333, "learning_rate": 4e-05, "loss": 4.3477, "loss/crossentropy": 1.644135631620884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20680170506238937, "step": 6814 }, { "epoch": 0.568, "grad_norm": 4.84375, "grad_norm_var": 0.02926025390625, "learning_rate": 4e-05, "loss": 4.8556, "loss/crossentropy": 2.437521994113922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220199353992939, "step": 6816 }, { "epoch": 0.5681666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.0193359375, "learning_rate": 4e-05, "loss": 4.9518, "loss/crossentropy": 2.237753540277481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20501817017793655, "step": 6818 }, { "epoch": 0.5683333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.022135416666666668, "learning_rate": 4e-05, "loss": 4.4411, "loss/crossentropy": 1.9096024632453918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19846992194652557, "step": 6820 }, { "epoch": 0.5685, "grad_norm": 4.6875, "grad_norm_var": 0.028934733072916666, "learning_rate": 4e-05, "loss": 4.6075, "loss/crossentropy": 2.0617934688925743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20183325186371803, "step": 6822 }, { "epoch": 0.5686666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.03420817057291667, "learning_rate": 4e-05, "loss": 5.1295, "loss/crossentropy": 1.7742072641849518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17162226885557175, "step": 6824 }, { "epoch": 0.5688333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.031245930989583334, "learning_rate": 4e-05, "loss": 4.8558, "loss/crossentropy": 2.2291614413261414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22294742614030838, "step": 6826 }, { "epoch": 0.569, "grad_norm": 4.71875, "grad_norm_var": 0.031083170572916666, "learning_rate": 4e-05, "loss": 5.1684, "loss/crossentropy": 2.303519546985626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20848600566387177, "step": 6828 }, { "epoch": 0.5691666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.03515218098958333, "learning_rate": 4e-05, "loss": 5.1765, "loss/crossentropy": 1.977641612291336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18245596811175346, "step": 6830 }, { "epoch": 0.5693333333333334, "grad_norm": 5.78125, "grad_norm_var": 0.10136311848958333, "learning_rate": 4e-05, "loss": 4.9752, "loss/crossentropy": 2.1543014645576477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22348742187023163, "step": 6832 }, { "epoch": 0.5695, "grad_norm": 4.75, "grad_norm_var": 0.3146769205729167, "learning_rate": 4e-05, "loss": 4.4685, "loss/crossentropy": 1.5171415954828262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1520980577915907, "step": 6834 }, { "epoch": 0.5696666666666667, "grad_norm": 5.125, "grad_norm_var": 0.3045572916666667, "learning_rate": 4e-05, "loss": 5.0365, "loss/crossentropy": 1.4744467735290527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1628479491919279, "step": 6836 }, { "epoch": 0.5698333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.278515625, "learning_rate": 4e-05, "loss": 5.1384, "loss/crossentropy": 2.248887836933136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23819807171821594, "step": 6838 }, { "epoch": 0.57, "grad_norm": 4.6875, "grad_norm_var": 0.274853515625, "learning_rate": 4e-05, "loss": 5.007, "loss/crossentropy": 1.9553302228450775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953351702541113, "step": 6840 }, { "epoch": 0.5701666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.27421875, "learning_rate": 4e-05, "loss": 4.758, "loss/crossentropy": 1.7406494319438934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21699757128953934, "step": 6842 }, { "epoch": 0.5703333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.276416015625, "learning_rate": 4e-05, "loss": 4.3834, "loss/crossentropy": 1.7711199223995209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848914809525013, "step": 6844 }, { "epoch": 0.5705, "grad_norm": 4.53125, "grad_norm_var": 0.2936482747395833, "learning_rate": 4e-05, "loss": 4.9482, "loss/crossentropy": 2.113058567047119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19523290544748306, "step": 6846 }, { "epoch": 0.5706666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.25104166666666666, "learning_rate": 4e-05, "loss": 4.7867, "loss/crossentropy": 1.378658078610897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14851684123277664, "step": 6848 }, { "epoch": 0.5708333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.03561197916666667, "learning_rate": 4e-05, "loss": 5.1184, "loss/crossentropy": 2.6506794095039368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21903324127197266, "step": 6850 }, { "epoch": 0.571, "grad_norm": 5.0, "grad_norm_var": 0.03209228515625, "learning_rate": 4e-05, "loss": 4.8231, "loss/crossentropy": 1.9687937498092651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869981847703457, "step": 6852 }, { "epoch": 0.5711666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.03723551432291667, "learning_rate": 4e-05, "loss": 4.8397, "loss/crossentropy": 1.3870623856782913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16671552881598473, "step": 6854 }, { "epoch": 0.5713333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.040478515625, "learning_rate": 4e-05, "loss": 4.7197, "loss/crossentropy": 1.9801999479532242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19567257165908813, "step": 6856 }, { "epoch": 0.5715, "grad_norm": 4.5, "grad_norm_var": 0.048421223958333336, "learning_rate": 4e-05, "loss": 5.1194, "loss/crossentropy": 1.1472266912460327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11946773529052734, "step": 6858 }, { "epoch": 0.5716666666666667, "grad_norm": 4.75, "grad_norm_var": 0.048563639322916664, "learning_rate": 4e-05, "loss": 5.605, "loss/crossentropy": 2.2561517730355263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16934522055089474, "step": 6860 }, { "epoch": 0.5718333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.04241129557291667, "learning_rate": 4e-05, "loss": 5.3357, "loss/crossentropy": 2.355598211288452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21308866888284683, "step": 6862 }, { "epoch": 0.572, "grad_norm": 4.5, "grad_norm_var": 0.04537760416666667, "learning_rate": 4e-05, "loss": 5.0089, "loss/crossentropy": 2.11463263630867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2152191400527954, "step": 6864 }, { "epoch": 0.5721666666666667, "grad_norm": 5.09375, "grad_norm_var": 1.681494140625, "learning_rate": 4e-05, "loss": 4.8888, "loss/crossentropy": 1.9020079374313354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944811660796404, "step": 6866 }, { "epoch": 0.5723333333333334, "grad_norm": 4.78125, "grad_norm_var": 1.6887003580729167, "learning_rate": 4e-05, "loss": 4.6484, "loss/crossentropy": 1.2562482431530952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15086103230714798, "step": 6868 }, { "epoch": 0.5725, "grad_norm": 4.75, "grad_norm_var": 1.6973592122395833, "learning_rate": 4e-05, "loss": 4.9415, "loss/crossentropy": 2.1037851870059967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20293112844228745, "step": 6870 }, { "epoch": 0.5726666666666667, "grad_norm": 4.90625, "grad_norm_var": 1.678369140625, "learning_rate": 4e-05, "loss": 5.1511, "loss/crossentropy": 2.3142004013061523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23257246613502502, "step": 6872 }, { "epoch": 0.5728333333333333, "grad_norm": 4.65625, "grad_norm_var": 1.6506144205729167, "learning_rate": 4e-05, "loss": 4.994, "loss/crossentropy": 1.8683245033025742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040301226079464, "step": 6874 }, { "epoch": 0.573, "grad_norm": 4.53125, "grad_norm_var": 1.6702962239583334, "learning_rate": 4e-05, "loss": 4.4974, "loss/crossentropy": 2.2722477316856384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20786982402205467, "step": 6876 }, { "epoch": 0.5731666666666667, "grad_norm": 5.0, "grad_norm_var": 1.6706868489583333, "learning_rate": 4e-05, "loss": 5.012, "loss/crossentropy": 2.5765005946159363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2184704802930355, "step": 6878 }, { "epoch": 0.5733333333333334, "grad_norm": 4.75, "grad_norm_var": 1.642041015625, "learning_rate": 4e-05, "loss": 5.3578, "loss/crossentropy": 2.108662247657776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19640013948082924, "step": 6880 }, { "epoch": 0.5735, "grad_norm": 4.84375, "grad_norm_var": 0.027278645833333334, "learning_rate": 4e-05, "loss": 5.3062, "loss/crossentropy": 2.601102828979492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24048221856355667, "step": 6882 }, { "epoch": 0.5736666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.03229166666666667, "learning_rate": 4e-05, "loss": 5.0257, "loss/crossentropy": 0.8844295963644981, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1019323579967022, "step": 6884 }, { "epoch": 0.5738333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.03280843098958333, "learning_rate": 4e-05, "loss": 4.7654, "loss/crossentropy": 2.2570102512836456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22596049681305885, "step": 6886 }, { "epoch": 0.574, "grad_norm": 5.28125, "grad_norm_var": 0.04843343098958333, "learning_rate": 4e-05, "loss": 5.5993, "loss/crossentropy": 2.4165295362472534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2355419434607029, "step": 6888 }, { "epoch": 0.5741666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.0533203125, "learning_rate": 4e-05, "loss": 4.3093, "loss/crossentropy": 1.5194483771920204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1469650249928236, "step": 6890 }, { "epoch": 0.5743333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.04732666015625, "learning_rate": 4e-05, "loss": 5.1506, "loss/crossentropy": 2.3721578419208527, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1981596052646637, "step": 6892 }, { "epoch": 0.5745, "grad_norm": 4.875, "grad_norm_var": 0.047526041666666664, "learning_rate": 4e-05, "loss": 5.0537, "loss/crossentropy": 2.3216958045959473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19841821864247322, "step": 6894 }, { "epoch": 0.5746666666666667, "grad_norm": 4.5, "grad_norm_var": 0.06116129557291667, "learning_rate": 4e-05, "loss": 4.1328, "loss/crossentropy": 1.9104135930538177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16793107241392136, "step": 6896 }, { "epoch": 0.5748333333333333, "grad_norm": 5.125, "grad_norm_var": 0.091650390625, "learning_rate": 4e-05, "loss": 4.1164, "loss/crossentropy": 0.9877064228057861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12135954014956951, "step": 6898 }, { "epoch": 0.575, "grad_norm": 4.75, "grad_norm_var": 0.08938802083333333, "learning_rate": 4e-05, "loss": 4.8793, "loss/crossentropy": 2.3174608945846558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19842035695910454, "step": 6900 }, { "epoch": 0.5751666666666667, "grad_norm": 5.25, "grad_norm_var": 0.09859619140625, "learning_rate": 4e-05, "loss": 5.2412, "loss/crossentropy": 2.267774134874344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262442149221897, "step": 6902 }, { "epoch": 0.5753333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.0853515625, "learning_rate": 4e-05, "loss": 5.2218, "loss/crossentropy": 1.7064669355750084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17715736478567123, "step": 6904 }, { "epoch": 0.5755, "grad_norm": 5.1875, "grad_norm_var": 0.084619140625, "learning_rate": 4e-05, "loss": 5.1483, "loss/crossentropy": 2.3594651222229004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23974528908729553, "step": 6906 }, { "epoch": 0.5756666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.09683837890625, "learning_rate": 4e-05, "loss": 5.252, "loss/crossentropy": 2.312884032726288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21780531853437424, "step": 6908 }, { "epoch": 0.5758333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.10323893229166667, "learning_rate": 4e-05, "loss": 4.6783, "loss/crossentropy": 1.9019780158996582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16511645540595055, "step": 6910 }, { "epoch": 0.576, "grad_norm": 5.46875, "grad_norm_var": 0.09452718098958333, "learning_rate": 4e-05, "loss": 4.7793, "loss/crossentropy": 2.478718101978302, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21854493767023087, "step": 6912 }, { "epoch": 0.5761666666666667, "grad_norm": 5.25, "grad_norm_var": 0.05383707682291667, "learning_rate": 4e-05, "loss": 4.687, "loss/crossentropy": 1.895049013197422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19955835677683353, "step": 6914 }, { "epoch": 0.5763333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.06718343098958333, "learning_rate": 4e-05, "loss": 4.9585, "loss/crossentropy": 1.823567271232605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18378795869648457, "step": 6916 }, { "epoch": 0.5765, "grad_norm": 9.125, "grad_norm_var": 1.12574462890625, "learning_rate": 4e-05, "loss": 5.0141, "loss/crossentropy": 1.417652204632759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1524863112717867, "step": 6918 }, { "epoch": 0.5766666666666667, "grad_norm": 4.4375, "grad_norm_var": 1.175634765625, "learning_rate": 4e-05, "loss": 4.7746, "loss/crossentropy": 1.9156860336661339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19187748990952969, "step": 6920 }, { "epoch": 0.5768333333333333, "grad_norm": 5.03125, "grad_norm_var": 1.1823527018229167, "learning_rate": 4e-05, "loss": 4.6849, "loss/crossentropy": 1.7854193970561028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17549366503953934, "step": 6922 }, { "epoch": 0.577, "grad_norm": 4.375, "grad_norm_var": 1.2242024739583333, "learning_rate": 4e-05, "loss": 4.8228, "loss/crossentropy": 1.835989773273468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17153911851346493, "step": 6924 }, { "epoch": 0.5771666666666667, "grad_norm": 6.65625, "grad_norm_var": 1.3544230143229166, "learning_rate": 4e-05, "loss": 5.2973, "loss/crossentropy": 2.4750488996505737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117002233862877, "step": 6926 }, { "epoch": 0.5773333333333334, "grad_norm": 4.625, "grad_norm_var": 1.3676717122395834, "learning_rate": 4e-05, "loss": 5.4856, "loss/crossentropy": 2.3786118626594543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20669499039649963, "step": 6928 }, { "epoch": 0.5775, "grad_norm": 4.75, "grad_norm_var": 1.3824503580729166, "learning_rate": 4e-05, "loss": 4.343, "loss/crossentropy": 1.9303578808903694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886876206845045, "step": 6930 }, { "epoch": 0.5776666666666667, "grad_norm": 5.0625, "grad_norm_var": 1.350634765625, "learning_rate": 4e-05, "loss": 4.7448, "loss/crossentropy": 2.3733231723308563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21326325461268425, "step": 6932 }, { "epoch": 0.5778333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.2679646809895833, "learning_rate": 4e-05, "loss": 4.925, "loss/crossentropy": 2.333590805530548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2030404508113861, "step": 6934 }, { "epoch": 0.578, "grad_norm": 4.71875, "grad_norm_var": 0.24911702473958333, "learning_rate": 4e-05, "loss": 4.8514, "loss/crossentropy": 2.50896617770195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22089635208249092, "step": 6936 }, { "epoch": 0.5781666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.2536417643229167, "learning_rate": 4e-05, "loss": 4.5747, "loss/crossentropy": 1.3991278186440468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19910966977477074, "step": 6938 }, { "epoch": 0.5783333333333334, "grad_norm": 5.625, "grad_norm_var": 0.243603515625, "learning_rate": 4e-05, "loss": 4.7782, "loss/crossentropy": 1.2879233956336975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14194754138588905, "step": 6940 }, { "epoch": 0.5785, "grad_norm": 4.84375, "grad_norm_var": 0.07394205729166667, "learning_rate": 4e-05, "loss": 4.8375, "loss/crossentropy": 2.180011212825775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19299915060400963, "step": 6942 }, { "epoch": 0.5786666666666667, "grad_norm": 4.5, "grad_norm_var": 0.08391927083333334, "learning_rate": 4e-05, "loss": 5.0074, "loss/crossentropy": 1.9094977304339409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777942907065153, "step": 6944 }, { "epoch": 0.5788333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.09099934895833334, "learning_rate": 4e-05, "loss": 4.9617, "loss/crossentropy": 2.395069420337677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748833194375038, "step": 6946 }, { "epoch": 0.579, "grad_norm": 5.0625, "grad_norm_var": 0.08644205729166667, "learning_rate": 4e-05, "loss": 5.1167, "loss/crossentropy": 2.3528851866722107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21444562450051308, "step": 6948 }, { "epoch": 0.5791666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.08899739583333334, "learning_rate": 4e-05, "loss": 5.3257, "loss/crossentropy": 1.9694509357213974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1694854572415352, "step": 6950 }, { "epoch": 0.5793333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.08683268229166667, "learning_rate": 4e-05, "loss": 5.4056, "loss/crossentropy": 1.936231642961502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19177152961492538, "step": 6952 }, { "epoch": 0.5795, "grad_norm": 4.6875, "grad_norm_var": 0.083837890625, "learning_rate": 4e-05, "loss": 5.1056, "loss/crossentropy": 2.1117332875728607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18018162995576859, "step": 6954 }, { "epoch": 0.5796666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.06443684895833333, "learning_rate": 4e-05, "loss": 5.1666, "loss/crossentropy": 2.5744773745536804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2220405377447605, "step": 6956 }, { "epoch": 0.5798333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.06927083333333334, "learning_rate": 4e-05, "loss": 4.5248, "loss/crossentropy": 2.1000839695334435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16782992519438267, "step": 6958 }, { "epoch": 0.58, "grad_norm": 4.90625, "grad_norm_var": 0.061442057291666664, "learning_rate": 4e-05, "loss": 5.0504, "loss/crossentropy": 2.1625142991542816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22695934772491455, "step": 6960 }, { "epoch": 0.5801666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.060546875, "learning_rate": 4e-05, "loss": 5.3221, "loss/crossentropy": 2.1675389409065247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19221728295087814, "step": 6962 }, { "epoch": 0.5803333333333334, "grad_norm": 4.5, "grad_norm_var": 0.07320556640625, "learning_rate": 4e-05, "loss": 4.8257, "loss/crossentropy": 1.7454765737056732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795752108097076, "step": 6964 }, { "epoch": 0.5805, "grad_norm": 4.53125, "grad_norm_var": 0.07224934895833333, "learning_rate": 4e-05, "loss": 5.3364, "loss/crossentropy": 2.321167379617691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211841382086277, "step": 6966 }, { "epoch": 0.5806666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.07450764973958333, "learning_rate": 4e-05, "loss": 4.9915, "loss/crossentropy": 1.9849184900522232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19291441701352596, "step": 6968 }, { "epoch": 0.5808333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.07317301432291666, "learning_rate": 4e-05, "loss": 4.9854, "loss/crossentropy": 1.7237927541136742, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16498488560318947, "step": 6970 }, { "epoch": 0.581, "grad_norm": 5.0, "grad_norm_var": 0.05777587890625, "learning_rate": 4e-05, "loss": 4.9658, "loss/crossentropy": 2.640030264854431, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21197672188282013, "step": 6972 }, { "epoch": 0.5811666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.0595703125, "learning_rate": 4e-05, "loss": 4.6484, "loss/crossentropy": 1.8918295353651047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1703386828303337, "step": 6974 }, { "epoch": 0.5813333333333334, "grad_norm": 4.75, "grad_norm_var": 0.061848958333333336, "learning_rate": 4e-05, "loss": 4.6603, "loss/crossentropy": 1.0444296523928642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16650191694498062, "step": 6976 }, { "epoch": 0.5815, "grad_norm": 4.59375, "grad_norm_var": 0.058333333333333334, "learning_rate": 4e-05, "loss": 5.0858, "loss/crossentropy": 2.292292296886444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22024738788604736, "step": 6978 }, { "epoch": 0.5816666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.059619140625, "learning_rate": 4e-05, "loss": 5.0752, "loss/crossentropy": 2.2760500013828278, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20995662361383438, "step": 6980 }, { "epoch": 0.5818333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.046122233072916664, "learning_rate": 4e-05, "loss": 5.1017, "loss/crossentropy": 2.164345234632492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2195327877998352, "step": 6982 }, { "epoch": 0.582, "grad_norm": 5.1875, "grad_norm_var": 0.051025390625, "learning_rate": 4e-05, "loss": 4.9941, "loss/crossentropy": 1.9390491545200348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.251364566385746, "step": 6984 }, { "epoch": 0.5821666666666667, "grad_norm": 5.5625, "grad_norm_var": 0.07655843098958333, "learning_rate": 4e-05, "loss": 5.4158, "loss/crossentropy": 2.521660327911377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20530570298433304, "step": 6986 }, { "epoch": 0.5823333333333334, "grad_norm": 4.28125, "grad_norm_var": 0.10627848307291667, "learning_rate": 4e-05, "loss": 4.8529, "loss/crossentropy": 2.434011995792389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2161153182387352, "step": 6988 }, { "epoch": 0.5825, "grad_norm": 4.9375, "grad_norm_var": 0.10323893229166667, "learning_rate": 4e-05, "loss": 5.091, "loss/crossentropy": 2.19405135512352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19611788541078568, "step": 6990 }, { "epoch": 0.5826666666666667, "grad_norm": 4.625, "grad_norm_var": 0.1046875, "learning_rate": 4e-05, "loss": 4.9027, "loss/crossentropy": 2.0773863047361374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.199423398822546, "step": 6992 }, { "epoch": 0.5828333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.116015625, "learning_rate": 4e-05, "loss": 5.4493, "loss/crossentropy": 2.2322590053081512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22856701165437698, "step": 6994 }, { "epoch": 0.583, "grad_norm": 4.875, "grad_norm_var": 0.11282145182291667, "learning_rate": 4e-05, "loss": 5.4103, "loss/crossentropy": 2.223317414522171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22666733711957932, "step": 6996 }, { "epoch": 0.5831666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.14628499348958332, "learning_rate": 4e-05, "loss": 5.0099, "loss/crossentropy": 2.306236118078232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21099332347512245, "step": 6998 }, { "epoch": 0.5833333333333334, "grad_norm": 4.875, "grad_norm_var": 0.16679280598958332, "learning_rate": 4e-05, "loss": 4.9629, "loss/crossentropy": 2.1312642991542816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20312974229454994, "step": 7000 }, { "epoch": 0.5835, "grad_norm": 5.125, "grad_norm_var": 0.14654947916666666, "learning_rate": 4e-05, "loss": 5.2156, "loss/crossentropy": 2.47050142288208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21174683049321175, "step": 7002 }, { "epoch": 0.5836666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.11672770182291667, "learning_rate": 4e-05, "loss": 4.2701, "loss/crossentropy": 2.2160302698612213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22286521643400192, "step": 7004 }, { "epoch": 0.5838333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.11373697916666667, "learning_rate": 4e-05, "loss": 4.5535, "loss/crossentropy": 1.4888541474938393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15817414224147797, "step": 7006 }, { "epoch": 0.584, "grad_norm": 5.65625, "grad_norm_var": 0.14075520833333333, "learning_rate": 4e-05, "loss": 4.574, "loss/crossentropy": 2.411957621574402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22817474603652954, "step": 7008 }, { "epoch": 0.5841666666666666, "grad_norm": 4.75, "grad_norm_var": 0.14016927083333333, "learning_rate": 4e-05, "loss": 4.7713, "loss/crossentropy": 1.4711432233452797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15142634697258472, "step": 7010 }, { "epoch": 0.5843333333333334, "grad_norm": 4.28125, "grad_norm_var": 0.16521809895833334, "learning_rate": 4e-05, "loss": 4.5345, "loss/crossentropy": 1.834930658340454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19916201382875443, "step": 7012 }, { "epoch": 0.5845, "grad_norm": 4.71875, "grad_norm_var": 0.13837483723958333, "learning_rate": 4e-05, "loss": 4.5504, "loss/crossentropy": 1.7337545081973076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18489226698875427, "step": 7014 }, { "epoch": 0.5846666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.11373291015625, "learning_rate": 4e-05, "loss": 4.3701, "loss/crossentropy": 1.3228175267577171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15250758454203606, "step": 7016 }, { "epoch": 0.5848333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.117041015625, "learning_rate": 4e-05, "loss": 4.8881, "loss/crossentropy": 2.3420262932777405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1912631243467331, "step": 7018 }, { "epoch": 0.585, "grad_norm": 6.03125, "grad_norm_var": 0.22639567057291668, "learning_rate": 4e-05, "loss": 5.0456, "loss/crossentropy": 1.3368181511759758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14204655215144157, "step": 7020 }, { "epoch": 0.5851666666666666, "grad_norm": 5.28125, "grad_norm_var": 0.24104410807291668, "learning_rate": 4e-05, "loss": 4.6956, "loss/crossentropy": 1.5645621120929718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1582186594605446, "step": 7022 }, { "epoch": 0.5853333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.2123046875, "learning_rate": 4e-05, "loss": 5.308, "loss/crossentropy": 2.4173710644245148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20801213383674622, "step": 7024 }, { "epoch": 0.5855, "grad_norm": 4.875, "grad_norm_var": 0.21207275390625, "learning_rate": 4e-05, "loss": 5.1631, "loss/crossentropy": 2.3601708114147186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22110438346862793, "step": 7026 }, { "epoch": 0.5856666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.19423421223958334, "learning_rate": 4e-05, "loss": 5.0976, "loss/crossentropy": 2.0202344954013824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23262187093496323, "step": 7028 }, { "epoch": 0.5858333333333333, "grad_norm": 5.0, "grad_norm_var": 0.426025390625, "learning_rate": 4e-05, "loss": 5.1829, "loss/crossentropy": 2.080652594566345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20888086408376694, "step": 7030 }, { "epoch": 0.586, "grad_norm": 4.84375, "grad_norm_var": 0.3563435872395833, "learning_rate": 4e-05, "loss": 5.0934, "loss/crossentropy": 2.4877448081970215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21074169129133224, "step": 7032 }, { "epoch": 0.5861666666666666, "grad_norm": 4.875, "grad_norm_var": 0.34060872395833336, "learning_rate": 4e-05, "loss": 4.7548, "loss/crossentropy": 1.833594799041748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944413259625435, "step": 7034 }, { "epoch": 0.5863333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.31399739583333336, "learning_rate": 4e-05, "loss": 5.1317, "loss/crossentropy": 1.8810269013047218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18199764378368855, "step": 7036 }, { "epoch": 0.5865, "grad_norm": 4.8125, "grad_norm_var": 0.3049112955729167, "learning_rate": 4e-05, "loss": 5.0683, "loss/crossentropy": 2.4290638267993927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21032685041427612, "step": 7038 }, { "epoch": 0.5866666666666667, "grad_norm": 5.125, "grad_norm_var": 0.3036417643229167, "learning_rate": 4e-05, "loss": 5.0505, "loss/crossentropy": 2.0547506511211395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208097193390131, "step": 7040 }, { "epoch": 0.5868333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.32157796223958335, "learning_rate": 4e-05, "loss": 4.4601, "loss/crossentropy": 1.7334094047546387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20311187207698822, "step": 7042 }, { "epoch": 0.587, "grad_norm": 4.46875, "grad_norm_var": 0.3348592122395833, "learning_rate": 4e-05, "loss": 4.74, "loss/crossentropy": 2.338983803987503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23134483397006989, "step": 7044 }, { "epoch": 0.5871666666666666, "grad_norm": 4.625, "grad_norm_var": 0.026981608072916666, "learning_rate": 4e-05, "loss": 4.4947, "loss/crossentropy": 1.6076650097966194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.170147143304348, "step": 7046 }, { "epoch": 0.5873333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.044775390625, "learning_rate": 4e-05, "loss": 5.3415, "loss/crossentropy": 2.398401141166687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23924263939261436, "step": 7048 }, { "epoch": 0.5875, "grad_norm": 5.125, "grad_norm_var": 0.058915201822916666, "learning_rate": 4e-05, "loss": 5.33, "loss/crossentropy": 1.9874465465545654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18284693360328674, "step": 7050 }, { "epoch": 0.5876666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.06334228515625, "learning_rate": 4e-05, "loss": 4.3923, "loss/crossentropy": 2.0420145988464355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16938142478466034, "step": 7052 }, { "epoch": 0.5878333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.08411051432291666, "learning_rate": 4e-05, "loss": 4.8238, "loss/crossentropy": 2.0706692337989807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21137084066867828, "step": 7054 }, { "epoch": 0.588, "grad_norm": 4.78125, "grad_norm_var": 0.08052978515625, "learning_rate": 4e-05, "loss": 4.834, "loss/crossentropy": 2.1098215356469154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18781218118965626, "step": 7056 }, { "epoch": 0.5881666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.09280192057291667, "learning_rate": 4e-05, "loss": 4.623, "loss/crossentropy": 1.389645166695118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1635691300034523, "step": 7058 }, { "epoch": 0.5883333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.08860677083333333, "learning_rate": 4e-05, "loss": 4.7686, "loss/crossentropy": 2.2585472464561462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21468041092157364, "step": 7060 }, { "epoch": 0.5885, "grad_norm": 5.0, "grad_norm_var": 0.088916015625, "learning_rate": 4e-05, "loss": 5.1589, "loss/crossentropy": 2.4640525579452515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21823417022824287, "step": 7062 }, { "epoch": 0.5886666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.06848551432291666, "learning_rate": 4e-05, "loss": 4.6937, "loss/crossentropy": 2.095867395401001, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18419134989380836, "step": 7064 }, { "epoch": 0.5888333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.060546875, "learning_rate": 4e-05, "loss": 4.4197, "loss/crossentropy": 2.244805634021759, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19950323924422264, "step": 7066 }, { "epoch": 0.589, "grad_norm": 4.75, "grad_norm_var": 0.055078125, "learning_rate": 4e-05, "loss": 4.805, "loss/crossentropy": 2.1685686707496643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20743118226528168, "step": 7068 }, { "epoch": 0.5891666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.04694010416666667, "learning_rate": 4e-05, "loss": 4.5978, "loss/crossentropy": 1.2904389277100563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14947044663131237, "step": 7070 }, { "epoch": 0.5893333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.045182291666666666, "learning_rate": 4e-05, "loss": 5.1091, "loss/crossentropy": 1.944626122713089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1817072257399559, "step": 7072 }, { "epoch": 0.5895, "grad_norm": 4.53125, "grad_norm_var": 0.035400390625, "learning_rate": 4e-05, "loss": 4.663, "loss/crossentropy": 1.7342427596449852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18484355323016644, "step": 7074 }, { "epoch": 0.5896666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.03592122395833333, "learning_rate": 4e-05, "loss": 4.5442, "loss/crossentropy": 1.944994330406189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20828309655189514, "step": 7076 }, { "epoch": 0.5898333333333333, "grad_norm": 4.875, "grad_norm_var": 0.03284098307291667, "learning_rate": 4e-05, "loss": 5.1822, "loss/crossentropy": 1.6501160487532616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18159450963139534, "step": 7078 }, { "epoch": 0.59, "grad_norm": 4.8125, "grad_norm_var": 0.03658854166666667, "learning_rate": 4e-05, "loss": 4.8808, "loss/crossentropy": 2.1811063289642334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19801967963576317, "step": 7080 }, { "epoch": 0.5901666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.042801920572916666, "learning_rate": 4e-05, "loss": 4.81, "loss/crossentropy": 2.1836779415607452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19767899811267853, "step": 7082 }, { "epoch": 0.5903333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.04412434895833333, "learning_rate": 4e-05, "loss": 4.4882, "loss/crossentropy": 1.9349441826343536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839659884572029, "step": 7084 }, { "epoch": 0.5905, "grad_norm": 5.1875, "grad_norm_var": 0.06760660807291667, "learning_rate": 4e-05, "loss": 4.8502, "loss/crossentropy": 1.5366474315524101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1782370787113905, "step": 7086 }, { "epoch": 0.5906666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.06760660807291667, "learning_rate": 4e-05, "loss": 4.4517, "loss/crossentropy": 1.4672022983431816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14767078682780266, "step": 7088 }, { "epoch": 0.5908333333333333, "grad_norm": 5.375, "grad_norm_var": 0.08331705729166666, "learning_rate": 4e-05, "loss": 4.8765, "loss/crossentropy": 2.0351298972964287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845032162964344, "step": 7090 }, { "epoch": 0.591, "grad_norm": 5.03125, "grad_norm_var": 0.07682291666666667, "learning_rate": 4e-05, "loss": 5.2773, "loss/crossentropy": 1.6402394473552704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16715828701853752, "step": 7092 }, { "epoch": 0.5911666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.08498942057291667, "learning_rate": 4e-05, "loss": 4.7207, "loss/crossentropy": 2.1903760731220245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975751407444477, "step": 7094 }, { "epoch": 0.5913333333333334, "grad_norm": 4.875, "grad_norm_var": 0.06490478515625, "learning_rate": 4e-05, "loss": 5.4179, "loss/crossentropy": 2.4919445514678955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21300790458917618, "step": 7096 }, { "epoch": 0.5915, "grad_norm": 4.84375, "grad_norm_var": 0.06760660807291667, "learning_rate": 4e-05, "loss": 5.2518, "loss/crossentropy": 2.238806664943695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21941359341144562, "step": 7098 }, { "epoch": 0.5916666666666667, "grad_norm": 5.0, "grad_norm_var": 0.07082926432291667, "learning_rate": 4e-05, "loss": 5.3808, "loss/crossentropy": 2.211260676383972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18881267309188843, "step": 7100 }, { "epoch": 0.5918333333333333, "grad_norm": 6.03125, "grad_norm_var": 0.13489176432291666, "learning_rate": 4e-05, "loss": 4.523, "loss/crossentropy": 1.7687021493911743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20016569271683693, "step": 7102 }, { "epoch": 0.592, "grad_norm": 4.6875, "grad_norm_var": 0.13357747395833333, "learning_rate": 4e-05, "loss": 4.8254, "loss/crossentropy": 1.8385881558060646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18539466150105, "step": 7104 }, { "epoch": 0.5921666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.11302083333333333, "learning_rate": 4e-05, "loss": 5.0105, "loss/crossentropy": 1.5129987746477127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15814178064465523, "step": 7106 }, { "epoch": 0.5923333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.11754150390625, "learning_rate": 4e-05, "loss": 5.0845, "loss/crossentropy": 1.2784345149993896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1473633572459221, "step": 7108 }, { "epoch": 0.5925, "grad_norm": 4.65625, "grad_norm_var": 0.11926676432291666, "learning_rate": 4e-05, "loss": 5.1103, "loss/crossentropy": 2.039908640086651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18187290988862514, "step": 7110 }, { "epoch": 0.5926666666666667, "grad_norm": 5.125, "grad_norm_var": 0.11959228515625, "learning_rate": 4e-05, "loss": 4.4894, "loss/crossentropy": 1.7723973244428635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224184051156044, "step": 7112 }, { "epoch": 0.5928333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.1162109375, "learning_rate": 4e-05, "loss": 4.787, "loss/crossentropy": 1.9356326535344124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17493806034326553, "step": 7114 }, { "epoch": 0.593, "grad_norm": 4.75, "grad_norm_var": 0.13271077473958334, "learning_rate": 4e-05, "loss": 5.3488, "loss/crossentropy": 1.8205601423978806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18349266424775124, "step": 7116 }, { "epoch": 0.5931666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.058186848958333336, "learning_rate": 4e-05, "loss": 4.8379, "loss/crossentropy": 1.5122859254479408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1873751487582922, "step": 7118 }, { "epoch": 0.5933333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.05779622395833333, "learning_rate": 4e-05, "loss": 4.7082, "loss/crossentropy": 2.456811845302582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22585484385490417, "step": 7120 }, { "epoch": 0.5935, "grad_norm": 4.59375, "grad_norm_var": 0.06979166666666667, "learning_rate": 4e-05, "loss": 4.7246, "loss/crossentropy": 1.925706960260868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17225532606244087, "step": 7122 }, { "epoch": 0.5936666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.10154622395833333, "learning_rate": 4e-05, "loss": 4.2237, "loss/crossentropy": 1.8182199075818062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1815893966704607, "step": 7124 }, { "epoch": 0.5938333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.09735921223958334, "learning_rate": 4e-05, "loss": 4.6819, "loss/crossentropy": 1.501136690378189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14409414678812027, "step": 7126 }, { "epoch": 0.594, "grad_norm": 4.78125, "grad_norm_var": 0.09283854166666666, "learning_rate": 4e-05, "loss": 5.0641, "loss/crossentropy": 1.9566605687141418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1724204383790493, "step": 7128 }, { "epoch": 0.5941666666666666, "grad_norm": 5.0, "grad_norm_var": 0.08902587890625, "learning_rate": 4e-05, "loss": 4.3695, "loss/crossentropy": 1.9445571303367615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21243244409561157, "step": 7130 }, { "epoch": 0.5943333333333334, "grad_norm": 4.1875, "grad_norm_var": 0.09998372395833334, "learning_rate": 4e-05, "loss": 4.8659, "loss/crossentropy": 2.362126111984253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20181460306048393, "step": 7132 }, { "epoch": 0.5945, "grad_norm": 4.9375, "grad_norm_var": 0.11194254557291666, "learning_rate": 4e-05, "loss": 4.295, "loss/crossentropy": 1.7851531505584717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17282582819461823, "step": 7134 }, { "epoch": 0.5946666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.11448160807291667, "learning_rate": 4e-05, "loss": 4.6094, "loss/crossentropy": 1.4123722687363625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1530421730130911, "step": 7136 }, { "epoch": 0.5948333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.11262613932291667, "learning_rate": 4e-05, "loss": 4.9726, "loss/crossentropy": 1.876221090555191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224697545170784, "step": 7138 }, { "epoch": 0.595, "grad_norm": 4.65625, "grad_norm_var": 0.08733317057291666, "learning_rate": 4e-05, "loss": 5.2092, "loss/crossentropy": 1.2281717583537102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12058732472360134, "step": 7140 }, { "epoch": 0.5951666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.087109375, "learning_rate": 4e-05, "loss": 5.1486, "loss/crossentropy": 2.3574686646461487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21202539652585983, "step": 7142 }, { "epoch": 0.5953333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.08839518229166667, "learning_rate": 4e-05, "loss": 4.394, "loss/crossentropy": 1.3628652021288872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17282887548208237, "step": 7144 }, { "epoch": 0.5955, "grad_norm": 4.6875, "grad_norm_var": 0.1013671875, "learning_rate": 4e-05, "loss": 5.1276, "loss/crossentropy": 2.1821780800819397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21669187769293785, "step": 7146 }, { "epoch": 0.5956666666666667, "grad_norm": 5.34375, "grad_norm_var": 0.07962239583333333, "learning_rate": 4e-05, "loss": 4.6122, "loss/crossentropy": 2.189154863357544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20566852018237114, "step": 7148 }, { "epoch": 0.5958333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.09544270833333333, "learning_rate": 4e-05, "loss": 3.5778, "loss/crossentropy": 1.0040019303560257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1260563600808382, "step": 7150 }, { "epoch": 0.596, "grad_norm": 4.625, "grad_norm_var": 0.13378499348958334, "learning_rate": 4e-05, "loss": 4.8097, "loss/crossentropy": 2.3464534282684326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20970812812447548, "step": 7152 }, { "epoch": 0.5961666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.13209228515625, "learning_rate": 4e-05, "loss": 4.6268, "loss/crossentropy": 1.6716122701764107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19498688727617264, "step": 7154 }, { "epoch": 0.5963333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.13245035807291666, "learning_rate": 4e-05, "loss": 4.5956, "loss/crossentropy": 1.6933601424098015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20203623175621033, "step": 7156 }, { "epoch": 0.5965, "grad_norm": 5.21875, "grad_norm_var": 0.13417561848958334, "learning_rate": 4e-05, "loss": 5.1846, "loss/crossentropy": 2.599808931350708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22032839059829712, "step": 7158 }, { "epoch": 0.5966666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.1357421875, "learning_rate": 4e-05, "loss": 4.5418, "loss/crossentropy": 1.4756020605564117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14345270954072475, "step": 7160 }, { "epoch": 0.5968333333333333, "grad_norm": 4.875, "grad_norm_var": 0.12616780598958333, "learning_rate": 4e-05, "loss": 4.5142, "loss/crossentropy": 1.753265455365181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787324883043766, "step": 7162 }, { "epoch": 0.597, "grad_norm": 4.3125, "grad_norm_var": 0.12823893229166666, "learning_rate": 4e-05, "loss": 4.7166, "loss/crossentropy": 2.619646430015564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22686060145497322, "step": 7164 }, { "epoch": 0.5971666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.1130859375, "learning_rate": 4e-05, "loss": 4.1743, "loss/crossentropy": 1.353781297802925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14187632501125336, "step": 7166 }, { "epoch": 0.5973333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.08345947265625, "learning_rate": 4e-05, "loss": 4.5593, "loss/crossentropy": 1.8787141367793083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18714572489261627, "step": 7168 }, { "epoch": 0.5975, "grad_norm": 4.78125, "grad_norm_var": 0.07535400390625, "learning_rate": 4e-05, "loss": 4.8598, "loss/crossentropy": 2.3827298283576965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21606054157018661, "step": 7170 }, { "epoch": 0.5976666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.06848958333333334, "learning_rate": 4e-05, "loss": 4.9034, "loss/crossentropy": 2.169047087430954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20684335753321648, "step": 7172 }, { "epoch": 0.5978333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.05670572916666667, "learning_rate": 4e-05, "loss": 4.7007, "loss/crossentropy": 0.976897768676281, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15186763554811478, "step": 7174 }, { "epoch": 0.598, "grad_norm": 4.84375, "grad_norm_var": 0.05636393229166667, "learning_rate": 4e-05, "loss": 4.4089, "loss/crossentropy": 1.524364024400711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1366695910692215, "step": 7176 }, { "epoch": 0.5981666666666666, "grad_norm": 5.0, "grad_norm_var": 0.06011962890625, "learning_rate": 4e-05, "loss": 5.1123, "loss/crossentropy": 2.1570041179656982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20499436184763908, "step": 7178 }, { "epoch": 0.5983333333333334, "grad_norm": 4.5, "grad_norm_var": 0.05718994140625, "learning_rate": 4e-05, "loss": 5.104, "loss/crossentropy": 2.673922121524811, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113421931862831, "step": 7180 }, { "epoch": 0.5985, "grad_norm": 4.6875, "grad_norm_var": 0.038134765625, "learning_rate": 4e-05, "loss": 4.8084, "loss/crossentropy": 2.6139089465141296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268553152680397, "step": 7182 }, { "epoch": 0.5986666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.03899739583333333, "learning_rate": 4e-05, "loss": 4.821, "loss/crossentropy": 2.4553991556167603, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23415345698595047, "step": 7184 }, { "epoch": 0.5988333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.05054931640625, "learning_rate": 4e-05, "loss": 5.1587, "loss/crossentropy": 2.134661704301834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19149811938405037, "step": 7186 }, { "epoch": 0.599, "grad_norm": 5.0, "grad_norm_var": 0.053515625, "learning_rate": 4e-05, "loss": 5.1202, "loss/crossentropy": 2.0437642335891724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21648352220654488, "step": 7188 }, { "epoch": 0.5991666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.052534993489583334, "learning_rate": 4e-05, "loss": 4.8893, "loss/crossentropy": 1.434989832341671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14296963065862656, "step": 7190 }, { "epoch": 0.5993333333333334, "grad_norm": 4.5, "grad_norm_var": 0.050455729166666664, "learning_rate": 4e-05, "loss": 4.9558, "loss/crossentropy": 1.83430977165699, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16601720824837685, "step": 7192 }, { "epoch": 0.5995, "grad_norm": 5.0, "grad_norm_var": 0.050764973958333334, "learning_rate": 4e-05, "loss": 4.8337, "loss/crossentropy": 2.1914051175117493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19750383496284485, "step": 7194 }, { "epoch": 0.5996666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.4123006184895833, "learning_rate": 4e-05, "loss": 4.896, "loss/crossentropy": 1.6491469144821167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20295769348740578, "step": 7196 }, { "epoch": 0.5998333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.40487874348958336, "learning_rate": 4e-05, "loss": 4.8493, "loss/crossentropy": 1.4159668385982513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1523029077798128, "step": 7198 }, { "epoch": 0.6, "grad_norm": 4.90625, "grad_norm_var": 0.3759073893229167, "learning_rate": 4e-05, "loss": 5.1873, "loss/crossentropy": 2.0024573504924774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217225655913353, "step": 7200 }, { "epoch": 0.6001666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.38033447265625, "learning_rate": 4e-05, "loss": 4.803, "loss/crossentropy": 2.018974833190441, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17445803619921207, "step": 7202 }, { "epoch": 0.6003333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.38218994140625, "learning_rate": 4e-05, "loss": 4.4835, "loss/crossentropy": 2.6279727816581726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21568148583173752, "step": 7204 }, { "epoch": 0.6005, "grad_norm": 6.0, "grad_norm_var": 0.46005452473958336, "learning_rate": 4e-05, "loss": 4.5529, "loss/crossentropy": 2.3042045533657074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125856727361679, "step": 7206 }, { "epoch": 0.6006666666666667, "grad_norm": 4.875, "grad_norm_var": 0.43339436848958335, "learning_rate": 4e-05, "loss": 5.0155, "loss/crossentropy": 1.9355166032910347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18338329531252384, "step": 7208 }, { "epoch": 0.6008333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.44442952473958336, "learning_rate": 4e-05, "loss": 4.7165, "loss/crossentropy": 1.6675493568181992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16726251505315304, "step": 7210 }, { "epoch": 0.601, "grad_norm": 4.59375, "grad_norm_var": 0.11536458333333334, "learning_rate": 4e-05, "loss": 4.7583, "loss/crossentropy": 2.4564692974090576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22821878641843796, "step": 7212 }, { "epoch": 0.6011666666666666, "grad_norm": 7.5625, "grad_norm_var": 0.58125, "learning_rate": 4e-05, "loss": 4.6558, "loss/crossentropy": 1.9894456341862679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19508633390069008, "step": 7214 }, { "epoch": 0.6013333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.5986287434895833, "learning_rate": 4e-05, "loss": 4.4333, "loss/crossentropy": 2.0114836394786835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24474351853132248, "step": 7216 }, { "epoch": 0.6015, "grad_norm": 4.90625, "grad_norm_var": 0.6019816080729167, "learning_rate": 4e-05, "loss": 4.8185, "loss/crossentropy": 1.790744572877884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19527454674243927, "step": 7218 }, { "epoch": 0.6016666666666667, "grad_norm": 4.625, "grad_norm_var": 0.60279541015625, "learning_rate": 4e-05, "loss": 4.7642, "loss/crossentropy": 1.9990643709897995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20038366317749023, "step": 7220 }, { "epoch": 0.6018333333333333, "grad_norm": 5.375, "grad_norm_var": 0.530712890625, "learning_rate": 4e-05, "loss": 5.0586, "loss/crossentropy": 1.5327222123742104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18884196318686008, "step": 7222 }, { "epoch": 0.602, "grad_norm": 4.84375, "grad_norm_var": 0.5283162434895833, "learning_rate": 4e-05, "loss": 5.0143, "loss/crossentropy": 2.18381866812706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2246886007487774, "step": 7224 }, { "epoch": 0.6021666666666666, "grad_norm": 5.8125, "grad_norm_var": 0.56920166015625, "learning_rate": 4e-05, "loss": 5.0421, "loss/crossentropy": 2.5714552998542786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21956535056233406, "step": 7226 }, { "epoch": 0.6023333333333334, "grad_norm": 6.53125, "grad_norm_var": 0.6641886393229167, "learning_rate": 4e-05, "loss": 5.1027, "loss/crossentropy": 2.4347121119499207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21895084530115128, "step": 7228 }, { "epoch": 0.6025, "grad_norm": 4.1875, "grad_norm_var": 0.31151936848958334, "learning_rate": 4e-05, "loss": 4.6368, "loss/crossentropy": 1.82130666077137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18457898125052452, "step": 7230 }, { "epoch": 0.6026666666666667, "grad_norm": 5.25, "grad_norm_var": 0.29791259765625, "learning_rate": 4e-05, "loss": 4.8637, "loss/crossentropy": 2.497299909591675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19863051548600197, "step": 7232 }, { "epoch": 0.6028333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.31643473307291664, "learning_rate": 4e-05, "loss": 4.5341, "loss/crossentropy": 2.023450642824173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18142622709274292, "step": 7234 }, { "epoch": 0.603, "grad_norm": 4.75, "grad_norm_var": 0.31177978515625, "learning_rate": 4e-05, "loss": 4.2234, "loss/crossentropy": 1.542768731713295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14925481751561165, "step": 7236 }, { "epoch": 0.6031666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.3067545572916667, "learning_rate": 4e-05, "loss": 5.2911, "loss/crossentropy": 2.219216376543045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20528903231024742, "step": 7238 }, { "epoch": 0.6033333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.32060139973958335, "learning_rate": 4e-05, "loss": 4.9617, "loss/crossentropy": 1.9249942675232887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838216297328472, "step": 7240 }, { "epoch": 0.6035, "grad_norm": 4.40625, "grad_norm_var": 0.27928059895833335, "learning_rate": 4e-05, "loss": 4.6277, "loss/crossentropy": 1.7711387276649475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17735876329243183, "step": 7242 }, { "epoch": 0.6036666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.09462483723958333, "learning_rate": 4e-05, "loss": 5.0941, "loss/crossentropy": 1.6195759177207947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17690261639654636, "step": 7244 }, { "epoch": 0.6038333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.06443684895833333, "learning_rate": 4e-05, "loss": 5.2268, "loss/crossentropy": 2.1065678000450134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2140568532049656, "step": 7246 }, { "epoch": 0.604, "grad_norm": 4.65625, "grad_norm_var": 0.05831705729166667, "learning_rate": 4e-05, "loss": 4.7785, "loss/crossentropy": 2.4431713819503784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23185305297374725, "step": 7248 }, { "epoch": 0.6041666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.04993082682291667, "learning_rate": 4e-05, "loss": 5.1371, "loss/crossentropy": 1.6138255596160889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841035783290863, "step": 7250 }, { "epoch": 0.6043333333333333, "grad_norm": 5.71875, "grad_norm_var": 0.10154622395833333, "learning_rate": 4e-05, "loss": 4.811, "loss/crossentropy": 1.9559299945831299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17412950471043587, "step": 7252 }, { "epoch": 0.6045, "grad_norm": 4.6875, "grad_norm_var": 0.10126546223958334, "learning_rate": 4e-05, "loss": 4.9001, "loss/crossentropy": 1.585697665810585, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17267544195055962, "step": 7254 }, { "epoch": 0.6046666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.11376546223958334, "learning_rate": 4e-05, "loss": 4.6544, "loss/crossentropy": 2.1703633964061737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22978663071990013, "step": 7256 }, { "epoch": 0.6048333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.09869384765625, "learning_rate": 4e-05, "loss": 5.1685, "loss/crossentropy": 2.224595546722412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069346234202385, "step": 7258 }, { "epoch": 0.605, "grad_norm": 4.25, "grad_norm_var": 0.12902018229166667, "learning_rate": 4e-05, "loss": 4.3922, "loss/crossentropy": 1.8856493830680847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18326758965849876, "step": 7260 }, { "epoch": 0.6051666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.127587890625, "learning_rate": 4e-05, "loss": 5.6004, "loss/crossentropy": 2.3320149183273315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21202293783426285, "step": 7262 }, { "epoch": 0.6053333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.12056884765625, "learning_rate": 4e-05, "loss": 5.1004, "loss/crossentropy": 2.044320672750473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24393825232982635, "step": 7264 }, { "epoch": 0.6055, "grad_norm": 5.0, "grad_norm_var": 0.11912434895833333, "learning_rate": 4e-05, "loss": 5.1452, "loss/crossentropy": 2.3974433541297913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2371625006198883, "step": 7266 }, { "epoch": 0.6056666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.08511962890625, "learning_rate": 4e-05, "loss": 5.325, "loss/crossentropy": 2.257835239171982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960177905857563, "step": 7268 }, { "epoch": 0.6058333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.11458333333333333, "learning_rate": 4e-05, "loss": 5.4307, "loss/crossentropy": 1.788444608449936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2374886106699705, "step": 7270 }, { "epoch": 0.606, "grad_norm": 4.84375, "grad_norm_var": 0.09384358723958333, "learning_rate": 4e-05, "loss": 4.9727, "loss/crossentropy": 1.87558251619339, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17660346627235413, "step": 7272 }, { "epoch": 0.6061666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.09933268229166667, "learning_rate": 4e-05, "loss": 4.775, "loss/crossentropy": 2.021895110607147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23071924597024918, "step": 7274 }, { "epoch": 0.6063333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.07789306640625, "learning_rate": 4e-05, "loss": 4.7348, "loss/crossentropy": 1.7142114639282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17037047445774078, "step": 7276 }, { "epoch": 0.6065, "grad_norm": 4.78125, "grad_norm_var": 0.075244140625, "learning_rate": 4e-05, "loss": 5.193, "loss/crossentropy": 2.4209593534469604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22260507941246033, "step": 7278 }, { "epoch": 0.6066666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.07877604166666667, "learning_rate": 4e-05, "loss": 4.8547, "loss/crossentropy": 1.5159368366003036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.155701145529747, "step": 7280 }, { "epoch": 0.6068333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08313395182291666, "learning_rate": 4e-05, "loss": 4.9023, "loss/crossentropy": 1.7176839709281921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18741050362586975, "step": 7282 }, { "epoch": 0.607, "grad_norm": 4.96875, "grad_norm_var": 0.06588134765625, "learning_rate": 4e-05, "loss": 4.9645, "loss/crossentropy": 1.6779725551605225, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16802698001265526, "step": 7284 }, { "epoch": 0.6071666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.032275390625, "learning_rate": 4e-05, "loss": 4.805, "loss/crossentropy": 2.100606143474579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19785485044121742, "step": 7286 }, { "epoch": 0.6073333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.038802083333333334, "learning_rate": 4e-05, "loss": 5.5269, "loss/crossentropy": 2.4047402143478394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22336392104625702, "step": 7288 }, { "epoch": 0.6075, "grad_norm": 5.09375, "grad_norm_var": 0.041304524739583334, "learning_rate": 4e-05, "loss": 5.0355, "loss/crossentropy": 2.4837300777435303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2262895181775093, "step": 7290 }, { "epoch": 0.6076666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.0435546875, "learning_rate": 4e-05, "loss": 4.5059, "loss/crossentropy": 0.9277792125940323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13263830170035362, "step": 7292 }, { "epoch": 0.6078333333333333, "grad_norm": 4.875, "grad_norm_var": 0.23424479166666667, "learning_rate": 4e-05, "loss": 5.1121, "loss/crossentropy": 2.300870269536972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2035658285021782, "step": 7294 }, { "epoch": 0.608, "grad_norm": 4.84375, "grad_norm_var": 0.23014322916666666, "learning_rate": 4e-05, "loss": 4.4993, "loss/crossentropy": 2.54541939496994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21243872493505478, "step": 7296 }, { "epoch": 0.6081666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.26519775390625, "learning_rate": 4e-05, "loss": 4.1937, "loss/crossentropy": 1.6491773575544357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16693933680653572, "step": 7298 }, { "epoch": 0.6083333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.27346598307291664, "learning_rate": 4e-05, "loss": 5.6926, "loss/crossentropy": 2.7386369705200195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123209573328495, "step": 7300 }, { "epoch": 0.6085, "grad_norm": 4.65625, "grad_norm_var": 0.27667643229166666, "learning_rate": 4e-05, "loss": 4.6207, "loss/crossentropy": 2.468148171901703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21846356615424156, "step": 7302 }, { "epoch": 0.6086666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.28013916015625, "learning_rate": 4e-05, "loss": 4.7455, "loss/crossentropy": 2.2846281826496124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20414729788899422, "step": 7304 }, { "epoch": 0.6088333333333333, "grad_norm": 5.4375, "grad_norm_var": 0.29498291015625, "learning_rate": 4e-05, "loss": 5.2679, "loss/crossentropy": 2.3819915056228638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19434510171413422, "step": 7306 }, { "epoch": 0.609, "grad_norm": 4.71875, "grad_norm_var": 0.2815755208333333, "learning_rate": 4e-05, "loss": 5.0059, "loss/crossentropy": 1.7940093278884888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17341401614248753, "step": 7308 }, { "epoch": 0.6091666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.05950113932291667, "learning_rate": 4e-05, "loss": 5.0488, "loss/crossentropy": 2.2041455805301666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19079307839274406, "step": 7310 }, { "epoch": 0.6093333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.06679280598958333, "learning_rate": 4e-05, "loss": 4.3424, "loss/crossentropy": 2.475742816925049, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21777204051613808, "step": 7312 }, { "epoch": 0.6095, "grad_norm": 4.71875, "grad_norm_var": 0.11339518229166666, "learning_rate": 4e-05, "loss": 5.1092, "loss/crossentropy": 1.4416265115141869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1719446536153555, "step": 7314 }, { "epoch": 0.6096666666666667, "grad_norm": 5.21875, "grad_norm_var": 0.119775390625, "learning_rate": 4e-05, "loss": 5.4393, "loss/crossentropy": 2.565885007381439, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22275138646364212, "step": 7316 }, { "epoch": 0.6098333333333333, "grad_norm": 4.625, "grad_norm_var": 0.12745768229166668, "learning_rate": 4e-05, "loss": 5.0595, "loss/crossentropy": 1.6449436023831367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839300487190485, "step": 7318 }, { "epoch": 0.61, "grad_norm": 5.03125, "grad_norm_var": 0.12981363932291667, "learning_rate": 4e-05, "loss": 5.0024, "loss/crossentropy": 1.2995290160179138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17145265080034733, "step": 7320 }, { "epoch": 0.6101666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.11057535807291667, "learning_rate": 4e-05, "loss": 4.8652, "loss/crossentropy": 2.277883529663086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.240261971950531, "step": 7322 }, { "epoch": 0.6103333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.11011962890625, "learning_rate": 4e-05, "loss": 4.9789, "loss/crossentropy": 1.3303010761737823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1409695502370596, "step": 7324 }, { "epoch": 0.6105, "grad_norm": 4.625, "grad_norm_var": 0.11197916666666667, "learning_rate": 4e-05, "loss": 4.695, "loss/crossentropy": 2.046277731657028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17989829368889332, "step": 7326 }, { "epoch": 0.6106666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.10054931640625, "learning_rate": 4e-05, "loss": 5.6208, "loss/crossentropy": 2.087553530931473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23087459430098534, "step": 7328 }, { "epoch": 0.6108333333333333, "grad_norm": 4.5, "grad_norm_var": 0.06222330729166667, "learning_rate": 4e-05, "loss": 4.5278, "loss/crossentropy": 1.748817302286625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16369684785604477, "step": 7330 }, { "epoch": 0.611, "grad_norm": 4.78125, "grad_norm_var": 0.05284830729166667, "learning_rate": 4e-05, "loss": 4.5774, "loss/crossentropy": 2.064897298812866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19502196460962296, "step": 7332 }, { "epoch": 0.6111666666666666, "grad_norm": 4.875, "grad_norm_var": 0.045308430989583336, "learning_rate": 4e-05, "loss": 4.9905, "loss/crossentropy": 1.2127100005745888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13186858594417572, "step": 7334 }, { "epoch": 0.6113333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.06873372395833334, "learning_rate": 4e-05, "loss": 4.3084, "loss/crossentropy": 1.5775117054581642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1517433263361454, "step": 7336 }, { "epoch": 0.6115, "grad_norm": 5.6875, "grad_norm_var": 0.11087239583333333, "learning_rate": 4e-05, "loss": 5.3517, "loss/crossentropy": 2.102786421775818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23160099610686302, "step": 7338 }, { "epoch": 0.6116666666666667, "grad_norm": 5.125, "grad_norm_var": 0.11573893229166667, "learning_rate": 4e-05, "loss": 4.8422, "loss/crossentropy": 1.8997114524245262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1689708549529314, "step": 7340 }, { "epoch": 0.6118333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.15728759765625, "learning_rate": 4e-05, "loss": 4.8982, "loss/crossentropy": 2.1801356077194214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23223424702882767, "step": 7342 }, { "epoch": 0.612, "grad_norm": 5.03125, "grad_norm_var": 0.14894205729166668, "learning_rate": 4e-05, "loss": 4.9091, "loss/crossentropy": 2.1929211616516113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20502665266394615, "step": 7344 }, { "epoch": 0.6121666666666666, "grad_norm": 4.75, "grad_norm_var": 0.14576822916666668, "learning_rate": 4e-05, "loss": 4.6603, "loss/crossentropy": 1.4914572164416313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15952800959348679, "step": 7346 }, { "epoch": 0.6123333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.15396728515625, "learning_rate": 4e-05, "loss": 4.6071, "loss/crossentropy": 1.3951681330800056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1512138657271862, "step": 7348 }, { "epoch": 0.6125, "grad_norm": 4.53125, "grad_norm_var": 0.15078125, "learning_rate": 4e-05, "loss": 4.837, "loss/crossentropy": 1.9563019052147865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891426406800747, "step": 7350 }, { "epoch": 0.6126666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.12941080729166668, "learning_rate": 4e-05, "loss": 5.0656, "loss/crossentropy": 2.1818154975771904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1797526851296425, "step": 7352 }, { "epoch": 0.6128333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.09466145833333334, "learning_rate": 4e-05, "loss": 4.9402, "loss/crossentropy": 2.425243556499481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22774088010191917, "step": 7354 }, { "epoch": 0.613, "grad_norm": 4.5, "grad_norm_var": 1.2548177083333334, "learning_rate": 4e-05, "loss": 4.8706, "loss/crossentropy": 2.3366977274417877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20710158720612526, "step": 7356 }, { "epoch": 0.6131666666666666, "grad_norm": 4.875, "grad_norm_var": 1.1992146809895834, "learning_rate": 4e-05, "loss": 5.0991, "loss/crossentropy": 2.3591907620429993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22993936762213707, "step": 7358 }, { "epoch": 0.6133333333333333, "grad_norm": 4.78125, "grad_norm_var": 1.1929646809895833, "learning_rate": 4e-05, "loss": 5.0655, "loss/crossentropy": 1.9983976259827614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18225537240505219, "step": 7360 }, { "epoch": 0.6135, "grad_norm": 4.75, "grad_norm_var": 1.1788411458333334, "learning_rate": 4e-05, "loss": 4.6148, "loss/crossentropy": 1.7058027386665344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1712406538426876, "step": 7362 }, { "epoch": 0.6136666666666667, "grad_norm": 5.0, "grad_norm_var": 1.16724853515625, "learning_rate": 4e-05, "loss": 5.3608, "loss/crossentropy": 1.91152223944664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19123814068734646, "step": 7364 }, { "epoch": 0.6138333333333333, "grad_norm": 6.0, "grad_norm_var": 1.1889322916666667, "learning_rate": 4e-05, "loss": 5.179, "loss/crossentropy": 1.94556924700737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2352980338037014, "step": 7366 }, { "epoch": 0.614, "grad_norm": 4.8125, "grad_norm_var": 1.21763916015625, "learning_rate": 4e-05, "loss": 4.8045, "loss/crossentropy": 2.543876588344574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20267504453659058, "step": 7368 }, { "epoch": 0.6141666666666666, "grad_norm": 4.59375, "grad_norm_var": 1.2033162434895834, "learning_rate": 4e-05, "loss": 4.7137, "loss/crossentropy": 2.6393585205078125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21013517677783966, "step": 7370 }, { "epoch": 0.6143333333333333, "grad_norm": 5.40625, "grad_norm_var": 0.12252197265625, "learning_rate": 4e-05, "loss": 5.2704, "loss/crossentropy": 2.2312487065792084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21544279530644417, "step": 7372 }, { "epoch": 0.6145, "grad_norm": 4.9375, "grad_norm_var": 0.12138264973958333, "learning_rate": 4e-05, "loss": 4.8709, "loss/crossentropy": 1.9392684027552605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17965877056121826, "step": 7374 }, { "epoch": 0.6146666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.11951497395833334, "learning_rate": 4e-05, "loss": 4.9357, "loss/crossentropy": 1.7078978717327118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18131383322179317, "step": 7376 }, { "epoch": 0.6148333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.12294514973958333, "learning_rate": 4e-05, "loss": 4.9165, "loss/crossentropy": 1.4037601873278618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14496637880802155, "step": 7378 }, { "epoch": 0.615, "grad_norm": 4.28125, "grad_norm_var": 0.15657145182291668, "learning_rate": 4e-05, "loss": 4.5314, "loss/crossentropy": 2.505313813686371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22614199295639992, "step": 7380 }, { "epoch": 0.6151666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.08131510416666667, "learning_rate": 4e-05, "loss": 4.7505, "loss/crossentropy": 1.5547932982444763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1560380645096302, "step": 7382 }, { "epoch": 0.6153333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.08300374348958334, "learning_rate": 4e-05, "loss": 4.4938, "loss/crossentropy": 1.399954080581665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15437544509768486, "step": 7384 }, { "epoch": 0.6155, "grad_norm": 4.75, "grad_norm_var": 0.07844645182291667, "learning_rate": 4e-05, "loss": 4.8019, "loss/crossentropy": 2.378181368112564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19977695494890213, "step": 7386 }, { "epoch": 0.6156666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.06213785807291667, "learning_rate": 4e-05, "loss": 4.9903, "loss/crossentropy": 2.203928917646408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22653977200388908, "step": 7388 }, { "epoch": 0.6158333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.07096354166666667, "learning_rate": 4e-05, "loss": 5.456, "loss/crossentropy": 2.281449168920517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20882750302553177, "step": 7390 }, { "epoch": 0.616, "grad_norm": 4.28125, "grad_norm_var": 0.09495035807291667, "learning_rate": 4e-05, "loss": 3.8264, "loss/crossentropy": 1.2839400321245193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15107953548431396, "step": 7392 }, { "epoch": 0.6161666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.09646809895833333, "learning_rate": 4e-05, "loss": 4.3006, "loss/crossentropy": 1.7372924834489822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894401628524065, "step": 7394 }, { "epoch": 0.6163333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.06378580729166666, "learning_rate": 4e-05, "loss": 4.738, "loss/crossentropy": 2.499216377735138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21850312128663063, "step": 7396 }, { "epoch": 0.6165, "grad_norm": 4.53125, "grad_norm_var": 0.0611328125, "learning_rate": 4e-05, "loss": 4.3971, "loss/crossentropy": 2.232160449028015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22296073287725449, "step": 7398 }, { "epoch": 0.6166666666666667, "grad_norm": 4.75, "grad_norm_var": 0.05774332682291667, "learning_rate": 4e-05, "loss": 4.5329, "loss/crossentropy": 1.8162498325109482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18512892350554466, "step": 7400 }, { "epoch": 0.6168333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.06106363932291667, "learning_rate": 4e-05, "loss": 4.7576, "loss/crossentropy": 2.1651022732257843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21410580724477768, "step": 7402 }, { "epoch": 0.617, "grad_norm": 4.65625, "grad_norm_var": 0.053385416666666664, "learning_rate": 4e-05, "loss": 4.4337, "loss/crossentropy": 1.6273088455200195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19039241410791874, "step": 7404 }, { "epoch": 0.6171666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.08040364583333333, "learning_rate": 4e-05, "loss": 5.1997, "loss/crossentropy": 2.3385795950889587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21196548268198967, "step": 7406 }, { "epoch": 0.6173333333333333, "grad_norm": 4.875, "grad_norm_var": 0.06519775390625, "learning_rate": 4e-05, "loss": 4.7667, "loss/crossentropy": 2.227331906557083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21265869960188866, "step": 7408 }, { "epoch": 0.6175, "grad_norm": 5.71875, "grad_norm_var": 0.12955322265625, "learning_rate": 4e-05, "loss": 5.4909, "loss/crossentropy": 2.0978061258792877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2349342331290245, "step": 7410 }, { "epoch": 0.6176666666666667, "grad_norm": 5.96875, "grad_norm_var": 0.20193684895833333, "learning_rate": 4e-05, "loss": 5.0476, "loss/crossentropy": 2.3273271322250366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20061522722244263, "step": 7412 }, { "epoch": 0.6178333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.19308268229166667, "learning_rate": 4e-05, "loss": 4.736, "loss/crossentropy": 1.521895594894886, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1687323059886694, "step": 7414 }, { "epoch": 0.618, "grad_norm": 4.53125, "grad_norm_var": 0.18791910807291667, "learning_rate": 4e-05, "loss": 4.4298, "loss/crossentropy": 2.3158479034900665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2391265109181404, "step": 7416 }, { "epoch": 0.6181666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.17003580729166667, "learning_rate": 4e-05, "loss": 5.0987, "loss/crossentropy": 1.928847923874855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19217653200030327, "step": 7418 }, { "epoch": 0.6183333333333333, "grad_norm": 4.875, "grad_norm_var": 0.14069010416666666, "learning_rate": 4e-05, "loss": 5.3043, "loss/crossentropy": 2.1934374272823334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24121011793613434, "step": 7420 }, { "epoch": 0.6185, "grad_norm": 5.0, "grad_norm_var": 0.17079671223958334, "learning_rate": 4e-05, "loss": 5.1373, "loss/crossentropy": 1.699301615357399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16717437282204628, "step": 7422 }, { "epoch": 0.6186666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.17146809895833334, "learning_rate": 4e-05, "loss": 5.0396, "loss/crossentropy": 2.2893040478229523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21165359392762184, "step": 7424 }, { "epoch": 0.6188333333333333, "grad_norm": 4.75, "grad_norm_var": 0.1810546875, "learning_rate": 4e-05, "loss": 4.2959, "loss/crossentropy": 1.883228361606598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17740708962082863, "step": 7426 }, { "epoch": 0.619, "grad_norm": 4.96875, "grad_norm_var": 0.11334635416666666, "learning_rate": 4e-05, "loss": 4.9929, "loss/crossentropy": 1.7570676431059837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17012443765997887, "step": 7428 }, { "epoch": 0.6191666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.10549723307291667, "learning_rate": 4e-05, "loss": 5.0408, "loss/crossentropy": 1.8213330134749413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1704744454473257, "step": 7430 }, { "epoch": 0.6193333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.09777018229166666, "learning_rate": 4e-05, "loss": 5.172, "loss/crossentropy": 1.6421629637479782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18456821888685226, "step": 7432 }, { "epoch": 0.6195, "grad_norm": 4.625, "grad_norm_var": 0.103125, "learning_rate": 4e-05, "loss": 4.587, "loss/crossentropy": 1.8029606891795993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16571101709268987, "step": 7434 }, { "epoch": 0.6196666666666667, "grad_norm": 5.0, "grad_norm_var": 0.10384114583333333, "learning_rate": 4e-05, "loss": 5.0261, "loss/crossentropy": 0.9726422056555748, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1137815099209547, "step": 7436 }, { "epoch": 0.6198333333333333, "grad_norm": 4.75, "grad_norm_var": 0.045817057291666664, "learning_rate": 4e-05, "loss": 5.0299, "loss/crossentropy": 1.7549875676631927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18394076451659203, "step": 7438 }, { "epoch": 0.62, "grad_norm": 4.78125, "grad_norm_var": 0.04550374348958333, "learning_rate": 4e-05, "loss": 4.4456, "loss/crossentropy": 1.8778206706047058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20654203370213509, "step": 7440 }, { "epoch": 0.6201666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.02984619140625, "learning_rate": 4e-05, "loss": 4.7478, "loss/crossentropy": 2.4427354633808136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256711333990097, "step": 7442 }, { "epoch": 0.6203333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.03866780598958333, "learning_rate": 4e-05, "loss": 4.5859, "loss/crossentropy": 2.2122822999954224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21503214538097382, "step": 7444 }, { "epoch": 0.6205, "grad_norm": 4.65625, "grad_norm_var": 0.042252604166666666, "learning_rate": 4e-05, "loss": 4.5618, "loss/crossentropy": 2.0259710252285004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19461403414607048, "step": 7446 }, { "epoch": 0.6206666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.041910807291666664, "learning_rate": 4e-05, "loss": 5.1102, "loss/crossentropy": 2.308306932449341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.225881177932024, "step": 7448 }, { "epoch": 0.6208333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.04159749348958333, "learning_rate": 4e-05, "loss": 5.0064, "loss/crossentropy": 2.103279024362564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2261260412633419, "step": 7450 }, { "epoch": 0.621, "grad_norm": 4.8125, "grad_norm_var": 0.026936848958333332, "learning_rate": 4e-05, "loss": 5.1623, "loss/crossentropy": 1.6729852855205536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.169599249958992, "step": 7452 }, { "epoch": 0.6211666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.01734619140625, "learning_rate": 4e-05, "loss": 4.3505, "loss/crossentropy": 1.8214651942253113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827968992292881, "step": 7454 }, { "epoch": 0.6213333333333333, "grad_norm": 4.75, "grad_norm_var": 0.017606608072916665, "learning_rate": 4e-05, "loss": 4.4708, "loss/crossentropy": 1.6592305526137352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15527719631791115, "step": 7456 }, { "epoch": 0.6215, "grad_norm": 5.03125, "grad_norm_var": 0.03435872395833333, "learning_rate": 4e-05, "loss": 5.0017, "loss/crossentropy": 2.079524904489517, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18644290789961815, "step": 7458 }, { "epoch": 0.6216666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.029423014322916666, "learning_rate": 4e-05, "loss": 4.6251, "loss/crossentropy": 1.945441111922264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1680115982890129, "step": 7460 }, { "epoch": 0.6218333333333333, "grad_norm": 4.5, "grad_norm_var": 0.03189697265625, "learning_rate": 4e-05, "loss": 4.8618, "loss/crossentropy": 2.3849238753318787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19866472855210304, "step": 7462 }, { "epoch": 0.622, "grad_norm": 4.46875, "grad_norm_var": 0.04309895833333333, "learning_rate": 4e-05, "loss": 4.4236, "loss/crossentropy": 1.5672996863722801, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.160741176456213, "step": 7464 }, { "epoch": 0.6221666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.04309895833333333, "learning_rate": 4e-05, "loss": 4.7951, "loss/crossentropy": 1.4478224888443947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15010821633040905, "step": 7466 }, { "epoch": 0.6223333333333333, "grad_norm": 4.21875, "grad_norm_var": 0.09257405598958333, "learning_rate": 4e-05, "loss": 4.695, "loss/crossentropy": 1.4308890029788017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18856440111994743, "step": 7468 }, { "epoch": 0.6225, "grad_norm": 4.6875, "grad_norm_var": 0.09608968098958333, "learning_rate": 4e-05, "loss": 4.9713, "loss/crossentropy": 1.7661740481853485, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19487347826361656, "step": 7470 }, { "epoch": 0.6226666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.09607747395833334, "learning_rate": 4e-05, "loss": 4.7833, "loss/crossentropy": 1.2931400835514069, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15353696048259735, "step": 7472 }, { "epoch": 0.6228333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.08606770833333334, "learning_rate": 4e-05, "loss": 5.6731, "loss/crossentropy": 2.2657946050167084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23214036226272583, "step": 7474 }, { "epoch": 0.623, "grad_norm": 4.5625, "grad_norm_var": 0.104150390625, "learning_rate": 4e-05, "loss": 4.544, "loss/crossentropy": 1.616468869149685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15361887589097023, "step": 7476 }, { "epoch": 0.6231666666666666, "grad_norm": 5.125, "grad_norm_var": 0.100634765625, "learning_rate": 4e-05, "loss": 5.0915, "loss/crossentropy": 1.716950848698616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18519181199371815, "step": 7478 }, { "epoch": 0.6233333333333333, "grad_norm": 4.875, "grad_norm_var": 0.08511962890625, "learning_rate": 4e-05, "loss": 4.7246, "loss/crossentropy": 2.2773490250110626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1976032555103302, "step": 7480 }, { "epoch": 0.6235, "grad_norm": 4.65625, "grad_norm_var": 0.11910400390625, "learning_rate": 4e-05, "loss": 4.9252, "loss/crossentropy": 1.9374547228217125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17996854707598686, "step": 7482 }, { "epoch": 0.6236666666666667, "grad_norm": 4.875, "grad_norm_var": 0.088134765625, "learning_rate": 4e-05, "loss": 4.6941, "loss/crossentropy": 2.4025683403015137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20034634694457054, "step": 7484 }, { "epoch": 0.6238333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.08339436848958333, "learning_rate": 4e-05, "loss": 5.1253, "loss/crossentropy": 1.978261910378933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18603325076401234, "step": 7486 }, { "epoch": 0.624, "grad_norm": 4.625, "grad_norm_var": 0.08707275390625, "learning_rate": 4e-05, "loss": 5.0676, "loss/crossentropy": 2.561383068561554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25106481462717056, "step": 7488 }, { "epoch": 0.6241666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.09231363932291667, "learning_rate": 4e-05, "loss": 5.1759, "loss/crossentropy": 1.8795787617564201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17086850106716156, "step": 7490 }, { "epoch": 0.6243333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.08017171223958333, "learning_rate": 4e-05, "loss": 4.4529, "loss/crossentropy": 2.1656236350536346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20800131186842918, "step": 7492 }, { "epoch": 0.6245, "grad_norm": 4.84375, "grad_norm_var": 0.07763264973958334, "learning_rate": 4e-05, "loss": 5.2549, "loss/crossentropy": 2.350065529346466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21179821342229843, "step": 7494 }, { "epoch": 0.6246666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.08951822916666667, "learning_rate": 4e-05, "loss": 4.4729, "loss/crossentropy": 2.0494428873062134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21732918173074722, "step": 7496 }, { "epoch": 0.6248333333333334, "grad_norm": 5.40625, "grad_norm_var": 0.07083333333333333, "learning_rate": 4e-05, "loss": 5.1736, "loss/crossentropy": 2.5712009966373444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21002069488167763, "step": 7498 }, { "epoch": 0.625, "grad_norm": 5.0625, "grad_norm_var": 0.058056640625, "learning_rate": 4e-05, "loss": 4.6009, "loss/crossentropy": 1.6923210993409157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1651032567024231, "step": 7500 }, { "epoch": 0.6251666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.06161702473958333, "learning_rate": 4e-05, "loss": 4.2048, "loss/crossentropy": 0.818359375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10744648613035679, "step": 7502 }, { "epoch": 0.6253333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.06638997395833333, "learning_rate": 4e-05, "loss": 4.3508, "loss/crossentropy": 1.6889969930052757, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20452985167503357, "step": 7504 }, { "epoch": 0.6255, "grad_norm": 5.25, "grad_norm_var": 0.06679280598958333, "learning_rate": 4e-05, "loss": 4.9417, "loss/crossentropy": 1.9900458455085754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20413101464509964, "step": 7506 }, { "epoch": 0.6256666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.07245686848958334, "learning_rate": 4e-05, "loss": 4.9704, "loss/crossentropy": 2.0661009550094604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2114063873887062, "step": 7508 }, { "epoch": 0.6258333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.07298177083333333, "learning_rate": 4e-05, "loss": 4.8257, "loss/crossentropy": 1.449072316288948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15182125568389893, "step": 7510 }, { "epoch": 0.626, "grad_norm": 4.96875, "grad_norm_var": 0.06640218098958334, "learning_rate": 4e-05, "loss": 5.5008, "loss/crossentropy": 2.6559234857559204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21250774711370468, "step": 7512 }, { "epoch": 0.6261666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.05933837890625, "learning_rate": 4e-05, "loss": 5.0174, "loss/crossentropy": 1.3103863522410393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14163080044090748, "step": 7514 }, { "epoch": 0.6263333333333333, "grad_norm": 4.625, "grad_norm_var": 0.06064046223958333, "learning_rate": 4e-05, "loss": 5.2193, "loss/crossentropy": 2.289563000202179, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21135593950748444, "step": 7516 }, { "epoch": 0.6265, "grad_norm": 4.5625, "grad_norm_var": 0.06496988932291667, "learning_rate": 4e-05, "loss": 5.1146, "loss/crossentropy": 2.310837507247925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19352519512176514, "step": 7518 }, { "epoch": 0.6266666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.07411702473958333, "learning_rate": 4e-05, "loss": 4.5612, "loss/crossentropy": 1.3394847959280014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13622993975877762, "step": 7520 }, { "epoch": 0.6268333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.07394205729166667, "learning_rate": 4e-05, "loss": 5.2805, "loss/crossentropy": 1.9260080009698868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18235324323177338, "step": 7522 }, { "epoch": 0.627, "grad_norm": 5.15625, "grad_norm_var": 0.056494140625, "learning_rate": 4e-05, "loss": 5.5087, "loss/crossentropy": 2.140301376581192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202216237783432, "step": 7524 }, { "epoch": 0.6271666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.06822916666666666, "learning_rate": 4e-05, "loss": 4.4574, "loss/crossentropy": 2.1392059326171875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24074340984225273, "step": 7526 }, { "epoch": 0.6273333333333333, "grad_norm": 4.875, "grad_norm_var": 0.064697265625, "learning_rate": 4e-05, "loss": 4.4673, "loss/crossentropy": 1.8771500810980797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1791480854153633, "step": 7528 }, { "epoch": 0.6275, "grad_norm": 4.96875, "grad_norm_var": 0.06702067057291666, "learning_rate": 4e-05, "loss": 4.9868, "loss/crossentropy": 1.9266760498285294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19410490244627, "step": 7530 }, { "epoch": 0.6276666666666667, "grad_norm": 5.125, "grad_norm_var": 0.06809488932291667, "learning_rate": 4e-05, "loss": 5.5456, "loss/crossentropy": 2.4995266795158386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21117721870541573, "step": 7532 }, { "epoch": 0.6278333333333334, "grad_norm": 4.625, "grad_norm_var": 0.073681640625, "learning_rate": 4e-05, "loss": 4.5547, "loss/crossentropy": 2.4195366203784943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22145430743694305, "step": 7534 }, { "epoch": 0.628, "grad_norm": 4.3125, "grad_norm_var": 0.072509765625, "learning_rate": 4e-05, "loss": 4.1866, "loss/crossentropy": 1.2680030390620232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13905576802790165, "step": 7536 }, { "epoch": 0.6281666666666667, "grad_norm": 5.40625, "grad_norm_var": 0.08404947916666666, "learning_rate": 4e-05, "loss": 5.0348, "loss/crossentropy": 1.3818910643458366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14404772967100143, "step": 7538 }, { "epoch": 0.6283333333333333, "grad_norm": 4.75, "grad_norm_var": 0.07902018229166667, "learning_rate": 4e-05, "loss": 5.0873, "loss/crossentropy": 1.975064903497696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19577939808368683, "step": 7540 }, { "epoch": 0.6285, "grad_norm": 4.46875, "grad_norm_var": 0.078515625, "learning_rate": 4e-05, "loss": 4.6554, "loss/crossentropy": 1.7756718024611473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17718594893813133, "step": 7542 }, { "epoch": 0.6286666666666667, "grad_norm": 5.125, "grad_norm_var": 0.08674723307291667, "learning_rate": 4e-05, "loss": 4.5838, "loss/crossentropy": 1.5631278902292252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20101314038038254, "step": 7544 }, { "epoch": 0.6288333333333334, "grad_norm": 4.625, "grad_norm_var": 0.09472249348958334, "learning_rate": 4e-05, "loss": 4.3893, "loss/crossentropy": 2.0111151337623596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1673036515712738, "step": 7546 }, { "epoch": 0.629, "grad_norm": 4.34375, "grad_norm_var": 0.10676676432291667, "learning_rate": 4e-05, "loss": 4.9599, "loss/crossentropy": 1.497259370982647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15227816626429558, "step": 7548 }, { "epoch": 0.6291666666666667, "grad_norm": 4.625, "grad_norm_var": 0.11248372395833334, "learning_rate": 4e-05, "loss": 4.2137, "loss/crossentropy": 2.412768602371216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20763478055596352, "step": 7550 }, { "epoch": 0.6293333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.10826822916666666, "learning_rate": 4e-05, "loss": 4.8871, "loss/crossentropy": 1.8732339143753052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2120702899992466, "step": 7552 }, { "epoch": 0.6295, "grad_norm": 5.125, "grad_norm_var": 0.09029541015625, "learning_rate": 4e-05, "loss": 4.7153, "loss/crossentropy": 2.2625193893909454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1922021098434925, "step": 7554 }, { "epoch": 0.6296666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.09811197916666667, "learning_rate": 4e-05, "loss": 5.2024, "loss/crossentropy": 2.018505483865738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894335299730301, "step": 7556 }, { "epoch": 0.6298333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.09052327473958334, "learning_rate": 4e-05, "loss": 5.1375, "loss/crossentropy": 1.974723607301712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21125193685293198, "step": 7558 }, { "epoch": 0.63, "grad_norm": 4.625, "grad_norm_var": 0.12745768229166668, "learning_rate": 4e-05, "loss": 4.7333, "loss/crossentropy": 1.1625150069594383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1372869722545147, "step": 7560 }, { "epoch": 0.6301666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.911328125, "learning_rate": 4e-05, "loss": 4.9296, "loss/crossentropy": 1.8650590181350708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1866096295416355, "step": 7562 }, { "epoch": 0.6303333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.8719889322916666, "learning_rate": 4e-05, "loss": 4.8647, "loss/crossentropy": 2.071987845003605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18294048868119717, "step": 7564 }, { "epoch": 0.6305, "grad_norm": 5.3125, "grad_norm_var": 0.82847900390625, "learning_rate": 4e-05, "loss": 4.9965, "loss/crossentropy": 1.72061687707901, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1748257651925087, "step": 7566 }, { "epoch": 0.6306666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.895166015625, "learning_rate": 4e-05, "loss": 4.1251, "loss/crossentropy": 2.17667618393898, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039562426507473, "step": 7568 }, { "epoch": 0.6308333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.882421875, "learning_rate": 4e-05, "loss": 4.9057, "loss/crossentropy": 1.8860152289271355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19144929759204388, "step": 7570 }, { "epoch": 0.631, "grad_norm": 4.4375, "grad_norm_var": 0.9200480143229167, "learning_rate": 4e-05, "loss": 4.4815, "loss/crossentropy": 2.2932342290878296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1954507753252983, "step": 7572 }, { "epoch": 0.6311666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.9251139322916667, "learning_rate": 4e-05, "loss": 5.395, "loss/crossentropy": 2.4433215856552124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20751015469431877, "step": 7574 }, { "epoch": 0.6313333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.9438802083333333, "learning_rate": 4e-05, "loss": 4.6266, "loss/crossentropy": 2.486309230327606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23241155594587326, "step": 7576 }, { "epoch": 0.6315, "grad_norm": 4.5625, "grad_norm_var": 0.092041015625, "learning_rate": 4e-05, "loss": 4.7142, "loss/crossentropy": 1.9126665592193604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2041993960738182, "step": 7578 }, { "epoch": 0.6316666666666667, "grad_norm": 5.25, "grad_norm_var": 0.09306233723958333, "learning_rate": 4e-05, "loss": 5.2434, "loss/crossentropy": 1.77960654348135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16748891957104206, "step": 7580 }, { "epoch": 0.6318333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.07392171223958334, "learning_rate": 4e-05, "loss": 4.9963, "loss/crossentropy": 1.656422920525074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18162991851568222, "step": 7582 }, { "epoch": 0.632, "grad_norm": 4.75, "grad_norm_var": 0.05924072265625, "learning_rate": 4e-05, "loss": 5.164, "loss/crossentropy": 1.8230694606900215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17624171450734138, "step": 7584 }, { "epoch": 0.6321666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.06884358723958334, "learning_rate": 4e-05, "loss": 5.0233, "loss/crossentropy": 1.4461549371480942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15429365262389183, "step": 7586 }, { "epoch": 0.6323333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.06164957682291667, "learning_rate": 4e-05, "loss": 5.1898, "loss/crossentropy": 2.1677410900592804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.191360954195261, "step": 7588 }, { "epoch": 0.6325, "grad_norm": 4.84375, "grad_norm_var": 0.05924072265625, "learning_rate": 4e-05, "loss": 5.0244, "loss/crossentropy": 1.9723278135061264, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18035429902374744, "step": 7590 }, { "epoch": 0.6326666666666667, "grad_norm": 4.5, "grad_norm_var": 0.048567708333333334, "learning_rate": 4e-05, "loss": 4.6673, "loss/crossentropy": 1.7898750752210617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19324330985546112, "step": 7592 }, { "epoch": 0.6328333333333334, "grad_norm": 5.25, "grad_norm_var": 0.053446451822916664, "learning_rate": 4e-05, "loss": 4.982, "loss/crossentropy": 2.2220281660556793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20492373406887054, "step": 7594 }, { "epoch": 0.633, "grad_norm": 4.5625, "grad_norm_var": 0.057535807291666664, "learning_rate": 4e-05, "loss": 5.0144, "loss/crossentropy": 1.870813049376011, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17919510789215565, "step": 7596 }, { "epoch": 0.6331666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.05536702473958333, "learning_rate": 4e-05, "loss": 4.7825, "loss/crossentropy": 1.8371545374393463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20575408078730106, "step": 7598 }, { "epoch": 0.6333333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.057417805989583334, "learning_rate": 4e-05, "loss": 4.8203, "loss/crossentropy": 1.3351125866174698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14372671209275723, "step": 7600 }, { "epoch": 0.6335, "grad_norm": 5.09375, "grad_norm_var": 0.05584309895833333, "learning_rate": 4e-05, "loss": 5.2663, "loss/crossentropy": 2.3292965590953827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20524241402745247, "step": 7602 }, { "epoch": 0.6336666666666667, "grad_norm": 4.875, "grad_norm_var": 0.05517171223958333, "learning_rate": 4e-05, "loss": 4.7048, "loss/crossentropy": 2.4347563982009888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2168492116034031, "step": 7604 }, { "epoch": 0.6338333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.05689697265625, "learning_rate": 4e-05, "loss": 4.607, "loss/crossentropy": 2.1245033144950867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22462333366274834, "step": 7606 }, { "epoch": 0.634, "grad_norm": 5.6875, "grad_norm_var": 0.08586832682291666, "learning_rate": 4e-05, "loss": 4.44, "loss/crossentropy": 1.540616787970066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18548547476530075, "step": 7608 }, { "epoch": 0.6341666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.09299723307291667, "learning_rate": 4e-05, "loss": 4.502, "loss/crossentropy": 1.3562129810452461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18716447241604328, "step": 7610 }, { "epoch": 0.6343333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.08883056640625, "learning_rate": 4e-05, "loss": 4.3424, "loss/crossentropy": 1.135543867945671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14085247367620468, "step": 7612 }, { "epoch": 0.6345, "grad_norm": 4.9375, "grad_norm_var": 0.10015869140625, "learning_rate": 4e-05, "loss": 5.2584, "loss/crossentropy": 2.25225293636322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21712664887309074, "step": 7614 }, { "epoch": 0.6346666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.09390869140625, "learning_rate": 4e-05, "loss": 5.0247, "loss/crossentropy": 1.9538735672831535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17630420625209808, "step": 7616 }, { "epoch": 0.6348333333333334, "grad_norm": 4.5, "grad_norm_var": 0.112353515625, "learning_rate": 4e-05, "loss": 3.9214, "loss/crossentropy": 1.6917081847786903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17729554325342178, "step": 7618 }, { "epoch": 0.635, "grad_norm": 6.4375, "grad_norm_var": 0.48606770833333335, "learning_rate": 4e-05, "loss": 5.2245, "loss/crossentropy": 2.008490338921547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19701511040329933, "step": 7620 }, { "epoch": 0.6351666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.49247639973958335, "learning_rate": 4e-05, "loss": 5.2463, "loss/crossentropy": 1.7468746230006218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2234695442020893, "step": 7622 }, { "epoch": 0.6353333333333333, "grad_norm": 4.75, "grad_norm_var": 0.4777628580729167, "learning_rate": 4e-05, "loss": 4.6194, "loss/crossentropy": 1.1543590053915977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1463829204440117, "step": 7624 }, { "epoch": 0.6355, "grad_norm": 4.96875, "grad_norm_var": 0.49244791666666665, "learning_rate": 4e-05, "loss": 4.8176, "loss/crossentropy": 1.722088746726513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1737196370959282, "step": 7626 }, { "epoch": 0.6356666666666667, "grad_norm": 4.75, "grad_norm_var": 0.4774698893229167, "learning_rate": 4e-05, "loss": 5.0913, "loss/crossentropy": 2.0701277554035187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123371586203575, "step": 7628 }, { "epoch": 0.6358333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.4684244791666667, "learning_rate": 4e-05, "loss": 5.1948, "loss/crossentropy": 1.5187406539916992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752415895462036, "step": 7630 }, { "epoch": 0.636, "grad_norm": 5.3125, "grad_norm_var": 0.48138020833333334, "learning_rate": 4e-05, "loss": 5.2926, "loss/crossentropy": 2.065512478351593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2165343016386032, "step": 7632 }, { "epoch": 0.6361666666666667, "grad_norm": 4.125, "grad_norm_var": 0.5079060872395833, "learning_rate": 4e-05, "loss": 4.5435, "loss/crossentropy": 1.003174789249897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11726399697363377, "step": 7634 }, { "epoch": 0.6363333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.09542643229166667, "learning_rate": 4e-05, "loss": 4.9272, "loss/crossentropy": 2.772099256515503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22614924237132072, "step": 7636 }, { "epoch": 0.6365, "grad_norm": 4.5, "grad_norm_var": 0.10167643229166666, "learning_rate": 4e-05, "loss": 5.1145, "loss/crossentropy": 2.4037272334098816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22226005792617798, "step": 7638 }, { "epoch": 0.6366666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.08762613932291667, "learning_rate": 4e-05, "loss": 5.2337, "loss/crossentropy": 2.631228506565094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077028527855873, "step": 7640 }, { "epoch": 0.6368333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.080322265625, "learning_rate": 4e-05, "loss": 5.4376, "loss/crossentropy": 2.5499427318573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166195549070835, "step": 7642 }, { "epoch": 0.637, "grad_norm": 4.9375, "grad_norm_var": 0.0669921875, "learning_rate": 4e-05, "loss": 5.0609, "loss/crossentropy": 2.2906173169612885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19342058897018433, "step": 7644 }, { "epoch": 0.6371666666666667, "grad_norm": 4.375, "grad_norm_var": 0.08687744140625, "learning_rate": 4e-05, "loss": 4.6802, "loss/crossentropy": 1.5942266285419464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20375887118279934, "step": 7646 }, { "epoch": 0.6373333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.07571207682291667, "learning_rate": 4e-05, "loss": 4.6925, "loss/crossentropy": 1.7787268534302711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17380007728934288, "step": 7648 }, { "epoch": 0.6375, "grad_norm": 4.84375, "grad_norm_var": 0.08821614583333333, "learning_rate": 4e-05, "loss": 5.239, "loss/crossentropy": 2.2324607968330383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22638193517923355, "step": 7650 }, { "epoch": 0.6376666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.1076171875, "learning_rate": 4e-05, "loss": 5.1601, "loss/crossentropy": 2.2216447293758392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244611196219921, "step": 7652 }, { "epoch": 0.6378333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.102978515625, "learning_rate": 4e-05, "loss": 5.0098, "loss/crossentropy": 1.621771179139614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16405992954969406, "step": 7654 }, { "epoch": 0.638, "grad_norm": 4.59375, "grad_norm_var": 0.108056640625, "learning_rate": 4e-05, "loss": 4.9768, "loss/crossentropy": 1.562087506055832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15578421019017696, "step": 7656 }, { "epoch": 0.6381666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.113134765625, "learning_rate": 4e-05, "loss": 4.6871, "loss/crossentropy": 1.9255924224853516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18294049426913261, "step": 7658 }, { "epoch": 0.6383333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.14205322265625, "learning_rate": 4e-05, "loss": 4.3626, "loss/crossentropy": 1.799445129930973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18437882140278816, "step": 7660 }, { "epoch": 0.6385, "grad_norm": 4.8125, "grad_norm_var": 0.12337239583333333, "learning_rate": 4e-05, "loss": 4.7016, "loss/crossentropy": 1.9358659163117409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17831017449498177, "step": 7662 }, { "epoch": 0.6386666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.11278889973958334, "learning_rate": 4e-05, "loss": 4.6857, "loss/crossentropy": 2.1596154496073723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.186475221067667, "step": 7664 }, { "epoch": 0.6388333333333334, "grad_norm": 4.875, "grad_norm_var": 0.07310791015625, "learning_rate": 4e-05, "loss": 4.7901, "loss/crossentropy": 1.5760779231786728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1756380796432495, "step": 7666 }, { "epoch": 0.639, "grad_norm": 4.6875, "grad_norm_var": 0.05640869140625, "learning_rate": 4e-05, "loss": 5.2057, "loss/crossentropy": 2.23982173204422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1747075356543064, "step": 7668 }, { "epoch": 0.6391666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.053450520833333334, "learning_rate": 4e-05, "loss": 4.9526, "loss/crossentropy": 1.7911747694015503, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22356000542640686, "step": 7670 }, { "epoch": 0.6393333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.04879150390625, "learning_rate": 4e-05, "loss": 5.1562, "loss/crossentropy": 1.8702715337276459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837063878774643, "step": 7672 }, { "epoch": 0.6395, "grad_norm": 4.84375, "grad_norm_var": 0.04827067057291667, "learning_rate": 4e-05, "loss": 5.0209, "loss/crossentropy": 1.9256580173969269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18390294164419174, "step": 7674 }, { "epoch": 0.6396666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.019917805989583332, "learning_rate": 4e-05, "loss": 4.7435, "loss/crossentropy": 1.92479457706213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19825425744056702, "step": 7676 }, { "epoch": 0.6398333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.0185546875, "learning_rate": 4e-05, "loss": 5.3588, "loss/crossentropy": 2.2302474975585938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21018041297793388, "step": 7678 }, { "epoch": 0.64, "grad_norm": 4.8125, "grad_norm_var": 0.017431640625, "learning_rate": 4e-05, "loss": 4.9013, "loss/crossentropy": 2.454475373029709, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20873108878731728, "step": 7680 }, { "epoch": 0.6401666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.02008056640625, "learning_rate": 4e-05, "loss": 5.0343, "loss/crossentropy": 1.556854486465454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1480635069310665, "step": 7682 }, { "epoch": 0.6403333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.029166666666666667, "learning_rate": 4e-05, "loss": 4.6421, "loss/crossentropy": 1.8543910756707191, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16730140149593353, "step": 7684 }, { "epoch": 0.6405, "grad_norm": 4.875, "grad_norm_var": 0.04205322265625, "learning_rate": 4e-05, "loss": 4.8912, "loss/crossentropy": 2.4121593832969666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19549377635121346, "step": 7686 }, { "epoch": 0.6406666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.04146728515625, "learning_rate": 4e-05, "loss": 4.1813, "loss/crossentropy": 1.6175341084599495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17516151070594788, "step": 7688 }, { "epoch": 0.6408333333333334, "grad_norm": 5.25, "grad_norm_var": 0.051656087239583336, "learning_rate": 4e-05, "loss": 5.1848, "loss/crossentropy": 2.531389057636261, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26043013855814934, "step": 7690 }, { "epoch": 0.641, "grad_norm": 5.03125, "grad_norm_var": 0.0525390625, "learning_rate": 4e-05, "loss": 5.6131, "loss/crossentropy": 1.8920731022953987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1748149711638689, "step": 7692 }, { "epoch": 0.6411666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.054947916666666666, "learning_rate": 4e-05, "loss": 5.2251, "loss/crossentropy": 2.11933171749115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19750068709254265, "step": 7694 }, { "epoch": 0.6413333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.30357666015625, "learning_rate": 4e-05, "loss": 4.493, "loss/crossentropy": 2.253427028656006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19837215542793274, "step": 7696 }, { "epoch": 0.6415, "grad_norm": 4.96875, "grad_norm_var": 0.30155843098958335, "learning_rate": 4e-05, "loss": 4.5354, "loss/crossentropy": 0.9745614528656006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10782595910131931, "step": 7698 }, { "epoch": 0.6416666666666667, "grad_norm": 5.0, "grad_norm_var": 0.2813639322916667, "learning_rate": 4e-05, "loss": 5.3764, "loss/crossentropy": 2.150584578514099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23296813294291496, "step": 7700 }, { "epoch": 0.6418333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.26998697916666664, "learning_rate": 4e-05, "loss": 5.004, "loss/crossentropy": 2.3049569725990295, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19985363632440567, "step": 7702 }, { "epoch": 0.642, "grad_norm": 4.84375, "grad_norm_var": 0.2631144205729167, "learning_rate": 4e-05, "loss": 4.3756, "loss/crossentropy": 1.7743771374225616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2020826954394579, "step": 7704 }, { "epoch": 0.6421666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.25484619140625, "learning_rate": 4e-05, "loss": 5.093, "loss/crossentropy": 1.6929708272218704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18312661536037922, "step": 7706 }, { "epoch": 0.6423333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.26470947265625, "learning_rate": 4e-05, "loss": 5.1116, "loss/crossentropy": 2.522721290588379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22130097448825836, "step": 7708 }, { "epoch": 0.6425, "grad_norm": 4.90625, "grad_norm_var": 0.2642578125, "learning_rate": 4e-05, "loss": 5.7148, "loss/crossentropy": 2.4536512196063995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.236469566822052, "step": 7710 }, { "epoch": 0.6426666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.04290364583333333, "learning_rate": 4e-05, "loss": 4.4442, "loss/crossentropy": 2.4721298217773438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2147650420665741, "step": 7712 }, { "epoch": 0.6428333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.039286295572916664, "learning_rate": 4e-05, "loss": 5.2998, "loss/crossentropy": 1.9547239020466805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19260060787200928, "step": 7714 }, { "epoch": 0.643, "grad_norm": 4.84375, "grad_norm_var": 0.037919108072916666, "learning_rate": 4e-05, "loss": 4.6844, "loss/crossentropy": 1.1047619432210922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15300701186060905, "step": 7716 }, { "epoch": 0.6431666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.03424479166666667, "learning_rate": 4e-05, "loss": 4.967, "loss/crossentropy": 1.8797119855880737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17335743829607964, "step": 7718 }, { "epoch": 0.6433333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.035445149739583334, "learning_rate": 4e-05, "loss": 5.6001, "loss/crossentropy": 1.82417381554842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1801386997103691, "step": 7720 }, { "epoch": 0.6435, "grad_norm": 4.71875, "grad_norm_var": 0.03954671223958333, "learning_rate": 4e-05, "loss": 5.1648, "loss/crossentropy": 1.1887472122907639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17347997426986694, "step": 7722 }, { "epoch": 0.6436666666666667, "grad_norm": 5.0, "grad_norm_var": 0.04073893229166667, "learning_rate": 4e-05, "loss": 4.9365, "loss/crossentropy": 2.210069417953491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106613926589489, "step": 7724 }, { "epoch": 0.6438333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.04937744140625, "learning_rate": 4e-05, "loss": 4.6794, "loss/crossentropy": 1.8511146306991577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16800183057785034, "step": 7726 }, { "epoch": 0.644, "grad_norm": 4.875, "grad_norm_var": 0.04504801432291667, "learning_rate": 4e-05, "loss": 5.1563, "loss/crossentropy": 2.2941965758800507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2198324017226696, "step": 7728 }, { "epoch": 0.6441666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.04576822916666667, "learning_rate": 4e-05, "loss": 5.0422, "loss/crossentropy": 2.0403133034706116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22183560580015182, "step": 7730 }, { "epoch": 0.6443333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.054036458333333336, "learning_rate": 4e-05, "loss": 4.7523, "loss/crossentropy": 2.1832509338855743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20109236985445023, "step": 7732 }, { "epoch": 0.6445, "grad_norm": 4.65625, "grad_norm_var": 0.046728515625, "learning_rate": 4e-05, "loss": 5.1595, "loss/crossentropy": 2.494588077068329, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22531383857131004, "step": 7734 }, { "epoch": 0.6446666666666667, "grad_norm": 4.75, "grad_norm_var": 0.03834228515625, "learning_rate": 4e-05, "loss": 4.8467, "loss/crossentropy": 2.134985640645027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793641895055771, "step": 7736 }, { "epoch": 0.6448333333333334, "grad_norm": 4.5, "grad_norm_var": 0.045426432291666666, "learning_rate": 4e-05, "loss": 4.5305, "loss/crossentropy": 1.2465531900525093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13715138658881187, "step": 7738 }, { "epoch": 0.645, "grad_norm": 4.84375, "grad_norm_var": 0.0361328125, "learning_rate": 4e-05, "loss": 4.0631, "loss/crossentropy": 1.8641687408089638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15544311329722404, "step": 7740 }, { "epoch": 0.6451666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.06328125, "learning_rate": 4e-05, "loss": 5.2158, "loss/crossentropy": 2.3051227927207947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19798275083303452, "step": 7742 }, { "epoch": 0.6453333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.08329671223958333, "learning_rate": 4e-05, "loss": 5.2336, "loss/crossentropy": 2.479782283306122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21739281341433525, "step": 7744 }, { "epoch": 0.6455, "grad_norm": 4.75, "grad_norm_var": 0.08508707682291666, "learning_rate": 4e-05, "loss": 4.7113, "loss/crossentropy": 1.2954497933387756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15466006100177765, "step": 7746 }, { "epoch": 0.6456666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.07316080729166667, "learning_rate": 4e-05, "loss": 5.0855, "loss/crossentropy": 2.1735799908638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.240029476583004, "step": 7748 }, { "epoch": 0.6458333333333334, "grad_norm": 4.625, "grad_norm_var": 0.072900390625, "learning_rate": 4e-05, "loss": 4.7663, "loss/crossentropy": 2.0483902767300606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18308479711413383, "step": 7750 }, { "epoch": 0.646, "grad_norm": 4.625, "grad_norm_var": 0.07317708333333334, "learning_rate": 4e-05, "loss": 4.6245, "loss/crossentropy": 1.47433602809906, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1538691483438015, "step": 7752 }, { "epoch": 0.6461666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.06796468098958333, "learning_rate": 4e-05, "loss": 4.8267, "loss/crossentropy": 2.449679970741272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22104224935173988, "step": 7754 }, { "epoch": 0.6463333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.10439046223958333, "learning_rate": 4e-05, "loss": 5.3427, "loss/crossentropy": 2.2434877157211304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18906432762742043, "step": 7756 }, { "epoch": 0.6465, "grad_norm": 5.03125, "grad_norm_var": 0.09078369140625, "learning_rate": 4e-05, "loss": 4.9494, "loss/crossentropy": 2.0335726141929626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20091040432453156, "step": 7758 }, { "epoch": 0.6466666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.08088785807291667, "learning_rate": 4e-05, "loss": 5.1213, "loss/crossentropy": 2.3035090565681458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22087782993912697, "step": 7760 }, { "epoch": 0.6468333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.09371337890625, "learning_rate": 4e-05, "loss": 4.4378, "loss/crossentropy": 1.9872702360153198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20616934821009636, "step": 7762 }, { "epoch": 0.647, "grad_norm": 4.96875, "grad_norm_var": 0.09544270833333333, "learning_rate": 4e-05, "loss": 5.3507, "loss/crossentropy": 1.9482222869992256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16111544147133827, "step": 7764 }, { "epoch": 0.6471666666666667, "grad_norm": 4.625, "grad_norm_var": 0.09641927083333333, "learning_rate": 4e-05, "loss": 4.7896, "loss/crossentropy": 1.0095570906996727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13183519057929516, "step": 7766 }, { "epoch": 0.6473333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.09052327473958334, "learning_rate": 4e-05, "loss": 4.8455, "loss/crossentropy": 1.3340253606438637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13347483985126019, "step": 7768 }, { "epoch": 0.6475, "grad_norm": 4.78125, "grad_norm_var": 0.08826497395833334, "learning_rate": 4e-05, "loss": 5.1285, "loss/crossentropy": 2.3872081637382507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22220738232135773, "step": 7770 }, { "epoch": 0.6476666666666666, "grad_norm": 5.53125, "grad_norm_var": 0.077734375, "learning_rate": 4e-05, "loss": 4.8552, "loss/crossentropy": 1.511668123304844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15176920779049397, "step": 7772 }, { "epoch": 0.6478333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.074609375, "learning_rate": 4e-05, "loss": 4.8755, "loss/crossentropy": 2.030316300690174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17819524556398392, "step": 7774 }, { "epoch": 0.648, "grad_norm": 4.40625, "grad_norm_var": 0.07812093098958334, "learning_rate": 4e-05, "loss": 4.4122, "loss/crossentropy": 1.7088908702135086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17149026691913605, "step": 7776 }, { "epoch": 0.6481666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.073291015625, "learning_rate": 4e-05, "loss": 5.2213, "loss/crossentropy": 2.429815411567688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040964849293232, "step": 7778 }, { "epoch": 0.6483333333333333, "grad_norm": 5.0, "grad_norm_var": 0.071875, "learning_rate": 4e-05, "loss": 4.5178, "loss/crossentropy": 0.8023601695895195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13062872551381588, "step": 7780 }, { "epoch": 0.6485, "grad_norm": 4.90625, "grad_norm_var": 0.073291015625, "learning_rate": 4e-05, "loss": 4.7439, "loss/crossentropy": 2.149062544107437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019280605018139, "step": 7782 }, { "epoch": 0.6486666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.07073160807291666, "learning_rate": 4e-05, "loss": 5.0257, "loss/crossentropy": 1.923517182469368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21313883364200592, "step": 7784 }, { "epoch": 0.6488333333333334, "grad_norm": 5.0, "grad_norm_var": 0.11425374348958334, "learning_rate": 4e-05, "loss": 4.9594, "loss/crossentropy": 1.1835918948054314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14380118064582348, "step": 7786 }, { "epoch": 0.649, "grad_norm": 5.875, "grad_norm_var": 0.157275390625, "learning_rate": 4e-05, "loss": 5.5351, "loss/crossentropy": 2.353056013584137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2323654629290104, "step": 7788 }, { "epoch": 0.6491666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.16378580729166667, "learning_rate": 4e-05, "loss": 5.4731, "loss/crossentropy": 2.396900922060013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21475185081362724, "step": 7790 }, { "epoch": 0.6493333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.13557535807291668, "learning_rate": 4e-05, "loss": 5.4009, "loss/crossentropy": 1.9842062294483185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2102402187883854, "step": 7792 }, { "epoch": 0.6495, "grad_norm": 5.1875, "grad_norm_var": 0.134228515625, "learning_rate": 4e-05, "loss": 4.9706, "loss/crossentropy": 2.2572127282619476, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21562613546848297, "step": 7794 }, { "epoch": 0.6496666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.1216796875, "learning_rate": 4e-05, "loss": 4.9391, "loss/crossentropy": 1.6953379437327385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19858373701572418, "step": 7796 }, { "epoch": 0.6498333333333334, "grad_norm": 5.125, "grad_norm_var": 0.11487223307291666, "learning_rate": 4e-05, "loss": 4.6978, "loss/crossentropy": 1.352690041065216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1570559199899435, "step": 7798 }, { "epoch": 0.65, "grad_norm": 4.96875, "grad_norm_var": 0.10323893229166667, "learning_rate": 4e-05, "loss": 4.8123, "loss/crossentropy": 1.9550207555294037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17728200927376747, "step": 7800 }, { "epoch": 0.6501666666666667, "grad_norm": 5.0, "grad_norm_var": 0.08469645182291667, "learning_rate": 4e-05, "loss": 4.508, "loss/crossentropy": 2.0163797438144684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042006030678749, "step": 7802 }, { "epoch": 0.6503333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.032666015625, "learning_rate": 4e-05, "loss": 4.978, "loss/crossentropy": 2.063169986009598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2335309386253357, "step": 7804 }, { "epoch": 0.6505, "grad_norm": 4.625, "grad_norm_var": 0.03357747395833333, "learning_rate": 4e-05, "loss": 4.9353, "loss/crossentropy": 2.104892671108246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22495409473776817, "step": 7806 }, { "epoch": 0.6506666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.05907796223958333, "learning_rate": 4e-05, "loss": 5.4607, "loss/crossentropy": 2.243934750556946, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2479833886027336, "step": 7808 }, { "epoch": 0.6508333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.056473795572916666, "learning_rate": 4e-05, "loss": 4.7778, "loss/crossentropy": 1.9320513978600502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17580274306237698, "step": 7810 }, { "epoch": 0.651, "grad_norm": 4.59375, "grad_norm_var": 0.06705322265625, "learning_rate": 4e-05, "loss": 5.3106, "loss/crossentropy": 2.0712881311774254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777807716280222, "step": 7812 }, { "epoch": 0.6511666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.06222330729166667, "learning_rate": 4e-05, "loss": 4.9737, "loss/crossentropy": 2.4067393839359283, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23931296914815903, "step": 7814 }, { "epoch": 0.6513333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.061572265625, "learning_rate": 4e-05, "loss": 5.1568, "loss/crossentropy": 1.9289019256830215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855563879013062, "step": 7816 }, { "epoch": 0.6515, "grad_norm": 4.9375, "grad_norm_var": 0.06404622395833333, "learning_rate": 4e-05, "loss": 4.5316, "loss/crossentropy": 2.230655610561371, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22553424537181854, "step": 7818 }, { "epoch": 0.6516666666666666, "grad_norm": 4.75, "grad_norm_var": 0.062353515625, "learning_rate": 4e-05, "loss": 4.1795, "loss/crossentropy": 2.017218291759491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18330439925193787, "step": 7820 }, { "epoch": 0.6518333333333334, "grad_norm": 5.25, "grad_norm_var": 0.063916015625, "learning_rate": 4e-05, "loss": 5.1191, "loss/crossentropy": 1.7517078816890717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20479023829102516, "step": 7822 }, { "epoch": 0.652, "grad_norm": 4.78125, "grad_norm_var": 0.03424072265625, "learning_rate": 4e-05, "loss": 4.5937, "loss/crossentropy": 2.2532346844673157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2107340209186077, "step": 7824 }, { "epoch": 0.6521666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.03717447916666667, "learning_rate": 4e-05, "loss": 5.0257, "loss/crossentropy": 1.69417604804039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16969127021729946, "step": 7826 }, { "epoch": 0.6523333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.03306884765625, "learning_rate": 4e-05, "loss": 4.7318, "loss/crossentropy": 1.354017548263073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15083578042685986, "step": 7828 }, { "epoch": 0.6525, "grad_norm": 4.34375, "grad_norm_var": 0.0708984375, "learning_rate": 4e-05, "loss": 4.5279, "loss/crossentropy": 1.8364375233650208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1852422822266817, "step": 7830 }, { "epoch": 0.6526666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.074462890625, "learning_rate": 4e-05, "loss": 5.042, "loss/crossentropy": 2.439578354358673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2096551414579153, "step": 7832 }, { "epoch": 0.6528333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.0798828125, "learning_rate": 4e-05, "loss": 5.3277, "loss/crossentropy": 2.394729971885681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21963701769709587, "step": 7834 }, { "epoch": 0.653, "grad_norm": 4.8125, "grad_norm_var": 0.07838541666666667, "learning_rate": 4e-05, "loss": 4.3895, "loss/crossentropy": 1.419870764017105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18464216031134129, "step": 7836 }, { "epoch": 0.6531666666666667, "grad_norm": 5.71875, "grad_norm_var": 0.15162760416666668, "learning_rate": 4e-05, "loss": 5.1471, "loss/crossentropy": 1.7623902410268784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1748678907752037, "step": 7838 }, { "epoch": 0.6533333333333333, "grad_norm": 4.625, "grad_norm_var": 0.16365559895833334, "learning_rate": 4e-05, "loss": 4.3374, "loss/crossentropy": 1.9380313530564308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16397200338542461, "step": 7840 }, { "epoch": 0.6535, "grad_norm": 4.84375, "grad_norm_var": 0.15388997395833334, "learning_rate": 4e-05, "loss": 5.1512, "loss/crossentropy": 1.9005918502807617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17274107970297337, "step": 7842 }, { "epoch": 0.6536666666666666, "grad_norm": 4.625, "grad_norm_var": 0.16404622395833332, "learning_rate": 4e-05, "loss": 4.846, "loss/crossentropy": 1.4002360850572586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827119942754507, "step": 7844 }, { "epoch": 0.6538333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.12750244140625, "learning_rate": 4e-05, "loss": 5.2873, "loss/crossentropy": 2.5820748805999756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22619497776031494, "step": 7846 }, { "epoch": 0.654, "grad_norm": 5.4375, "grad_norm_var": 0.13632405598958333, "learning_rate": 4e-05, "loss": 5.1209, "loss/crossentropy": 2.4924808740615845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21206573769450188, "step": 7848 }, { "epoch": 0.6541666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.13551025390625, "learning_rate": 4e-05, "loss": 5.3657, "loss/crossentropy": 2.2897271811962128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21945805847644806, "step": 7850 }, { "epoch": 0.6543333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.13984375, "learning_rate": 4e-05, "loss": 5.1915, "loss/crossentropy": 2.0121906995773315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069852635264397, "step": 7852 }, { "epoch": 0.6545, "grad_norm": 4.625, "grad_norm_var": 0.06678059895833334, "learning_rate": 4e-05, "loss": 4.7018, "loss/crossentropy": 1.3283841013908386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14112715609371662, "step": 7854 }, { "epoch": 0.6546666666666666, "grad_norm": 4.875, "grad_norm_var": 0.059370930989583334, "learning_rate": 4e-05, "loss": 4.966, "loss/crossentropy": 2.381036549806595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22558944672346115, "step": 7856 }, { "epoch": 0.6548333333333334, "grad_norm": 4.875, "grad_norm_var": 0.06064046223958333, "learning_rate": 4e-05, "loss": 4.6811, "loss/crossentropy": 2.217640519142151, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052822895348072, "step": 7858 }, { "epoch": 0.655, "grad_norm": 5.0, "grad_norm_var": 0.05896809895833333, "learning_rate": 4e-05, "loss": 4.4663, "loss/crossentropy": 1.9649121761322021, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994537878781557, "step": 7860 }, { "epoch": 0.6551666666666667, "grad_norm": 5.0, "grad_norm_var": 0.058837890625, "learning_rate": 4e-05, "loss": 4.5006, "loss/crossentropy": 1.4738883003592491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20674226433038712, "step": 7862 }, { "epoch": 0.6553333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.06842041015625, "learning_rate": 4e-05, "loss": 5.0418, "loss/crossentropy": 2.4609346985816956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20670482888817787, "step": 7864 }, { "epoch": 0.6555, "grad_norm": 4.65625, "grad_norm_var": 0.05831705729166667, "learning_rate": 4e-05, "loss": 4.8698, "loss/crossentropy": 2.553082287311554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.218223724514246, "step": 7866 }, { "epoch": 0.6556666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.06835530598958334, "learning_rate": 4e-05, "loss": 5.264, "loss/crossentropy": 2.2438072860240936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2258710041642189, "step": 7868 }, { "epoch": 0.6558333333333334, "grad_norm": 4.3125, "grad_norm_var": 0.08420817057291667, "learning_rate": 4e-05, "loss": 4.5106, "loss/crossentropy": 1.7579245939850807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17779571935534477, "step": 7870 }, { "epoch": 0.656, "grad_norm": 4.75, "grad_norm_var": 0.07965087890625, "learning_rate": 4e-05, "loss": 4.7868, "loss/crossentropy": 2.037400543689728, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17647516541182995, "step": 7872 }, { "epoch": 0.6561666666666667, "grad_norm": 5.28125, "grad_norm_var": 0.10354410807291667, "learning_rate": 4e-05, "loss": 4.9759, "loss/crossentropy": 2.227533906698227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21963432803750038, "step": 7874 }, { "epoch": 0.6563333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.09895426432291667, "learning_rate": 4e-05, "loss": 4.9878, "loss/crossentropy": 2.487546145915985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22717060893774033, "step": 7876 }, { "epoch": 0.6565, "grad_norm": 4.53125, "grad_norm_var": 0.10475260416666667, "learning_rate": 4e-05, "loss": 4.3423, "loss/crossentropy": 1.5704541355371475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1635904349386692, "step": 7878 }, { "epoch": 0.6566666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.07662760416666667, "learning_rate": 4e-05, "loss": 5.1448, "loss/crossentropy": 2.16184538602829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24196895584464073, "step": 7880 }, { "epoch": 0.6568333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.07323811848958334, "learning_rate": 4e-05, "loss": 5.1361, "loss/crossentropy": 2.436331033706665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24281031265854836, "step": 7882 }, { "epoch": 0.657, "grad_norm": 4.5, "grad_norm_var": 0.07433268229166666, "learning_rate": 4e-05, "loss": 4.6996, "loss/crossentropy": 2.312442362308502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2044270522892475, "step": 7884 }, { "epoch": 0.6571666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.06145833333333333, "learning_rate": 4e-05, "loss": 5.1136, "loss/crossentropy": 2.0718987584114075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22442586347460747, "step": 7886 }, { "epoch": 0.6573333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.06789957682291667, "learning_rate": 4e-05, "loss": 5.0928, "loss/crossentropy": 2.368440628051758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2424367070198059, "step": 7888 }, { "epoch": 0.6575, "grad_norm": 5.65625, "grad_norm_var": 0.08917643229166666, "learning_rate": 4e-05, "loss": 5.4366, "loss/crossentropy": 1.8111486434936523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17810833640396595, "step": 7890 }, { "epoch": 0.6576666666666666, "grad_norm": 5.0, "grad_norm_var": 0.08948160807291666, "learning_rate": 4e-05, "loss": 5.411, "loss/crossentropy": 2.2830857932567596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2076493538916111, "step": 7892 }, { "epoch": 0.6578333333333334, "grad_norm": 4.875, "grad_norm_var": 0.08710530598958334, "learning_rate": 4e-05, "loss": 4.8047, "loss/crossentropy": 1.8694885224103928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18636782839894295, "step": 7894 }, { "epoch": 0.658, "grad_norm": 5.125, "grad_norm_var": 0.09218343098958333, "learning_rate": 4e-05, "loss": 4.9802, "loss/crossentropy": 2.084577538073063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19515875913202763, "step": 7896 }, { "epoch": 0.6581666666666667, "grad_norm": 4.375, "grad_norm_var": 0.09505208333333333, "learning_rate": 4e-05, "loss": 4.1378, "loss/crossentropy": 2.35347381234169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1936819739639759, "step": 7898 }, { "epoch": 0.6583333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.09334309895833333, "learning_rate": 4e-05, "loss": 5.2054, "loss/crossentropy": 2.1593450531363487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18632697872817516, "step": 7900 }, { "epoch": 0.6585, "grad_norm": 5.3125, "grad_norm_var": 0.10419514973958334, "learning_rate": 4e-05, "loss": 5.343, "loss/crossentropy": 2.141828954219818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113342024385929, "step": 7902 }, { "epoch": 0.6586666666666666, "grad_norm": 5.25, "grad_norm_var": 0.11951497395833334, "learning_rate": 4e-05, "loss": 5.1705, "loss/crossentropy": 2.043430596590042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20793258398771286, "step": 7904 }, { "epoch": 0.6588333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.07897135416666666, "learning_rate": 4e-05, "loss": 4.4168, "loss/crossentropy": 2.251025438308716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22544211894273758, "step": 7906 }, { "epoch": 0.659, "grad_norm": 5.1875, "grad_norm_var": 0.08293863932291666, "learning_rate": 4e-05, "loss": 5.2882, "loss/crossentropy": 1.9112261459231377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17405965924263, "step": 7908 }, { "epoch": 0.6591666666666667, "grad_norm": 4.375, "grad_norm_var": 0.103125, "learning_rate": 4e-05, "loss": 4.5415, "loss/crossentropy": 2.078736662864685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160506434738636, "step": 7910 }, { "epoch": 0.6593333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.10813802083333333, "learning_rate": 4e-05, "loss": 4.7367, "loss/crossentropy": 0.9118631333112717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10875993594527245, "step": 7912 }, { "epoch": 0.6595, "grad_norm": 4.75, "grad_norm_var": 0.08681233723958333, "learning_rate": 4e-05, "loss": 4.7211, "loss/crossentropy": 2.090813308954239, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23957878351211548, "step": 7914 }, { "epoch": 0.6596666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.09996337890625, "learning_rate": 4e-05, "loss": 4.9209, "loss/crossentropy": 1.5122576355934143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19297392293810844, "step": 7916 }, { "epoch": 0.6598333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.08917643229166666, "learning_rate": 4e-05, "loss": 5.0074, "loss/crossentropy": 2.0265481024980545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22811248153448105, "step": 7918 }, { "epoch": 0.66, "grad_norm": 4.5625, "grad_norm_var": 0.07057291666666667, "learning_rate": 4e-05, "loss": 4.9616, "loss/crossentropy": 1.9131137356162071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1730040479451418, "step": 7920 }, { "epoch": 0.6601666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.08290608723958333, "learning_rate": 4e-05, "loss": 4.8872, "loss/crossentropy": 2.4343717992305756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.30416465178132057, "step": 7922 }, { "epoch": 0.6603333333333333, "grad_norm": 4.25, "grad_norm_var": 0.09394124348958334, "learning_rate": 4e-05, "loss": 4.7282, "loss/crossentropy": 1.0288232266902924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13559334725141525, "step": 7924 }, { "epoch": 0.6605, "grad_norm": 5.03125, "grad_norm_var": 0.08899739583333334, "learning_rate": 4e-05, "loss": 5.2148, "loss/crossentropy": 2.2694281935691833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19374223798513412, "step": 7926 }, { "epoch": 0.6606666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.07346598307291667, "learning_rate": 4e-05, "loss": 4.9215, "loss/crossentropy": 2.181885540485382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091890163719654, "step": 7928 }, { "epoch": 0.6608333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.07870686848958333, "learning_rate": 4e-05, "loss": 4.5869, "loss/crossentropy": 1.8901971653103828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878662332892418, "step": 7930 }, { "epoch": 0.661, "grad_norm": 5.3125, "grad_norm_var": 0.083056640625, "learning_rate": 4e-05, "loss": 4.8568, "loss/crossentropy": 1.9051894173026085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19481675140559673, "step": 7932 }, { "epoch": 0.6611666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.08756103515625, "learning_rate": 4e-05, "loss": 4.6283, "loss/crossentropy": 1.8722472786903381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24349390342831612, "step": 7934 }, { "epoch": 0.6613333333333333, "grad_norm": 5.5625, "grad_norm_var": 0.10948893229166666, "learning_rate": 4e-05, "loss": 4.563, "loss/crossentropy": 1.7739543914794922, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16185857728123665, "step": 7936 }, { "epoch": 0.6615, "grad_norm": 4.6875, "grad_norm_var": 0.10233968098958333, "learning_rate": 4e-05, "loss": 5.0303, "loss/crossentropy": 2.4332188963890076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20997020602226257, "step": 7938 }, { "epoch": 0.6616666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.07991129557291667, "learning_rate": 4e-05, "loss": 5.1234, "loss/crossentropy": 2.0778674632310867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.176247363910079, "step": 7940 }, { "epoch": 0.6618333333333334, "grad_norm": 4.875, "grad_norm_var": 0.06897379557291666, "learning_rate": 4e-05, "loss": 5.1767, "loss/crossentropy": 2.163965940475464, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073141485452652, "step": 7942 }, { "epoch": 0.662, "grad_norm": 4.1875, "grad_norm_var": 0.10546468098958334, "learning_rate": 4e-05, "loss": 4.2894, "loss/crossentropy": 0.9995723515748978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11586864292621613, "step": 7944 }, { "epoch": 0.6621666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.10299072265625, "learning_rate": 4e-05, "loss": 4.6738, "loss/crossentropy": 2.281874358654022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23054268583655357, "step": 7946 }, { "epoch": 0.6623333333333333, "grad_norm": 4.875, "grad_norm_var": 0.09205322265625, "learning_rate": 4e-05, "loss": 4.9369, "loss/crossentropy": 2.096975475549698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18830178305506706, "step": 7948 }, { "epoch": 0.6625, "grad_norm": 5.0, "grad_norm_var": 0.09138997395833333, "learning_rate": 4e-05, "loss": 4.9355, "loss/crossentropy": 1.595092996954918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1848563738167286, "step": 7950 }, { "epoch": 0.6626666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.06002197265625, "learning_rate": 4e-05, "loss": 5.0445, "loss/crossentropy": 1.7920393347740173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1907125525176525, "step": 7952 }, { "epoch": 0.6628333333333334, "grad_norm": 4.75, "grad_norm_var": 0.05959879557291667, "learning_rate": 4e-05, "loss": 4.4764, "loss/crossentropy": 1.4217949956655502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15368330664932728, "step": 7954 }, { "epoch": 0.663, "grad_norm": 4.8125, "grad_norm_var": 0.05276285807291667, "learning_rate": 4e-05, "loss": 5.1335, "loss/crossentropy": 2.128646582365036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20609202608466148, "step": 7956 }, { "epoch": 0.6631666666666667, "grad_norm": 4.1875, "grad_norm_var": 0.07659098307291666, "learning_rate": 4e-05, "loss": 4.6351, "loss/crossentropy": 1.244727723300457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13210402987897396, "step": 7958 }, { "epoch": 0.6633333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.05608317057291667, "learning_rate": 4e-05, "loss": 4.7433, "loss/crossentropy": 1.747454211115837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16521551832556725, "step": 7960 }, { "epoch": 0.6635, "grad_norm": 5.03125, "grad_norm_var": 0.05468343098958333, "learning_rate": 4e-05, "loss": 4.939, "loss/crossentropy": 2.3972195982933044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21212521940469742, "step": 7962 }, { "epoch": 0.6636666666666666, "grad_norm": 4.625, "grad_norm_var": 0.050244140625, "learning_rate": 4e-05, "loss": 4.5736, "loss/crossentropy": 2.157024621963501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1839892379939556, "step": 7964 }, { "epoch": 0.6638333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.04920247395833333, "learning_rate": 4e-05, "loss": 4.3667, "loss/crossentropy": 1.5357392206788063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14107412099838257, "step": 7966 }, { "epoch": 0.664, "grad_norm": 4.40625, "grad_norm_var": 0.07037760416666666, "learning_rate": 4e-05, "loss": 4.8995, "loss/crossentropy": 2.07469192892313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1992726493626833, "step": 7968 }, { "epoch": 0.6641666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.07597249348958333, "learning_rate": 4e-05, "loss": 5.0096, "loss/crossentropy": 1.7797765955328941, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16863209381699562, "step": 7970 }, { "epoch": 0.6643333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.08508707682291666, "learning_rate": 4e-05, "loss": 4.9404, "loss/crossentropy": 1.9922729581594467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18705208972096443, "step": 7972 }, { "epoch": 0.6645, "grad_norm": 4.875, "grad_norm_var": 0.054427083333333334, "learning_rate": 4e-05, "loss": 4.8838, "loss/crossentropy": 2.2012872993946075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21382101997733116, "step": 7974 }, { "epoch": 0.6646666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.04801025390625, "learning_rate": 4e-05, "loss": 4.9927, "loss/crossentropy": 1.9671935513615608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18664169497787952, "step": 7976 }, { "epoch": 0.6648333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.0658203125, "learning_rate": 4e-05, "loss": 4.6905, "loss/crossentropy": 1.8137053772807121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16209018975496292, "step": 7978 }, { "epoch": 0.665, "grad_norm": 4.84375, "grad_norm_var": 0.063134765625, "learning_rate": 4e-05, "loss": 4.7317, "loss/crossentropy": 2.605897605419159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2145151011645794, "step": 7980 }, { "epoch": 0.6651666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.06926676432291666, "learning_rate": 4e-05, "loss": 4.7387, "loss/crossentropy": 1.8791739642620087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18151278793811798, "step": 7982 }, { "epoch": 0.6653333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.049332682291666666, "learning_rate": 4e-05, "loss": 4.7535, "loss/crossentropy": 2.591217875480652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23360199108719826, "step": 7984 }, { "epoch": 0.6655, "grad_norm": 5.0, "grad_norm_var": 0.07307535807291667, "learning_rate": 4e-05, "loss": 5.5989, "loss/crossentropy": 2.3981366753578186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21705374494194984, "step": 7986 }, { "epoch": 0.6656666666666666, "grad_norm": 5.375, "grad_norm_var": 0.081884765625, "learning_rate": 4e-05, "loss": 4.7857, "loss/crossentropy": 2.2135126292705536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21088680252432823, "step": 7988 }, { "epoch": 0.6658333333333334, "grad_norm": 5.53125, "grad_norm_var": 0.10944010416666666, "learning_rate": 4e-05, "loss": 5.1127, "loss/crossentropy": 2.6215781569480896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20877355709671974, "step": 7990 }, { "epoch": 0.666, "grad_norm": 5.09375, "grad_norm_var": 0.11129150390625, "learning_rate": 4e-05, "loss": 4.6155, "loss/crossentropy": 1.5245660692453384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1763434261083603, "step": 7992 }, { "epoch": 0.6661666666666667, "grad_norm": 5.0, "grad_norm_var": 0.08307291666666666, "learning_rate": 4e-05, "loss": 5.0979, "loss/crossentropy": 1.9757955074310303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18706656992435455, "step": 7994 }, { "epoch": 0.6663333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.07701822916666666, "learning_rate": 4e-05, "loss": 5.22, "loss/crossentropy": 2.126909226179123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23862241953611374, "step": 7996 }, { "epoch": 0.6665, "grad_norm": 4.84375, "grad_norm_var": 0.075244140625, "learning_rate": 4e-05, "loss": 4.9789, "loss/crossentropy": 2.364075005054474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21161803603172302, "step": 7998 }, { "epoch": 0.6666666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.06415608723958334, "learning_rate": 4e-05, "loss": 4.4117, "loss/crossentropy": 2.0129463002085686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17588723078370094, "step": 8000 }, { "epoch": 0.6668333333333333, "grad_norm": 4.625, "grad_norm_var": 0.05810139973958333, "learning_rate": 3.999998026079526e-05, "loss": 4.7781, "loss/crossentropy": 1.8196282386779785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22529863938689232, "step": 8002 }, { "epoch": 0.667, "grad_norm": 4.875, "grad_norm_var": 0.0544921875, "learning_rate": 3.9999921043229736e-05, "loss": 4.6482, "loss/crossentropy": 2.20221546292305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19770921766757965, "step": 8004 }, { "epoch": 0.6671666666666667, "grad_norm": 4.75, "grad_norm_var": 0.029410807291666667, "learning_rate": 3.9999822347449543e-05, "loss": 5.1931, "loss/crossentropy": 1.919562578201294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418702088296413, "step": 8006 }, { "epoch": 0.6673333333333333, "grad_norm": 4.875, "grad_norm_var": 0.02633056640625, "learning_rate": 3.99996841736982e-05, "loss": 5.1969, "loss/crossentropy": 2.100528210401535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1811746470630169, "step": 8008 }, { "epoch": 0.6675, "grad_norm": 4.96875, "grad_norm_var": 0.02330322265625, "learning_rate": 3.999950652231664e-05, "loss": 4.5698, "loss/crossentropy": 1.828367918729782, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19560833647847176, "step": 8010 }, { "epoch": 0.6676666666666666, "grad_norm": 4.65625, "grad_norm_var": 1940.4690714518229, "learning_rate": 3.99992893937432e-05, "loss": 4.8554, "loss/crossentropy": 2.049312300980091, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17662812024354935, "step": 8012 }, { "epoch": 0.6678333333333333, "grad_norm": 5.03125, "grad_norm_var": 1939.649149576823, "learning_rate": 3.9999032788513625e-05, "loss": 5.1692, "loss/crossentropy": 2.4652374386787415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22743260487914085, "step": 8014 }, { "epoch": 0.668, "grad_norm": 4.46875, "grad_norm_var": 1940.2542317708333, "learning_rate": 3.999873670726106e-05, "loss": 4.8301, "loss/crossentropy": 1.8886771276593208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19605330377817154, "step": 8016 }, { "epoch": 0.6681666666666667, "grad_norm": 5.15625, "grad_norm_var": 1938.9496704101562, "learning_rate": 3.999840115071606e-05, "loss": 5.126, "loss/crossentropy": 2.337659776210785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21413381025195122, "step": 8018 }, { "epoch": 0.6683333333333333, "grad_norm": 4.40625, "grad_norm_var": 1938.8692993164063, "learning_rate": 3.9998026119706576e-05, "loss": 4.5696, "loss/crossentropy": 2.1056962609291077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2247220054268837, "step": 8020 }, { "epoch": 0.6685, "grad_norm": 5.25, "grad_norm_var": 1937.8184733072917, "learning_rate": 3.999761161515795e-05, "loss": 5.6617, "loss/crossentropy": 2.4179972410202026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.223285723477602, "step": 8022 }, { "epoch": 0.6686666666666666, "grad_norm": 5.25, "grad_norm_var": 1937.643603515625, "learning_rate": 3.9997157638092944e-05, "loss": 5.005, "loss/crossentropy": 1.895853579044342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043449804186821, "step": 8024 }, { "epoch": 0.6688333333333333, "grad_norm": 4.46875, "grad_norm_var": 1937.6156860351562, "learning_rate": 3.999666418963171e-05, "loss": 4.7058, "loss/crossentropy": 2.025197595357895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19659354910254478, "step": 8026 }, { "epoch": 0.669, "grad_norm": 4.8125, "grad_norm_var": 0.107666015625, "learning_rate": 3.999613127099175e-05, "loss": 4.2922, "loss/crossentropy": 1.6821075975894928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1879300493746996, "step": 8028 }, { "epoch": 0.6691666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.10526936848958333, "learning_rate": 3.999555888348801e-05, "loss": 4.9131, "loss/crossentropy": 2.2622806429862976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22173702344298363, "step": 8030 }, { "epoch": 0.6693333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.092431640625, "learning_rate": 3.99949470285328e-05, "loss": 5.4962, "loss/crossentropy": 2.7419689893722534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21366914361715317, "step": 8032 }, { "epoch": 0.6695, "grad_norm": 6.84375, "grad_norm_var": 0.3025390625, "learning_rate": 3.999429570763581e-05, "loss": 5.3351, "loss/crossentropy": 2.4165670573711395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20715122669935226, "step": 8034 }, { "epoch": 0.6696666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.27447509765625, "learning_rate": 3.999360492240411e-05, "loss": 5.1791, "loss/crossentropy": 1.8621023744344711, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17758541740477085, "step": 8036 }, { "epoch": 0.6698333333333333, "grad_norm": 5.96875, "grad_norm_var": 0.32437744140625, "learning_rate": 3.999287467454214e-05, "loss": 5.3983, "loss/crossentropy": 1.1967885345220566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17341040447354317, "step": 8038 }, { "epoch": 0.67, "grad_norm": 5.09375, "grad_norm_var": 0.3204264322916667, "learning_rate": 3.999210496585171e-05, "loss": 5.1228, "loss/crossentropy": 1.89522323012352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1980042066425085, "step": 8040 }, { "epoch": 0.6701666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.30373942057291664, "learning_rate": 3.9991295798232e-05, "loss": 5.0264, "loss/crossentropy": 1.2332193851470947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1300823837518692, "step": 8042 }, { "epoch": 0.6703333333333333, "grad_norm": 4.375, "grad_norm_var": 0.4376953125, "learning_rate": 3.999044717367957e-05, "loss": 4.7728, "loss/crossentropy": 2.335726737976074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21574588492512703, "step": 8044 }, { "epoch": 0.6705, "grad_norm": 5.125, "grad_norm_var": 0.4343058268229167, "learning_rate": 3.99895590942883e-05, "loss": 4.919, "loss/crossentropy": 2.3052841424942017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2212902568280697, "step": 8046 }, { "epoch": 0.6706666666666666, "grad_norm": 4.75, "grad_norm_var": 0.4610514322916667, "learning_rate": 3.9988631562249435e-05, "loss": 4.6745, "loss/crossentropy": 2.4768862426280975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21305079013109207, "step": 8048 }, { "epoch": 0.6708333333333333, "grad_norm": 4.5, "grad_norm_var": 0.28905843098958334, "learning_rate": 3.9987664579851574e-05, "loss": 4.3145, "loss/crossentropy": 1.7946585342288017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17041044309735298, "step": 8050 }, { "epoch": 0.671, "grad_norm": 5.03125, "grad_norm_var": 0.2899576822916667, "learning_rate": 3.998665814948065e-05, "loss": 5.0085, "loss/crossentropy": 2.1682648360729218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21787675842642784, "step": 8052 }, { "epoch": 0.6711666666666667, "grad_norm": 4.75, "grad_norm_var": 0.23605143229166667, "learning_rate": 3.9985612273619924e-05, "loss": 4.1648, "loss/crossentropy": 0.8689647540450096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1191219910979271, "step": 8054 }, { "epoch": 0.6713333333333333, "grad_norm": 4.875, "grad_norm_var": 0.2359375, "learning_rate": 3.9984526954850003e-05, "loss": 4.8656, "loss/crossentropy": 1.978205069899559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915741264820099, "step": 8056 }, { "epoch": 0.6715, "grad_norm": 4.78125, "grad_norm_var": 0.2380859375, "learning_rate": 3.9983402195848796e-05, "loss": 4.519, "loss/crossentropy": 1.9606445729732513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19727950543165207, "step": 8058 }, { "epoch": 0.6716666666666666, "grad_norm": 4.375, "grad_norm_var": 0.051285807291666666, "learning_rate": 3.998223799939153e-05, "loss": 4.43, "loss/crossentropy": 1.5983156114816666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1602168083190918, "step": 8060 }, { "epoch": 0.6718333333333333, "grad_norm": 4.625, "grad_norm_var": 0.06291910807291666, "learning_rate": 3.9981034368350744e-05, "loss": 5.0673, "loss/crossentropy": 2.5904553532600403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2175530567765236, "step": 8062 }, { "epoch": 0.672, "grad_norm": 5.34375, "grad_norm_var": 0.09251302083333333, "learning_rate": 3.997979130569628e-05, "loss": 5.1659, "loss/crossentropy": 2.0036058127880096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17185713909566402, "step": 8064 }, { "epoch": 0.6721666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.08782145182291666, "learning_rate": 3.9978508814495287e-05, "loss": 5.0144, "loss/crossentropy": 1.981281191110611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.172686118632555, "step": 8066 }, { "epoch": 0.6723333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.08162434895833333, "learning_rate": 3.9977186897912166e-05, "loss": 4.8809, "loss/crossentropy": 1.9285836815834045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19455789402127266, "step": 8068 }, { "epoch": 0.6725, "grad_norm": 4.375, "grad_norm_var": 0.09081624348958334, "learning_rate": 3.997582555920861e-05, "loss": 4.3465, "loss/crossentropy": 0.8334982395172119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1363061834126711, "step": 8070 }, { "epoch": 0.6726666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.1041015625, "learning_rate": 3.997442480174361e-05, "loss": 5.0726, "loss/crossentropy": 1.5748215094208717, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17971625551581383, "step": 8072 }, { "epoch": 0.6728333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.10546468098958334, "learning_rate": 3.997298462897336e-05, "loss": 4.7462, "loss/crossentropy": 1.3154399320483208, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15049860626459122, "step": 8074 }, { "epoch": 0.673, "grad_norm": 4.9375, "grad_norm_var": 0.08870035807291667, "learning_rate": 3.99715050444514e-05, "loss": 5.0491, "loss/crossentropy": 0.9168932139873505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11337370611727238, "step": 8076 }, { "epoch": 0.6731666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.07183837890625, "learning_rate": 3.9969986051828394e-05, "loss": 4.5893, "loss/crossentropy": 2.0407353341579437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2022136151790619, "step": 8078 }, { "epoch": 0.6733333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.04529622395833333, "learning_rate": 3.996842765485235e-05, "loss": 5.294, "loss/crossentropy": 2.0593449771404266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1811201088130474, "step": 8080 }, { "epoch": 0.6735, "grad_norm": 5.03125, "grad_norm_var": 0.0453125, "learning_rate": 3.9966829857368434e-05, "loss": 4.9266, "loss/crossentropy": 1.1097619906067848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1245882660150528, "step": 8082 }, { "epoch": 0.6736666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.052046712239583334, "learning_rate": 3.996519266331907e-05, "loss": 4.6228, "loss/crossentropy": 1.8012422546744347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16311753541231155, "step": 8084 }, { "epoch": 0.6738333333333333, "grad_norm": 4.75, "grad_norm_var": 0.042867024739583336, "learning_rate": 3.9963516076743856e-05, "loss": 4.9789, "loss/crossentropy": 2.0890884697437286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20314529910683632, "step": 8086 }, { "epoch": 0.674, "grad_norm": 4.8125, "grad_norm_var": 0.03216145833333333, "learning_rate": 3.996180010177961e-05, "loss": 4.9535, "loss/crossentropy": 1.5582296922802925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15331477485597134, "step": 8088 }, { "epoch": 0.6741666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.02750244140625, "learning_rate": 3.996004474266033e-05, "loss": 5.154, "loss/crossentropy": 2.3426169753074646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2202622890472412, "step": 8090 }, { "epoch": 0.6743333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.030301920572916665, "learning_rate": 3.9958250003717184e-05, "loss": 5.4054, "loss/crossentropy": 2.036056786775589, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18563580885529518, "step": 8092 }, { "epoch": 0.6745, "grad_norm": 4.6875, "grad_norm_var": 0.049723307291666664, "learning_rate": 3.995641588937852e-05, "loss": 4.4591, "loss/crossentropy": 1.8210117146372795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20190994441509247, "step": 8094 }, { "epoch": 0.6746666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.051005045572916664, "learning_rate": 3.995454240416982e-05, "loss": 5.1467, "loss/crossentropy": 1.740770123898983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19278324209153652, "step": 8096 }, { "epoch": 0.6748333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.05422770182291667, "learning_rate": 3.9952629552713745e-05, "loss": 4.8513, "loss/crossentropy": 2.1134003698825836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23501833155751228, "step": 8098 }, { "epoch": 0.675, "grad_norm": 4.75, "grad_norm_var": 0.03857014973958333, "learning_rate": 3.995067733973005e-05, "loss": 5.3558, "loss/crossentropy": 1.907908834517002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19258636608719826, "step": 8100 }, { "epoch": 0.6751666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.03892822265625, "learning_rate": 3.994868577003563e-05, "loss": 4.6607, "loss/crossentropy": 1.755495123565197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17224089056253433, "step": 8102 }, { "epoch": 0.6753333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.0474609375, "learning_rate": 3.9946654848544477e-05, "loss": 4.3648, "loss/crossentropy": 1.9235477447509766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1895141527056694, "step": 8104 }, { "epoch": 0.6755, "grad_norm": 4.625, "grad_norm_var": 0.051106770833333336, "learning_rate": 3.9944584580267706e-05, "loss": 4.8674, "loss/crossentropy": 2.156323105096817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22421907261013985, "step": 8106 }, { "epoch": 0.6756666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.047770182291666664, "learning_rate": 3.9942474970313485e-05, "loss": 4.2895, "loss/crossentropy": 2.0829486325383186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18314629420638084, "step": 8108 }, { "epoch": 0.6758333333333333, "grad_norm": 4.15625, "grad_norm_var": 0.05015869140625, "learning_rate": 3.994032602388706e-05, "loss": 4.8896, "loss/crossentropy": 1.6454570293426514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16752009466290474, "step": 8110 }, { "epoch": 0.676, "grad_norm": 4.75, "grad_norm_var": 0.043603515625, "learning_rate": 3.993813774629076e-05, "loss": 4.8309, "loss/crossentropy": 2.204928934574127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20959094911813736, "step": 8112 }, { "epoch": 0.6761666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.052978515625, "learning_rate": 3.9935910142923934e-05, "loss": 4.8202, "loss/crossentropy": 1.6364878937602043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1698533445596695, "step": 8114 }, { "epoch": 0.6763333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.04820556640625, "learning_rate": 3.993364321928298e-05, "loss": 4.8303, "loss/crossentropy": 1.7198282107710838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16924752667546272, "step": 8116 }, { "epoch": 0.6765, "grad_norm": 4.5, "grad_norm_var": 0.050093587239583334, "learning_rate": 3.993133698096129e-05, "loss": 4.3707, "loss/crossentropy": 1.4806988760828972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13833208149299026, "step": 8118 }, { "epoch": 0.6766666666666666, "grad_norm": 4.34375, "grad_norm_var": 0.09440104166666667, "learning_rate": 3.9928991433649284e-05, "loss": 4.9286, "loss/crossentropy": 2.1212473809719086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18642381206154823, "step": 8120 }, { "epoch": 0.6768333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.09680582682291666, "learning_rate": 3.992660658313438e-05, "loss": 4.7489, "loss/crossentropy": 1.7608193159103394, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17317914962768555, "step": 8122 }, { "epoch": 0.677, "grad_norm": 4.5, "grad_norm_var": 0.10084635416666667, "learning_rate": 3.992418243530094e-05, "loss": 4.4682, "loss/crossentropy": 1.4573740735650063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14598331600427628, "step": 8124 }, { "epoch": 0.6771666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.081494140625, "learning_rate": 3.9921718996130326e-05, "loss": 4.8036, "loss/crossentropy": 2.064685195684433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20185653865337372, "step": 8126 }, { "epoch": 0.6773333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.10362955729166666, "learning_rate": 3.991921627170081e-05, "loss": 4.5552, "loss/crossentropy": 1.5148289203643799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1409279704093933, "step": 8128 }, { "epoch": 0.6775, "grad_norm": 4.8125, "grad_norm_var": 0.10191650390625, "learning_rate": 3.9916674268187625e-05, "loss": 5.0554, "loss/crossentropy": 2.1727925539016724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20459099113941193, "step": 8130 }, { "epoch": 0.6776666666666666, "grad_norm": 5.0, "grad_norm_var": 0.10625, "learning_rate": 3.991409299186292e-05, "loss": 4.9051, "loss/crossentropy": 1.7988258972764015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21256174892187119, "step": 8132 }, { "epoch": 0.6778333333333333, "grad_norm": 4.75, "grad_norm_var": 0.10076497395833334, "learning_rate": 3.9911472449095726e-05, "loss": 5.0322, "loss/crossentropy": 2.352916330099106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2223893664777279, "step": 8134 }, { "epoch": 0.678, "grad_norm": 4.6875, "grad_norm_var": 0.058394368489583334, "learning_rate": 3.990881264635198e-05, "loss": 5.0109, "loss/crossentropy": 1.9112081602215767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17446013167500496, "step": 8136 }, { "epoch": 0.6781666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.05572509765625, "learning_rate": 3.990611359019449e-05, "loss": 5.1502, "loss/crossentropy": 1.8885821998119354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19072378613054752, "step": 8138 }, { "epoch": 0.6783333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.05416259765625, "learning_rate": 3.9903375287282886e-05, "loss": 4.6403, "loss/crossentropy": 1.7756406664848328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20463722944259644, "step": 8140 }, { "epoch": 0.6785, "grad_norm": 4.90625, "grad_norm_var": 0.05475260416666667, "learning_rate": 3.990059774437366e-05, "loss": 4.6098, "loss/crossentropy": 1.41706994920969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16518711298704147, "step": 8142 }, { "epoch": 0.6786666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.04700113932291667, "learning_rate": 3.989778096832014e-05, "loss": 4.5598, "loss/crossentropy": 1.7809503972530365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18425824865698814, "step": 8144 }, { "epoch": 0.6788333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.043192545572916664, "learning_rate": 3.989492496607243e-05, "loss": 4.8872, "loss/crossentropy": 1.4895486384630203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15744570270180702, "step": 8146 }, { "epoch": 0.679, "grad_norm": 4.8125, "grad_norm_var": 0.03967692057291667, "learning_rate": 3.989202974467744e-05, "loss": 4.9619, "loss/crossentropy": 2.420316845178604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21129243820905685, "step": 8148 }, { "epoch": 0.6791666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.04114176432291667, "learning_rate": 3.988909531127883e-05, "loss": 4.6627, "loss/crossentropy": 1.2393221259117126, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15679911896586418, "step": 8150 }, { "epoch": 0.6793333333333333, "grad_norm": 5.28125, "grad_norm_var": 0.05513916015625, "learning_rate": 3.988612167311703e-05, "loss": 5.4727, "loss/crossentropy": 1.9080857932567596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22484339028596878, "step": 8152 }, { "epoch": 0.6795, "grad_norm": 4.8125, "grad_norm_var": 0.060009765625, "learning_rate": 3.988310883752918e-05, "loss": 5.0678, "loss/crossentropy": 2.225371241569519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21629876643419266, "step": 8154 }, { "epoch": 0.6796666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.055013020833333336, "learning_rate": 3.9880056811949186e-05, "loss": 4.8673, "loss/crossentropy": 1.2217218354344368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14047273807227612, "step": 8156 }, { "epoch": 0.6798333333333333, "grad_norm": 4.5, "grad_norm_var": 0.05299072265625, "learning_rate": 3.9876965603907585e-05, "loss": 4.8334, "loss/crossentropy": 1.7158519104123116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16879184916615486, "step": 8158 }, { "epoch": 0.68, "grad_norm": 4.78125, "grad_norm_var": 0.042561848958333336, "learning_rate": 3.987383522103165e-05, "loss": 4.9505, "loss/crossentropy": 2.0144409984350204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20102826319634914, "step": 8160 }, { "epoch": 0.6801666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.0390625, "learning_rate": 3.987066567104528e-05, "loss": 5.6142, "loss/crossentropy": 2.599462568759918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125120908021927, "step": 8162 }, { "epoch": 0.6803333333333333, "grad_norm": 5.125, "grad_norm_var": 0.05136311848958333, "learning_rate": 3.986745696176901e-05, "loss": 5.547, "loss/crossentropy": 2.1270923539996147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18128559738397598, "step": 8164 }, { "epoch": 0.6805, "grad_norm": 5.21875, "grad_norm_var": 0.053759765625, "learning_rate": 3.986420910112003e-05, "loss": 5.038, "loss/crossentropy": 1.7307285517454147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1786797009408474, "step": 8166 }, { "epoch": 0.6806666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.06728108723958333, "learning_rate": 3.986092209711211e-05, "loss": 4.7684, "loss/crossentropy": 1.9707505255937576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1803161595016718, "step": 8168 }, { "epoch": 0.6808333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.08736572265625, "learning_rate": 3.98575959578556e-05, "loss": 3.7211, "loss/crossentropy": 1.5715525671839714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16525976173579693, "step": 8170 }, { "epoch": 0.681, "grad_norm": 5.15625, "grad_norm_var": 0.09810791015625, "learning_rate": 3.9854230691557425e-05, "loss": 4.322, "loss/crossentropy": 1.225288264453411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12830936163663864, "step": 8172 }, { "epoch": 0.6811666666666667, "grad_norm": 5.375, "grad_norm_var": 0.114306640625, "learning_rate": 3.9850826306521036e-05, "loss": 4.7835, "loss/crossentropy": 1.218340426683426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12600534223020077, "step": 8174 }, { "epoch": 0.6813333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.11770426432291667, "learning_rate": 3.984738281114642e-05, "loss": 5.2156, "loss/crossentropy": 2.4113438725471497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21955899521708488, "step": 8176 }, { "epoch": 0.6815, "grad_norm": 4.46875, "grad_norm_var": 0.13287760416666666, "learning_rate": 3.984390021393007e-05, "loss": 5.0152, "loss/crossentropy": 1.543459229171276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1737987082451582, "step": 8178 }, { "epoch": 0.6816666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.13944905598958332, "learning_rate": 3.9840378523464924e-05, "loss": 4.7537, "loss/crossentropy": 1.7875322103500366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1727149300277233, "step": 8180 }, { "epoch": 0.6818333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.12537434895833333, "learning_rate": 3.9836817748440424e-05, "loss": 4.9742, "loss/crossentropy": 2.043481595814228, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18028966709971428, "step": 8182 }, { "epoch": 0.682, "grad_norm": 4.78125, "grad_norm_var": 0.11300455729166667, "learning_rate": 3.983321789764242e-05, "loss": 4.878, "loss/crossentropy": 1.3496059402823448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1595297921448946, "step": 8184 }, { "epoch": 0.6821666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.10204671223958334, "learning_rate": 3.9829578979953195e-05, "loss": 5.0171, "loss/crossentropy": 2.472753405570984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23509525135159492, "step": 8186 }, { "epoch": 0.6823333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.08912353515625, "learning_rate": 3.982590100435139e-05, "loss": 5.0676, "loss/crossentropy": 2.0272571817040443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17441200464963913, "step": 8188 }, { "epoch": 0.6825, "grad_norm": 5.15625, "grad_norm_var": 0.07385660807291666, "learning_rate": 3.982218397991208e-05, "loss": 5.6034, "loss/crossentropy": 1.9016704335808754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17647476121783257, "step": 8190 }, { "epoch": 0.6826666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.0853515625, "learning_rate": 3.981842791580663e-05, "loss": 4.9338, "loss/crossentropy": 2.406555950641632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20240669697523117, "step": 8192 }, { "epoch": 0.6828333333333333, "grad_norm": 4.625, "grad_norm_var": 0.07428385416666666, "learning_rate": 3.981463282130277e-05, "loss": 5.2944, "loss/crossentropy": 2.195953816175461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23554722219705582, "step": 8194 }, { "epoch": 0.683, "grad_norm": 4.84375, "grad_norm_var": 0.05347900390625, "learning_rate": 3.98107987057645e-05, "loss": 4.6559, "loss/crossentropy": 2.159575939178467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20401450619101524, "step": 8196 }, { "epoch": 0.6831666666666667, "grad_norm": 4.75, "grad_norm_var": 0.05390218098958333, "learning_rate": 3.9806925578652125e-05, "loss": 5.1964, "loss/crossentropy": 1.8091963231563568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16383634880185127, "step": 8198 }, { "epoch": 0.6833333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.05966389973958333, "learning_rate": 3.980301344952221e-05, "loss": 4.4383, "loss/crossentropy": 1.9880341216921806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19586319476366043, "step": 8200 }, { "epoch": 0.6835, "grad_norm": 4.625, "grad_norm_var": 0.05774739583333333, "learning_rate": 3.979906232802754e-05, "loss": 5.3086, "loss/crossentropy": 2.4634940028190613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22515181452035904, "step": 8202 }, { "epoch": 0.6836666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.05872395833333333, "learning_rate": 3.9795072223917115e-05, "loss": 4.8084, "loss/crossentropy": 1.6714323610067368, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18010685592889786, "step": 8204 }, { "epoch": 0.6838333333333333, "grad_norm": 4.625, "grad_norm_var": 0.056441243489583334, "learning_rate": 3.9791043147036114e-05, "loss": 4.746, "loss/crossentropy": 2.0559864789247513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17964443936944008, "step": 8206 }, { "epoch": 0.684, "grad_norm": 5.0625, "grad_norm_var": 0.04459228515625, "learning_rate": 3.978697510732589e-05, "loss": 5.1632, "loss/crossentropy": 2.4614692330360413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2225230485200882, "step": 8208 }, { "epoch": 0.6841666666666667, "grad_norm": 5.125, "grad_norm_var": 0.045796712239583336, "learning_rate": 3.9782868114823936e-05, "loss": 4.6625, "loss/crossentropy": 1.9659627079963684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19717267900705338, "step": 8210 }, { "epoch": 0.6843333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.047749837239583336, "learning_rate": 3.9778722179663826e-05, "loss": 5.2607, "loss/crossentropy": 2.311997652053833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20555757358670235, "step": 8212 }, { "epoch": 0.6845, "grad_norm": 4.9375, "grad_norm_var": 0.046708170572916666, "learning_rate": 3.9774537312075254e-05, "loss": 4.8792, "loss/crossentropy": 1.5093220099806786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14826051332056522, "step": 8214 }, { "epoch": 0.6846666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.04023030598958333, "learning_rate": 3.977031352238397e-05, "loss": 4.7302, "loss/crossentropy": 1.310145728290081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15178019180893898, "step": 8216 }, { "epoch": 0.6848333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.05331624348958333, "learning_rate": 3.976605082101175e-05, "loss": 4.8753, "loss/crossentropy": 1.9999983832240105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20402120612561703, "step": 8218 }, { "epoch": 0.685, "grad_norm": 4.625, "grad_norm_var": 0.05364176432291667, "learning_rate": 3.976174921847639e-05, "loss": 5.1049, "loss/crossentropy": 2.197313755750656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19855907186865807, "step": 8220 }, { "epoch": 0.6851666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.04724934895833333, "learning_rate": 3.975740872539166e-05, "loss": 4.9291, "loss/crossentropy": 2.1756413877010345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19770053774118423, "step": 8222 }, { "epoch": 0.6853333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.04763997395833333, "learning_rate": 3.975302935246729e-05, "loss": 5.3972, "loss/crossentropy": 2.2892523109912872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869375966489315, "step": 8224 }, { "epoch": 0.6855, "grad_norm": 4.90625, "grad_norm_var": 0.04488525390625, "learning_rate": 3.9748611110508964e-05, "loss": 5.0387, "loss/crossentropy": 2.574119806289673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21818042173981667, "step": 8226 }, { "epoch": 0.6856666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.033984375, "learning_rate": 3.974415401041824e-05, "loss": 5.1188, "loss/crossentropy": 2.054434657096863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17497070133686066, "step": 8228 }, { "epoch": 0.6858333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.030192057291666668, "learning_rate": 3.9739658063192575e-05, "loss": 4.8558, "loss/crossentropy": 1.874234914779663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1658182181417942, "step": 8230 }, { "epoch": 0.686, "grad_norm": 4.65625, "grad_norm_var": 0.04016927083333333, "learning_rate": 3.973512327992528e-05, "loss": 5.2414, "loss/crossentropy": 2.5035698115825653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22155991941690445, "step": 8232 }, { "epoch": 0.6861666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.02890625, "learning_rate": 3.973054967180547e-05, "loss": 4.7098, "loss/crossentropy": 1.3449292182922363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23684380762279034, "step": 8234 }, { "epoch": 0.6863333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.03860677083333333, "learning_rate": 3.972593725011807e-05, "loss": 4.3327, "loss/crossentropy": 0.3794103041291237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.08249375969171524, "step": 8236 }, { "epoch": 0.6865, "grad_norm": 5.25, "grad_norm_var": 0.052469889322916664, "learning_rate": 3.972128602624378e-05, "loss": 5.2133, "loss/crossentropy": 1.9596935585141182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16619658470153809, "step": 8238 }, { "epoch": 0.6866666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.1208984375, "learning_rate": 3.971659601165903e-05, "loss": 5.5184, "loss/crossentropy": 2.493978977203369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21593138948082924, "step": 8240 }, { "epoch": 0.6868333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.1208984375, "learning_rate": 3.971186721793595e-05, "loss": 4.7815, "loss/crossentropy": 1.8379368782043457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18796057254076004, "step": 8242 }, { "epoch": 0.687, "grad_norm": 4.8125, "grad_norm_var": 0.12193603515625, "learning_rate": 3.970709965674239e-05, "loss": 5.2517, "loss/crossentropy": 2.4483371675014496, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22528450563549995, "step": 8244 }, { "epoch": 0.6871666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.12923177083333334, "learning_rate": 3.970229333984182e-05, "loss": 5.2067, "loss/crossentropy": 1.752450205385685, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1706908978521824, "step": 8246 }, { "epoch": 0.6873333333333334, "grad_norm": 4.5, "grad_norm_var": 0.12902018229166667, "learning_rate": 3.9697448279093346e-05, "loss": 5.0662, "loss/crossentropy": 1.8285248652100563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17460808157920837, "step": 8248 }, { "epoch": 0.6875, "grad_norm": 4.625, "grad_norm_var": 0.13079020182291667, "learning_rate": 3.969256448645169e-05, "loss": 5.042, "loss/crossentropy": 1.361061304807663, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13284870982170105, "step": 8250 }, { "epoch": 0.6876666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.11708577473958333, "learning_rate": 3.968764197396712e-05, "loss": 5.0501, "loss/crossentropy": 2.5378739833831787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2144315354526043, "step": 8252 }, { "epoch": 0.6878333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.10657145182291666, "learning_rate": 3.968268075378543e-05, "loss": 4.9037, "loss/crossentropy": 2.0044125020504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2190082147717476, "step": 8254 }, { "epoch": 0.688, "grad_norm": 4.625, "grad_norm_var": 0.03590087890625, "learning_rate": 3.967768083814796e-05, "loss": 4.8427, "loss/crossentropy": 2.0641330182552338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22053687274456024, "step": 8256 }, { "epoch": 0.6881666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.052734375, "learning_rate": 3.9672642239391486e-05, "loss": 4.4691, "loss/crossentropy": 2.459451824426651, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19304845109581947, "step": 8258 }, { "epoch": 0.6883333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.056315104166666664, "learning_rate": 3.966756496994825e-05, "loss": 4.9431, "loss/crossentropy": 1.7906870916485786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16057110950350761, "step": 8260 }, { "epoch": 0.6885, "grad_norm": 5.1875, "grad_norm_var": 0.06370035807291667, "learning_rate": 3.966244904234594e-05, "loss": 5.1878, "loss/crossentropy": 1.838362380862236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1645890176296234, "step": 8262 }, { "epoch": 0.6886666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.06640625, "learning_rate": 3.965729446920755e-05, "loss": 5.6991, "loss/crossentropy": 1.9389175474643707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18414141610264778, "step": 8264 }, { "epoch": 0.6888333333333333, "grad_norm": 5.1875, "grad_norm_var": 0.07884114583333333, "learning_rate": 3.965210126325153e-05, "loss": 5.2045, "loss/crossentropy": 2.096444122493267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1620175652205944, "step": 8266 }, { "epoch": 0.689, "grad_norm": 4.90625, "grad_norm_var": 0.07902018229166667, "learning_rate": 3.964686943729155e-05, "loss": 4.2653, "loss/crossentropy": 1.6484524458646774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16247570887207985, "step": 8268 }, { "epoch": 0.6891666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.07941080729166666, "learning_rate": 3.964159900423666e-05, "loss": 5.6302, "loss/crossentropy": 2.382517784833908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19215881079435349, "step": 8270 }, { "epoch": 0.6893333333333334, "grad_norm": 4.75, "grad_norm_var": 0.07743733723958333, "learning_rate": 3.9636289977091104e-05, "loss": 4.8313, "loss/crossentropy": 1.8752048686146736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1806214228272438, "step": 8272 }, { "epoch": 0.6895, "grad_norm": 4.90625, "grad_norm_var": 0.052958170572916664, "learning_rate": 3.963094236895439e-05, "loss": 4.4626, "loss/crossentropy": 2.2586154341697693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22339026257395744, "step": 8274 }, { "epoch": 0.6896666666666667, "grad_norm": 5.375, "grad_norm_var": 0.04334309895833333, "learning_rate": 3.96255561930212e-05, "loss": 4.9586, "loss/crossentropy": 2.25962632894516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24645965546369553, "step": 8276 }, { "epoch": 0.6898333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.05657552083333333, "learning_rate": 3.96201314625814e-05, "loss": 4.8752, "loss/crossentropy": 1.8089765384793282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19039946794509888, "step": 8278 }, { "epoch": 0.69, "grad_norm": 4.71875, "grad_norm_var": 0.05618489583333333, "learning_rate": 3.961466819101996e-05, "loss": 4.3569, "loss/crossentropy": 1.9538972973823547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19948555529117584, "step": 8280 }, { "epoch": 0.6901666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.050374348958333336, "learning_rate": 3.960916639181697e-05, "loss": 5.1086, "loss/crossentropy": 2.0360175147652626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17786091938614845, "step": 8282 }, { "epoch": 0.6903333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.05133056640625, "learning_rate": 3.960362607854758e-05, "loss": 4.9383, "loss/crossentropy": 2.5083267092704773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21976816654205322, "step": 8284 }, { "epoch": 0.6905, "grad_norm": 4.90625, "grad_norm_var": 0.048811848958333334, "learning_rate": 3.9598047264881946e-05, "loss": 5.0042, "loss/crossentropy": 2.357286214828491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20402342081069946, "step": 8286 }, { "epoch": 0.6906666666666667, "grad_norm": 5.71875, "grad_norm_var": 0.09566650390625, "learning_rate": 3.959242996458524e-05, "loss": 5.4444, "loss/crossentropy": 1.7867268919944763, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19183974713087082, "step": 8288 }, { "epoch": 0.6908333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.09709879557291666, "learning_rate": 3.95867741915176e-05, "loss": 4.9998, "loss/crossentropy": 1.7130176201462746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19901876337826252, "step": 8290 }, { "epoch": 0.691, "grad_norm": 4.9375, "grad_norm_var": 0.07997639973958333, "learning_rate": 3.958107995963406e-05, "loss": 5.3135, "loss/crossentropy": 2.3072088062763214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21593845263123512, "step": 8292 }, { "epoch": 0.6911666666666667, "grad_norm": 4.625, "grad_norm_var": 0.071728515625, "learning_rate": 3.957534728298461e-05, "loss": 4.991, "loss/crossentropy": 2.2377262711524963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21270013973116875, "step": 8294 }, { "epoch": 0.6913333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.07509358723958333, "learning_rate": 3.956957617571403e-05, "loss": 4.5893, "loss/crossentropy": 1.2319274619221687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14451301470398903, "step": 8296 }, { "epoch": 0.6915, "grad_norm": 4.9375, "grad_norm_var": 0.07672119140625, "learning_rate": 3.956376665206196e-05, "loss": 5.013, "loss/crossentropy": 2.2056290805339813, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1942349076271057, "step": 8298 }, { "epoch": 0.6916666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.07068684895833334, "learning_rate": 3.955791872636283e-05, "loss": 4.9761, "loss/crossentropy": 2.356662005186081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21107817068696022, "step": 8300 }, { "epoch": 0.6918333333333333, "grad_norm": 4.5, "grad_norm_var": 0.07646077473958333, "learning_rate": 3.95520324130458e-05, "loss": 4.8055, "loss/crossentropy": 1.8195563182234764, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17385952547192574, "step": 8302 }, { "epoch": 0.692, "grad_norm": 5.375, "grad_norm_var": 0.04503580729166667, "learning_rate": 3.954610772663479e-05, "loss": 4.8658, "loss/crossentropy": 1.4324785694479942, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1714583933353424, "step": 8304 }, { "epoch": 0.6921666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.05318603515625, "learning_rate": 3.9540144681748343e-05, "loss": 4.4637, "loss/crossentropy": 1.8403845950961113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19282516837120056, "step": 8306 }, { "epoch": 0.6923333333333334, "grad_norm": 4.75, "grad_norm_var": 0.05845947265625, "learning_rate": 3.95341432930997e-05, "loss": 4.6448, "loss/crossentropy": 2.415658622980118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2018880546092987, "step": 8308 }, { "epoch": 0.6925, "grad_norm": 4.46875, "grad_norm_var": 0.06295572916666667, "learning_rate": 3.952810357549669e-05, "loss": 4.9796, "loss/crossentropy": 1.79848662763834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1668144389986992, "step": 8310 }, { "epoch": 0.6926666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.07053629557291667, "learning_rate": 3.95220255438417e-05, "loss": 4.2825, "loss/crossentropy": 1.9337844401597977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18568622693419456, "step": 8312 }, { "epoch": 0.6928333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.07552083333333333, "learning_rate": 3.951590921313169e-05, "loss": 4.5296, "loss/crossentropy": 1.2211369574069977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14676949754357338, "step": 8314 }, { "epoch": 0.693, "grad_norm": 4.65625, "grad_norm_var": 0.06962483723958333, "learning_rate": 3.950975459845807e-05, "loss": 4.7562, "loss/crossentropy": 1.9180386438965797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1833595335483551, "step": 8316 }, { "epoch": 0.6931666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.06532796223958333, "learning_rate": 3.9503561715006775e-05, "loss": 4.5347, "loss/crossentropy": 1.9306902810931206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19284119084477425, "step": 8318 }, { "epoch": 0.6933333333333334, "grad_norm": 5.125, "grad_norm_var": 0.04804280598958333, "learning_rate": 3.94973305780581e-05, "loss": 5.1963, "loss/crossentropy": 2.3883610665798187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22568393871188164, "step": 8320 }, { "epoch": 0.6935, "grad_norm": 4.78125, "grad_norm_var": 0.04607747395833333, "learning_rate": 3.9491061202986776e-05, "loss": 4.8591, "loss/crossentropy": 1.6141887456178665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17095645144581795, "step": 8322 }, { "epoch": 0.6936666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.08450113932291667, "learning_rate": 3.9484753605261856e-05, "loss": 4.9119, "loss/crossentropy": 1.8141232132911682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1782233528792858, "step": 8324 }, { "epoch": 0.6938333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.08157552083333333, "learning_rate": 3.94784078004467e-05, "loss": 4.696, "loss/crossentropy": 1.6317244544625282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.161282641813159, "step": 8326 }, { "epoch": 0.694, "grad_norm": 5.125, "grad_norm_var": 0.07226155598958334, "learning_rate": 3.9472023804198966e-05, "loss": 4.7567, "loss/crossentropy": 2.129282683134079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21127504110336304, "step": 8328 }, { "epoch": 0.6941666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.14412434895833334, "learning_rate": 3.946560163227052e-05, "loss": 4.7442, "loss/crossentropy": 1.7705247178673744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15480435080826283, "step": 8330 }, { "epoch": 0.6943333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.13527018229166668, "learning_rate": 3.945914130050744e-05, "loss": 4.2676, "loss/crossentropy": 1.9477231204509735, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19183451309800148, "step": 8332 }, { "epoch": 0.6945, "grad_norm": 4.53125, "grad_norm_var": 0.13759358723958334, "learning_rate": 3.9452642824849944e-05, "loss": 4.8556, "loss/crossentropy": 2.5035791397094727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22337283194065094, "step": 8334 }, { "epoch": 0.6946666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.13567708333333334, "learning_rate": 3.9446106221332384e-05, "loss": 4.5276, "loss/crossentropy": 1.919069766998291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20139261707663536, "step": 8336 }, { "epoch": 0.6948333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.1396484375, "learning_rate": 3.943953150608318e-05, "loss": 4.5294, "loss/crossentropy": 1.7460784316062927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1773892566561699, "step": 8338 }, { "epoch": 0.695, "grad_norm": 4.71875, "grad_norm_var": 0.119384765625, "learning_rate": 3.9432918695324775e-05, "loss": 4.8103, "loss/crossentropy": 1.711449757218361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17536772415041924, "step": 8340 }, { "epoch": 0.6951666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.11343994140625, "learning_rate": 3.9426267805373626e-05, "loss": 5.095, "loss/crossentropy": 1.6863776296377182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16977461241185665, "step": 8342 }, { "epoch": 0.6953333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.11013997395833333, "learning_rate": 3.941957885264017e-05, "loss": 4.9459, "loss/crossentropy": 2.0749086141586304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21645259484648705, "step": 8344 }, { "epoch": 0.6955, "grad_norm": 4.9375, "grad_norm_var": 0.03443603515625, "learning_rate": 3.941285185362868e-05, "loss": 5.0884, "loss/crossentropy": 2.0320696011185646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18671347945928574, "step": 8346 }, { "epoch": 0.6956666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.03765869140625, "learning_rate": 3.940608682493741e-05, "loss": 5.2533, "loss/crossentropy": 1.2317260429263115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13544929958879948, "step": 8348 }, { "epoch": 0.6958333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.03215738932291667, "learning_rate": 3.939928378325836e-05, "loss": 5.1139, "loss/crossentropy": 2.150977909564972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.224623154848814, "step": 8350 }, { "epoch": 0.696, "grad_norm": 4.9375, "grad_norm_var": 0.03323160807291667, "learning_rate": 3.939244274537738e-05, "loss": 5.476, "loss/crossentropy": 2.4628894329071045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21918050199747086, "step": 8352 }, { "epoch": 0.6961666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.026822916666666665, "learning_rate": 3.938556372817404e-05, "loss": 5.2391, "loss/crossentropy": 2.3663055896759033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21987644210457802, "step": 8354 }, { "epoch": 0.6963333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.022591145833333333, "learning_rate": 3.937864674862163e-05, "loss": 4.7939, "loss/crossentropy": 2.5858985781669617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24316586554050446, "step": 8356 }, { "epoch": 0.6965, "grad_norm": 4.90625, "grad_norm_var": 0.04348551432291667, "learning_rate": 3.937169182378712e-05, "loss": 5.0335, "loss/crossentropy": 1.2397507652640343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.162556741386652, "step": 8358 }, { "epoch": 0.6966666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.05911051432291667, "learning_rate": 3.936469897083109e-05, "loss": 4.7473, "loss/crossentropy": 1.5406540408730507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22426093369722366, "step": 8360 }, { "epoch": 0.6968333333333333, "grad_norm": 4.75, "grad_norm_var": 0.058268229166666664, "learning_rate": 3.935766820700771e-05, "loss": 5.0673, "loss/crossentropy": 2.4371124505996704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19626253098249435, "step": 8362 }, { "epoch": 0.697, "grad_norm": 5.15625, "grad_norm_var": 0.05093994140625, "learning_rate": 3.935059954966469e-05, "loss": 5.4225, "loss/crossentropy": 2.520164370536804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2460576295852661, "step": 8364 }, { "epoch": 0.6971666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.08385009765625, "learning_rate": 3.934349301624324e-05, "loss": 4.5993, "loss/crossentropy": 2.55728417634964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20566748455166817, "step": 8366 }, { "epoch": 0.6973333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.08424479166666667, "learning_rate": 3.933634862427802e-05, "loss": 4.9533, "loss/crossentropy": 1.4302483797073364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16049204394221306, "step": 8368 }, { "epoch": 0.6975, "grad_norm": 4.28125, "grad_norm_var": 0.11964518229166667, "learning_rate": 3.9329166391397116e-05, "loss": 4.5304, "loss/crossentropy": 2.4105213284492493, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21557223051786423, "step": 8370 }, { "epoch": 0.6976666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.13444010416666666, "learning_rate": 3.932194633532196e-05, "loss": 4.6363, "loss/crossentropy": 1.7642899453639984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1885760761797428, "step": 8372 }, { "epoch": 0.6978333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.10345052083333334, "learning_rate": 3.931468847386734e-05, "loss": 4.4008, "loss/crossentropy": 1.898881435394287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19243913888931274, "step": 8374 }, { "epoch": 0.698, "grad_norm": 5.15625, "grad_norm_var": 0.097119140625, "learning_rate": 3.93073928249413e-05, "loss": 5.3528, "loss/crossentropy": 2.535469710826874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21360251680016518, "step": 8376 }, { "epoch": 0.6981666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.08599853515625, "learning_rate": 3.930005940654511e-05, "loss": 5.0111, "loss/crossentropy": 2.345153719186783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27940667420625687, "step": 8378 }, { "epoch": 0.6983333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.070947265625, "learning_rate": 3.9292688236773286e-05, "loss": 4.7551, "loss/crossentropy": 1.8373343795537949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17152095399796963, "step": 8380 }, { "epoch": 0.6985, "grad_norm": 4.9375, "grad_norm_var": 0.06575113932291667, "learning_rate": 3.928527933381344e-05, "loss": 4.8176, "loss/crossentropy": 2.3495190739631653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21102092787623405, "step": 8382 }, { "epoch": 0.6986666666666667, "grad_norm": 4.625, "grad_norm_var": 0.06389567057291666, "learning_rate": 3.92778327159463e-05, "loss": 4.7387, "loss/crossentropy": 1.8806327432394028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15044947527348995, "step": 8384 }, { "epoch": 0.6988333333333333, "grad_norm": 4.75, "grad_norm_var": 0.041666666666666664, "learning_rate": 3.9270348401545646e-05, "loss": 5.4241, "loss/crossentropy": 2.6885854601860046, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21951700001955032, "step": 8386 }, { "epoch": 0.699, "grad_norm": 5.625, "grad_norm_var": 0.07209879557291667, "learning_rate": 3.92628264090783e-05, "loss": 4.5313, "loss/crossentropy": 1.8490508198738098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25332874804735184, "step": 8388 }, { "epoch": 0.6991666666666667, "grad_norm": 4.75, "grad_norm_var": 0.07044270833333334, "learning_rate": 3.9255266757104025e-05, "loss": 4.2339, "loss/crossentropy": 2.2976879477500916, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21890880912542343, "step": 8390 }, { "epoch": 0.6993333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.07303059895833333, "learning_rate": 3.924766946427551e-05, "loss": 4.7814, "loss/crossentropy": 1.930149868130684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19942124374210835, "step": 8392 }, { "epoch": 0.6995, "grad_norm": 4.6875, "grad_norm_var": 0.07916259765625, "learning_rate": 3.9240034549338315e-05, "loss": 4.9137, "loss/crossentropy": 2.1267817318439484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.221748698502779, "step": 8394 }, { "epoch": 0.6996666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.4212849934895833, "learning_rate": 3.9232362031130836e-05, "loss": 4.4343, "loss/crossentropy": 0.8540246337652206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1094401404261589, "step": 8396 }, { "epoch": 0.6998333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.42107747395833334, "learning_rate": 3.9224651928584246e-05, "loss": 4.9973, "loss/crossentropy": 2.14926341176033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886780634522438, "step": 8398 }, { "epoch": 0.7, "grad_norm": 5.21875, "grad_norm_var": 0.41067708333333336, "learning_rate": 3.921690426072246e-05, "loss": 5.3, "loss/crossentropy": 1.972128227353096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19318870827555656, "step": 8400 }, { "epoch": 0.7001666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.41717122395833334, "learning_rate": 3.9209119046662085e-05, "loss": 4.8935, "loss/crossentropy": 1.6673232913017273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16348423436284065, "step": 8402 }, { "epoch": 0.7003333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.397119140625, "learning_rate": 3.920129630561235e-05, "loss": 4.7038, "loss/crossentropy": 0.7960238456726074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10878065042197704, "step": 8404 }, { "epoch": 0.7005, "grad_norm": 4.625, "grad_norm_var": 0.38163655598958335, "learning_rate": 3.9193436056875106e-05, "loss": 4.913, "loss/crossentropy": 2.3446280360221863, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20906392112374306, "step": 8406 }, { "epoch": 0.7006666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.40325520833333334, "learning_rate": 3.918553831984472e-05, "loss": 4.7213, "loss/crossentropy": 1.2463391497731209, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13812248408794403, "step": 8408 }, { "epoch": 0.7008333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.39319254557291666, "learning_rate": 3.917760311400808e-05, "loss": 4.6313, "loss/crossentropy": 2.367984890937805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071567215025425, "step": 8410 }, { "epoch": 0.701, "grad_norm": 5.03125, "grad_norm_var": 0.04934488932291667, "learning_rate": 3.9169630458944515e-05, "loss": 5.3454, "loss/crossentropy": 2.3054229021072388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.243374515324831, "step": 8412 }, { "epoch": 0.7011666666666667, "grad_norm": 5.25, "grad_norm_var": 0.06051025390625, "learning_rate": 3.916162037432576e-05, "loss": 5.1157, "loss/crossentropy": 2.388631224632263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20708012580871582, "step": 8414 }, { "epoch": 0.7013333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.04605712890625, "learning_rate": 3.915357287991591e-05, "loss": 5.0288, "loss/crossentropy": 1.7904658913612366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20545685291290283, "step": 8416 }, { "epoch": 0.7015, "grad_norm": 4.75, "grad_norm_var": 0.047119140625, "learning_rate": 3.914548799557135e-05, "loss": 4.7721, "loss/crossentropy": 2.0648528784513474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1930786818265915, "step": 8418 }, { "epoch": 0.7016666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.04724934895833333, "learning_rate": 3.9137365741240734e-05, "loss": 5.0071, "loss/crossentropy": 2.4154167771339417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22159772738814354, "step": 8420 }, { "epoch": 0.7018333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.05435791015625, "learning_rate": 3.9129206136964903e-05, "loss": 4.4603, "loss/crossentropy": 1.9558910503983498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17343437299132347, "step": 8422 }, { "epoch": 0.702, "grad_norm": 5.0, "grad_norm_var": 0.04010009765625, "learning_rate": 3.912100920287688e-05, "loss": 4.9526, "loss/crossentropy": 2.4329636991024017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20050939917564392, "step": 8424 }, { "epoch": 0.7021666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.04713134765625, "learning_rate": 3.911277495920179e-05, "loss": 5.5551, "loss/crossentropy": 2.05565532296896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18992381542921066, "step": 8426 }, { "epoch": 0.7023333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.04583333333333333, "learning_rate": 3.91045034262568e-05, "loss": 4.5382, "loss/crossentropy": 1.910774514079094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17602653056383133, "step": 8428 }, { "epoch": 0.7025, "grad_norm": 5.03125, "grad_norm_var": 0.043229166666666666, "learning_rate": 3.9096194624451104e-05, "loss": 4.8823, "loss/crossentropy": 2.303587019443512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22907770797610283, "step": 8430 }, { "epoch": 0.7026666666666667, "grad_norm": 4.75, "grad_norm_var": 0.051025390625, "learning_rate": 3.908784857428583e-05, "loss": 4.411, "loss/crossentropy": 1.3701264038681984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1463383063673973, "step": 8432 }, { "epoch": 0.7028333333333333, "grad_norm": 5.3125, "grad_norm_var": 0.06806233723958334, "learning_rate": 3.907946529635405e-05, "loss": 5.2984, "loss/crossentropy": 1.987550988793373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17481104657053947, "step": 8434 }, { "epoch": 0.703, "grad_norm": 4.75, "grad_norm_var": 0.07459309895833334, "learning_rate": 3.907104481134066e-05, "loss": 4.4821, "loss/crossentropy": 1.3826638907194138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15103114023804665, "step": 8436 }, { "epoch": 0.7031666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.06278889973958333, "learning_rate": 3.906258714002236e-05, "loss": 4.6062, "loss/crossentropy": 1.6567226275801659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17642017267644405, "step": 8438 }, { "epoch": 0.7033333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.07336832682291666, "learning_rate": 3.905409230326761e-05, "loss": 4.3813, "loss/crossentropy": 1.7283048927783966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1893971562385559, "step": 8440 }, { "epoch": 0.7035, "grad_norm": 4.78125, "grad_norm_var": 0.06656494140625, "learning_rate": 3.90455603220366e-05, "loss": 4.974, "loss/crossentropy": 2.4172494411468506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795316621661186, "step": 8442 }, { "epoch": 0.7036666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.0798828125, "learning_rate": 3.903699121738112e-05, "loss": 4.3795, "loss/crossentropy": 1.8104211688041687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2350565455853939, "step": 8444 }, { "epoch": 0.7038333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.07877197265625, "learning_rate": 3.9028385010444593e-05, "loss": 4.6677, "loss/crossentropy": 1.9519396424293518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1947762556374073, "step": 8446 }, { "epoch": 0.704, "grad_norm": 4.78125, "grad_norm_var": 0.07434488932291666, "learning_rate": 3.901974172246199e-05, "loss": 5.0585, "loss/crossentropy": 2.076841115951538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1911839358508587, "step": 8448 }, { "epoch": 0.7041666666666667, "grad_norm": 5.125, "grad_norm_var": 0.04772135416666667, "learning_rate": 3.9011061374759756e-05, "loss": 4.869, "loss/crossentropy": 2.004154473543167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17821232788264751, "step": 8450 }, { "epoch": 0.7043333333333334, "grad_norm": 5.0, "grad_norm_var": 0.04804280598958333, "learning_rate": 3.900234398875578e-05, "loss": 5.1626, "loss/crossentropy": 2.0224228501319885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1770163904875517, "step": 8452 }, { "epoch": 0.7045, "grad_norm": 4.90625, "grad_norm_var": 0.05178629557291667, "learning_rate": 3.899358958595935e-05, "loss": 5.1691, "loss/crossentropy": 2.1597854495048523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20969395712018013, "step": 8454 }, { "epoch": 0.7046666666666667, "grad_norm": 4.625, "grad_norm_var": 0.04895833333333333, "learning_rate": 3.898479818797108e-05, "loss": 5.0392, "loss/crossentropy": 2.580492913722992, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20104866474866867, "step": 8456 }, { "epoch": 0.7048333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.05481770833333333, "learning_rate": 3.8975969816482884e-05, "loss": 5.0146, "loss/crossentropy": 2.439221531152725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20938636735081673, "step": 8458 }, { "epoch": 0.705, "grad_norm": 4.65625, "grad_norm_var": 0.04034830729166667, "learning_rate": 3.896710449327788e-05, "loss": 4.6385, "loss/crossentropy": 2.402409076690674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21491533517837524, "step": 8460 }, { "epoch": 0.7051666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.03411051432291667, "learning_rate": 3.8958202240230376e-05, "loss": 5.0256, "loss/crossentropy": 1.7773304283618927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1971539780497551, "step": 8462 }, { "epoch": 0.7053333333333334, "grad_norm": 5.125, "grad_norm_var": 0.05380452473958333, "learning_rate": 3.89492630793058e-05, "loss": 5.1185, "loss/crossentropy": 2.059245079755783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2149088755249977, "step": 8464 }, { "epoch": 0.7055, "grad_norm": 4.59375, "grad_norm_var": 0.05224202473958333, "learning_rate": 3.894028703256063e-05, "loss": 4.4259, "loss/crossentropy": 1.6723755300045013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16442426852881908, "step": 8466 }, { "epoch": 0.7056666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.049609375, "learning_rate": 3.893127412214238e-05, "loss": 4.72, "loss/crossentropy": 1.6438677459955215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16178004071116447, "step": 8468 }, { "epoch": 0.7058333333333333, "grad_norm": 5.375, "grad_norm_var": 0.07011311848958333, "learning_rate": 3.8922224370289517e-05, "loss": 4.9171, "loss/crossentropy": 2.445003926753998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24207842722535133, "step": 8470 }, { "epoch": 0.706, "grad_norm": 5.0, "grad_norm_var": 0.06881510416666667, "learning_rate": 3.891313779933138e-05, "loss": 5.1656, "loss/crossentropy": 2.066555440425873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20761993527412415, "step": 8472 }, { "epoch": 0.7061666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.06461181640625, "learning_rate": 3.89040144316882e-05, "loss": 4.9594, "loss/crossentropy": 2.183867871761322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912669971585274, "step": 8474 }, { "epoch": 0.7063333333333334, "grad_norm": 4.75, "grad_norm_var": 0.06148681640625, "learning_rate": 3.889485428987097e-05, "loss": 5.1491, "loss/crossentropy": 2.3668190836906433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22393818199634552, "step": 8476 }, { "epoch": 0.7065, "grad_norm": 4.28125, "grad_norm_var": 0.090869140625, "learning_rate": 3.888565739648145e-05, "loss": 4.7785, "loss/crossentropy": 2.4078583121299744, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20308464020490646, "step": 8478 }, { "epoch": 0.7066666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.0720703125, "learning_rate": 3.887642377421203e-05, "loss": 4.7113, "loss/crossentropy": 1.0246254950761795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14504259265959263, "step": 8480 }, { "epoch": 0.7068333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.07873942057291666, "learning_rate": 3.886715344584577e-05, "loss": 5.002, "loss/crossentropy": 2.1630527675151825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19843829050660133, "step": 8482 }, { "epoch": 0.707, "grad_norm": 4.90625, "grad_norm_var": 0.07499593098958333, "learning_rate": 3.885784643425628e-05, "loss": 4.9709, "loss/crossentropy": 1.5243459790945053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1480504274368286, "step": 8484 }, { "epoch": 0.7071666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.06018473307291667, "learning_rate": 3.884850276240769e-05, "loss": 4.9724, "loss/crossentropy": 2.170057028532028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22325589135289192, "step": 8486 }, { "epoch": 0.7073333333333334, "grad_norm": 4.625, "grad_norm_var": 0.05816650390625, "learning_rate": 3.8839122453354584e-05, "loss": 4.9309, "loss/crossentropy": 1.7268838658928871, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19534722715616226, "step": 8488 }, { "epoch": 0.7075, "grad_norm": 4.5, "grad_norm_var": 0.06337483723958333, "learning_rate": 3.882970553024193e-05, "loss": 4.21, "loss/crossentropy": 2.080011636018753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039019875228405, "step": 8490 }, { "epoch": 0.7076666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.068994140625, "learning_rate": 3.8820252016305066e-05, "loss": 4.5204, "loss/crossentropy": 2.4818327128887177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21026042476296425, "step": 8492 }, { "epoch": 0.7078333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.04407552083333333, "learning_rate": 3.881076193486959e-05, "loss": 5.0249, "loss/crossentropy": 2.102166533470154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555450603365898, "step": 8494 }, { "epoch": 0.708, "grad_norm": 4.625, "grad_norm_var": 0.04455973307291667, "learning_rate": 3.8801235309351326e-05, "loss": 4.5784, "loss/crossentropy": 1.0780503954738379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10496869310736656, "step": 8496 }, { "epoch": 0.7081666666666667, "grad_norm": 5.0, "grad_norm_var": 0.037398274739583334, "learning_rate": 3.87916721632563e-05, "loss": 4.8757, "loss/crossentropy": 1.9232516288757324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2197490967810154, "step": 8498 }, { "epoch": 0.7083333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.044331868489583336, "learning_rate": 3.878207252018059e-05, "loss": 4.8544, "loss/crossentropy": 2.091248132288456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18537914380431175, "step": 8500 }, { "epoch": 0.7085, "grad_norm": 4.8125, "grad_norm_var": 0.030989583333333334, "learning_rate": 3.877243640381038e-05, "loss": 4.8074, "loss/crossentropy": 1.759295605123043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1624660287052393, "step": 8502 }, { "epoch": 0.7086666666666667, "grad_norm": 4.5, "grad_norm_var": 0.033589680989583336, "learning_rate": 3.876276383792184e-05, "loss": 4.4416, "loss/crossentropy": 1.1989781931042671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1289712656289339, "step": 8504 }, { "epoch": 0.7088333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.030061848958333335, "learning_rate": 3.875305484638105e-05, "loss": 4.9461, "loss/crossentropy": 1.7124052718281746, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17960615642368793, "step": 8506 }, { "epoch": 0.709, "grad_norm": 4.96875, "grad_norm_var": 0.03209635416666667, "learning_rate": 3.874330945314398e-05, "loss": 4.7397, "loss/crossentropy": 1.9230768084526062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19067246094346046, "step": 8508 }, { "epoch": 0.7091666666666666, "grad_norm": 4.28125, "grad_norm_var": 0.0423828125, "learning_rate": 3.873352768225643e-05, "loss": 4.2579, "loss/crossentropy": 1.9107168316841125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18083369359374046, "step": 8510 }, { "epoch": 0.7093333333333334, "grad_norm": 5.5, "grad_norm_var": 0.08183186848958333, "learning_rate": 3.8723709557853935e-05, "loss": 5.4434, "loss/crossentropy": 2.5605077147483826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24422496184706688, "step": 8512 }, { "epoch": 0.7095, "grad_norm": 4.5, "grad_norm_var": 0.0818359375, "learning_rate": 3.871385510416175e-05, "loss": 4.6419, "loss/crossentropy": 1.6347190141677856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.167040653526783, "step": 8514 }, { "epoch": 0.7096666666666667, "grad_norm": 7.25, "grad_norm_var": 0.47897135416666664, "learning_rate": 3.8703964345494747e-05, "loss": 4.8956, "loss/crossentropy": 1.5641431733965874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1506042554974556, "step": 8516 }, { "epoch": 0.7098333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.49269205729166665, "learning_rate": 3.869403730625741e-05, "loss": 4.9614, "loss/crossentropy": 2.4893141984939575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.214422095566988, "step": 8518 }, { "epoch": 0.71, "grad_norm": 4.15625, "grad_norm_var": 0.5141764322916667, "learning_rate": 3.86840740109437e-05, "loss": 4.093, "loss/crossentropy": 1.1999619975686073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1375366821885109, "step": 8520 }, { "epoch": 0.7101666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.514306640625, "learning_rate": 3.8674074484137075e-05, "loss": 4.9127, "loss/crossentropy": 1.6504128351807594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15782082825899124, "step": 8522 }, { "epoch": 0.7103333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.506103515625, "learning_rate": 3.866403875051037e-05, "loss": 4.9036, "loss/crossentropy": 1.891563042998314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19191880524158478, "step": 8524 }, { "epoch": 0.7105, "grad_norm": 5.53125, "grad_norm_var": 0.5021769205729166, "learning_rate": 3.865396683482575e-05, "loss": 5.1794, "loss/crossentropy": 2.358280599117279, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2268039658665657, "step": 8526 }, { "epoch": 0.7106666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.47706705729166665, "learning_rate": 3.864385876193469e-05, "loss": 4.8347, "loss/crossentropy": 2.0874394476413727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218267060816288, "step": 8528 }, { "epoch": 0.7108333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.46678059895833335, "learning_rate": 3.8633714556777817e-05, "loss": 5.0689, "loss/crossentropy": 0.9933740720152855, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.116140803322196, "step": 8530 }, { "epoch": 0.711, "grad_norm": 4.9375, "grad_norm_var": 0.09361979166666666, "learning_rate": 3.8623534244384984e-05, "loss": 5.2475, "loss/crossentropy": 2.2980607450008392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18490897864103317, "step": 8532 }, { "epoch": 0.7111666666666666, "grad_norm": 4.75, "grad_norm_var": 0.08357747395833333, "learning_rate": 3.861331784987508e-05, "loss": 4.93, "loss/crossentropy": 1.8772684335708618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1812591589987278, "step": 8534 }, { "epoch": 0.7113333333333334, "grad_norm": 4.875, "grad_norm_var": 0.05115559895833333, "learning_rate": 3.8603065398456056e-05, "loss": 5.0756, "loss/crossentropy": 1.7558320760726929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20000910758972168, "step": 8536 }, { "epoch": 0.7115, "grad_norm": 5.0625, "grad_norm_var": 0.05191650390625, "learning_rate": 3.85927769154248e-05, "loss": 4.9146, "loss/crossentropy": 1.9606410264968872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17154713906347752, "step": 8538 }, { "epoch": 0.7116666666666667, "grad_norm": 4.75, "grad_norm_var": 0.05181884765625, "learning_rate": 3.858245242616713e-05, "loss": 4.8698, "loss/crossentropy": 1.5781254023313522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16780022345483303, "step": 8540 }, { "epoch": 0.7118333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.022196451822916668, "learning_rate": 3.857209195615769e-05, "loss": 5.4512, "loss/crossentropy": 2.496997833251953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22793800756335258, "step": 8542 }, { "epoch": 0.712, "grad_norm": 4.5, "grad_norm_var": 0.07580973307291666, "learning_rate": 3.856169553095994e-05, "loss": 4.7199, "loss/crossentropy": 1.7486284598708153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16116427071392536, "step": 8544 }, { "epoch": 0.7121666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.08079020182291667, "learning_rate": 3.855126317622598e-05, "loss": 4.5685, "loss/crossentropy": 1.554433859884739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16488460823893547, "step": 8546 }, { "epoch": 0.7123333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.08307291666666666, "learning_rate": 3.854079491769665e-05, "loss": 5.0647, "loss/crossentropy": 1.7844418436288834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16470178216695786, "step": 8548 }, { "epoch": 0.7125, "grad_norm": 4.65625, "grad_norm_var": 0.09338785807291666, "learning_rate": 3.853029078120131e-05, "loss": 4.7518, "loss/crossentropy": 1.723744347691536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1574373096227646, "step": 8550 }, { "epoch": 0.7126666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.09439697265625, "learning_rate": 3.851975079265788e-05, "loss": 5.3427, "loss/crossentropy": 2.4624204635620117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21660296246409416, "step": 8552 }, { "epoch": 0.7128333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.09768473307291667, "learning_rate": 3.850917497807273e-05, "loss": 4.9311, "loss/crossentropy": 2.056380547583103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827603466808796, "step": 8554 }, { "epoch": 0.713, "grad_norm": 4.625, "grad_norm_var": 0.10230712890625, "learning_rate": 3.849856336354064e-05, "loss": 5.3591, "loss/crossentropy": 2.038110814988613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19497457519173622, "step": 8556 }, { "epoch": 0.7131666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.108447265625, "learning_rate": 3.8487915975244715e-05, "loss": 5.4798, "loss/crossentropy": 2.337386816740036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22147011384367943, "step": 8558 }, { "epoch": 0.7133333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.0740234375, "learning_rate": 3.847723283945632e-05, "loss": 5.0219, "loss/crossentropy": 2.2437821328639984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1856003701686859, "step": 8560 }, { "epoch": 0.7135, "grad_norm": 5.4375, "grad_norm_var": 0.08938802083333333, "learning_rate": 3.846651398253503e-05, "loss": 4.9609, "loss/crossentropy": 1.8475798070430756, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18377842381596565, "step": 8562 }, { "epoch": 0.7136666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.08642171223958334, "learning_rate": 3.845575943092857e-05, "loss": 4.8802, "loss/crossentropy": 1.9503494873642921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18385779485106468, "step": 8564 }, { "epoch": 0.7138333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.070556640625, "learning_rate": 3.8444969211172704e-05, "loss": 5.2131, "loss/crossentropy": 2.542492926120758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972789764404297, "step": 8566 }, { "epoch": 0.714, "grad_norm": 4.8125, "grad_norm_var": 0.083837890625, "learning_rate": 3.843414334989125e-05, "loss": 4.7795, "loss/crossentropy": 2.350885808467865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20130325853824615, "step": 8568 }, { "epoch": 0.7141666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.31819254557291665, "learning_rate": 3.842328187379593e-05, "loss": 4.414, "loss/crossentropy": 1.7121087461709976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17743447422981262, "step": 8570 }, { "epoch": 0.7143333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.3076171875, "learning_rate": 3.841238480968637e-05, "loss": 4.8087, "loss/crossentropy": 2.0848701670765877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18633460253477097, "step": 8572 }, { "epoch": 0.7145, "grad_norm": 4.625, "grad_norm_var": 0.3153483072916667, "learning_rate": 3.840145218444999e-05, "loss": 4.5425, "loss/crossentropy": 1.3898528441786766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17466096580028534, "step": 8574 }, { "epoch": 0.7146666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.30987955729166666, "learning_rate": 3.839048402506194e-05, "loss": 4.7232, "loss/crossentropy": 1.9185372442007065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18338378705084324, "step": 8576 }, { "epoch": 0.7148333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.28677978515625, "learning_rate": 3.837948035858508e-05, "loss": 4.7647, "loss/crossentropy": 2.4466370940208435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22165575250983238, "step": 8578 }, { "epoch": 0.715, "grad_norm": 5.15625, "grad_norm_var": 0.29127604166666665, "learning_rate": 3.8368441212169856e-05, "loss": 4.9122, "loss/crossentropy": 2.3279071152210236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20262134820222855, "step": 8580 }, { "epoch": 0.7151666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.2951131184895833, "learning_rate": 3.8357366613054265e-05, "loss": 4.4034, "loss/crossentropy": 1.2494020238518715, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17146893590688705, "step": 8582 }, { "epoch": 0.7153333333333334, "grad_norm": 5.34375, "grad_norm_var": 0.29737955729166665, "learning_rate": 3.834625658856378e-05, "loss": 4.92, "loss/crossentropy": 2.271469384431839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19555164128541946, "step": 8584 }, { "epoch": 0.7155, "grad_norm": 5.125, "grad_norm_var": 0.062235514322916664, "learning_rate": 3.833511116611128e-05, "loss": 4.9768, "loss/crossentropy": 2.541659891605377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22345850616693497, "step": 8586 }, { "epoch": 0.7156666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.0658203125, "learning_rate": 3.8323930373196994e-05, "loss": 4.9504, "loss/crossentropy": 1.7460681796073914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19049026630818844, "step": 8588 }, { "epoch": 0.7158333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.08502604166666666, "learning_rate": 3.83127142374084e-05, "loss": 5.0767, "loss/crossentropy": 2.4137668907642365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21288975328207016, "step": 8590 }, { "epoch": 0.716, "grad_norm": 4.78125, "grad_norm_var": 0.08006184895833333, "learning_rate": 3.830146278642023e-05, "loss": 4.8836, "loss/crossentropy": 2.0978833809494972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21597089432179928, "step": 8592 }, { "epoch": 0.7161666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.08245035807291666, "learning_rate": 3.829017604799428e-05, "loss": 4.8027, "loss/crossentropy": 1.518560267984867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15250182338058949, "step": 8594 }, { "epoch": 0.7163333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.07903238932291666, "learning_rate": 3.8278854049979495e-05, "loss": 5.1256, "loss/crossentropy": 1.6443413645029068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18051048554480076, "step": 8596 }, { "epoch": 0.7165, "grad_norm": 4.8125, "grad_norm_var": 0.07862955729166667, "learning_rate": 3.826749682031174e-05, "loss": 5.122, "loss/crossentropy": 1.3595605567097664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12122759409248829, "step": 8598 }, { "epoch": 0.7166666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.07864583333333333, "learning_rate": 3.8256104387013886e-05, "loss": 4.3628, "loss/crossentropy": 1.3635896146297455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14798260852694511, "step": 8600 }, { "epoch": 0.7168333333333333, "grad_norm": 5.25, "grad_norm_var": 0.08088785807291667, "learning_rate": 3.824467677819562e-05, "loss": 5.4972, "loss/crossentropy": 1.7752568274736404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16927327774465084, "step": 8602 }, { "epoch": 0.717, "grad_norm": 5.09375, "grad_norm_var": 0.076806640625, "learning_rate": 3.8233214022053414e-05, "loss": 4.3435, "loss/crossentropy": 2.242382973432541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19916261360049248, "step": 8604 }, { "epoch": 0.7171666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.05725504557291667, "learning_rate": 3.822171614687049e-05, "loss": 4.8377, "loss/crossentropy": 1.064011611044407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1281402837485075, "step": 8606 }, { "epoch": 0.7173333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.05732014973958333, "learning_rate": 3.821018318101672e-05, "loss": 5.0572, "loss/crossentropy": 2.07350555062294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20168068632483482, "step": 8608 }, { "epoch": 0.7175, "grad_norm": 4.59375, "grad_norm_var": 0.06131184895833333, "learning_rate": 3.8198615152948534e-05, "loss": 5.551, "loss/crossentropy": 1.9728271514177322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18988211080431938, "step": 8610 }, { "epoch": 0.7176666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.061356608072916666, "learning_rate": 3.818701209120891e-05, "loss": 5.1561, "loss/crossentropy": 2.1180964708328247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1928955800831318, "step": 8612 }, { "epoch": 0.7178333333333333, "grad_norm": 5.0625, "grad_norm_var": 0.05520426432291667, "learning_rate": 3.8175374024427233e-05, "loss": 5.2349, "loss/crossentropy": 2.4339922070503235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179810367524624, "step": 8614 }, { "epoch": 0.718, "grad_norm": 4.78125, "grad_norm_var": 0.03258056640625, "learning_rate": 3.816370098131929e-05, "loss": 5.0511, "loss/crossentropy": 2.0979133695364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20639016665518284, "step": 8616 }, { "epoch": 0.7181666666666666, "grad_norm": 5.125, "grad_norm_var": 0.039388020833333336, "learning_rate": 3.815199299068714e-05, "loss": 5.1404, "loss/crossentropy": 1.4955974891781807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16872404888272285, "step": 8618 }, { "epoch": 0.7183333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.045638020833333334, "learning_rate": 3.8140250081419105e-05, "loss": 4.4802, "loss/crossentropy": 0.6960400566458702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11802476830780506, "step": 8620 }, { "epoch": 0.7185, "grad_norm": 5.03125, "grad_norm_var": 0.05211181640625, "learning_rate": 3.812847228248962e-05, "loss": 5.2375, "loss/crossentropy": 2.1082040667533875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18984466418623924, "step": 8622 }, { "epoch": 0.7186666666666667, "grad_norm": 4.875, "grad_norm_var": 0.060400390625, "learning_rate": 3.811665962295925e-05, "loss": 5.2651, "loss/crossentropy": 2.195831149816513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19905410893261433, "step": 8624 }, { "epoch": 0.7188333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.06471354166666667, "learning_rate": 3.8104812131974565e-05, "loss": 4.4659, "loss/crossentropy": 1.7844280079007149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.194871723651886, "step": 8626 }, { "epoch": 0.719, "grad_norm": 4.5625, "grad_norm_var": 0.06649983723958333, "learning_rate": 3.809292983876806e-05, "loss": 4.8771, "loss/crossentropy": 2.350356310606003, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22957272082567215, "step": 8628 }, { "epoch": 0.7191666666666666, "grad_norm": 4.75, "grad_norm_var": 0.0626953125, "learning_rate": 3.8081012772658125e-05, "loss": 5.2387, "loss/crossentropy": 2.595982849597931, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20795315876603127, "step": 8630 }, { "epoch": 0.7193333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.06503499348958333, "learning_rate": 3.8069060963048904e-05, "loss": 4.9113, "loss/crossentropy": 1.6220935881137848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17125827446579933, "step": 8632 }, { "epoch": 0.7195, "grad_norm": 4.90625, "grad_norm_var": 0.05388997395833333, "learning_rate": 3.8057074439430326e-05, "loss": 5.3481, "loss/crossentropy": 2.598345994949341, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20299434289336205, "step": 8634 }, { "epoch": 0.7196666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.042769368489583334, "learning_rate": 3.804505323137796e-05, "loss": 4.5609, "loss/crossentropy": 1.8253272399306297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18018994107842445, "step": 8636 }, { "epoch": 0.7198333333333333, "grad_norm": 4.75, "grad_norm_var": 0.0361328125, "learning_rate": 3.80329973685529e-05, "loss": 5.1573, "loss/crossentropy": 2.299446254968643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21431740745902061, "step": 8638 }, { "epoch": 0.72, "grad_norm": 5.1875, "grad_norm_var": 0.04149983723958333, "learning_rate": 3.802090688070182e-05, "loss": 4.6841, "loss/crossentropy": 1.5538584291934967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2675389163196087, "step": 8640 }, { "epoch": 0.7201666666666666, "grad_norm": 5.75, "grad_norm_var": 0.08362223307291666, "learning_rate": 3.800878179765679e-05, "loss": 5.4819, "loss/crossentropy": 1.719673328101635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20472576469182968, "step": 8642 }, { "epoch": 0.7203333333333334, "grad_norm": 4.5, "grad_norm_var": 0.08683268229166667, "learning_rate": 3.799662214933525e-05, "loss": 4.9331, "loss/crossentropy": 1.8904408514499664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1706375703215599, "step": 8644 }, { "epoch": 0.7205, "grad_norm": 4.65625, "grad_norm_var": 0.08547770182291667, "learning_rate": 3.7984427965739914e-05, "loss": 5.1704, "loss/crossentropy": 1.9823104441165924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20099147781729698, "step": 8646 }, { "epoch": 0.7206666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.10026041666666667, "learning_rate": 3.7972199276958726e-05, "loss": 4.5451, "loss/crossentropy": 1.9567938223481178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17212271131575108, "step": 8648 }, { "epoch": 0.7208333333333333, "grad_norm": 4.75, "grad_norm_var": 0.10146077473958333, "learning_rate": 3.795993611316476e-05, "loss": 5.1056, "loss/crossentropy": 2.315292328596115, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21807067096233368, "step": 8650 }, { "epoch": 0.721, "grad_norm": 4.78125, "grad_norm_var": 0.10428059895833333, "learning_rate": 3.794763850461615e-05, "loss": 4.9331, "loss/crossentropy": 1.7855356112122536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16460080444812775, "step": 8652 }, { "epoch": 0.7211666666666666, "grad_norm": 4.75, "grad_norm_var": 0.104150390625, "learning_rate": 3.793530648165602e-05, "loss": 4.8287, "loss/crossentropy": 1.8243321552872658, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18671520613133907, "step": 8654 }, { "epoch": 0.7213333333333334, "grad_norm": 4.28125, "grad_norm_var": 0.122900390625, "learning_rate": 3.792294007471242e-05, "loss": 5.3489, "loss/crossentropy": 1.904386505484581, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1730734258890152, "step": 8656 }, { "epoch": 0.7215, "grad_norm": 4.46875, "grad_norm_var": 0.083447265625, "learning_rate": 3.791053931429821e-05, "loss": 4.2947, "loss/crossentropy": 2.0987056344747543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18425085954368114, "step": 8658 }, { "epoch": 0.7216666666666667, "grad_norm": 5.53125, "grad_norm_var": 0.1107421875, "learning_rate": 3.7898104231011065e-05, "loss": 5.074, "loss/crossentropy": 2.2960754334926605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22611650452017784, "step": 8660 }, { "epoch": 0.7218333333333333, "grad_norm": 4.875, "grad_norm_var": 0.10351155598958334, "learning_rate": 3.788563485553329e-05, "loss": 5.2099, "loss/crossentropy": 2.3180200457572937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.195101797580719, "step": 8662 }, { "epoch": 0.722, "grad_norm": 4.6875, "grad_norm_var": 0.09192301432291666, "learning_rate": 3.787313121863185e-05, "loss": 4.5433, "loss/crossentropy": 1.462849237024784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.160456795245409, "step": 8664 }, { "epoch": 0.7221666666666666, "grad_norm": 4.5, "grad_norm_var": 0.0955078125, "learning_rate": 3.7860593351158205e-05, "loss": 4.4727, "loss/crossentropy": 2.4242143034934998, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20100093632936478, "step": 8666 }, { "epoch": 0.7223333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.08951416015625, "learning_rate": 3.784802128404831e-05, "loss": 4.9187, "loss/crossentropy": 1.6116406470537186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766587197780609, "step": 8668 }, { "epoch": 0.7225, "grad_norm": 4.90625, "grad_norm_var": 0.09071858723958333, "learning_rate": 3.7835415048322486e-05, "loss": 4.7858, "loss/crossentropy": 2.023942083120346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17556135542690754, "step": 8670 }, { "epoch": 0.7226666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.06656494140625, "learning_rate": 3.782277467508537e-05, "loss": 4.8446, "loss/crossentropy": 1.9526407718658447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2195216380059719, "step": 8672 }, { "epoch": 0.7228333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.073046875, "learning_rate": 3.7810100195525825e-05, "loss": 5.1322, "loss/crossentropy": 2.075814664363861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2480899766087532, "step": 8674 }, { "epoch": 0.723, "grad_norm": 4.5625, "grad_norm_var": 0.054280598958333336, "learning_rate": 3.7797391640916865e-05, "loss": 4.4837, "loss/crossentropy": 1.9470646530389786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18394303135573864, "step": 8676 }, { "epoch": 0.7231666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.06907552083333333, "learning_rate": 3.7784649042615594e-05, "loss": 4.3918, "loss/crossentropy": 1.4085872247815132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14427426271140575, "step": 8678 }, { "epoch": 0.7233333333333334, "grad_norm": 5.125, "grad_norm_var": 0.08391927083333334, "learning_rate": 3.7771872432063104e-05, "loss": 5.0086, "loss/crossentropy": 1.5730762034654617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20427705347537994, "step": 8680 }, { "epoch": 0.7235, "grad_norm": 4.4375, "grad_norm_var": 0.08664957682291667, "learning_rate": 3.775906184078441e-05, "loss": 4.7252, "loss/crossentropy": 1.6773125976324081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17820323258638382, "step": 8682 }, { "epoch": 0.7236666666666667, "grad_norm": 4.625, "grad_norm_var": 0.08824462890625, "learning_rate": 3.7746217300388364e-05, "loss": 4.8914, "loss/crossentropy": 1.9949692338705063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16665949299931526, "step": 8684 }, { "epoch": 0.7238333333333333, "grad_norm": 4.75, "grad_norm_var": 0.08761393229166667, "learning_rate": 3.7733338842567604e-05, "loss": 4.4487, "loss/crossentropy": 1.8528357148170471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21053320541977882, "step": 8686 }, { "epoch": 0.724, "grad_norm": 4.65625, "grad_norm_var": 0.08865559895833333, "learning_rate": 3.772042649909845e-05, "loss": 4.2227, "loss/crossentropy": 1.7226733341813087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1530767474323511, "step": 8688 }, { "epoch": 0.7241666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.05767822265625, "learning_rate": 3.77074803018408e-05, "loss": 4.3961, "loss/crossentropy": 1.7308517843484879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16562194749712944, "step": 8690 }, { "epoch": 0.7243333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.10458577473958333, "learning_rate": 3.769450028273814e-05, "loss": 5.2013, "loss/crossentropy": 2.163651943206787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21022338047623634, "step": 8692 }, { "epoch": 0.7245, "grad_norm": 4.875, "grad_norm_var": 0.09110921223958333, "learning_rate": 3.768148647381735e-05, "loss": 4.9322, "loss/crossentropy": 2.3275624215602875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20518775284290314, "step": 8694 }, { "epoch": 0.7246666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.07489827473958334, "learning_rate": 3.766843890718873e-05, "loss": 5.2929, "loss/crossentropy": 2.217619091272354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21295088529586792, "step": 8696 }, { "epoch": 0.7248333333333333, "grad_norm": 5.34375, "grad_norm_var": 0.08983968098958334, "learning_rate": 3.765535761504584e-05, "loss": 4.8328, "loss/crossentropy": 1.4501753821969032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1492973156273365, "step": 8698 }, { "epoch": 0.725, "grad_norm": 5.03125, "grad_norm_var": 0.08943684895833333, "learning_rate": 3.764224262966548e-05, "loss": 4.6842, "loss/crossentropy": 2.3924825191497803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22406524047255516, "step": 8700 }, { "epoch": 0.7251666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.09021809895833334, "learning_rate": 3.7629093983407565e-05, "loss": 5.6793, "loss/crossentropy": 2.3391090631484985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21643299236893654, "step": 8702 }, { "epoch": 0.7253333333333334, "grad_norm": 5.0, "grad_norm_var": 0.07743733723958333, "learning_rate": 3.761591170871507e-05, "loss": 5.411, "loss/crossentropy": 2.587038576602936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19993890821933746, "step": 8704 }, { "epoch": 0.7255, "grad_norm": 5.28125, "grad_norm_var": 0.05618489583333333, "learning_rate": 3.760269583811396e-05, "loss": 4.9137, "loss/crossentropy": 2.3714127242565155, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2080126442015171, "step": 8706 }, { "epoch": 0.7256666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.03183186848958333, "learning_rate": 3.758944640421307e-05, "loss": 5.3241, "loss/crossentropy": 1.6202038303017616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17815641313791275, "step": 8708 }, { "epoch": 0.7258333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.04556884765625, "learning_rate": 3.7576163439704066e-05, "loss": 4.9787, "loss/crossentropy": 1.8317546993494034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2205317411571741, "step": 8710 }, { "epoch": 0.726, "grad_norm": 4.875, "grad_norm_var": 0.055078125, "learning_rate": 3.756284697736134e-05, "loss": 5.2559, "loss/crossentropy": 2.6816230416297913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20220203697681427, "step": 8712 }, { "epoch": 0.7261666666666666, "grad_norm": 4.5, "grad_norm_var": 0.05284830729166667, "learning_rate": 3.7549497050041936e-05, "loss": 5.216, "loss/crossentropy": 2.396820366382599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21334237977862358, "step": 8714 }, { "epoch": 0.7263333333333334, "grad_norm": 5.40625, "grad_norm_var": 0.08918863932291667, "learning_rate": 3.753611369068548e-05, "loss": 5.0165, "loss/crossentropy": 2.489267408847809, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21773302927613258, "step": 8716 }, { "epoch": 0.7265, "grad_norm": 4.78125, "grad_norm_var": 0.09117431640625, "learning_rate": 3.7522696932314076e-05, "loss": 4.9107, "loss/crossentropy": 2.376191943883896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045009806752205, "step": 8718 }, { "epoch": 0.7266666666666667, "grad_norm": 4.15625, "grad_norm_var": 0.124462890625, "learning_rate": 3.750924680803224e-05, "loss": 4.8471, "loss/crossentropy": 1.4641473963856697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14925377815961838, "step": 8720 }, { "epoch": 0.7268333333333333, "grad_norm": 5.125, "grad_norm_var": 0.11754150390625, "learning_rate": 3.749576335102683e-05, "loss": 4.9705, "loss/crossentropy": 1.9205654561519623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1824258267879486, "step": 8722 }, { "epoch": 0.727, "grad_norm": 4.625, "grad_norm_var": 0.11812744140625, "learning_rate": 3.748224659456692e-05, "loss": 4.912, "loss/crossentropy": 1.3279554545879364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.147995226085186, "step": 8724 }, { "epoch": 0.7271666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.08865559895833333, "learning_rate": 3.7468696572003773e-05, "loss": 5.0937, "loss/crossentropy": 2.371949940919876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19854775071144104, "step": 8726 }, { "epoch": 0.7273333333333334, "grad_norm": 4.65625, "grad_norm_var": 719.303153483073, "learning_rate": 3.7455113316770714e-05, "loss": 5.064, "loss/crossentropy": 1.8971856757998466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21301062777638435, "step": 8728 }, { "epoch": 0.7275, "grad_norm": 4.75, "grad_norm_var": 718.9658813476562, "learning_rate": 3.7441496862383074e-05, "loss": 4.6886, "loss/crossentropy": 1.646051250398159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17237389832735062, "step": 8730 }, { "epoch": 0.7276666666666667, "grad_norm": 4.875, "grad_norm_var": 718.8195597330729, "learning_rate": 3.742784724243811e-05, "loss": 5.0265, "loss/crossentropy": 2.2616125643253326, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21027613058686256, "step": 8732 }, { "epoch": 0.7278333333333333, "grad_norm": 4.65625, "grad_norm_var": 718.8765258789062, "learning_rate": 3.74141644906149e-05, "loss": 4.8026, "loss/crossentropy": 1.7153765857219696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16985262744128704, "step": 8734 }, { "epoch": 0.728, "grad_norm": 5.46875, "grad_norm_var": 717.9027180989583, "learning_rate": 3.740044864067428e-05, "loss": 4.9463, "loss/crossentropy": 1.7610290348529816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22300682961940765, "step": 8736 }, { "epoch": 0.7281666666666666, "grad_norm": 4.75, "grad_norm_var": 717.5597290039062, "learning_rate": 3.7386699726458725e-05, "loss": 5.2234, "loss/crossentropy": 1.659450389444828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17890477553009987, "step": 8738 }, { "epoch": 0.7283333333333334, "grad_norm": 4.4375, "grad_norm_var": 717.5659790039062, "learning_rate": 3.7372917781892335e-05, "loss": 4.771, "loss/crossentropy": 2.1801012456417084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21790499612689018, "step": 8740 }, { "epoch": 0.7285, "grad_norm": 4.84375, "grad_norm_var": 717.3072265625, "learning_rate": 3.735910284098068e-05, "loss": 5.2941, "loss/crossentropy": 2.0166616439819336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17672479711472988, "step": 8742 }, { "epoch": 0.7286666666666667, "grad_norm": 5.1875, "grad_norm_var": 0.07498372395833333, "learning_rate": 3.7345254937810746e-05, "loss": 5.0446, "loss/crossentropy": 1.6543507799506187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18381119146943092, "step": 8744 }, { "epoch": 0.7288333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.07459309895833334, "learning_rate": 3.733137410655087e-05, "loss": 5.1135, "loss/crossentropy": 2.328328937292099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19200216978788376, "step": 8746 }, { "epoch": 0.729, "grad_norm": 5.46875, "grad_norm_var": 0.09269205729166667, "learning_rate": 3.7317460381450616e-05, "loss": 5.5765, "loss/crossentropy": 2.3491774797439575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2052808813750744, "step": 8748 }, { "epoch": 0.7291666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.08795166015625, "learning_rate": 3.7303513796840724e-05, "loss": 4.8585, "loss/crossentropy": 1.4594552740454674, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14136701077222824, "step": 8750 }, { "epoch": 0.7293333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.07408854166666666, "learning_rate": 3.7289534387133e-05, "loss": 4.7951, "loss/crossentropy": 2.2565941512584686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20282186567783356, "step": 8752 }, { "epoch": 0.7295, "grad_norm": 4.875, "grad_norm_var": 0.05584309895833333, "learning_rate": 3.727552218682026e-05, "loss": 5.109, "loss/crossentropy": 1.2331131994724274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13799532130360603, "step": 8754 }, { "epoch": 0.7296666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05237223307291667, "learning_rate": 3.7261477230476194e-05, "loss": 5.5758, "loss/crossentropy": 2.656588077545166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.211183350533247, "step": 8756 }, { "epoch": 0.7298333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.059228515625, "learning_rate": 3.724739955275535e-05, "loss": 4.9283, "loss/crossentropy": 1.393723078072071, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15066486410796642, "step": 8758 }, { "epoch": 0.73, "grad_norm": 4.5625, "grad_norm_var": 0.060807291666666666, "learning_rate": 3.7233289188392994e-05, "loss": 4.314, "loss/crossentropy": 1.7706470787525177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16479334980249405, "step": 8760 }, { "epoch": 0.7301666666666666, "grad_norm": 5.625, "grad_norm_var": 0.09605712890625, "learning_rate": 3.7219146172205054e-05, "loss": 4.926, "loss/crossentropy": 1.4691421911120415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15686069056391716, "step": 8762 }, { "epoch": 0.7303333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.07431233723958333, "learning_rate": 3.7204970539088005e-05, "loss": 4.3474, "loss/crossentropy": 1.8123872131109238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17503276839852333, "step": 8764 }, { "epoch": 0.7305, "grad_norm": 5.53125, "grad_norm_var": 0.10562744140625, "learning_rate": 3.719076232401881e-05, "loss": 5.5435, "loss/crossentropy": 2.8346092104911804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20754270255565643, "step": 8766 }, { "epoch": 0.7306666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.11506754557291667, "learning_rate": 3.717652156205485e-05, "loss": 4.4182, "loss/crossentropy": 1.4685752764344215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1536390818655491, "step": 8768 }, { "epoch": 0.7308333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.12760416666666666, "learning_rate": 3.716224828833376e-05, "loss": 4.502, "loss/crossentropy": 1.5342864394187927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14854193665087223, "step": 8770 }, { "epoch": 0.731, "grad_norm": 4.71875, "grad_norm_var": 0.12336832682291667, "learning_rate": 3.714794253807345e-05, "loss": 4.9686, "loss/crossentropy": 2.0194079279899597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18694549053907394, "step": 8772 }, { "epoch": 0.7311666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.12018229166666666, "learning_rate": 3.7133604346571923e-05, "loss": 4.5597, "loss/crossentropy": 1.5956484377384186, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19843536987900734, "step": 8774 }, { "epoch": 0.7313333333333333, "grad_norm": 4.5, "grad_norm_var": 0.12237955729166666, "learning_rate": 3.711923374920724e-05, "loss": 4.5708, "loss/crossentropy": 1.9300574213266373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20277779176831245, "step": 8776 }, { "epoch": 0.7315, "grad_norm": 4.96875, "grad_norm_var": 0.0798828125, "learning_rate": 3.7104830781437435e-05, "loss": 5.3787, "loss/crossentropy": 1.9987846314907074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19426734000444412, "step": 8778 }, { "epoch": 0.7316666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.08232014973958333, "learning_rate": 3.709039547880038e-05, "loss": 5.3359, "loss/crossentropy": 1.9980473741889, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17278934083878994, "step": 8780 }, { "epoch": 0.7318333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.04505208333333333, "learning_rate": 3.7075927876913765e-05, "loss": 5.0242, "loss/crossentropy": 1.8991079032421112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16986040398478508, "step": 8782 }, { "epoch": 0.732, "grad_norm": 4.875, "grad_norm_var": 0.041796875, "learning_rate": 3.706142801147495e-05, "loss": 4.6015, "loss/crossentropy": 2.006039559841156, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17685510218143463, "step": 8784 }, { "epoch": 0.7321666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.034098307291666664, "learning_rate": 3.7046895918260916e-05, "loss": 4.8398, "loss/crossentropy": 1.5066589415073395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15621322393417358, "step": 8786 }, { "epoch": 0.7323333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.046858723958333334, "learning_rate": 3.703233163312816e-05, "loss": 4.8451, "loss/crossentropy": 2.0729255378246307, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18724722787737846, "step": 8788 }, { "epoch": 0.7325, "grad_norm": 4.875, "grad_norm_var": 0.04940999348958333, "learning_rate": 3.70177351920126e-05, "loss": 5.5098, "loss/crossentropy": 2.3151272237300873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22275524586439133, "step": 8790 }, { "epoch": 0.7326666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.0490234375, "learning_rate": 3.700310663092951e-05, "loss": 4.5555, "loss/crossentropy": 1.7588667497038841, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17178992182016373, "step": 8792 }, { "epoch": 0.7328333333333333, "grad_norm": 4.5, "grad_norm_var": 0.05675455729166667, "learning_rate": 3.698844598597341e-05, "loss": 5.298, "loss/crossentropy": 2.5000760555267334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21289020031690598, "step": 8794 }, { "epoch": 0.733, "grad_norm": 4.375, "grad_norm_var": 0.07838541666666667, "learning_rate": 3.6973753293317975e-05, "loss": 4.4096, "loss/crossentropy": 1.6862092539668083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16006714291870594, "step": 8796 }, { "epoch": 0.7331666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.07721354166666666, "learning_rate": 3.6959028589215986e-05, "loss": 4.6138, "loss/crossentropy": 1.5368055179715157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15705449134111404, "step": 8798 }, { "epoch": 0.7333333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.07858072916666667, "learning_rate": 3.6944271909999166e-05, "loss": 5.0054, "loss/crossentropy": 2.119047671556473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1983763799071312, "step": 8800 }, { "epoch": 0.7335, "grad_norm": 4.78125, "grad_norm_var": 0.07515869140625, "learning_rate": 3.6929483292078156e-05, "loss": 4.8087, "loss/crossentropy": 1.6268837228417397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1544271968305111, "step": 8802 }, { "epoch": 0.7336666666666667, "grad_norm": 5.125, "grad_norm_var": 0.06884358723958334, "learning_rate": 3.69146627719424e-05, "loss": 5.0262, "loss/crossentropy": 2.24159637093544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014218308031559, "step": 8804 }, { "epoch": 0.7338333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.0875, "learning_rate": 3.689981038616008e-05, "loss": 5.1324, "loss/crossentropy": 1.908030480146408, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18177608400583267, "step": 8806 }, { "epoch": 0.734, "grad_norm": 4.96875, "grad_norm_var": 0.07862955729166667, "learning_rate": 3.6884926171377955e-05, "loss": 4.4723, "loss/crossentropy": 1.1692449301481247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12820919789373875, "step": 8808 }, { "epoch": 0.7341666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.07277018229166667, "learning_rate": 3.6870010164321354e-05, "loss": 4.5284, "loss/crossentropy": 1.6573495715856552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189057357609272, "step": 8810 }, { "epoch": 0.7343333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.05279541015625, "learning_rate": 3.685506240179405e-05, "loss": 5.107, "loss/crossentropy": 1.549419716000557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16974429041147232, "step": 8812 }, { "epoch": 0.7345, "grad_norm": 5.3125, "grad_norm_var": 0.07511393229166667, "learning_rate": 3.684008292067814e-05, "loss": 4.5746, "loss/crossentropy": 2.343492843210697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1637168973684311, "step": 8814 }, { "epoch": 0.7346666666666667, "grad_norm": 5.0, "grad_norm_var": 0.08798421223958333, "learning_rate": 3.6825071757934034e-05, "loss": 4.8706, "loss/crossentropy": 1.5027789995074272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1454361155629158, "step": 8816 }, { "epoch": 0.7348333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.09440104166666667, "learning_rate": 3.681002895060026e-05, "loss": 4.7813, "loss/crossentropy": 2.3462014198303223, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2297021485865116, "step": 8818 }, { "epoch": 0.735, "grad_norm": 4.71875, "grad_norm_var": 0.11614583333333334, "learning_rate": 3.679495453579345e-05, "loss": 4.4424, "loss/crossentropy": 1.9516724050045013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17961198277771473, "step": 8820 }, { "epoch": 0.7351666666666666, "grad_norm": 4.375, "grad_norm_var": 0.10041910807291667, "learning_rate": 3.677984855070824e-05, "loss": 4.989, "loss/crossentropy": 1.7319712713360786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1646728366613388, "step": 8822 }, { "epoch": 0.7353333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.09280192057291667, "learning_rate": 3.6764711032617146e-05, "loss": 5.0011, "loss/crossentropy": 1.472038134932518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16302834451198578, "step": 8824 }, { "epoch": 0.7355, "grad_norm": 4.375, "grad_norm_var": 0.07450764973958333, "learning_rate": 3.6749542018870464e-05, "loss": 4.7102, "loss/crossentropy": 2.418479323387146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23650340735912323, "step": 8826 }, { "epoch": 0.7356666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08409830729166666, "learning_rate": 3.673434154689626e-05, "loss": 4.9501, "loss/crossentropy": 1.8115656673908234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16492757201194763, "step": 8828 }, { "epoch": 0.7358333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.057275390625, "learning_rate": 3.671910965420017e-05, "loss": 4.6584, "loss/crossentropy": 1.308703638613224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15016014873981476, "step": 8830 }, { "epoch": 0.736, "grad_norm": 4.8125, "grad_norm_var": 0.049853515625, "learning_rate": 3.6703846378365374e-05, "loss": 4.7533, "loss/crossentropy": 2.000911645591259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17556572891771793, "step": 8832 }, { "epoch": 0.7361666666666666, "grad_norm": 5.15625, "grad_norm_var": 0.06378580729166666, "learning_rate": 3.668855175705249e-05, "loss": 5.1723, "loss/crossentropy": 2.0119586139917374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.176089683547616, "step": 8834 }, { "epoch": 0.7363333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.09065348307291667, "learning_rate": 3.6673225827999475e-05, "loss": 5.2057, "loss/crossentropy": 1.8934685587882996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1788729391992092, "step": 8836 }, { "epoch": 0.7365, "grad_norm": 5.15625, "grad_norm_var": 0.08587239583333334, "learning_rate": 3.665786862902155e-05, "loss": 5.2444, "loss/crossentropy": 1.591829739511013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17553285881876945, "step": 8838 }, { "epoch": 0.7366666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.08079427083333333, "learning_rate": 3.664248019801105e-05, "loss": 5.3846, "loss/crossentropy": 1.8678766191005707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1833246424794197, "step": 8840 }, { "epoch": 0.7368333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.08619791666666667, "learning_rate": 3.662706057293743e-05, "loss": 4.9773, "loss/crossentropy": 2.21164333820343, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19135256856679916, "step": 8842 }, { "epoch": 0.737, "grad_norm": 5.3125, "grad_norm_var": 0.10657145182291666, "learning_rate": 3.661160979184705e-05, "loss": 4.5775, "loss/crossentropy": 2.1143080592155457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20492056012153625, "step": 8844 }, { "epoch": 0.7371666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.11614176432291666, "learning_rate": 3.659612789286319e-05, "loss": 5.7674, "loss/crossentropy": 2.6996708512306213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100643552839756, "step": 8846 }, { "epoch": 0.7373333333333333, "grad_norm": 4.875, "grad_norm_var": 0.11809895833333334, "learning_rate": 3.658061491418591e-05, "loss": 4.9873, "loss/crossentropy": 1.7583412826061249, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17828240431845188, "step": 8848 }, { "epoch": 0.7375, "grad_norm": 4.40625, "grad_norm_var": 0.13606770833333334, "learning_rate": 3.656507089409192e-05, "loss": 4.5474, "loss/crossentropy": 1.7216490358114243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15573652274906635, "step": 8850 }, { "epoch": 0.7376666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.1318359375, "learning_rate": 3.654949587093456e-05, "loss": 5.4153, "loss/crossentropy": 1.8738081008195877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18332227692008018, "step": 8852 }, { "epoch": 0.7378333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.13045247395833334, "learning_rate": 3.653388988314365e-05, "loss": 4.8978, "loss/crossentropy": 2.121942013502121, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.189912848174572, "step": 8854 }, { "epoch": 0.738, "grad_norm": 4.78125, "grad_norm_var": 0.130322265625, "learning_rate": 3.651825296922541e-05, "loss": 5.0859, "loss/crossentropy": 2.175195187330246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22438178583979607, "step": 8856 }, { "epoch": 0.7381666666666666, "grad_norm": 5.4375, "grad_norm_var": 0.123291015625, "learning_rate": 3.6502585167762374e-05, "loss": 4.8487, "loss/crossentropy": 2.214916653931141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20211252197623253, "step": 8858 }, { "epoch": 0.7383333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.10701497395833333, "learning_rate": 3.648688651741328e-05, "loss": 5.1052, "loss/crossentropy": 1.367978423833847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1402355097234249, "step": 8860 }, { "epoch": 0.7385, "grad_norm": 5.46875, "grad_norm_var": 0.17057291666666666, "learning_rate": 3.647115705691299e-05, "loss": 5.2058, "loss/crossentropy": 2.0758024752140045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20703085139393806, "step": 8862 }, { "epoch": 0.7386666666666667, "grad_norm": 4.375, "grad_norm_var": 0.19576416015625, "learning_rate": 3.645539682507238e-05, "loss": 4.8559, "loss/crossentropy": 2.5436695218086243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21577593311667442, "step": 8864 }, { "epoch": 0.7388333333333333, "grad_norm": 4.75, "grad_norm_var": 0.17511393229166666, "learning_rate": 3.6439605860778255e-05, "loss": 5.2327, "loss/crossentropy": 2.345476508140564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21046575531363487, "step": 8866 }, { "epoch": 0.739, "grad_norm": 4.6875, "grad_norm_var": 0.18541666666666667, "learning_rate": 3.642378420299326e-05, "loss": 4.6524, "loss/crossentropy": 2.3013267815113068, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18924878537654877, "step": 8868 }, { "epoch": 0.7391666666666666, "grad_norm": 6.53125, "grad_norm_var": 0.33944905598958336, "learning_rate": 3.640793189075576e-05, "loss": 5.1503, "loss/crossentropy": 1.9642613977193832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24627995491027832, "step": 8870 }, { "epoch": 0.7393333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.35559488932291666, "learning_rate": 3.639204896317974e-05, "loss": 4.6375, "loss/crossentropy": 1.7182991802692413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16883813217282295, "step": 8872 }, { "epoch": 0.7395, "grad_norm": 4.53125, "grad_norm_var": 0.35794270833333336, "learning_rate": 3.6376135459454775e-05, "loss": 4.9347, "loss/crossentropy": 2.0507873594760895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1764291562139988, "step": 8874 }, { "epoch": 0.7396666666666667, "grad_norm": 4.625, "grad_norm_var": 0.3622355143229167, "learning_rate": 3.636019141884584e-05, "loss": 4.8631, "loss/crossentropy": 2.2530939877033234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20754842087626457, "step": 8876 }, { "epoch": 0.7398333333333333, "grad_norm": 4.375, "grad_norm_var": 0.2731730143229167, "learning_rate": 3.634421688069326e-05, "loss": 4.3604, "loss/crossentropy": 1.693645179271698, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18852153420448303, "step": 8878 }, { "epoch": 0.74, "grad_norm": 4.59375, "grad_norm_var": 0.2623046875, "learning_rate": 3.632821188441264e-05, "loss": 5.0167, "loss/crossentropy": 2.5044451355934143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22125179693102837, "step": 8880 }, { "epoch": 0.7401666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.262890625, "learning_rate": 3.631217646949469e-05, "loss": 6.0373, "loss/crossentropy": 2.3829991221427917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22329098731279373, "step": 8882 }, { "epoch": 0.7403333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.249853515625, "learning_rate": 3.629611067550523e-05, "loss": 5.285, "loss/crossentropy": 2.3799403607845306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19088751077651978, "step": 8884 }, { "epoch": 0.7405, "grad_norm": 4.71875, "grad_norm_var": 0.044514973958333336, "learning_rate": 3.6280014542084996e-05, "loss": 4.5858, "loss/crossentropy": 1.8241610005497932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17614194378256798, "step": 8886 }, { "epoch": 0.7406666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.061263020833333334, "learning_rate": 3.62638881089496e-05, "loss": 4.832, "loss/crossentropy": 1.690228745341301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1766295973211527, "step": 8888 }, { "epoch": 0.7408333333333333, "grad_norm": 4.75, "grad_norm_var": 0.05585530598958333, "learning_rate": 3.624773141588942e-05, "loss": 4.5258, "loss/crossentropy": 1.9454505145549774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19673113897442818, "step": 8890 }, { "epoch": 0.741, "grad_norm": 4.625, "grad_norm_var": 0.05585530598958333, "learning_rate": 3.623154450276947e-05, "loss": 5.4359, "loss/crossentropy": 1.4792904779314995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1534273698925972, "step": 8892 }, { "epoch": 0.7411666666666666, "grad_norm": 4.75, "grad_norm_var": 0.059228515625, "learning_rate": 3.621532740952937e-05, "loss": 4.7134, "loss/crossentropy": 2.3156608641147614, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20420100539922714, "step": 8894 }, { "epoch": 0.7413333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.058915201822916666, "learning_rate": 3.6199080176183174e-05, "loss": 5.1667, "loss/crossentropy": 2.1931245625019073, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118113823235035, "step": 8896 }, { "epoch": 0.7415, "grad_norm": 5.0, "grad_norm_var": 0.04755452473958333, "learning_rate": 3.618280284281931e-05, "loss": 4.8052, "loss/crossentropy": 2.5484583973884583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23700064048171043, "step": 8898 }, { "epoch": 0.7416666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.055192057291666666, "learning_rate": 3.616649544960051e-05, "loss": 4.474, "loss/crossentropy": 2.38634717464447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2101435624063015, "step": 8900 }, { "epoch": 0.7418333333333333, "grad_norm": 5.25, "grad_norm_var": 0.8354451497395833, "learning_rate": 3.61501580367636e-05, "loss": 4.8441, "loss/crossentropy": 2.2860844433307648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1810862421989441, "step": 8902 }, { "epoch": 0.742, "grad_norm": 4.65625, "grad_norm_var": 0.8156087239583333, "learning_rate": 3.613379064461955e-05, "loss": 4.2613, "loss/crossentropy": 1.6806901693344116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1674313172698021, "step": 8904 }, { "epoch": 0.7421666666666666, "grad_norm": 4.875, "grad_norm_var": 0.84283447265625, "learning_rate": 3.6117393313553276e-05, "loss": 4.8571, "loss/crossentropy": 2.260170102119446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418415799736977, "step": 8906 }, { "epoch": 0.7423333333333333, "grad_norm": 4.875, "grad_norm_var": 0.8539998372395833, "learning_rate": 3.610096608402356e-05, "loss": 4.6393, "loss/crossentropy": 2.18383064866066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21976784616708755, "step": 8908 }, { "epoch": 0.7425, "grad_norm": 5.0, "grad_norm_var": 0.82359619140625, "learning_rate": 3.6084508996562945e-05, "loss": 5.3469, "loss/crossentropy": 1.9949154406785965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17736472189426422, "step": 8910 }, { "epoch": 0.7426666666666667, "grad_norm": 5.15625, "grad_norm_var": 0.8265462239583333, "learning_rate": 3.606802209177766e-05, "loss": 5.2167, "loss/crossentropy": 2.670268714427948, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21612682566046715, "step": 8912 }, { "epoch": 0.7428333333333333, "grad_norm": 5.0, "grad_norm_var": 0.8615193684895833, "learning_rate": 3.605150541034752e-05, "loss": 4.6836, "loss/crossentropy": 1.777511551976204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17616182379424572, "step": 8914 }, { "epoch": 0.743, "grad_norm": 4.71875, "grad_norm_var": 0.8450154622395833, "learning_rate": 3.603495899302579e-05, "loss": 5.0119, "loss/crossentropy": 1.4489165171980858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15824375115334988, "step": 8916 }, { "epoch": 0.7431666666666666, "grad_norm": 4.875, "grad_norm_var": 0.06897379557291666, "learning_rate": 3.60183828806391e-05, "loss": 4.708, "loss/crossentropy": 2.3155589401721954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2166098654270172, "step": 8918 }, { "epoch": 0.7433333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.07502848307291667, "learning_rate": 3.6001777114087364e-05, "loss": 5.0273, "loss/crossentropy": 2.040462851524353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932218372821808, "step": 8920 }, { "epoch": 0.7435, "grad_norm": 5.0, "grad_norm_var": 0.06405843098958333, "learning_rate": 3.598514173434366e-05, "loss": 5.1278, "loss/crossentropy": 1.7882616519927979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18532326072454453, "step": 8922 }, { "epoch": 0.7436666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05627848307291667, "learning_rate": 3.5968476782454126e-05, "loss": 5.381, "loss/crossentropy": 2.228593498468399, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952892802655697, "step": 8924 }, { "epoch": 0.7438333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.06327718098958333, "learning_rate": 3.595178229953789e-05, "loss": 4.7112, "loss/crossentropy": 1.9281839057803154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19580445811152458, "step": 8926 }, { "epoch": 0.744, "grad_norm": 4.96875, "grad_norm_var": 0.057515462239583336, "learning_rate": 3.593505832678692e-05, "loss": 5.1409, "loss/crossentropy": 2.7174503803253174, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2320035845041275, "step": 8928 }, { "epoch": 0.7441666666666666, "grad_norm": 5.40625, "grad_norm_var": 0.06417643229166667, "learning_rate": 3.591830490546596e-05, "loss": 4.9454, "loss/crossentropy": 1.646928757429123, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15842007473111153, "step": 8930 }, { "epoch": 0.7443333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.0865234375, "learning_rate": 3.59015220769124e-05, "loss": 5.0737, "loss/crossentropy": 1.9822481498122215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17872773855924606, "step": 8932 }, { "epoch": 0.7445, "grad_norm": 4.6875, "grad_norm_var": 0.08538004557291666, "learning_rate": 3.588470988253622e-05, "loss": 4.9158, "loss/crossentropy": 2.2272008657455444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22174109145998955, "step": 8934 }, { "epoch": 0.7446666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.08404947916666666, "learning_rate": 3.5867868363819836e-05, "loss": 5.3303, "loss/crossentropy": 1.3292552679777145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13451922498643398, "step": 8936 }, { "epoch": 0.7448333333333333, "grad_norm": 4.875, "grad_norm_var": 0.08240559895833334, "learning_rate": 3.5850997562318006e-05, "loss": 4.8913, "loss/crossentropy": 2.4469125866889954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009117528796196, "step": 8938 }, { "epoch": 0.745, "grad_norm": 4.59375, "grad_norm_var": 0.08245035807291666, "learning_rate": 3.583409751965776e-05, "loss": 4.9503, "loss/crossentropy": 1.9291583746671677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17131099104881287, "step": 8940 }, { "epoch": 0.7451666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.08800455729166666, "learning_rate": 3.5817168277538286e-05, "loss": 5.0222, "loss/crossentropy": 1.9054948091506958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17595425620675087, "step": 8942 }, { "epoch": 0.7453333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.08059488932291667, "learning_rate": 3.580020987773079e-05, "loss": 5.0269, "loss/crossentropy": 2.2758138179779053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2193683236837387, "step": 8944 }, { "epoch": 0.7455, "grad_norm": 4.71875, "grad_norm_var": 0.14568684895833334, "learning_rate": 3.578322236207845e-05, "loss": 5.4236, "loss/crossentropy": 2.432409644126892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22839342057704926, "step": 8946 }, { "epoch": 0.7456666666666667, "grad_norm": 4.875, "grad_norm_var": 0.11780192057291666, "learning_rate": 3.576620577249626e-05, "loss": 4.6177, "loss/crossentropy": 1.9744727090001106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.168179078027606, "step": 8948 }, { "epoch": 0.7458333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.13570556640625, "learning_rate": 3.574916015097097e-05, "loss": 4.4341, "loss/crossentropy": 2.073435589671135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18709623627364635, "step": 8950 }, { "epoch": 0.746, "grad_norm": 4.4375, "grad_norm_var": 0.143603515625, "learning_rate": 3.5732085539560965e-05, "loss": 4.8361, "loss/crossentropy": 1.7923277840018272, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17312389239668846, "step": 8952 }, { "epoch": 0.7461666666666666, "grad_norm": 4.75, "grad_norm_var": 0.14683837890625, "learning_rate": 3.5714981980396144e-05, "loss": 4.6879, "loss/crossentropy": 1.9266493320465088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17960952781140804, "step": 8954 }, { "epoch": 0.7463333333333333, "grad_norm": 4.875, "grad_norm_var": 0.1439453125, "learning_rate": 3.5697849515677836e-05, "loss": 4.9524, "loss/crossentropy": 2.126235529780388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18564694002270699, "step": 8956 }, { "epoch": 0.7465, "grad_norm": 4.28125, "grad_norm_var": 0.1525390625, "learning_rate": 3.568068818767869e-05, "loss": 4.7523, "loss/crossentropy": 2.540748953819275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21789639070630074, "step": 8958 }, { "epoch": 0.7466666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.15510660807291668, "learning_rate": 3.5663498038742585e-05, "loss": 5.043, "loss/crossentropy": 2.259571760892868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22068355977535248, "step": 8960 }, { "epoch": 0.7468333333333333, "grad_norm": 5.375, "grad_norm_var": 0.09560139973958333, "learning_rate": 3.564627911128451e-05, "loss": 4.5511, "loss/crossentropy": 1.8589156866073608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19934171438217163, "step": 8962 }, { "epoch": 0.747, "grad_norm": 5.21875, "grad_norm_var": 0.10930582682291666, "learning_rate": 3.562903144779045e-05, "loss": 5.0822, "loss/crossentropy": 1.9405502825975418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18191159516572952, "step": 8964 }, { "epoch": 0.7471666666666666, "grad_norm": 4.75, "grad_norm_var": 0.095947265625, "learning_rate": 3.5611755090817294e-05, "loss": 4.4999, "loss/crossentropy": 2.3405293822288513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21351077407598495, "step": 8966 }, { "epoch": 0.7473333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.107421875, "learning_rate": 3.559445008299276e-05, "loss": 4.3914, "loss/crossentropy": 1.6899859458208084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1662097293883562, "step": 8968 }, { "epoch": 0.7475, "grad_norm": 4.65625, "grad_norm_var": 0.104150390625, "learning_rate": 3.55771164670152e-05, "loss": 4.6844, "loss/crossentropy": 1.4420829191803932, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13939605839550495, "step": 8970 }, { "epoch": 0.7476666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.10428059895833333, "learning_rate": 3.555975428565361e-05, "loss": 5.1911, "loss/crossentropy": 2.3206411004066467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20939678698778152, "step": 8972 }, { "epoch": 0.7478333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.08381754557291667, "learning_rate": 3.554236358174743e-05, "loss": 4.8117, "loss/crossentropy": 1.8067068308591843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19425064884126186, "step": 8974 }, { "epoch": 0.748, "grad_norm": 4.4375, "grad_norm_var": 0.08238525390625, "learning_rate": 3.5524944398206516e-05, "loss": 4.4304, "loss/crossentropy": 2.1453236043453217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19834518060088158, "step": 8976 }, { "epoch": 0.7481666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.045817057291666664, "learning_rate": 3.5507496778010964e-05, "loss": 4.7814, "loss/crossentropy": 2.388603627681732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22290612012147903, "step": 8978 }, { "epoch": 0.7483333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.023563639322916666, "learning_rate": 3.549002076421102e-05, "loss": 4.7707, "loss/crossentropy": 2.538282871246338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2229936309158802, "step": 8980 }, { "epoch": 0.7485, "grad_norm": 5.125, "grad_norm_var": 0.037613932291666666, "learning_rate": 3.5472516399927047e-05, "loss": 5.0232, "loss/crossentropy": 2.1156225353479385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18581297993659973, "step": 8982 }, { "epoch": 0.7486666666666667, "grad_norm": 9.0625, "grad_norm_var": 1.2496053059895833, "learning_rate": 3.5454983728349305e-05, "loss": 4.5365, "loss/crossentropy": 2.3697937726974487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17561548948287964, "step": 8984 }, { "epoch": 0.7488333333333334, "grad_norm": 4.625, "grad_norm_var": 1.249853515625, "learning_rate": 3.543742279273792e-05, "loss": 5.2116, "loss/crossentropy": 2.474073052406311, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21809647977352142, "step": 8986 }, { "epoch": 0.749, "grad_norm": 4.75, "grad_norm_var": 1.2510416666666666, "learning_rate": 3.541983363642275e-05, "loss": 4.8202, "loss/crossentropy": 1.280080884695053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1427099145948887, "step": 8988 }, { "epoch": 0.7491666666666666, "grad_norm": 5.03125, "grad_norm_var": 1.2544108072916667, "learning_rate": 3.5402216302803296e-05, "loss": 4.8564, "loss/crossentropy": 2.147283583879471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20935291796922684, "step": 8990 }, { "epoch": 0.7493333333333333, "grad_norm": 4.84375, "grad_norm_var": 1.2329264322916667, "learning_rate": 3.538457083534858e-05, "loss": 5.1555, "loss/crossentropy": 2.4556437134742737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24310623481869698, "step": 8992 }, { "epoch": 0.7495, "grad_norm": 4.71875, "grad_norm_var": 1.2042277018229166, "learning_rate": 3.536689727759702e-05, "loss": 4.7679, "loss/crossentropy": 2.4080257415771484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2176681011915207, "step": 8994 }, { "epoch": 0.7496666666666667, "grad_norm": 4.59375, "grad_norm_var": 1.193994140625, "learning_rate": 3.5349195673156385e-05, "loss": 4.7875, "loss/crossentropy": 2.589709520339966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2113696150481701, "step": 8996 }, { "epoch": 0.7498333333333334, "grad_norm": 4.8125, "grad_norm_var": 1.192578125, "learning_rate": 3.533146606570362e-05, "loss": 5.1712, "loss/crossentropy": 2.271325647830963, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2238401062786579, "step": 8998 }, { "epoch": 0.75, "grad_norm": 4.75, "grad_norm_var": 0.02838134765625, "learning_rate": 3.531370849898476e-05, "loss": 4.8331, "loss/crossentropy": 1.5186460092663765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.152960903942585, "step": 9000 }, { "epoch": 0.7501666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.03674723307291667, "learning_rate": 3.5295923016814856e-05, "loss": 4.5761, "loss/crossentropy": 1.513818047940731, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13120390567928553, "step": 9002 }, { "epoch": 0.7503333333333333, "grad_norm": 5.46875, "grad_norm_var": 0.08802083333333334, "learning_rate": 3.527810966307779e-05, "loss": 5.1174, "loss/crossentropy": 1.9752911627292633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18689009174704552, "step": 9004 }, { "epoch": 0.7505, "grad_norm": 5.40625, "grad_norm_var": 0.10621337890625, "learning_rate": 3.5260268481726256e-05, "loss": 4.9953, "loss/crossentropy": 2.0769334733486176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2100028581917286, "step": 9006 }, { "epoch": 0.7506666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.129296875, "learning_rate": 3.5242399516781595e-05, "loss": 4.5564, "loss/crossentropy": 1.2513089552521706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14101386442780495, "step": 9008 }, { "epoch": 0.7508333333333334, "grad_norm": 5.09375, "grad_norm_var": 0.12589518229166666, "learning_rate": 3.5224502812333694e-05, "loss": 4.8856, "loss/crossentropy": 1.1583551615476608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1508794203400612, "step": 9010 }, { "epoch": 0.751, "grad_norm": 4.59375, "grad_norm_var": 0.12643229166666667, "learning_rate": 3.520657841254091e-05, "loss": 4.6876, "loss/crossentropy": 2.0375124514102936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22591862082481384, "step": 9012 }, { "epoch": 0.7511666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.130712890625, "learning_rate": 3.51886263616299e-05, "loss": 4.9499, "loss/crossentropy": 2.563260316848755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23242852464318275, "step": 9014 }, { "epoch": 0.7513333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.13368733723958334, "learning_rate": 3.517064670389557e-05, "loss": 5.2829, "loss/crossentropy": 2.5851563215255737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2215106524527073, "step": 9016 }, { "epoch": 0.7515, "grad_norm": 5.1875, "grad_norm_var": 0.12610677083333333, "learning_rate": 3.5152639483700936e-05, "loss": 5.1981, "loss/crossentropy": 2.2794704139232635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125004678964615, "step": 9018 }, { "epoch": 0.7516666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.0740234375, "learning_rate": 3.513460474547703e-05, "loss": 5.3094, "loss/crossentropy": 2.4865044355392456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22369953617453575, "step": 9020 }, { "epoch": 0.7518333333333334, "grad_norm": 4.40625, "grad_norm_var": 0.05260009765625, "learning_rate": 3.5116542533722775e-05, "loss": 4.71, "loss/crossentropy": 2.2633769810199738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2011515162885189, "step": 9022 }, { "epoch": 0.752, "grad_norm": 4.90625, "grad_norm_var": 0.038525390625, "learning_rate": 3.509845289300488e-05, "loss": 4.6829, "loss/crossentropy": 2.047989845275879, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18051733635365963, "step": 9024 }, { "epoch": 0.7521666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.033036295572916666, "learning_rate": 3.5080335867957744e-05, "loss": 4.6169, "loss/crossentropy": 1.6083292067050934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1622084677219391, "step": 9026 }, { "epoch": 0.7523333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.04763997395833333, "learning_rate": 3.50621915032833e-05, "loss": 4.2666, "loss/crossentropy": 2.077538877725601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20892773941159248, "step": 9028 }, { "epoch": 0.7525, "grad_norm": 4.3125, "grad_norm_var": 0.059488932291666664, "learning_rate": 3.5044019843751e-05, "loss": 4.1963, "loss/crossentropy": 2.139458805322647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20242808759212494, "step": 9030 }, { "epoch": 0.7526666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.061812337239583334, "learning_rate": 3.502582093419758e-05, "loss": 5.3637, "loss/crossentropy": 2.3458738029003143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19767314568161964, "step": 9032 }, { "epoch": 0.7528333333333334, "grad_norm": 4.3125, "grad_norm_var": 0.06243489583333333, "learning_rate": 3.5007594819527054e-05, "loss": 5.1482, "loss/crossentropy": 1.56951804459095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15344494953751564, "step": 9034 }, { "epoch": 0.753, "grad_norm": 4.90625, "grad_norm_var": 0.06510416666666667, "learning_rate": 3.4989341544710543e-05, "loss": 5.106, "loss/crossentropy": 2.4358200430870056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20194847881793976, "step": 9036 }, { "epoch": 0.7531666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.06614176432291667, "learning_rate": 3.497106115478618e-05, "loss": 4.6428, "loss/crossentropy": 2.2978862822055817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043941207230091, "step": 9038 }, { "epoch": 0.7533333333333333, "grad_norm": 4.625, "grad_norm_var": 0.0623046875, "learning_rate": 3.495275369485902e-05, "loss": 4.9244, "loss/crossentropy": 2.148787200450897, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20230147242546082, "step": 9040 }, { "epoch": 0.7535, "grad_norm": 5.03125, "grad_norm_var": 0.07042643229166666, "learning_rate": 3.4934419210100906e-05, "loss": 4.5074, "loss/crossentropy": 1.943856105208397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1838915403932333, "step": 9042 }, { "epoch": 0.7536666666666667, "grad_norm": 4.25, "grad_norm_var": 0.07066650390625, "learning_rate": 3.491605774575034e-05, "loss": 4.4208, "loss/crossentropy": 1.7964168190956116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17760272696614265, "step": 9044 }, { "epoch": 0.7538333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.06513264973958334, "learning_rate": 3.489766934711243e-05, "loss": 4.7686, "loss/crossentropy": 2.1065956354141235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23584268614649773, "step": 9046 }, { "epoch": 0.754, "grad_norm": 5.0625, "grad_norm_var": 0.08222249348958334, "learning_rate": 3.487925405955872e-05, "loss": 5.1146, "loss/crossentropy": 2.0449488013982773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19515445083379745, "step": 9048 }, { "epoch": 0.7541666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.06780192057291666, "learning_rate": 3.486081192852708e-05, "loss": 5.348, "loss/crossentropy": 2.6749900579452515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22742577642202377, "step": 9050 }, { "epoch": 0.7543333333333333, "grad_norm": 4.75, "grad_norm_var": 0.06324462890625, "learning_rate": 3.4842342999521644e-05, "loss": 4.9745, "loss/crossentropy": 1.704499438405037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16869292221963406, "step": 9052 }, { "epoch": 0.7545, "grad_norm": 4.65625, "grad_norm_var": 0.07303059895833333, "learning_rate": 3.482384731811267e-05, "loss": 4.5628, "loss/crossentropy": 2.1640761494636536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1985640451312065, "step": 9054 }, { "epoch": 0.7546666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.07727864583333334, "learning_rate": 3.4805324929936394e-05, "loss": 5.0844, "loss/crossentropy": 2.241463929414749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2031102292239666, "step": 9056 }, { "epoch": 0.7548333333333334, "grad_norm": 4.875, "grad_norm_var": 0.07125244140625, "learning_rate": 3.478677588069499e-05, "loss": 5.1096, "loss/crossentropy": 2.2909657061100006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24316620454192162, "step": 9058 }, { "epoch": 0.755, "grad_norm": 5.46875, "grad_norm_var": 0.06946614583333334, "learning_rate": 3.4768200216156374e-05, "loss": 4.8016, "loss/crossentropy": 1.831287831068039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1837274581193924, "step": 9060 }, { "epoch": 0.7551666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.06702067057291666, "learning_rate": 3.4749597982154166e-05, "loss": 4.6735, "loss/crossentropy": 1.2566091194748878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1334764687344432, "step": 9062 }, { "epoch": 0.7553333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.06536051432291666, "learning_rate": 3.4730969224587525e-05, "loss": 5.2192, "loss/crossentropy": 1.172278493642807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15206410735845566, "step": 9064 }, { "epoch": 0.7555, "grad_norm": 5.625, "grad_norm_var": 0.10373942057291667, "learning_rate": 3.471231398942105e-05, "loss": 4.9221, "loss/crossentropy": 1.4909002631902695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15898577868938446, "step": 9066 }, { "epoch": 0.7556666666666667, "grad_norm": 5.0, "grad_norm_var": 0.10611572265625, "learning_rate": 3.469363232268469e-05, "loss": 5.1231, "loss/crossentropy": 2.3849256336688995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2019842453300953, "step": 9068 }, { "epoch": 0.7558333333333334, "grad_norm": 4.625, "grad_norm_var": 0.09842122395833333, "learning_rate": 3.4674924270473607e-05, "loss": 4.3009, "loss/crossentropy": 1.8146369010210037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19062471948564053, "step": 9070 }, { "epoch": 0.756, "grad_norm": 4.96875, "grad_norm_var": 0.09713134765625, "learning_rate": 3.465618987894803e-05, "loss": 4.413, "loss/crossentropy": 2.3734322786331177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2367638796567917, "step": 9072 }, { "epoch": 0.7561666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.10637613932291666, "learning_rate": 3.463742919433323e-05, "loss": 5.1331, "loss/crossentropy": 2.602075159549713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20435499399900436, "step": 9074 }, { "epoch": 0.7563333333333333, "grad_norm": 4.875, "grad_norm_var": 0.08162434895833333, "learning_rate": 3.461864226291934e-05, "loss": 4.7168, "loss/crossentropy": 2.37225678563118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2264081910252571, "step": 9076 }, { "epoch": 0.7565, "grad_norm": 4.5, "grad_norm_var": 0.08435872395833334, "learning_rate": 3.4599829131061225e-05, "loss": 4.6292, "loss/crossentropy": 1.7959840223193169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16916048899292946, "step": 9078 }, { "epoch": 0.7566666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.09055989583333333, "learning_rate": 3.458098984517843e-05, "loss": 4.7667, "loss/crossentropy": 1.5935606062412262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16453899256885052, "step": 9080 }, { "epoch": 0.7568333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.0314453125, "learning_rate": 3.456212445175502e-05, "loss": 5.0477, "loss/crossentropy": 2.24808931350708, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21594270691275597, "step": 9082 }, { "epoch": 0.757, "grad_norm": 4.59375, "grad_norm_var": 0.028450520833333333, "learning_rate": 3.454323299733948e-05, "loss": 4.6522, "loss/crossentropy": 1.8796441107988358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1952054239809513, "step": 9084 }, { "epoch": 0.7571666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.042252604166666666, "learning_rate": 3.452431552854458e-05, "loss": 4.597, "loss/crossentropy": 1.292886197566986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13541504368185997, "step": 9086 }, { "epoch": 0.7573333333333333, "grad_norm": 4.125, "grad_norm_var": 0.04885660807291667, "learning_rate": 3.450537209204731e-05, "loss": 4.3493, "loss/crossentropy": 1.193882331252098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13158982805907726, "step": 9088 }, { "epoch": 0.7575, "grad_norm": 5.0, "grad_norm_var": 0.07349853515625, "learning_rate": 3.44864027345887e-05, "loss": 4.9141, "loss/crossentropy": 2.6352381110191345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20736566931009293, "step": 9090 }, { "epoch": 0.7576666666666667, "grad_norm": 4.9375, "grad_norm_var": 0.07591145833333333, "learning_rate": 3.446740750297378e-05, "loss": 4.9487, "loss/crossentropy": 2.3482372760772705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2141609936952591, "step": 9092 }, { "epoch": 0.7578333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.20416666666666666, "learning_rate": 3.444838644407138e-05, "loss": 5.0464, "loss/crossentropy": 1.3707880079746246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13958771526813507, "step": 9094 }, { "epoch": 0.758, "grad_norm": 5.125, "grad_norm_var": 0.35090738932291665, "learning_rate": 3.442933960481407e-05, "loss": 5.0027, "loss/crossentropy": 1.7247644662857056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777170617133379, "step": 9096 }, { "epoch": 0.7581666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.35113525390625, "learning_rate": 3.441026703219803e-05, "loss": 5.351, "loss/crossentropy": 2.6863598823547363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22101808339357376, "step": 9098 }, { "epoch": 0.7583333333333333, "grad_norm": 5.375, "grad_norm_var": 0.35640869140625, "learning_rate": 3.439116877328294e-05, "loss": 4.6485, "loss/crossentropy": 2.349628359079361, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090320773422718, "step": 9100 }, { "epoch": 0.7585, "grad_norm": 4.71875, "grad_norm_var": 0.3270467122395833, "learning_rate": 3.437204487519186e-05, "loss": 5.0351, "loss/crossentropy": 2.57798308134079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016715221107006, "step": 9102 }, { "epoch": 0.7586666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.27294514973958334, "learning_rate": 3.435289538511111e-05, "loss": 5.1272, "loss/crossentropy": 2.202754706144333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18523726612329483, "step": 9104 }, { "epoch": 0.7588333333333334, "grad_norm": 5.90625, "grad_norm_var": 0.32711181640625, "learning_rate": 3.433372035029015e-05, "loss": 5.1368, "loss/crossentropy": 2.47508043050766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21155068278312683, "step": 9106 }, { "epoch": 0.759, "grad_norm": 4.84375, "grad_norm_var": 0.329541015625, "learning_rate": 3.4314519818041466e-05, "loss": 5.2032, "loss/crossentropy": 2.169353663921356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20374875143170357, "step": 9108 }, { "epoch": 0.7591666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.2648396809895833, "learning_rate": 3.429529383574047e-05, "loss": 4.7644, "loss/crossentropy": 1.9236654192209244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17431189492344856, "step": 9110 }, { "epoch": 0.7593333333333333, "grad_norm": 4.875, "grad_norm_var": 0.128759765625, "learning_rate": 3.4276042450825355e-05, "loss": 4.6065, "loss/crossentropy": 1.8178698271512985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19074127450585365, "step": 9112 }, { "epoch": 0.7595, "grad_norm": 4.53125, "grad_norm_var": 0.13720296223958334, "learning_rate": 3.4256765710797006e-05, "loss": 4.9505, "loss/crossentropy": 1.6263808757066727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17589782178401947, "step": 9114 }, { "epoch": 0.7596666666666667, "grad_norm": 4.625, "grad_norm_var": 0.114453125, "learning_rate": 3.4237463663218853e-05, "loss": 4.7901, "loss/crossentropy": 2.326656460762024, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2385944351553917, "step": 9116 }, { "epoch": 0.7598333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.12330729166666667, "learning_rate": 3.42181363557168e-05, "loss": 4.6892, "loss/crossentropy": 2.12799334526062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20836078003048897, "step": 9118 }, { "epoch": 0.76, "grad_norm": 5.0625, "grad_norm_var": 0.12297770182291666, "learning_rate": 3.4198783835979034e-05, "loss": 5.1581, "loss/crossentropy": 2.33326955139637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1998424343764782, "step": 9120 }, { "epoch": 0.7601666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.04657796223958333, "learning_rate": 3.417940615175599e-05, "loss": 5.0946, "loss/crossentropy": 1.7350738197565079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1729459259659052, "step": 9122 }, { "epoch": 0.7603333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.050679524739583336, "learning_rate": 3.4160003350860176e-05, "loss": 4.9605, "loss/crossentropy": 2.4162497520446777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20327191054821014, "step": 9124 }, { "epoch": 0.7605, "grad_norm": 4.46875, "grad_norm_var": 0.05579427083333333, "learning_rate": 3.4140575481166066e-05, "loss": 4.5611, "loss/crossentropy": 1.4746525883674622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13860525004565716, "step": 9126 }, { "epoch": 0.7606666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.04638264973958333, "learning_rate": 3.412112259061e-05, "loss": 4.2153, "loss/crossentropy": 1.289049193263054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16961247101426125, "step": 9128 }, { "epoch": 0.7608333333333334, "grad_norm": 4.625, "grad_norm_var": 0.0521484375, "learning_rate": 3.410164472719005e-05, "loss": 4.4881, "loss/crossentropy": 1.420512616634369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15469906572252512, "step": 9130 }, { "epoch": 0.761, "grad_norm": 4.875, "grad_norm_var": 0.050390625, "learning_rate": 3.4082141938965915e-05, "loss": 5.0561, "loss/crossentropy": 2.3881621956825256, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22114987671375275, "step": 9132 }, { "epoch": 0.7611666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.0599609375, "learning_rate": 3.406261427405878e-05, "loss": 4.7353, "loss/crossentropy": 2.0259829089045525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19267887063324451, "step": 9134 }, { "epoch": 0.7613333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.05152587890625, "learning_rate": 3.404306178065121e-05, "loss": 4.8248, "loss/crossentropy": 2.682315409183502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208783358335495, "step": 9136 }, { "epoch": 0.7615, "grad_norm": 4.5, "grad_norm_var": 0.05089518229166667, "learning_rate": 3.4023484506987064e-05, "loss": 4.6531, "loss/crossentropy": 2.019272468984127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19835746102035046, "step": 9138 }, { "epoch": 0.7616666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.04771728515625, "learning_rate": 3.4003882501371296e-05, "loss": 4.4325, "loss/crossentropy": 1.8939252644777298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1691152397543192, "step": 9140 }, { "epoch": 0.7618333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.04503580729166667, "learning_rate": 3.39842558121699e-05, "loss": 4.8541, "loss/crossentropy": 1.8951895460486412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1793217770755291, "step": 9142 }, { "epoch": 0.762, "grad_norm": 4.78125, "grad_norm_var": 0.042704264322916664, "learning_rate": 3.3964604487809806e-05, "loss": 5.0357, "loss/crossentropy": 1.9603685438632965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17539142072200775, "step": 9144 }, { "epoch": 0.7621666666666667, "grad_norm": 4.75, "grad_norm_var": 0.03372395833333333, "learning_rate": 3.3944928576778694e-05, "loss": 4.9157, "loss/crossentropy": 1.379582405090332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1512586548924446, "step": 9146 }, { "epoch": 0.7623333333333333, "grad_norm": 4.75, "grad_norm_var": 0.0396484375, "learning_rate": 3.39252281276249e-05, "loss": 4.8829, "loss/crossentropy": 1.871251530945301, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19466626271605492, "step": 9148 }, { "epoch": 0.7625, "grad_norm": 4.4375, "grad_norm_var": 0.028369140625, "learning_rate": 3.3905503188957354e-05, "loss": 4.3252, "loss/crossentropy": 1.522882029414177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1446057464927435, "step": 9150 }, { "epoch": 0.7626666666666667, "grad_norm": 5.25, "grad_norm_var": 0.08661702473958334, "learning_rate": 3.388575380944535e-05, "loss": 5.5142, "loss/crossentropy": 2.8795396983623505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21250687539577484, "step": 9152 }, { "epoch": 0.7628333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.08860677083333333, "learning_rate": 3.386598003781855e-05, "loss": 5.0395, "loss/crossentropy": 2.2749998569488525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20907314494252205, "step": 9154 }, { "epoch": 0.763, "grad_norm": 5.15625, "grad_norm_var": 0.08435872395833334, "learning_rate": 3.3846181922866746e-05, "loss": 4.9776, "loss/crossentropy": 1.7697783410549164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16960179060697556, "step": 9156 }, { "epoch": 0.7631666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.08255208333333333, "learning_rate": 3.382635951343983e-05, "loss": 4.7008, "loss/crossentropy": 1.9313219785690308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19448504596948624, "step": 9158 }, { "epoch": 0.7633333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.07860921223958334, "learning_rate": 3.3806512858447626e-05, "loss": 4.5204, "loss/crossentropy": 1.810696929693222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1649162843823433, "step": 9160 }, { "epoch": 0.7635, "grad_norm": 4.59375, "grad_norm_var": 0.07343343098958334, "learning_rate": 3.378664200685978e-05, "loss": 4.6922, "loss/crossentropy": 1.2390378192067146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14889622665941715, "step": 9162 }, { "epoch": 0.7636666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.08101806640625, "learning_rate": 3.376674700770564e-05, "loss": 4.617, "loss/crossentropy": 1.7813236862421036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18585302121937275, "step": 9164 }, { "epoch": 0.7638333333333334, "grad_norm": 4.5, "grad_norm_var": 0.07858072916666667, "learning_rate": 3.3746827910074154e-05, "loss": 4.4724, "loss/crossentropy": 1.8114677891135216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18985752575099468, "step": 9166 }, { "epoch": 0.764, "grad_norm": 4.53125, "grad_norm_var": 0.039306640625, "learning_rate": 3.3726884763113693e-05, "loss": 5.1584, "loss/crossentropy": 1.3699896410107613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.154366759583354, "step": 9168 }, { "epoch": 0.7641666666666667, "grad_norm": 5.25, "grad_norm_var": 0.07047119140625, "learning_rate": 3.3706917616032e-05, "loss": 5.6044, "loss/crossentropy": 1.8340007662773132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20330247655510902, "step": 9170 }, { "epoch": 0.7643333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.0587890625, "learning_rate": 3.3686926518096026e-05, "loss": 5.2916, "loss/crossentropy": 2.0827116072177887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1957671195268631, "step": 9172 }, { "epoch": 0.7645, "grad_norm": 4.28125, "grad_norm_var": 0.08700764973958333, "learning_rate": 3.366691151863182e-05, "loss": 4.1972, "loss/crossentropy": 1.7677662521600723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18560653924942017, "step": 9174 }, { "epoch": 0.7646666666666667, "grad_norm": 4.875, "grad_norm_var": 0.08938395182291667, "learning_rate": 3.36468726670244e-05, "loss": 4.7011, "loss/crossentropy": 1.5455302894115448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1788640320301056, "step": 9176 }, { "epoch": 0.7648333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.10976155598958333, "learning_rate": 3.3626810012717646e-05, "loss": 5.3509, "loss/crossentropy": 1.846191093325615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2068962249904871, "step": 9178 }, { "epoch": 0.765, "grad_norm": 4.59375, "grad_norm_var": 0.11092122395833333, "learning_rate": 3.360672360521415e-05, "loss": 4.898, "loss/crossentropy": 1.9306019470095634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1716877445578575, "step": 9180 }, { "epoch": 0.7651666666666667, "grad_norm": 4.09375, "grad_norm_var": 0.13440348307291666, "learning_rate": 3.3586613494075135e-05, "loss": 4.7658, "loss/crossentropy": 2.1522144228219986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1841568574309349, "step": 9182 }, { "epoch": 0.7653333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.13489176432291666, "learning_rate": 3.356647972892031e-05, "loss": 4.8803, "loss/crossentropy": 1.5114280879497528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15638689696788788, "step": 9184 }, { "epoch": 0.7655, "grad_norm": 4.90625, "grad_norm_var": 0.12597249348958334, "learning_rate": 3.3546322359427726e-05, "loss": 5.2088, "loss/crossentropy": 1.538307212293148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17916510254144669, "step": 9186 }, { "epoch": 0.7656666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.13098551432291666, "learning_rate": 3.3526141435333684e-05, "loss": 5.1529, "loss/crossentropy": 1.8003139421343803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1915070228278637, "step": 9188 }, { "epoch": 0.7658333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.10657552083333334, "learning_rate": 3.350593700643262e-05, "loss": 5.0874, "loss/crossentropy": 1.7107343226671219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16837585344910622, "step": 9190 }, { "epoch": 0.766, "grad_norm": 4.5, "grad_norm_var": 0.13522135416666667, "learning_rate": 3.348570912257695e-05, "loss": 4.7146, "loss/crossentropy": 2.154672235250473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2126365266740322, "step": 9192 }, { "epoch": 0.7661666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.12069905598958333, "learning_rate": 3.346545783367697e-05, "loss": 4.6693, "loss/crossentropy": 2.692975878715515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21004782244563103, "step": 9194 }, { "epoch": 0.7663333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.11728108723958333, "learning_rate": 3.3445183189700716e-05, "loss": 4.913, "loss/crossentropy": 1.548985317349434, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15373594500124454, "step": 9196 }, { "epoch": 0.7665, "grad_norm": 4.78125, "grad_norm_var": 0.10428059895833333, "learning_rate": 3.3424885240673866e-05, "loss": 4.9349, "loss/crossentropy": 1.9586029201745987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17618397064507008, "step": 9198 }, { "epoch": 0.7666666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.09163004557291667, "learning_rate": 3.340456403667958e-05, "loss": 4.3688, "loss/crossentropy": 1.9201075732707977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19281110912561417, "step": 9200 }, { "epoch": 0.7668333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.08001302083333334, "learning_rate": 3.338421962785841e-05, "loss": 4.7653, "loss/crossentropy": 1.4313116371631622, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1664351150393486, "step": 9202 }, { "epoch": 0.767, "grad_norm": 4.96875, "grad_norm_var": 0.07958577473958334, "learning_rate": 3.3363852064408165e-05, "loss": 5.2229, "loss/crossentropy": 2.160913795232773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19118301942944527, "step": 9204 }, { "epoch": 0.7671666666666667, "grad_norm": 4.75, "grad_norm_var": 0.08670247395833333, "learning_rate": 3.3343461396583784e-05, "loss": 4.8138, "loss/crossentropy": 1.6962665170431137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16171543672680855, "step": 9206 }, { "epoch": 0.7673333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.06145833333333333, "learning_rate": 3.3323047674697224e-05, "loss": 5.163, "loss/crossentropy": 2.673032522201538, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21397614479064941, "step": 9208 }, { "epoch": 0.7675, "grad_norm": 4.78125, "grad_norm_var": 0.06483968098958333, "learning_rate": 3.330261094911729e-05, "loss": 4.8433, "loss/crossentropy": 1.8518261313438416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22120384871959686, "step": 9210 }, { "epoch": 0.7676666666666667, "grad_norm": 4.625, "grad_norm_var": 0.06728108723958333, "learning_rate": 3.328215127026959e-05, "loss": 4.8807, "loss/crossentropy": 1.8135938048362732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18345564603805542, "step": 9212 }, { "epoch": 0.7678333333333334, "grad_norm": 4.3125, "grad_norm_var": 0.05364583333333333, "learning_rate": 3.326166868863634e-05, "loss": 4.4959, "loss/crossentropy": 1.942594051361084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18999501317739487, "step": 9214 }, { "epoch": 0.768, "grad_norm": 4.5625, "grad_norm_var": 0.05480143229166667, "learning_rate": 3.324116325475628e-05, "loss": 4.7555, "loss/crossentropy": 1.651905320584774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13776842784136534, "step": 9216 }, { "epoch": 0.7681666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.05709228515625, "learning_rate": 3.322063501922453e-05, "loss": 5.1112, "loss/crossentropy": 2.402420222759247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22454120218753815, "step": 9218 }, { "epoch": 0.7683333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.055582682291666664, "learning_rate": 3.320008403269246e-05, "loss": 4.7769, "loss/crossentropy": 1.7973673362284899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15573907841462642, "step": 9220 }, { "epoch": 0.7685, "grad_norm": 4.4375, "grad_norm_var": 0.06456705729166666, "learning_rate": 3.317951034586759e-05, "loss": 4.6862, "loss/crossentropy": 2.4214308857917786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21835685148835182, "step": 9222 }, { "epoch": 0.7686666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.05526936848958333, "learning_rate": 3.315891400951346e-05, "loss": 4.6974, "loss/crossentropy": 1.9672070443630219, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1876031868159771, "step": 9224 }, { "epoch": 0.7688333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.055338541666666664, "learning_rate": 3.313829507444946e-05, "loss": 4.8264, "loss/crossentropy": 1.510596327483654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15548838302493095, "step": 9226 }, { "epoch": 0.769, "grad_norm": 4.625, "grad_norm_var": 0.07864176432291667, "learning_rate": 3.311765359155079e-05, "loss": 4.6752, "loss/crossentropy": 2.789728343486786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2263692542910576, "step": 9228 }, { "epoch": 0.7691666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.05896809895833333, "learning_rate": 3.309698961174823e-05, "loss": 4.2339, "loss/crossentropy": 2.3551100194454193, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22738180309534073, "step": 9230 }, { "epoch": 0.7693333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.058577473958333334, "learning_rate": 3.307630318602811e-05, "loss": 5.6364, "loss/crossentropy": 2.2606790959835052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2105814702808857, "step": 9232 }, { "epoch": 0.7695, "grad_norm": 4.4375, "grad_norm_var": 0.062890625, "learning_rate": 3.3055594365432124e-05, "loss": 4.6461, "loss/crossentropy": 1.849206268787384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17526361718773842, "step": 9234 }, { "epoch": 0.7696666666666667, "grad_norm": 5.0, "grad_norm_var": 0.074853515625, "learning_rate": 3.303486320105724e-05, "loss": 4.9883, "loss/crossentropy": 1.5751914456486702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15906815975904465, "step": 9236 }, { "epoch": 0.7698333333333334, "grad_norm": 5.0, "grad_norm_var": 0.06951497395833334, "learning_rate": 3.3014109744055524e-05, "loss": 5.1432, "loss/crossentropy": 1.3298326507210732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15809830278158188, "step": 9238 }, { "epoch": 0.77, "grad_norm": 5.0, "grad_norm_var": 0.06217041015625, "learning_rate": 3.29933340456341e-05, "loss": 5.0645, "loss/crossentropy": 1.6231756582856178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22281446680426598, "step": 9240 }, { "epoch": 0.7701666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.06326497395833333, "learning_rate": 3.29725361570549e-05, "loss": 5.0422, "loss/crossentropy": 2.274557799100876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104065828025341, "step": 9242 }, { "epoch": 0.7703333333333333, "grad_norm": 5.0, "grad_norm_var": 0.04876302083333333, "learning_rate": 3.2951716129634675e-05, "loss": 5.0096, "loss/crossentropy": 1.4321745932102203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16160675697028637, "step": 9244 }, { "epoch": 0.7705, "grad_norm": 4.5, "grad_norm_var": 0.049609375, "learning_rate": 3.293087401474476e-05, "loss": 4.3341, "loss/crossentropy": 1.8292163461446762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17593476921319962, "step": 9246 }, { "epoch": 0.7706666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.04986979166666667, "learning_rate": 3.291000986381101e-05, "loss": 5.4821, "loss/crossentropy": 2.490095376968384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2170156165957451, "step": 9248 }, { "epoch": 0.7708333333333334, "grad_norm": 5.03125, "grad_norm_var": 0.04694010416666667, "learning_rate": 3.288912372831364e-05, "loss": 5.1315, "loss/crossentropy": 2.3378700017929077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23920363187789917, "step": 9250 }, { "epoch": 0.771, "grad_norm": 4.96875, "grad_norm_var": 0.035791015625, "learning_rate": 3.286821565978711e-05, "loss": 5.3335, "loss/crossentropy": 2.1893119513988495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20198990032076836, "step": 9252 }, { "epoch": 0.7711666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.0548828125, "learning_rate": 3.284728570982e-05, "loss": 4.9941, "loss/crossentropy": 1.962324395775795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17419682629406452, "step": 9254 }, { "epoch": 0.7713333333333333, "grad_norm": 4.875, "grad_norm_var": 0.0517578125, "learning_rate": 3.282633393005489e-05, "loss": 5.0291, "loss/crossentropy": 1.6728404238820076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20481682196259499, "step": 9256 }, { "epoch": 0.7715, "grad_norm": 4.59375, "grad_norm_var": 0.05419514973958333, "learning_rate": 3.28053603721882e-05, "loss": 5.759, "loss/crossentropy": 1.7875987961888313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16401218622922897, "step": 9258 }, { "epoch": 0.7716666666666666, "grad_norm": 4.875, "grad_norm_var": 0.05237223307291667, "learning_rate": 3.278436508797011e-05, "loss": 4.9107, "loss/crossentropy": 2.0292908400297165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.175339225679636, "step": 9260 }, { "epoch": 0.7718333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.04108072916666667, "learning_rate": 3.2763348129204396e-05, "loss": 4.749, "loss/crossentropy": 1.8519446104764938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16933886520564556, "step": 9262 }, { "epoch": 0.772, "grad_norm": 4.65625, "grad_norm_var": 0.05286051432291667, "learning_rate": 3.2742309547748314e-05, "loss": 5.0763, "loss/crossentropy": 2.253142923116684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19896778464317322, "step": 9264 }, { "epoch": 0.7721666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.06002604166666667, "learning_rate": 3.272124939551247e-05, "loss": 5.3582, "loss/crossentropy": 2.13630610704422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20657182484865189, "step": 9266 }, { "epoch": 0.7723333333333333, "grad_norm": 4.875, "grad_norm_var": 0.053238932291666666, "learning_rate": 3.2700167724460685e-05, "loss": 4.8069, "loss/crossentropy": 1.8119681552052498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17984369210898876, "step": 9268 }, { "epoch": 0.7725, "grad_norm": 4.875, "grad_norm_var": 0.03173421223958333, "learning_rate": 3.26790645866099e-05, "loss": 5.1939, "loss/crossentropy": 2.4118016362190247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21238981559872627, "step": 9270 }, { "epoch": 0.7726666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.04440104166666667, "learning_rate": 3.265794003403002e-05, "loss": 4.5605, "loss/crossentropy": 2.2320240437984467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1951523795723915, "step": 9272 }, { "epoch": 0.7728333333333334, "grad_norm": 6.375, "grad_norm_var": 0.18730061848958332, "learning_rate": 3.263679411884375e-05, "loss": 5.361, "loss/crossentropy": 1.4449108317494392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16431713104248047, "step": 9274 }, { "epoch": 0.773, "grad_norm": 4.875, "grad_norm_var": 0.18987223307291667, "learning_rate": 3.2615626893226564e-05, "loss": 5.3566, "loss/crossentropy": 2.320689380168915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18379362672567368, "step": 9276 }, { "epoch": 0.7731666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.19381510416666667, "learning_rate": 3.2594438409406475e-05, "loss": 5.102, "loss/crossentropy": 1.8211688697338104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20534205064177513, "step": 9278 }, { "epoch": 0.7733333333333333, "grad_norm": 5.25, "grad_norm_var": 0.19495035807291666, "learning_rate": 3.2573228719663944e-05, "loss": 5.5899, "loss/crossentropy": 2.3344379365444183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23429518193006516, "step": 9280 }, { "epoch": 0.7735, "grad_norm": 4.71875, "grad_norm_var": 0.20771077473958333, "learning_rate": 3.2551997876331805e-05, "loss": 4.9082, "loss/crossentropy": 1.7352554872632027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16921756975352764, "step": 9282 }, { "epoch": 0.7736666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.21252848307291666, "learning_rate": 3.253074593179502e-05, "loss": 4.9955, "loss/crossentropy": 2.216016709804535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23066405951976776, "step": 9284 }, { "epoch": 0.7738333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.22278238932291666, "learning_rate": 3.2509472938490674e-05, "loss": 4.9624, "loss/crossentropy": 1.9695368334650993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1958236563950777, "step": 9286 }, { "epoch": 0.774, "grad_norm": 4.9375, "grad_norm_var": 0.23668212890625, "learning_rate": 3.2488178948907746e-05, "loss": 4.6204, "loss/crossentropy": 1.8463789224624634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17317039147019386, "step": 9288 }, { "epoch": 0.7741666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.07766927083333333, "learning_rate": 3.2466864015587054e-05, "loss": 4.4168, "loss/crossentropy": 1.736086145043373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16505318135023117, "step": 9290 }, { "epoch": 0.7743333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.07763264973958334, "learning_rate": 3.244552819112107e-05, "loss": 4.71, "loss/crossentropy": 1.9710333943367004, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19168449193239212, "step": 9292 }, { "epoch": 0.7745, "grad_norm": 4.71875, "grad_norm_var": 0.057145182291666666, "learning_rate": 3.242417152815381e-05, "loss": 5.183, "loss/crossentropy": 2.472516894340515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2013029269874096, "step": 9294 }, { "epoch": 0.7746666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.04885660807291667, "learning_rate": 3.240279407938074e-05, "loss": 5.2434, "loss/crossentropy": 2.4441158175468445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029016651213169, "step": 9296 }, { "epoch": 0.7748333333333334, "grad_norm": 4.40625, "grad_norm_var": 0.0490234375, "learning_rate": 3.2381395897548563e-05, "loss": 5.1106, "loss/crossentropy": 1.6266438364982605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16166551038622856, "step": 9298 }, { "epoch": 0.775, "grad_norm": 4.71875, "grad_norm_var": 0.06200764973958333, "learning_rate": 3.2359977035455185e-05, "loss": 4.833, "loss/crossentropy": 1.9613151028752327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19063337706029415, "step": 9300 }, { "epoch": 0.7751666666666667, "grad_norm": 6.0, "grad_norm_var": 0.158203125, "learning_rate": 3.233853754594951e-05, "loss": 5.113, "loss/crossentropy": 2.270957499742508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21738599985837936, "step": 9302 }, { "epoch": 0.7753333333333333, "grad_norm": 4.375, "grad_norm_var": 0.148828125, "learning_rate": 3.2317077481931355e-05, "loss": 4.8684, "loss/crossentropy": 1.7900439202785492, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20083611086010933, "step": 9304 }, { "epoch": 0.7755, "grad_norm": 5.0625, "grad_norm_var": 0.1513671875, "learning_rate": 3.229559689635129e-05, "loss": 4.8177, "loss/crossentropy": 1.4736758545041084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13412130996584892, "step": 9306 }, { "epoch": 0.7756666666666666, "grad_norm": 4.375, "grad_norm_var": 0.186572265625, "learning_rate": 3.227409584221052e-05, "loss": 4.3715, "loss/crossentropy": 1.3929030001163483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14130380935966969, "step": 9308 }, { "epoch": 0.7758333333333334, "grad_norm": 4.5, "grad_norm_var": 0.19280192057291667, "learning_rate": 3.225257437256076e-05, "loss": 5.3773, "loss/crossentropy": 2.001873791217804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029735241085291, "step": 9310 }, { "epoch": 0.776, "grad_norm": 4.625, "grad_norm_var": 0.18925374348958332, "learning_rate": 3.22310325405041e-05, "loss": 5.4514, "loss/crossentropy": 2.3293404579162598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19467134773731232, "step": 9312 }, { "epoch": 0.7761666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.18466389973958333, "learning_rate": 3.220947039919288e-05, "loss": 4.6479, "loss/crossentropy": 1.5509610995650291, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20042119547724724, "step": 9314 }, { "epoch": 0.7763333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.19384358723958334, "learning_rate": 3.218788800182952e-05, "loss": 4.2091, "loss/crossentropy": 1.2999482825398445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1503276266157627, "step": 9316 }, { "epoch": 0.7765, "grad_norm": 5.5, "grad_norm_var": 0.12343343098958333, "learning_rate": 3.216628540166645e-05, "loss": 4.8884, "loss/crossentropy": 1.7634951025247574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17525204457342625, "step": 9318 }, { "epoch": 0.7766666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.10983072916666667, "learning_rate": 3.214466265200595e-05, "loss": 5.2293, "loss/crossentropy": 2.389511853456497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21278628706932068, "step": 9320 }, { "epoch": 0.7768333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.10089518229166666, "learning_rate": 3.212301980619998e-05, "loss": 4.6147, "loss/crossentropy": 2.306184262037277, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19868546724319458, "step": 9322 }, { "epoch": 0.777, "grad_norm": 4.375, "grad_norm_var": 0.08619791666666667, "learning_rate": 3.210135691765012e-05, "loss": 4.4001, "loss/crossentropy": 1.817717507481575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16615832969546318, "step": 9324 }, { "epoch": 0.7771666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.09478759765625, "learning_rate": 3.2079674039807404e-05, "loss": 4.6573, "loss/crossentropy": 1.8692854642868042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17973013408482075, "step": 9326 }, { "epoch": 0.7773333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.09153238932291667, "learning_rate": 3.2057971226172174e-05, "loss": 5.1534, "loss/crossentropy": 2.0862750113010406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20505443587899208, "step": 9328 }, { "epoch": 0.7775, "grad_norm": 4.59375, "grad_norm_var": 0.08401285807291667, "learning_rate": 3.203624853029396e-05, "loss": 5.1276, "loss/crossentropy": 2.589634597301483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21999847516417503, "step": 9330 }, { "epoch": 0.7776666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.08013916015625, "learning_rate": 3.2014506005771364e-05, "loss": 4.9304, "loss/crossentropy": 2.1367976665496826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22664149850606918, "step": 9332 }, { "epoch": 0.7778333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.04104410807291667, "learning_rate": 3.199274370625189e-05, "loss": 4.6075, "loss/crossentropy": 1.7962630540132523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17372861318290234, "step": 9334 }, { "epoch": 0.778, "grad_norm": 4.34375, "grad_norm_var": 0.04996337890625, "learning_rate": 3.197096168543186e-05, "loss": 4.6964, "loss/crossentropy": 1.823552280664444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19311099871993065, "step": 9336 }, { "epoch": 0.7781666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.04934895833333333, "learning_rate": 3.1949159997056235e-05, "loss": 4.4811, "loss/crossentropy": 1.6573946326971054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16165916807949543, "step": 9338 }, { "epoch": 0.7783333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.05517171223958333, "learning_rate": 3.192733869491853e-05, "loss": 4.574, "loss/crossentropy": 2.3132286369800568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21056906878948212, "step": 9340 }, { "epoch": 0.7785, "grad_norm": 4.6875, "grad_norm_var": 0.04735921223958333, "learning_rate": 3.190549783286062e-05, "loss": 4.2814, "loss/crossentropy": 1.885126568377018, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18343673273921013, "step": 9342 }, { "epoch": 0.7786666666666666, "grad_norm": 4.625, "grad_norm_var": 0.042708333333333334, "learning_rate": 3.1883637464772665e-05, "loss": 4.2948, "loss/crossentropy": 1.6604382917284966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1493774987757206, "step": 9344 }, { "epoch": 0.7788333333333334, "grad_norm": 5.46875, "grad_norm_var": 0.08863525390625, "learning_rate": 3.1861757644592963e-05, "loss": 5.6045, "loss/crossentropy": 2.4773399233818054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2180468514561653, "step": 9346 }, { "epoch": 0.779, "grad_norm": 4.8125, "grad_norm_var": 0.08704020182291666, "learning_rate": 3.1839858426307784e-05, "loss": 4.8078, "loss/crossentropy": 2.260939121246338, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.197475366294384, "step": 9348 }, { "epoch": 0.7791666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.07862955729166667, "learning_rate": 3.1817939863951284e-05, "loss": 4.7583, "loss/crossentropy": 1.5021531581878662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16176176443696022, "step": 9350 }, { "epoch": 0.7793333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.08121337890625, "learning_rate": 3.179600201160532e-05, "loss": 4.8546, "loss/crossentropy": 1.4230839125812054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14267605170607567, "step": 9352 }, { "epoch": 0.7795, "grad_norm": 4.625, "grad_norm_var": 0.091259765625, "learning_rate": 3.177404492339937e-05, "loss": 5.7562, "loss/crossentropy": 2.712648332118988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20741987600922585, "step": 9354 }, { "epoch": 0.7796666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.08375244140625, "learning_rate": 3.175206865351038e-05, "loss": 4.9446, "loss/crossentropy": 2.2300324141979218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21562551707029343, "step": 9356 }, { "epoch": 0.7798333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.08150634765625, "learning_rate": 3.173007325616258e-05, "loss": 4.9502, "loss/crossentropy": 2.4897924661636353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22505389899015427, "step": 9358 }, { "epoch": 0.78, "grad_norm": 4.6875, "grad_norm_var": 0.09602457682291667, "learning_rate": 3.170805878562745e-05, "loss": 5.3735, "loss/crossentropy": 1.269807867705822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1314986441284418, "step": 9360 }, { "epoch": 0.7801666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.07063395182291667, "learning_rate": 3.1686025296223505e-05, "loss": 4.8595, "loss/crossentropy": 1.7563334554433823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16725478693842888, "step": 9362 }, { "epoch": 0.7803333333333333, "grad_norm": 4.5, "grad_norm_var": 0.069140625, "learning_rate": 3.166397284231618e-05, "loss": 4.3453, "loss/crossentropy": 2.3237995505332947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043895721435547, "step": 9364 }, { "epoch": 0.7805, "grad_norm": 4.59375, "grad_norm_var": 0.06842041015625, "learning_rate": 3.1641901478317725e-05, "loss": 4.8902, "loss/crossentropy": 2.6030715703964233, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23151734843850136, "step": 9366 }, { "epoch": 0.7806666666666666, "grad_norm": 5.375, "grad_norm_var": 0.07511393229166667, "learning_rate": 3.1619811258687035e-05, "loss": 4.9789, "loss/crossentropy": 1.7157761678099632, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061714269220829, "step": 9368 }, { "epoch": 0.7808333333333334, "grad_norm": 5.125, "grad_norm_var": 0.07776285807291666, "learning_rate": 3.159770223792952e-05, "loss": 4.9378, "loss/crossentropy": 2.217753827571869, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20972023904323578, "step": 9370 }, { "epoch": 0.781, "grad_norm": 4.3125, "grad_norm_var": 0.09915364583333333, "learning_rate": 3.1575574470596996e-05, "loss": 4.2703, "loss/crossentropy": 0.9079168289899826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10304674133658409, "step": 9372 }, { "epoch": 0.7811666666666667, "grad_norm": 4.625, "grad_norm_var": 0.09931233723958334, "learning_rate": 3.155342801128754e-05, "loss": 4.9554, "loss/crossentropy": 1.7163361012935638, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16222969815135002, "step": 9374 }, { "epoch": 0.7813333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.09996337890625, "learning_rate": 3.153126291464533e-05, "loss": 4.9338, "loss/crossentropy": 1.8250629603862762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23775455728173256, "step": 9376 }, { "epoch": 0.7815, "grad_norm": 4.9375, "grad_norm_var": 0.08956705729166667, "learning_rate": 3.1509079235360534e-05, "loss": 4.9173, "loss/crossentropy": 2.315726935863495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20576253160834312, "step": 9378 }, { "epoch": 0.7816666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.08209635416666666, "learning_rate": 3.1486877028169174e-05, "loss": 4.8982, "loss/crossentropy": 1.8110148012638092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19745264574885368, "step": 9380 }, { "epoch": 0.7818333333333334, "grad_norm": 6.15625, "grad_norm_var": 0.20013020833333334, "learning_rate": 3.146465634785301e-05, "loss": 4.7892, "loss/crossentropy": 1.715107500553131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18671299517154694, "step": 9382 }, { "epoch": 0.782, "grad_norm": 4.9375, "grad_norm_var": 0.18372395833333333, "learning_rate": 3.144241724923934e-05, "loss": 4.0597, "loss/crossentropy": 2.425957441329956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160644568502903, "step": 9384 }, { "epoch": 0.7821666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.18396809895833333, "learning_rate": 3.1420159787200934e-05, "loss": 5.1833, "loss/crossentropy": 1.9202543646097183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16482832096517086, "step": 9386 }, { "epoch": 0.7823333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.170556640625, "learning_rate": 3.1397884016655876e-05, "loss": 4.7629, "loss/crossentropy": 2.2666742503643036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2014549858868122, "step": 9388 }, { "epoch": 0.7825, "grad_norm": 4.96875, "grad_norm_var": 0.17939046223958333, "learning_rate": 3.13755899925674e-05, "loss": 4.6324, "loss/crossentropy": 1.1317738816142082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14520836994051933, "step": 9390 }, { "epoch": 0.7826666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.18013916015625, "learning_rate": 3.1353277769943815e-05, "loss": 4.8204, "loss/crossentropy": 1.505985789000988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1771007440984249, "step": 9392 }, { "epoch": 0.7828333333333334, "grad_norm": 4.15625, "grad_norm_var": 0.225390625, "learning_rate": 3.133094740383829e-05, "loss": 3.7728, "loss/crossentropy": 1.7403645440936089, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1842222698032856, "step": 9394 }, { "epoch": 0.783, "grad_norm": 4.4375, "grad_norm_var": 0.23824462890625, "learning_rate": 3.1308598949348796e-05, "loss": 4.7321, "loss/crossentropy": 1.2648060396313667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13886748626828194, "step": 9396 }, { "epoch": 0.7831666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.1080078125, "learning_rate": 3.1286232461617926e-05, "loss": 4.4106, "loss/crossentropy": 2.309142082929611, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22487016394734383, "step": 9398 }, { "epoch": 0.7833333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.09801025390625, "learning_rate": 3.1263847995832755e-05, "loss": 4.8193, "loss/crossentropy": 2.0457848384976387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1811012402176857, "step": 9400 }, { "epoch": 0.7835, "grad_norm": 4.84375, "grad_norm_var": 0.07929280598958334, "learning_rate": 3.124144560722473e-05, "loss": 4.9667, "loss/crossentropy": 2.0342873632907867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22702273726463318, "step": 9402 }, { "epoch": 0.7836666666666666, "grad_norm": 4.5, "grad_norm_var": 0.07317708333333334, "learning_rate": 3.1219025351069524e-05, "loss": 4.3217, "loss/crossentropy": 1.3957015573978424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.142100278288126, "step": 9404 }, { "epoch": 0.7838333333333334, "grad_norm": 5.28125, "grad_norm_var": 0.08587239583333334, "learning_rate": 3.119658728268689e-05, "loss": 4.8953, "loss/crossentropy": 1.9280966818332672, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1804770566523075, "step": 9406 }, { "epoch": 0.784, "grad_norm": 4.34375, "grad_norm_var": 0.09166666666666666, "learning_rate": 3.1174131457440524e-05, "loss": 5.0648, "loss/crossentropy": 2.139180600643158, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20616818591952324, "step": 9408 }, { "epoch": 0.7841666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.055497233072916666, "learning_rate": 3.115165793073795e-05, "loss": 5.0601, "loss/crossentropy": 2.2152554094791412, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19830195233225822, "step": 9410 }, { "epoch": 0.7843333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.04425455729166667, "learning_rate": 3.1129166758030344e-05, "loss": 4.9589, "loss/crossentropy": 1.9690501242876053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18640769086778164, "step": 9412 }, { "epoch": 0.7845, "grad_norm": 4.625, "grad_norm_var": 0.041910807291666664, "learning_rate": 3.110665799481246e-05, "loss": 5.2133, "loss/crossentropy": 2.339284062385559, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21954040229320526, "step": 9414 }, { "epoch": 0.7846666666666666, "grad_norm": 5.0, "grad_norm_var": 0.05175374348958333, "learning_rate": 3.1084131696622435e-05, "loss": 5.2811, "loss/crossentropy": 1.925955355167389, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19665737450122833, "step": 9416 }, { "epoch": 0.7848333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.052978515625, "learning_rate": 3.106158791904164e-05, "loss": 5.0989, "loss/crossentropy": 1.8825834766030312, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18833110481500626, "step": 9418 }, { "epoch": 0.785, "grad_norm": 4.625, "grad_norm_var": 0.05054931640625, "learning_rate": 3.103902671769465e-05, "loss": 4.9473, "loss/crossentropy": 2.5766605734825134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20159252732992172, "step": 9420 }, { "epoch": 0.7851666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.035791015625, "learning_rate": 3.1016448148248955e-05, "loss": 5.0349, "loss/crossentropy": 1.7484197169542313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18348832800984383, "step": 9422 }, { "epoch": 0.7853333333333333, "grad_norm": 4.625, "grad_norm_var": 0.02877197265625, "learning_rate": 3.099385226641493e-05, "loss": 4.5261, "loss/crossentropy": 1.3600659668445587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14355047047138214, "step": 9424 }, { "epoch": 0.7855, "grad_norm": 4.625, "grad_norm_var": 0.0302734375, "learning_rate": 3.097123912794569e-05, "loss": 5.0282, "loss/crossentropy": 2.190640449523926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21116185933351517, "step": 9426 }, { "epoch": 0.7856666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.033919270833333334, "learning_rate": 3.0948608788636875e-05, "loss": 5.4235, "loss/crossentropy": 2.3429831862449646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19852236658334732, "step": 9428 }, { "epoch": 0.7858333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.03362223307291667, "learning_rate": 3.0925961304326634e-05, "loss": 4.8775, "loss/crossentropy": 2.2385424375534058, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20526361465454102, "step": 9430 }, { "epoch": 0.786, "grad_norm": 4.6875, "grad_norm_var": 0.023681640625, "learning_rate": 3.0903296730895354e-05, "loss": 4.6021, "loss/crossentropy": 1.8072471469640732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18658201955258846, "step": 9432 }, { "epoch": 0.7861666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.026025390625, "learning_rate": 3.088061512426563e-05, "loss": 4.7022, "loss/crossentropy": 2.5440629720687866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039373368024826, "step": 9434 }, { "epoch": 0.7863333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.0326171875, "learning_rate": 3.085791654040206e-05, "loss": 5.3461, "loss/crossentropy": 1.874840334057808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19369103014469147, "step": 9436 }, { "epoch": 0.7865, "grad_norm": 5.21875, "grad_norm_var": 0.04568684895833333, "learning_rate": 3.083520103531115e-05, "loss": 4.5034, "loss/crossentropy": 2.5266154408454895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19745132699608803, "step": 9438 }, { "epoch": 0.7866666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.04784749348958333, "learning_rate": 3.0812468665041165e-05, "loss": 5.1865, "loss/crossentropy": 1.9486228823661804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20566469430923462, "step": 9440 }, { "epoch": 0.7868333333333334, "grad_norm": 4.5, "grad_norm_var": 0.051981608072916664, "learning_rate": 3.078971948568195e-05, "loss": 5.0026, "loss/crossentropy": 2.0742533802986145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22526142373681068, "step": 9442 }, { "epoch": 0.787, "grad_norm": 5.125, "grad_norm_var": 0.067578125, "learning_rate": 3.076695355336486e-05, "loss": 4.9311, "loss/crossentropy": 1.456095166504383, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14632639847695827, "step": 9444 }, { "epoch": 0.7871666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.08843994140625, "learning_rate": 3.0744170924262546e-05, "loss": 4.2479, "loss/crossentropy": 1.304304301738739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14765658788383007, "step": 9446 }, { "epoch": 0.7873333333333333, "grad_norm": 4.875, "grad_norm_var": 0.092822265625, "learning_rate": 3.072137165458891e-05, "loss": 4.3201, "loss/crossentropy": 1.4218315780162811, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14105146750807762, "step": 9448 }, { "epoch": 0.7875, "grad_norm": 4.625, "grad_norm_var": 0.087744140625, "learning_rate": 3.069855580059885e-05, "loss": 5.2229, "loss/crossentropy": 1.6259961053729057, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1994849592447281, "step": 9450 }, { "epoch": 0.7876666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.09256184895833333, "learning_rate": 3.067572341858825e-05, "loss": 5.2859, "loss/crossentropy": 2.0877254605293274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24095411598682404, "step": 9452 }, { "epoch": 0.7878333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.07789306640625, "learning_rate": 3.065287456489372e-05, "loss": 4.8259, "loss/crossentropy": 0.8618222922086716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12632916308939457, "step": 9454 }, { "epoch": 0.788, "grad_norm": 4.625, "grad_norm_var": 0.06678059895833334, "learning_rate": 3.063000929589255e-05, "loss": 4.6287, "loss/crossentropy": 2.2364018261432648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24320093169808388, "step": 9456 }, { "epoch": 0.7881666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.07047119140625, "learning_rate": 3.0607127668002506e-05, "loss": 5.0525, "loss/crossentropy": 2.4567251205444336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22456028684973717, "step": 9458 }, { "epoch": 0.7883333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.07724202473958333, "learning_rate": 3.058422973768175e-05, "loss": 5.3273, "loss/crossentropy": 1.9736377596855164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19915513694286346, "step": 9460 }, { "epoch": 0.7885, "grad_norm": 4.40625, "grad_norm_var": 0.06350504557291667, "learning_rate": 3.056131556142861e-05, "loss": 4.8735, "loss/crossentropy": 1.8944967985153198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18105413764715195, "step": 9462 }, { "epoch": 0.7886666666666666, "grad_norm": 4.625, "grad_norm_var": 0.059488932291666664, "learning_rate": 3.0538385195781594e-05, "loss": 4.5573, "loss/crossentropy": 1.660656489431858, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15973762422800064, "step": 9464 }, { "epoch": 0.7888333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.06197916666666667, "learning_rate": 3.051543869731905e-05, "loss": 5.3056, "loss/crossentropy": 2.2200306951999664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009948380291462, "step": 9466 }, { "epoch": 0.789, "grad_norm": 4.875, "grad_norm_var": 0.05468343098958333, "learning_rate": 3.04924761226592e-05, "loss": 4.9293, "loss/crossentropy": 1.854020357131958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18061123974621296, "step": 9468 }, { "epoch": 0.7891666666666667, "grad_norm": 4.09375, "grad_norm_var": 0.09368489583333334, "learning_rate": 3.0469497528459924e-05, "loss": 4.5673, "loss/crossentropy": 1.4358838349580765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1429310292005539, "step": 9470 }, { "epoch": 0.7893333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.09322509765625, "learning_rate": 3.0446502971418607e-05, "loss": 4.8726, "loss/crossentropy": 1.8105507493019104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17184137552976608, "step": 9472 }, { "epoch": 0.7895, "grad_norm": 4.59375, "grad_norm_var": 0.20871988932291666, "learning_rate": 3.0423492508272036e-05, "loss": 4.8678, "loss/crossentropy": 1.4386665895581245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14651955105364323, "step": 9474 }, { "epoch": 0.7896666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.18596598307291667, "learning_rate": 3.0400466195796238e-05, "loss": 5.3026, "loss/crossentropy": 2.505110263824463, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073451578617096, "step": 9476 }, { "epoch": 0.7898333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.18605143229166668, "learning_rate": 3.037742409080636e-05, "loss": 5.1978, "loss/crossentropy": 1.93652855604887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.220738735049963, "step": 9478 }, { "epoch": 0.79, "grad_norm": 5.0625, "grad_norm_var": 0.19530843098958334, "learning_rate": 3.035436625015649e-05, "loss": 4.8821, "loss/crossentropy": 1.0793606489896774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1280514094978571, "step": 9480 }, { "epoch": 0.7901666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.19407552083333332, "learning_rate": 3.0331292730739583e-05, "loss": 4.8433, "loss/crossentropy": 1.9875987321138382, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17475885152816772, "step": 9482 }, { "epoch": 0.7903333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.19924723307291667, "learning_rate": 3.030820358948722e-05, "loss": 4.6097, "loss/crossentropy": 2.2878986299037933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20894601568579674, "step": 9484 }, { "epoch": 0.7905, "grad_norm": 4.4375, "grad_norm_var": 0.16497395833333334, "learning_rate": 3.0285098883369587e-05, "loss": 4.6595, "loss/crossentropy": 1.6879505664110184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16620928049087524, "step": 9486 }, { "epoch": 0.7906666666666666, "grad_norm": 4.625, "grad_norm_var": 0.16573893229166667, "learning_rate": 3.0261978669395246e-05, "loss": 4.6243, "loss/crossentropy": 1.9199838414788246, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18426301330327988, "step": 9488 }, { "epoch": 0.7908333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.042252604166666666, "learning_rate": 3.0238843004611014e-05, "loss": 5.0213, "loss/crossentropy": 2.375735104084015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2256057783961296, "step": 9490 }, { "epoch": 0.791, "grad_norm": 4.84375, "grad_norm_var": 0.059098307291666666, "learning_rate": 3.0215691946101865e-05, "loss": 5.7123, "loss/crossentropy": 2.1991631910204887, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18081208877265453, "step": 9492 }, { "epoch": 0.7911666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.06379801432291667, "learning_rate": 3.0192525550990715e-05, "loss": 4.9129, "loss/crossentropy": 1.7236268892884254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19379206374287605, "step": 9494 }, { "epoch": 0.7913333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.052469889322916664, "learning_rate": 3.0169343876438354e-05, "loss": 4.6993, "loss/crossentropy": 2.3961364030838013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21208176389336586, "step": 9496 }, { "epoch": 0.7915, "grad_norm": 4.28125, "grad_norm_var": 0.06428629557291667, "learning_rate": 3.0146146979643248e-05, "loss": 4.5953, "loss/crossentropy": 1.9669974148273468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18356753140687943, "step": 9498 }, { "epoch": 0.7916666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.07668863932291667, "learning_rate": 3.012293491784144e-05, "loss": 5.4238, "loss/crossentropy": 2.3267141580581665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22954250872135162, "step": 9500 }, { "epoch": 0.7918333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.07040608723958333, "learning_rate": 3.009970774830639e-05, "loss": 4.6428, "loss/crossentropy": 1.8814620971679688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21085096895694733, "step": 9502 }, { "epoch": 0.792, "grad_norm": 4.5, "grad_norm_var": 0.073046875, "learning_rate": 3.0076465528348825e-05, "loss": 4.9866, "loss/crossentropy": 1.8643994852900505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17112858220934868, "step": 9504 }, { "epoch": 0.7921666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.08059488932291667, "learning_rate": 3.0053208315316608e-05, "loss": 4.9273, "loss/crossentropy": 1.931690700352192, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17383356019854546, "step": 9506 }, { "epoch": 0.7923333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.070556640625, "learning_rate": 3.0029936166594606e-05, "loss": 5.0846, "loss/crossentropy": 1.6738494783639908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17133232951164246, "step": 9508 }, { "epoch": 0.7925, "grad_norm": 4.78125, "grad_norm_var": 0.06183268229166667, "learning_rate": 3.0006649139604537e-05, "loss": 4.6146, "loss/crossentropy": 1.1198093742132187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1320702861994505, "step": 9510 }, { "epoch": 0.7926666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.06119384765625, "learning_rate": 2.9983347291804805e-05, "loss": 4.7334, "loss/crossentropy": 2.1339576840400696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1965150125324726, "step": 9512 }, { "epoch": 0.7928333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.04986572265625, "learning_rate": 2.996003068069043e-05, "loss": 4.9968, "loss/crossentropy": 1.8556120991706848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21033604815602303, "step": 9514 }, { "epoch": 0.793, "grad_norm": 4.5625, "grad_norm_var": 0.04166259765625, "learning_rate": 2.9936699363792816e-05, "loss": 5.1152, "loss/crossentropy": 2.0661367923021317, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21299026906490326, "step": 9516 }, { "epoch": 0.7931666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.04459228515625, "learning_rate": 2.991335339867968e-05, "loss": 4.6905, "loss/crossentropy": 1.0509056076407433, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1323660109192133, "step": 9518 }, { "epoch": 0.7933333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.04654541015625, "learning_rate": 2.9889992842954858e-05, "loss": 5.3345, "loss/crossentropy": 2.2113268077373505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20679356902837753, "step": 9520 }, { "epoch": 0.7935, "grad_norm": 4.9375, "grad_norm_var": 0.04537760416666667, "learning_rate": 2.9866617754258197e-05, "loss": 4.7611, "loss/crossentropy": 1.4634685143828392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1467580944299698, "step": 9522 }, { "epoch": 0.7936666666666666, "grad_norm": 5.625, "grad_norm_var": 0.08841145833333333, "learning_rate": 2.984322819026541e-05, "loss": 5.2701, "loss/crossentropy": 2.333310306072235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280527651309967, "step": 9524 }, { "epoch": 0.7938333333333333, "grad_norm": 5.0, "grad_norm_var": 0.09016520182291667, "learning_rate": 2.981982420868792e-05, "loss": 5.1634, "loss/crossentropy": 1.7275255471467972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19213207438588142, "step": 9526 }, { "epoch": 0.794, "grad_norm": 4.78125, "grad_norm_var": 0.09178059895833333, "learning_rate": 2.979640586727274e-05, "loss": 5.0111, "loss/crossentropy": 1.9508731663227081, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20765764266252518, "step": 9528 }, { "epoch": 0.7941666666666667, "grad_norm": 4.625, "grad_norm_var": 0.08651936848958333, "learning_rate": 2.977297322380227e-05, "loss": 4.674, "loss/crossentropy": 2.0042631030082703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18659953027963638, "step": 9530 }, { "epoch": 0.7943333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.08294270833333334, "learning_rate": 2.9749526336094255e-05, "loss": 4.7152, "loss/crossentropy": 1.7674919664859772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17473849654197693, "step": 9532 }, { "epoch": 0.7945, "grad_norm": 4.4375, "grad_norm_var": 0.08474934895833333, "learning_rate": 2.9726065262001545e-05, "loss": 4.6997, "loss/crossentropy": 2.370339721441269, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2148333080112934, "step": 9534 }, { "epoch": 0.7946666666666666, "grad_norm": 5.1875, "grad_norm_var": 0.09361572265625, "learning_rate": 2.970259005941201e-05, "loss": 5.1521, "loss/crossentropy": 1.9504710137844086, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21056831628084183, "step": 9536 }, { "epoch": 0.7948333333333333, "grad_norm": 4.875, "grad_norm_var": 0.08626302083333333, "learning_rate": 2.967910078624839e-05, "loss": 4.9986, "loss/crossentropy": 1.970159761607647, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18404271081089973, "step": 9538 }, { "epoch": 0.795, "grad_norm": 4.78125, "grad_norm_var": 0.040425618489583336, "learning_rate": 2.9655597500468122e-05, "loss": 5.1379, "loss/crossentropy": 1.9936151206493378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17894721776247025, "step": 9540 }, { "epoch": 0.7951666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.04761962890625, "learning_rate": 2.9632080260063224e-05, "loss": 4.9476, "loss/crossentropy": 1.8085922375321388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1799444779753685, "step": 9542 }, { "epoch": 0.7953333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.04332275390625, "learning_rate": 2.9608549123060145e-05, "loss": 5.2912, "loss/crossentropy": 1.8493381887674332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1686496790498495, "step": 9544 }, { "epoch": 0.7955, "grad_norm": 4.625, "grad_norm_var": 0.040087890625, "learning_rate": 2.9585004147519644e-05, "loss": 5.4614, "loss/crossentropy": 2.317984402179718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21581026539206505, "step": 9546 }, { "epoch": 0.7956666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.04117431640625, "learning_rate": 2.95614453915366e-05, "loss": 4.834, "loss/crossentropy": 1.8879902809858322, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18581601977348328, "step": 9548 }, { "epoch": 0.7958333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.03463134765625, "learning_rate": 2.9537872913239892e-05, "loss": 4.4235, "loss/crossentropy": 1.325361706316471, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13260122202336788, "step": 9550 }, { "epoch": 0.796, "grad_norm": 4.6875, "grad_norm_var": 0.024995930989583335, "learning_rate": 2.9514286770792275e-05, "loss": 5.1549, "loss/crossentropy": 1.5360118001699448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1876349337399006, "step": 9552 }, { "epoch": 0.7961666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.026302083333333334, "learning_rate": 2.9490687022390215e-05, "loss": 4.6383, "loss/crossentropy": 1.5563682615756989, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1847225520759821, "step": 9554 }, { "epoch": 0.7963333333333333, "grad_norm": 5.125, "grad_norm_var": 0.03883056640625, "learning_rate": 2.9467073726263736e-05, "loss": 4.6207, "loss/crossentropy": 1.750094898045063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1583553459495306, "step": 9556 }, { "epoch": 0.7965, "grad_norm": 4.75, "grad_norm_var": 0.030367024739583335, "learning_rate": 2.9443446940676305e-05, "loss": 4.4711, "loss/crossentropy": 2.0352243930101395, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18048042058944702, "step": 9558 }, { "epoch": 0.7966666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.028515625, "learning_rate": 2.9419806723924673e-05, "loss": 4.8025, "loss/crossentropy": 2.5280230045318604, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2519477494060993, "step": 9560 }, { "epoch": 0.7968333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.042041015625, "learning_rate": 2.93961531343387e-05, "loss": 4.5649, "loss/crossentropy": 2.3008410036563873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2132852002978325, "step": 9562 }, { "epoch": 0.797, "grad_norm": 4.71875, "grad_norm_var": 0.14338785807291668, "learning_rate": 2.937248623028129e-05, "loss": 4.7427, "loss/crossentropy": 2.0998832881450653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2082512266933918, "step": 9564 }, { "epoch": 0.7971666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.14140625, "learning_rate": 2.9348806070148178e-05, "loss": 5.0877, "loss/crossentropy": 2.1150071918964386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22998831048607826, "step": 9566 }, { "epoch": 0.7973333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.1619140625, "learning_rate": 2.9325112712367788e-05, "loss": 4.4993, "loss/crossentropy": 0.7897375747561455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13665344007313251, "step": 9568 }, { "epoch": 0.7975, "grad_norm": 4.53125, "grad_norm_var": 0.15969645182291667, "learning_rate": 2.9301406215401136e-05, "loss": 5.0347, "loss/crossentropy": 2.2986485958099365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23015353456139565, "step": 9570 }, { "epoch": 0.7976666666666666, "grad_norm": 5.03125, "grad_norm_var": 0.15565999348958334, "learning_rate": 2.927768663774165e-05, "loss": 5.3172, "loss/crossentropy": 1.9723598957061768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20267291367053986, "step": 9572 }, { "epoch": 0.7978333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.16689046223958334, "learning_rate": 2.9253954037915016e-05, "loss": 4.6553, "loss/crossentropy": 2.072106420993805, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16975676827132702, "step": 9574 }, { "epoch": 0.798, "grad_norm": 4.4375, "grad_norm_var": 0.17294514973958333, "learning_rate": 2.9230208474479077e-05, "loss": 4.7887, "loss/crossentropy": 2.2334997951984406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19652622565627098, "step": 9576 }, { "epoch": 0.7981666666666667, "grad_norm": 5.03125, "grad_norm_var": 0.16925455729166666, "learning_rate": 2.920645000602366e-05, "loss": 5.0195, "loss/crossentropy": 1.7784138470888138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1887425109744072, "step": 9578 }, { "epoch": 0.7983333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.0705078125, "learning_rate": 2.9182678691170392e-05, "loss": 4.4802, "loss/crossentropy": 1.1156157106161118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.121920857578516, "step": 9580 }, { "epoch": 0.7985, "grad_norm": 5.09375, "grad_norm_var": 0.07708333333333334, "learning_rate": 2.915889458857266e-05, "loss": 4.9223, "loss/crossentropy": 1.6830340400338173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1527923159301281, "step": 9582 }, { "epoch": 0.7986666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.07379150390625, "learning_rate": 2.9135097756915357e-05, "loss": 5.501, "loss/crossentropy": 2.1039809063076973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1852792724967003, "step": 9584 }, { "epoch": 0.7988333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.07128499348958334, "learning_rate": 2.9111288254914803e-05, "loss": 4.6736, "loss/crossentropy": 2.2788360714912415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24036888033151627, "step": 9586 }, { "epoch": 0.799, "grad_norm": 4.9375, "grad_norm_var": 0.06496988932291667, "learning_rate": 2.9087466141318573e-05, "loss": 4.8876, "loss/crossentropy": 2.326200306415558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2151721641421318, "step": 9588 }, { "epoch": 0.7991666666666667, "grad_norm": 4.75, "grad_norm_var": 0.052079264322916666, "learning_rate": 2.9063631474905382e-05, "loss": 4.9736, "loss/crossentropy": 2.0066977441310883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977986916899681, "step": 9590 }, { "epoch": 0.7993333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.046675618489583334, "learning_rate": 2.9039784314484884e-05, "loss": 5.3869, "loss/crossentropy": 2.163942277431488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1966785341501236, "step": 9592 }, { "epoch": 0.7995, "grad_norm": 4.46875, "grad_norm_var": 0.046468098958333336, "learning_rate": 2.9015924718897577e-05, "loss": 5.0935, "loss/crossentropy": 2.0210544764995575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1696550790220499, "step": 9594 }, { "epoch": 0.7996666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.045166015625, "learning_rate": 2.8992052747014648e-05, "loss": 4.3569, "loss/crossentropy": 0.8158632516860962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10923364758491516, "step": 9596 }, { "epoch": 0.7998333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.034228515625, "learning_rate": 2.8968168457737805e-05, "loss": 4.3597, "loss/crossentropy": 2.0650247782468796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20714912563562393, "step": 9598 }, { "epoch": 0.8, "grad_norm": 4.4375, "grad_norm_var": 0.027278645833333334, "learning_rate": 2.894427190999916e-05, "loss": 4.945, "loss/crossentropy": 2.01848566532135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1932220160961151, "step": 9600 }, { "epoch": 0.8001666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.034505208333333336, "learning_rate": 2.8920363162761078e-05, "loss": 4.6152, "loss/crossentropy": 2.331765651702881, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20866511762142181, "step": 9602 }, { "epoch": 0.8003333333333333, "grad_norm": 5.21875, "grad_norm_var": 0.04485270182291667, "learning_rate": 2.8896442275016014e-05, "loss": 4.9153, "loss/crossentropy": 2.022328555583954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19683456048369408, "step": 9604 }, { "epoch": 0.8005, "grad_norm": 4.46875, "grad_norm_var": 0.04869791666666667, "learning_rate": 2.8872509305786375e-05, "loss": 5.2109, "loss/crossentropy": 2.3472258746623993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.208266019821167, "step": 9606 }, { "epoch": 0.8006666666666666, "grad_norm": 4.375, "grad_norm_var": 0.05182291666666667, "learning_rate": 2.8848564314124386e-05, "loss": 4.0321, "loss/crossentropy": 1.353347197175026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18437976483255625, "step": 9608 }, { "epoch": 0.8008333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.047135416666666666, "learning_rate": 2.8824607359111935e-05, "loss": 4.8684, "loss/crossentropy": 1.3156828135252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15243668109178543, "step": 9610 }, { "epoch": 0.801, "grad_norm": 4.75, "grad_norm_var": 0.04537353515625, "learning_rate": 2.8800638499860425e-05, "loss": 4.6918, "loss/crossentropy": 1.940132163465023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18036840856075287, "step": 9612 }, { "epoch": 0.8011666666666667, "grad_norm": 4.75, "grad_norm_var": 0.04763997395833333, "learning_rate": 2.8776657795510634e-05, "loss": 5.0228, "loss/crossentropy": 1.8538916110992432, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1759151928126812, "step": 9614 }, { "epoch": 0.8013333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.04478759765625, "learning_rate": 2.8752665305232565e-05, "loss": 4.695, "loss/crossentropy": 1.3918126970529556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13803145475685596, "step": 9616 }, { "epoch": 0.8015, "grad_norm": 4.59375, "grad_norm_var": 0.03713785807291667, "learning_rate": 2.87286610882253e-05, "loss": 4.7662, "loss/crossentropy": 2.2778570353984833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2131534069776535, "step": 9618 }, { "epoch": 0.8016666666666666, "grad_norm": 5.9375, "grad_norm_var": 0.11991780598958333, "learning_rate": 2.8704645203716864e-05, "loss": 5.0824, "loss/crossentropy": 2.300149440765381, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2163809835910797, "step": 9620 }, { "epoch": 0.8018333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.11591389973958334, "learning_rate": 2.8680617710964064e-05, "loss": 4.2839, "loss/crossentropy": 2.0871397852897644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19335642829537392, "step": 9622 }, { "epoch": 0.802, "grad_norm": 4.84375, "grad_norm_var": 0.10818684895833333, "learning_rate": 2.8656578669252355e-05, "loss": 5.2061, "loss/crossentropy": 2.6689072847366333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22146183252334595, "step": 9624 }, { "epoch": 0.8021666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.10741780598958334, "learning_rate": 2.8632528137895677e-05, "loss": 5.1304, "loss/crossentropy": 1.9833775535225868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18096048012375832, "step": 9626 }, { "epoch": 0.8023333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.10771077473958333, "learning_rate": 2.860846617623631e-05, "loss": 4.7472, "loss/crossentropy": 2.048772692680359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1738439705222845, "step": 9628 }, { "epoch": 0.8025, "grad_norm": 4.59375, "grad_norm_var": 0.11308186848958333, "learning_rate": 2.8584392843644777e-05, "loss": 4.5055, "loss/crossentropy": 1.6863243579864502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1545063853263855, "step": 9630 }, { "epoch": 0.8026666666666666, "grad_norm": 4.375, "grad_norm_var": 0.12261962890625, "learning_rate": 2.856030819951962e-05, "loss": 5.2981, "loss/crossentropy": 2.2145843654870987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.180853221565485, "step": 9632 }, { "epoch": 0.8028333333333333, "grad_norm": 5.0, "grad_norm_var": 0.15, "learning_rate": 2.853621230328732e-05, "loss": 4.8301, "loss/crossentropy": 2.0582179874181747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1821475587785244, "step": 9634 }, { "epoch": 0.803, "grad_norm": 4.5, "grad_norm_var": 0.068603515625, "learning_rate": 2.851210521440208e-05, "loss": 4.9309, "loss/crossentropy": 2.4601719677448273, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22456849366426468, "step": 9636 }, { "epoch": 0.8031666666666667, "grad_norm": 4.75, "grad_norm_var": 0.06881103515625, "learning_rate": 2.8487986992345756e-05, "loss": 5.1272, "loss/crossentropy": 2.1401634514331818, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21643871068954468, "step": 9638 }, { "epoch": 0.8033333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.07799072265625, "learning_rate": 2.846385769662767e-05, "loss": 4.8438, "loss/crossentropy": 1.3249876573681831, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18818671256303787, "step": 9640 }, { "epoch": 0.8035, "grad_norm": 4.9375, "grad_norm_var": 0.079931640625, "learning_rate": 2.8439717386784464e-05, "loss": 4.5864, "loss/crossentropy": 2.387733817100525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22228409722447395, "step": 9642 }, { "epoch": 0.8036666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.08235270182291667, "learning_rate": 2.8415566122379937e-05, "loss": 4.8169, "loss/crossentropy": 1.6120276674628258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1609376147389412, "step": 9644 }, { "epoch": 0.8038333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.08834228515625, "learning_rate": 2.8391403963004943e-05, "loss": 4.539, "loss/crossentropy": 2.0225760638713837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24541480466723442, "step": 9646 }, { "epoch": 0.804, "grad_norm": 4.71875, "grad_norm_var": 0.07415364583333334, "learning_rate": 2.8367230968277213e-05, "loss": 4.6075, "loss/crossentropy": 2.2183853089809418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2087656781077385, "step": 9648 }, { "epoch": 0.8041666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.05618082682291667, "learning_rate": 2.8343047197841192e-05, "loss": 4.7195, "loss/crossentropy": 2.461825728416443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21937000378966331, "step": 9650 }, { "epoch": 0.8043333333333333, "grad_norm": 4.875, "grad_norm_var": 0.05480143229166667, "learning_rate": 2.831885271136795e-05, "loss": 4.9463, "loss/crossentropy": 2.300680994987488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21464381739497185, "step": 9652 }, { "epoch": 0.8045, "grad_norm": 5.0, "grad_norm_var": 0.07053629557291667, "learning_rate": 2.8294647568554956e-05, "loss": 4.4777, "loss/crossentropy": 1.5217168852686882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1655261069536209, "step": 9654 }, { "epoch": 0.8046666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.061197916666666664, "learning_rate": 2.8270431829126015e-05, "loss": 4.5535, "loss/crossentropy": 1.9004539996385574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1946029346436262, "step": 9656 }, { "epoch": 0.8048333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.05618489583333333, "learning_rate": 2.8246205552831047e-05, "loss": 4.8719, "loss/crossentropy": 2.249520570039749, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19344881922006607, "step": 9658 }, { "epoch": 0.805, "grad_norm": 4.8125, "grad_norm_var": 0.05618489583333333, "learning_rate": 2.8221968799445973e-05, "loss": 4.6141, "loss/crossentropy": 1.7338961511850357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16334066353738308, "step": 9660 }, { "epoch": 0.8051666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.05126546223958333, "learning_rate": 2.819772162877258e-05, "loss": 5.2458, "loss/crossentropy": 2.602845251560211, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20655466988682747, "step": 9662 }, { "epoch": 0.8053333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.058837890625, "learning_rate": 2.817346410063835e-05, "loss": 5.3631, "loss/crossentropy": 1.9564568027853966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18752934224903584, "step": 9664 }, { "epoch": 0.8055, "grad_norm": 4.53125, "grad_norm_var": 0.054541015625, "learning_rate": 2.8149196274896334e-05, "loss": 4.7131, "loss/crossentropy": 2.6057686805725098, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19163187593221664, "step": 9666 }, { "epoch": 0.8056666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.07291666666666667, "learning_rate": 2.812491821142496e-05, "loss": 4.4306, "loss/crossentropy": 2.0900171995162964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22258975729346275, "step": 9668 }, { "epoch": 0.8058333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.07522379557291667, "learning_rate": 2.8100629970127955e-05, "loss": 5.1028, "loss/crossentropy": 1.5734562277793884, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16964438371360302, "step": 9670 }, { "epoch": 0.806, "grad_norm": 4.8125, "grad_norm_var": 0.06819254557291667, "learning_rate": 2.8076331610934117e-05, "loss": 4.7141, "loss/crossentropy": 0.9716506451368332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1379980333149433, "step": 9672 }, { "epoch": 0.8061666666666667, "grad_norm": 4.75, "grad_norm_var": 0.06724853515625, "learning_rate": 2.805202319379725e-05, "loss": 5.4769, "loss/crossentropy": 2.2032998502254486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21792401000857353, "step": 9674 }, { "epoch": 0.8063333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.06024983723958333, "learning_rate": 2.8027704778695962e-05, "loss": 4.949, "loss/crossentropy": 2.4872482419013977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23049870505928993, "step": 9676 }, { "epoch": 0.8065, "grad_norm": 4.90625, "grad_norm_var": 0.062093098958333336, "learning_rate": 2.80033764256335e-05, "loss": 5.4824, "loss/crossentropy": 2.6353384852409363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21061846613883972, "step": 9678 }, { "epoch": 0.8066666666666666, "grad_norm": 5.125, "grad_norm_var": 0.06451416015625, "learning_rate": 2.7979038194637683e-05, "loss": 5.5415, "loss/crossentropy": 2.14662966132164, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17899227887392044, "step": 9680 }, { "epoch": 0.8068333333333333, "grad_norm": 4.21875, "grad_norm_var": 0.084228515625, "learning_rate": 2.7954690145760656e-05, "loss": 4.889, "loss/crossentropy": 2.0768280178308487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17490868642926216, "step": 9682 }, { "epoch": 0.807, "grad_norm": 4.75, "grad_norm_var": 0.060139973958333336, "learning_rate": 2.793033233907883e-05, "loss": 4.6971, "loss/crossentropy": 2.2648986876010895, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21894178539514542, "step": 9684 }, { "epoch": 0.8071666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.04556884765625, "learning_rate": 2.7905964834692648e-05, "loss": 4.091, "loss/crossentropy": 1.7198558524250984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17033100128173828, "step": 9686 }, { "epoch": 0.8073333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.044905598958333334, "learning_rate": 2.788158769272652e-05, "loss": 4.7955, "loss/crossentropy": 1.8393462970852852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1787256933748722, "step": 9688 }, { "epoch": 0.8075, "grad_norm": 4.9375, "grad_norm_var": 0.05115559895833333, "learning_rate": 2.7857200973328624e-05, "loss": 4.9282, "loss/crossentropy": 1.9202441275119781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1960473172366619, "step": 9690 }, { "epoch": 0.8076666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.05513916015625, "learning_rate": 2.7832804736670754e-05, "loss": 4.404, "loss/crossentropy": 2.503723382949829, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21644175052642822, "step": 9692 }, { "epoch": 0.8078333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.049540201822916664, "learning_rate": 2.78083990429482e-05, "loss": 5.0705, "loss/crossentropy": 2.5052966475486755, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2255629561841488, "step": 9694 }, { "epoch": 0.808, "grad_norm": 4.71875, "grad_norm_var": 0.03518473307291667, "learning_rate": 2.77839839523796e-05, "loss": 5.1541, "loss/crossentropy": 2.4121972918510437, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21121644973754883, "step": 9696 }, { "epoch": 0.8081666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.028999837239583333, "learning_rate": 2.775955952520675e-05, "loss": 5.3142, "loss/crossentropy": 2.036761313676834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21132348477840424, "step": 9698 }, { "epoch": 0.8083333333333333, "grad_norm": 5.125, "grad_norm_var": 0.037886555989583334, "learning_rate": 2.7735125821694492e-05, "loss": 5.3845, "loss/crossentropy": 2.2135225534439087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23048130422830582, "step": 9700 }, { "epoch": 0.8085, "grad_norm": 4.96875, "grad_norm_var": 0.04491780598958333, "learning_rate": 2.771068290213057e-05, "loss": 5.3062, "loss/crossentropy": 2.3597964346408844, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20864511281251907, "step": 9702 }, { "epoch": 0.8086666666666666, "grad_norm": 4.25, "grad_norm_var": 0.06451416015625, "learning_rate": 2.7686230826825453e-05, "loss": 4.4362, "loss/crossentropy": 2.1759497225284576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25393587350845337, "step": 9704 }, { "epoch": 0.8088333333333333, "grad_norm": 5.15625, "grad_norm_var": 0.0767578125, "learning_rate": 2.766176965611221e-05, "loss": 4.7853, "loss/crossentropy": 1.5988976433873177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22598283365368843, "step": 9706 }, { "epoch": 0.809, "grad_norm": 4.875, "grad_norm_var": 0.07576497395833333, "learning_rate": 2.7637299450346345e-05, "loss": 5.5758, "loss/crossentropy": 2.635635018348694, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24151397868990898, "step": 9708 }, { "epoch": 0.8091666666666667, "grad_norm": 4.25, "grad_norm_var": 0.08878580729166667, "learning_rate": 2.7612820269905665e-05, "loss": 4.2942, "loss/crossentropy": 1.248511366546154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17816242016851902, "step": 9710 }, { "epoch": 0.8093333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.09659830729166667, "learning_rate": 2.7588332175190102e-05, "loss": 5.3882, "loss/crossentropy": 2.1789558827877045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20762834697961807, "step": 9712 }, { "epoch": 0.8095, "grad_norm": 4.625, "grad_norm_var": 0.08642171223958334, "learning_rate": 2.7563835226621606e-05, "loss": 4.8216, "loss/crossentropy": 2.2960515320301056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117706909775734, "step": 9714 }, { "epoch": 0.8096666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.07844645182291667, "learning_rate": 2.753932948464396e-05, "loss": 4.8336, "loss/crossentropy": 2.016137406229973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17403754591941833, "step": 9716 }, { "epoch": 0.8098333333333333, "grad_norm": 4.875, "grad_norm_var": 0.078515625, "learning_rate": 2.751481500972264e-05, "loss": 4.9171, "loss/crossentropy": 1.544787235558033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772163286805153, "step": 9718 }, { "epoch": 0.81, "grad_norm": 4.53125, "grad_norm_var": 0.06425374348958333, "learning_rate": 2.7490291862344686e-05, "loss": 5.0292, "loss/crossentropy": 1.6385806947946548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17973128333687782, "step": 9720 }, { "epoch": 0.8101666666666667, "grad_norm": 5.75, "grad_norm_var": 0.11888020833333333, "learning_rate": 2.7465760103018516e-05, "loss": 4.8005, "loss/crossentropy": 2.2076993584632874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21309059485793114, "step": 9722 }, { "epoch": 0.8103333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.125390625, "learning_rate": 2.744121979227382e-05, "loss": 4.6816, "loss/crossentropy": 1.827271208167076, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17865308560431004, "step": 9724 }, { "epoch": 0.8105, "grad_norm": 4.3125, "grad_norm_var": 0.1271484375, "learning_rate": 2.7416670990661365e-05, "loss": 4.8622, "loss/crossentropy": 2.055354692041874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18429454416036606, "step": 9726 }, { "epoch": 0.8106666666666666, "grad_norm": 5.125, "grad_norm_var": 1.784228515625, "learning_rate": 2.739211375875288e-05, "loss": 5.4435, "loss/crossentropy": 2.217236667871475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2077849544584751, "step": 9728 }, { "epoch": 0.8108333333333333, "grad_norm": 5.21875, "grad_norm_var": 1.763134765625, "learning_rate": 2.7367548157140888e-05, "loss": 4.8811, "loss/crossentropy": 2.5398449301719666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2191455401480198, "step": 9730 }, { "epoch": 0.811, "grad_norm": 4.28125, "grad_norm_var": 1.8050130208333333, "learning_rate": 2.7342974246438586e-05, "loss": 4.1932, "loss/crossentropy": 1.9677127003669739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19635247439146042, "step": 9732 }, { "epoch": 0.8111666666666667, "grad_norm": 4.875, "grad_norm_var": 1.7762654622395833, "learning_rate": 2.7318392087279648e-05, "loss": 4.9551, "loss/crossentropy": 2.4844985604286194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22329407185316086, "step": 9734 }, { "epoch": 0.8113333333333334, "grad_norm": 4.375, "grad_norm_var": 1.811181640625, "learning_rate": 2.7293801740318104e-05, "loss": 4.2019, "loss/crossentropy": 0.9313739463686943, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12556781060993671, "step": 9736 }, { "epoch": 0.8115, "grad_norm": 4.5625, "grad_norm_var": 1.80064697265625, "learning_rate": 2.7269203266228196e-05, "loss": 4.8225, "loss/crossentropy": 2.24322646856308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20698126405477524, "step": 9738 }, { "epoch": 0.8116666666666666, "grad_norm": 4.125, "grad_norm_var": 1.8283162434895834, "learning_rate": 2.7244596725704204e-05, "loss": 4.3012, "loss/crossentropy": 1.607184648513794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15676794946193695, "step": 9740 }, { "epoch": 0.8118333333333333, "grad_norm": 4.59375, "grad_norm_var": 1.79058837890625, "learning_rate": 2.7219982179460333e-05, "loss": 4.9994, "loss/crossentropy": 1.8960078209638596, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17588616535067558, "step": 9742 }, { "epoch": 0.812, "grad_norm": 5.0625, "grad_norm_var": 0.08196614583333334, "learning_rate": 2.7195359688230514e-05, "loss": 4.9943, "loss/crossentropy": 2.059798449277878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20114361122250557, "step": 9744 }, { "epoch": 0.8121666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.07496337890625, "learning_rate": 2.717072931276832e-05, "loss": 5.2328, "loss/crossentropy": 2.3908294439315796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2179257869720459, "step": 9746 }, { "epoch": 0.8123333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.06873372395833334, "learning_rate": 2.7146091113846723e-05, "loss": 4.8158, "loss/crossentropy": 2.3867982923984528, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20485028997063637, "step": 9748 }, { "epoch": 0.8125, "grad_norm": 4.375, "grad_norm_var": 0.0720703125, "learning_rate": 2.7121445152258056e-05, "loss": 4.4072, "loss/crossentropy": 1.6835525631904602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20599596947431564, "step": 9750 }, { "epoch": 0.8126666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.07486979166666667, "learning_rate": 2.7096791488813772e-05, "loss": 5.2649, "loss/crossentropy": 2.514313280582428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2239019237458706, "step": 9752 }, { "epoch": 0.8128333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.07277018229166667, "learning_rate": 2.7072130184344324e-05, "loss": 4.9717, "loss/crossentropy": 2.6661786437034607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23028026148676872, "step": 9754 }, { "epoch": 0.813, "grad_norm": 4.84375, "grad_norm_var": 0.04407552083333333, "learning_rate": 2.7047461299699045e-05, "loss": 5.3989, "loss/crossentropy": 2.0026678144931793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17631691321730614, "step": 9756 }, { "epoch": 0.8131666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.04778645833333333, "learning_rate": 2.7022784895745942e-05, "loss": 5.3907, "loss/crossentropy": 1.366507887840271, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16259120404720306, "step": 9758 }, { "epoch": 0.8133333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.04959309895833333, "learning_rate": 2.6998101033371598e-05, "loss": 4.5622, "loss/crossentropy": 1.3968349806964397, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14244722109287977, "step": 9760 }, { "epoch": 0.8135, "grad_norm": 4.4375, "grad_norm_var": 0.06288655598958333, "learning_rate": 2.6973409773480983e-05, "loss": 4.5042, "loss/crossentropy": 1.9490249156951904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17202415689826012, "step": 9762 }, { "epoch": 0.8136666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.063525390625, "learning_rate": 2.6948711176997338e-05, "loss": 4.5769, "loss/crossentropy": 2.4208853244781494, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22516579553484917, "step": 9764 }, { "epoch": 0.8138333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.05347900390625, "learning_rate": 2.6924005304861976e-05, "loss": 4.3336, "loss/crossentropy": 1.5947562903165817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1803539153188467, "step": 9766 }, { "epoch": 0.814, "grad_norm": 5.15625, "grad_norm_var": 0.0922515869140625, "learning_rate": 2.6899292218034202e-05, "loss": 4.4487, "loss/crossentropy": 1.780159056186676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1578317228704691, "step": 9768 }, { "epoch": 0.8141666666666667, "grad_norm": 4.5, "grad_norm_var": 0.09517313639322916, "learning_rate": 2.6874571977491087e-05, "loss": 4.5388, "loss/crossentropy": 1.6249744519591331, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15210830606520176, "step": 9770 }, { "epoch": 0.8143333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.0975738525390625, "learning_rate": 2.684984464422736e-05, "loss": 4.7648, "loss/crossentropy": 1.5476508736610413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1978946216404438, "step": 9772 }, { "epoch": 0.8145, "grad_norm": 4.59375, "grad_norm_var": 0.08953348795572917, "learning_rate": 2.6825110279255286e-05, "loss": 5.0773, "loss/crossentropy": 1.603047177195549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18620534241199493, "step": 9774 }, { "epoch": 0.8146666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.08882548014322916, "learning_rate": 2.680036894360442e-05, "loss": 4.8267, "loss/crossentropy": 2.0404116213321686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23134134337306023, "step": 9776 }, { "epoch": 0.8148333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.0740142822265625, "learning_rate": 2.6775620698321568e-05, "loss": 5.0537, "loss/crossentropy": 0.9913991242647171, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11973785981535912, "step": 9778 }, { "epoch": 0.815, "grad_norm": 4.5625, "grad_norm_var": 0.1052642822265625, "learning_rate": 2.6750865604470554e-05, "loss": 4.9444, "loss/crossentropy": 1.849538080394268, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18608948774635792, "step": 9780 }, { "epoch": 0.8151666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.11380106608072917, "learning_rate": 2.6726103723132122e-05, "loss": 4.6016, "loss/crossentropy": 1.9843580573797226, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17255659773945808, "step": 9782 }, { "epoch": 0.8153333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.07148030598958334, "learning_rate": 2.6701335115403747e-05, "loss": 4.8852, "loss/crossentropy": 1.60471910238266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17869190871715546, "step": 9784 }, { "epoch": 0.8155, "grad_norm": 4.4375, "grad_norm_var": 0.08479410807291667, "learning_rate": 2.66765598423995e-05, "loss": 4.1572, "loss/crossentropy": 2.1039493903517723, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1869774367660284, "step": 9786 }, { "epoch": 0.8156666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.08381754557291667, "learning_rate": 2.665177796524992e-05, "loss": 5.0702, "loss/crossentropy": 2.3638014793395996, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21606038138270378, "step": 9788 }, { "epoch": 0.8158333333333333, "grad_norm": 4.75, "grad_norm_var": 0.08101806640625, "learning_rate": 2.662698954510181e-05, "loss": 5.1611, "loss/crossentropy": 2.450950562953949, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21199432387948036, "step": 9790 }, { "epoch": 0.816, "grad_norm": 4.46875, "grad_norm_var": 0.08368733723958334, "learning_rate": 2.6602194643118142e-05, "loss": 4.5387, "loss/crossentropy": 2.1611936390399933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19382005743682384, "step": 9792 }, { "epoch": 0.8161666666666667, "grad_norm": 5.25, "grad_norm_var": 0.10491129557291666, "learning_rate": 2.6577393320477868e-05, "loss": 5.4112, "loss/crossentropy": 2.7818479537963867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20983505249023438, "step": 9794 }, { "epoch": 0.8163333333333334, "grad_norm": 4.5, "grad_norm_var": 0.06769205729166666, "learning_rate": 2.6552585638375786e-05, "loss": 5.1181, "loss/crossentropy": 2.173560857772827, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22058776766061783, "step": 9796 }, { "epoch": 0.8165, "grad_norm": 4.59375, "grad_norm_var": 0.05987955729166667, "learning_rate": 2.6527771658022384e-05, "loss": 4.9415, "loss/crossentropy": 1.7440677136182785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19470586255192757, "step": 9798 }, { "epoch": 0.8166666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.05279947916666667, "learning_rate": 2.65029514406437e-05, "loss": 4.4766, "loss/crossentropy": 2.262479215860367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19687332212924957, "step": 9800 }, { "epoch": 0.8168333333333333, "grad_norm": 4.625, "grad_norm_var": 0.06916910807291667, "learning_rate": 2.6478125047481138e-05, "loss": 5.0711, "loss/crossentropy": 2.4867063760757446, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21951580047607422, "step": 9802 }, { "epoch": 0.817, "grad_norm": 4.78125, "grad_norm_var": 0.058919270833333336, "learning_rate": 2.6453292539791374e-05, "loss": 4.5929, "loss/crossentropy": 2.3370174169540405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19681474193930626, "step": 9804 }, { "epoch": 0.8171666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.07511393229166667, "learning_rate": 2.642845397884614e-05, "loss": 5.0763, "loss/crossentropy": 1.8368133306503296, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2001226581633091, "step": 9806 }, { "epoch": 0.8173333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.07662760416666667, "learning_rate": 2.640360942593212e-05, "loss": 4.9234, "loss/crossentropy": 1.2672517523169518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1805922258645296, "step": 9808 }, { "epoch": 0.8175, "grad_norm": 4.65625, "grad_norm_var": 0.06122639973958333, "learning_rate": 2.6378758942350775e-05, "loss": 4.7927, "loss/crossentropy": 1.7979520708322525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17544420808553696, "step": 9810 }, { "epoch": 0.8176666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.05725504557291667, "learning_rate": 2.6353902589418204e-05, "loss": 4.6291, "loss/crossentropy": 1.7286670580506325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17648831382393837, "step": 9812 }, { "epoch": 0.8178333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.05862223307291667, "learning_rate": 2.632904042846499e-05, "loss": 4.9676, "loss/crossentropy": 2.0067897960543633, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1861281618475914, "step": 9814 }, { "epoch": 0.818, "grad_norm": 4.4375, "grad_norm_var": 0.064697265625, "learning_rate": 2.6304172520836034e-05, "loss": 5.1672, "loss/crossentropy": 2.495649516582489, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20421000942587852, "step": 9816 }, { "epoch": 0.8181666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.041796875, "learning_rate": 2.6279298927890447e-05, "loss": 5.1169, "loss/crossentropy": 2.244947165250778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18983267061412334, "step": 9818 }, { "epoch": 0.8183333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.04247639973958333, "learning_rate": 2.6254419711001325e-05, "loss": 5.1038, "loss/crossentropy": 2.5352413654327393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20742696896195412, "step": 9820 }, { "epoch": 0.8185, "grad_norm": 5.03125, "grad_norm_var": 0.03839518229166667, "learning_rate": 2.6229534931555675e-05, "loss": 4.776, "loss/crossentropy": 2.2929417490959167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21830854937434196, "step": 9822 }, { "epoch": 0.8186666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.04303385416666667, "learning_rate": 2.6204644650954212e-05, "loss": 4.0816, "loss/crossentropy": 1.6211243867874146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18405551463365555, "step": 9824 }, { "epoch": 0.8188333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.0560546875, "learning_rate": 2.6179748930611227e-05, "loss": 4.984, "loss/crossentropy": 2.5118577778339386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24132562428712845, "step": 9826 }, { "epoch": 0.819, "grad_norm": 4.84375, "grad_norm_var": 0.07498372395833333, "learning_rate": 2.615484783195444e-05, "loss": 4.371, "loss/crossentropy": 1.8359337151050568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16906186938285828, "step": 9828 }, { "epoch": 0.8191666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.08030192057291667, "learning_rate": 2.6129941416424844e-05, "loss": 4.7897, "loss/crossentropy": 2.5312774777412415, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21105752140283585, "step": 9830 }, { "epoch": 0.8193333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.08176676432291667, "learning_rate": 2.6105029745476524e-05, "loss": 4.8841, "loss/crossentropy": 1.6842858046293259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15886010602116585, "step": 9832 }, { "epoch": 0.8195, "grad_norm": 4.8125, "grad_norm_var": 0.08046875, "learning_rate": 2.6080112880576564e-05, "loss": 5.2355, "loss/crossentropy": 2.119531899690628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1937863491475582, "step": 9834 }, { "epoch": 0.8196666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.08267822265625, "learning_rate": 2.605519088320485e-05, "loss": 5.2572, "loss/crossentropy": 2.3377262353897095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22751981765031815, "step": 9836 }, { "epoch": 0.8198333333333333, "grad_norm": 5.25, "grad_norm_var": 0.097119140625, "learning_rate": 2.6030263814853928e-05, "loss": 4.5545, "loss/crossentropy": 2.4934067130088806, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20816605538129807, "step": 9838 }, { "epoch": 0.82, "grad_norm": 4.625, "grad_norm_var": 0.0767578125, "learning_rate": 2.6005331737028875e-05, "loss": 4.9717, "loss/crossentropy": 2.5231724977493286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19992580264806747, "step": 9840 }, { "epoch": 0.8201666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.06790364583333333, "learning_rate": 2.598039471124709e-05, "loss": 4.7991, "loss/crossentropy": 1.669595293700695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16963953524827957, "step": 9842 }, { "epoch": 0.8203333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.049605305989583334, "learning_rate": 2.5955452799038235e-05, "loss": 5.3925, "loss/crossentropy": 2.155703604221344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19724318757653236, "step": 9844 }, { "epoch": 0.8205, "grad_norm": 4.375, "grad_norm_var": 0.04657796223958333, "learning_rate": 2.593050606194398e-05, "loss": 4.3041, "loss/crossentropy": 2.2015575766563416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2218138948082924, "step": 9846 }, { "epoch": 0.8206666666666667, "grad_norm": 4.75, "grad_norm_var": 0.059488932291666664, "learning_rate": 2.5905554561517923e-05, "loss": 5.0173, "loss/crossentropy": 1.5508754402399063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1607704758644104, "step": 9848 }, { "epoch": 0.8208333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.07763264973958334, "learning_rate": 2.5880598359325405e-05, "loss": 4.7229, "loss/crossentropy": 1.8121439665555954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22899498790502548, "step": 9850 }, { "epoch": 0.821, "grad_norm": 4.6875, "grad_norm_var": 0.07708333333333334, "learning_rate": 2.5855637516943386e-05, "loss": 4.6838, "loss/crossentropy": 1.7467404007911682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17679844796657562, "step": 9852 }, { "epoch": 0.8211666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.06638997395833333, "learning_rate": 2.5830672095960258e-05, "loss": 4.9116, "loss/crossentropy": 2.3431586623191833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21176928654313087, "step": 9854 }, { "epoch": 0.8213333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.06470947265625, "learning_rate": 2.580570215797571e-05, "loss": 4.7261, "loss/crossentropy": 1.483575701713562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21149089373648167, "step": 9856 }, { "epoch": 0.8215, "grad_norm": 4.59375, "grad_norm_var": 0.06311442057291666, "learning_rate": 2.5780727764600588e-05, "loss": 4.9978, "loss/crossentropy": 1.9482173770666122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.173264279961586, "step": 9858 }, { "epoch": 0.8216666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.07541910807291667, "learning_rate": 2.5755748977456722e-05, "loss": 4.2775, "loss/crossentropy": 1.2856212258338928, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16561558470129967, "step": 9860 }, { "epoch": 0.8218333333333333, "grad_norm": 4.625, "grad_norm_var": 0.08508707682291666, "learning_rate": 2.57307658581768e-05, "loss": 4.6755, "loss/crossentropy": 1.4765914678573608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14335180632770061, "step": 9862 }, { "epoch": 0.822, "grad_norm": 4.8125, "grad_norm_var": 0.06695556640625, "learning_rate": 2.5705778468404158e-05, "loss": 5.0093, "loss/crossentropy": 2.4915146827697754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20742217451334, "step": 9864 }, { "epoch": 0.8221666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.05487874348958333, "learning_rate": 2.568078686979272e-05, "loss": 4.3963, "loss/crossentropy": 2.16377916932106, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22294186055660248, "step": 9866 }, { "epoch": 0.8223333333333334, "grad_norm": 5.1875, "grad_norm_var": 0.07623697916666666, "learning_rate": 2.565579112400676e-05, "loss": 5.5977, "loss/crossentropy": 2.2194560170173645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2272995077073574, "step": 9868 }, { "epoch": 0.8225, "grad_norm": 4.40625, "grad_norm_var": 0.07727864583333334, "learning_rate": 2.5630791292720804e-05, "loss": 4.823, "loss/crossentropy": 1.572212889790535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1896784622222185, "step": 9870 }, { "epoch": 0.8226666666666667, "grad_norm": 4.75, "grad_norm_var": 0.07727457682291666, "learning_rate": 2.5605787437619443e-05, "loss": 4.8361, "loss/crossentropy": 1.8452747240662575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20604299381375313, "step": 9872 }, { "epoch": 0.8228333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.07838134765625, "learning_rate": 2.55807796203972e-05, "loss": 4.599, "loss/crossentropy": 1.5376994386315346, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13793596252799034, "step": 9874 }, { "epoch": 0.823, "grad_norm": 5.375, "grad_norm_var": 0.105712890625, "learning_rate": 2.5555767902758398e-05, "loss": 5.1798, "loss/crossentropy": 1.973397634923458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19167595729231834, "step": 9876 }, { "epoch": 0.8231666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08648681640625, "learning_rate": 2.5530752346416934e-05, "loss": 4.8992, "loss/crossentropy": 1.2014049515128136, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13308897987008095, "step": 9878 }, { "epoch": 0.8233333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.0916015625, "learning_rate": 2.5505733013096236e-05, "loss": 4.9195, "loss/crossentropy": 2.278904974460602, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2010067068040371, "step": 9880 }, { "epoch": 0.8235, "grad_norm": 4.75, "grad_norm_var": 0.06940104166666666, "learning_rate": 2.5480709964529e-05, "loss": 5.2708, "loss/crossentropy": 2.5370509028434753, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21254604309797287, "step": 9882 }, { "epoch": 0.8236666666666667, "grad_norm": 4.375, "grad_norm_var": 0.060139973958333336, "learning_rate": 2.5455683262457127e-05, "loss": 4.6252, "loss/crossentropy": 1.7036000490188599, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1745048500597477, "step": 9884 }, { "epoch": 0.8238333333333333, "grad_norm": 4.75, "grad_norm_var": 0.05310872395833333, "learning_rate": 2.54306529686315e-05, "loss": 5.043, "loss/crossentropy": 2.425258755683899, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19860392063856125, "step": 9886 }, { "epoch": 0.824, "grad_norm": 4.40625, "grad_norm_var": 0.05771077473958333, "learning_rate": 2.54056191448119e-05, "loss": 5.3246, "loss/crossentropy": 2.094254046678543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20764774084091187, "step": 9888 }, { "epoch": 0.8241666666666667, "grad_norm": 4.625, "grad_norm_var": 0.05481770833333333, "learning_rate": 2.538058185276678e-05, "loss": 5.0141, "loss/crossentropy": 2.3725812435150146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19575026631355286, "step": 9890 }, { "epoch": 0.8243333333333334, "grad_norm": 4.875, "grad_norm_var": 0.030712890625, "learning_rate": 2.535554115427318e-05, "loss": 5.0221, "loss/crossentropy": 2.5913625955581665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21525685489177704, "step": 9892 }, { "epoch": 0.8245, "grad_norm": 4.71875, "grad_norm_var": 0.04927978515625, "learning_rate": 2.5330497111116536e-05, "loss": 4.4302, "loss/crossentropy": 2.011984132230282, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17140764743089676, "step": 9894 }, { "epoch": 0.8246666666666667, "grad_norm": 4.875, "grad_norm_var": 0.04724934895833333, "learning_rate": 2.5305449785090526e-05, "loss": 4.5671, "loss/crossentropy": 2.1562889516353607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18358014523983002, "step": 9896 }, { "epoch": 0.8248333333333333, "grad_norm": 3.875, "grad_norm_var": 0.07486979166666667, "learning_rate": 2.5280399237996946e-05, "loss": 4.2635, "loss/crossentropy": 1.3045726418495178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14434479922056198, "step": 9898 }, { "epoch": 0.825, "grad_norm": 4.40625, "grad_norm_var": 0.07786051432291667, "learning_rate": 2.525534553164552e-05, "loss": 4.1657, "loss/crossentropy": 1.8639636635780334, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1716986782848835, "step": 9900 }, { "epoch": 0.8251666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.07552083333333333, "learning_rate": 2.5230288727853794e-05, "loss": 4.9954, "loss/crossentropy": 2.545925259590149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2545745261013508, "step": 9902 }, { "epoch": 0.8253333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.0828125, "learning_rate": 2.520522888844693e-05, "loss": 5.1413, "loss/crossentropy": 2.4325710237026215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21113605424761772, "step": 9904 }, { "epoch": 0.8255, "grad_norm": 5.0625, "grad_norm_var": 0.109228515625, "learning_rate": 2.518016607525759e-05, "loss": 5.6142, "loss/crossentropy": 2.188798248767853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2361520528793335, "step": 9906 }, { "epoch": 0.8256666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.10181884765625, "learning_rate": 2.5155100350125777e-05, "loss": 4.8668, "loss/crossentropy": 2.0001536905765533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2016005963087082, "step": 9908 }, { "epoch": 0.8258333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.08548177083333333, "learning_rate": 2.513003177489867e-05, "loss": 4.6742, "loss/crossentropy": 1.218208484351635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1346103437244892, "step": 9910 }, { "epoch": 0.826, "grad_norm": 4.4375, "grad_norm_var": 0.09514567057291666, "learning_rate": 2.5104960411430498e-05, "loss": 4.0595, "loss/crossentropy": 1.4826791658997536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15879861637949944, "step": 9912 }, { "epoch": 0.8261666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.06496988932291667, "learning_rate": 2.507988632158235e-05, "loss": 4.3835, "loss/crossentropy": 1.2572619915008545, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14165034517645836, "step": 9914 }, { "epoch": 0.8263333333333334, "grad_norm": 4.21875, "grad_norm_var": 0.09234619140625, "learning_rate": 2.505480956722205e-05, "loss": 4.9511, "loss/crossentropy": 1.2375790998339653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.134348314255476, "step": 9916 }, { "epoch": 0.8265, "grad_norm": 3.890625, "grad_norm_var": 0.13383687337239583, "learning_rate": 2.5029730210224e-05, "loss": 4.1284, "loss/crossentropy": 2.0290369763970375, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17147668451070786, "step": 9918 }, { "epoch": 0.8266666666666667, "grad_norm": 4.25, "grad_norm_var": 0.14077860514322918, "learning_rate": 2.5004648312469017e-05, "loss": 4.6698, "loss/crossentropy": 1.3962792977690697, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14946845173835754, "step": 9920 }, { "epoch": 0.8268333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.14575093587239582, "learning_rate": 2.4979563935844192e-05, "loss": 4.7106, "loss/crossentropy": 2.4267687797546387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2070532701909542, "step": 9922 }, { "epoch": 0.827, "grad_norm": 4.9375, "grad_norm_var": 0.15075581868489582, "learning_rate": 2.4954477142242738e-05, "loss": 5.7162, "loss/crossentropy": 2.0101495683193207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21650001779198647, "step": 9924 }, { "epoch": 0.8271666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.15583394368489584, "learning_rate": 2.492938799356381e-05, "loss": 4.6152, "loss/crossentropy": 2.140102058649063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18941906467080116, "step": 9926 }, { "epoch": 0.8273333333333334, "grad_norm": 4.375, "grad_norm_var": 0.13820699055989583, "learning_rate": 2.49042965517124e-05, "loss": 4.2472, "loss/crossentropy": 2.1069682240486145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1918292548507452, "step": 9928 }, { "epoch": 0.8275, "grad_norm": 5.03125, "grad_norm_var": 0.1607818603515625, "learning_rate": 2.4879202878599137e-05, "loss": 4.9035, "loss/crossentropy": 2.698134481906891, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24367520585656166, "step": 9930 }, { "epoch": 0.8276666666666667, "grad_norm": 4.75, "grad_norm_var": 0.1223541259765625, "learning_rate": 2.485410703614017e-05, "loss": 4.5958, "loss/crossentropy": 2.365455448627472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22331998497247696, "step": 9932 }, { "epoch": 0.8278333333333333, "grad_norm": 4.75, "grad_norm_var": 0.08313395182291666, "learning_rate": 2.4829009086257e-05, "loss": 5.3997, "loss/crossentropy": 2.537370502948761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20775676891207695, "step": 9934 }, { "epoch": 0.828, "grad_norm": 4.84375, "grad_norm_var": 0.07506510416666666, "learning_rate": 2.4803909090876318e-05, "loss": 5.3511, "loss/crossentropy": 2.4864864349365234, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117943949997425, "step": 9936 }, { "epoch": 0.8281666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.09146728515625, "learning_rate": 2.4778807111929868e-05, "loss": 5.305, "loss/crossentropy": 2.207614630460739, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929798237979412, "step": 9938 }, { "epoch": 0.8283333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.09073893229166667, "learning_rate": 2.4753703211354285e-05, "loss": 5.2632, "loss/crossentropy": 1.7467438951134682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1611561868339777, "step": 9940 }, { "epoch": 0.8285, "grad_norm": 4.53125, "grad_norm_var": 0.08084309895833333, "learning_rate": 2.472859745109096e-05, "loss": 5.0141, "loss/crossentropy": 2.3705212473869324, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19435375928878784, "step": 9942 }, { "epoch": 0.8286666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.06728108723958333, "learning_rate": 2.4703489893085842e-05, "loss": 4.592, "loss/crossentropy": 1.6773193031549454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19796937331557274, "step": 9944 }, { "epoch": 0.8288333333333333, "grad_norm": 4.625, "grad_norm_var": 0.040999348958333334, "learning_rate": 2.4678380599289352e-05, "loss": 5.1897, "loss/crossentropy": 2.335031569004059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20903323218226433, "step": 9946 }, { "epoch": 0.829, "grad_norm": 4.5, "grad_norm_var": 0.04599202473958333, "learning_rate": 2.4653269631656164e-05, "loss": 4.9803, "loss/crossentropy": 1.9900788962841034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1772918961942196, "step": 9948 }, { "epoch": 0.8291666666666667, "grad_norm": 4.25, "grad_norm_var": 0.06456705729166666, "learning_rate": 2.46281570521451e-05, "loss": 4.5932, "loss/crossentropy": 1.3242772594094276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14987998828291893, "step": 9950 }, { "epoch": 0.8293333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.0826171875, "learning_rate": 2.4603042922718956e-05, "loss": 5.2458, "loss/crossentropy": 2.7143561840057373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20748181268572807, "step": 9952 }, { "epoch": 0.8295, "grad_norm": 4.625, "grad_norm_var": 0.04381103515625, "learning_rate": 2.4577927305344343e-05, "loss": 4.8155, "loss/crossentropy": 2.2978959679603577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21883852034807205, "step": 9954 }, { "epoch": 0.8296666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.03828125, "learning_rate": 2.4552810261991564e-05, "loss": 4.653, "loss/crossentropy": 2.2232907116413116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2165231816470623, "step": 9956 }, { "epoch": 0.8298333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.04869791666666667, "learning_rate": 2.4527691854634405e-05, "loss": 5.1506, "loss/crossentropy": 1.845031201839447, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16416467167437077, "step": 9958 }, { "epoch": 0.83, "grad_norm": 4.3125, "grad_norm_var": 0.055013020833333336, "learning_rate": 2.4502572145250055e-05, "loss": 4.6119, "loss/crossentropy": 1.3500414192676544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15559666231274605, "step": 9960 }, { "epoch": 0.8301666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.05870768229166667, "learning_rate": 2.4477451195818896e-05, "loss": 5.065, "loss/crossentropy": 1.7964719235897064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1803671047091484, "step": 9962 }, { "epoch": 0.8303333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.06027018229166667, "learning_rate": 2.4452329068324377e-05, "loss": 5.0383, "loss/crossentropy": 2.097862370312214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18102774024009705, "step": 9964 }, { "epoch": 0.8305, "grad_norm": 4.625, "grad_norm_var": 0.04924723307291667, "learning_rate": 2.4427205824752846e-05, "loss": 5.3962, "loss/crossentropy": 2.1204554736614227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20536362007260323, "step": 9966 }, { "epoch": 0.8306666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.036421712239583334, "learning_rate": 2.4402081527093407e-05, "loss": 4.4037, "loss/crossentropy": 1.9657414183020592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2005226742476225, "step": 9968 }, { "epoch": 0.8308333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.0361328125, "learning_rate": 2.4376956237337765e-05, "loss": 5.0019, "loss/crossentropy": 2.4795849323272705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20067550241947174, "step": 9970 }, { "epoch": 0.831, "grad_norm": 4.5625, "grad_norm_var": 0.035416666666666666, "learning_rate": 2.4351830017480085e-05, "loss": 4.7479, "loss/crossentropy": 1.7610122039914131, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1636796798557043, "step": 9972 }, { "epoch": 0.8311666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.02574462890625, "learning_rate": 2.4326702929516813e-05, "loss": 5.3112, "loss/crossentropy": 2.6836007833480835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21438656002283096, "step": 9974 }, { "epoch": 0.8313333333333334, "grad_norm": 4.28125, "grad_norm_var": 0.029427083333333333, "learning_rate": 2.4301575035446536e-05, "loss": 4.3757, "loss/crossentropy": 1.2276001051068306, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13492096588015556, "step": 9976 }, { "epoch": 0.8315, "grad_norm": 4.4375, "grad_norm_var": 0.028251139322916667, "learning_rate": 2.4276446397269836e-05, "loss": 4.5018, "loss/crossentropy": 1.9868988022208214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16871058754622936, "step": 9978 }, { "epoch": 0.8316666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.025419108072916665, "learning_rate": 2.4251317076989134e-05, "loss": 4.4441, "loss/crossentropy": 1.9331146478652954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816711686551571, "step": 9980 }, { "epoch": 0.8318333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.026236979166666667, "learning_rate": 2.422618713660853e-05, "loss": 4.791, "loss/crossentropy": 2.5316545963287354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20427104085683823, "step": 9982 }, { "epoch": 0.832, "grad_norm": 4.5, "grad_norm_var": 0.021122233072916666, "learning_rate": 2.4201056638133647e-05, "loss": 4.9682, "loss/crossentropy": 1.7062507718801498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16023046895861626, "step": 9984 }, { "epoch": 0.8321666666666667, "grad_norm": 4.5, "grad_norm_var": 0.030855305989583335, "learning_rate": 2.4175925643571495e-05, "loss": 5.2089, "loss/crossentropy": 2.221466898918152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19790564477443695, "step": 9986 }, { "epoch": 0.8323333333333334, "grad_norm": 4.3125, "grad_norm_var": 0.03511962890625, "learning_rate": 2.4150794214930314e-05, "loss": 5.0092, "loss/crossentropy": 2.4697205424308777, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19727880507707596, "step": 9988 }, { "epoch": 0.8325, "grad_norm": 4.71875, "grad_norm_var": 0.03404947916666667, "learning_rate": 2.4125662414219387e-05, "loss": 4.8074, "loss/crossentropy": 1.8578790351748466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1584503035992384, "step": 9990 }, { "epoch": 0.8326666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.024674479166666666, "learning_rate": 2.4100530303448946e-05, "loss": 4.1005, "loss/crossentropy": 2.0569470822811127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2027450054883957, "step": 9992 }, { "epoch": 0.8328333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.026416015625, "learning_rate": 2.4075397944629976e-05, "loss": 4.3906, "loss/crossentropy": 1.9774124771356583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17903640680015087, "step": 9994 }, { "epoch": 0.833, "grad_norm": 4.28125, "grad_norm_var": 0.04400634765625, "learning_rate": 2.4050265399774072e-05, "loss": 4.7254, "loss/crossentropy": 1.9097615107893944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1601880006492138, "step": 9996 }, { "epoch": 0.8331666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.04309488932291667, "learning_rate": 2.4025132730893298e-05, "loss": 5.4207, "loss/crossentropy": 2.268464207649231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21927351877093315, "step": 9998 }, { "epoch": 0.8333333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.045182291666666666, "learning_rate": 2.4000000000000004e-05, "loss": 4.8914, "loss/crossentropy": 2.4459827542304993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19904977455735207, "step": 10000 }, { "epoch": 0.8335, "grad_norm": 5.8125, "grad_norm_var": 0.13088785807291667, "learning_rate": 2.397486726910671e-05, "loss": 5.2496, "loss/crossentropy": 2.393665611743927, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21754077449440956, "step": 10002 }, { "epoch": 0.8336666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.12320556640625, "learning_rate": 2.394973460022593e-05, "loss": 4.242, "loss/crossentropy": 1.3170356079936028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14232172816991806, "step": 10004 }, { "epoch": 0.8338333333333333, "grad_norm": 4.5, "grad_norm_var": 0.125244140625, "learning_rate": 2.392460205537003e-05, "loss": 4.5769, "loss/crossentropy": 1.8860590308904648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1884509939700365, "step": 10006 }, { "epoch": 0.834, "grad_norm": 4.53125, "grad_norm_var": 0.12538655598958334, "learning_rate": 2.3899469696551058e-05, "loss": 5.1368, "loss/crossentropy": 2.2285009026527405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22375357151031494, "step": 10008 }, { "epoch": 0.8341666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.12401936848958334, "learning_rate": 2.3874337585780624e-05, "loss": 4.5004, "loss/crossentropy": 1.4645071625709534, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14867208898067474, "step": 10010 }, { "epoch": 0.8343333333333334, "grad_norm": 5.15625, "grad_norm_var": 0.11417643229166667, "learning_rate": 2.3849205785069698e-05, "loss": 5.1683, "loss/crossentropy": 2.0563749074935913, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22186800837516785, "step": 10012 }, { "epoch": 0.8345, "grad_norm": 4.4375, "grad_norm_var": 0.11985270182291667, "learning_rate": 2.3824074356428513e-05, "loss": 4.7704, "loss/crossentropy": 1.532589927315712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1682576658204198, "step": 10014 }, { "epoch": 0.8346666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.134375, "learning_rate": 2.379894336186636e-05, "loss": 4.8072, "loss/crossentropy": 1.8966995403170586, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17377831041812897, "step": 10016 }, { "epoch": 0.8348333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.06295166015625, "learning_rate": 2.3773812863391483e-05, "loss": 4.8095, "loss/crossentropy": 2.5535982847213745, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20435542613267899, "step": 10018 }, { "epoch": 0.835, "grad_norm": 4.25, "grad_norm_var": 0.07265625, "learning_rate": 2.3748682923010877e-05, "loss": 4.0512, "loss/crossentropy": 1.9317252039909363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1969212256371975, "step": 10020 }, { "epoch": 0.8351666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.07431233723958333, "learning_rate": 2.3723553602730176e-05, "loss": 4.6774, "loss/crossentropy": 2.494425058364868, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20477550849318504, "step": 10022 }, { "epoch": 0.8353333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.07909749348958334, "learning_rate": 2.3698424964553475e-05, "loss": 5.1466, "loss/crossentropy": 2.3530495166778564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.201245229691267, "step": 10024 }, { "epoch": 0.8355, "grad_norm": 4.8125, "grad_norm_var": 0.061258951822916664, "learning_rate": 2.3673297070483198e-05, "loss": 5.5255, "loss/crossentropy": 2.239256501197815, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2388983592391014, "step": 10026 }, { "epoch": 0.8356666666666667, "grad_norm": 4.75, "grad_norm_var": 0.038916015625, "learning_rate": 2.3648169982519923e-05, "loss": 5.1636, "loss/crossentropy": 1.9721302911639214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19041937962174416, "step": 10028 }, { "epoch": 0.8358333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.03964436848958333, "learning_rate": 2.3623043762662247e-05, "loss": 5.3836, "loss/crossentropy": 2.394495278596878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.254102174192667, "step": 10030 }, { "epoch": 0.836, "grad_norm": 4.625, "grad_norm_var": 0.059305826822916664, "learning_rate": 2.3597918472906605e-05, "loss": 4.5989, "loss/crossentropy": 2.5070912837982178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2160685546696186, "step": 10032 }, { "epoch": 0.8361666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.05276285807291667, "learning_rate": 2.3572794175247165e-05, "loss": 5.0263, "loss/crossentropy": 2.2234988510608673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21419087797403336, "step": 10034 }, { "epoch": 0.8363333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.045426432291666666, "learning_rate": 2.3547670931675635e-05, "loss": 4.6435, "loss/crossentropy": 1.7220133692026138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973039098083973, "step": 10036 }, { "epoch": 0.8365, "grad_norm": 4.46875, "grad_norm_var": 0.0587890625, "learning_rate": 2.3522548804181112e-05, "loss": 4.8134, "loss/crossentropy": 1.4040338397026062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1425228714942932, "step": 10038 }, { "epoch": 0.8366666666666667, "grad_norm": 4.625, "grad_norm_var": 0.058056640625, "learning_rate": 2.349742785474995e-05, "loss": 5.0962, "loss/crossentropy": 2.302014708518982, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21808390691876411, "step": 10040 }, { "epoch": 0.8368333333333333, "grad_norm": 4.25, "grad_norm_var": 0.07574462890625, "learning_rate": 2.3472308145365603e-05, "loss": 4.7411, "loss/crossentropy": 2.5134881734848022, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2047807015478611, "step": 10042 }, { "epoch": 0.837, "grad_norm": 4.34375, "grad_norm_var": 0.08153889973958334, "learning_rate": 2.3447189738008448e-05, "loss": 4.4727, "loss/crossentropy": 1.8269146978855133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2027146816253662, "step": 10044 }, { "epoch": 0.8371666666666666, "grad_norm": 4.625, "grad_norm_var": 0.08370768229166667, "learning_rate": 2.3422072694655668e-05, "loss": 4.5877, "loss/crossentropy": 1.7809841856360435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1944657415151596, "step": 10046 }, { "epoch": 0.8373333333333334, "grad_norm": 4.5, "grad_norm_var": 0.057840983072916664, "learning_rate": 2.3396957077281045e-05, "loss": 4.9396, "loss/crossentropy": 2.4442911744117737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2242444083094597, "step": 10048 }, { "epoch": 0.8375, "grad_norm": 5.03125, "grad_norm_var": 0.07003580729166667, "learning_rate": 2.337184294785491e-05, "loss": 5.2301, "loss/crossentropy": 1.9728785753250122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22261008620262146, "step": 10050 }, { "epoch": 0.8376666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.061747233072916664, "learning_rate": 2.334673036834384e-05, "loss": 4.4604, "loss/crossentropy": 1.158107079565525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1579295713454485, "step": 10052 }, { "epoch": 0.8378333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.05162760416666667, "learning_rate": 2.3321619400710656e-05, "loss": 4.4305, "loss/crossentropy": 1.447898805141449, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16212822496891022, "step": 10054 }, { "epoch": 0.838, "grad_norm": 4.46875, "grad_norm_var": 0.03839518229166667, "learning_rate": 2.329651010691417e-05, "loss": 4.9035, "loss/crossentropy": 2.616535484790802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21434472128748894, "step": 10056 }, { "epoch": 0.8381666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.03218994140625, "learning_rate": 2.3271402548909054e-05, "loss": 4.8799, "loss/crossentropy": 1.4755475595593452, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1441640816628933, "step": 10058 }, { "epoch": 0.8383333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.025809733072916667, "learning_rate": 2.324629678864572e-05, "loss": 5.3515, "loss/crossentropy": 2.7159085869789124, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20213079079985619, "step": 10060 }, { "epoch": 0.8385, "grad_norm": 4.40625, "grad_norm_var": 0.030973307291666665, "learning_rate": 2.3221192888070144e-05, "loss": 4.4702, "loss/crossentropy": 2.061556816101074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1886168085038662, "step": 10062 }, { "epoch": 0.8386666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.033707682291666666, "learning_rate": 2.319609090912369e-05, "loss": 4.614, "loss/crossentropy": 1.3036933615803719, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14548559859395027, "step": 10064 }, { "epoch": 0.8388333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.025846354166666665, "learning_rate": 2.317099091374301e-05, "loss": 4.9908, "loss/crossentropy": 1.1547905504703522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16585730761289597, "step": 10066 }, { "epoch": 0.839, "grad_norm": 5.40625, "grad_norm_var": 0.07138264973958333, "learning_rate": 2.3145892963859834e-05, "loss": 5.044, "loss/crossentropy": 1.499816857278347, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1658524088561535, "step": 10068 }, { "epoch": 0.8391666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.07818603515625, "learning_rate": 2.3120797121400874e-05, "loss": 4.9807, "loss/crossentropy": 2.348451852798462, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19513800367712975, "step": 10070 }, { "epoch": 0.8393333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.078369140625, "learning_rate": 2.309570344828761e-05, "loss": 4.866, "loss/crossentropy": 2.185619443655014, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21363262459635735, "step": 10072 }, { "epoch": 0.8395, "grad_norm": 4.21875, "grad_norm_var": 0.218603515625, "learning_rate": 2.3070612006436202e-05, "loss": 4.7391, "loss/crossentropy": 2.4252246618270874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19878173992037773, "step": 10074 }, { "epoch": 0.8396666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.24859619140625, "learning_rate": 2.304552285775727e-05, "loss": 5.27, "loss/crossentropy": 2.406407594680786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21240625903010368, "step": 10076 }, { "epoch": 0.8398333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.22805989583333333, "learning_rate": 2.3020436064155813e-05, "loss": 4.9108, "loss/crossentropy": 1.5165601968765259, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16400650143623352, "step": 10078 }, { "epoch": 0.84, "grad_norm": 4.71875, "grad_norm_var": 0.22141520182291666, "learning_rate": 2.2995351687530988e-05, "loss": 4.7594, "loss/crossentropy": 1.1999849155545235, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13244791328907013, "step": 10080 }, { "epoch": 0.8401666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.23917643229166666, "learning_rate": 2.297026978977601e-05, "loss": 4.7325, "loss/crossentropy": 2.0893143713474274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20484750717878342, "step": 10082 }, { "epoch": 0.8403333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.21301676432291666, "learning_rate": 2.294519043277796e-05, "loss": 5.2237, "loss/crossentropy": 2.0791936218738556, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17905857414007187, "step": 10084 }, { "epoch": 0.8405, "grad_norm": 4.375, "grad_norm_var": 0.21646728515625, "learning_rate": 2.2920113678417666e-05, "loss": 4.2325, "loss/crossentropy": 1.9750349968671799, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17366132140159607, "step": 10086 }, { "epoch": 0.8406666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.22083333333333333, "learning_rate": 2.289503958856951e-05, "loss": 5.316, "loss/crossentropy": 2.199047952890396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2029794305562973, "step": 10088 }, { "epoch": 0.8408333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.08069254557291666, "learning_rate": 2.2869968225101342e-05, "loss": 4.9191, "loss/crossentropy": 1.9835616052150726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18883539736270905, "step": 10090 }, { "epoch": 0.841, "grad_norm": 4.5625, "grad_norm_var": 0.04021809895833333, "learning_rate": 2.2844899649874234e-05, "loss": 4.5576, "loss/crossentropy": 0.9617295414209366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14817167818546295, "step": 10092 }, { "epoch": 0.8411666666666666, "grad_norm": 5.21875, "grad_norm_var": 0.06594645182291667, "learning_rate": 2.281983392474242e-05, "loss": 5.2977, "loss/crossentropy": 2.2602842450141907, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18473618105053902, "step": 10094 }, { "epoch": 0.8413333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.0697265625, "learning_rate": 2.2794771111553082e-05, "loss": 4.7655, "loss/crossentropy": 1.6119416430592537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1529501359909773, "step": 10096 }, { "epoch": 0.8415, "grad_norm": 4.46875, "grad_norm_var": 0.060347493489583334, "learning_rate": 2.2769711272146217e-05, "loss": 4.9663, "loss/crossentropy": 1.7058739140629768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1737150400876999, "step": 10098 }, { "epoch": 0.8416666666666667, "grad_norm": 5.3125, "grad_norm_var": 0.08670247395833333, "learning_rate": 2.2744654468354485e-05, "loss": 5.3056, "loss/crossentropy": 2.452378123998642, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19069649651646614, "step": 10100 }, { "epoch": 0.8418333333333333, "grad_norm": 4.5, "grad_norm_var": 0.08199462890625, "learning_rate": 2.2719600762003066e-05, "loss": 5.1376, "loss/crossentropy": 2.5809699296951294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19522631168365479, "step": 10102 }, { "epoch": 0.842, "grad_norm": 4.71875, "grad_norm_var": 0.083984375, "learning_rate": 2.2694550214909485e-05, "loss": 5.2027, "loss/crossentropy": 1.7924980521202087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17402834072709084, "step": 10104 }, { "epoch": 0.8421666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.08214518229166666, "learning_rate": 2.2669502888883476e-05, "loss": 5.0221, "loss/crossentropy": 2.1637089550495148, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20585686340928078, "step": 10106 }, { "epoch": 0.8423333333333334, "grad_norm": 4.625, "grad_norm_var": 0.07693684895833333, "learning_rate": 2.264445884572683e-05, "loss": 5.1873, "loss/crossentropy": 2.483547031879425, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21202965080738068, "step": 10108 }, { "epoch": 0.8425, "grad_norm": 4.4375, "grad_norm_var": 0.07967122395833333, "learning_rate": 2.261941814723323e-05, "loss": 4.2006, "loss/crossentropy": 1.1384671851992607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.128371087834239, "step": 10110 }, { "epoch": 0.8426666666666667, "grad_norm": 4.625, "grad_norm_var": 0.073291015625, "learning_rate": 2.2594380855188113e-05, "loss": 4.9687, "loss/crossentropy": 2.5191361010074615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2324436865746975, "step": 10112 }, { "epoch": 0.8428333333333333, "grad_norm": 4.5, "grad_norm_var": 0.072509765625, "learning_rate": 2.2569347031368506e-05, "loss": 4.51, "loss/crossentropy": 2.028899200260639, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18104288540780544, "step": 10114 }, { "epoch": 0.843, "grad_norm": 4.78125, "grad_norm_var": 0.04250895182291667, "learning_rate": 2.2544316737542884e-05, "loss": 5.0327, "loss/crossentropy": 1.9726526737213135, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18508725985884666, "step": 10116 }, { "epoch": 0.8431666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.03573811848958333, "learning_rate": 2.251929003547101e-05, "loss": 4.9454, "loss/crossentropy": 2.0947405397892, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19356149435043335, "step": 10118 }, { "epoch": 0.8433333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.03752848307291667, "learning_rate": 2.2494266986903775e-05, "loss": 4.5932, "loss/crossentropy": 1.8272379711270332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1713714934885502, "step": 10120 }, { "epoch": 0.8435, "grad_norm": 4.875, "grad_norm_var": 0.042317708333333336, "learning_rate": 2.2469247653583074e-05, "loss": 5.0663, "loss/crossentropy": 2.5811198949813843, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20255010202527046, "step": 10122 }, { "epoch": 0.8436666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.04529622395833333, "learning_rate": 2.244423209724161e-05, "loss": 5.0194, "loss/crossentropy": 2.0795632749795914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1663488782942295, "step": 10124 }, { "epoch": 0.8438333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.04104410807291667, "learning_rate": 2.2419220379602808e-05, "loss": 5.2162, "loss/crossentropy": 2.4612520933151245, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20894955843687057, "step": 10126 }, { "epoch": 0.844, "grad_norm": 4.3125, "grad_norm_var": 0.04950764973958333, "learning_rate": 2.239421256238056e-05, "loss": 4.9813, "loss/crossentropy": 2.0469383597373962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.180439043790102, "step": 10128 }, { "epoch": 0.8441666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.06415608723958334, "learning_rate": 2.2369208707279207e-05, "loss": 5.0879, "loss/crossentropy": 2.3578098118305206, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20639550685882568, "step": 10130 }, { "epoch": 0.8443333333333334, "grad_norm": 4.34375, "grad_norm_var": 0.06575113932291667, "learning_rate": 2.234420887599324e-05, "loss": 4.605, "loss/crossentropy": 1.8820787519216537, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16899964958429337, "step": 10132 }, { "epoch": 0.8445, "grad_norm": 5.125, "grad_norm_var": 0.080322265625, "learning_rate": 2.2319213130207284e-05, "loss": 5.2779, "loss/crossentropy": 2.2708524763584137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19943390414118767, "step": 10134 }, { "epoch": 0.8446666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.0740234375, "learning_rate": 2.2294221531595843e-05, "loss": 4.7132, "loss/crossentropy": 2.0893598422408104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17067280784249306, "step": 10136 }, { "epoch": 0.8448333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.08592122395833333, "learning_rate": 2.226923414182321e-05, "loss": 4.5546, "loss/crossentropy": 0.9215280041098595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10505077801644802, "step": 10138 }, { "epoch": 0.845, "grad_norm": 4.21875, "grad_norm_var": 0.08592122395833333, "learning_rate": 2.224425102254328e-05, "loss": 4.1395, "loss/crossentropy": 2.439699411392212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093421071767807, "step": 10140 }, { "epoch": 0.8451666666666666, "grad_norm": 4.875, "grad_norm_var": 0.55406494140625, "learning_rate": 2.2219272235399417e-05, "loss": 4.4381, "loss/crossentropy": 1.3956974297761917, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1445833072066307, "step": 10142 }, { "epoch": 0.8453333333333334, "grad_norm": 4.34375, "grad_norm_var": 0.53970947265625, "learning_rate": 2.2194297842024293e-05, "loss": 5.2216, "loss/crossentropy": 2.28021776676178, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18985600024461746, "step": 10144 }, { "epoch": 0.8455, "grad_norm": 4.625, "grad_norm_var": 0.5352864583333333, "learning_rate": 2.2169327904039754e-05, "loss": 5.1932, "loss/crossentropy": 1.7583889961242676, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17270105704665184, "step": 10146 }, { "epoch": 0.8456666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.5301717122395834, "learning_rate": 2.2144362483056622e-05, "loss": 4.6843, "loss/crossentropy": 1.759839728474617, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18034739792346954, "step": 10148 }, { "epoch": 0.8458333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.5265462239583333, "learning_rate": 2.2119401640674606e-05, "loss": 5.4814, "loss/crossentropy": 2.5909521877765656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22270163893699646, "step": 10150 }, { "epoch": 0.846, "grad_norm": 4.78125, "grad_norm_var": 0.52265625, "learning_rate": 2.209444543848209e-05, "loss": 4.5192, "loss/crossentropy": 1.9088744521141052, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19738979637622833, "step": 10152 }, { "epoch": 0.8461666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.5177083333333333, "learning_rate": 2.2069493938056033e-05, "loss": 4.2045, "loss/crossentropy": 2.0631661638617516, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19043312687426805, "step": 10154 }, { "epoch": 0.8463333333333334, "grad_norm": 4.625, "grad_norm_var": 2.4303385416666665, "learning_rate": 2.204454720096177e-05, "loss": 4.0872, "loss/crossentropy": 2.1815578043460846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19806847721338272, "step": 10156 }, { "epoch": 0.8465, "grad_norm": 4.78125, "grad_norm_var": 2.1123697916666666, "learning_rate": 2.2019605288752914e-05, "loss": 4.9222, "loss/crossentropy": 1.1190447807312012, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1384220365434885, "step": 10158 }, { "epoch": 0.8466666666666667, "grad_norm": 4.65625, "grad_norm_var": 2.0984212239583333, "learning_rate": 2.1994668262971133e-05, "loss": 4.9997, "loss/crossentropy": 1.367589220404625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1640252321958542, "step": 10160 }, { "epoch": 0.8468333333333333, "grad_norm": 4.625, "grad_norm_var": 2.0991170247395834, "learning_rate": 2.1969736185146077e-05, "loss": 4.6507, "loss/crossentropy": 1.1029272973537445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1302457209676504, "step": 10162 }, { "epoch": 0.847, "grad_norm": 4.8125, "grad_norm_var": 2.0749837239583333, "learning_rate": 2.1944809116795156e-05, "loss": 4.5281, "loss/crossentropy": 1.1658693552017212, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12650441750884056, "step": 10164 }, { "epoch": 0.8471666666666666, "grad_norm": 5.28125, "grad_norm_var": 2.0757120768229167, "learning_rate": 2.1919887119423447e-05, "loss": 5.1508, "loss/crossentropy": 1.959177941083908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1851571910083294, "step": 10166 }, { "epoch": 0.8473333333333334, "grad_norm": 4.40625, "grad_norm_var": 2.111747233072917, "learning_rate": 2.189497025452348e-05, "loss": 5.093, "loss/crossentropy": 2.24192276597023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19968308880925179, "step": 10168 }, { "epoch": 0.8475, "grad_norm": 4.78125, "grad_norm_var": 2.107405598958333, "learning_rate": 2.1870058583575168e-05, "loss": 4.9912, "loss/crossentropy": 2.1087652146816254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21696339920163155, "step": 10170 }, { "epoch": 0.8476666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.05703125, "learning_rate": 2.184515216804556e-05, "loss": 5.3077, "loss/crossentropy": 2.6343509554862976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213670052587986, "step": 10172 }, { "epoch": 0.8478333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.08166910807291666, "learning_rate": 2.1820251069388778e-05, "loss": 4.9912, "loss/crossentropy": 2.2396809458732605, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050902657210827, "step": 10174 }, { "epoch": 0.848, "grad_norm": 4.96875, "grad_norm_var": 0.09166259765625, "learning_rate": 2.1795355349045796e-05, "loss": 5.2577, "loss/crossentropy": 2.42908251285553, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2073732390999794, "step": 10176 }, { "epoch": 0.8481666666666666, "grad_norm": 5.125, "grad_norm_var": 0.09986979166666667, "learning_rate": 2.177046506844433e-05, "loss": 5.0577, "loss/crossentropy": 1.525025613605976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19690265133976936, "step": 10178 }, { "epoch": 0.8483333333333334, "grad_norm": 4.625, "grad_norm_var": 0.09830729166666667, "learning_rate": 2.174558028899868e-05, "loss": 5.1652, "loss/crossentropy": 2.395362615585327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21092161908745766, "step": 10180 }, { "epoch": 0.8485, "grad_norm": 5.1875, "grad_norm_var": 0.09722900390625, "learning_rate": 2.1720701072109564e-05, "loss": 5.2021, "loss/crossentropy": 1.5952081009745598, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.27194925770163536, "step": 10182 }, { "epoch": 0.8486666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.07851155598958333, "learning_rate": 2.1695827479163967e-05, "loss": 4.9746, "loss/crossentropy": 2.3871026039123535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2411057911813259, "step": 10184 }, { "epoch": 0.8488333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.07057291666666667, "learning_rate": 2.167095957153502e-05, "loss": 5.1706, "loss/crossentropy": 1.297366164624691, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14902203157544136, "step": 10186 }, { "epoch": 0.849, "grad_norm": 4.71875, "grad_norm_var": 0.08006184895833333, "learning_rate": 2.1646097410581804e-05, "loss": 4.6391, "loss/crossentropy": 1.8485392034053802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16621309891343117, "step": 10188 }, { "epoch": 0.8491666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.065869140625, "learning_rate": 2.1621241057649236e-05, "loss": 4.9836, "loss/crossentropy": 2.1049680411815643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2324654832482338, "step": 10190 }, { "epoch": 0.8493333333333334, "grad_norm": 4.625, "grad_norm_var": 0.05709635416666667, "learning_rate": 2.159639057406789e-05, "loss": 4.6144, "loss/crossentropy": 1.404382936656475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13865314982831478, "step": 10192 }, { "epoch": 0.8495, "grad_norm": 5.15625, "grad_norm_var": 0.058577473958333334, "learning_rate": 2.1571546021153863e-05, "loss": 5.2336, "loss/crossentropy": 2.1182867288589478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24229948967695236, "step": 10194 }, { "epoch": 0.8496666666666667, "grad_norm": 4.375, "grad_norm_var": 0.06638997395833333, "learning_rate": 2.1546707460208634e-05, "loss": 4.7806, "loss/crossentropy": 1.6670853942632675, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16587894223630428, "step": 10196 }, { "epoch": 0.8498333333333333, "grad_norm": 4.5, "grad_norm_var": 0.07177327473958334, "learning_rate": 2.1521874952518863e-05, "loss": 5.4903, "loss/crossentropy": 1.9937995970249176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17633923329412937, "step": 10198 }, { "epoch": 0.85, "grad_norm": 5.25, "grad_norm_var": 0.09464518229166667, "learning_rate": 2.149704855935631e-05, "loss": 5.1717, "loss/crossentropy": 1.7172137647867203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18349767476320267, "step": 10200 }, { "epoch": 0.8501666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.09670817057291667, "learning_rate": 2.1472228341977624e-05, "loss": 5.0091, "loss/crossentropy": 2.4879343509674072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22031700983643532, "step": 10202 }, { "epoch": 0.8503333333333334, "grad_norm": 22.375, "grad_norm_var": 19.593550618489584, "learning_rate": 2.1447414361624216e-05, "loss": 4.6357, "loss/crossentropy": 1.5000810474157333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13241293840110302, "step": 10204 }, { "epoch": 0.8505, "grad_norm": 4.78125, "grad_norm_var": 19.64205322265625, "learning_rate": 2.142260667952214e-05, "loss": 4.5955, "loss/crossentropy": 2.1218055486679077, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21866939589381218, "step": 10206 }, { "epoch": 0.8506666666666667, "grad_norm": 4.90625, "grad_norm_var": 19.614046223958333, "learning_rate": 2.1397805356881863e-05, "loss": 4.6238, "loss/crossentropy": 2.0160721242427826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22799114137887955, "step": 10208 }, { "epoch": 0.8508333333333333, "grad_norm": 4.46875, "grad_norm_var": 19.658268229166666, "learning_rate": 2.1373010454898198e-05, "loss": 4.6067, "loss/crossentropy": 1.691223792731762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17581338994204998, "step": 10210 }, { "epoch": 0.851, "grad_norm": 4.375, "grad_norm_var": 19.744755045572916, "learning_rate": 2.1348222034750083e-05, "loss": 4.1484, "loss/crossentropy": 1.6983703970909119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15516823064535856, "step": 10212 }, { "epoch": 0.8511666666666666, "grad_norm": 4.53125, "grad_norm_var": 19.739567057291666, "learning_rate": 2.13234401576005e-05, "loss": 4.5592, "loss/crossentropy": 1.9170349910855293, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19698243960738182, "step": 10214 }, { "epoch": 0.8513333333333334, "grad_norm": 4.84375, "grad_norm_var": 19.742708333333333, "learning_rate": 2.129866488459626e-05, "loss": 4.6056, "loss/crossentropy": 2.0503681302070618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23426194116473198, "step": 10216 }, { "epoch": 0.8515, "grad_norm": 4.59375, "grad_norm_var": 19.671354166666667, "learning_rate": 2.1273896276867886e-05, "loss": 4.7673, "loss/crossentropy": 2.1819980144500732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21132346987724304, "step": 10218 }, { "epoch": 0.8516666666666667, "grad_norm": 4.75, "grad_norm_var": 0.09136962890625, "learning_rate": 2.1249134395529447e-05, "loss": 5.0062, "loss/crossentropy": 2.1964263021945953, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2328333593904972, "step": 10220 }, { "epoch": 0.8518333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.08355712890625, "learning_rate": 2.122437930167844e-05, "loss": 4.7946, "loss/crossentropy": 1.8490115702152252, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21102577820420265, "step": 10222 }, { "epoch": 0.852, "grad_norm": 4.84375, "grad_norm_var": 0.08271077473958334, "learning_rate": 2.1199631056395583e-05, "loss": 5.2719, "loss/crossentropy": 1.9915091469883919, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18897178769111633, "step": 10224 }, { "epoch": 0.8521666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.074609375, "learning_rate": 2.1174889720744725e-05, "loss": 5.1246, "loss/crossentropy": 2.4218207597732544, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24127807095646858, "step": 10226 }, { "epoch": 0.8523333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.053369140625, "learning_rate": 2.1150155355772642e-05, "loss": 5.446, "loss/crossentropy": 2.052487760782242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854759305715561, "step": 10228 }, { "epoch": 0.8525, "grad_norm": 4.59375, "grad_norm_var": 0.026167805989583334, "learning_rate": 2.112542802250892e-05, "loss": 5.3682, "loss/crossentropy": 2.172104150056839, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20149703696370125, "step": 10230 }, { "epoch": 0.8526666666666667, "grad_norm": 4.875, "grad_norm_var": 0.027457682291666667, "learning_rate": 2.1100707781965806e-05, "loss": 5.1003, "loss/crossentropy": 1.58599391579628, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1738888956606388, "step": 10232 }, { "epoch": 0.8528333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.043778483072916666, "learning_rate": 2.1075994695138025e-05, "loss": 5.1155, "loss/crossentropy": 2.560574531555176, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20391079783439636, "step": 10234 }, { "epoch": 0.853, "grad_norm": 4.875, "grad_norm_var": 0.04412434895833333, "learning_rate": 2.1051288823002663e-05, "loss": 5.4341, "loss/crossentropy": 2.2714935541152954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.219677422195673, "step": 10236 }, { "epoch": 0.8531666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.04608968098958333, "learning_rate": 2.1026590226519018e-05, "loss": 5.1789, "loss/crossentropy": 1.3990702331066132, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14530890248715878, "step": 10238 }, { "epoch": 0.8533333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.046614583333333334, "learning_rate": 2.1001898966628403e-05, "loss": 4.559, "loss/crossentropy": 2.0317687690258026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17451823875308037, "step": 10240 }, { "epoch": 0.8535, "grad_norm": 4.3125, "grad_norm_var": 0.05474853515625, "learning_rate": 2.097721510425407e-05, "loss": 4.9075, "loss/crossentropy": 2.2877692580223083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19254744797945023, "step": 10242 }, { "epoch": 0.8536666666666667, "grad_norm": 4.625, "grad_norm_var": 0.05673421223958333, "learning_rate": 2.0952538700300966e-05, "loss": 4.7423, "loss/crossentropy": 1.2009011879563332, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12820916064083576, "step": 10244 }, { "epoch": 0.8538333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.057417805989583334, "learning_rate": 2.0927869815655684e-05, "loss": 4.9864, "loss/crossentropy": 2.1787761747837067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2240334264934063, "step": 10246 }, { "epoch": 0.854, "grad_norm": 4.625, "grad_norm_var": 0.061442057291666664, "learning_rate": 2.090320851118624e-05, "loss": 5.1471, "loss/crossentropy": 2.5853514075279236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1966766081750393, "step": 10248 }, { "epoch": 0.8541666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.037984212239583336, "learning_rate": 2.0878554847741956e-05, "loss": 5.1916, "loss/crossentropy": 2.4860697388648987, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21677172929048538, "step": 10250 }, { "epoch": 0.8543333333333333, "grad_norm": 4.875, "grad_norm_var": 0.046875, "learning_rate": 2.0853908886153285e-05, "loss": 4.6001, "loss/crossentropy": 1.6758419573307037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1749027669429779, "step": 10252 }, { "epoch": 0.8545, "grad_norm": 4.5, "grad_norm_var": 0.03707275390625, "learning_rate": 2.0829270687231693e-05, "loss": 4.2543, "loss/crossentropy": 2.1500919461250305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20609234645962715, "step": 10254 }, { "epoch": 0.8546666666666667, "grad_norm": 4.1875, "grad_norm_var": 0.042822265625, "learning_rate": 2.0804640311769494e-05, "loss": 4.4822, "loss/crossentropy": 1.4717730283737183, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15207753330469131, "step": 10256 }, { "epoch": 0.8548333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.045166015625, "learning_rate": 2.078001782053968e-05, "loss": 5.0939, "loss/crossentropy": 2.1801829636096954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21367578580975533, "step": 10258 }, { "epoch": 0.855, "grad_norm": 4.53125, "grad_norm_var": 0.0419921875, "learning_rate": 2.0755403274295807e-05, "loss": 4.4958, "loss/crossentropy": 2.5639131665229797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22637486085295677, "step": 10260 }, { "epoch": 0.8551666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.04491780598958333, "learning_rate": 2.0730796733771815e-05, "loss": 5.2363, "loss/crossentropy": 1.9212607964873314, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1764792650938034, "step": 10262 }, { "epoch": 0.8553333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.04918212890625, "learning_rate": 2.0706198259681907e-05, "loss": 4.3208, "loss/crossentropy": 2.1319038569927216, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2085258960723877, "step": 10264 }, { "epoch": 0.8555, "grad_norm": 4.96875, "grad_norm_var": 0.07272135416666667, "learning_rate": 2.0681607912720353e-05, "loss": 5.7044, "loss/crossentropy": 2.014607787132263, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217660091817379, "step": 10266 }, { "epoch": 0.8556666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.06360270182291666, "learning_rate": 2.065702575356142e-05, "loss": 4.9487, "loss/crossentropy": 1.988630086183548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18287063390016556, "step": 10268 }, { "epoch": 0.8558333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.0615234375, "learning_rate": 2.0632451842859113e-05, "loss": 4.5084, "loss/crossentropy": 2.0185526311397552, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2114994414150715, "step": 10270 }, { "epoch": 0.856, "grad_norm": 4.53125, "grad_norm_var": 0.052197265625, "learning_rate": 2.0607886241247135e-05, "loss": 4.7235, "loss/crossentropy": 2.469718277454376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23021500930190086, "step": 10272 }, { "epoch": 0.8561666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.0750152587890625, "learning_rate": 2.0583329009338646e-05, "loss": 4.3133, "loss/crossentropy": 1.1823057383298874, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1447651106864214, "step": 10274 }, { "epoch": 0.8563333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.08342997233072917, "learning_rate": 2.0558780207726193e-05, "loss": 4.6926, "loss/crossentropy": 1.8804874122142792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16276229731738567, "step": 10276 }, { "epoch": 0.8565, "grad_norm": 4.09375, "grad_norm_var": 0.0946685791015625, "learning_rate": 2.0534239896981488e-05, "loss": 4.8669, "loss/crossentropy": 2.515592932701111, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22185379639267921, "step": 10278 }, { "epoch": 0.8566666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.08923238118489583, "learning_rate": 2.050970813765533e-05, "loss": 4.7441, "loss/crossentropy": 1.2782834395766258, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1427957322448492, "step": 10280 }, { "epoch": 0.8568333333333333, "grad_norm": 4.21875, "grad_norm_var": 0.06330464680989584, "learning_rate": 2.0485184990277367e-05, "loss": 4.3806, "loss/crossentropy": 2.2299217581748962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995675452053547, "step": 10282 }, { "epoch": 0.857, "grad_norm": 4.40625, "grad_norm_var": 0.05628153483072917, "learning_rate": 2.046067051535605e-05, "loss": 4.4903, "loss/crossentropy": 1.19523473829031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16348830796778202, "step": 10284 }, { "epoch": 0.8571666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.05873921712239583, "learning_rate": 2.0436164773378402e-05, "loss": 5.01, "loss/crossentropy": 1.890808716416359, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17876779288053513, "step": 10286 }, { "epoch": 0.8573333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.05934956868489583, "learning_rate": 2.041166782480991e-05, "loss": 4.8465, "loss/crossentropy": 1.7806348651647568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16407202184200287, "step": 10288 }, { "epoch": 0.8575, "grad_norm": 4.5, "grad_norm_var": 0.03943684895833333, "learning_rate": 2.0387179730094343e-05, "loss": 4.7088, "loss/crossentropy": 1.9317216500639915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17379080504179, "step": 10290 }, { "epoch": 0.8576666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.03470052083333333, "learning_rate": 2.0362700549653663e-05, "loss": 5.1756, "loss/crossentropy": 2.081753820180893, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.230081208050251, "step": 10292 }, { "epoch": 0.8578333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.025846354166666665, "learning_rate": 2.03382303438878e-05, "loss": 4.9575, "loss/crossentropy": 1.958198145031929, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16479731537401676, "step": 10294 }, { "epoch": 0.858, "grad_norm": 4.53125, "grad_norm_var": 0.02613525390625, "learning_rate": 2.031376917317456e-05, "loss": 4.717, "loss/crossentropy": 1.3425401076674461, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1326687391847372, "step": 10296 }, { "epoch": 0.8581666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.02265625, "learning_rate": 2.028931709786944e-05, "loss": 4.9235, "loss/crossentropy": 1.9076469615101814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17414027452468872, "step": 10298 }, { "epoch": 0.8583333333333333, "grad_norm": 5.0, "grad_norm_var": 0.04192708333333333, "learning_rate": 2.026487417830552e-05, "loss": 4.4676, "loss/crossentropy": 1.4255196824669838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15519500523805618, "step": 10300 }, { "epoch": 0.8585, "grad_norm": 4.3125, "grad_norm_var": 0.04315999348958333, "learning_rate": 2.024044047479326e-05, "loss": 4.8051, "loss/crossentropy": 1.4676533862948418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14330729097127914, "step": 10302 }, { "epoch": 0.8586666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.0408203125, "learning_rate": 2.021601604762041e-05, "loss": 5.3457, "loss/crossentropy": 2.270623505115509, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2200910672545433, "step": 10304 }, { "epoch": 0.8588333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.04218343098958333, "learning_rate": 2.0191600957051802e-05, "loss": 4.7869, "loss/crossentropy": 2.379778265953064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1987297348678112, "step": 10306 }, { "epoch": 0.859, "grad_norm": 4.84375, "grad_norm_var": 0.05764567057291667, "learning_rate": 2.016719526332926e-05, "loss": 4.9602, "loss/crossentropy": 1.8067995011806488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1776756253093481, "step": 10308 }, { "epoch": 0.8591666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.0548828125, "learning_rate": 2.0142799026671387e-05, "loss": 5.0338, "loss/crossentropy": 1.6648282185196877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1662379615008831, "step": 10310 }, { "epoch": 0.8593333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.05435791015625, "learning_rate": 2.011841230727349e-05, "loss": 5.0295, "loss/crossentropy": 2.2086196839809418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2117423191666603, "step": 10312 }, { "epoch": 0.8595, "grad_norm": 4.78125, "grad_norm_var": 0.05445556640625, "learning_rate": 2.009403516530736e-05, "loss": 4.9409, "loss/crossentropy": 1.8415422439575195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.215627308934927, "step": 10314 }, { "epoch": 0.8596666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.0404296875, "learning_rate": 2.0069667660921183e-05, "loss": 5.2048, "loss/crossentropy": 2.0204322412610054, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18452575244009495, "step": 10316 }, { "epoch": 0.8598333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.032059733072916666, "learning_rate": 2.004530985423935e-05, "loss": 4.4636, "loss/crossentropy": 1.3031515032052994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13595690671354532, "step": 10318 }, { "epoch": 0.86, "grad_norm": 4.59375, "grad_norm_var": 0.031966145833333334, "learning_rate": 2.002096180536233e-05, "loss": 4.989, "loss/crossentropy": 2.1542540416121483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19251862913370132, "step": 10320 }, { "epoch": 0.8601666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.03435872395833333, "learning_rate": 1.9996623574366506e-05, "loss": 4.6652, "loss/crossentropy": 1.7660870179533958, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1680949106812477, "step": 10322 }, { "epoch": 0.8603333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.038655598958333336, "learning_rate": 1.997229522130405e-05, "loss": 5.0513, "loss/crossentropy": 1.868349775671959, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1752658188343048, "step": 10324 }, { "epoch": 0.8605, "grad_norm": 4.875, "grad_norm_var": 0.0478515625, "learning_rate": 1.994797680620275e-05, "loss": 4.5671, "loss/crossentropy": 2.250141680240631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2370268814265728, "step": 10326 }, { "epoch": 0.8606666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.04659830729166667, "learning_rate": 1.992366838906589e-05, "loss": 4.7318, "loss/crossentropy": 1.7279707714915276, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15387518890202045, "step": 10328 }, { "epoch": 0.8608333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.041910807291666664, "learning_rate": 1.9899370029872056e-05, "loss": 5.0707, "loss/crossentropy": 2.460710883140564, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20482278987765312, "step": 10330 }, { "epoch": 0.861, "grad_norm": 4.40625, "grad_norm_var": 0.048173014322916666, "learning_rate": 1.9875081788575047e-05, "loss": 4.2985, "loss/crossentropy": 2.021486707031727, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17060382664203644, "step": 10332 }, { "epoch": 0.8611666666666666, "grad_norm": 4.34375, "grad_norm_var": 0.05325113932291667, "learning_rate": 1.9850803725103674e-05, "loss": 5.1125, "loss/crossentropy": 1.4065138399600983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1290155854076147, "step": 10334 }, { "epoch": 0.8613333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.06417643229166667, "learning_rate": 1.9826535899361657e-05, "loss": 4.1981, "loss/crossentropy": 1.3243483901023865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14576907455921173, "step": 10336 }, { "epoch": 0.8615, "grad_norm": 4.78125, "grad_norm_var": 0.07786458333333333, "learning_rate": 1.9802278371227427e-05, "loss": 4.6557, "loss/crossentropy": 1.7146344780921936, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19457191228866577, "step": 10338 }, { "epoch": 0.8616666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.06560872395833334, "learning_rate": 1.9778031200554038e-05, "loss": 4.6025, "loss/crossentropy": 0.9829111769795418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13816312327980995, "step": 10340 }, { "epoch": 0.8618333333333333, "grad_norm": 4.625, "grad_norm_var": 0.07193603515625, "learning_rate": 1.9753794447168965e-05, "loss": 4.0344, "loss/crossentropy": 1.5932381376624107, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16735537350177765, "step": 10342 }, { "epoch": 0.862, "grad_norm": 4.875, "grad_norm_var": 0.0759765625, "learning_rate": 1.9729568170873997e-05, "loss": 5.3311, "loss/crossentropy": 2.4922866225242615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20134862512350082, "step": 10344 }, { "epoch": 0.8621666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.08033854166666667, "learning_rate": 1.970535243144505e-05, "loss": 4.7939, "loss/crossentropy": 1.3205213844776154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20733513124287128, "step": 10346 }, { "epoch": 0.8623333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.07483317057291666, "learning_rate": 1.9681147288632063e-05, "loss": 5.342, "loss/crossentropy": 1.4077673107385635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15148967504501343, "step": 10348 }, { "epoch": 0.8625, "grad_norm": 4.875, "grad_norm_var": 0.087109375, "learning_rate": 1.9656952802158816e-05, "loss": 5.494, "loss/crossentropy": 1.784704715013504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.165837112814188, "step": 10350 }, { "epoch": 0.8626666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.06754150390625, "learning_rate": 1.96327690317228e-05, "loss": 4.6651, "loss/crossentropy": 1.8337150737643242, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1464794110506773, "step": 10352 }, { "epoch": 0.8628333333333333, "grad_norm": 4.375, "grad_norm_var": 0.06523030598958333, "learning_rate": 1.9608596036995065e-05, "loss": 4.8617, "loss/crossentropy": 2.221179723739624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20009664818644524, "step": 10354 }, { "epoch": 0.863, "grad_norm": 4.90625, "grad_norm_var": 0.06724853515625, "learning_rate": 1.9584433877620075e-05, "loss": 4.8329, "loss/crossentropy": 2.1157293617725372, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24074679613113403, "step": 10356 }, { "epoch": 0.8631666666666666, "grad_norm": 4.75, "grad_norm_var": 0.04254150390625, "learning_rate": 1.9560282613215547e-05, "loss": 5.1425, "loss/crossentropy": 2.292990207672119, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21054679155349731, "step": 10358 }, { "epoch": 0.8633333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.04568684895833333, "learning_rate": 1.9536142303372337e-05, "loss": 4.7158, "loss/crossentropy": 1.4450874850153923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13776488974690437, "step": 10360 }, { "epoch": 0.8635, "grad_norm": 4.875, "grad_norm_var": 0.05545247395833333, "learning_rate": 1.9512013007654248e-05, "loss": 4.4627, "loss/crossentropy": 1.927704095840454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18812527135014534, "step": 10362 }, { "epoch": 0.8636666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.06100260416666667, "learning_rate": 1.9487894785597933e-05, "loss": 4.3119, "loss/crossentropy": 1.5006299167871475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1726713590323925, "step": 10364 }, { "epoch": 0.8638333333333333, "grad_norm": 4.875, "grad_norm_var": 0.05050455729166667, "learning_rate": 1.9463787696712696e-05, "loss": 5.0052, "loss/crossentropy": 1.9660705775022507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18909884989261627, "step": 10366 }, { "epoch": 0.864, "grad_norm": 4.1875, "grad_norm_var": 0.06276041666666667, "learning_rate": 1.9439691800480384e-05, "loss": 4.5395, "loss/crossentropy": 1.9638324081897736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17081649415194988, "step": 10368 }, { "epoch": 0.8641666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.05974934895833333, "learning_rate": 1.9415607156355228e-05, "loss": 5.0068, "loss/crossentropy": 1.952818602323532, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18014013394713402, "step": 10370 }, { "epoch": 0.8643333333333333, "grad_norm": 5.0, "grad_norm_var": 0.06614176432291667, "learning_rate": 1.93915338237637e-05, "loss": 4.9283, "loss/crossentropy": 1.7149565666913986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17573698051273823, "step": 10372 }, { "epoch": 0.8645, "grad_norm": 4.59375, "grad_norm_var": 0.067822265625, "learning_rate": 1.9367471862104334e-05, "loss": 5.0251, "loss/crossentropy": 1.1846980601549149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13160541094839573, "step": 10374 }, { "epoch": 0.8646666666666667, "grad_norm": 4.15625, "grad_norm_var": 0.729931640625, "learning_rate": 1.9343421330747656e-05, "loss": 4.4774, "loss/crossentropy": 1.783796139061451, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938588246703148, "step": 10376 }, { "epoch": 0.8648333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.72105712890625, "learning_rate": 1.9319382289035937e-05, "loss": 4.528, "loss/crossentropy": 2.5256577730178833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21793661266565323, "step": 10378 }, { "epoch": 0.865, "grad_norm": 4.65625, "grad_norm_var": 0.7090779622395833, "learning_rate": 1.929535479628314e-05, "loss": 5.0167, "loss/crossentropy": 1.3702474012970924, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1659705750644207, "step": 10380 }, { "epoch": 0.8651666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.7242838541666666, "learning_rate": 1.9271338911774705e-05, "loss": 4.5104, "loss/crossentropy": 2.072055459022522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18684400990605354, "step": 10382 }, { "epoch": 0.8653333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.7132649739583333, "learning_rate": 1.9247334694767446e-05, "loss": 5.0498, "loss/crossentropy": 1.4851181358098984, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1375308893620968, "step": 10384 }, { "epoch": 0.8655, "grad_norm": 4.625, "grad_norm_var": 0.70621337890625, "learning_rate": 1.9223342204489377e-05, "loss": 5.1391, "loss/crossentropy": 2.250565826892853, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24506643041968346, "step": 10386 }, { "epoch": 0.8656666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.6966756184895834, "learning_rate": 1.9199361500139587e-05, "loss": 4.4246, "loss/crossentropy": 1.9518256038427353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17143848165869713, "step": 10388 }, { "epoch": 0.8658333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.7186482747395834, "learning_rate": 1.9175392640888073e-05, "loss": 4.6129, "loss/crossentropy": 2.004493474960327, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18428033217787743, "step": 10390 }, { "epoch": 0.866, "grad_norm": 4.75, "grad_norm_var": 0.059765625, "learning_rate": 1.9151435685875622e-05, "loss": 4.9474, "loss/crossentropy": 1.5391086861491203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15014921128749847, "step": 10392 }, { "epoch": 0.8661666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.06064046223958333, "learning_rate": 1.912749069421363e-05, "loss": 5.3938, "loss/crossentropy": 2.432666063308716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20987457409501076, "step": 10394 }, { "epoch": 0.8663333333333333, "grad_norm": 5.0, "grad_norm_var": 0.07272135416666667, "learning_rate": 1.910355772498399e-05, "loss": 4.319, "loss/crossentropy": 0.9199870005249977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10082777217030525, "step": 10396 }, { "epoch": 0.8665, "grad_norm": 4.46875, "grad_norm_var": 0.06672770182291667, "learning_rate": 1.9079636837238923e-05, "loss": 4.5733, "loss/crossentropy": 1.7452101185917854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17284293472766876, "step": 10398 }, { "epoch": 0.8666666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.05514322916666667, "learning_rate": 1.9055728090000843e-05, "loss": 4.6767, "loss/crossentropy": 1.7718966230750084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16622867062687874, "step": 10400 }, { "epoch": 0.8668333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.04529622395833333, "learning_rate": 1.9031831542262203e-05, "loss": 4.9882, "loss/crossentropy": 1.9369821846485138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18681341037154198, "step": 10402 }, { "epoch": 0.867, "grad_norm": 4.53125, "grad_norm_var": 0.04973551432291667, "learning_rate": 1.9007947252985367e-05, "loss": 4.8377, "loss/crossentropy": 2.347927749156952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2116454504430294, "step": 10404 }, { "epoch": 0.8671666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.07431233723958333, "learning_rate": 1.898407528110243e-05, "loss": 5.2396, "loss/crossentropy": 2.3202788531780243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18752939999103546, "step": 10406 }, { "epoch": 0.8673333333333333, "grad_norm": 4.5, "grad_norm_var": 0.06751302083333334, "learning_rate": 1.8960215685515128e-05, "loss": 4.4656, "loss/crossentropy": 1.361263856291771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15047047100961208, "step": 10408 }, { "epoch": 0.8675, "grad_norm": 4.53125, "grad_norm_var": 0.07115885416666666, "learning_rate": 1.8936368525094623e-05, "loss": 4.9721, "loss/crossentropy": 2.467073440551758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2162875458598137, "step": 10410 }, { "epoch": 0.8676666666666667, "grad_norm": 5.4375, "grad_norm_var": 0.10846354166666666, "learning_rate": 1.891253385868143e-05, "loss": 4.8256, "loss/crossentropy": 2.3389711380004883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2039964720606804, "step": 10412 }, { "epoch": 0.8678333333333333, "grad_norm": 4.375, "grad_norm_var": 0.11080322265625, "learning_rate": 1.88887117450852e-05, "loss": 4.7788, "loss/crossentropy": 1.9970930740237236, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18604559637606144, "step": 10414 }, { "epoch": 0.868, "grad_norm": 4.6875, "grad_norm_var": 0.11174723307291666, "learning_rate": 1.8864902243084654e-05, "loss": 4.6257, "loss/crossentropy": 1.6402384638786316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1494317278265953, "step": 10416 }, { "epoch": 0.8681666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.10662434895833334, "learning_rate": 1.884110541142735e-05, "loss": 4.7642, "loss/crossentropy": 1.9170377254486084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19645358622074127, "step": 10418 }, { "epoch": 0.8683333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.10836181640625, "learning_rate": 1.8817321308829616e-05, "loss": 4.8152, "loss/crossentropy": 1.6103285178542137, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18402664922177792, "step": 10420 }, { "epoch": 0.8685, "grad_norm": 4.59375, "grad_norm_var": 0.077587890625, "learning_rate": 1.879354999397635e-05, "loss": 5.0893, "loss/crossentropy": 2.393158346414566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20873240754008293, "step": 10422 }, { "epoch": 0.8686666666666667, "grad_norm": 4.625, "grad_norm_var": 0.07649739583333333, "learning_rate": 1.8769791525520924e-05, "loss": 4.9919, "loss/crossentropy": 1.6317023634910583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17474069818854332, "step": 10424 }, { "epoch": 0.8688333333333333, "grad_norm": 4.5, "grad_norm_var": 0.07336832682291666, "learning_rate": 1.8746045962084985e-05, "loss": 5.0873, "loss/crossentropy": 2.3630973398685455, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21914517134428024, "step": 10426 }, { "epoch": 0.869, "grad_norm": 4.34375, "grad_norm_var": 0.03619791666666667, "learning_rate": 1.8722313362258357e-05, "loss": 4.7096, "loss/crossentropy": 1.8483033329248428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20285437256097794, "step": 10428 }, { "epoch": 0.8691666666666666, "grad_norm": 4.75, "grad_norm_var": 0.03404947916666667, "learning_rate": 1.8698593784598865e-05, "loss": 4.4115, "loss/crossentropy": 2.0544984862208366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19418617151677608, "step": 10430 }, { "epoch": 0.8693333333333333, "grad_norm": 4.5, "grad_norm_var": 0.03448893229166667, "learning_rate": 1.8674887287632217e-05, "loss": 4.746, "loss/crossentropy": 2.0632302463054657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20877571776509285, "step": 10432 }, { "epoch": 0.8695, "grad_norm": 4.78125, "grad_norm_var": 0.035139973958333334, "learning_rate": 1.865119392985183e-05, "loss": 4.7295, "loss/crossentropy": 2.368138611316681, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1977866180241108, "step": 10434 }, { "epoch": 0.8696666666666667, "grad_norm": 4.625, "grad_norm_var": 0.03883056640625, "learning_rate": 1.8627513769718714e-05, "loss": 5.0828, "loss/crossentropy": 1.7253614962100983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17387551069259644, "step": 10436 }, { "epoch": 0.8698333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.04732666015625, "learning_rate": 1.86038468656613e-05, "loss": 5.0308, "loss/crossentropy": 1.5639515295624733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19789879396557808, "step": 10438 }, { "epoch": 0.87, "grad_norm": 5.34375, "grad_norm_var": 0.08151041666666667, "learning_rate": 1.858019327607534e-05, "loss": 4.7967, "loss/crossentropy": 2.375213235616684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19904804602265358, "step": 10440 }, { "epoch": 0.8701666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.08590087890625, "learning_rate": 1.85565530593237e-05, "loss": 4.2816, "loss/crossentropy": 2.289244920015335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22079583629965782, "step": 10442 }, { "epoch": 0.8703333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.074853515625, "learning_rate": 1.853292627373627e-05, "loss": 4.6217, "loss/crossentropy": 1.7237029895186424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15398419462144375, "step": 10444 }, { "epoch": 0.8705, "grad_norm": 4.5625, "grad_norm_var": 0.07408854166666666, "learning_rate": 1.850931297760979e-05, "loss": 5.191, "loss/crossentropy": 1.8904145956039429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17381685227155685, "step": 10446 }, { "epoch": 0.8706666666666667, "grad_norm": 4.625, "grad_norm_var": 0.06754150390625, "learning_rate": 1.8485713229207733e-05, "loss": 4.889, "loss/crossentropy": 1.340662695467472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1496703438460827, "step": 10448 }, { "epoch": 0.8708333333333333, "grad_norm": 4.75, "grad_norm_var": 0.05670572916666667, "learning_rate": 1.8462127086760112e-05, "loss": 5.1296, "loss/crossentropy": 2.0262687131762505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17123587429523468, "step": 10450 }, { "epoch": 0.871, "grad_norm": 4.1875, "grad_norm_var": 0.07317708333333334, "learning_rate": 1.843855460846341e-05, "loss": 4.965, "loss/crossentropy": 2.0109422728419304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17585521936416626, "step": 10452 }, { "epoch": 0.8711666666666666, "grad_norm": 4.625, "grad_norm_var": 0.06731770833333334, "learning_rate": 1.8414995852480357e-05, "loss": 5.0754, "loss/crossentropy": 2.5963427424430847, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2299847975373268, "step": 10454 }, { "epoch": 0.8713333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.03345947265625, "learning_rate": 1.839145087693986e-05, "loss": 5.1549, "loss/crossentropy": 1.8669070899486542, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17287633568048477, "step": 10456 }, { "epoch": 0.8715, "grad_norm": 4.9375, "grad_norm_var": 0.03785400390625, "learning_rate": 1.8367919739936788e-05, "loss": 4.9381, "loss/crossentropy": 1.4938563853502274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16861506551504135, "step": 10458 }, { "epoch": 0.8716666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.04176025390625, "learning_rate": 1.834440249953189e-05, "loss": 4.9393, "loss/crossentropy": 1.6703289598226547, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17855097353458405, "step": 10460 }, { "epoch": 0.8718333333333333, "grad_norm": 4.75, "grad_norm_var": 0.04724934895833333, "learning_rate": 1.8320899213751614e-05, "loss": 4.6414, "loss/crossentropy": 1.7185562402009964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17512892372906208, "step": 10462 }, { "epoch": 0.872, "grad_norm": 4.625, "grad_norm_var": 0.0478515625, "learning_rate": 1.829740994058799e-05, "loss": 5.2955, "loss/crossentropy": 1.9877119585871696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1850297786295414, "step": 10464 }, { "epoch": 0.8721666666666666, "grad_norm": 4.5, "grad_norm_var": 0.046773274739583336, "learning_rate": 1.827393473799846e-05, "loss": 5.2927, "loss/crossentropy": 2.5606048107147217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21541300043463707, "step": 10466 }, { "epoch": 0.8723333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.03528238932291667, "learning_rate": 1.8250473663905756e-05, "loss": 4.6312, "loss/crossentropy": 2.208735913038254, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19427185505628586, "step": 10468 }, { "epoch": 0.8725, "grad_norm": 4.4375, "grad_norm_var": 0.034077962239583336, "learning_rate": 1.8227026776197735e-05, "loss": 4.8591, "loss/crossentropy": 1.5274348929524422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14625133015215397, "step": 10470 }, { "epoch": 0.8726666666666667, "grad_norm": 4.375, "grad_norm_var": 0.038671875, "learning_rate": 1.820359413272727e-05, "loss": 5.2586, "loss/crossentropy": 1.855014145374298, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1736547350883484, "step": 10472 }, { "epoch": 0.8728333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.036181640625, "learning_rate": 1.818017579131208e-05, "loss": 4.579, "loss/crossentropy": 1.6626396775245667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1938914842903614, "step": 10474 }, { "epoch": 0.873, "grad_norm": 4.65625, "grad_norm_var": 0.042643229166666664, "learning_rate": 1.81567718097346e-05, "loss": 5.3581, "loss/crossentropy": 1.793995201587677, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17096725851297379, "step": 10476 }, { "epoch": 0.8731666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.042952473958333334, "learning_rate": 1.8133382245741814e-05, "loss": 5.1202, "loss/crossentropy": 1.8239585757255554, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21488109789788723, "step": 10478 }, { "epoch": 0.8733333333333333, "grad_norm": 4.1875, "grad_norm_var": 0.052978515625, "learning_rate": 1.8110007157045157e-05, "loss": 4.63, "loss/crossentropy": 2.220126062631607, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20925140008330345, "step": 10480 }, { "epoch": 0.8735, "grad_norm": 4.6875, "grad_norm_var": 0.05797119140625, "learning_rate": 1.8086646601320327e-05, "loss": 4.891, "loss/crossentropy": 1.5069852694869041, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14469253458082676, "step": 10482 }, { "epoch": 0.8736666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.05728759765625, "learning_rate": 1.806330063620719e-05, "loss": 4.045, "loss/crossentropy": 1.8212331235408783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20925015211105347, "step": 10484 }, { "epoch": 0.8738333333333334, "grad_norm": 5.0, "grad_norm_var": 0.06790364583333333, "learning_rate": 1.8039969319309573e-05, "loss": 4.6383, "loss/crossentropy": 1.6038372293114662, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1596646960824728, "step": 10486 }, { "epoch": 0.874, "grad_norm": 4.84375, "grad_norm_var": 0.06614583333333333, "learning_rate": 1.8016652708195196e-05, "loss": 5.2458, "loss/crossentropy": 1.9456142485141754, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19872823730111122, "step": 10488 }, { "epoch": 0.8741666666666666, "grad_norm": 4.75, "grad_norm_var": 0.05896809895833333, "learning_rate": 1.799335086039547e-05, "loss": 5.0188, "loss/crossentropy": 2.1081501841545105, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1923701912164688, "step": 10490 }, { "epoch": 0.8743333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.0552734375, "learning_rate": 1.79700638334054e-05, "loss": 5.2991, "loss/crossentropy": 1.814670369029045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19245347008109093, "step": 10492 }, { "epoch": 0.8745, "grad_norm": 4.4375, "grad_norm_var": 0.05188802083333333, "learning_rate": 1.79467916846834e-05, "loss": 4.6803, "loss/crossentropy": 2.246580570936203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2071346677839756, "step": 10494 }, { "epoch": 0.8746666666666667, "grad_norm": 5.0625, "grad_norm_var": 0.06112874348958333, "learning_rate": 1.7923534471651186e-05, "loss": 5.0703, "loss/crossentropy": 2.5355364084243774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118670642375946, "step": 10496 }, { "epoch": 0.8748333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.05584309895833333, "learning_rate": 1.7900292251693618e-05, "loss": 4.8563, "loss/crossentropy": 1.5048917829990387, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14453205838799477, "step": 10498 }, { "epoch": 0.875, "grad_norm": 4.59375, "grad_norm_var": 0.04348551432291667, "learning_rate": 1.7877065082158567e-05, "loss": 4.7278, "loss/crossentropy": 2.3392655849456787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20232314616441727, "step": 10500 }, { "epoch": 0.8751666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.042801920572916666, "learning_rate": 1.7853853020356763e-05, "loss": 5.1296, "loss/crossentropy": 1.437909610569477, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20237400382757187, "step": 10502 }, { "epoch": 0.8753333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.04455973307291667, "learning_rate": 1.7830656123561658e-05, "loss": 5.0175, "loss/crossentropy": 1.020252212882042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12232361361384392, "step": 10504 }, { "epoch": 0.8755, "grad_norm": 4.6875, "grad_norm_var": 0.04407145182291667, "learning_rate": 1.7807474449009293e-05, "loss": 5.1914, "loss/crossentropy": 1.643537849187851, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.177352674305439, "step": 10506 }, { "epoch": 0.8756666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.0515625, "learning_rate": 1.7784308053898147e-05, "loss": 5.2378, "loss/crossentropy": 1.8363083899021149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17326289042830467, "step": 10508 }, { "epoch": 0.8758333333333334, "grad_norm": 4.96875, "grad_norm_var": 0.057535807291666664, "learning_rate": 1.7761156995388994e-05, "loss": 4.86, "loss/crossentropy": 2.0081919208168983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17866789363324642, "step": 10510 }, { "epoch": 0.876, "grad_norm": 4.25, "grad_norm_var": 0.04763997395833333, "learning_rate": 1.7738021330604765e-05, "loss": 4.4745, "loss/crossentropy": 0.8506453335285187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1177255641669035, "step": 10512 }, { "epoch": 0.8761666666666666, "grad_norm": 4.15625, "grad_norm_var": 0.06155192057291667, "learning_rate": 1.7714901116630424e-05, "loss": 4.8159, "loss/crossentropy": 2.144615203142166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23711377754807472, "step": 10514 }, { "epoch": 0.8763333333333333, "grad_norm": 5.0, "grad_norm_var": 0.08253580729166667, "learning_rate": 1.7691796410512784e-05, "loss": 5.3608, "loss/crossentropy": 2.1386347115039825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22258469462394714, "step": 10516 }, { "epoch": 0.8765, "grad_norm": 4.65625, "grad_norm_var": 0.08435872395833334, "learning_rate": 1.7668707269260435e-05, "loss": 5.1316, "loss/crossentropy": 2.270026445388794, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18253154307603836, "step": 10518 }, { "epoch": 0.8766666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08253580729166667, "learning_rate": 1.7645633749843512e-05, "loss": 5.412, "loss/crossentropy": 2.1419003307819366, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19707633554935455, "step": 10520 }, { "epoch": 0.8768333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.08495686848958334, "learning_rate": 1.762257590919365e-05, "loss": 5.1619, "loss/crossentropy": 1.8701740205287933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16680870950222015, "step": 10522 }, { "epoch": 0.877, "grad_norm": 4.53125, "grad_norm_var": 0.09342447916666667, "learning_rate": 1.7599533804203767e-05, "loss": 4.7449, "loss/crossentropy": 1.9194505885243416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18774541094899178, "step": 10524 }, { "epoch": 0.8771666666666667, "grad_norm": 5.0, "grad_norm_var": 0.09550374348958333, "learning_rate": 1.7576507491727975e-05, "loss": 5.3806, "loss/crossentropy": 2.5566156804561615, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.26438262313604355, "step": 10526 }, { "epoch": 0.8773333333333333, "grad_norm": 4.625, "grad_norm_var": 0.08609619140625, "learning_rate": 1.75534970285814e-05, "loss": 4.9388, "loss/crossentropy": 1.4661534652113914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15747330151498318, "step": 10528 }, { "epoch": 0.8775, "grad_norm": 4.40625, "grad_norm_var": 0.07174072265625, "learning_rate": 1.7530502471540084e-05, "loss": 4.6847, "loss/crossentropy": 2.2137043476104736, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23433979973196983, "step": 10530 }, { "epoch": 0.8776666666666667, "grad_norm": 4.625, "grad_norm_var": 0.055712890625, "learning_rate": 1.7507523877340803e-05, "loss": 5.2944, "loss/crossentropy": 1.5667135491967201, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16603326424956322, "step": 10532 }, { "epoch": 0.8778333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.04195556640625, "learning_rate": 1.748456130268096e-05, "loss": 4.5756, "loss/crossentropy": 1.7613427862524986, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17346342653036118, "step": 10534 }, { "epoch": 0.878, "grad_norm": 5.0, "grad_norm_var": 0.054671223958333334, "learning_rate": 1.7461614804218417e-05, "loss": 5.0538, "loss/crossentropy": 2.336539626121521, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21434544026851654, "step": 10536 }, { "epoch": 0.8781666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.05572916666666667, "learning_rate": 1.7438684438571386e-05, "loss": 4.9415, "loss/crossentropy": 2.0774486362934113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19223777204751968, "step": 10538 }, { "epoch": 0.8783333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.03319905598958333, "learning_rate": 1.7415770262318262e-05, "loss": 4.9802, "loss/crossentropy": 2.266049236059189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19595278799533844, "step": 10540 }, { "epoch": 0.8785, "grad_norm": 4.4375, "grad_norm_var": 0.033528645833333336, "learning_rate": 1.7392872331997495e-05, "loss": 5.2426, "loss/crossentropy": 1.9866546764969826, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19384957291185856, "step": 10542 }, { "epoch": 0.8786666666666667, "grad_norm": 4.21875, "grad_norm_var": 0.044384765625, "learning_rate": 1.7369990704107458e-05, "loss": 4.8644, "loss/crossentropy": 2.1645276844501495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19293129071593285, "step": 10544 }, { "epoch": 0.8788333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.04269205729166667, "learning_rate": 1.7347125435106287e-05, "loss": 5.3348, "loss/crossentropy": 2.1431443095207214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2157515287399292, "step": 10546 }, { "epoch": 0.879, "grad_norm": 4.65625, "grad_norm_var": 0.042708333333333334, "learning_rate": 1.732427658141176e-05, "loss": 4.9541, "loss/crossentropy": 2.2488779723644257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2067658081650734, "step": 10548 }, { "epoch": 0.8791666666666667, "grad_norm": 4.90625, "grad_norm_var": 0.053629557291666664, "learning_rate": 1.7301444199401158e-05, "loss": 4.7835, "loss/crossentropy": 2.4230023622512817, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21769268438220024, "step": 10550 }, { "epoch": 0.8793333333333333, "grad_norm": 4.5, "grad_norm_var": 0.043355305989583336, "learning_rate": 1.7278628345411102e-05, "loss": 4.7411, "loss/crossentropy": 2.400269329547882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22978059947490692, "step": 10552 }, { "epoch": 0.8795, "grad_norm": 4.9375, "grad_norm_var": 0.05089518229166667, "learning_rate": 1.725582907573746e-05, "loss": 4.8078, "loss/crossentropy": 2.343903511762619, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20802440866827965, "step": 10554 }, { "epoch": 0.8796666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.046610514322916664, "learning_rate": 1.7233046446635152e-05, "loss": 5.1468, "loss/crossentropy": 1.8718384355306625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16846632957458496, "step": 10556 }, { "epoch": 0.8798333333333334, "grad_norm": 4.90625, "grad_norm_var": 0.0537109375, "learning_rate": 1.7210280514318055e-05, "loss": 4.7226, "loss/crossentropy": 1.6889416128396988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1929287426173687, "step": 10558 }, { "epoch": 0.88, "grad_norm": 4.65625, "grad_norm_var": 0.04700520833333333, "learning_rate": 1.718753133495884e-05, "loss": 5.1345, "loss/crossentropy": 2.15225350856781, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2042902112007141, "step": 10560 }, { "epoch": 0.8801666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.04607747395833333, "learning_rate": 1.7164798964688853e-05, "loss": 5.0313, "loss/crossentropy": 2.0885613709688187, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21012815833091736, "step": 10562 }, { "epoch": 0.8803333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.05234375, "learning_rate": 1.7142083459597953e-05, "loss": 4.4626, "loss/crossentropy": 1.7584224492311478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17640075087547302, "step": 10564 }, { "epoch": 0.8805, "grad_norm": 4.59375, "grad_norm_var": 0.039778645833333334, "learning_rate": 1.7119384875734388e-05, "loss": 5.147, "loss/crossentropy": 2.086060971021652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2050802819430828, "step": 10566 }, { "epoch": 0.8806666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.03892822265625, "learning_rate": 1.7096703269104658e-05, "loss": 4.9094, "loss/crossentropy": 2.1914361715316772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21584604680538177, "step": 10568 }, { "epoch": 0.8808333333333334, "grad_norm": 4.5, "grad_norm_var": 0.03411458333333333, "learning_rate": 1.7074038695673384e-05, "loss": 5.261, "loss/crossentropy": 1.7987454533576965, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16365987062454224, "step": 10570 }, { "epoch": 0.881, "grad_norm": 4.5625, "grad_norm_var": 0.05520833333333333, "learning_rate": 1.705139121136313e-05, "loss": 5.002, "loss/crossentropy": 2.0218057334423065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18577923253178596, "step": 10572 }, { "epoch": 0.8811666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.04934488932291667, "learning_rate": 1.7028760872054327e-05, "loss": 5.1821, "loss/crossentropy": 2.079995185136795, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19173727184534073, "step": 10574 }, { "epoch": 0.8813333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.048173014322916666, "learning_rate": 1.700614773358508e-05, "loss": 4.5502, "loss/crossentropy": 2.2938634157180786, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.193524319678545, "step": 10576 }, { "epoch": 0.8815, "grad_norm": 4.53125, "grad_norm_var": 0.048811848958333334, "learning_rate": 1.698355185175106e-05, "loss": 5.2259, "loss/crossentropy": 1.8245511278510094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17827428877353668, "step": 10578 }, { "epoch": 0.8816666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.041341145833333336, "learning_rate": 1.696097328230536e-05, "loss": 4.975, "loss/crossentropy": 1.785768836736679, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2040269337594509, "step": 10580 }, { "epoch": 0.8818333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.05520833333333333, "learning_rate": 1.693841208095836e-05, "loss": 4.3794, "loss/crossentropy": 1.9544300138950348, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18160400912165642, "step": 10582 }, { "epoch": 0.882, "grad_norm": 5.0625, "grad_norm_var": 0.06428629557291667, "learning_rate": 1.691586830337758e-05, "loss": 5.0472, "loss/crossentropy": 1.6755925416946411, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18401020765304565, "step": 10584 }, { "epoch": 0.8821666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.06256103515625, "learning_rate": 1.6893342005187546e-05, "loss": 4.4777, "loss/crossentropy": 2.553809404373169, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22219187021255493, "step": 10586 }, { "epoch": 0.8823333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.044514973958333336, "learning_rate": 1.687083324196966e-05, "loss": 5.5024, "loss/crossentropy": 2.220675617456436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19723805040121078, "step": 10588 }, { "epoch": 0.8825, "grad_norm": 4.75, "grad_norm_var": 0.042378743489583336, "learning_rate": 1.6848342069262065e-05, "loss": 5.155, "loss/crossentropy": 1.6391087174415588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1860688552260399, "step": 10590 }, { "epoch": 0.8826666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.047509765625, "learning_rate": 1.682586854255949e-05, "loss": 4.7607, "loss/crossentropy": 1.832770362496376, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15771115943789482, "step": 10592 }, { "epoch": 0.8828333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.06396077473958334, "learning_rate": 1.6803412717313123e-05, "loss": 5.167, "loss/crossentropy": 2.5300718545913696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20988282933831215, "step": 10594 }, { "epoch": 0.883, "grad_norm": 4.78125, "grad_norm_var": 0.06796468098958333, "learning_rate": 1.678097464893048e-05, "loss": 5.2368, "loss/crossentropy": 2.185933083295822, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19851680099964142, "step": 10596 }, { "epoch": 0.8831666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.05705973307291667, "learning_rate": 1.6758554392775276e-05, "loss": 4.483, "loss/crossentropy": 1.6705860868096352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17615766637027264, "step": 10598 }, { "epoch": 0.8833333333333333, "grad_norm": 4.75, "grad_norm_var": 0.0455078125, "learning_rate": 1.6736152004167256e-05, "loss": 4.4796, "loss/crossentropy": 1.8610120490193367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18658486753702164, "step": 10600 }, { "epoch": 0.8835, "grad_norm": 4.625, "grad_norm_var": 0.041666666666666664, "learning_rate": 1.6713767538382085e-05, "loss": 4.563, "loss/crossentropy": 2.159834563732147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20609938725829124, "step": 10602 }, { "epoch": 0.8836666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.03720296223958333, "learning_rate": 1.669140105065121e-05, "loss": 4.6492, "loss/crossentropy": 2.3123832046985626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20040404424071312, "step": 10604 }, { "epoch": 0.8838333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.03876546223958333, "learning_rate": 1.6669052596161722e-05, "loss": 5.2065, "loss/crossentropy": 2.3895527720451355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24119574576616287, "step": 10606 }, { "epoch": 0.884, "grad_norm": 4.9375, "grad_norm_var": 0.04894205729166667, "learning_rate": 1.66467222300562e-05, "loss": 4.7021, "loss/crossentropy": 1.9245961979031563, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17225523851811886, "step": 10608 }, { "epoch": 0.8841666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.040690104166666664, "learning_rate": 1.6624410007432606e-05, "loss": 4.8996, "loss/crossentropy": 1.4526910781860352, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15226059220731258, "step": 10610 }, { "epoch": 0.8843333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.051656087239583336, "learning_rate": 1.6602115983344136e-05, "loss": 5.193, "loss/crossentropy": 2.605428993701935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21228623762726784, "step": 10612 }, { "epoch": 0.8845, "grad_norm": 4.90625, "grad_norm_var": 0.05709228515625, "learning_rate": 1.6579840212799077e-05, "loss": 4.6514, "loss/crossentropy": 1.4337237551808357, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17335276678204536, "step": 10614 }, { "epoch": 0.8846666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.05894775390625, "learning_rate": 1.655758275076067e-05, "loss": 5.3302, "loss/crossentropy": 2.0682147443294525, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20112159848213196, "step": 10616 }, { "epoch": 0.8848333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.06066080729166667, "learning_rate": 1.6535343652147e-05, "loss": 4.8437, "loss/crossentropy": 2.116395853459835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18656159937381744, "step": 10618 }, { "epoch": 0.885, "grad_norm": 4.5, "grad_norm_var": 0.05676676432291667, "learning_rate": 1.651312297183083e-05, "loss": 4.8351, "loss/crossentropy": 1.824868343770504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845719050616026, "step": 10620 }, { "epoch": 0.8851666666666667, "grad_norm": 4.25, "grad_norm_var": 0.06573893229166666, "learning_rate": 1.6490920764639477e-05, "loss": 4.4162, "loss/crossentropy": 1.5410160273313522, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15590156242251396, "step": 10622 }, { "epoch": 0.8853333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.06474202473958333, "learning_rate": 1.646873708535468e-05, "loss": 5.4161, "loss/crossentropy": 1.3780269846320152, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17345618084073067, "step": 10624 }, { "epoch": 0.8855, "grad_norm": 5.03125, "grad_norm_var": 0.0734375, "learning_rate": 1.644657198871247e-05, "loss": 4.9033, "loss/crossentropy": 2.166410952806473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21835574507713318, "step": 10626 }, { "epoch": 0.8856666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.062483723958333334, "learning_rate": 1.642442552940301e-05, "loss": 5.2132, "loss/crossentropy": 2.2428570091724396, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22547245025634766, "step": 10628 }, { "epoch": 0.8858333333333334, "grad_norm": 4.625, "grad_norm_var": 0.06100260416666667, "learning_rate": 1.640229776207049e-05, "loss": 4.6793, "loss/crossentropy": 2.4395949244499207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21656383946537971, "step": 10630 }, { "epoch": 0.886, "grad_norm": 4.34375, "grad_norm_var": 0.06560872395833334, "learning_rate": 1.6380188741312976e-05, "loss": 4.8013, "loss/crossentropy": 1.8665351793169975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16414397209882736, "step": 10632 }, { "epoch": 0.8861666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.06419270833333333, "learning_rate": 1.6358098521682283e-05, "loss": 5.2442, "loss/crossentropy": 2.3931703567504883, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21243423223495483, "step": 10634 }, { "epoch": 0.8863333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.5460245768229167, "learning_rate": 1.6336027157683828e-05, "loss": 4.546, "loss/crossentropy": 1.274245411157608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13230569288134575, "step": 10636 }, { "epoch": 0.8865, "grad_norm": 4.6875, "grad_norm_var": 0.5261067708333333, "learning_rate": 1.6313974703776507e-05, "loss": 4.9338, "loss/crossentropy": 1.9635001122951508, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17589573562145233, "step": 10638 }, { "epoch": 0.8866666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.5380045572916666, "learning_rate": 1.6291941214372554e-05, "loss": 4.3675, "loss/crossentropy": 1.907171793282032, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1749830450862646, "step": 10640 }, { "epoch": 0.8868333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.5325358072916667, "learning_rate": 1.6269926743837432e-05, "loss": 5.4834, "loss/crossentropy": 2.321804314851761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19393545016646385, "step": 10642 }, { "epoch": 0.887, "grad_norm": 4.3125, "grad_norm_var": 0.5464680989583334, "learning_rate": 1.6247931346489637e-05, "loss": 4.9566, "loss/crossentropy": 2.445066601037979, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21393660083413124, "step": 10644 }, { "epoch": 0.8871666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.6845052083333333, "learning_rate": 1.6225955076600636e-05, "loss": 4.7099, "loss/crossentropy": 2.3382493257522583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21867894008755684, "step": 10646 }, { "epoch": 0.8873333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.67222900390625, "learning_rate": 1.620399798839468e-05, "loss": 4.9019, "loss/crossentropy": 2.1961640417575836, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18073081225156784, "step": 10648 }, { "epoch": 0.8875, "grad_norm": 4.8125, "grad_norm_var": 0.6851399739583334, "learning_rate": 1.6182060136048727e-05, "loss": 5.0117, "loss/crossentropy": 1.1794405281543732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13596356473863125, "step": 10650 }, { "epoch": 0.8876666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.20790608723958334, "learning_rate": 1.6160141573692217e-05, "loss": 4.9391, "loss/crossentropy": 2.450029969215393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20907938852906227, "step": 10652 }, { "epoch": 0.8878333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.21145833333333333, "learning_rate": 1.613824235540704e-05, "loss": 4.8929, "loss/crossentropy": 1.566991001367569, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1968870833516121, "step": 10654 }, { "epoch": 0.888, "grad_norm": 4.53125, "grad_norm_var": 0.20331624348958333, "learning_rate": 1.611636253522734e-05, "loss": 4.4838, "loss/crossentropy": 2.345241993665695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1920331008732319, "step": 10656 }, { "epoch": 0.8881666666666667, "grad_norm": 4.375, "grad_norm_var": 0.21090087890625, "learning_rate": 1.6094502167139393e-05, "loss": 4.8726, "loss/crossentropy": 2.4404727816581726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2480272240936756, "step": 10658 }, { "epoch": 0.8883333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.20390625, "learning_rate": 1.607266130508148e-05, "loss": 5.1869, "loss/crossentropy": 1.9991124272346497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19056628830730915, "step": 10660 }, { "epoch": 0.8885, "grad_norm": 4.5, "grad_norm_var": 0.05543212890625, "learning_rate": 1.605084000294377e-05, "loss": 4.8648, "loss/crossentropy": 2.3871697783470154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21722237020730972, "step": 10662 }, { "epoch": 0.8886666666666667, "grad_norm": 4.5, "grad_norm_var": 0.04950764973958333, "learning_rate": 1.602903831456815e-05, "loss": 4.9332, "loss/crossentropy": 2.383307009935379, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19519124925136566, "step": 10664 }, { "epoch": 0.8888333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.04198811848958333, "learning_rate": 1.600725629374812e-05, "loss": 4.9307, "loss/crossentropy": 1.549317441880703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15623271651566029, "step": 10666 }, { "epoch": 0.889, "grad_norm": 4.8125, "grad_norm_var": 0.06510416666666667, "learning_rate": 1.598549399422864e-05, "loss": 4.5819, "loss/crossentropy": 2.4213827252388, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21201927214860916, "step": 10668 }, { "epoch": 0.8891666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.060791015625, "learning_rate": 1.596375146970604e-05, "loss": 4.9754, "loss/crossentropy": 2.050051510334015, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18452263250946999, "step": 10670 }, { "epoch": 0.8893333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.05618489583333333, "learning_rate": 1.5942028773827827e-05, "loss": 4.357, "loss/crossentropy": 2.5166059732437134, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21152303367853165, "step": 10672 }, { "epoch": 0.8895, "grad_norm": 4.5, "grad_norm_var": 0.054423014322916664, "learning_rate": 1.59203259601926e-05, "loss": 4.7636, "loss/crossentropy": 1.8929245918989182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16911195777356625, "step": 10674 }, { "epoch": 0.8896666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.04348551432291667, "learning_rate": 1.589864308234988e-05, "loss": 4.8104, "loss/crossentropy": 2.1835354566574097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2070077657699585, "step": 10676 }, { "epoch": 0.8898333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.052408854166666664, "learning_rate": 1.5876980193800033e-05, "loss": 4.1576, "loss/crossentropy": 1.198787048459053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13902889378368855, "step": 10678 }, { "epoch": 0.89, "grad_norm": 4.53125, "grad_norm_var": 0.04641927083333333, "learning_rate": 1.5855337347994062e-05, "loss": 5.2937, "loss/crossentropy": 1.7133802622556686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17863846570253372, "step": 10680 }, { "epoch": 0.8901666666666667, "grad_norm": 4.0625, "grad_norm_var": 0.055078125, "learning_rate": 1.5833714598333553e-05, "loss": 4.3412, "loss/crossentropy": 2.547089695930481, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2081010527908802, "step": 10682 }, { "epoch": 0.8903333333333333, "grad_norm": 4.625, "grad_norm_var": 0.03878580729166667, "learning_rate": 1.581211199817048e-05, "loss": 4.5989, "loss/crossentropy": 1.652288556098938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16221519000828266, "step": 10684 }, { "epoch": 0.8905, "grad_norm": 4.46875, "grad_norm_var": 0.03616129557291667, "learning_rate": 1.579052960080713e-05, "loss": 5.0203, "loss/crossentropy": 1.7919713705778122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1675052735954523, "step": 10686 }, { "epoch": 0.8906666666666667, "grad_norm": 4.0, "grad_norm_var": 0.041259765625, "learning_rate": 1.57689674594959e-05, "loss": 4.2805, "loss/crossentropy": 1.9634157121181488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1854908987879753, "step": 10688 }, { "epoch": 0.8908333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.04934488932291667, "learning_rate": 1.5747425627439242e-05, "loss": 5.0934, "loss/crossentropy": 1.4715142846107483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14984281174838543, "step": 10690 }, { "epoch": 0.891, "grad_norm": 5.15625, "grad_norm_var": 0.08531494140625, "learning_rate": 1.5725904157789487e-05, "loss": 5.2262, "loss/crossentropy": 2.327080875635147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025107815861702, "step": 10692 }, { "epoch": 0.8911666666666667, "grad_norm": 4.625, "grad_norm_var": 0.08049723307291666, "learning_rate": 1.570440310364872e-05, "loss": 5.3723, "loss/crossentropy": 1.475014977157116, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1731820423156023, "step": 10694 }, { "epoch": 0.8913333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.0837890625, "learning_rate": 1.568292251806865e-05, "loss": 5.0512, "loss/crossentropy": 2.648381471633911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21056640520691872, "step": 10696 }, { "epoch": 0.8915, "grad_norm": 4.5625, "grad_norm_var": 0.06897379557291666, "learning_rate": 1.5661462454050492e-05, "loss": 4.5428, "loss/crossentropy": 1.6267412602901459, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14332648366689682, "step": 10698 }, { "epoch": 0.8916666666666667, "grad_norm": 5.6875, "grad_norm_var": 0.14615885416666666, "learning_rate": 1.564002296454482e-05, "loss": 5.2771, "loss/crossentropy": 2.4379181265830994, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064468339085579, "step": 10700 }, { "epoch": 0.8918333333333334, "grad_norm": 4.75, "grad_norm_var": 0.1453125, "learning_rate": 1.5618604102451445e-05, "loss": 5.2395, "loss/crossentropy": 2.3840895295143127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20120671018958092, "step": 10702 }, { "epoch": 0.892, "grad_norm": 4.84375, "grad_norm_var": 0.10777587890625, "learning_rate": 1.559720592061927e-05, "loss": 5.0845, "loss/crossentropy": 1.6713618710637093, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16508845053613186, "step": 10704 }, { "epoch": 0.8921666666666667, "grad_norm": 4.875, "grad_norm_var": 0.1103515625, "learning_rate": 1.5575828471846192e-05, "loss": 5.2038, "loss/crossentropy": 2.4455989003181458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22217406332492828, "step": 10706 }, { "epoch": 0.8923333333333333, "grad_norm": 4.375, "grad_norm_var": 0.10245768229166667, "learning_rate": 1.555447180887894e-05, "loss": 5.047, "loss/crossentropy": 1.8809728920459747, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19522210396826267, "step": 10708 }, { "epoch": 0.8925, "grad_norm": 4.4375, "grad_norm_var": 0.10338541666666666, "learning_rate": 1.5533135984412954e-05, "loss": 4.2285, "loss/crossentropy": 1.6795168668031693, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18034653179347515, "step": 10710 }, { "epoch": 0.8926666666666667, "grad_norm": 4.09375, "grad_norm_var": 0.12014567057291667, "learning_rate": 1.5511821051092252e-05, "loss": 4.6844, "loss/crossentropy": 0.85299401730299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.10577042773365974, "step": 10712 }, { "epoch": 0.8928333333333334, "grad_norm": 4.21875, "grad_norm_var": 0.13359375, "learning_rate": 1.5490527061509338e-05, "loss": 4.7831, "loss/crossentropy": 1.6696652993559837, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17354433611035347, "step": 10714 }, { "epoch": 0.893, "grad_norm": 4.78125, "grad_norm_var": 0.057450358072916666, "learning_rate": 1.5469254068204985e-05, "loss": 4.9723, "loss/crossentropy": 2.289715588092804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24060748890042305, "step": 10716 }, { "epoch": 0.8931666666666667, "grad_norm": 4.5, "grad_norm_var": 0.0599609375, "learning_rate": 1.5448002123668207e-05, "loss": 4.5862, "loss/crossentropy": 1.8170486837625504, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18546131625771523, "step": 10718 }, { "epoch": 0.8933333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.05465087890625, "learning_rate": 1.5426771280336054e-05, "loss": 5.1344, "loss/crossentropy": 1.6859957575798035, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1697954684495926, "step": 10720 }, { "epoch": 0.8935, "grad_norm": 4.46875, "grad_norm_var": 0.03912760416666667, "learning_rate": 1.540556159059354e-05, "loss": 4.8421, "loss/crossentropy": 0.9791104048490524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11764907836914062, "step": 10722 }, { "epoch": 0.8936666666666667, "grad_norm": 7.375, "grad_norm_var": 0.5524576822916667, "learning_rate": 1.5384373106773437e-05, "loss": 4.7781, "loss/crossentropy": 1.7341388911008835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16041403263807297, "step": 10724 }, { "epoch": 0.8938333333333334, "grad_norm": 4.375, "grad_norm_var": 0.5555826822916666, "learning_rate": 1.5363205881156248e-05, "loss": 4.6986, "loss/crossentropy": 1.618898868560791, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18397285975515842, "step": 10726 }, { "epoch": 0.894, "grad_norm": 4.75, "grad_norm_var": 0.534228515625, "learning_rate": 1.5342059965969988e-05, "loss": 4.6764, "loss/crossentropy": 2.125587046146393, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1921604499220848, "step": 10728 }, { "epoch": 0.8941666666666667, "grad_norm": 4.75, "grad_norm_var": 0.5205037434895833, "learning_rate": 1.5320935413390107e-05, "loss": 4.8454, "loss/crossentropy": 1.786333590745926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17975015565752983, "step": 10730 }, { "epoch": 0.8943333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.5203125, "learning_rate": 1.529983227553932e-05, "loss": 5.2461, "loss/crossentropy": 1.7129372730851173, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15703948214650154, "step": 10732 }, { "epoch": 0.8945, "grad_norm": 4.78125, "grad_norm_var": 0.50650634765625, "learning_rate": 1.5278750604487543e-05, "loss": 5.5317, "loss/crossentropy": 2.582478642463684, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20744846016168594, "step": 10734 }, { "epoch": 0.8946666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.50650634765625, "learning_rate": 1.52576904522517e-05, "loss": 4.7874, "loss/crossentropy": 1.4274420738220215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17440488934516907, "step": 10736 }, { "epoch": 0.8948333333333334, "grad_norm": 4.34375, "grad_norm_var": 0.510546875, "learning_rate": 1.5236651870795612e-05, "loss": 4.8968, "loss/crossentropy": 2.414700925350189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2064298652112484, "step": 10738 }, { "epoch": 0.895, "grad_norm": 4.90625, "grad_norm_var": 0.031233723958333334, "learning_rate": 1.521563491202989e-05, "loss": 5.3921, "loss/crossentropy": 2.2058697938919067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21623685210943222, "step": 10740 }, { "epoch": 0.8951666666666667, "grad_norm": 5.0, "grad_norm_var": 0.030887858072916666, "learning_rate": 1.5194639627811803e-05, "loss": 5.3514, "loss/crossentropy": 2.025493770837784, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22063399851322174, "step": 10742 }, { "epoch": 0.8953333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.04021809895833333, "learning_rate": 1.5173666069945118e-05, "loss": 4.816, "loss/crossentropy": 1.9025913998484612, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19442759454250336, "step": 10744 }, { "epoch": 0.8955, "grad_norm": 4.8125, "grad_norm_var": 0.03730061848958333, "learning_rate": 1.5152714290180006e-05, "loss": 5.2007, "loss/crossentropy": 1.9195226430892944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1758854277431965, "step": 10746 }, { "epoch": 0.8956666666666667, "grad_norm": 4.75, "grad_norm_var": 0.04599202473958333, "learning_rate": 1.5131784340212893e-05, "loss": 4.1424, "loss/crossentropy": 1.8893222734332085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18168200738728046, "step": 10748 }, { "epoch": 0.8958333333333334, "grad_norm": 4.875, "grad_norm_var": 0.04638264973958333, "learning_rate": 1.511087627168637e-05, "loss": 5.4183, "loss/crossentropy": 2.4070481657981873, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728513225913048, "step": 10750 }, { "epoch": 0.896, "grad_norm": 4.59375, "grad_norm_var": 0.061844889322916666, "learning_rate": 1.5089990136189e-05, "loss": 4.6507, "loss/crossentropy": 2.1939191222190857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22148913517594337, "step": 10752 }, { "epoch": 0.8961666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.05748697916666667, "learning_rate": 1.5069125985255242e-05, "loss": 5.3693, "loss/crossentropy": 2.1805800199508667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2069510854780674, "step": 10754 }, { "epoch": 0.8963333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.05050455729166667, "learning_rate": 1.5048283870365332e-05, "loss": 4.8997, "loss/crossentropy": 2.116938143968582, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18929652869701385, "step": 10756 }, { "epoch": 0.8965, "grad_norm": 4.5, "grad_norm_var": 0.04334309895833333, "learning_rate": 1.5027463842945109e-05, "loss": 4.6614, "loss/crossentropy": 1.7113404273986816, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19923411309719086, "step": 10758 }, { "epoch": 0.8966666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.037434895833333336, "learning_rate": 1.5006665954365915e-05, "loss": 5.2853, "loss/crossentropy": 2.2329909205436707, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22023383155465126, "step": 10760 }, { "epoch": 0.8968333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.04195556640625, "learning_rate": 1.4985890255944477e-05, "loss": 5.0832, "loss/crossentropy": 2.5310455560684204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2189432941377163, "step": 10762 }, { "epoch": 0.897, "grad_norm": 4.8125, "grad_norm_var": 0.036051432291666664, "learning_rate": 1.4965136798942772e-05, "loss": 5.0833, "loss/crossentropy": 2.420046091079712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2084801085293293, "step": 10764 }, { "epoch": 0.8971666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.03553059895833333, "learning_rate": 1.4944405634567883e-05, "loss": 4.9867, "loss/crossentropy": 1.9696931019425392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19564705714583397, "step": 10766 }, { "epoch": 0.8973333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.024853515625, "learning_rate": 1.4923696813971903e-05, "loss": 5.0961, "loss/crossentropy": 2.3114156424999237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21169547736644745, "step": 10768 }, { "epoch": 0.8975, "grad_norm": 4.65625, "grad_norm_var": 0.5457682291666667, "learning_rate": 1.4903010388251777e-05, "loss": 4.6873, "loss/crossentropy": 2.131648153066635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21583443507552147, "step": 10770 }, { "epoch": 0.8976666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.5373006184895833, "learning_rate": 1.4882346408449222e-05, "loss": 5.288, "loss/crossentropy": 2.201010137796402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20310936123132706, "step": 10772 }, { "epoch": 0.8978333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.5461588541666667, "learning_rate": 1.4861704925550545e-05, "loss": 4.5638, "loss/crossentropy": 1.9012616276741028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18746894598007202, "step": 10774 }, { "epoch": 0.898, "grad_norm": 4.53125, "grad_norm_var": 0.55416259765625, "learning_rate": 1.4841085990486552e-05, "loss": 5.3634, "loss/crossentropy": 1.9615696221590042, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19145160540938377, "step": 10776 }, { "epoch": 0.8981666666666667, "grad_norm": 4.375, "grad_norm_var": 0.573681640625, "learning_rate": 1.4820489654132408e-05, "loss": 5.4252, "loss/crossentropy": 1.756884180009365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1741621494293213, "step": 10778 }, { "epoch": 0.8983333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.5929646809895833, "learning_rate": 1.479991596730755e-05, "loss": 4.2336, "loss/crossentropy": 1.196315884590149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1336789671331644, "step": 10780 }, { "epoch": 0.8985, "grad_norm": 4.53125, "grad_norm_var": 0.5968587239583333, "learning_rate": 1.4779364980775476e-05, "loss": 4.7495, "loss/crossentropy": 1.9746932983398438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18156530149281025, "step": 10782 }, { "epoch": 0.8986666666666666, "grad_norm": 7.46875, "grad_norm_var": 1.0577473958333334, "learning_rate": 1.4758836745243723e-05, "loss": 4.9054, "loss/crossentropy": 1.986648440361023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18949030712246895, "step": 10784 }, { "epoch": 0.8988333333333334, "grad_norm": 4.78125, "grad_norm_var": 0.5587890625, "learning_rate": 1.4738331311363659e-05, "loss": 4.9686, "loss/crossentropy": 2.134302496910095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.217218816280365, "step": 10786 }, { "epoch": 0.899, "grad_norm": 4.8125, "grad_norm_var": 0.58033447265625, "learning_rate": 1.4717848729730417e-05, "loss": 4.996, "loss/crossentropy": 1.5596438944339752, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14135419204831123, "step": 10788 }, { "epoch": 0.8991666666666667, "grad_norm": 4.5, "grad_norm_var": 0.5920572916666667, "learning_rate": 1.4697389050882713e-05, "loss": 4.6219, "loss/crossentropy": 1.9281855672597885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18955061584711075, "step": 10790 }, { "epoch": 0.8993333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.5873697916666667, "learning_rate": 1.4676952325302787e-05, "loss": 4.8212, "loss/crossentropy": 2.278311848640442, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20813852548599243, "step": 10792 }, { "epoch": 0.8995, "grad_norm": 4.09375, "grad_norm_var": 0.6040323893229167, "learning_rate": 1.4656538603416222e-05, "loss": 4.4675, "loss/crossentropy": 2.52884042263031, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21629564464092255, "step": 10794 }, { "epoch": 0.8996666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.592578125, "learning_rate": 1.4636147935591845e-05, "loss": 4.779, "loss/crossentropy": 1.7139496207237244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17117717862129211, "step": 10796 }, { "epoch": 0.8998333333333334, "grad_norm": 4.625, "grad_norm_var": 0.59508056640625, "learning_rate": 1.46157803721416e-05, "loss": 4.4386, "loss/crossentropy": 2.108862668275833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20810528099536896, "step": 10798 }, { "epoch": 0.9, "grad_norm": 4.5625, "grad_norm_var": 0.067041015625, "learning_rate": 1.4595435963320435e-05, "loss": 4.8122, "loss/crossentropy": 2.3672678768634796, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21068596094846725, "step": 10800 }, { "epoch": 0.9001666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.06847330729166666, "learning_rate": 1.4575114759326147e-05, "loss": 5.0223, "loss/crossentropy": 1.967505268752575, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17484956234693527, "step": 10802 }, { "epoch": 0.9003333333333333, "grad_norm": 4.96875, "grad_norm_var": 0.08310139973958333, "learning_rate": 1.4554816810299292e-05, "loss": 4.1566, "loss/crossentropy": 0.8119016736745834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1141384020447731, "step": 10804 }, { "epoch": 0.9005, "grad_norm": 4.5, "grad_norm_var": 0.07125244140625, "learning_rate": 1.4534542166323037e-05, "loss": 5.0435, "loss/crossentropy": 1.9839187264442444, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1827479749917984, "step": 10806 }, { "epoch": 0.9006666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.07076416015625, "learning_rate": 1.4514290877423055e-05, "loss": 4.8319, "loss/crossentropy": 2.3538177013397217, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22079696133732796, "step": 10808 }, { "epoch": 0.9008333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.05419514973958333, "learning_rate": 1.4494062993567386e-05, "loss": 4.9642, "loss/crossentropy": 2.0697861313819885, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22400951385498047, "step": 10810 }, { "epoch": 0.901, "grad_norm": 4.375, "grad_norm_var": 0.05858968098958333, "learning_rate": 1.4473858564666326e-05, "loss": 4.7692, "loss/crossentropy": 2.2374483346939087, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18740571290254593, "step": 10812 }, { "epoch": 0.9011666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.04299723307291667, "learning_rate": 1.4453677640572284e-05, "loss": 4.4197, "loss/crossentropy": 1.543198212981224, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1734531968832016, "step": 10814 }, { "epoch": 0.9013333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.054541015625, "learning_rate": 1.4433520271079706e-05, "loss": 5.1892, "loss/crossentropy": 2.6357452273368835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21056770905852318, "step": 10816 }, { "epoch": 0.9015, "grad_norm": 4.34375, "grad_norm_var": 0.05071614583333333, "learning_rate": 1.441338650592487e-05, "loss": 4.9244, "loss/crossentropy": 1.6901346743106842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1798687893897295, "step": 10818 }, { "epoch": 0.9016666666666666, "grad_norm": 4.25, "grad_norm_var": 0.02779541015625, "learning_rate": 1.439327639478586e-05, "loss": 4.5571, "loss/crossentropy": 2.5649845004081726, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21501518785953522, "step": 10820 }, { "epoch": 0.9018333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.027473958333333333, "learning_rate": 1.4373189987282364e-05, "loss": 5.2804, "loss/crossentropy": 1.5373041331768036, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15145167149603367, "step": 10822 }, { "epoch": 0.902, "grad_norm": 4.40625, "grad_norm_var": 0.027762858072916667, "learning_rate": 1.4353127332975611e-05, "loss": 4.7435, "loss/crossentropy": 1.7477587014436722, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1664668321609497, "step": 10824 }, { "epoch": 0.9021666666666667, "grad_norm": 4.5, "grad_norm_var": 0.02720947265625, "learning_rate": 1.4333088481368188e-05, "loss": 4.8537, "loss/crossentropy": 1.6127407774329185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1708746962249279, "step": 10826 }, { "epoch": 0.9023333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.029683430989583332, "learning_rate": 1.431307348190398e-05, "loss": 4.8039, "loss/crossentropy": 1.6628762856125832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15167899429798126, "step": 10828 }, { "epoch": 0.9025, "grad_norm": 4.46875, "grad_norm_var": 0.030497233072916668, "learning_rate": 1.4293082383968008e-05, "loss": 4.8124, "loss/crossentropy": 1.5773266032338142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15623841248452663, "step": 10830 }, { "epoch": 0.9026666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.04462483723958333, "learning_rate": 1.427311523688632e-05, "loss": 4.3818, "loss/crossentropy": 1.007303848862648, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11725516617298126, "step": 10832 }, { "epoch": 0.9028333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.043843587239583336, "learning_rate": 1.4253172089925857e-05, "loss": 5.0789, "loss/crossentropy": 2.202227681875229, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21483517810702324, "step": 10834 }, { "epoch": 0.903, "grad_norm": 4.40625, "grad_norm_var": 0.04309488932291667, "learning_rate": 1.4233252992294361e-05, "loss": 5.2643, "loss/crossentropy": 2.5935566425323486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2313656397163868, "step": 10836 }, { "epoch": 0.9031666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.04230143229166667, "learning_rate": 1.4213357993140226e-05, "loss": 5.0049, "loss/crossentropy": 1.849151723086834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18203931860625744, "step": 10838 }, { "epoch": 0.9033333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.049332682291666666, "learning_rate": 1.4193487141552382e-05, "loss": 5.2893, "loss/crossentropy": 1.7852751687169075, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18054143711924553, "step": 10840 }, { "epoch": 0.9035, "grad_norm": 4.34375, "grad_norm_var": 0.05266520182291667, "learning_rate": 1.4173640486560172e-05, "loss": 4.579, "loss/crossentropy": 1.3571243658661842, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14290621131658554, "step": 10842 }, { "epoch": 0.9036666666666666, "grad_norm": 4.28125, "grad_norm_var": 0.06300455729166667, "learning_rate": 1.4153818077133257e-05, "loss": 4.8163, "loss/crossentropy": 1.5496264174580574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15174993872642517, "step": 10844 }, { "epoch": 0.9038333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.06573893229166666, "learning_rate": 1.4134019962181458e-05, "loss": 4.7493, "loss/crossentropy": 2.2139610946178436, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19633793458342552, "step": 10846 }, { "epoch": 0.904, "grad_norm": 5.03125, "grad_norm_var": 0.060770670572916664, "learning_rate": 1.4114246190554654e-05, "loss": 4.8895, "loss/crossentropy": 1.8826258331537247, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1785743087530136, "step": 10848 }, { "epoch": 0.9041666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.059956868489583336, "learning_rate": 1.4094496811042657e-05, "loss": 5.1005, "loss/crossentropy": 2.1995404064655304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22843296453356743, "step": 10850 }, { "epoch": 0.9043333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.05972900390625, "learning_rate": 1.4074771872375111e-05, "loss": 4.3757, "loss/crossentropy": 1.0549268051981926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17528251186013222, "step": 10852 }, { "epoch": 0.9045, "grad_norm": 4.5, "grad_norm_var": 0.059403483072916666, "learning_rate": 1.4055071423221321e-05, "loss": 5.1859, "loss/crossentropy": 2.5413814783096313, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2125440388917923, "step": 10854 }, { "epoch": 0.9046666666666666, "grad_norm": 4.9375, "grad_norm_var": 0.06370035807291667, "learning_rate": 1.4035395512190204e-05, "loss": 4.6172, "loss/crossentropy": 2.1019559502601624, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21940137073397636, "step": 10856 }, { "epoch": 0.9048333333333334, "grad_norm": 4.40625, "grad_norm_var": 0.064306640625, "learning_rate": 1.4015744187830102e-05, "loss": 4.7331, "loss/crossentropy": 2.2734327018260956, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1995382085442543, "step": 10858 }, { "epoch": 0.905, "grad_norm": 4.34375, "grad_norm_var": 0.062174479166666664, "learning_rate": 1.3996117498628726e-05, "loss": 4.8909, "loss/crossentropy": 2.4530131220817566, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104918509721756, "step": 10860 }, { "epoch": 0.9051666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.055989583333333336, "learning_rate": 1.397651549301295e-05, "loss": 4.5454, "loss/crossentropy": 1.7010397166013718, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17811237461864948, "step": 10862 }, { "epoch": 0.9053333333333333, "grad_norm": 4.5, "grad_norm_var": 0.04263916015625, "learning_rate": 1.3956938219348795e-05, "loss": 4.2376, "loss/crossentropy": 1.8406718373298645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19351908564567566, "step": 10864 }, { "epoch": 0.9055, "grad_norm": 4.125, "grad_norm_var": 0.052587890625, "learning_rate": 1.3937385725941234e-05, "loss": 4.4156, "loss/crossentropy": 2.0871371999382973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19260753318667412, "step": 10866 }, { "epoch": 0.9056666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.053971354166666666, "learning_rate": 1.39178580610341e-05, "loss": 4.9657, "loss/crossentropy": 2.59759783744812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2034224160015583, "step": 10868 }, { "epoch": 0.9058333333333334, "grad_norm": 4.4375, "grad_norm_var": 612.4866821289063, "learning_rate": 1.3898355272809958e-05, "loss": 4.1001, "loss/crossentropy": 1.2347158193588257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14453835971653461, "step": 10870 }, { "epoch": 0.906, "grad_norm": 4.34375, "grad_norm_var": 612.8882446289062, "learning_rate": 1.387887740939001e-05, "loss": 4.843, "loss/crossentropy": 2.1636237651109695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18093391880393028, "step": 10872 }, { "epoch": 0.9061666666666667, "grad_norm": 4.28125, "grad_norm_var": 613.5347005208333, "learning_rate": 1.3859424518833944e-05, "loss": 4.4894, "loss/crossentropy": 1.5998671725392342, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15799619629979134, "step": 10874 }, { "epoch": 0.9063333333333333, "grad_norm": 5.03125, "grad_norm_var": 613.0373697916667, "learning_rate": 1.3839996649139834e-05, "loss": 5.1337, "loss/crossentropy": 2.1352964639663696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2174740508198738, "step": 10876 }, { "epoch": 0.9065, "grad_norm": 4.5, "grad_norm_var": 612.7711873372396, "learning_rate": 1.382059384824401e-05, "loss": 5.4513, "loss/crossentropy": 2.504655659198761, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20499487966299057, "step": 10878 }, { "epoch": 0.9066666666666666, "grad_norm": 4.5, "grad_norm_var": 612.481103515625, "learning_rate": 1.3801216164020966e-05, "loss": 4.6749, "loss/crossentropy": 2.438272774219513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21250415965914726, "step": 10880 }, { "epoch": 0.9068333333333334, "grad_norm": 4.75, "grad_norm_var": 612.1158203125, "learning_rate": 1.3781863644283204e-05, "loss": 5.1241, "loss/crossentropy": 2.322550445795059, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21630997583270073, "step": 10882 }, { "epoch": 0.907, "grad_norm": 4.84375, "grad_norm_var": 611.60078125, "learning_rate": 1.376253633678115e-05, "loss": 5.3166, "loss/crossentropy": 2.158540368080139, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1832444816827774, "step": 10884 }, { "epoch": 0.9071666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.06900634765625, "learning_rate": 1.3743234289202998e-05, "loss": 4.8893, "loss/crossentropy": 1.7209921851754189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16882705315947533, "step": 10886 }, { "epoch": 0.9073333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.07086181640625, "learning_rate": 1.3723957549174652e-05, "loss": 5.1099, "loss/crossentropy": 2.0559261441230774, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1762427855283022, "step": 10888 }, { "epoch": 0.9075, "grad_norm": 4.15625, "grad_norm_var": 0.06623942057291667, "learning_rate": 1.370470616425954e-05, "loss": 4.6185, "loss/crossentropy": 1.8271742761135101, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17320568673312664, "step": 10890 }, { "epoch": 0.9076666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.05911458333333333, "learning_rate": 1.3685480181958544e-05, "loss": 5.4106, "loss/crossentropy": 2.7878470420837402, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20561690255999565, "step": 10892 }, { "epoch": 0.9078333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.04664306640625, "learning_rate": 1.3666279649709855e-05, "loss": 4.8514, "loss/crossentropy": 1.7777061834931374, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18127823993563652, "step": 10894 }, { "epoch": 0.908, "grad_norm": 4.46875, "grad_norm_var": 0.04895426432291667, "learning_rate": 1.3647104614888897e-05, "loss": 4.7707, "loss/crossentropy": 1.4975739419460297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14876068755984306, "step": 10896 }, { "epoch": 0.9081666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.044661458333333334, "learning_rate": 1.362795512480814e-05, "loss": 4.7532, "loss/crossentropy": 2.1489458978176117, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19379611685872078, "step": 10898 }, { "epoch": 0.9083333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.03919270833333333, "learning_rate": 1.3608831226717065e-05, "loss": 4.7385, "loss/crossentropy": 2.505477249622345, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21521490439772606, "step": 10900 }, { "epoch": 0.9085, "grad_norm": 4.71875, "grad_norm_var": 0.0435546875, "learning_rate": 1.358973296780198e-05, "loss": 5.414, "loss/crossentropy": 2.2798091173171997, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20843492820858955, "step": 10902 }, { "epoch": 0.9086666666666666, "grad_norm": 4.625, "grad_norm_var": 0.044270833333333336, "learning_rate": 1.3570660395185943e-05, "loss": 4.9674, "loss/crossentropy": 2.234380006790161, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19471841678023338, "step": 10904 }, { "epoch": 0.9088333333333334, "grad_norm": 4.21875, "grad_norm_var": 0.036458333333333336, "learning_rate": 1.355161355592863e-05, "loss": 4.7723, "loss/crossentropy": 1.673406831920147, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17296934872865677, "step": 10906 }, { "epoch": 0.909, "grad_norm": 4.71875, "grad_norm_var": 0.02578125, "learning_rate": 1.3532592497026228e-05, "loss": 4.7906, "loss/crossentropy": 1.6391168981790543, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15169396623969078, "step": 10908 }, { "epoch": 0.9091666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.02896728515625, "learning_rate": 1.35135972654113e-05, "loss": 5.0298, "loss/crossentropy": 2.1111242473125458, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2054269053041935, "step": 10910 }, { "epoch": 0.9093333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.027669270833333332, "learning_rate": 1.3494627907952702e-05, "loss": 5.09, "loss/crossentropy": 1.5478358790278435, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15873121563345194, "step": 10912 }, { "epoch": 0.9095, "grad_norm": 4.71875, "grad_norm_var": 0.027978515625, "learning_rate": 1.3475684471455423e-05, "loss": 5.0438, "loss/crossentropy": 1.9041509926319122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19300096109509468, "step": 10914 }, { "epoch": 0.9096666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.026676432291666666, "learning_rate": 1.345676700266053e-05, "loss": 5.0814, "loss/crossentropy": 2.0266382694244385, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19544285908341408, "step": 10916 }, { "epoch": 0.9098333333333334, "grad_norm": 4.40625, "grad_norm_var": 0.027734375, "learning_rate": 1.3437875548244986e-05, "loss": 4.7484, "loss/crossentropy": 1.905693419277668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17530391551554203, "step": 10918 }, { "epoch": 0.91, "grad_norm": 4.875, "grad_norm_var": 0.03209228515625, "learning_rate": 1.3419010154821575e-05, "loss": 4.9243, "loss/crossentropy": 1.5361211821436882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17113251611590385, "step": 10920 }, { "epoch": 0.9101666666666667, "grad_norm": 4.59375, "grad_norm_var": 1.05982666015625, "learning_rate": 1.3400170868938775e-05, "loss": 5.146, "loss/crossentropy": 2.2465337216854095, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19919180497527122, "step": 10922 }, { "epoch": 0.9103333333333333, "grad_norm": 4.65625, "grad_norm_var": 1.0621744791666667, "learning_rate": 1.3381357737080665e-05, "loss": 4.9156, "loss/crossentropy": 2.347740739583969, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21325716003775597, "step": 10924 }, { "epoch": 0.9105, "grad_norm": 4.5625, "grad_norm_var": 1.0676920572916666, "learning_rate": 1.336257080566677e-05, "loss": 5.1032, "loss/crossentropy": 1.686830684542656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18585955165326595, "step": 10926 }, { "epoch": 0.9106666666666666, "grad_norm": 4.59375, "grad_norm_var": 1.075634765625, "learning_rate": 1.3343810121051977e-05, "loss": 4.5115, "loss/crossentropy": 2.1991084814071655, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18302945792675018, "step": 10928 }, { "epoch": 0.9108333333333334, "grad_norm": 4.75, "grad_norm_var": 1.0740885416666666, "learning_rate": 1.3325075729526401e-05, "loss": 5.1483, "loss/crossentropy": 2.5179224014282227, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21267320960760117, "step": 10930 }, { "epoch": 0.911, "grad_norm": 4.28125, "grad_norm_var": 1.1177734375, "learning_rate": 1.3306367677315315e-05, "loss": 4.0085, "loss/crossentropy": 0.39225253462791443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.07208161056041718, "step": 10932 }, { "epoch": 0.9111666666666667, "grad_norm": 4.40625, "grad_norm_var": 1.1162109375, "learning_rate": 1.3287686010578954e-05, "loss": 4.9949, "loss/crossentropy": 2.163930505514145, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22702163830399513, "step": 10934 }, { "epoch": 0.9113333333333333, "grad_norm": 4.5, "grad_norm_var": 1.1220011393229166, "learning_rate": 1.3269030775412481e-05, "loss": 4.8736, "loss/crossentropy": 2.1811038851737976, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20420709252357483, "step": 10936 }, { "epoch": 0.9115, "grad_norm": 4.46875, "grad_norm_var": 0.06148681640625, "learning_rate": 1.3250402017845839e-05, "loss": 4.7587, "loss/crossentropy": 1.7884374484419823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15043306909501553, "step": 10938 }, { "epoch": 0.9116666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.06451822916666666, "learning_rate": 1.323179978384363e-05, "loss": 5.5021, "loss/crossentropy": 2.5016011595726013, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1943902149796486, "step": 10940 }, { "epoch": 0.9118333333333334, "grad_norm": 4.0625, "grad_norm_var": 0.07628580729166666, "learning_rate": 1.3213224119305017e-05, "loss": 4.4301, "loss/crossentropy": 0.9773362800478935, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11285099945962429, "step": 10942 }, { "epoch": 0.912, "grad_norm": 4.65625, "grad_norm_var": 0.07906494140625, "learning_rate": 1.319467507006361e-05, "loss": 4.7561, "loss/crossentropy": 1.7318015322089195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21848906949162483, "step": 10944 }, { "epoch": 0.9121666666666667, "grad_norm": 4.625, "grad_norm_var": 0.07810872395833333, "learning_rate": 1.3176152681887345e-05, "loss": 4.764, "loss/crossentropy": 2.075657568871975, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18956227414309978, "step": 10946 }, { "epoch": 0.9123333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.05273030598958333, "learning_rate": 1.3157657000478367e-05, "loss": 4.7062, "loss/crossentropy": 1.3858967423439026, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14041148498654366, "step": 10948 }, { "epoch": 0.9125, "grad_norm": 4.6875, "grad_norm_var": 0.030712890625, "learning_rate": 1.3139188071472933e-05, "loss": 4.9684, "loss/crossentropy": 2.010605439543724, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19537147507071495, "step": 10950 }, { "epoch": 0.9126666666666666, "grad_norm": 4.875, "grad_norm_var": 0.03980712890625, "learning_rate": 1.3120745940441295e-05, "loss": 5.0574, "loss/crossentropy": 2.2360286712646484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22419559210538864, "step": 10952 }, { "epoch": 0.9128333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.054150390625, "learning_rate": 1.3102330652887573e-05, "loss": 4.4787, "loss/crossentropy": 1.4420068562030792, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13918348774313927, "step": 10954 }, { "epoch": 0.913, "grad_norm": 4.46875, "grad_norm_var": 0.0525390625, "learning_rate": 1.308394225424966e-05, "loss": 5.179, "loss/crossentropy": 1.8898730650544167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17681991308927536, "step": 10956 }, { "epoch": 0.9131666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.03752848307291667, "learning_rate": 1.3065580789899097e-05, "loss": 4.9317, "loss/crossentropy": 1.9032281190156937, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17939336225390434, "step": 10958 }, { "epoch": 0.9133333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.04269205729166667, "learning_rate": 1.3047246305140982e-05, "loss": 5.2684, "loss/crossentropy": 2.2830842435359955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21606996655464172, "step": 10960 }, { "epoch": 0.9135, "grad_norm": 4.875, "grad_norm_var": 0.04361572265625, "learning_rate": 1.3028938845213828e-05, "loss": 4.613, "loss/crossentropy": 2.186072915792465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21759696677327156, "step": 10962 }, { "epoch": 0.9136666666666666, "grad_norm": 4.90625, "grad_norm_var": 0.04894205729166667, "learning_rate": 1.3010658455289471e-05, "loss": 5.0303, "loss/crossentropy": 2.524782419204712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2118838131427765, "step": 10964 }, { "epoch": 0.9138333333333334, "grad_norm": 4.84375, "grad_norm_var": 0.05349934895833333, "learning_rate": 1.2992405180472953e-05, "loss": 5.3547, "loss/crossentropy": 2.450446605682373, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2266518585383892, "step": 10966 }, { "epoch": 0.914, "grad_norm": 4.40625, "grad_norm_var": 0.04920247395833333, "learning_rate": 1.297417906580243e-05, "loss": 4.9993, "loss/crossentropy": 2.5028828382492065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21859336644411087, "step": 10968 }, { "epoch": 0.9141666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.0435546875, "learning_rate": 1.2955980156249006e-05, "loss": 4.9976, "loss/crossentropy": 1.8704118728637695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18769513443112373, "step": 10970 }, { "epoch": 0.9143333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.04302978515625, "learning_rate": 1.2937808496716699e-05, "loss": 4.9612, "loss/crossentropy": 2.005654275417328, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18199091777205467, "step": 10972 }, { "epoch": 0.9145, "grad_norm": 4.375, "grad_norm_var": 0.049149576822916666, "learning_rate": 1.291966413204227e-05, "loss": 4.6524, "loss/crossentropy": 1.4702882021665573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16470220685005188, "step": 10974 }, { "epoch": 0.9146666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.04498697916666667, "learning_rate": 1.2901547106995125e-05, "loss": 4.5217, "loss/crossentropy": 1.9486939013004303, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17468804866075516, "step": 10976 }, { "epoch": 0.9148333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.0376953125, "learning_rate": 1.2883457466277226e-05, "loss": 4.2599, "loss/crossentropy": 2.089116282761097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19068294391036034, "step": 10978 }, { "epoch": 0.915, "grad_norm": 4.5625, "grad_norm_var": 0.028108723958333335, "learning_rate": 1.2865395254522972e-05, "loss": 5.3167, "loss/crossentropy": 2.388074040412903, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2232646606862545, "step": 10980 }, { "epoch": 0.9151666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.021744791666666666, "learning_rate": 1.284736051629907e-05, "loss": 4.6976, "loss/crossentropy": 1.801637277007103, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17922187969088554, "step": 10982 }, { "epoch": 0.9153333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.024800618489583332, "learning_rate": 1.282935329610444e-05, "loss": 4.4747, "loss/crossentropy": 0.9352747425436974, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11156686767935753, "step": 10984 }, { "epoch": 0.9155, "grad_norm": 4.40625, "grad_norm_var": 0.032145182291666664, "learning_rate": 1.2811373638370108e-05, "loss": 5.2869, "loss/crossentropy": 1.7718759551644325, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15538059920072556, "step": 10986 }, { "epoch": 0.9156666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.04332275390625, "learning_rate": 1.27934215874591e-05, "loss": 5.5746, "loss/crossentropy": 2.574616312980652, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20968396216630936, "step": 10988 }, { "epoch": 0.9158333333333334, "grad_norm": 4.34375, "grad_norm_var": 0.03982747395833333, "learning_rate": 1.277549718766631e-05, "loss": 4.4059, "loss/crossentropy": 0.8218652456998825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11896517686545849, "step": 10990 }, { "epoch": 0.916, "grad_norm": 4.90625, "grad_norm_var": 0.04698893229166667, "learning_rate": 1.2757600483218418e-05, "loss": 5.491, "loss/crossentropy": 1.6398277059197426, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1669867243617773, "step": 10992 }, { "epoch": 0.9161666666666667, "grad_norm": 5.09375, "grad_norm_var": 0.07248942057291667, "learning_rate": 1.273973151827375e-05, "loss": 5.5911, "loss/crossentropy": 2.2634086310863495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104131430387497, "step": 10994 }, { "epoch": 0.9163333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.07828369140625, "learning_rate": 1.2721890336922219e-05, "loss": 5.099, "loss/crossentropy": 2.065896801650524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17549334466457367, "step": 10996 }, { "epoch": 0.9165, "grad_norm": 4.5625, "grad_norm_var": 0.06643473307291667, "learning_rate": 1.2704076983185156e-05, "loss": 5.3278, "loss/crossentropy": 1.8941172808408737, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17898527905344963, "step": 10998 }, { "epoch": 0.9166666666666666, "grad_norm": 4.21875, "grad_norm_var": 0.0703125, "learning_rate": 1.2686291501015243e-05, "loss": 4.571, "loss/crossentropy": 1.3857896998524666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1634297538548708, "step": 11000 }, { "epoch": 0.9168333333333333, "grad_norm": 4.25, "grad_norm_var": 0.07737223307291667, "learning_rate": 1.2668533934296388e-05, "loss": 4.5265, "loss/crossentropy": 1.7431185841560364, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19076452031731606, "step": 11002 }, { "epoch": 0.917, "grad_norm": 4.71875, "grad_norm_var": 0.08941650390625, "learning_rate": 1.2650804326843624e-05, "loss": 4.7464, "loss/crossentropy": 2.4716763496398926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1999698057770729, "step": 11004 }, { "epoch": 0.9171666666666667, "grad_norm": 4.84375, "grad_norm_var": 0.08411458333333334, "learning_rate": 1.2633102722402993e-05, "loss": 4.9842, "loss/crossentropy": 1.752097338438034, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20358425192534924, "step": 11006 }, { "epoch": 0.9173333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.08411458333333334, "learning_rate": 1.2615429164651437e-05, "loss": 4.9499, "loss/crossentropy": 1.851276509463787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18495296128094196, "step": 11008 }, { "epoch": 0.9175, "grad_norm": 4.6875, "grad_norm_var": 0.054150390625, "learning_rate": 1.2597783697196717e-05, "loss": 4.9879, "loss/crossentropy": 2.3310405611991882, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24937022477388382, "step": 11010 }, { "epoch": 0.9176666666666666, "grad_norm": 4.28125, "grad_norm_var": 0.048563639322916664, "learning_rate": 1.2580166363577262e-05, "loss": 4.2779, "loss/crossentropy": 1.6911320835351944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16903837397694588, "step": 11012 }, { "epoch": 0.9178333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.06197916666666667, "learning_rate": 1.2562577207262094e-05, "loss": 4.5681, "loss/crossentropy": 2.2415121346712112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18968461081385612, "step": 11014 }, { "epoch": 0.918, "grad_norm": 4.6875, "grad_norm_var": 0.05308837890625, "learning_rate": 1.2545016271650703e-05, "loss": 4.945, "loss/crossentropy": 2.0836883261799812, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19096532836556435, "step": 11016 }, { "epoch": 0.9181666666666667, "grad_norm": 4.25, "grad_norm_var": 0.0548828125, "learning_rate": 1.2527483600072958e-05, "loss": 4.5348, "loss/crossentropy": 1.9989722445607185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17667778953909874, "step": 11018 }, { "epoch": 0.9183333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.06900634765625, "learning_rate": 1.2509979235788983e-05, "loss": 4.9737, "loss/crossentropy": 1.6975673288106918, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16448040679097176, "step": 11020 }, { "epoch": 0.9185, "grad_norm": 4.9375, "grad_norm_var": 0.07389322916666667, "learning_rate": 1.2492503221989052e-05, "loss": 5.2162, "loss/crossentropy": 1.9220678880810738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19711147621273994, "step": 11022 }, { "epoch": 0.9186666666666666, "grad_norm": 4.375, "grad_norm_var": 0.07893473307291667, "learning_rate": 1.247505560179349e-05, "loss": 4.6263, "loss/crossentropy": 2.1792136132717133, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1773302685469389, "step": 11024 }, { "epoch": 0.9188333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.07760416666666667, "learning_rate": 1.2457636418252576e-05, "loss": 4.8479, "loss/crossentropy": 1.406090959906578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16720456257462502, "step": 11026 }, { "epoch": 0.919, "grad_norm": 4.40625, "grad_norm_var": 0.09107666015625, "learning_rate": 1.2440245714346406e-05, "loss": 5.0846, "loss/crossentropy": 1.9798620790243149, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17173392325639725, "step": 11028 }, { "epoch": 0.9191666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.07174479166666667, "learning_rate": 1.2422883532984816e-05, "loss": 4.3883, "loss/crossentropy": 1.6991348788142204, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16558572091162205, "step": 11030 }, { "epoch": 0.9193333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.07125244140625, "learning_rate": 1.2405549917007256e-05, "loss": 5.5011, "loss/crossentropy": 1.8893938288092613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1703520081937313, "step": 11032 }, { "epoch": 0.9195, "grad_norm": 4.78125, "grad_norm_var": 0.0822265625, "learning_rate": 1.2388244909182714e-05, "loss": 4.652, "loss/crossentropy": 2.1600342392921448, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012832432985306, "step": 11034 }, { "epoch": 0.9196666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.06418863932291667, "learning_rate": 1.2370968552209557e-05, "loss": 5.0321, "loss/crossentropy": 2.060860723257065, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1751319319009781, "step": 11036 }, { "epoch": 0.9198333333333333, "grad_norm": 4.625, "grad_norm_var": 0.05611572265625, "learning_rate": 1.2353720888715498e-05, "loss": 4.6567, "loss/crossentropy": 2.5102124214172363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21535085886716843, "step": 11038 }, { "epoch": 0.92, "grad_norm": 4.5, "grad_norm_var": 0.0546875, "learning_rate": 1.2336501961257421e-05, "loss": 5.4442, "loss/crossentropy": 2.1127854585647583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.202357716858387, "step": 11040 }, { "epoch": 0.9201666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.06477457682291667, "learning_rate": 1.231931181232132e-05, "loss": 5.2808, "loss/crossentropy": 2.1201717257499695, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18782678991556168, "step": 11042 }, { "epoch": 0.9203333333333333, "grad_norm": 4.5, "grad_norm_var": 0.049853515625, "learning_rate": 1.2302150484322178e-05, "loss": 5.2121, "loss/crossentropy": 2.13012208789587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18352052569389343, "step": 11044 }, { "epoch": 0.9205, "grad_norm": 4.34375, "grad_norm_var": 0.04674072265625, "learning_rate": 1.2285018019603867e-05, "loss": 4.8306, "loss/crossentropy": 2.0504641234874725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21500540524721146, "step": 11046 }, { "epoch": 0.9206666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.06425374348958333, "learning_rate": 1.2267914460439046e-05, "loss": 4.9593, "loss/crossentropy": 1.9509310349822044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2244015671312809, "step": 11048 }, { "epoch": 0.9208333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.05028889973958333, "learning_rate": 1.2250839849029038e-05, "loss": 4.9527, "loss/crossentropy": 1.923073947429657, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1771315522491932, "step": 11050 }, { "epoch": 0.921, "grad_norm": 4.65625, "grad_norm_var": 0.04788004557291667, "learning_rate": 1.2233794227503747e-05, "loss": 5.1758, "loss/crossentropy": 2.4496266841888428, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20912783220410347, "step": 11052 }, { "epoch": 0.9211666666666667, "grad_norm": 4.375, "grad_norm_var": 0.05807291666666667, "learning_rate": 1.2216777637921565e-05, "loss": 4.3485, "loss/crossentropy": 1.2497363984584808, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13991161063313484, "step": 11054 }, { "epoch": 0.9213333333333333, "grad_norm": 4.25, "grad_norm_var": 0.060009765625, "learning_rate": 1.2199790122269222e-05, "loss": 4.95, "loss/crossentropy": 2.5281606912612915, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20351628586649895, "step": 11056 }, { "epoch": 0.9215, "grad_norm": 4.78125, "grad_norm_var": 0.060791015625, "learning_rate": 1.2182831722461727e-05, "loss": 4.9759, "loss/crossentropy": 1.8820656910538673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970411352813244, "step": 11058 }, { "epoch": 0.9216666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.065625, "learning_rate": 1.2165902480342244e-05, "loss": 5.4734, "loss/crossentropy": 2.3079889118671417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18381556309759617, "step": 11060 }, { "epoch": 0.9218333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.06405843098958333, "learning_rate": 1.2149002437682004e-05, "loss": 5.1037, "loss/crossentropy": 2.392060697078705, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20238174125552177, "step": 11062 }, { "epoch": 0.922, "grad_norm": 4.46875, "grad_norm_var": 0.04625244140625, "learning_rate": 1.2132131636180175e-05, "loss": 5.283, "loss/crossentropy": 2.3549709618091583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19442760944366455, "step": 11064 }, { "epoch": 0.9221666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.03883056640625, "learning_rate": 1.2115290117463785e-05, "loss": 5.2563, "loss/crossentropy": 1.8912615105509758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17870598286390305, "step": 11066 }, { "epoch": 0.9223333333333333, "grad_norm": 4.1875, "grad_norm_var": 0.04599202473958333, "learning_rate": 1.20984779230876e-05, "loss": 5.0817, "loss/crossentropy": 1.5166109129786491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1375791635364294, "step": 11068 }, { "epoch": 0.9225, "grad_norm": 4.5625, "grad_norm_var": 0.039872233072916666, "learning_rate": 1.2081695094534054e-05, "loss": 4.9119, "loss/crossentropy": 2.3969703912734985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21224704012274742, "step": 11070 }, { "epoch": 0.9226666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.03229166666666667, "learning_rate": 1.2064941673213088e-05, "loss": 4.7744, "loss/crossentropy": 2.287917584180832, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23141101002693176, "step": 11072 }, { "epoch": 0.9228333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.026102701822916668, "learning_rate": 1.204821770046212e-05, "loss": 5.204, "loss/crossentropy": 2.38395032286644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20762236416339874, "step": 11074 }, { "epoch": 0.923, "grad_norm": 4.75, "grad_norm_var": 0.022261555989583334, "learning_rate": 1.2031523217545887e-05, "loss": 4.7377, "loss/crossentropy": 2.1217075884342194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20826097577810287, "step": 11076 }, { "epoch": 0.9231666666666667, "grad_norm": 4.625, "grad_norm_var": 0.024995930989583335, "learning_rate": 1.2014858265656357e-05, "loss": 5.0618, "loss/crossentropy": 1.8859133496880531, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17268339358270168, "step": 11078 }, { "epoch": 0.9233333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.02662353515625, "learning_rate": 1.1998222885912649e-05, "loss": 4.695, "loss/crossentropy": 1.691146194934845, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18018352426588535, "step": 11080 }, { "epoch": 0.9235, "grad_norm": 4.5, "grad_norm_var": 41.13619384765625, "learning_rate": 1.1981617119360914e-05, "loss": 4.4921, "loss/crossentropy": 1.6973706856369972, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1616340707987547, "step": 11082 }, { "epoch": 0.9236666666666666, "grad_norm": 4.46875, "grad_norm_var": 41.20631103515625, "learning_rate": 1.196504100697422e-05, "loss": 4.9428, "loss/crossentropy": 1.8388321250677109, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19290312752127647, "step": 11084 }, { "epoch": 0.9238333333333333, "grad_norm": 4.59375, "grad_norm_var": 41.232906087239584, "learning_rate": 1.1948494589652487e-05, "loss": 4.5823, "loss/crossentropy": 1.5040778517723083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14187092706561089, "step": 11086 }, { "epoch": 0.924, "grad_norm": 4.5, "grad_norm_var": 41.27118733723958, "learning_rate": 1.193197790822234e-05, "loss": 4.8349, "loss/crossentropy": 1.6005319356918335, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17666389420628548, "step": 11088 }, { "epoch": 0.9241666666666667, "grad_norm": 4.75, "grad_norm_var": 41.18232014973958, "learning_rate": 1.1915491003437065e-05, "loss": 4.9295, "loss/crossentropy": 1.6699941158294678, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17156944423913956, "step": 11090 }, { "epoch": 0.9243333333333333, "grad_norm": 4.5, "grad_norm_var": 41.20974934895833, "learning_rate": 1.1899033915976453e-05, "loss": 4.8379, "loss/crossentropy": 1.6831908822059631, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17644242197275162, "step": 11092 }, { "epoch": 0.9245, "grad_norm": 4.46875, "grad_norm_var": 41.141011555989586, "learning_rate": 1.1882606686446732e-05, "loss": 5.266, "loss/crossentropy": 2.5757681727409363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20049363002181053, "step": 11094 }, { "epoch": 0.9246666666666666, "grad_norm": 4.84375, "grad_norm_var": 40.965458170572916, "learning_rate": 1.1866209355380452e-05, "loss": 4.6894, "loss/crossentropy": 1.46444021910429, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18953786976635456, "step": 11096 }, { "epoch": 0.9248333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.03970947265625, "learning_rate": 1.1849841963236408e-05, "loss": 5.0015, "loss/crossentropy": 2.0792530477046967, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22961734235286713, "step": 11098 }, { "epoch": 0.925, "grad_norm": 4.5625, "grad_norm_var": 0.03365478515625, "learning_rate": 1.1833504550399506e-05, "loss": 4.8575, "loss/crossentropy": 2.336266815662384, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21120865643024445, "step": 11100 }, { "epoch": 0.9251666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.04254150390625, "learning_rate": 1.1817197157180693e-05, "loss": 5.5143, "loss/crossentropy": 2.3953791558742523, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2296001985669136, "step": 11102 }, { "epoch": 0.9253333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.05139567057291667, "learning_rate": 1.1800919823816834e-05, "loss": 5.3345, "loss/crossentropy": 2.2379717230796814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23791631683707237, "step": 11104 }, { "epoch": 0.9255, "grad_norm": 4.375, "grad_norm_var": 0.056624348958333334, "learning_rate": 1.1784672590470643e-05, "loss": 4.544, "loss/crossentropy": 1.2902886420488358, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16380742564797401, "step": 11106 }, { "epoch": 0.9256666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.06513264973958334, "learning_rate": 1.1768455497230537e-05, "loss": 4.456, "loss/crossentropy": 1.441624328494072, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13727031461894512, "step": 11108 }, { "epoch": 0.9258333333333333, "grad_norm": 4.5, "grad_norm_var": 0.060868326822916666, "learning_rate": 1.1752268584110593e-05, "loss": 4.7432, "loss/crossentropy": 1.9650721102952957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18655399791896343, "step": 11110 }, { "epoch": 0.926, "grad_norm": 4.78125, "grad_norm_var": 0.051102701822916666, "learning_rate": 1.1736111891050406e-05, "loss": 5.304, "loss/crossentropy": 2.145664870738983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.239684097468853, "step": 11112 }, { "epoch": 0.9261666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.04827067057291667, "learning_rate": 1.1719985457915014e-05, "loss": 4.5032, "loss/crossentropy": 2.4814305305480957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20958179607987404, "step": 11114 }, { "epoch": 0.9263333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.04413655598958333, "learning_rate": 1.1703889324494778e-05, "loss": 4.7755, "loss/crossentropy": 2.2378440499305725, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20146576315164566, "step": 11116 }, { "epoch": 0.9265, "grad_norm": 4.34375, "grad_norm_var": 0.03313395182291667, "learning_rate": 1.1687823530505315e-05, "loss": 5.0817, "loss/crossentropy": 2.245323598384857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19192436337471008, "step": 11118 }, { "epoch": 0.9266666666666666, "grad_norm": 4.34375, "grad_norm_var": 0.03518473307291667, "learning_rate": 1.1671788115587374e-05, "loss": 4.7758, "loss/crossentropy": 1.3894713819026947, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16750122606754303, "step": 11120 }, { "epoch": 0.9268333333333333, "grad_norm": 4.75, "grad_norm_var": 0.03463541666666667, "learning_rate": 1.1655783119306752e-05, "loss": 5.3347, "loss/crossentropy": 2.481057107448578, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20578979700803757, "step": 11122 }, { "epoch": 0.927, "grad_norm": 4.15625, "grad_norm_var": 0.04648030598958333, "learning_rate": 1.163980858115417e-05, "loss": 4.632, "loss/crossentropy": 1.7022991552948952, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15510082617402077, "step": 11124 }, { "epoch": 0.9271666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.04983317057291667, "learning_rate": 1.1623864540545231e-05, "loss": 4.7536, "loss/crossentropy": 2.399439185857773, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2043217159807682, "step": 11126 }, { "epoch": 0.9273333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.05299072265625, "learning_rate": 1.1607951036820262e-05, "loss": 4.3418, "loss/crossentropy": 1.9050696045160294, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17808566614985466, "step": 11128 }, { "epoch": 0.9275, "grad_norm": 4.8125, "grad_norm_var": 0.06236979166666667, "learning_rate": 1.1592068109244253e-05, "loss": 5.3348, "loss/crossentropy": 2.106477528810501, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24340755119919777, "step": 11130 }, { "epoch": 0.9276666666666666, "grad_norm": 4.75, "grad_norm_var": 0.06741129557291667, "learning_rate": 1.1576215797006743e-05, "loss": 5.5603, "loss/crossentropy": 2.5341862440109253, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22335410490632057, "step": 11132 }, { "epoch": 0.9278333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.07589518229166667, "learning_rate": 1.1560394139221746e-05, "loss": 4.3389, "loss/crossentropy": 1.561987891793251, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1611312460154295, "step": 11134 }, { "epoch": 0.928, "grad_norm": 4.84375, "grad_norm_var": 0.075634765625, "learning_rate": 1.154460317492763e-05, "loss": 5.1787, "loss/crossentropy": 1.756756342947483, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17010868340730667, "step": 11136 }, { "epoch": 0.9281666666666667, "grad_norm": 4.5, "grad_norm_var": 0.07965087890625, "learning_rate": 1.152884294308702e-05, "loss": 5.1386, "loss/crossentropy": 2.078896164894104, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20521379634737968, "step": 11138 }, { "epoch": 0.9283333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.06614176432291667, "learning_rate": 1.1513113482586724e-05, "loss": 5.0978, "loss/crossentropy": 2.103352040052414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20386765152215958, "step": 11140 }, { "epoch": 0.9285, "grad_norm": 4.375, "grad_norm_var": 0.06272379557291667, "learning_rate": 1.1497414832237634e-05, "loss": 5.1341, "loss/crossentropy": 2.3629302382469177, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23492885753512383, "step": 11142 }, { "epoch": 0.9286666666666666, "grad_norm": 4.625, "grad_norm_var": 0.05846354166666667, "learning_rate": 1.1481747030774593e-05, "loss": 4.9212, "loss/crossentropy": 1.937618963420391, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17890803515911102, "step": 11144 }, { "epoch": 0.9288333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.04869384765625, "learning_rate": 1.1466110116856353e-05, "loss": 4.8033, "loss/crossentropy": 1.8597223535180092, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16911944560706615, "step": 11146 }, { "epoch": 0.929, "grad_norm": 4.65625, "grad_norm_var": 0.04894205729166667, "learning_rate": 1.1450504129065438e-05, "loss": 4.7281, "loss/crossentropy": 2.203222244977951, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1975608691573143, "step": 11148 }, { "epoch": 0.9291666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.037886555989583334, "learning_rate": 1.1434929105908086e-05, "loss": 4.8444, "loss/crossentropy": 1.9725009500980377, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20708056166768074, "step": 11150 }, { "epoch": 0.9293333333333333, "grad_norm": 4.1875, "grad_norm_var": 0.03948160807291667, "learning_rate": 1.1419385085814099e-05, "loss": 4.9513, "loss/crossentropy": 2.3870702385902405, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2217625379562378, "step": 11152 }, { "epoch": 0.9295, "grad_norm": 4.8125, "grad_norm_var": 0.04568684895833333, "learning_rate": 1.1403872107136816e-05, "loss": 4.6376, "loss/crossentropy": 1.5546553134918213, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16211243718862534, "step": 11154 }, { "epoch": 0.9296666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.046223958333333336, "learning_rate": 1.1388390208152962e-05, "loss": 5.1388, "loss/crossentropy": 2.185354083776474, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2254449725151062, "step": 11156 }, { "epoch": 0.9298333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.048567708333333334, "learning_rate": 1.1372939427062588e-05, "loss": 4.4744, "loss/crossentropy": 1.2566208392381668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1643849052488804, "step": 11158 }, { "epoch": 0.93, "grad_norm": 4.28125, "grad_norm_var": 0.054671223958333334, "learning_rate": 1.1357519801988954e-05, "loss": 4.1238, "loss/crossentropy": 1.4119196981191635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16916809789836407, "step": 11160 }, { "epoch": 0.9301666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.06249593098958333, "learning_rate": 1.1342131370978461e-05, "loss": 5.0425, "loss/crossentropy": 2.1853462755680084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916177235543728, "step": 11162 }, { "epoch": 0.9303333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.05829671223958333, "learning_rate": 1.132677417200053e-05, "loss": 4.6284, "loss/crossentropy": 1.6912791430950165, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18075689859688282, "step": 11164 }, { "epoch": 0.9305, "grad_norm": 5.71875, "grad_norm_var": 0.15859375, "learning_rate": 1.131144824294752e-05, "loss": 4.5363, "loss/crossentropy": 1.6092994064092636, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17647656612098217, "step": 11166 }, { "epoch": 0.9306666666666666, "grad_norm": 4.6875, "grad_norm_var": 0.14659830729166667, "learning_rate": 1.1296153621634636e-05, "loss": 5.163, "loss/crossentropy": 1.3289310112595558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1501878947019577, "step": 11168 }, { "epoch": 0.9308333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.14178059895833334, "learning_rate": 1.1280890345799842e-05, "loss": 4.7383, "loss/crossentropy": 2.2952709197998047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19613110646605492, "step": 11170 }, { "epoch": 0.931, "grad_norm": 4.15625, "grad_norm_var": 0.151806640625, "learning_rate": 1.126565845310375e-05, "loss": 4.2535, "loss/crossentropy": 2.18592032790184, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18933845311403275, "step": 11172 }, { "epoch": 0.9311666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.15920817057291667, "learning_rate": 1.125045798112954e-05, "loss": 5.3535, "loss/crossentropy": 1.5739598274230957, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.222409188747406, "step": 11174 }, { "epoch": 0.9313333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.14685872395833333, "learning_rate": 1.1235288967382864e-05, "loss": 5.3601, "loss/crossentropy": 2.36248779296875, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22431249916553497, "step": 11176 }, { "epoch": 0.9315, "grad_norm": 4.625, "grad_norm_var": 0.15898030598958332, "learning_rate": 1.1220151449291767e-05, "loss": 5.0139, "loss/crossentropy": 1.7432967498898506, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1720932126045227, "step": 11178 }, { "epoch": 0.9316666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.14892171223958334, "learning_rate": 1.1205045464206552e-05, "loss": 5.3784, "loss/crossentropy": 2.0632041543722153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18448470905423164, "step": 11180 }, { "epoch": 0.9318333333333333, "grad_norm": 4.15625, "grad_norm_var": 0.08092447916666666, "learning_rate": 1.1189971049399753e-05, "loss": 4.7851, "loss/crossentropy": 2.0081071704626083, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17401228658854961, "step": 11182 }, { "epoch": 0.932, "grad_norm": 4.28125, "grad_norm_var": 0.08815104166666667, "learning_rate": 1.1174928242065974e-05, "loss": 4.8777, "loss/crossentropy": 1.90491384267807, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2091764397919178, "step": 11184 }, { "epoch": 0.9321666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.0841796875, "learning_rate": 1.1159917079321865e-05, "loss": 5.2098, "loss/crossentropy": 1.6461158990859985, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16235189884901047, "step": 11186 }, { "epoch": 0.9323333333333333, "grad_norm": 4.1875, "grad_norm_var": 0.07303059895833333, "learning_rate": 1.114493759820596e-05, "loss": 4.622, "loss/crossentropy": 1.760587900876999, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17077770829200745, "step": 11188 }, { "epoch": 0.9325, "grad_norm": 4.53125, "grad_norm_var": 0.065478515625, "learning_rate": 1.112998983567865e-05, "loss": 4.8809, "loss/crossentropy": 2.216802418231964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19440119341015816, "step": 11190 }, { "epoch": 0.9326666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.0654296875, "learning_rate": 1.1115073828622052e-05, "loss": 5.1407, "loss/crossentropy": 1.1693921089172363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12160374782979488, "step": 11192 }, { "epoch": 0.9328333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.04021809895833333, "learning_rate": 1.110018961383993e-05, "loss": 5.0442, "loss/crossentropy": 2.37707781791687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22920748591423035, "step": 11194 }, { "epoch": 0.933, "grad_norm": 5.1875, "grad_norm_var": 0.06210530598958333, "learning_rate": 1.1085337228057597e-05, "loss": 5.733, "loss/crossentropy": 1.9281913936138153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2165273167192936, "step": 11196 }, { "epoch": 0.9331666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.0515625, "learning_rate": 1.1070516707921849e-05, "loss": 4.6594, "loss/crossentropy": 1.7963752299547195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17561664432287216, "step": 11198 }, { "epoch": 0.9333333333333333, "grad_norm": 4.5, "grad_norm_var": 0.04371337890625, "learning_rate": 1.1055728090000844e-05, "loss": 4.6236, "loss/crossentropy": 2.2825274989008904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17308339476585388, "step": 11200 }, { "epoch": 0.9335, "grad_norm": 5.125, "grad_norm_var": 0.06678059895833334, "learning_rate": 1.1040971410784026e-05, "loss": 4.8975, "loss/crossentropy": 1.9707480520009995, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18031662702560425, "step": 11202 }, { "epoch": 0.9336666666666666, "grad_norm": 4.25, "grad_norm_var": 0.06982014973958334, "learning_rate": 1.1026246706682024e-05, "loss": 4.7378, "loss/crossentropy": 1.832257367670536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1671583391726017, "step": 11204 }, { "epoch": 0.9338333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.0720703125, "learning_rate": 1.10115540140266e-05, "loss": 5.1746, "loss/crossentropy": 1.7907705903053284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.184514869004488, "step": 11206 }, { "epoch": 0.934, "grad_norm": 4.59375, "grad_norm_var": 0.07584228515625, "learning_rate": 1.0996893369070497e-05, "loss": 4.9469, "loss/crossentropy": 1.8899082094430923, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16880904138088226, "step": 11208 }, { "epoch": 0.9341666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.07815348307291667, "learning_rate": 1.098226480798741e-05, "loss": 5.3, "loss/crossentropy": 2.023133747279644, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2057199329137802, "step": 11210 }, { "epoch": 0.9343333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.052018229166666666, "learning_rate": 1.0967668366871851e-05, "loss": 4.7523, "loss/crossentropy": 1.6161313951015472, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2110748626291752, "step": 11212 }, { "epoch": 0.9345, "grad_norm": 4.09375, "grad_norm_var": 0.0712890625, "learning_rate": 1.0953104081739094e-05, "loss": 4.1247, "loss/crossentropy": 1.3108344376087189, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14841301552951336, "step": 11214 }, { "epoch": 0.9346666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.07255452473958333, "learning_rate": 1.0938571988525059e-05, "loss": 5.3319, "loss/crossentropy": 2.4737696051597595, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20922718197107315, "step": 11216 }, { "epoch": 0.9348333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.045572916666666664, "learning_rate": 1.0924072123086247e-05, "loss": 5.1227, "loss/crossentropy": 2.4679543375968933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21668322756886482, "step": 11218 }, { "epoch": 0.935, "grad_norm": 4.59375, "grad_norm_var": 0.04117431640625, "learning_rate": 1.0909604521199624e-05, "loss": 4.2241, "loss/crossentropy": 1.8565114438533783, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19333305954933167, "step": 11220 }, { "epoch": 0.9351666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.04308268229166667, "learning_rate": 1.0895169218562578e-05, "loss": 4.6922, "loss/crossentropy": 1.6779407858848572, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1518637202680111, "step": 11222 }, { "epoch": 0.9353333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.04244791666666667, "learning_rate": 1.0880766250792765e-05, "loss": 4.6942, "loss/crossentropy": 1.5626015737652779, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16061452589929104, "step": 11224 }, { "epoch": 0.9355, "grad_norm": 4.5, "grad_norm_var": 0.04058837890625, "learning_rate": 1.0866395653428086e-05, "loss": 4.4, "loss/crossentropy": 2.4633554816246033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20592839643359184, "step": 11226 }, { "epoch": 0.9356666666666666, "grad_norm": 4.09375, "grad_norm_var": 0.05310872395833333, "learning_rate": 1.085205746192656e-05, "loss": 4.9874, "loss/crossentropy": 2.4507370591163635, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22082506865262985, "step": 11228 }, { "epoch": 0.9358333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.04401041666666667, "learning_rate": 1.0837751711666246e-05, "loss": 4.5429, "loss/crossentropy": 1.2794091627001762, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1429294366389513, "step": 11230 }, { "epoch": 0.936, "grad_norm": 4.78125, "grad_norm_var": 0.04869384765625, "learning_rate": 1.0823478437945164e-05, "loss": 5.0331, "loss/crossentropy": 2.070686124265194, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17334336414933205, "step": 11232 }, { "epoch": 0.9361666666666667, "grad_norm": 4.875, "grad_norm_var": 0.060872395833333336, "learning_rate": 1.0809237675981197e-05, "loss": 4.8727, "loss/crossentropy": 1.6955928951501846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17060757614672184, "step": 11234 }, { "epoch": 0.9363333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.06170247395833333, "learning_rate": 1.0795029460912008e-05, "loss": 4.7234, "loss/crossentropy": 1.8138331472873688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17841831222176552, "step": 11236 }, { "epoch": 0.9365, "grad_norm": 4.65625, "grad_norm_var": 0.05716145833333333, "learning_rate": 1.0780853827794959e-05, "loss": 4.884, "loss/crossentropy": 2.296046257019043, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1997358687222004, "step": 11238 }, { "epoch": 0.9366666666666666, "grad_norm": 4.84375, "grad_norm_var": 0.06222330729166667, "learning_rate": 1.0766710811607011e-05, "loss": 5.2366, "loss/crossentropy": 1.3264843076467514, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1398827638477087, "step": 11240 }, { "epoch": 0.9368333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.051656087239583336, "learning_rate": 1.0752600447244654e-05, "loss": 5.2335, "loss/crossentropy": 1.9065971076488495, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1831565536558628, "step": 11242 }, { "epoch": 0.937, "grad_norm": 4.40625, "grad_norm_var": 0.03863525390625, "learning_rate": 1.073852276952381e-05, "loss": 4.6856, "loss/crossentropy": 2.1032577455043793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24391169473528862, "step": 11244 }, { "epoch": 0.9371666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.024202473958333335, "learning_rate": 1.072447781317975e-05, "loss": 4.7772, "loss/crossentropy": 2.3768675327301025, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24659275636076927, "step": 11246 }, { "epoch": 0.9373333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.020572916666666666, "learning_rate": 1.0710465612866999e-05, "loss": 4.7231, "loss/crossentropy": 1.6266694143414497, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16489019989967346, "step": 11248 }, { "epoch": 0.9375, "grad_norm": 4.0625, "grad_norm_var": 0.035868326822916664, "learning_rate": 1.0696486203159275e-05, "loss": 4.5313, "loss/crossentropy": 1.6675259098410606, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16724395845085382, "step": 11250 }, { "epoch": 0.9376666666666666, "grad_norm": 4.65625, "grad_norm_var": 0.038798014322916664, "learning_rate": 1.068253961854939e-05, "loss": 5.0235, "loss/crossentropy": 1.755689986050129, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17844800278544426, "step": 11252 }, { "epoch": 0.9378333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.0421875, "learning_rate": 1.0668625893449138e-05, "loss": 4.6551, "loss/crossentropy": 1.8777173906564713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17272545769810677, "step": 11254 }, { "epoch": 0.938, "grad_norm": 4.375, "grad_norm_var": 0.04452718098958333, "learning_rate": 1.0654745062189265e-05, "loss": 4.3983, "loss/crossentropy": 2.261226326227188, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18433188647031784, "step": 11256 }, { "epoch": 0.9381666666666667, "grad_norm": 4.875, "grad_norm_var": 0.055562337239583336, "learning_rate": 1.0640897159019337e-05, "loss": 5.332, "loss/crossentropy": 1.8724040985107422, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1859893724322319, "step": 11258 }, { "epoch": 0.9383333333333334, "grad_norm": 4.75, "grad_norm_var": 0.0578125, "learning_rate": 1.062708221810768e-05, "loss": 4.9082, "loss/crossentropy": 1.9240873903036118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1816778089851141, "step": 11260 }, { "epoch": 0.9385, "grad_norm": 4.53125, "grad_norm_var": 0.05546875, "learning_rate": 1.0613300273541285e-05, "loss": 4.7688, "loss/crossentropy": 2.6359651684761047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23091797903180122, "step": 11262 }, { "epoch": 0.9386666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.0478515625, "learning_rate": 1.0599551359325735e-05, "loss": 5.138, "loss/crossentropy": 2.3837802410125732, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20433859154582024, "step": 11264 }, { "epoch": 0.9388333333333333, "grad_norm": 5.65625, "grad_norm_var": 0.11451416015625, "learning_rate": 1.0585835509385108e-05, "loss": 4.6649, "loss/crossentropy": 1.5584058910608292, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1602993868291378, "step": 11266 }, { "epoch": 0.939, "grad_norm": 4.28125, "grad_norm_var": 0.117822265625, "learning_rate": 1.0572152757561898e-05, "loss": 4.6806, "loss/crossentropy": 2.2199689149856567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2136174440383911, "step": 11268 }, { "epoch": 0.9391666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.11357014973958333, "learning_rate": 1.0558503137616932e-05, "loss": 4.7357, "loss/crossentropy": 1.70195122808218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19203688949346542, "step": 11270 }, { "epoch": 0.9393333333333334, "grad_norm": 4.375, "grad_norm_var": 0.10624593098958333, "learning_rate": 1.0544886683229296e-05, "loss": 4.5968, "loss/crossentropy": 2.2238181829452515, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21144040301442146, "step": 11272 }, { "epoch": 0.9395, "grad_norm": 4.625, "grad_norm_var": 0.09920247395833333, "learning_rate": 1.0531303427996238e-05, "loss": 4.9468, "loss/crossentropy": 2.153789669275284, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2122886162251234, "step": 11274 }, { "epoch": 0.9396666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.092041015625, "learning_rate": 1.0517753405433089e-05, "loss": 5.4184, "loss/crossentropy": 2.0947333574295044, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19803571701049805, "step": 11276 }, { "epoch": 0.9398333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.09853108723958333, "learning_rate": 1.0504236648973173e-05, "loss": 4.4953, "loss/crossentropy": 2.064661145210266, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.201131172478199, "step": 11278 }, { "epoch": 0.94, "grad_norm": 4.4375, "grad_norm_var": 1.8308430989583333, "learning_rate": 1.0490753191967764e-05, "loss": 5.4182, "loss/crossentropy": 2.5684571266174316, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23840726912021637, "step": 11280 }, { "epoch": 0.9401666666666667, "grad_norm": 4.40625, "grad_norm_var": 1.8093587239583333, "learning_rate": 1.047730306768593e-05, "loss": 4.9324, "loss/crossentropy": 2.354119837284088, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2058768942952156, "step": 11282 }, { "epoch": 0.9403333333333334, "grad_norm": 4.40625, "grad_norm_var": 1.820166015625, "learning_rate": 1.0463886309314528e-05, "loss": 4.8346, "loss/crossentropy": 1.460967630147934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15813573449850082, "step": 11284 }, { "epoch": 0.9405, "grad_norm": 4.25, "grad_norm_var": 1.8402303059895833, "learning_rate": 1.045050294995807e-05, "loss": 5.3441, "loss/crossentropy": 1.9262563213706017, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17652270942926407, "step": 11286 }, { "epoch": 0.9406666666666667, "grad_norm": 4.59375, "grad_norm_var": 1.8167805989583334, "learning_rate": 1.0437153022638674e-05, "loss": 4.8955, "loss/crossentropy": 1.821229636669159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17122468166053295, "step": 11288 }, { "epoch": 0.9408333333333333, "grad_norm": 4.75, "grad_norm_var": 1.8264322916666667, "learning_rate": 1.0423836560295944e-05, "loss": 4.8418, "loss/crossentropy": 2.3553628623485565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22077826783061028, "step": 11290 }, { "epoch": 0.941, "grad_norm": 4.3125, "grad_norm_var": 1.84986572265625, "learning_rate": 1.0410553595786939e-05, "loss": 4.656, "loss/crossentropy": 1.8085390403866768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14958495646715164, "step": 11292 }, { "epoch": 0.9411666666666667, "grad_norm": 4.65625, "grad_norm_var": 1.840087890625, "learning_rate": 1.0397304161886049e-05, "loss": 5.1937, "loss/crossentropy": 2.1406570374965668, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.25243185088038445, "step": 11294 }, { "epoch": 0.9413333333333334, "grad_norm": 4.4375, "grad_norm_var": 0.03472900390625, "learning_rate": 1.0384088291284935e-05, "loss": 5.2409, "loss/crossentropy": 2.0899730026721954, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21066424995660782, "step": 11296 }, { "epoch": 0.9415, "grad_norm": 4.40625, "grad_norm_var": 0.04459228515625, "learning_rate": 1.0370906016592441e-05, "loss": 5.0729, "loss/crossentropy": 2.3987383246421814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20506568253040314, "step": 11298 }, { "epoch": 0.9416666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.042561848958333336, "learning_rate": 1.0357757370334528e-05, "loss": 4.9799, "loss/crossentropy": 2.0026203393936157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17385689727962017, "step": 11300 }, { "epoch": 0.9418333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.03883056640625, "learning_rate": 1.0344642384954166e-05, "loss": 4.2991, "loss/crossentropy": 1.580312892794609, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19087418541312218, "step": 11302 }, { "epoch": 0.942, "grad_norm": 4.25, "grad_norm_var": 0.0435546875, "learning_rate": 1.0331561092811282e-05, "loss": 4.7055, "loss/crossentropy": 1.8026033341884613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18064633011817932, "step": 11304 }, { "epoch": 0.9421666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.040087890625, "learning_rate": 1.0318513526182659e-05, "loss": 5.2248, "loss/crossentropy": 1.9698734879493713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1878417208790779, "step": 11306 }, { "epoch": 0.9423333333333334, "grad_norm": 4.75, "grad_norm_var": 0.03860677083333333, "learning_rate": 1.0305499717261872e-05, "loss": 5.1038, "loss/crossentropy": 2.082389175891876, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19220246747136116, "step": 11308 }, { "epoch": 0.9425, "grad_norm": 4.875, "grad_norm_var": 0.04412434895833333, "learning_rate": 1.029251969815921e-05, "loss": 5.6204, "loss/crossentropy": 1.8455578163266182, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17056034691631794, "step": 11310 }, { "epoch": 0.9426666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.04192301432291667, "learning_rate": 1.0279573500901568e-05, "loss": 4.7678, "loss/crossentropy": 1.603665716946125, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16744246520102024, "step": 11312 }, { "epoch": 0.9428333333333333, "grad_norm": 4.25, "grad_norm_var": 0.03917643229166667, "learning_rate": 1.0266661157432403e-05, "loss": 4.763, "loss/crossentropy": 2.118795096874237, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2045084908604622, "step": 11314 }, { "epoch": 0.943, "grad_norm": 4.28125, "grad_norm_var": 0.04192301432291667, "learning_rate": 1.0253782699611648e-05, "loss": 4.7973, "loss/crossentropy": 2.4567037224769592, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21456681564450264, "step": 11316 }, { "epoch": 0.9431666666666667, "grad_norm": 4.625, "grad_norm_var": 0.03599853515625, "learning_rate": 1.0240938159215603e-05, "loss": 4.6769, "loss/crossentropy": 1.9589063227176666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2025170996785164, "step": 11318 }, { "epoch": 0.9433333333333334, "grad_norm": 4.375, "grad_norm_var": 0.044820149739583336, "learning_rate": 1.0228127567936906e-05, "loss": 4.4701, "loss/crossentropy": 1.1538459286093712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13562462851405144, "step": 11320 }, { "epoch": 0.9435, "grad_norm": 4.375, "grad_norm_var": 0.045182291666666666, "learning_rate": 1.0215350957384408e-05, "loss": 4.4514, "loss/crossentropy": 2.2808018624782562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20572230219841003, "step": 11322 }, { "epoch": 0.9436666666666667, "grad_norm": 4.96875, "grad_norm_var": 0.05520833333333333, "learning_rate": 1.0202608359083141e-05, "loss": 4.7319, "loss/crossentropy": 2.2184520065784454, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21957562491297722, "step": 11324 }, { "epoch": 0.9438333333333333, "grad_norm": 4.09375, "grad_norm_var": 0.057450358072916666, "learning_rate": 1.0189899804474181e-05, "loss": 4.4743, "loss/crossentropy": 1.53329998254776, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14534585922956467, "step": 11326 }, { "epoch": 0.944, "grad_norm": 4.875, "grad_norm_var": 0.06506754557291666, "learning_rate": 1.0177225324914637e-05, "loss": 5.4519, "loss/crossentropy": 2.229980379343033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18609286099672318, "step": 11328 }, { "epoch": 0.9441666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.06656494140625, "learning_rate": 1.0164584951677522e-05, "loss": 5.1793, "loss/crossentropy": 1.5389113202691078, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17611819133162498, "step": 11330 }, { "epoch": 0.9443333333333334, "grad_norm": 4.9375, "grad_norm_var": 0.07356770833333333, "learning_rate": 1.01519787159517e-05, "loss": 4.5413, "loss/crossentropy": 1.4153800904750824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14668962359428406, "step": 11332 }, { "epoch": 0.9445, "grad_norm": 4.71875, "grad_norm_var": 0.08761393229166667, "learning_rate": 1.0139406648841803e-05, "loss": 4.8911, "loss/crossentropy": 1.660580761730671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15345065668225288, "step": 11334 }, { "epoch": 0.9446666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.07526041666666666, "learning_rate": 1.0126868781368162e-05, "loss": 4.4777, "loss/crossentropy": 1.9693571105599403, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19380193203687668, "step": 11336 }, { "epoch": 0.9448333333333333, "grad_norm": 4.8125, "grad_norm_var": 0.07727864583333334, "learning_rate": 1.0114365144466716e-05, "loss": 5.1901, "loss/crossentropy": 2.2114559710025787, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22473587468266487, "step": 11338 }, { "epoch": 0.945, "grad_norm": 4.9375, "grad_norm_var": 0.07615559895833333, "learning_rate": 1.0101895768988945e-05, "loss": 4.8001, "loss/crossentropy": 1.65935680270195, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22352956980466843, "step": 11340 }, { "epoch": 0.9451666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.062890625, "learning_rate": 1.0089460685701788e-05, "loss": 4.7855, "loss/crossentropy": 1.4414317682385445, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1585723366588354, "step": 11342 }, { "epoch": 0.9453333333333334, "grad_norm": 4.125, "grad_norm_var": 0.069384765625, "learning_rate": 1.007705992528759e-05, "loss": 4.2898, "loss/crossentropy": 1.8804664388298988, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19146862998604774, "step": 11344 }, { "epoch": 0.9455, "grad_norm": 4.59375, "grad_norm_var": 0.07424723307291667, "learning_rate": 1.0064693518343989e-05, "loss": 4.4927, "loss/crossentropy": 0.4384430721402168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.0893248300999403, "step": 11346 }, { "epoch": 0.9456666666666667, "grad_norm": 4.625, "grad_norm_var": 0.07740885416666667, "learning_rate": 1.0052361495383862e-05, "loss": 5.2412, "loss/crossentropy": 1.9607902467250824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19718700274825096, "step": 11348 }, { "epoch": 0.9458333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.06324462890625, "learning_rate": 1.0040063886835247e-05, "loss": 5.3429, "loss/crossentropy": 2.4686543345451355, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21067270264029503, "step": 11350 }, { "epoch": 0.946, "grad_norm": 4.65625, "grad_norm_var": 0.067578125, "learning_rate": 1.0027800723041284e-05, "loss": 4.4446, "loss/crossentropy": 2.339235484600067, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19029224291443825, "step": 11352 }, { "epoch": 0.9461666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.07734375, "learning_rate": 1.0015572034260092e-05, "loss": 4.3474, "loss/crossentropy": 0.8812186121940613, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1112806610763073, "step": 11354 }, { "epoch": 0.9463333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.06470947265625, "learning_rate": 1.0003377850664759e-05, "loss": 5.0562, "loss/crossentropy": 1.98075682669878, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17196648381650448, "step": 11356 }, { "epoch": 0.9465, "grad_norm": 4.46875, "grad_norm_var": 0.07298177083333333, "learning_rate": 9.991218202343211e-06, "loss": 4.6137, "loss/crossentropy": 2.13834910094738, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17433346435427666, "step": 11358 }, { "epoch": 0.9466666666666667, "grad_norm": 4.625, "grad_norm_var": 0.07316080729166667, "learning_rate": 9.979093119298187e-06, "loss": 5.0153, "loss/crossentropy": 1.312939204275608, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14639427699148655, "step": 11360 }, { "epoch": 0.9468333333333333, "grad_norm": 4.5, "grad_norm_var": 0.06573893229166666, "learning_rate": 9.967002631447104e-06, "loss": 4.6521, "loss/crossentropy": 1.6801223307847977, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17429677583277225, "step": 11362 }, { "epoch": 0.947, "grad_norm": 4.5, "grad_norm_var": 0.04957275390625, "learning_rate": 9.954946768622056e-06, "loss": 4.7481, "loss/crossentropy": 1.2624147459864616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15043756738305092, "step": 11364 }, { "epoch": 0.9471666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.04495035807291667, "learning_rate": 9.942925560569677e-06, "loss": 4.8214, "loss/crossentropy": 1.251896284520626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13010199926793575, "step": 11366 }, { "epoch": 0.9473333333333334, "grad_norm": 4.625, "grad_norm_var": 0.040087890625, "learning_rate": 9.930939036951104e-06, "loss": 4.4365, "loss/crossentropy": 1.8336703404784203, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19744067266583443, "step": 11368 }, { "epoch": 0.9475, "grad_norm": 4.5, "grad_norm_var": 0.03287353515625, "learning_rate": 9.91898722734189e-06, "loss": 5.1877, "loss/crossentropy": 2.477368474006653, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20827669277787209, "step": 11370 }, { "epoch": 0.9476666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.03606363932291667, "learning_rate": 9.907070161231944e-06, "loss": 4.8895, "loss/crossentropy": 2.3553980588912964, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20754562690854073, "step": 11372 }, { "epoch": 0.9478333333333333, "grad_norm": 4.9375, "grad_norm_var": 0.04159749348958333, "learning_rate": 9.89518786802544e-06, "loss": 5.143, "loss/crossentropy": 2.1513184905052185, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19551891088485718, "step": 11374 }, { "epoch": 0.948, "grad_norm": 4.4375, "grad_norm_var": 0.03570556640625, "learning_rate": 9.883340377040752e-06, "loss": 5.258, "loss/crossentropy": 1.9037449285387993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17840158566832542, "step": 11376 }, { "epoch": 0.9481666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.03644205729166667, "learning_rate": 9.871527717510382e-06, "loss": 4.9532, "loss/crossentropy": 2.183581203222275, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22390246018767357, "step": 11378 }, { "epoch": 0.9483333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.03284098307291667, "learning_rate": 9.859749918580906e-06, "loss": 4.7762, "loss/crossentropy": 2.2573187053203583, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075618952512741, "step": 11380 }, { "epoch": 0.9485, "grad_norm": 4.34375, "grad_norm_var": 0.04192301432291667, "learning_rate": 9.848007009312865e-06, "loss": 4.7298, "loss/crossentropy": 1.5737459063529968, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19046054407954216, "step": 11382 }, { "epoch": 0.9486666666666667, "grad_norm": 4.375, "grad_norm_var": 0.052144368489583336, "learning_rate": 9.836299018680719e-06, "loss": 4.4557, "loss/crossentropy": 1.3081732392311096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14434547536075115, "step": 11384 }, { "epoch": 0.9488333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.05185139973958333, "learning_rate": 9.82462597557277e-06, "loss": 4.8961, "loss/crossentropy": 2.0935000479221344, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17151028290390968, "step": 11386 }, { "epoch": 0.949, "grad_norm": 4.71875, "grad_norm_var": 0.051806640625, "learning_rate": 9.812987908791095e-06, "loss": 5.3142, "loss/crossentropy": 2.3687087893486023, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20503158122301102, "step": 11388 }, { "epoch": 0.9491666666666667, "grad_norm": 4.625, "grad_norm_var": 0.03186442057291667, "learning_rate": 9.80138484705147e-06, "loss": 5.0832, "loss/crossentropy": 2.0034788250923157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18592121824622154, "step": 11390 }, { "epoch": 0.9493333333333334, "grad_norm": 4.15625, "grad_norm_var": 0.04034830729166667, "learning_rate": 9.78981681898329e-06, "loss": 4.6754, "loss/crossentropy": 2.1087347492575645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17885474488139153, "step": 11392 }, { "epoch": 0.9495, "grad_norm": 4.53125, "grad_norm_var": 0.058203125, "learning_rate": 9.778283853129514e-06, "loss": 4.678, "loss/crossentropy": 1.7178971394896507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1700619626790285, "step": 11394 }, { "epoch": 0.9496666666666667, "grad_norm": 4.5, "grad_norm_var": 0.06832275390625, "learning_rate": 9.766785977946597e-06, "loss": 4.9998, "loss/crossentropy": 1.9785993993282318, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2348547950387001, "step": 11396 }, { "epoch": 0.9498333333333333, "grad_norm": 5.03125, "grad_norm_var": 0.07381184895833333, "learning_rate": 9.75532322180439e-06, "loss": 4.783, "loss/crossentropy": 2.074269473552704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21151044219732285, "step": 11398 }, { "epoch": 0.95, "grad_norm": 4.0625, "grad_norm_var": 0.07394205729166667, "learning_rate": 9.743895612986116e-06, "loss": 4.217, "loss/crossentropy": 1.7610245794057846, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.170846126973629, "step": 11400 }, { "epoch": 0.9501666666666667, "grad_norm": 4.875, "grad_norm_var": 0.08157145182291667, "learning_rate": 9.73250317968826e-06, "loss": 4.9011, "loss/crossentropy": 1.725033387541771, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19924123957753181, "step": 11402 }, { "epoch": 0.9503333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.08974202473958333, "learning_rate": 9.721145950020516e-06, "loss": 4.3106, "loss/crossentropy": 0.96659966558218, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1814847458153963, "step": 11404 }, { "epoch": 0.9505, "grad_norm": 4.34375, "grad_norm_var": 0.10403645833333333, "learning_rate": 9.70982395200572e-06, "loss": 4.8238, "loss/crossentropy": 2.1273521780967712, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20532144233584404, "step": 11406 }, { "epoch": 0.9506666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.08800455729166666, "learning_rate": 9.698537213579781e-06, "loss": 4.8639, "loss/crossentropy": 2.0131620913743973, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16216029226779938, "step": 11408 }, { "epoch": 0.9508333333333333, "grad_norm": 3.828125, "grad_norm_var": 0.11118062337239583, "learning_rate": 9.687285762591601e-06, "loss": 3.844, "loss/crossentropy": 1.404318891465664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15305296704173088, "step": 11410 }, { "epoch": 0.951, "grad_norm": 4.5625, "grad_norm_var": 0.11960347493489583, "learning_rate": 9.676069626803016e-06, "loss": 5.4016, "loss/crossentropy": 2.029324918985367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23561448231339455, "step": 11412 }, { "epoch": 0.9511666666666667, "grad_norm": 4.125, "grad_norm_var": 0.1167144775390625, "learning_rate": 9.664888833888724e-06, "loss": 4.7174, "loss/crossentropy": 2.380665957927704, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15510449546854943, "step": 11414 }, { "epoch": 0.9513333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.10854390462239584, "learning_rate": 9.653743411436227e-06, "loss": 5.0389, "loss/crossentropy": 2.1694408655166626, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21396595984697342, "step": 11416 }, { "epoch": 0.9515, "grad_norm": 4.25, "grad_norm_var": 0.10562235514322917, "learning_rate": 9.642633386945742e-06, "loss": 4.4986, "loss/crossentropy": 2.210035800933838, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2272140271961689, "step": 11418 }, { "epoch": 0.9516666666666667, "grad_norm": 5.125, "grad_norm_var": 0.12150777180989583, "learning_rate": 9.631558787830153e-06, "loss": 4.8095, "loss/crossentropy": 2.4195462465286255, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20426450297236443, "step": 11420 }, { "epoch": 0.9518333333333333, "grad_norm": 4.1875, "grad_norm_var": 0.1145416259765625, "learning_rate": 9.620519641414926e-06, "loss": 4.7143, "loss/crossentropy": 2.499524176120758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20367558673024178, "step": 11422 }, { "epoch": 0.952, "grad_norm": 4.53125, "grad_norm_var": 0.1120513916015625, "learning_rate": 9.609515974938064e-06, "loss": 5.2156, "loss/crossentropy": 2.4850030541419983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20476289093494415, "step": 11424 }, { "epoch": 0.9521666666666667, "grad_norm": 4.8125, "grad_norm_var": 0.08566080729166667, "learning_rate": 9.59854781555002e-06, "loss": 4.6327, "loss/crossentropy": 1.6624226868152618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1814777236431837, "step": 11426 }, { "epoch": 0.9523333333333334, "grad_norm": 4.40625, "grad_norm_var": 0.07615559895833333, "learning_rate": 9.587615190313633e-06, "loss": 4.4029, "loss/crossentropy": 1.5843016058206558, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17348378524184227, "step": 11428 }, { "epoch": 0.9525, "grad_norm": 4.78125, "grad_norm_var": 0.06573893229166666, "learning_rate": 9.576718126204069e-06, "loss": 4.5239, "loss/crossentropy": 2.3472007513046265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20952258631587029, "step": 11430 }, { "epoch": 0.9526666666666667, "grad_norm": 4.875, "grad_norm_var": 0.06991780598958333, "learning_rate": 9.565856650108758e-06, "loss": 5.1015, "loss/crossentropy": 1.7776538357138634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2009373940527439, "step": 11432 }, { "epoch": 0.9528333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.06300455729166667, "learning_rate": 9.555030788827302e-06, "loss": 5.2163, "loss/crossentropy": 1.7030307799577713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17652593553066254, "step": 11434 }, { "epoch": 0.953, "grad_norm": 4.71875, "grad_norm_var": 0.04342447916666667, "learning_rate": 9.544240569071444e-06, "loss": 4.8409, "loss/crossentropy": 1.8213524222373962, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17334122583270073, "step": 11436 }, { "epoch": 0.9531666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.034956868489583334, "learning_rate": 9.533486017464979e-06, "loss": 5.0472, "loss/crossentropy": 2.128684014081955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20218900591135025, "step": 11438 }, { "epoch": 0.9533333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.03752848307291667, "learning_rate": 9.522767160543692e-06, "loss": 5.3667, "loss/crossentropy": 2.5350887775421143, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2353416383266449, "step": 11440 }, { "epoch": 0.9535, "grad_norm": 4.53125, "grad_norm_var": 0.03162434895833333, "learning_rate": 9.512084024755293e-06, "loss": 4.6945, "loss/crossentropy": 1.500010333955288, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1599786952137947, "step": 11442 }, { "epoch": 0.9536666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.027046712239583333, "learning_rate": 9.501436636459364e-06, "loss": 4.7603, "loss/crossentropy": 2.3232173323631287, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2208048515021801, "step": 11444 }, { "epoch": 0.9538333333333333, "grad_norm": 4.25, "grad_norm_var": 0.03137613932291667, "learning_rate": 9.490825021927276e-06, "loss": 4.4929, "loss/crossentropy": 1.404862955212593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14171510562300682, "step": 11446 }, { "epoch": 0.954, "grad_norm": 5.03125, "grad_norm_var": 0.0388671875, "learning_rate": 9.48024920734213e-06, "loss": 5.2693, "loss/crossentropy": 1.9854433834552765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20603374019265175, "step": 11448 }, { "epoch": 0.9541666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.047900390625, "learning_rate": 9.4697092187987e-06, "loss": 4.7512, "loss/crossentropy": 2.452913224697113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20292839780449867, "step": 11450 }, { "epoch": 0.9543333333333334, "grad_norm": 4.59375, "grad_norm_var": 0.04527587890625, "learning_rate": 9.459205082303359e-06, "loss": 4.8945, "loss/crossentropy": 2.0005833134055138, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19060774892568588, "step": 11452 }, { "epoch": 0.9545, "grad_norm": 4.6875, "grad_norm_var": 0.05230712890625, "learning_rate": 9.44873682377402e-06, "loss": 4.4555, "loss/crossentropy": 1.4971669167280197, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15944399684667587, "step": 11454 }, { "epoch": 0.9546666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.04869384765625, "learning_rate": 9.43830446904007e-06, "loss": 5.0422, "loss/crossentropy": 2.2370823323726654, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1956566758453846, "step": 11456 }, { "epoch": 0.9548333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.047119140625, "learning_rate": 9.427908043842305e-06, "loss": 4.706, "loss/crossentropy": 2.28358057141304, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20159728825092316, "step": 11458 }, { "epoch": 0.955, "grad_norm": 4.40625, "grad_norm_var": 0.04745686848958333, "learning_rate": 9.417547573832876e-06, "loss": 5.2039, "loss/crossentropy": 2.2083005011081696, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2086111195385456, "step": 11460 }, { "epoch": 0.9551666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.048628743489583334, "learning_rate": 9.40722308457521e-06, "loss": 4.5374, "loss/crossentropy": 2.4701362252235413, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2123345211148262, "step": 11462 }, { "epoch": 0.9553333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.02965087890625, "learning_rate": 9.396934601543957e-06, "loss": 4.9871, "loss/crossentropy": 1.1636821255087852, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1338321641087532, "step": 11464 }, { "epoch": 0.9555, "grad_norm": 4.21875, "grad_norm_var": 0.023893229166666665, "learning_rate": 9.386682150124923e-06, "loss": 4.9635, "loss/crossentropy": 2.139076389372349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16577439568936825, "step": 11466 }, { "epoch": 0.9556666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.028999837239583333, "learning_rate": 9.376465755615024e-06, "loss": 4.4825, "loss/crossentropy": 1.589639350771904, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1894889585673809, "step": 11468 }, { "epoch": 0.9558333333333333, "grad_norm": 5.0, "grad_norm_var": 0.043863932291666664, "learning_rate": 9.366285443222183e-06, "loss": 4.8079, "loss/crossentropy": 1.9420047849416733, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19465864449739456, "step": 11470 }, { "epoch": 0.956, "grad_norm": 4.21875, "grad_norm_var": 0.04607747395833333, "learning_rate": 9.35614123806532e-06, "loss": 4.5104, "loss/crossentropy": 2.0238417387008667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1953933723270893, "step": 11472 }, { "epoch": 0.9561666666666667, "grad_norm": 6.25, "grad_norm_var": 0.24599202473958334, "learning_rate": 9.346033165174249e-06, "loss": 5.0515, "loss/crossentropy": 2.287261486053467, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21523414179682732, "step": 11474 }, { "epoch": 0.9563333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.24566650390625, "learning_rate": 9.335961249489635e-06, "loss": 4.8473, "loss/crossentropy": 2.182397872209549, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18900325149297714, "step": 11476 }, { "epoch": 0.9565, "grad_norm": 4.5, "grad_norm_var": 0.24120686848958334, "learning_rate": 9.325925515862926e-06, "loss": 5.0944, "loss/crossentropy": 1.7978277504444122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19625483080744743, "step": 11478 }, { "epoch": 0.9566666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.24230143229166667, "learning_rate": 9.315925989056303e-06, "loss": 4.9761, "loss/crossentropy": 2.6512285470962524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22339925542473793, "step": 11480 }, { "epoch": 0.9568333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.23248291015625, "learning_rate": 9.305962693742601e-06, "loss": 5.4635, "loss/crossentropy": 2.1777456402778625, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22406087815761566, "step": 11482 }, { "epoch": 0.957, "grad_norm": 4.5625, "grad_norm_var": 0.21929931640625, "learning_rate": 9.296035654505261e-06, "loss": 5.3445, "loss/crossentropy": 1.466300867497921, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1500567179173231, "step": 11484 }, { "epoch": 0.9571666666666667, "grad_norm": 4.71875, "grad_norm_var": 0.213134765625, "learning_rate": 9.286144895838262e-06, "loss": 5.4844, "loss/crossentropy": 2.4218156337738037, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21662846207618713, "step": 11486 }, { "epoch": 0.9573333333333334, "grad_norm": 4.34375, "grad_norm_var": 0.20597330729166666, "learning_rate": 9.276290442146075e-06, "loss": 4.6464, "loss/crossentropy": 1.5975105240941048, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16215290501713753, "step": 11488 }, { "epoch": 0.9575, "grad_norm": 4.625, "grad_norm_var": 0.029427083333333333, "learning_rate": 9.266472317743582e-06, "loss": 4.6487, "loss/crossentropy": 1.2481160312891006, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13352661207318306, "step": 11490 }, { "epoch": 0.9576666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.027718098958333333, "learning_rate": 9.256690546856028e-06, "loss": 4.895, "loss/crossentropy": 1.5689271241426468, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1847672201693058, "step": 11492 }, { "epoch": 0.9578333333333333, "grad_norm": 4.875, "grad_norm_var": 0.029231770833333334, "learning_rate": 9.246945153618955e-06, "loss": 5.2113, "loss/crossentropy": 2.406019926071167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23838234320282936, "step": 11494 }, { "epoch": 0.958, "grad_norm": 4.65625, "grad_norm_var": 0.032938639322916664, "learning_rate": 9.237236162078162e-06, "loss": 4.6259, "loss/crossentropy": 2.4825395345687866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2015017382800579, "step": 11496 }, { "epoch": 0.9581666666666667, "grad_norm": 4.5, "grad_norm_var": 0.033056640625, "learning_rate": 9.227563596189619e-06, "loss": 5.034, "loss/crossentropy": 1.7402432262897491, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.176887271925807, "step": 11498 }, { "epoch": 0.9583333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.03631184895833333, "learning_rate": 9.217927479819413e-06, "loss": 4.3762, "loss/crossentropy": 1.77406807243824, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18127808719873428, "step": 11500 }, { "epoch": 0.9585, "grad_norm": 4.0, "grad_norm_var": 0.04517822265625, "learning_rate": 9.208327836743711e-06, "loss": 4.5285, "loss/crossentropy": 1.7723116129636765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1636607013642788, "step": 11502 }, { "epoch": 0.9586666666666667, "grad_norm": 4.5, "grad_norm_var": 0.048502604166666664, "learning_rate": 9.198764690648673e-06, "loss": 4.4688, "loss/crossentropy": 1.9188175573945045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1713660228997469, "step": 11504 }, { "epoch": 0.9588333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.04698893229166667, "learning_rate": 9.189238065130415e-06, "loss": 4.9859, "loss/crossentropy": 2.25072905421257, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18884330987930298, "step": 11506 }, { "epoch": 0.959, "grad_norm": 4.4375, "grad_norm_var": 0.046858723958333334, "learning_rate": 9.179747983694935e-06, "loss": 4.478, "loss/crossentropy": 2.0359173715114594, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1767810508608818, "step": 11508 }, { "epoch": 0.9591666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.038374837239583334, "learning_rate": 9.170294469758068e-06, "loss": 4.9137, "loss/crossentropy": 1.9276671707630157, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19128254242241383, "step": 11510 }, { "epoch": 0.9593333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.04195556640625, "learning_rate": 9.16087754664542e-06, "loss": 5.1605, "loss/crossentropy": 2.04610376060009, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1933000460267067, "step": 11512 }, { "epoch": 0.9595, "grad_norm": 4.75, "grad_norm_var": 0.04224853515625, "learning_rate": 9.151497237592314e-06, "loss": 5.3041, "loss/crossentropy": 2.4091951847076416, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2099679596722126, "step": 11514 }, { "epoch": 0.9596666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.047261555989583336, "learning_rate": 9.142153565743724e-06, "loss": 4.9227, "loss/crossentropy": 1.4057952463626862, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13871552795171738, "step": 11516 }, { "epoch": 0.9598333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.031119791666666667, "learning_rate": 9.132846554154239e-06, "loss": 4.6217, "loss/crossentropy": 2.13298699259758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2115863598883152, "step": 11518 }, { "epoch": 0.96, "grad_norm": 4.375, "grad_norm_var": 0.036702473958333336, "learning_rate": 9.12357622578798e-06, "loss": 4.3296, "loss/crossentropy": 2.031085819005966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2061699703335762, "step": 11520 }, { "epoch": 0.9601666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.060380045572916666, "learning_rate": 9.114342603518563e-06, "loss": 4.9346, "loss/crossentropy": 1.6019954681396484, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1720646657049656, "step": 11522 }, { "epoch": 0.9603333333333334, "grad_norm": 5.0625, "grad_norm_var": 0.077978515625, "learning_rate": 9.105145710129028e-06, "loss": 4.7582, "loss/crossentropy": 0.9307103678584099, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.11513753607869148, "step": 11524 }, { "epoch": 0.9605, "grad_norm": 4.5625, "grad_norm_var": 0.07886962890625, "learning_rate": 9.095985568311806e-06, "loss": 5.6277, "loss/crossentropy": 2.263214409351349, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19302452355623245, "step": 11526 }, { "epoch": 0.9606666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.0796875, "learning_rate": 9.086862200668626e-06, "loss": 4.5774, "loss/crossentropy": 1.8522422462701797, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1796627752482891, "step": 11528 }, { "epoch": 0.9608333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.07847900390625, "learning_rate": 9.077775629710496e-06, "loss": 4.8885, "loss/crossentropy": 1.6978005468845367, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16846343874931335, "step": 11530 }, { "epoch": 0.961, "grad_norm": 4.96875, "grad_norm_var": 0.0826171875, "learning_rate": 9.068725877857623e-06, "loss": 5.4411, "loss/crossentropy": 2.5143747329711914, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2164710983633995, "step": 11532 }, { "epoch": 0.9611666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.0802734375, "learning_rate": 9.059712967439377e-06, "loss": 4.7038, "loss/crossentropy": 1.7296985238790512, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15602321550250053, "step": 11534 }, { "epoch": 0.9613333333333334, "grad_norm": 4.5, "grad_norm_var": 0.07636311848958334, "learning_rate": 9.050736920694208e-06, "loss": 5.0484, "loss/crossentropy": 2.0967109203338623, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19041990116238594, "step": 11536 }, { "epoch": 0.9615, "grad_norm": 4.21875, "grad_norm_var": 0.06404622395833333, "learning_rate": 9.041797759769628e-06, "loss": 4.5043, "loss/crossentropy": 2.400757282972336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.213943500071764, "step": 11538 }, { "epoch": 0.9616666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.09295247395833334, "learning_rate": 9.032895506722125e-06, "loss": 5.113, "loss/crossentropy": 2.5202205777168274, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2162243202328682, "step": 11540 }, { "epoch": 0.9618333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.11480712890625, "learning_rate": 9.024030183517124e-06, "loss": 4.9387, "loss/crossentropy": 2.189281791448593, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20729172229766846, "step": 11542 }, { "epoch": 0.962, "grad_norm": 4.28125, "grad_norm_var": 0.12498372395833333, "learning_rate": 9.015201812028924e-06, "loss": 4.2009, "loss/crossentropy": 2.25010347366333, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20368079841136932, "step": 11544 }, { "epoch": 0.9621666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.134375, "learning_rate": 9.006410414040662e-06, "loss": 4.8032, "loss/crossentropy": 1.898626983165741, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16156602837145329, "step": 11546 }, { "epoch": 0.9623333333333334, "grad_norm": 4.28125, "grad_norm_var": 0.13006184895833334, "learning_rate": 8.997656011244232e-06, "loss": 5.0192, "loss/crossentropy": 2.262854278087616, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19358738511800766, "step": 11548 }, { "epoch": 0.9625, "grad_norm": 4.46875, "grad_norm_var": 0.13123372395833333, "learning_rate": 8.988938625240257e-06, "loss": 4.6301, "loss/crossentropy": 2.4176487922668457, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21823088079690933, "step": 11550 }, { "epoch": 0.9626666666666667, "grad_norm": 4.5, "grad_norm_var": 0.109619140625, "learning_rate": 8.980258277538017e-06, "loss": 5.0695, "loss/crossentropy": 2.1023247241973877, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19244016706943512, "step": 11552 }, { "epoch": 0.9628333333333333, "grad_norm": 4.625, "grad_norm_var": 0.10989176432291667, "learning_rate": 8.971614989555408e-06, "loss": 4.7315, "loss/crossentropy": 1.242757223546505, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13383663445711136, "step": 11554 }, { "epoch": 0.963, "grad_norm": 4.3125, "grad_norm_var": 0.04986979166666667, "learning_rate": 8.963008782618887e-06, "loss": 5.2009, "loss/crossentropy": 1.6220801323652267, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1612663622945547, "step": 11556 }, { "epoch": 0.9631666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.015169270833333333, "learning_rate": 8.954439677963411e-06, "loss": 4.6598, "loss/crossentropy": 1.3346295356750488, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1350972019135952, "step": 11558 }, { "epoch": 0.9633333333333334, "grad_norm": 4.09375, "grad_norm_var": 0.034228515625, "learning_rate": 8.945907696732395e-06, "loss": 4.761, "loss/crossentropy": 1.9910719692707062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22182273492217064, "step": 11560 }, { "epoch": 0.9635, "grad_norm": 4.6875, "grad_norm_var": 0.03553059895833333, "learning_rate": 8.937412859977653e-06, "loss": 5.1157, "loss/crossentropy": 2.031724736094475, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.179793706163764, "step": 11562 }, { "epoch": 0.9636666666666667, "grad_norm": 4.75, "grad_norm_var": 0.04021809895833333, "learning_rate": 8.928955188659353e-06, "loss": 5.2434, "loss/crossentropy": 1.5864408761262894, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1709186527878046, "step": 11564 }, { "epoch": 0.9638333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.03974202473958333, "learning_rate": 8.920534703645955e-06, "loss": 4.3488, "loss/crossentropy": 1.995065838098526, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19275368005037308, "step": 11566 }, { "epoch": 0.964, "grad_norm": 4.71875, "grad_norm_var": 0.04778238932291667, "learning_rate": 8.912151425714168e-06, "loss": 4.2521, "loss/crossentropy": 1.6531179696321487, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17187139578163624, "step": 11568 }, { "epoch": 0.9641666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.07861226399739583, "learning_rate": 8.903805375548904e-06, "loss": 4.1241, "loss/crossentropy": 1.9064756259322166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16551323048770428, "step": 11570 }, { "epoch": 0.9643333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.0802398681640625, "learning_rate": 8.895496573743207e-06, "loss": 5.0267, "loss/crossentropy": 2.3756193816661835, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20273981615900993, "step": 11572 }, { "epoch": 0.9645, "grad_norm": 4.1875, "grad_norm_var": 0.0854400634765625, "learning_rate": 8.887225040798218e-06, "loss": 4.7173, "loss/crossentropy": 2.5865076184272766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.206777635961771, "step": 11574 }, { "epoch": 0.9646666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.06680399576822917, "learning_rate": 8.878990797123125e-06, "loss": 5.0553, "loss/crossentropy": 2.5730031728744507, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20398882031440735, "step": 11576 }, { "epoch": 0.9648333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.10822652180989584, "learning_rate": 8.870793863035105e-06, "loss": 5.0463, "loss/crossentropy": 1.767984189093113, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17187254317104816, "step": 11578 }, { "epoch": 0.965, "grad_norm": 4.46875, "grad_norm_var": 0.11066792805989584, "learning_rate": 8.862634258759277e-06, "loss": 4.7057, "loss/crossentropy": 1.9222623482346535, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19420848414301872, "step": 11580 }, { "epoch": 0.9651666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.11227925618489583, "learning_rate": 8.854512004428653e-06, "loss": 4.6709, "loss/crossentropy": 1.7106484100222588, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16590185649693012, "step": 11582 }, { "epoch": 0.9653333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.10446675618489583, "learning_rate": 8.846427120084094e-06, "loss": 4.9918, "loss/crossentropy": 2.2791011333465576, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21460538357496262, "step": 11584 }, { "epoch": 0.9655, "grad_norm": 4.25, "grad_norm_var": 0.07740478515625, "learning_rate": 8.838379625674243e-06, "loss": 5.0301, "loss/crossentropy": 1.9393450617790222, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23692452535033226, "step": 11586 }, { "epoch": 0.9656666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.08001302083333334, "learning_rate": 8.83036954105549e-06, "loss": 4.935, "loss/crossentropy": 1.323600873351097, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1447231750935316, "step": 11588 }, { "epoch": 0.9658333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.08019205729166666, "learning_rate": 8.822396885991927e-06, "loss": 5.1718, "loss/crossentropy": 2.269530236721039, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2282923273742199, "step": 11590 }, { "epoch": 0.966, "grad_norm": 4.65625, "grad_norm_var": 0.08072916666666667, "learning_rate": 8.81446168015529e-06, "loss": 5.4426, "loss/crossentropy": 2.427491843700409, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2319641038775444, "step": 11592 }, { "epoch": 0.9661666666666666, "grad_norm": 4.5625, "grad_norm_var": 0.038895670572916666, "learning_rate": 8.806563943124903e-06, "loss": 4.935, "loss/crossentropy": 1.817050889134407, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16897720657289028, "step": 11594 }, { "epoch": 0.9663333333333334, "grad_norm": 4.65625, "grad_norm_var": 0.03253580729166667, "learning_rate": 8.798703694387653e-06, "loss": 4.9714, "loss/crossentropy": 2.0594170689582825, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18061714619398117, "step": 11596 }, { "epoch": 0.9665, "grad_norm": 4.28125, "grad_norm_var": 0.03372395833333333, "learning_rate": 8.790880953337921e-06, "loss": 4.8757, "loss/crossentropy": 2.0173055678606033, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18694764375686646, "step": 11598 }, { "epoch": 0.9666666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.036572265625, "learning_rate": 8.783095739277544e-06, "loss": 4.7308, "loss/crossentropy": 2.0797626599669456, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18865588307380676, "step": 11600 }, { "epoch": 0.9668333333333333, "grad_norm": 4.625, "grad_norm_var": 0.03502604166666667, "learning_rate": 8.775348071415762e-06, "loss": 4.8921, "loss/crossentropy": 1.6193000376224518, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17745701409876347, "step": 11602 }, { "epoch": 0.967, "grad_norm": 4.5625, "grad_norm_var": 0.0333984375, "learning_rate": 8.767637968869175e-06, "loss": 5.0154, "loss/crossentropy": 2.262195646762848, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19456271454691887, "step": 11604 }, { "epoch": 0.9671666666666666, "grad_norm": 4.09375, "grad_norm_var": 0.043603515625, "learning_rate": 8.759965450661698e-06, "loss": 4.8588, "loss/crossentropy": 1.9921872094273567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18706602230668068, "step": 11606 }, { "epoch": 0.9673333333333334, "grad_norm": 4.5625, "grad_norm_var": 0.04299723307291667, "learning_rate": 8.752330535724502e-06, "loss": 5.1061, "loss/crossentropy": 1.6814751327037811, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17119668051600456, "step": 11608 }, { "epoch": 0.9675, "grad_norm": 4.46875, "grad_norm_var": 0.043603515625, "learning_rate": 8.744733242895983e-06, "loss": 5.0926, "loss/crossentropy": 1.2483708560466766, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13527445308864117, "step": 11610 }, { "epoch": 0.9676666666666667, "grad_norm": 4.375, "grad_norm_var": 0.042578125, "learning_rate": 8.737173590921707e-06, "loss": 4.8308, "loss/crossentropy": 1.6758858039975166, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1577359363436699, "step": 11612 }, { "epoch": 0.9678333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.03917643229166667, "learning_rate": 8.729651598454359e-06, "loss": 5.4152, "loss/crossentropy": 1.8468017801642418, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17877262830734253, "step": 11614 }, { "epoch": 0.968, "grad_norm": 4.46875, "grad_norm_var": 0.03763020833333333, "learning_rate": 8.722167284053714e-06, "loss": 4.6742, "loss/crossentropy": 2.1274545565247536, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16794894263148308, "step": 11616 }, { "epoch": 0.9681666666666666, "grad_norm": 4.21875, "grad_norm_var": 0.03189697265625, "learning_rate": 8.71472066618657e-06, "loss": 4.6825, "loss/crossentropy": 2.20508149266243, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2075282223522663, "step": 11618 }, { "epoch": 0.9683333333333334, "grad_norm": 4.75, "grad_norm_var": 0.027274576822916667, "learning_rate": 8.707311763226719e-06, "loss": 4.2036, "loss/crossentropy": 1.425516776740551, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1791730523109436, "step": 11620 }, { "epoch": 0.9685, "grad_norm": 4.21875, "grad_norm_var": 0.017513020833333334, "learning_rate": 8.699940593454892e-06, "loss": 4.9838, "loss/crossentropy": 2.060584656894207, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1716004889458418, "step": 11622 }, { "epoch": 0.9686666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.027632649739583334, "learning_rate": 8.692607175058713e-06, "loss": 4.9354, "loss/crossentropy": 1.3096114546060562, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1387592125684023, "step": 11624 }, { "epoch": 0.9688333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.029280598958333334, "learning_rate": 8.685311526132668e-06, "loss": 4.7851, "loss/crossentropy": 1.965927578508854, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17507245764136314, "step": 11626 }, { "epoch": 0.969, "grad_norm": 4.0625, "grad_norm_var": 0.03958333333333333, "learning_rate": 8.678053664678045e-06, "loss": 4.3549, "loss/crossentropy": 1.4416181147098541, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15535733103752136, "step": 11628 }, { "epoch": 0.9691666666666666, "grad_norm": 4.375, "grad_norm_var": 0.052469889322916664, "learning_rate": 8.670833608602895e-06, "loss": 4.8076, "loss/crossentropy": 1.90502218157053, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1777823492884636, "step": 11630 }, { "epoch": 0.9693333333333334, "grad_norm": 4.625, "grad_norm_var": 0.060835774739583334, "learning_rate": 8.663651375721986e-06, "loss": 4.7508, "loss/crossentropy": 1.7098061069846153, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1836803499609232, "step": 11632 }, { "epoch": 0.9695, "grad_norm": 4.71875, "grad_norm_var": 0.05601806640625, "learning_rate": 8.656506983756768e-06, "loss": 5.2306, "loss/crossentropy": 1.4646401852369308, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14851868897676468, "step": 11634 }, { "epoch": 0.9696666666666667, "grad_norm": 4.6875, "grad_norm_var": 0.051590983072916666, "learning_rate": 8.649400450335316e-06, "loss": 5.0876, "loss/crossentropy": 2.236992657184601, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19888271763920784, "step": 11636 }, { "epoch": 0.9698333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.04724934895833333, "learning_rate": 8.642331792992293e-06, "loss": 5.0013, "loss/crossentropy": 2.04328054189682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21399332210421562, "step": 11638 }, { "epoch": 0.97, "grad_norm": 5.03125, "grad_norm_var": 0.05803629557291667, "learning_rate": 8.635301029168912e-06, "loss": 4.7815, "loss/crossentropy": 1.3681641593575478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13712685741484165, "step": 11640 }, { "epoch": 0.9701666666666666, "grad_norm": 4.875, "grad_norm_var": 0.06041259765625, "learning_rate": 8.628308176212882e-06, "loss": 5.1699, "loss/crossentropy": 2.167069435119629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1925915852189064, "step": 11642 }, { "epoch": 0.9703333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.050764973958333334, "learning_rate": 8.62135325137837e-06, "loss": 4.2311, "loss/crossentropy": 1.7462330013513565, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1746540553867817, "step": 11644 }, { "epoch": 0.9705, "grad_norm": 4.1875, "grad_norm_var": 0.0578125, "learning_rate": 8.614436271825966e-06, "loss": 4.8348, "loss/crossentropy": 2.1338234543800354, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19719984009861946, "step": 11646 }, { "epoch": 0.9706666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.05416259765625, "learning_rate": 8.607557254622627e-06, "loss": 4.6722, "loss/crossentropy": 2.0075062438845634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16331494599580765, "step": 11648 }, { "epoch": 0.9708333333333333, "grad_norm": 4.75, "grad_norm_var": 0.0587890625, "learning_rate": 8.600716216741648e-06, "loss": 4.8223, "loss/crossentropy": 1.6360519081354141, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1850583702325821, "step": 11650 }, { "epoch": 0.971, "grad_norm": 4.21875, "grad_norm_var": 0.06848551432291666, "learning_rate": 8.5939131750626e-06, "loss": 4.8729, "loss/crossentropy": 2.094428636133671, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17381912097334862, "step": 11652 }, { "epoch": 0.9711666666666666, "grad_norm": 4.125, "grad_norm_var": 0.08971354166666666, "learning_rate": 8.587148146371323e-06, "loss": 4.4635, "loss/crossentropy": 1.7716087624430656, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17506087198853493, "step": 11654 }, { "epoch": 0.9713333333333334, "grad_norm": 4.625, "grad_norm_var": 0.07463785807291666, "learning_rate": 8.580421147359846e-06, "loss": 5.0966, "loss/crossentropy": 2.5120007693767548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2275298647582531, "step": 11656 }, { "epoch": 0.9715, "grad_norm": 4.25, "grad_norm_var": 0.07042643229166666, "learning_rate": 8.573732194626374e-06, "loss": 4.815, "loss/crossentropy": 2.480922281742096, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2106558196246624, "step": 11658 }, { "epoch": 0.9716666666666667, "grad_norm": 4.5625, "grad_norm_var": 0.06243489583333333, "learning_rate": 8.567081304675231e-06, "loss": 5.0735, "loss/crossentropy": 2.1009537279605865, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20914742723107338, "step": 11660 }, { "epoch": 0.9718333333333333, "grad_norm": 4.59375, "grad_norm_var": 0.05310872395833333, "learning_rate": 8.560468493916829e-06, "loss": 5.1147, "loss/crossentropy": 2.174714207649231, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22449615225195885, "step": 11662 }, { "epoch": 0.972, "grad_norm": 4.25, "grad_norm_var": 0.05829671223958333, "learning_rate": 8.553893778667619e-06, "loss": 4.4634, "loss/crossentropy": 1.4345918074250221, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13629802502691746, "step": 11664 }, { "epoch": 0.9721666666666666, "grad_norm": 4.09375, "grad_norm_var": 0.059891764322916666, "learning_rate": 8.54735717515006e-06, "loss": 4.6073, "loss/crossentropy": 1.4699408039450645, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17171907797455788, "step": 11666 }, { "epoch": 0.9723333333333334, "grad_norm": 4.8125, "grad_norm_var": 0.05836181640625, "learning_rate": 8.540858699492564e-06, "loss": 5.0484, "loss/crossentropy": 1.4010847359895706, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19796455651521683, "step": 11668 }, { "epoch": 0.9725, "grad_norm": 4.40625, "grad_norm_var": 0.04322509765625, "learning_rate": 8.534398367729485e-06, "loss": 5.1173, "loss/crossentropy": 1.8564397096633911, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20212604105472565, "step": 11670 }, { "epoch": 0.9726666666666667, "grad_norm": 4.375, "grad_norm_var": 0.04351806640625, "learning_rate": 8.52797619580104e-06, "loss": 5.2406, "loss/crossentropy": 1.4024172648787498, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14744648337364197, "step": 11672 }, { "epoch": 0.9728333333333333, "grad_norm": 4.375, "grad_norm_var": 0.04501546223958333, "learning_rate": 8.521592199553305e-06, "loss": 4.743, "loss/crossentropy": 1.6213370859622955, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17050425335764885, "step": 11674 }, { "epoch": 0.973, "grad_norm": 4.6875, "grad_norm_var": 0.05325113932291667, "learning_rate": 8.515246394738153e-06, "loss": 4.9606, "loss/crossentropy": 1.9720600247383118, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22468455135822296, "step": 11676 }, { "epoch": 0.9731666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.059895833333333336, "learning_rate": 8.50893879701323e-06, "loss": 5.2049, "loss/crossentropy": 2.5113608837127686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21483315154910088, "step": 11678 }, { "epoch": 0.9733333333333334, "grad_norm": 4.46875, "grad_norm_var": 0.074853515625, "learning_rate": 8.502669421941903e-06, "loss": 4.406, "loss/crossentropy": 1.8661476969718933, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17148572579026222, "step": 11680 }, { "epoch": 0.9735, "grad_norm": 4.34375, "grad_norm_var": 0.06451416015625, "learning_rate": 8.496438284993235e-06, "loss": 4.8, "loss/crossentropy": 2.2413404658436775, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1972121000289917, "step": 11682 }, { "epoch": 0.9736666666666667, "grad_norm": 4.0625, "grad_norm_var": 0.07109375, "learning_rate": 8.49024540154193e-06, "loss": 4.3588, "loss/crossentropy": 1.1721899956464767, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14115168899297714, "step": 11684 }, { "epoch": 0.9738333333333333, "grad_norm": 4.5, "grad_norm_var": 0.056103515625, "learning_rate": 8.484090786868324e-06, "loss": 5.1475, "loss/crossentropy": 2.327622562646866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22277278453111649, "step": 11686 }, { "epoch": 0.974, "grad_norm": 4.375, "grad_norm_var": 0.05767822265625, "learning_rate": 8.47797445615831e-06, "loss": 4.9333, "loss/crossentropy": 1.6267950534820557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1809178777039051, "step": 11688 }, { "epoch": 0.9741666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.05972900390625, "learning_rate": 8.471896424503321e-06, "loss": 5.206, "loss/crossentropy": 1.8035884648561478, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15355466306209564, "step": 11690 }, { "epoch": 0.9743333333333334, "grad_norm": 4.40625, "grad_norm_var": 0.05240478515625, "learning_rate": 8.465856706900305e-06, "loss": 4.7138, "loss/crossentropy": 2.031545266509056, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16774150729179382, "step": 11692 }, { "epoch": 0.9745, "grad_norm": 4.5, "grad_norm_var": 0.042801920572916666, "learning_rate": 8.459855318251661e-06, "loss": 5.1921, "loss/crossentropy": 2.504208981990814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22938504815101624, "step": 11694 }, { "epoch": 0.9746666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.029227701822916667, "learning_rate": 8.453892273365217e-06, "loss": 4.7644, "loss/crossentropy": 1.6472094357013702, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16006597690284252, "step": 11696 }, { "epoch": 0.9748333333333333, "grad_norm": 4.5, "grad_norm_var": 0.03290608723958333, "learning_rate": 8.447967586954199e-06, "loss": 4.7048, "loss/crossentropy": 1.7583764493465424, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18461447581648827, "step": 11698 }, { "epoch": 0.975, "grad_norm": 4.375, "grad_norm_var": 0.02076416015625, "learning_rate": 8.442081273637176e-06, "loss": 5.0457, "loss/crossentropy": 1.7519859299063683, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17579527385532856, "step": 11700 }, { "epoch": 0.9751666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.023111979166666668, "learning_rate": 8.436233347938044e-06, "loss": 4.6854, "loss/crossentropy": 1.455995261669159, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16757206618785858, "step": 11702 }, { "epoch": 0.9753333333333334, "grad_norm": 4.53125, "grad_norm_var": 0.021317545572916666, "learning_rate": 8.430423824285975e-06, "loss": 4.6304, "loss/crossentropy": 2.621677041053772, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20436899363994598, "step": 11704 }, { "epoch": 0.9755, "grad_norm": 4.25, "grad_norm_var": 0.030171712239583332, "learning_rate": 8.424652717015399e-06, "loss": 4.9472, "loss/crossentropy": 2.6113321185112, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2325909100472927, "step": 11706 }, { "epoch": 0.9756666666666667, "grad_norm": 4.5, "grad_norm_var": 0.026546223958333334, "learning_rate": 8.41892004036594e-06, "loss": 4.5195, "loss/crossentropy": 1.959231823682785, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18092519976198673, "step": 11708 }, { "epoch": 0.9758333333333333, "grad_norm": 4.875, "grad_norm_var": 0.03778889973958333, "learning_rate": 8.413225808482412e-06, "loss": 5.284, "loss/crossentropy": 2.698577105998993, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22032983228564262, "step": 11710 }, { "epoch": 0.976, "grad_norm": 4.46875, "grad_norm_var": 0.03720296223958333, "learning_rate": 8.407570035414765e-06, "loss": 4.8336, "loss/crossentropy": 0.9326044321060181, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12788059562444687, "step": 11712 }, { "epoch": 0.9761666666666666, "grad_norm": 4.03125, "grad_norm_var": 0.04550374348958333, "learning_rate": 8.401952735118062e-06, "loss": 4.3629, "loss/crossentropy": 2.06018128991127, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17738444730639458, "step": 11714 }, { "epoch": 0.9763333333333334, "grad_norm": 4.625, "grad_norm_var": 0.045182291666666666, "learning_rate": 8.396373921452428e-06, "loss": 5.0716, "loss/crossentropy": 1.9189767017960548, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18726221565157175, "step": 11716 }, { "epoch": 0.9765, "grad_norm": 4.28125, "grad_norm_var": 0.047379557291666666, "learning_rate": 8.390833608183029e-06, "loss": 5.023, "loss/crossentropy": 2.3186798691749573, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21441612765192986, "step": 11718 }, { "epoch": 0.9766666666666667, "grad_norm": 5.5625, "grad_norm_var": 0.12237955729166666, "learning_rate": 8.385331808980042e-06, "loss": 4.8558, "loss/crossentropy": 1.8235585540533066, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17181962355971336, "step": 11720 }, { "epoch": 0.9768333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.10935872395833333, "learning_rate": 8.37986853741861e-06, "loss": 4.9082, "loss/crossentropy": 1.8868702054023743, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18395415879786015, "step": 11722 }, { "epoch": 0.977, "grad_norm": 4.09375, "grad_norm_var": 0.12198893229166667, "learning_rate": 8.374443806978809e-06, "loss": 4.6495, "loss/crossentropy": 2.3554630279541016, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20914624631404877, "step": 11724 }, { "epoch": 0.9771666666666666, "grad_norm": 4.34375, "grad_norm_var": 0.14693603515625, "learning_rate": 8.369057631045622e-06, "loss": 4.9514, "loss/crossentropy": 1.8347747921943665, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17518590204417706, "step": 11726 }, { "epoch": 0.9773333333333334, "grad_norm": 4.71875, "grad_norm_var": 0.15188802083333333, "learning_rate": 8.363710022908906e-06, "loss": 5.1507, "loss/crossentropy": 2.025658816099167, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20820339396595955, "step": 11728 }, { "epoch": 0.9775, "grad_norm": 4.6875, "grad_norm_var": 0.13352864583333332, "learning_rate": 8.358400995763352e-06, "loss": 4.9741, "loss/crossentropy": 2.234781265258789, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21136652678251266, "step": 11730 }, { "epoch": 0.9776666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.13313802083333334, "learning_rate": 8.353130562708451e-06, "loss": 4.4836, "loss/crossentropy": 2.3086537420749664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21127301827073097, "step": 11732 }, { "epoch": 0.9778333333333333, "grad_norm": 4.5, "grad_norm_var": 0.12784830729166666, "learning_rate": 8.347898736748481e-06, "loss": 4.8491, "loss/crossentropy": 1.5660332068800926, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19694342091679573, "step": 11734 }, { "epoch": 0.978, "grad_norm": 4.78125, "grad_norm_var": 0.06549072265625, "learning_rate": 8.342705530792447e-06, "loss": 4.9907, "loss/crossentropy": 2.612446963787079, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22936904057860374, "step": 11736 }, { "epoch": 0.9781666666666666, "grad_norm": 4.375, "grad_norm_var": 0.06923421223958333, "learning_rate": 8.33755095765407e-06, "loss": 5.0026, "loss/crossentropy": 2.260433554649353, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20796746760606766, "step": 11738 }, { "epoch": 0.9783333333333334, "grad_norm": 4.375, "grad_norm_var": 0.15533447265625, "learning_rate": 8.332435030051747e-06, "loss": 4.7093, "loss/crossentropy": 2.0627638399600983, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2280513308942318, "step": 11740 }, { "epoch": 0.9785, "grad_norm": 4.375, "grad_norm_var": 0.13065999348958332, "learning_rate": 8.327357760608522e-06, "loss": 5.2809, "loss/crossentropy": 1.2782742008566856, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16142114251852036, "step": 11742 }, { "epoch": 0.9786666666666667, "grad_norm": 4.46875, "grad_norm_var": 0.12525634765625, "learning_rate": 8.322319161852052e-06, "loss": 5.4681, "loss/crossentropy": 2.36602121591568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2090701460838318, "step": 11744 }, { "epoch": 0.9788333333333333, "grad_norm": 4.625, "grad_norm_var": 0.12467447916666667, "learning_rate": 8.317319246214578e-06, "loss": 5.566, "loss/crossentropy": 1.765766218304634, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20317689701914787, "step": 11746 }, { "epoch": 0.979, "grad_norm": 4.53125, "grad_norm_var": 0.14269205729166667, "learning_rate": 8.31235802603289e-06, "loss": 4.831, "loss/crossentropy": 2.3671552538871765, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20855028927326202, "step": 11748 }, { "epoch": 0.9791666666666666, "grad_norm": 4.5, "grad_norm_var": 0.14560139973958333, "learning_rate": 8.307435513548314e-06, "loss": 4.8928, "loss/crossentropy": 2.134007513523102, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18522262014448643, "step": 11750 }, { "epoch": 0.9793333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.140087890625, "learning_rate": 8.302551720906658e-06, "loss": 5.2723, "loss/crossentropy": 2.4626063108444214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22433257102966309, "step": 11752 }, { "epoch": 0.9795, "grad_norm": 4.9375, "grad_norm_var": 0.139697265625, "learning_rate": 8.297706660158189e-06, "loss": 4.5625, "loss/crossentropy": 1.8229105174541473, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18363786488771439, "step": 11754 }, { "epoch": 0.9796666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.04208577473958333, "learning_rate": 8.29290034325762e-06, "loss": 4.7565, "loss/crossentropy": 2.0099611580371857, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18299407325685024, "step": 11756 }, { "epoch": 0.9798333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.03990885416666667, "learning_rate": 8.288132782064057e-06, "loss": 4.8898, "loss/crossentropy": 1.8129331469535828, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15079554915428162, "step": 11758 }, { "epoch": 0.98, "grad_norm": 4.59375, "grad_norm_var": 0.039453125, "learning_rate": 8.283403988340983e-06, "loss": 5.0303, "loss/crossentropy": 2.317029356956482, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21258693933486938, "step": 11760 }, { "epoch": 0.9801666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.04296468098958333, "learning_rate": 8.278713973756227e-06, "loss": 5.0208, "loss/crossentropy": 2.253770500421524, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18931128084659576, "step": 11762 }, { "epoch": 0.9803333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.027197265625, "learning_rate": 8.274062749881934e-06, "loss": 5.3963, "loss/crossentropy": 2.4488985538482666, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19674066081643105, "step": 11764 }, { "epoch": 0.9805, "grad_norm": 5.3125, "grad_norm_var": 0.06139322916666667, "learning_rate": 8.269450328194538e-06, "loss": 4.1609, "loss/crossentropy": 1.3078017458319664, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14418474957346916, "step": 11766 }, { "epoch": 0.9806666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.07688395182291667, "learning_rate": 8.264876720074727e-06, "loss": 5.0368, "loss/crossentropy": 2.1359574496746063, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1973453313112259, "step": 11768 }, { "epoch": 0.9808333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.06926676432291666, "learning_rate": 8.260341936807425e-06, "loss": 4.7631, "loss/crossentropy": 2.2247210144996643, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20966701954603195, "step": 11770 }, { "epoch": 0.981, "grad_norm": 4.75, "grad_norm_var": 0.07185872395833333, "learning_rate": 8.255845989581765e-06, "loss": 4.5269, "loss/crossentropy": 1.9601154178380966, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16586436331272125, "step": 11772 }, { "epoch": 0.9811666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.07239583333333334, "learning_rate": 8.251388889491044e-06, "loss": 4.6656, "loss/crossentropy": 1.9562703296542168, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190062141045928, "step": 11774 }, { "epoch": 0.9813333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.07121988932291666, "learning_rate": 8.246970647532716e-06, "loss": 5.3309, "loss/crossentropy": 2.1067320704460144, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21069613099098206, "step": 11776 }, { "epoch": 0.9815, "grad_norm": 4.6875, "grad_norm_var": 0.06979166666666667, "learning_rate": 8.242591274608351e-06, "loss": 4.9249, "loss/crossentropy": 2.6109946966171265, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20355751365423203, "step": 11778 }, { "epoch": 0.9816666666666667, "grad_norm": 4.75, "grad_norm_var": 0.068212890625, "learning_rate": 8.23825078152362e-06, "loss": 5.3303, "loss/crossentropy": 2.306352376937866, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22947991266846657, "step": 11780 }, { "epoch": 0.9818333333333333, "grad_norm": 4.46875, "grad_norm_var": 0.03203125, "learning_rate": 8.233949178988255e-06, "loss": 4.8106, "loss/crossentropy": 2.0892684012651443, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19318658113479614, "step": 11782 }, { "epoch": 0.982, "grad_norm": 4.46875, "grad_norm_var": 0.019071451822916665, "learning_rate": 8.229686477616033e-06, "loss": 5.1217, "loss/crossentropy": 2.2837354838848114, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23053519800305367, "step": 11784 }, { "epoch": 0.9821666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.022456868489583334, "learning_rate": 8.225462687924748e-06, "loss": 4.5143, "loss/crossentropy": 1.8091852068901062, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16203287802636623, "step": 11786 }, { "epoch": 0.9823333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.018778483072916668, "learning_rate": 8.22127782033618e-06, "loss": 5.1684, "loss/crossentropy": 1.8546411916613579, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17528345808386803, "step": 11788 }, { "epoch": 0.9825, "grad_norm": 4.4375, "grad_norm_var": 0.032938639322916664, "learning_rate": 8.217131885176074e-06, "loss": 4.6615, "loss/crossentropy": 1.4845838844776154, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1486959345638752, "step": 11790 }, { "epoch": 0.9826666666666667, "grad_norm": 4.4375, "grad_norm_var": 0.033003743489583334, "learning_rate": 8.213024892674113e-06, "loss": 5.1357, "loss/crossentropy": 1.778126172721386, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.207093708217144, "step": 11792 }, { "epoch": 0.9828333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.034895833333333334, "learning_rate": 8.208956852963892e-06, "loss": 4.4221, "loss/crossentropy": 2.1073838770389557, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19880036637187004, "step": 11794 }, { "epoch": 0.983, "grad_norm": 4.21875, "grad_norm_var": 0.039778645833333334, "learning_rate": 8.204927776082895e-06, "loss": 4.1364, "loss/crossentropy": 1.9823874160647392, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16754209622740746, "step": 11796 }, { "epoch": 0.9831666666666666, "grad_norm": 5.0625, "grad_norm_var": 0.07884114583333333, "learning_rate": 8.200937671972468e-06, "loss": 4.7407, "loss/crossentropy": 0.9149458408355713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16483060829341412, "step": 11798 }, { "epoch": 0.9833333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.07862955729166667, "learning_rate": 8.1969865504778e-06, "loss": 4.5806, "loss/crossentropy": 1.7888997569680214, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17346928641200066, "step": 11800 }, { "epoch": 0.9835, "grad_norm": 4.21875, "grad_norm_var": 0.08186442057291667, "learning_rate": 8.193074421347883e-06, "loss": 4.7757, "loss/crossentropy": 1.3748513013124466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13545435294508934, "step": 11802 }, { "epoch": 0.9836666666666667, "grad_norm": 4.3125, "grad_norm_var": 0.087744140625, "learning_rate": 8.189201294235514e-06, "loss": 4.6596, "loss/crossentropy": 1.8794011771678925, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18594608083367348, "step": 11804 }, { "epoch": 0.9838333333333333, "grad_norm": 4.625, "grad_norm_var": 0.07935791015625, "learning_rate": 8.185367178697244e-06, "loss": 4.6955, "loss/crossentropy": 2.4637969732284546, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1970806047320366, "step": 11806 }, { "epoch": 0.984, "grad_norm": 4.375, "grad_norm_var": 0.085009765625, "learning_rate": 8.181572084193377e-06, "loss": 5.6585, "loss/crossentropy": 2.1041803061962128, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20276985503733158, "step": 11808 }, { "epoch": 0.9841666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.08043212890625, "learning_rate": 8.177816020087929e-06, "loss": 4.4479, "loss/crossentropy": 1.4653847217559814, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1403233241289854, "step": 11810 }, { "epoch": 0.9843333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.070556640625, "learning_rate": 8.174098995648613e-06, "loss": 4.712, "loss/crossentropy": 1.5452167689800262, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.12613629549741745, "step": 11812 }, { "epoch": 0.9845, "grad_norm": 4.5, "grad_norm_var": 0.030192057291666668, "learning_rate": 8.170421020046818e-06, "loss": 5.3324, "loss/crossentropy": 2.5659364461898804, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20733560249209404, "step": 11814 }, { "epoch": 0.9846666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.030582682291666666, "learning_rate": 8.166782102357586e-06, "loss": 4.9242, "loss/crossentropy": 2.5664992928504944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21779290586709976, "step": 11816 }, { "epoch": 0.9848333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.02447509765625, "learning_rate": 8.163182251559582e-06, "loss": 4.7698, "loss/crossentropy": 1.7134768441319466, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17407608777284622, "step": 11818 }, { "epoch": 0.985, "grad_norm": 4.21875, "grad_norm_var": 0.021419270833333334, "learning_rate": 8.15962147653508e-06, "loss": 4.6578, "loss/crossentropy": 1.5287619307637215, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1948755346238613, "step": 11820 }, { "epoch": 0.9851666666666666, "grad_norm": 5.09375, "grad_norm_var": 0.04247639973958333, "learning_rate": 8.15609978606994e-06, "loss": 4.8483, "loss/crossentropy": 1.825168825685978, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16270102560520172, "step": 11822 }, { "epoch": 0.9853333333333333, "grad_norm": 4.5, "grad_norm_var": 0.044596354166666664, "learning_rate": 8.152617188853582e-06, "loss": 5.1059, "loss/crossentropy": 2.337290108203888, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2274259254336357, "step": 11824 }, { "epoch": 0.9855, "grad_norm": 4.4375, "grad_norm_var": 0.049702962239583336, "learning_rate": 8.149173693478968e-06, "loss": 4.7688, "loss/crossentropy": 2.4057921767234802, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20579170435667038, "step": 11826 }, { "epoch": 0.9856666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.055859375, "learning_rate": 8.145769308442583e-06, "loss": 4.8138, "loss/crossentropy": 2.0226185023784637, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22264505177736282, "step": 11828 }, { "epoch": 0.9858333333333333, "grad_norm": 4.15625, "grad_norm_var": 0.06705729166666667, "learning_rate": 8.142404042144405e-06, "loss": 4.9318, "loss/crossentropy": 2.267892837524414, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2211035192012787, "step": 11830 }, { "epoch": 0.986, "grad_norm": 4.15625, "grad_norm_var": 0.07336832682291666, "learning_rate": 8.139077902887897e-06, "loss": 4.3544, "loss/crossentropy": 1.4533291533589363, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16291098482906818, "step": 11832 }, { "epoch": 0.9861666666666666, "grad_norm": 4.5, "grad_norm_var": 0.12278645833333333, "learning_rate": 8.135790898879973e-06, "loss": 5.3159, "loss/crossentropy": 1.5565712675452232, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15048514120280743, "step": 11834 }, { "epoch": 0.9863333333333333, "grad_norm": 5.9375, "grad_norm_var": 0.22107747395833333, "learning_rate": 8.132543038230996e-06, "loss": 5.0178, "loss/crossentropy": 2.1774487793445587, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21005262434482574, "step": 11836 }, { "epoch": 0.9865, "grad_norm": 4.84375, "grad_norm_var": 0.22655843098958334, "learning_rate": 8.129334328954733e-06, "loss": 4.978, "loss/crossentropy": 2.3862339854240417, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21916748955845833, "step": 11838 }, { "epoch": 0.9866666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.232666015625, "learning_rate": 8.126164778968358e-06, "loss": 4.9298, "loss/crossentropy": 2.2145788967609406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1891205869615078, "step": 11840 }, { "epoch": 0.9868333333333333, "grad_norm": 4.3125, "grad_norm_var": 0.22823893229166667, "learning_rate": 8.123034396092415e-06, "loss": 4.7015, "loss/crossentropy": 2.1209593415260315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21985666453838348, "step": 11842 }, { "epoch": 0.987, "grad_norm": 4.5, "grad_norm_var": 0.22784830729166666, "learning_rate": 8.119943188050822e-06, "loss": 4.8956, "loss/crossentropy": 2.3089587688446045, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1974850781261921, "step": 11844 }, { "epoch": 0.9871666666666666, "grad_norm": 4.125, "grad_norm_var": 0.235400390625, "learning_rate": 8.116891162470822e-06, "loss": 4.3814, "loss/crossentropy": 2.284092366695404, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19774354994297028, "step": 11846 }, { "epoch": 0.9873333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.22551676432291667, "learning_rate": 8.113878326882984e-06, "loss": 4.7244, "loss/crossentropy": 2.0912757217884064, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20107288658618927, "step": 11848 }, { "epoch": 0.9875, "grad_norm": 4.3125, "grad_norm_var": 0.18136393229166667, "learning_rate": 8.110904688721181e-06, "loss": 5.111, "loss/crossentropy": 1.7536583244800568, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13841561134904623, "step": 11850 }, { "epoch": 0.9876666666666667, "grad_norm": 4.78125, "grad_norm_var": 0.04576416015625, "learning_rate": 8.107970255322572e-06, "loss": 5.3386, "loss/crossentropy": 2.4679291248321533, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21784574910998344, "step": 11852 }, { "epoch": 0.9878333333333333, "grad_norm": 4.0625, "grad_norm_var": 0.04595947265625, "learning_rate": 8.105075033927576e-06, "loss": 4.7901, "loss/crossentropy": 2.679027020931244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2104400470852852, "step": 11854 }, { "epoch": 0.988, "grad_norm": 4.5, "grad_norm_var": 0.044755045572916666, "learning_rate": 8.102219031679866e-06, "loss": 4.7311, "loss/crossentropy": 2.548813045024872, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21962737292051315, "step": 11856 }, { "epoch": 0.9881666666666666, "grad_norm": 4.8125, "grad_norm_var": 0.051041666666666666, "learning_rate": 8.099402255626345e-06, "loss": 4.8849, "loss/crossentropy": 2.0113202035427094, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20012788474559784, "step": 11858 }, { "epoch": 0.9883333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.06373291015625, "learning_rate": 8.096624712717127e-06, "loss": 5.0429, "loss/crossentropy": 2.3392894864082336, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20352452620863914, "step": 11860 }, { "epoch": 0.9885, "grad_norm": 4.65625, "grad_norm_var": 0.0501953125, "learning_rate": 8.09388640980552e-06, "loss": 5.4598, "loss/crossentropy": 2.566970646381378, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2093185856938362, "step": 11862 }, { "epoch": 0.9886666666666667, "grad_norm": 4.53125, "grad_norm_var": 0.04742431640625, "learning_rate": 8.091187353648018e-06, "loss": 5.002, "loss/crossentropy": 1.5040106773376465, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1845961958169937, "step": 11864 }, { "epoch": 0.9888333333333333, "grad_norm": 4.5625, "grad_norm_var": 0.04479166666666667, "learning_rate": 8.088527550904274e-06, "loss": 4.7993, "loss/crossentropy": 1.3132527843117714, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15961330942809582, "step": 11866 }, { "epoch": 0.989, "grad_norm": 4.53125, "grad_norm_var": 0.042801920572916666, "learning_rate": 8.085907008137084e-06, "loss": 4.6958, "loss/crossentropy": 1.2898187711834908, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15488006733357906, "step": 11868 }, { "epoch": 0.9891666666666666, "grad_norm": 4.40625, "grad_norm_var": 0.04114583333333333, "learning_rate": 8.083325731812376e-06, "loss": 5.077, "loss/crossentropy": 2.087516203522682, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2222130000591278, "step": 11870 }, { "epoch": 0.9893333333333333, "grad_norm": 4.84375, "grad_norm_var": 0.04973551432291667, "learning_rate": 8.080783728299198e-06, "loss": 4.488, "loss/crossentropy": 1.3223537430167198, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1429068874567747, "step": 11872 }, { "epoch": 0.9895, "grad_norm": 4.65625, "grad_norm_var": 0.044775390625, "learning_rate": 8.078281003869689e-06, "loss": 5.2082, "loss/crossentropy": 2.4864302277565002, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21986323222517967, "step": 11874 }, { "epoch": 0.9896666666666667, "grad_norm": 4.28125, "grad_norm_var": 0.04149983723958333, "learning_rate": 8.075817564699068e-06, "loss": 4.9471, "loss/crossentropy": 2.1714051365852356, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23419787362217903, "step": 11876 }, { "epoch": 0.9898333333333333, "grad_norm": 4.375, "grad_norm_var": 0.043192545572916664, "learning_rate": 8.07339341686563e-06, "loss": 4.5392, "loss/crossentropy": 1.5454725325107574, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15994950011372566, "step": 11878 }, { "epoch": 0.99, "grad_norm": 4.5, "grad_norm_var": 0.04659830729166667, "learning_rate": 8.071008566350721e-06, "loss": 4.6596, "loss/crossentropy": 1.6750903725624084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18367098458111286, "step": 11880 }, { "epoch": 0.9901666666666666, "grad_norm": 4.375, "grad_norm_var": 0.0462890625, "learning_rate": 8.068663019038719e-06, "loss": 4.7149, "loss/crossentropy": 1.3502802401781082, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.13940864615142345, "step": 11882 }, { "epoch": 0.9903333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.047379557291666666, "learning_rate": 8.066356780717031e-06, "loss": 4.8176, "loss/crossentropy": 1.3339405804872513, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1357364384457469, "step": 11884 }, { "epoch": 0.9905, "grad_norm": 4.75, "grad_norm_var": 0.03388264973958333, "learning_rate": 8.064089857076067e-06, "loss": 5.0944, "loss/crossentropy": 2.2128437161445618, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21685703843832016, "step": 11886 }, { "epoch": 0.9906666666666667, "grad_norm": 4.5, "grad_norm_var": 0.024723307291666666, "learning_rate": 8.06186225370924e-06, "loss": 4.9089, "loss/crossentropy": 2.279437929391861, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22451356425881386, "step": 11888 }, { "epoch": 0.9908333333333333, "grad_norm": 4.53125, "grad_norm_var": 0.024723307291666666, "learning_rate": 8.059673976112941e-06, "loss": 5.402, "loss/crossentropy": 2.281008332967758, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1916458159685135, "step": 11890 }, { "epoch": 0.991, "grad_norm": 4.34375, "grad_norm_var": 0.025113932291666665, "learning_rate": 8.057525029686523e-06, "loss": 5.0482, "loss/crossentropy": 2.2194367945194244, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23771441355347633, "step": 11892 }, { "epoch": 0.9911666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.030192057291666668, "learning_rate": 8.055415419732298e-06, "loss": 4.617, "loss/crossentropy": 2.204625815153122, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21681509166955948, "step": 11894 }, { "epoch": 0.9913333333333333, "grad_norm": 4.5, "grad_norm_var": 0.026493326822916666, "learning_rate": 8.053345151455523e-06, "loss": 5.319, "loss/crossentropy": 2.290549635887146, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23177609220147133, "step": 11896 }, { "epoch": 0.9915, "grad_norm": 4.3125, "grad_norm_var": 0.029520670572916668, "learning_rate": 8.051314229964375e-06, "loss": 4.6101, "loss/crossentropy": 1.993983969092369, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18598743341863155, "step": 11898 }, { "epoch": 0.9916666666666667, "grad_norm": 4.0, "grad_norm_var": 0.04659830729166667, "learning_rate": 8.049322660269954e-06, "loss": 4.5727, "loss/crossentropy": 1.6651684641838074, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17348604835569859, "step": 11900 }, { "epoch": 0.9918333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.063134765625, "learning_rate": 8.047370447286258e-06, "loss": 4.925, "loss/crossentropy": 1.5045694410800934, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.14359690621495247, "step": 11902 }, { "epoch": 0.992, "grad_norm": 4.4375, "grad_norm_var": 0.05917561848958333, "learning_rate": 8.045457595830179e-06, "loss": 4.8642, "loss/crossentropy": 2.3525500893592834, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19554975256323814, "step": 11904 }, { "epoch": 0.9921666666666666, "grad_norm": 4.4375, "grad_norm_var": 0.05792643229166667, "learning_rate": 8.043584110621488e-06, "loss": 5.0682, "loss/crossentropy": 2.599298894405365, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20872588828206062, "step": 11906 }, { "epoch": 0.9923333333333333, "grad_norm": 4.71875, "grad_norm_var": 0.055985514322916666, "learning_rate": 8.041749996282821e-06, "loss": 4.9446, "loss/crossentropy": 1.7182498648762703, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21588006988167763, "step": 11908 }, { "epoch": 0.9925, "grad_norm": 4.6875, "grad_norm_var": 0.05523681640625, "learning_rate": 8.03995525733968e-06, "loss": 4.5029, "loss/crossentropy": 2.0742194950580597, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20126686617732048, "step": 11910 }, { "epoch": 0.9926666666666667, "grad_norm": 4.25, "grad_norm_var": 0.09256184895833333, "learning_rate": 8.038199898220398e-06, "loss": 4.5741, "loss/crossentropy": 1.3627977594733238, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1341603621840477, "step": 11912 }, { "epoch": 0.9928333333333333, "grad_norm": 4.125, "grad_norm_var": 0.10745035807291667, "learning_rate": 8.036483923256152e-06, "loss": 4.5946, "loss/crossentropy": 2.560191512107849, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20274262875318527, "step": 11914 }, { "epoch": 0.993, "grad_norm": 4.34375, "grad_norm_var": 0.08928629557291666, "learning_rate": 8.034807336680938e-06, "loss": 4.7074, "loss/crossentropy": 1.5181104466319084, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1560316327959299, "step": 11916 }, { "epoch": 0.9931666666666666, "grad_norm": 4.46875, "grad_norm_var": 0.07154541015625, "learning_rate": 8.033170142631567e-06, "loss": 5.451, "loss/crossentropy": 1.7161678597331047, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1688497867435217, "step": 11918 }, { "epoch": 0.9933333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.07255452473958333, "learning_rate": 8.031572345147655e-06, "loss": 4.7851, "loss/crossentropy": 1.6516497433185577, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17710182815790176, "step": 11920 }, { "epoch": 0.9935, "grad_norm": 4.46875, "grad_norm_var": 0.07125244140625, "learning_rate": 8.030013948171608e-06, "loss": 5.3578, "loss/crossentropy": 2.3381210267543793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2103869989514351, "step": 11922 }, { "epoch": 0.9936666666666667, "grad_norm": 4.65625, "grad_norm_var": 0.06998291015625, "learning_rate": 8.028494955548613e-06, "loss": 4.996, "loss/crossentropy": 2.2223448157310486, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.23018431290984154, "step": 11924 }, { "epoch": 0.9938333333333333, "grad_norm": 4.65625, "grad_norm_var": 0.06783447265625, "learning_rate": 8.027015371026635e-06, "loss": 5.0702, "loss/crossentropy": 2.454653322696686, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2261667139828205, "step": 11926 }, { "epoch": 0.994, "grad_norm": 4.46875, "grad_norm_var": 0.031966145833333334, "learning_rate": 8.025575198256401e-06, "loss": 4.3612, "loss/crossentropy": 1.864028476178646, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18934139795601368, "step": 11928 }, { "epoch": 0.9941666666666666, "grad_norm": 4.78125, "grad_norm_var": 0.016341145833333334, "learning_rate": 8.024174440791395e-06, "loss": 4.9677, "loss/crossentropy": 2.5450727939605713, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21843239665031433, "step": 11930 }, { "epoch": 0.9943333333333333, "grad_norm": 4.34375, "grad_norm_var": 0.016434733072916666, "learning_rate": 8.022813102087846e-06, "loss": 5.0552, "loss/crossentropy": 2.4372522234916687, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21206426993012428, "step": 11932 }, { "epoch": 0.9945, "grad_norm": 4.75, "grad_norm_var": 0.017606608072916665, "learning_rate": 8.021491185504721e-06, "loss": 5.1754, "loss/crossentropy": 2.269777476787567, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21210772916674614, "step": 11934 }, { "epoch": 0.9946666666666667, "grad_norm": 4.5, "grad_norm_var": 0.017118326822916665, "learning_rate": 8.020208694303722e-06, "loss": 5.0094, "loss/crossentropy": 1.7897434085607529, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16236883774399757, "step": 11936 }, { "epoch": 0.9948333333333333, "grad_norm": 4.28125, "grad_norm_var": 0.030171712239583332, "learning_rate": 8.018965631649264e-06, "loss": 3.9853, "loss/crossentropy": 1.755036287009716, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18948844075202942, "step": 11938 }, { "epoch": 0.995, "grad_norm": 4.125, "grad_norm_var": 0.03632405598958333, "learning_rate": 8.017762000608482e-06, "loss": 4.078, "loss/crossentropy": 1.7440339028835297, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1710295006632805, "step": 11940 }, { "epoch": 0.9951666666666666, "grad_norm": 4.53125, "grad_norm_var": 0.0333984375, "learning_rate": 8.016597804151215e-06, "loss": 5.1789, "loss/crossentropy": 2.2095680236816406, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18934720009565353, "step": 11942 }, { "epoch": 0.9953333333333333, "grad_norm": 5.09375, "grad_norm_var": 0.060009765625, "learning_rate": 8.015473045150006e-06, "loss": 5.272, "loss/crossentropy": 2.004868745803833, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18382899090647697, "step": 11944 }, { "epoch": 0.9955, "grad_norm": 4.84375, "grad_norm_var": 0.06287434895833334, "learning_rate": 8.014387726380082e-06, "loss": 5.0277, "loss/crossentropy": 1.9194257259368896, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17790965735912323, "step": 11946 }, { "epoch": 0.9956666666666667, "grad_norm": 4.59375, "grad_norm_var": 0.06617431640625, "learning_rate": 8.013341850519359e-06, "loss": 4.9612, "loss/crossentropy": 1.963321976363659, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1870459709316492, "step": 11948 }, { "epoch": 0.9958333333333333, "grad_norm": 4.6875, "grad_norm_var": 0.06884358723958334, "learning_rate": 8.012335420148435e-06, "loss": 4.6321, "loss/crossentropy": 1.8418525904417038, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1703290119767189, "step": 11950 }, { "epoch": 0.996, "grad_norm": 4.53125, "grad_norm_var": 0.07405192057291667, "learning_rate": 8.011368437750574e-06, "loss": 5.0424, "loss/crossentropy": 2.372679352760315, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18234703317284584, "step": 11952 }, { "epoch": 0.9961666666666666, "grad_norm": 4.3125, "grad_norm_var": 0.07980143229166667, "learning_rate": 8.010440905711708e-06, "loss": 5.4315, "loss/crossentropy": 1.4641002044081688, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.15950197726488113, "step": 11954 }, { "epoch": 0.9963333333333333, "grad_norm": 4.78125, "grad_norm_var": 0.065869140625, "learning_rate": 8.009552826320434e-06, "loss": 5.1694, "loss/crossentropy": 1.9262694045901299, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20404083095490932, "step": 11956 }, { "epoch": 0.9965, "grad_norm": 4.4375, "grad_norm_var": 0.06754150390625, "learning_rate": 8.008704201767998e-06, "loss": 5.0181, "loss/crossentropy": 1.6573146134614944, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20728476718068123, "step": 11958 }, { "epoch": 0.9966666666666667, "grad_norm": 4.625, "grad_norm_var": 0.06236572265625, "learning_rate": 8.007895034148296e-06, "loss": 5.3437, "loss/crossentropy": 1.4944916442036629, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17235364392399788, "step": 11960 }, { "epoch": 0.9968333333333333, "grad_norm": 4.1875, "grad_norm_var": 0.06978759765625, "learning_rate": 8.007125325457868e-06, "loss": 4.9422, "loss/crossentropy": 1.9228725656867027, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1831230465322733, "step": 11962 }, { "epoch": 0.997, "grad_norm": 4.5625, "grad_norm_var": 0.06789957682291667, "learning_rate": 8.006395077595897e-06, "loss": 4.6737, "loss/crossentropy": 2.0940150320529938, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19881772994995117, "step": 11964 }, { "epoch": 0.9971666666666666, "grad_norm": 4.71875, "grad_norm_var": 0.06066080729166667, "learning_rate": 8.005704292364192e-06, "loss": 4.8361, "loss/crossentropy": 2.0688324570655823, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.190332543104887, "step": 11966 }, { "epoch": 0.9973333333333333, "grad_norm": 4.0625, "grad_norm_var": 0.09060872395833333, "learning_rate": 8.005052971467203e-06, "loss": 3.5708, "loss/crossentropy": 1.320427618920803, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1498071327805519, "step": 11968 }, { "epoch": 0.9975, "grad_norm": 4.625, "grad_norm_var": 0.06432291666666666, "learning_rate": 8.004441116511992e-06, "loss": 5.0302, "loss/crossentropy": 1.9641523733735085, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1834118254482746, "step": 11970 }, { "epoch": 0.9976666666666667, "grad_norm": 4.34375, "grad_norm_var": 0.05987955729166667, "learning_rate": 8.003868729008256e-06, "loss": 4.1466, "loss/crossentropy": 1.9224779605865479, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.22309407964348793, "step": 11972 }, { "epoch": 0.9978333333333333, "grad_norm": 4.40625, "grad_norm_var": 0.058447265625, "learning_rate": 8.003335810368304e-06, "loss": 5.0203, "loss/crossentropy": 1.661292903125286, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18765877932310104, "step": 11974 }, { "epoch": 0.998, "grad_norm": 8.25, "grad_norm_var": 0.94605712890625, "learning_rate": 8.002842361907057e-06, "loss": 4.5332, "loss/crossentropy": 1.523488275706768, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16733020916581154, "step": 11976 }, { "epoch": 0.9981666666666666, "grad_norm": 4.96875, "grad_norm_var": 0.9265625, "learning_rate": 8.002388384842052e-06, "loss": 5.0479, "loss/crossentropy": 2.3596551716327667, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.2128112018108368, "step": 11978 }, { "epoch": 0.9983333333333333, "grad_norm": 4.90625, "grad_norm_var": 0.9274739583333333, "learning_rate": 8.001973880293432e-06, "loss": 5.511, "loss/crossentropy": 1.8143180459737778, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1826395783573389, "step": 11980 }, { "epoch": 0.9985, "grad_norm": 4.40625, "grad_norm_var": 0.9441365559895833, "learning_rate": 8.001598849283945e-06, "loss": 4.6422, "loss/crossentropy": 1.7479843944311142, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.20489901304244995, "step": 11982 }, { "epoch": 0.9986666666666667, "grad_norm": 4.40625, "grad_norm_var": 0.8844889322916667, "learning_rate": 8.001263292738943e-06, "loss": 5.034, "loss/crossentropy": 2.108782708644867, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.19143879041075706, "step": 11984 }, { "epoch": 0.9988333333333334, "grad_norm": 4.6875, "grad_norm_var": 0.9028483072916667, "learning_rate": 8.00096721148638e-06, "loss": 4.9638, "loss/crossentropy": 2.291710913181305, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.227816391736269, "step": 11986 }, { "epoch": 0.999, "grad_norm": 4.28125, "grad_norm_var": 0.9287068684895833, "learning_rate": 8.000710606256803e-06, "loss": 4.6671, "loss/crossentropy": 1.8156883418560028, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17330962419509888, "step": 11988 }, { "epoch": 0.9991666666666666, "grad_norm": 4.59375, "grad_norm_var": 0.9245930989583333, "learning_rate": 8.000493477683367e-06, "loss": 4.8698, "loss/crossentropy": 2.213321268558502, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.21897965297102928, "step": 11990 }, { "epoch": 0.9993333333333333, "grad_norm": 4.4375, "grad_norm_var": 0.05319010416666667, "learning_rate": 8.000315826301807e-06, "loss": 4.7729, "loss/crossentropy": 2.0580232441425323, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.1888243965804577, "step": 11992 }, { "epoch": 0.9995, "grad_norm": 4.4375, "grad_norm_var": 0.037495930989583336, "learning_rate": 8.000177652550465e-06, "loss": 4.8099, "loss/crossentropy": 1.9432259127497673, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.17133133299648762, "step": 11994 }, { "epoch": 0.9996666666666667, "grad_norm": 4.625, "grad_norm_var": 0.028645833333333332, "learning_rate": 8.00007895677027e-06, "loss": 4.947, "loss/crossentropy": 1.5523300170898438, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.16255545988678932, "step": 11996 }, { "epoch": 0.9998333333333334, "grad_norm": 4.375, "grad_norm_var": 0.03136393229166667, "learning_rate": 8.000019739204745e-06, "loss": 4.9184, "loss/crossentropy": 1.881621241569519, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.18999022245407104, "step": 11998 }, { "epoch": 1.0, "grad_norm": 5.0, "grad_norm_var": 0.04724934895833333, "learning_rate": 8.000000000000001e-06, "loss": 5.0732, "loss/crossentropy": 1.9301600456237793, "loss/hidden": 0.0, "loss/jsd": 0.0, "loss/logits": 0.24359508231282234, "step": 12000 } ], "logging_steps": 2, "max_steps": 12000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 6000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5200982329720832e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }