| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 100, |
| "global_step": 2494, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.020048115477145148, |
| "grad_norm": 0.3128751516342163, |
| "learning_rate": 9.6e-05, |
| "loss": 0.6108, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.040096230954290296, |
| "grad_norm": 0.2898954153060913, |
| "learning_rate": 0.00019600000000000002, |
| "loss": 0.3986, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.060144346431435444, |
| "grad_norm": 0.2638753652572632, |
| "learning_rate": 0.000296, |
| "loss": 0.3553, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08019246190858059, |
| "grad_norm": 0.2679823935031891, |
| "learning_rate": 0.00039600000000000003, |
| "loss": 0.3276, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08019246190858059, |
| "eval_loss": 0.3061896860599518, |
| "eval_runtime": 2.2347, |
| "eval_samples_per_second": 46.538, |
| "eval_steps_per_second": 0.895, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.10024057738572574, |
| "grad_norm": 0.3121950924396515, |
| "learning_rate": 0.000496, |
| "loss": 0.3272, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.12028869286287089, |
| "grad_norm": 0.2655491828918457, |
| "learning_rate": 0.000596, |
| "loss": 0.3186, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.14033680834001605, |
| "grad_norm": 0.30342063307762146, |
| "learning_rate": 0.000696, |
| "loss": 0.3094, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.16038492381716118, |
| "grad_norm": 0.3187066614627838, |
| "learning_rate": 0.000796, |
| "loss": 0.3092, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.16038492381716118, |
| "eval_loss": 0.29791951179504395, |
| "eval_runtime": 2.0135, |
| "eval_samples_per_second": 51.651, |
| "eval_steps_per_second": 0.993, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.18043303929430635, |
| "grad_norm": 0.29701462388038635, |
| "learning_rate": 0.000896, |
| "loss": 0.303, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.20048115477145148, |
| "grad_norm": 0.3302502930164337, |
| "learning_rate": 0.000996, |
| "loss": 0.302, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.22052927024859664, |
| "grad_norm": 0.2812274694442749, |
| "learning_rate": 0.0009997177878718869, |
| "loss": 0.6194, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.24057738572574178, |
| "grad_norm": 0.2586809992790222, |
| "learning_rate": 0.0009988239768018291, |
| "loss": 0.3014, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.24057738572574178, |
| "eval_loss": 0.2832469344139099, |
| "eval_runtime": 2.0065, |
| "eval_samples_per_second": 51.832, |
| "eval_steps_per_second": 0.997, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2606255012028869, |
| "grad_norm": 0.2703372538089752, |
| "learning_rate": 0.0009973191715938715, |
| "loss": 0.2981, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.2806736166800321, |
| "grad_norm": 0.2600429356098175, |
| "learning_rate": 0.0009952052154376025, |
| "loss": 0.2955, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.30072173215717724, |
| "grad_norm": 0.21045513451099396, |
| "learning_rate": 0.0009924846976528616, |
| "loss": 0.2865, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.32076984763432237, |
| "grad_norm": 0.21870078146457672, |
| "learning_rate": 0.0009891609505181592, |
| "loss": 0.2754, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.32076984763432237, |
| "eval_loss": 0.2693285048007965, |
| "eval_runtime": 2.008, |
| "eval_samples_per_second": 51.792, |
| "eval_steps_per_second": 0.996, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.3408179631114675, |
| "grad_norm": 0.23548808693885803, |
| "learning_rate": 0.0009852380451890721, |
| "loss": 0.2812, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.3608660785886127, |
| "grad_norm": 0.22311964631080627, |
| "learning_rate": 0.0009807207867116115, |
| "loss": 0.2868, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.3809141940657578, |
| "grad_norm": 0.21461476385593414, |
| "learning_rate": 0.0009756147081366672, |
| "loss": 0.2765, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.40096230954290296, |
| "grad_norm": 0.19619832932949066, |
| "learning_rate": 0.0009699260637427467, |
| "loss": 0.2759, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.40096230954290296, |
| "eval_loss": 0.25922319293022156, |
| "eval_runtime": 2.0086, |
| "eval_samples_per_second": 51.777, |
| "eval_steps_per_second": 0.996, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4210104250200481, |
| "grad_norm": 0.16149669885635376, |
| "learning_rate": 0.0009636618213753006, |
| "loss": 0.2731, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.4410585404971933, |
| "grad_norm": 0.20750294625759125, |
| "learning_rate": 0.0009568296539120225, |
| "loss": 0.2759, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.4611066559743384, |
| "grad_norm": 0.18264305591583252, |
| "learning_rate": 0.0009494379298645788, |
| "loss": 0.2625, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.48115477145148355, |
| "grad_norm": 0.17461912333965302, |
| "learning_rate": 0.0009414957031282751, |
| "loss": 0.2649, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.48115477145148355, |
| "eval_loss": 0.24944312870502472, |
| "eval_runtime": 2.0116, |
| "eval_samples_per_second": 51.701, |
| "eval_steps_per_second": 0.994, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.5012028869286287, |
| "grad_norm": 0.17447619140148163, |
| "learning_rate": 0.0009330127018922195, |
| "loss": 0.2597, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.5212510024057738, |
| "grad_norm": 0.1970607340335846, |
| "learning_rate": 0.0009239993167235614, |
| "loss": 0.2577, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.541299117882919, |
| "grad_norm": 0.17549267411231995, |
| "learning_rate": 0.0009144665878404079, |
| "loss": 0.2564, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.5613472333600642, |
| "grad_norm": 0.17851398885250092, |
| "learning_rate": 0.0009044261915889984, |
| "loss": 0.2604, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5613472333600642, |
| "eval_loss": 0.24154677987098694, |
| "eval_runtime": 2.0084, |
| "eval_samples_per_second": 51.784, |
| "eval_steps_per_second": 0.996, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5813953488372093, |
| "grad_norm": 0.20294925570487976, |
| "learning_rate": 0.0008938904261417087, |
| "loss": 0.271, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.6014434643143545, |
| "grad_norm": 0.16158199310302734, |
| "learning_rate": 0.0008828721964333975, |
| "loss": 0.2506, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.6214915797914996, |
| "grad_norm": 0.2007351964712143, |
| "learning_rate": 0.000871384998354549, |
| "loss": 0.2454, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.6415396952686447, |
| "grad_norm": 0.17736631631851196, |
| "learning_rate": 0.0008594429022205719, |
| "loss": 0.2514, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.6415396952686447, |
| "eval_loss": 0.23755024373531342, |
| "eval_runtime": 4.2146, |
| "eval_samples_per_second": 24.676, |
| "eval_steps_per_second": 0.475, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.6615878107457899, |
| "grad_norm": 0.15997523069381714, |
| "learning_rate": 0.0008470605355375032, |
| "loss": 0.2565, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.681635926222935, |
| "grad_norm": 0.15349100530147552, |
| "learning_rate": 0.0008342530650852265, |
| "loss": 0.2598, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.7016840417000801, |
| "grad_norm": 0.14758522808551788, |
| "learning_rate": 0.0008210361783401491, |
| "loss": 0.2426, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.7217321571772254, |
| "grad_norm": 0.1686255782842636, |
| "learning_rate": 0.0008074260642600964, |
| "loss": 0.2462, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.7217321571772254, |
| "eval_loss": 0.23332656919956207, |
| "eval_runtime": 4.2091, |
| "eval_samples_per_second": 24.708, |
| "eval_steps_per_second": 0.475, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.7417802726543705, |
| "grad_norm": 0.1921042650938034, |
| "learning_rate": 0.0007934393934549542, |
| "loss": 0.249, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.7618283881315157, |
| "grad_norm": 0.1523015797138214, |
| "learning_rate": 0.0007790932977673523, |
| "loss": 0.2399, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.7818765036086608, |
| "grad_norm": 0.16055895388126373, |
| "learning_rate": 0.0007644053492883989, |
| "loss": 0.2363, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.8019246190858059, |
| "grad_norm": 0.17422834038734436, |
| "learning_rate": 0.000749393538834164, |
| "loss": 0.2384, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8019246190858059, |
| "eval_loss": 0.23113039135932922, |
| "eval_runtime": 4.2589, |
| "eval_samples_per_second": 24.419, |
| "eval_steps_per_second": 0.47, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.8219727345629511, |
| "grad_norm": 0.1599196046590805, |
| "learning_rate": 0.0007340762539092858, |
| "loss": 0.2485, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.8420208500400962, |
| "grad_norm": 0.15405167639255524, |
| "learning_rate": 0.0007184722561846798, |
| "loss": 0.2464, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.8620689655172413, |
| "grad_norm": 0.18259042501449585, |
| "learning_rate": 0.0007026006585169466, |
| "loss": 0.2421, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.8821170809943866, |
| "grad_norm": 0.14938652515411377, |
| "learning_rate": 0.0006864809015376217, |
| "loss": 0.2425, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.8821170809943866, |
| "eval_loss": 0.22659502923488617, |
| "eval_runtime": 4.2121, |
| "eval_samples_per_second": 24.691, |
| "eval_steps_per_second": 0.475, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.9021651964715317, |
| "grad_norm": 0.16447846591472626, |
| "learning_rate": 0.0006701327298409448, |
| "loss": 0.2414, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.9222133119486768, |
| "grad_norm": 0.1590721160173416, |
| "learning_rate": 0.000653576167799312, |
| "loss": 0.2287, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.942261427425822, |
| "grad_norm": 0.1653919219970703, |
| "learning_rate": 0.0006368314950360416, |
| "loss": 0.2351, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.9623095429029671, |
| "grad_norm": 0.1875888705253601, |
| "learning_rate": 0.000619919221585484, |
| "loss": 0.2374, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.9623095429029671, |
| "eval_loss": 0.22220070660114288, |
| "eval_runtime": 4.2412, |
| "eval_samples_per_second": 24.521, |
| "eval_steps_per_second": 0.472, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.9823576583801122, |
| "grad_norm": 0.15826693177223206, |
| "learning_rate": 0.0006028600627709151, |
| "loss": 0.2314, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.0024057738572574, |
| "grad_norm": 0.15599651634693146, |
| "learning_rate": 0.0005856749138309716, |
| "loss": 0.2246, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.0224538893344026, |
| "grad_norm": 0.15733949840068817, |
| "learning_rate": 0.000568384824325718, |
| "loss": 0.2024, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.0425020048115476, |
| "grad_norm": 0.14656169712543488, |
| "learning_rate": 0.0005510109723536876, |
| "loss": 0.2109, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.0425020048115476, |
| "eval_loss": 0.22098909318447113, |
| "eval_runtime": 4.2171, |
| "eval_samples_per_second": 24.661, |
| "eval_steps_per_second": 0.474, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.062550120288693, |
| "grad_norm": 0.15239104628562927, |
| "learning_rate": 0.0005335746386114814, |
| "loss": 0.1941, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.082598235765838, |
| "grad_norm": 0.1525331288576126, |
| "learning_rate": 0.0005160971803276981, |
| "loss": 0.2074, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.1026463512429832, |
| "grad_norm": 0.18646268546581268, |
| "learning_rate": 0.0004986000051031212, |
| "loss": 0.2008, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.1226944667201284, |
| "grad_norm": 0.1695125252008438, |
| "learning_rate": 0.00048110454468920866, |
| "loss": 0.2019, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.1226944667201284, |
| "eval_loss": 0.21638630330562592, |
| "eval_runtime": 4.2187, |
| "eval_samples_per_second": 24.652, |
| "eval_steps_per_second": 0.474, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.1427425821972734, |
| "grad_norm": 0.15290401875972748, |
| "learning_rate": 0.0004636322287369997, |
| "loss": 0.2021, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.1627906976744187, |
| "grad_norm": 0.16536127030849457, |
| "learning_rate": 0.0004462044585485944, |
| "loss": 0.1972, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.1828388131515637, |
| "grad_norm": 0.14967386424541473, |
| "learning_rate": 0.0004288425808633575, |
| "loss": 0.2033, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.202886928628709, |
| "grad_norm": 0.15446773171424866, |
| "learning_rate": 0.00041156786171095476, |
| "loss": 0.1957, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.202886928628709, |
| "eval_loss": 0.21511909365653992, |
| "eval_runtime": 4.2185, |
| "eval_samples_per_second": 24.654, |
| "eval_steps_per_second": 0.474, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.222935044105854, |
| "grad_norm": 0.1389647275209427, |
| "learning_rate": 0.00039440146036324753, |
| "loss": 0.1964, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.2429831595829992, |
| "grad_norm": 0.13631069660186768, |
| "learning_rate": 0.00037736440341695125, |
| "loss": 0.2033, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.2630312750601442, |
| "grad_norm": 0.1525258868932724, |
| "learning_rate": 0.0003604775590388047, |
| "loss": 0.2013, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.2830793905372895, |
| "grad_norm": 0.14098823070526123, |
| "learning_rate": 0.00034376161140479495, |
| "loss": 0.1976, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.2830793905372895, |
| "eval_loss": 0.21296119689941406, |
| "eval_runtime": 4.0201, |
| "eval_samples_per_second": 25.87, |
| "eval_steps_per_second": 0.497, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.3031275060144347, |
| "grad_norm": 0.1649613082408905, |
| "learning_rate": 0.0003272370353647465, |
| "loss": 0.2001, |
| "step": 1625 |
| }, |
| { |
| "epoch": 1.3231756214915797, |
| "grad_norm": 0.17128996551036835, |
| "learning_rate": 0.00031092407136330754, |
| "loss": 0.2015, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.343223736968725, |
| "grad_norm": 0.15200765430927277, |
| "learning_rate": 0.0002948427006480528, |
| "loss": 0.2056, |
| "step": 1675 |
| }, |
| { |
| "epoch": 1.36327185244587, |
| "grad_norm": 0.15264691412448883, |
| "learning_rate": 0.00027901262079506784, |
| "loss": 0.2032, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.36327185244587, |
| "eval_loss": 0.21197493374347687, |
| "eval_runtime": 4.256, |
| "eval_samples_per_second": 24.436, |
| "eval_steps_per_second": 0.47, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.3833199679230153, |
| "grad_norm": 0.16756217181682587, |
| "learning_rate": 0.000263453221581995, |
| "loss": 0.1987, |
| "step": 1725 |
| }, |
| { |
| "epoch": 1.4033680834001605, |
| "grad_norm": 0.13861249387264252, |
| "learning_rate": 0.00024818356123809036, |
| "loss": 0.1998, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.4234161988773055, |
| "grad_norm": 0.16581584513187408, |
| "learning_rate": 0.00023322234310038588, |
| "loss": 0.1875, |
| "step": 1775 |
| }, |
| { |
| "epoch": 1.4434643143544506, |
| "grad_norm": 0.1450669765472412, |
| "learning_rate": 0.00021858789270454783, |
| "loss": 0.1953, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.4434643143544506, |
| "eval_loss": 0.2067786008119583, |
| "eval_runtime": 4.2236, |
| "eval_samples_per_second": 24.624, |
| "eval_steps_per_second": 0.474, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.4635124298315958, |
| "grad_norm": 0.1343117356300354, |
| "learning_rate": 0.00020429813533849174, |
| "loss": 0.2026, |
| "step": 1825 |
| }, |
| { |
| "epoch": 1.483560545308741, |
| "grad_norm": 0.14949767291545868, |
| "learning_rate": 0.00019037057408624846, |
| "loss": 0.1929, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.5036086607858863, |
| "grad_norm": 0.12897883355617523, |
| "learning_rate": 0.00017682226838897568, |
| "loss": 0.1907, |
| "step": 1875 |
| }, |
| { |
| "epoch": 1.5236567762630313, |
| "grad_norm": 0.15060247480869293, |
| "learning_rate": 0.00016366981314937373, |
| "loss": 0.2003, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.5236567762630313, |
| "eval_loss": 0.2053409218788147, |
| "eval_runtime": 4.2538, |
| "eval_samples_per_second": 24.449, |
| "eval_steps_per_second": 0.47, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.5437048917401763, |
| "grad_norm": 0.1608167141675949, |
| "learning_rate": 0.0001509293184050995, |
| "loss": 0.1984, |
| "step": 1925 |
| }, |
| { |
| "epoch": 1.5637530072173216, |
| "grad_norm": 0.15426403284072876, |
| "learning_rate": 0.000138616389596077, |
| "loss": 0.1955, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.5838011226944668, |
| "grad_norm": 0.1431884467601776, |
| "learning_rate": 0.0001267461084498744, |
| "loss": 0.1955, |
| "step": 1975 |
| }, |
| { |
| "epoch": 1.6038492381716118, |
| "grad_norm": 0.14500346779823303, |
| "learning_rate": 0.00011533301450856055, |
| "loss": 0.1898, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6038492381716118, |
| "eval_loss": 0.20465601980686188, |
| "eval_runtime": 4.2165, |
| "eval_samples_per_second": 24.665, |
| "eval_steps_per_second": 0.474, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.6238973536487569, |
| "grad_norm": 0.13658447563648224, |
| "learning_rate": 0.0001043910873196668, |
| "loss": 0.1882, |
| "step": 2025 |
| }, |
| { |
| "epoch": 1.6439454691259021, |
| "grad_norm": 0.14735296368598938, |
| "learning_rate": 9.393372931306943e-05, |
| "loss": 0.194, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.6639935846030474, |
| "grad_norm": 0.13270524144172668, |
| "learning_rate": 8.397374938476593e-05, |
| "loss": 0.191, |
| "step": 2075 |
| }, |
| { |
| "epoch": 1.6840417000801926, |
| "grad_norm": 0.13621263206005096, |
| "learning_rate": 7.452334720765258e-05, |
| "loss": 0.191, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.6840417000801926, |
| "eval_loss": 0.2029379904270172, |
| "eval_runtime": 4.2107, |
| "eval_samples_per_second": 24.699, |
| "eval_steps_per_second": 0.475, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.7040898155573376, |
| "grad_norm": 0.1418534368276596, |
| "learning_rate": 6.55940982885207e-05, |
| "loss": 0.1982, |
| "step": 2125 |
| }, |
| { |
| "epoch": 1.7241379310344827, |
| "grad_norm": 0.15418624877929688, |
| "learning_rate": 5.71969397895738e-05, |
| "loss": 0.1957, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.744186046511628, |
| "grad_norm": 0.15846756100654602, |
| "learning_rate": 4.934215713183526e-05, |
| "loss": 0.1904, |
| "step": 2175 |
| }, |
| { |
| "epoch": 1.7642341619887731, |
| "grad_norm": 0.15319091081619263, |
| "learning_rate": 4.203937139685188e-05, |
| "loss": 0.1835, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.7642341619887731, |
| "eval_loss": 0.20121867954730988, |
| "eval_runtime": 4.2209, |
| "eval_samples_per_second": 24.64, |
| "eval_steps_per_second": 0.474, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.7842822774659182, |
| "grad_norm": 0.14038674533367157, |
| "learning_rate": 3.529752754212767e-05, |
| "loss": 0.1886, |
| "step": 2225 |
| }, |
| { |
| "epoch": 1.8043303929430632, |
| "grad_norm": 0.13600395619869232, |
| "learning_rate": 2.9124883444720253e-05, |
| "loss": 0.186, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.8243785084202084, |
| "grad_norm": 0.1356409341096878, |
| "learning_rate": 2.3528999786421755e-05, |
| "loss": 0.183, |
| "step": 2275 |
| }, |
| { |
| "epoch": 1.8444266238973537, |
| "grad_norm": 0.14992156624794006, |
| "learning_rate": 1.851673079291216e-05, |
| "loss": 0.1738, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.8444266238973537, |
| "eval_loss": 0.20082467794418335, |
| "eval_runtime": 4.2146, |
| "eval_samples_per_second": 24.676, |
| "eval_steps_per_second": 0.475, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.864474739374499, |
| "grad_norm": 0.1463785320520401, |
| "learning_rate": 1.4094215838229174e-05, |
| "loss": 0.1982, |
| "step": 2325 |
| }, |
| { |
| "epoch": 1.884522854851644, |
| "grad_norm": 0.15394070744514465, |
| "learning_rate": 1.0266871924838216e-05, |
| "loss": 0.1887, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.904570970328789, |
| "grad_norm": 0.14152726531028748, |
| "learning_rate": 7.03938704851248e-06, |
| "loss": 0.1783, |
| "step": 2375 |
| }, |
| { |
| "epoch": 1.9246190858059342, |
| "grad_norm": 0.1539337933063507, |
| "learning_rate": 4.415714456151243e-06, |
| "loss": 0.182, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.9246190858059342, |
| "eval_loss": 0.20044730603694916, |
| "eval_runtime": 2.0109, |
| "eval_samples_per_second": 51.717, |
| "eval_steps_per_second": 0.995, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.9446672012830795, |
| "grad_norm": 0.13150149583816528, |
| "learning_rate": 2.3990678035694656e-06, |
| "loss": 0.1947, |
| "step": 2425 |
| }, |
| { |
| "epoch": 1.9647153167602245, |
| "grad_norm": 0.14985321462154388, |
| "learning_rate": 9.919172191896753e-07, |
| "loss": 0.1947, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.9847634322373697, |
| "grad_norm": 0.1531253457069397, |
| "learning_rate": 1.9598627845779372e-07, |
| "loss": 0.1918, |
| "step": 2475 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 2494, |
| "total_flos": 1.6216560612723917e+18, |
| "train_loss": 0.23998703379198943, |
| "train_runtime": 3791.0396, |
| "train_samples_per_second": 21.048, |
| "train_steps_per_second": 0.658 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 2494, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 0, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6216560612723917e+18, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|