{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990680335507922, "eval_steps": 500, "global_step": 268, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01863932898415657, "grad_norm": 4.758676978183845, "learning_rate": 1.785714285714286e-05, "loss": 0.787, "num_tokens": 10465392.0, "step": 5 }, { "epoch": 0.03727865796831314, "grad_norm": 1.6688141341980978, "learning_rate": 3.571428571428572e-05, "loss": 0.6818, "num_tokens": 20951152.0, "step": 10 }, { "epoch": 0.05591798695246971, "grad_norm": 1.5315485414502277, "learning_rate": 4.999827900623038e-05, "loss": 0.5877, "num_tokens": 31436912.0, "step": 15 }, { "epoch": 0.07455731593662628, "grad_norm": 1.1907406620730119, "learning_rate": 4.993807186343243e-05, "loss": 0.5446, "num_tokens": 41909269.0, "step": 20 }, { "epoch": 0.09319664492078285, "grad_norm": 0.7640819097303133, "learning_rate": 4.979207812402531e-05, "loss": 0.5249, "num_tokens": 52353719.0, "step": 25 }, { "epoch": 0.11183597390493942, "grad_norm": 0.7051975732110988, "learning_rate": 4.956085596012407e-05, "loss": 0.5082, "num_tokens": 62792124.0, "step": 30 }, { "epoch": 0.13047530288909598, "grad_norm": 0.5494431157725719, "learning_rate": 4.924528939432311e-05, "loss": 0.4926, "num_tokens": 73260442.0, "step": 35 }, { "epoch": 0.14911463187325255, "grad_norm": 0.4434538386895568, "learning_rate": 4.884658491984735e-05, "loss": 0.4775, "num_tokens": 83727556.0, "step": 40 }, { "epoch": 0.16775396085740912, "grad_norm": 0.36186373302288577, "learning_rate": 4.8366266887814235e-05, "loss": 0.4756, "num_tokens": 94201938.0, "step": 45 }, { "epoch": 0.1863932898415657, "grad_norm": 0.4278036185398598, "learning_rate": 4.780617167924209e-05, "loss": 0.4688, "num_tokens": 104670740.0, "step": 50 }, { "epoch": 0.20503261882572227, "grad_norm": 0.49396611069124025, "learning_rate": 4.716844068408693e-05, "loss": 0.4707, "num_tokens": 115151933.0, "step": 55 }, { "epoch": 0.22367194780987884, "grad_norm": 0.7061064770433652, "learning_rate": 4.6455512114150546e-05, "loss": 0.4636, "num_tokens": 125607410.0, "step": 60 }, { "epoch": 0.2423112767940354, "grad_norm": 0.5058313809026105, "learning_rate": 4.5670111681161296e-05, "loss": 0.4622, "num_tokens": 136036466.0, "step": 65 }, { "epoch": 0.26095060577819196, "grad_norm": 0.4448833534200374, "learning_rate": 4.481524217566783e-05, "loss": 0.4584, "num_tokens": 146502251.0, "step": 70 }, { "epoch": 0.27958993476234856, "grad_norm": 0.4081315243864177, "learning_rate": 4.3894171986588217e-05, "loss": 0.456, "num_tokens": 156977308.0, "step": 75 }, { "epoch": 0.2982292637465051, "grad_norm": 0.31799801669428307, "learning_rate": 4.29104226053073e-05, "loss": 0.451, "num_tokens": 167423794.0, "step": 80 }, { "epoch": 0.3168685927306617, "grad_norm": 0.25130630836972606, "learning_rate": 4.186775516209732e-05, "loss": 0.445, "num_tokens": 177900466.0, "step": 85 }, { "epoch": 0.33550792171481825, "grad_norm": 0.3766061697367993, "learning_rate": 4.077015604633669e-05, "loss": 0.4479, "num_tokens": 188373813.0, "step": 90 }, { "epoch": 0.35414725069897485, "grad_norm": 0.3619897065914863, "learning_rate": 3.962182166550441e-05, "loss": 0.4462, "num_tokens": 198859382.0, "step": 95 }, { "epoch": 0.3727865796831314, "grad_norm": 0.34068435938936203, "learning_rate": 3.8427142401220634e-05, "loss": 0.4441, "num_tokens": 209345142.0, "step": 100 }, { "epoch": 0.391425908667288, "grad_norm": 0.3381570974545347, "learning_rate": 3.71906858236735e-05, "loss": 0.4428, "num_tokens": 219820636.0, "step": 105 }, { "epoch": 0.41006523765144454, "grad_norm": 0.34544406185138454, "learning_rate": 3.591717922860785e-05, "loss": 0.4371, "num_tokens": 230306396.0, "step": 110 }, { "epoch": 0.42870456663560114, "grad_norm": 0.2955967687161345, "learning_rate": 3.46114915636416e-05, "loss": 0.4399, "num_tokens": 240792156.0, "step": 115 }, { "epoch": 0.4473438956197577, "grad_norm": 0.2952519756856542, "learning_rate": 3.3278614813010034e-05, "loss": 0.4377, "num_tokens": 251259365.0, "step": 120 }, { "epoch": 0.4659832246039143, "grad_norm": 0.33987091063444214, "learning_rate": 3.1923644911909e-05, "loss": 0.4409, "num_tokens": 261736444.0, "step": 125 }, { "epoch": 0.4846225535880708, "grad_norm": 0.2945130622298854, "learning_rate": 3.0551762263406576e-05, "loss": 0.4393, "num_tokens": 272205866.0, "step": 130 }, { "epoch": 0.5032618825722274, "grad_norm": 0.3070511037477627, "learning_rate": 2.9168211932412042e-05, "loss": 0.4361, "num_tokens": 282683327.0, "step": 135 }, { "epoch": 0.5219012115563839, "grad_norm": 0.31462438644621554, "learning_rate": 2.777828359242567e-05, "loss": 0.4378, "num_tokens": 293122806.0, "step": 140 }, { "epoch": 0.5405405405405406, "grad_norm": 0.28655205481310253, "learning_rate": 2.6387291301738377e-05, "loss": 0.4372, "num_tokens": 303598947.0, "step": 145 }, { "epoch": 0.5591798695246971, "grad_norm": 0.2732532198306148, "learning_rate": 2.50005531864019e-05, "loss": 0.435, "num_tokens": 314077641.0, "step": 150 }, { "epoch": 0.5778191985088537, "grad_norm": 0.2715460467250568, "learning_rate": 2.362337110764688e-05, "loss": 0.4273, "num_tokens": 324545386.0, "step": 155 }, { "epoch": 0.5964585274930102, "grad_norm": 0.25285042415640246, "learning_rate": 2.226101039148557e-05, "loss": 0.4367, "num_tokens": 335031146.0, "step": 160 }, { "epoch": 0.6150978564771669, "grad_norm": 0.2149192519078472, "learning_rate": 2.0918679697998252e-05, "loss": 0.4212, "num_tokens": 345504904.0, "step": 165 }, { "epoch": 0.6337371854613234, "grad_norm": 0.21785863501676977, "learning_rate": 1.9601511107268255e-05, "loss": 0.4284, "num_tokens": 355956520.0, "step": 170 }, { "epoch": 0.65237651444548, "grad_norm": 0.23310475937674308, "learning_rate": 1.8314540498102216e-05, "loss": 0.4285, "num_tokens": 366434348.0, "step": 175 }, { "epoch": 0.6710158434296365, "grad_norm": 0.20067515546839765, "learning_rate": 1.7062688294552992e-05, "loss": 0.4313, "num_tokens": 376908695.0, "step": 180 }, { "epoch": 0.6896551724137931, "grad_norm": 0.2271355571528628, "learning_rate": 1.5850740653856096e-05, "loss": 0.4325, "num_tokens": 387384301.0, "step": 185 }, { "epoch": 0.7082945013979497, "grad_norm": 0.2094480953288933, "learning_rate": 1.4683331167703218e-05, "loss": 0.4251, "num_tokens": 397835323.0, "step": 190 }, { "epoch": 0.7269338303821062, "grad_norm": 0.18502938113842718, "learning_rate": 1.356492314681356e-05, "loss": 0.4243, "num_tokens": 408298322.0, "step": 195 }, { "epoch": 0.7455731593662628, "grad_norm": 0.18093742204905666, "learning_rate": 1.2499792556533716e-05, "loss": 0.429, "num_tokens": 418781566.0, "step": 200 }, { "epoch": 0.7642124883504194, "grad_norm": 0.1807954132426552, "learning_rate": 1.1492011668707753e-05, "loss": 0.4253, "num_tokens": 429246222.0, "step": 205 }, { "epoch": 0.782851817334576, "grad_norm": 0.17659671150040587, "learning_rate": 1.0545433492320603e-05, "loss": 0.4201, "num_tokens": 439717644.0, "step": 210 }, { "epoch": 0.8014911463187325, "grad_norm": 0.18699345886220464, "learning_rate": 9.663677042440537e-06, "loss": 0.4274, "num_tokens": 450170885.0, "step": 215 }, { "epoch": 0.8201304753028891, "grad_norm": 0.17996961140043713, "learning_rate": 8.850113503781367e-06, "loss": 0.4245, "num_tokens": 460656645.0, "step": 220 }, { "epoch": 0.8387698042870456, "grad_norm": 0.19087861268470965, "learning_rate": 8.107853341784671e-06, "loss": 0.4184, "num_tokens": 471142405.0, "step": 225 }, { "epoch": 0.8574091332712023, "grad_norm": 0.1690921442521712, "learning_rate": 7.439734410499752e-06, "loss": 0.4212, "num_tokens": 481616409.0, "step": 230 }, { "epoch": 0.8760484622553588, "grad_norm": 0.18989008366244767, "learning_rate": 6.848311102728011e-06, "loss": 0.4211, "num_tokens": 492097674.0, "step": 235 }, { "epoch": 0.8946877912395154, "grad_norm": 0.16510657640464366, "learning_rate": 6.335844583913515e-06, "loss": 0.4188, "num_tokens": 502555042.0, "step": 240 }, { "epoch": 0.9133271202236719, "grad_norm": 0.17123253579463338, "learning_rate": 5.904294147118193e-06, "loss": 0.4171, "num_tokens": 513031521.0, "step": 245 }, { "epoch": 0.9319664492078286, "grad_norm": 0.17312646330002712, "learning_rate": 5.555309722133842e-06, "loss": 0.4237, "num_tokens": 523517281.0, "step": 250 }, { "epoch": 0.9506057781919851, "grad_norm": 0.1755636687629801, "learning_rate": 5.290225567370509e-06, "loss": 0.4206, "num_tokens": 533998146.0, "step": 255 }, { "epoch": 0.9692451071761417, "grad_norm": 0.1535618058215829, "learning_rate": 5.110055168638854e-06, "loss": 0.4173, "num_tokens": 544470561.0, "step": 260 }, { "epoch": 0.9878844361602982, "grad_norm": 0.15891062616343388, "learning_rate": 5.0154873643297575e-06, "loss": 0.4139, "num_tokens": 554934577.0, "step": 265 }, { "epoch": 0.9990680335507922, "num_tokens": 561226033.0, "step": 268, "total_flos": 1.379511412069499e+18, "train_loss": 0.45678649598093174, "train_runtime": 12057.0545, "train_samples_per_second": 2.845, "train_steps_per_second": 0.022 } ], "logging_steps": 5, "max_steps": 268, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.379511412069499e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }