{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3029315960912053, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0016286644951140066, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6388881802558899, "kl": 1.7871618183562532e-05, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 411088.0, "reward": 0.5671041011810303, "reward_std": 0.18222743272781372, "rewards/reward_len/mean": 0.5671040415763855, "rewards/reward_len/std": 0.3448162376880646, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.003257328990228013, "frac_reward_zero_std": 0.03125, "grad_norm": 0.623892605304718, "kl": 0.005197132937610149, "learning_rate": 9.99999272789071e-07, "loss": 0.0, "num_tokens": 822496.0, "reward": 0.5394927263259888, "reward_std": 0.1948508620262146, "rewards/reward_len/mean": 0.5394927263259888, "rewards/reward_len/std": 0.34110620617866516, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.004885993485342019, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6135867834091187, "kl": 0.00516572454944253, "learning_rate": 9.999970911583997e-07, "loss": 0.0, "num_tokens": 1232992.0, "reward": 0.5903396606445312, "reward_std": 0.17986822128295898, "rewards/reward_len/mean": 0.5903396010398865, "rewards/reward_len/std": 0.3291432857513428, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.006514657980456026, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6241327524185181, "kl": 0.00507001206278801, "learning_rate": 9.99993455114332e-07, "loss": 0.0, "num_tokens": 1643680.0, "reward": 0.5089424848556519, "reward_std": 0.19693496823310852, "rewards/reward_len/mean": 0.5089424848556519, "rewards/reward_len/std": 0.3688817322254181, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.008143322475570033, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6683984398841858, "kl": 0.0051913997158408165, "learning_rate": 9.999883646674443e-07, "loss": 0.0, "num_tokens": 2056416.0, "reward": 0.5647279620170593, "reward_std": 0.17909835278987885, "rewards/reward_len/mean": 0.5647279024124146, "rewards/reward_len/std": 0.3105168342590332, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.009771986970684038, "frac_reward_zero_std": 0.0, "grad_norm": 0.6787006855010986, "kl": 0.005202514585107565, "learning_rate": 9.999818198325444e-07, "loss": 0.0, "num_tokens": 2466720.0, "reward": 0.6070664525032043, "reward_std": 0.19356480240821838, "rewards/reward_len/mean": 0.6070663928985596, "rewards/reward_len/std": 0.3106555640697479, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.011400651465798045, "frac_reward_zero_std": 0.0, "grad_norm": 0.6801379919052124, "kl": 0.005148593336343765, "learning_rate": 9.999738206286698e-07, "loss": 0.0, "num_tokens": 2879024.0, "reward": 0.6005317568778992, "reward_std": 0.19208060204982758, "rewards/reward_len/mean": 0.600531816482544, "rewards/reward_len/std": 0.28792381286621094, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.013029315960912053, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6590358018875122, "kl": 0.0051302313804626465, "learning_rate": 9.999643670790893e-07, "loss": 0.0, "num_tokens": 3290240.0, "reward": 0.5454612970352173, "reward_std": 0.18492072820663452, "rewards/reward_len/mean": 0.5454612970352173, "rewards/reward_len/std": 0.29107969999313354, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.014657980456026058, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6210021376609802, "kl": 0.005243225954473019, "learning_rate": 9.999534592113015e-07, "loss": 0.0, "num_tokens": 3700944.0, "reward": 0.5942022204399109, "reward_std": 0.15317894518375397, "rewards/reward_len/mean": 0.5942021608352661, "rewards/reward_len/std": 0.3289192020893097, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.016286644951140065, "frac_reward_zero_std": 0.03125, "grad_norm": 0.649819552898407, "kl": 0.005184438545256853, "learning_rate": 9.999410970570357e-07, "loss": 0.0, "num_tokens": 4111632.0, "reward": 0.5463387966156006, "reward_std": 0.19288502633571625, "rewards/reward_len/mean": 0.5463387966156006, "rewards/reward_len/std": 0.3073326349258423, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.017915309446254073, "frac_reward_zero_std": 0.0, "grad_norm": 0.6547185182571411, "kl": 0.005140652880072594, "learning_rate": 9.999272806522516e-07, "loss": 0.0, "num_tokens": 4522320.0, "reward": 0.6038432121276855, "reward_std": 0.2034873366355896, "rewards/reward_len/mean": 0.6038432121276855, "rewards/reward_len/std": 0.34045812487602234, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.019543973941368076, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6195882558822632, "kl": 0.005145237315446138, "learning_rate": 9.99912010037139e-07, "loss": 0.0, "num_tokens": 4931568.0, "reward": 0.6099924445152283, "reward_std": 0.17823410034179688, "rewards/reward_len/mean": 0.6099924445152283, "rewards/reward_len/std": 0.29560211300849915, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.021172638436482084, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6300224661827087, "kl": 0.005034224130213261, "learning_rate": 9.998952852561175e-07, "loss": 0.0, "num_tokens": 5342512.0, "reward": 0.655123770236969, "reward_std": 0.1695661097764969, "rewards/reward_len/mean": 0.6551237106323242, "rewards/reward_len/std": 0.30493563413619995, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02280130293159609, "frac_reward_zero_std": 0.0, "grad_norm": 0.6945013999938965, "kl": 0.00521053234115243, "learning_rate": 9.99877106357837e-07, "loss": 0.0, "num_tokens": 5754576.0, "reward": 0.5722415447235107, "reward_std": 0.21053053438663483, "rewards/reward_len/mean": 0.572241485118866, "rewards/reward_len/std": 0.3097406327724457, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.024429967426710098, "frac_reward_zero_std": 0.0, "grad_norm": 0.6892687082290649, "kl": 0.005074051208794117, "learning_rate": 9.998574733951772e-07, "loss": 0.0, "num_tokens": 6165648.0, "reward": 0.6249672174453735, "reward_std": 0.2104693353176117, "rewards/reward_len/mean": 0.6249672174453735, "rewards/reward_len/std": 0.3120920658111572, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.026058631921824105, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6226585507392883, "kl": 0.0051437318325042725, "learning_rate": 9.998363864252474e-07, "loss": 0.0, "num_tokens": 6576400.0, "reward": 0.489818811416626, "reward_std": 0.19308173656463623, "rewards/reward_len/mean": 0.48981884121894836, "rewards/reward_len/std": 0.33459267020225525, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02768729641693811, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6398124694824219, "kl": 0.00530262291431427, "learning_rate": 9.998138455093858e-07, "loss": 0.0, "num_tokens": 6987616.0, "reward": 0.600212574005127, "reward_std": 0.22005778551101685, "rewards/reward_len/mean": 0.600212574005127, "rewards/reward_len/std": 0.3237009346485138, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.029315960912052116, "frac_reward_zero_std": 0.0, "grad_norm": 0.6588078141212463, "kl": 0.005185543093830347, "learning_rate": 9.99789850713161e-07, "loss": 0.0, "num_tokens": 7398384.0, "reward": 0.5867031812667847, "reward_std": 0.2103123962879181, "rewards/reward_len/mean": 0.5867031812667847, "rewards/reward_len/std": 0.3099241554737091, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.030944625407166124, "frac_reward_zero_std": 0.0, "grad_norm": 0.6596400737762451, "kl": 0.005167409311980009, "learning_rate": 9.997644021063695e-07, "loss": 0.0, "num_tokens": 7809024.0, "reward": 0.5700578093528748, "reward_std": 0.21379604935646057, "rewards/reward_len/mean": 0.5700578093528748, "rewards/reward_len/std": 0.3059803247451782, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03257328990228013, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6208882927894592, "kl": 0.005302839912474155, "learning_rate": 9.99737499763038e-07, "loss": 0.0, "num_tokens": 8219904.0, "reward": 0.5609175562858582, "reward_std": 0.17699500918388367, "rewards/reward_len/mean": 0.5609175562858582, "rewards/reward_len/std": 0.3058301508426666, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03420195439739414, "frac_reward_zero_std": 0.0, "grad_norm": 0.6213905811309814, "kl": 0.005111793056130409, "learning_rate": 9.997091437614207e-07, "loss": 0.0, "num_tokens": 8631440.0, "reward": 0.623421311378479, "reward_std": 0.17560169100761414, "rewards/reward_len/mean": 0.623421311378479, "rewards/reward_len/std": 0.29332488775253296, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.035830618892508145, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6540530920028687, "kl": 0.00508685689419508, "learning_rate": 9.99679334184001e-07, "loss": 0.0, "num_tokens": 9042096.0, "reward": 0.5416594743728638, "reward_std": 0.20462509989738464, "rewards/reward_len/mean": 0.5416594743728638, "rewards/reward_len/std": 0.3358396887779236, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03745928338762215, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6493518948554993, "kl": 0.005144907161593437, "learning_rate": 9.996480711174902e-07, "loss": 0.0, "num_tokens": 9453760.0, "reward": 0.5618683099746704, "reward_std": 0.1791420876979828, "rewards/reward_len/mean": 0.5618683099746704, "rewards/reward_len/std": 0.3174861669540405, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03908794788273615, "frac_reward_zero_std": 0.0, "grad_norm": 0.6516464948654175, "kl": 0.005137883126735687, "learning_rate": 9.996153546528277e-07, "loss": 0.0, "num_tokens": 9864368.0, "reward": 0.6220977306365967, "reward_std": 0.1945955455303192, "rewards/reward_len/mean": 0.6220977306365967, "rewards/reward_len/std": 0.30837756395339966, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04071661237785016, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6572713851928711, "kl": 0.005388367921113968, "learning_rate": 9.995811848851804e-07, "loss": 0.0, "num_tokens": 10275424.0, "reward": 0.6355146169662476, "reward_std": 0.1857178509235382, "rewards/reward_len/mean": 0.6355146765708923, "rewards/reward_len/std": 0.29021725058555603, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04234527687296417, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6290954351425171, "kl": 0.005033119115978479, "learning_rate": 9.995455619139434e-07, "loss": 0.0, "num_tokens": 10685824.0, "reward": 0.533816933631897, "reward_std": 0.21252873539924622, "rewards/reward_len/mean": 0.533816933631897, "rewards/reward_len/std": 0.32777971029281616, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.043973941368078175, "frac_reward_zero_std": 0.0, "grad_norm": 0.6108789443969727, "kl": 0.005301167722791433, "learning_rate": 9.995084858427377e-07, "loss": 0.0, "num_tokens": 11096912.0, "reward": 0.6387288570404053, "reward_std": 0.18565475940704346, "rewards/reward_len/mean": 0.6387288570404053, "rewards/reward_len/std": 0.31867682933807373, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04560260586319218, "frac_reward_zero_std": 0.0, "grad_norm": 0.6766307353973389, "kl": 0.00535220094025135, "learning_rate": 9.994699567794122e-07, "loss": 0.0, "num_tokens": 11508144.0, "reward": 0.5715301036834717, "reward_std": 0.1956774890422821, "rewards/reward_len/mean": 0.5715301036834717, "rewards/reward_len/std": 0.30536895990371704, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04723127035830619, "frac_reward_zero_std": 0.0, "grad_norm": 0.6446996331214905, "kl": 0.005238741170614958, "learning_rate": 9.994299748360416e-07, "loss": 0.0, "num_tokens": 11919168.0, "reward": 0.6088556051254272, "reward_std": 0.20080658793449402, "rewards/reward_len/mean": 0.6088556051254272, "rewards/reward_len/std": 0.2945505380630493, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.048859934853420196, "frac_reward_zero_std": 0.0, "grad_norm": 0.67066490650177, "kl": 0.005199250299483538, "learning_rate": 9.993885401289278e-07, "loss": 0.0, "num_tokens": 12330864.0, "reward": 0.5517983436584473, "reward_std": 0.19759830832481384, "rewards/reward_len/mean": 0.5517982840538025, "rewards/reward_len/std": 0.3242999315261841, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.050488599348534204, "frac_reward_zero_std": 0.0, "grad_norm": 0.6100611090660095, "kl": 0.005241416394710541, "learning_rate": 9.99345652778597e-07, "loss": 0.0, "num_tokens": 12741456.0, "reward": 0.6541539430618286, "reward_std": 0.16861668229103088, "rewards/reward_len/mean": 0.6541539430618286, "rewards/reward_len/std": 0.29960745573043823, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05211726384364821, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6119811534881592, "kl": 0.005168698728084564, "learning_rate": 9.993013129098023e-07, "loss": 0.0, "num_tokens": 13152912.0, "reward": 0.5705752372741699, "reward_std": 0.1890994906425476, "rewards/reward_len/mean": 0.5705752372741699, "rewards/reward_len/std": 0.32708969712257385, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05374592833876222, "frac_reward_zero_std": 0.0, "grad_norm": 0.6430990099906921, "kl": 0.005400124471634626, "learning_rate": 9.992555206515211e-07, "loss": 0.0, "num_tokens": 13563600.0, "reward": 0.5565307140350342, "reward_std": 0.20705029368400574, "rewards/reward_len/mean": 0.5565307140350342, "rewards/reward_len/std": 0.3331342339515686, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05537459283387622, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6542065739631653, "kl": 0.005224845837801695, "learning_rate": 9.992082761369566e-07, "loss": 0.0, "num_tokens": 13973696.0, "reward": 0.5638452768325806, "reward_std": 0.199286550283432, "rewards/reward_len/mean": 0.5638452768325806, "rewards/reward_len/std": 0.3377383351325989, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.057003257328990226, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5807435512542725, "kl": 0.005298827309161425, "learning_rate": 9.99159579503535e-07, "loss": 0.0, "num_tokens": 14384672.0, "reward": 0.6294785737991333, "reward_std": 0.16881000995635986, "rewards/reward_len/mean": 0.6294785737991333, "rewards/reward_len/std": 0.3065549433231354, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05863192182410423, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6144348382949829, "kl": 0.0053063007071614265, "learning_rate": 9.991094308929076e-07, "loss": 0.0, "num_tokens": 14795664.0, "reward": 0.5943644046783447, "reward_std": 0.1856200397014618, "rewards/reward_len/mean": 0.5943643450737, "rewards/reward_len/std": 0.2896549701690674, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06026058631921824, "frac_reward_zero_std": 0.0, "grad_norm": 0.66520756483078, "kl": 0.005278478842228651, "learning_rate": 9.990578304509487e-07, "loss": 0.0, "num_tokens": 15207312.0, "reward": 0.6014511585235596, "reward_std": 0.19643673300743103, "rewards/reward_len/mean": 0.6014510989189148, "rewards/reward_len/std": 0.29476919770240784, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06188925081433225, "frac_reward_zero_std": 0.0, "grad_norm": 0.606436014175415, "kl": 0.005264244507998228, "learning_rate": 9.99004778327756e-07, "loss": 0.0, "num_tokens": 15618272.0, "reward": 0.5898331999778748, "reward_std": 0.1978696584701538, "rewards/reward_len/mean": 0.5898331999778748, "rewards/reward_len/std": 0.3263159394264221, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06351791530944625, "frac_reward_zero_std": 0.0, "grad_norm": 0.6182661652565002, "kl": 0.005343371070921421, "learning_rate": 9.989502746776498e-07, "loss": 0.0, "num_tokens": 16028896.0, "reward": 0.5877838134765625, "reward_std": 0.17416806519031525, "rewards/reward_len/mean": 0.5877838134765625, "rewards/reward_len/std": 0.3184359073638916, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06514657980456026, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6163160800933838, "kl": 0.0053837234154343605, "learning_rate": 9.988943196591726e-07, "loss": 0.0, "num_tokens": 16440528.0, "reward": 0.5869104862213135, "reward_std": 0.16155314445495605, "rewards/reward_len/mean": 0.5869104862213135, "rewards/reward_len/std": 0.2978645861148834, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06677524429967427, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6561365723609924, "kl": 0.005459235981106758, "learning_rate": 9.98836913435089e-07, "loss": 0.0, "num_tokens": 16850688.0, "reward": 0.637675404548645, "reward_std": 0.17939016222953796, "rewards/reward_len/mean": 0.637675404548645, "rewards/reward_len/std": 0.3196991980075836, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06840390879478828, "frac_reward_zero_std": 0.03125, "grad_norm": 0.606351375579834, "kl": 0.0054919105023145676, "learning_rate": 9.987780561723846e-07, "loss": 0.0, "num_tokens": 17260304.0, "reward": 0.5718275904655457, "reward_std": 0.18773077428340912, "rewards/reward_len/mean": 0.5718275904655457, "rewards/reward_len/std": 0.30910930037498474, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07003257328990228, "frac_reward_zero_std": 0.03125, "grad_norm": 0.638601541519165, "kl": 0.005394000560045242, "learning_rate": 9.987177480422662e-07, "loss": 0.0, "num_tokens": 17671728.0, "reward": 0.6531549096107483, "reward_std": 0.20121023058891296, "rewards/reward_len/mean": 0.6531549096107483, "rewards/reward_len/std": 0.2955949604511261, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07166123778501629, "frac_reward_zero_std": 0.0, "grad_norm": 0.6864820122718811, "kl": 0.005484000779688358, "learning_rate": 9.986559892201606e-07, "loss": 0.0, "num_tokens": 18082144.0, "reward": 0.5483388304710388, "reward_std": 0.18311038613319397, "rewards/reward_len/mean": 0.5483388900756836, "rewards/reward_len/std": 0.330456018447876, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0732899022801303, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6410079598426819, "kl": 0.005553834140300751, "learning_rate": 9.985927798857142e-07, "loss": 0.0, "num_tokens": 18493056.0, "reward": 0.5791505575180054, "reward_std": 0.20317557454109192, "rewards/reward_len/mean": 0.5791505575180054, "rewards/reward_len/std": 0.3126733601093292, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0749185667752443, "frac_reward_zero_std": 0.0625, "grad_norm": 0.620506763458252, "kl": 0.005274210590869188, "learning_rate": 9.985281202227936e-07, "loss": 0.0, "num_tokens": 18903888.0, "reward": 0.5840516090393066, "reward_std": 0.15629419684410095, "rewards/reward_len/mean": 0.5840516090393066, "rewards/reward_len/std": 0.36693575978279114, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07654723127035831, "frac_reward_zero_std": 0.0, "grad_norm": 0.6485204696655273, "kl": 0.0055670845322310925, "learning_rate": 9.984620104194832e-07, "loss": 0.0, "num_tokens": 19314720.0, "reward": 0.5956484079360962, "reward_std": 0.18044912815093994, "rewards/reward_len/mean": 0.5956484079360962, "rewards/reward_len/std": 0.29261091351509094, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0781758957654723, "frac_reward_zero_std": 0.09375, "grad_norm": 0.6602634191513062, "kl": 0.005656009539961815, "learning_rate": 9.983944506680865e-07, "loss": 0.0, "num_tokens": 19724736.0, "reward": 0.54852694272995, "reward_std": 0.17476065456867218, "rewards/reward_len/mean": 0.54852694272995, "rewards/reward_len/std": 0.34440019726753235, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07980456026058631, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6577922701835632, "kl": 0.0054042828269302845, "learning_rate": 9.98325441165124e-07, "loss": 0.0, "num_tokens": 20135280.0, "reward": 0.6323337554931641, "reward_std": 0.17598050832748413, "rewards/reward_len/mean": 0.6323337554931641, "rewards/reward_len/std": 0.31158560514450073, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08143322475570032, "frac_reward_zero_std": 0.0, "grad_norm": 0.6356520056724548, "kl": 0.005586713552474976, "learning_rate": 9.982549821113338e-07, "loss": 0.0, "num_tokens": 20546080.0, "reward": 0.6337567567825317, "reward_std": 0.22050711512565613, "rewards/reward_len/mean": 0.6337567567825317, "rewards/reward_len/std": 0.30196985602378845, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08306188925081433, "frac_reward_zero_std": 0.0, "grad_norm": 0.6533176898956299, "kl": 0.005806599743664265, "learning_rate": 9.9818307371167e-07, "loss": 0.0, "num_tokens": 20956512.0, "reward": 0.631179928779602, "reward_std": 0.17045967280864716, "rewards/reward_len/mean": 0.631179928779602, "rewards/reward_len/std": 0.3162584602832794, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08469055374592833, "frac_reward_zero_std": 0.0, "grad_norm": 0.6472312808036804, "kl": 0.005578854586929083, "learning_rate": 9.981097161753031e-07, "loss": 0.0, "num_tokens": 21366816.0, "reward": 0.6208138465881348, "reward_std": 0.18577434122562408, "rewards/reward_len/mean": 0.6208138465881348, "rewards/reward_len/std": 0.30877581238746643, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08631921824104234, "frac_reward_zero_std": 0.03125, "grad_norm": 0.625739574432373, "kl": 0.005654506850987673, "learning_rate": 9.980349097156184e-07, "loss": 0.0, "num_tokens": 21777744.0, "reward": 0.5659366250038147, "reward_std": 0.2011183649301529, "rewards/reward_len/mean": 0.5659365653991699, "rewards/reward_len/std": 0.3204178810119629, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08794788273615635, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6167154312133789, "kl": 0.005609589628875256, "learning_rate": 9.979586545502167e-07, "loss": 0.0, "num_tokens": 22188784.0, "reward": 0.6096683740615845, "reward_std": 0.1614975482225418, "rewards/reward_len/mean": 0.6096683740615845, "rewards/reward_len/std": 0.3292374908924103, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08957654723127036, "frac_reward_zero_std": 0.0, "grad_norm": 0.6277379393577576, "kl": 0.005567457992583513, "learning_rate": 9.97880950900912e-07, "loss": 0.0, "num_tokens": 22599600.0, "reward": 0.6692061424255371, "reward_std": 0.16650015115737915, "rewards/reward_len/mean": 0.6692060232162476, "rewards/reward_len/std": 0.2767939865589142, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09120521172638436, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6139224171638489, "kl": 0.005637299735099077, "learning_rate": 9.97801798993732e-07, "loss": 0.0, "num_tokens": 23011120.0, "reward": 0.5637732744216919, "reward_std": 0.2037193775177002, "rewards/reward_len/mean": 0.5637732744216919, "rewards/reward_len/std": 0.34039995074272156, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09283387622149837, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6111644506454468, "kl": 0.0056121209636330605, "learning_rate": 9.977211990589176e-07, "loss": 0.0, "num_tokens": 23422272.0, "reward": 0.626968502998352, "reward_std": 0.1974426656961441, "rewards/reward_len/mean": 0.626968502998352, "rewards/reward_len/std": 0.29587894678115845, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09446254071661238, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6233235001564026, "kl": 0.005783131346106529, "learning_rate": 9.97639151330921e-07, "loss": 0.0, "num_tokens": 23832592.0, "reward": 0.6160591244697571, "reward_std": 0.18513397872447968, "rewards/reward_len/mean": 0.6160591244697571, "rewards/reward_len/std": 0.33309534192085266, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09609120521172639, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6344835162162781, "kl": 0.005805442109704018, "learning_rate": 9.975556560484067e-07, "loss": 0.0, "num_tokens": 24243008.0, "reward": 0.6250290870666504, "reward_std": 0.1746916025876999, "rewards/reward_len/mean": 0.6250290870666504, "rewards/reward_len/std": 0.3134050965309143, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09771986970684039, "frac_reward_zero_std": 0.0, "grad_norm": 0.6519166231155396, "kl": 0.005610979162156582, "learning_rate": 9.974707134542489e-07, "loss": 0.0, "num_tokens": 24653024.0, "reward": 0.6508070230484009, "reward_std": 0.20009669661521912, "rewards/reward_len/mean": 0.6508070230484009, "rewards/reward_len/std": 0.3037513494491577, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0993485342019544, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6505119204521179, "kl": 0.005777844227850437, "learning_rate": 9.973843237955327e-07, "loss": 0.0, "num_tokens": 25063120.0, "reward": 0.508367657661438, "reward_std": 0.19143624603748322, "rewards/reward_len/mean": 0.508367657661438, "rewards/reward_len/std": 0.31634268164634705, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10097719869706841, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6800310611724854, "kl": 0.005794803611934185, "learning_rate": 9.972964873235518e-07, "loss": 0.0, "num_tokens": 25474304.0, "reward": 0.5367823839187622, "reward_std": 0.18182450532913208, "rewards/reward_len/mean": 0.5367823839187622, "rewards/reward_len/std": 0.34293103218078613, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10260586319218241, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6375138163566589, "kl": 0.005953638814389706, "learning_rate": 9.97207204293809e-07, "loss": 0.0, "num_tokens": 25886448.0, "reward": 0.6494389772415161, "reward_std": 0.18243540823459625, "rewards/reward_len/mean": 0.6494389772415161, "rewards/reward_len/std": 0.30349692702293396, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10423452768729642, "frac_reward_zero_std": 0.0, "grad_norm": 0.6679726243019104, "kl": 0.005816442891955376, "learning_rate": 9.971164749660148e-07, "loss": 0.0, "num_tokens": 26296080.0, "reward": 0.6319320201873779, "reward_std": 0.17729854583740234, "rewards/reward_len/mean": 0.6319320201873779, "rewards/reward_len/std": 0.28863874077796936, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10586319218241043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6467310786247253, "kl": 0.005794237367808819, "learning_rate": 9.970242996040864e-07, "loss": 0.0, "num_tokens": 26706624.0, "reward": 0.5008686184883118, "reward_std": 0.1827966272830963, "rewards/reward_len/mean": 0.5008686184883118, "rewards/reward_len/std": 0.31599587202072144, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10749185667752444, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6153386831283569, "kl": 0.005751739256083965, "learning_rate": 9.969306784761475e-07, "loss": 0.0, "num_tokens": 27117520.0, "reward": 0.5869995355606079, "reward_std": 0.17040212452411652, "rewards/reward_len/mean": 0.5869994759559631, "rewards/reward_len/std": 0.33589106798171997, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10912052117263844, "frac_reward_zero_std": 0.0, "grad_norm": 0.6732746362686157, "kl": 0.005877629388123751, "learning_rate": 9.968356118545275e-07, "loss": 0.0, "num_tokens": 27528016.0, "reward": 0.6169115900993347, "reward_std": 0.196923166513443, "rewards/reward_len/mean": 0.6169115900993347, "rewards/reward_len/std": 0.30990681052207947, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11074918566775244, "frac_reward_zero_std": 0.0, "grad_norm": 0.6400807499885559, "kl": 0.00585567019879818, "learning_rate": 9.967391000157604e-07, "loss": 0.0, "num_tokens": 27938800.0, "reward": 0.633395254611969, "reward_std": 0.18729263544082642, "rewards/reward_len/mean": 0.633395254611969, "rewards/reward_len/std": 0.27505961060523987, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11237785016286644, "frac_reward_zero_std": 0.0, "grad_norm": 0.638647735118866, "kl": 0.006055144593119621, "learning_rate": 9.966411432405838e-07, "loss": 0.0, "num_tokens": 28349536.0, "reward": 0.6088175773620605, "reward_std": 0.17012721300125122, "rewards/reward_len/mean": 0.6088175177574158, "rewards/reward_len/std": 0.311130166053772, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11400651465798045, "frac_reward_zero_std": 0.0, "grad_norm": 0.6302928924560547, "kl": 0.006081017665565014, "learning_rate": 9.965417418139388e-07, "loss": 0.0, "num_tokens": 28760896.0, "reward": 0.6386709809303284, "reward_std": 0.1846430003643036, "rewards/reward_len/mean": 0.6386709213256836, "rewards/reward_len/std": 0.2791380286216736, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11563517915309446, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6027461290359497, "kl": 0.005914159119129181, "learning_rate": 9.964408960249686e-07, "loss": 0.0, "num_tokens": 29171600.0, "reward": 0.6008121967315674, "reward_std": 0.19992570579051971, "rewards/reward_len/mean": 0.6008121967315674, "rewards/reward_len/std": 0.3140392303466797, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11726384364820847, "frac_reward_zero_std": 0.0, "grad_norm": 0.6772151589393616, "kl": 0.0060426113195717335, "learning_rate": 9.96338606167018e-07, "loss": 0.0, "num_tokens": 29582144.0, "reward": 0.5798563361167908, "reward_std": 0.21490497887134552, "rewards/reward_len/mean": 0.5798563361167908, "rewards/reward_len/std": 0.3266470730304718, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11889250814332247, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6127480268478394, "kl": 0.006025280337780714, "learning_rate": 9.962348725376317e-07, "loss": 0.0, "num_tokens": 29994160.0, "reward": 0.53508460521698, "reward_std": 0.15128302574157715, "rewards/reward_len/mean": 0.53508460521698, "rewards/reward_len/std": 0.3344893157482147, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12052117263843648, "frac_reward_zero_std": 0.0, "grad_norm": 0.6420020461082458, "kl": 0.005925798788666725, "learning_rate": 9.96129695438555e-07, "loss": 0.0, "num_tokens": 30404944.0, "reward": 0.6764165163040161, "reward_std": 0.1624477058649063, "rewards/reward_len/mean": 0.6764165163040161, "rewards/reward_len/std": 0.2854645252227783, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12214983713355049, "frac_reward_zero_std": 0.0, "grad_norm": 0.6267188191413879, "kl": 0.006195234600454569, "learning_rate": 9.960230751757316e-07, "loss": 0.0, "num_tokens": 30815040.0, "reward": 0.5696959495544434, "reward_std": 0.1753118336200714, "rewards/reward_len/mean": 0.5696959495544434, "rewards/reward_len/std": 0.3388904631137848, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1237785016286645, "frac_reward_zero_std": 0.0, "grad_norm": 0.6991639137268066, "kl": 0.006288875825703144, "learning_rate": 9.959150120593034e-07, "loss": 0.0, "num_tokens": 31225760.0, "reward": 0.6297193765640259, "reward_std": 0.2083851397037506, "rewards/reward_len/mean": 0.6297193765640259, "rewards/reward_len/std": 0.31694766879081726, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1254071661237785, "frac_reward_zero_std": 0.0, "grad_norm": 0.6712898015975952, "kl": 0.006047003902494907, "learning_rate": 9.958055064036088e-07, "loss": 0.0, "num_tokens": 31636672.0, "reward": 0.70964115858078, "reward_std": 0.1842394769191742, "rewards/reward_len/mean": 0.70964115858078, "rewards/reward_len/std": 0.27193719148635864, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1270358306188925, "frac_reward_zero_std": 0.0, "grad_norm": 0.6062783002853394, "kl": 0.006245201453566551, "learning_rate": 9.956945585271827e-07, "loss": 0.0, "num_tokens": 32048160.0, "reward": 0.6685727834701538, "reward_std": 0.17213179171085358, "rewards/reward_len/mean": 0.6685727834701538, "rewards/reward_len/std": 0.2768580913543701, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12866449511400652, "frac_reward_zero_std": 0.0, "grad_norm": 0.6471588611602783, "kl": 0.006264387629926205, "learning_rate": 9.955821687527552e-07, "loss": 0.0, "num_tokens": 32458016.0, "reward": 0.5437395572662354, "reward_std": 0.18083874881267548, "rewards/reward_len/mean": 0.5437394976615906, "rewards/reward_len/std": 0.32849904894828796, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.13029315960912052, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6295440793037415, "kl": 0.006333072669804096, "learning_rate": 9.954683374072505e-07, "loss": 0.0, "num_tokens": 32869088.0, "reward": 0.5611273050308228, "reward_std": 0.1822039633989334, "rewards/reward_len/mean": 0.5611273050308228, "rewards/reward_len/std": 0.3136206269264221, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.13192182410423453, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6564952731132507, "kl": 0.006282507907599211, "learning_rate": 9.953530648217863e-07, "loss": 0.0, "num_tokens": 33280800.0, "reward": 0.5820149183273315, "reward_std": 0.1949148178100586, "rewards/reward_len/mean": 0.5820149183273315, "rewards/reward_len/std": 0.3016170561313629, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.13355048859934854, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6637318730354309, "kl": 0.006372909061610699, "learning_rate": 9.952363513316725e-07, "loss": 0.0, "num_tokens": 33692928.0, "reward": 0.5514626502990723, "reward_std": 0.1880742311477661, "rewards/reward_len/mean": 0.5514626502990723, "rewards/reward_len/std": 0.34031620621681213, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.13517915309446255, "frac_reward_zero_std": 0.0, "grad_norm": 0.6077294945716858, "kl": 0.006352216936647892, "learning_rate": 9.951181972764104e-07, "loss": 0.0, "num_tokens": 34103648.0, "reward": 0.5770986080169678, "reward_std": 0.21515893936157227, "rewards/reward_len/mean": 0.5770986080169678, "rewards/reward_len/std": 0.30929288268089294, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.13680781758957655, "frac_reward_zero_std": 0.0, "grad_norm": 0.6008139848709106, "kl": 0.006490306928753853, "learning_rate": 9.949986029996918e-07, "loss": 0.0, "num_tokens": 34514464.0, "reward": 0.5876795053482056, "reward_std": 0.180733323097229, "rewards/reward_len/mean": 0.5876795053482056, "rewards/reward_len/std": 0.2971385717391968, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.13843648208469056, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6295453906059265, "kl": 0.006482935976237059, "learning_rate": 9.948775688493974e-07, "loss": 0.0, "num_tokens": 34926048.0, "reward": 0.6586557626724243, "reward_std": 0.15912435948848724, "rewards/reward_len/mean": 0.6586557626724243, "rewards/reward_len/std": 0.30623695254325867, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.14006514657980457, "frac_reward_zero_std": 0.0, "grad_norm": 0.7001896500587463, "kl": 0.006533653941005468, "learning_rate": 9.94755095177597e-07, "loss": 0.0, "num_tokens": 35336608.0, "reward": 0.6186506748199463, "reward_std": 0.1895766407251358, "rewards/reward_len/mean": 0.6186506152153015, "rewards/reward_len/std": 0.2962113916873932, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.14169381107491857, "frac_reward_zero_std": 0.03125, "grad_norm": 0.652926504611969, "kl": 0.006602640729397535, "learning_rate": 9.94631182340547e-07, "loss": 0.0, "num_tokens": 35747456.0, "reward": 0.5951104164123535, "reward_std": 0.19115304946899414, "rewards/reward_len/mean": 0.5951104164123535, "rewards/reward_len/std": 0.32347673177719116, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.14332247557003258, "frac_reward_zero_std": 0.0, "grad_norm": 0.6153026223182678, "kl": 0.0066271210089325905, "learning_rate": 9.94505830698691e-07, "loss": 0.0, "num_tokens": 36158464.0, "reward": 0.6090462803840637, "reward_std": 0.20194405317306519, "rewards/reward_len/mean": 0.6090462803840637, "rewards/reward_len/std": 0.30463722348213196, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1449511400651466, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6102354526519775, "kl": 0.006770038045942783, "learning_rate": 9.943790406166569e-07, "loss": 0.0, "num_tokens": 36569344.0, "reward": 0.6451303958892822, "reward_std": 0.18065407872200012, "rewards/reward_len/mean": 0.6451303958892822, "rewards/reward_len/std": 0.2856083810329437, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1465798045602606, "frac_reward_zero_std": 0.0, "grad_norm": 0.6278913617134094, "kl": 0.006600166670978069, "learning_rate": 9.942508124632574e-07, "loss": 0.0, "num_tokens": 36980992.0, "reward": 0.645142674446106, "reward_std": 0.1782657504081726, "rewards/reward_len/mean": 0.6451427340507507, "rewards/reward_len/std": 0.29134097695350647, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1482084690553746, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6226783990859985, "kl": 0.006500097457319498, "learning_rate": 9.94121146611488e-07, "loss": 0.0, "num_tokens": 37392640.0, "reward": 0.585680365562439, "reward_std": 0.18780681490898132, "rewards/reward_len/mean": 0.5856803059577942, "rewards/reward_len/std": 0.3021359443664551, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1498371335504886, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6016721129417419, "kl": 0.006767662242054939, "learning_rate": 9.939900434385269e-07, "loss": 0.0, "num_tokens": 37803680.0, "reward": 0.6000661253929138, "reward_std": 0.1934816539287567, "rewards/reward_len/mean": 0.6000661253929138, "rewards/reward_len/std": 0.3280094861984253, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.15146579804560262, "frac_reward_zero_std": 0.0, "grad_norm": 0.6445940732955933, "kl": 0.0068554747849702835, "learning_rate": 9.93857503325732e-07, "loss": 0.0, "num_tokens": 38214816.0, "reward": 0.5607788562774658, "reward_std": 0.1987011432647705, "rewards/reward_len/mean": 0.5607788562774658, "rewards/reward_len/std": 0.31303682923316956, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.15309446254071662, "frac_reward_zero_std": 0.0, "grad_norm": 0.6296261548995972, "kl": 0.007015876471996307, "learning_rate": 9.937235266586424e-07, "loss": 0.0, "num_tokens": 38624816.0, "reward": 0.7118442058563232, "reward_std": 0.1518295556306839, "rewards/reward_len/mean": 0.711844265460968, "rewards/reward_len/std": 0.2718747854232788, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.15472312703583063, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6234762668609619, "kl": 0.006695098243653774, "learning_rate": 9.935881138269749e-07, "loss": 0.0, "num_tokens": 39036256.0, "reward": 0.6483550071716309, "reward_std": 0.1804676055908203, "rewards/reward_len/mean": 0.6483550071716309, "rewards/reward_len/std": 0.33185726404190063, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1563517915309446, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6980348825454712, "kl": 0.006886781193315983, "learning_rate": 9.934512652246245e-07, "loss": 0.0, "num_tokens": 39447136.0, "reward": 0.6074411869049072, "reward_std": 0.19725409150123596, "rewards/reward_len/mean": 0.6074411869049072, "rewards/reward_len/std": 0.32367366552352905, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.15798045602605862, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6031783819198608, "kl": 0.006962233688682318, "learning_rate": 9.933129812496622e-07, "loss": 0.0, "num_tokens": 39858688.0, "reward": 0.6138420104980469, "reward_std": 0.1680690050125122, "rewards/reward_len/mean": 0.6138420104980469, "rewards/reward_len/std": 0.3462577164173126, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.15960912052117263, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6191297769546509, "kl": 0.006897475104779005, "learning_rate": 9.931732623043346e-07, "loss": 0.0, "num_tokens": 40269456.0, "reward": 0.5790231823921204, "reward_std": 0.17291653156280518, "rewards/reward_len/mean": 0.5790231823921204, "rewards/reward_len/std": 0.3232664465904236, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.16123778501628663, "frac_reward_zero_std": 0.0, "grad_norm": 0.6361674666404724, "kl": 0.006988652050495148, "learning_rate": 9.930321087950623e-07, "loss": 0.0, "num_tokens": 40680832.0, "reward": 0.5995804071426392, "reward_std": 0.1811872273683548, "rewards/reward_len/mean": 0.5995804071426392, "rewards/reward_len/std": 0.32679077982902527, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.16286644951140064, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6776547431945801, "kl": 0.007097579538822174, "learning_rate": 9.928895211324385e-07, "loss": 0.0, "num_tokens": 41092160.0, "reward": 0.6182448267936707, "reward_std": 0.16578535735607147, "rewards/reward_len/mean": 0.6182448863983154, "rewards/reward_len/std": 0.2980058789253235, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.16449511400651465, "frac_reward_zero_std": 0.0, "grad_norm": 0.6312792301177979, "kl": 0.006928689777851105, "learning_rate": 9.92745499731229e-07, "loss": 0.0, "num_tokens": 41502848.0, "reward": 0.6401134729385376, "reward_std": 0.17420139908790588, "rewards/reward_len/mean": 0.6401134729385376, "rewards/reward_len/std": 0.2633191645145416, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.16612377850162866, "frac_reward_zero_std": 0.0, "grad_norm": 0.6165812015533447, "kl": 0.0072430698201060295, "learning_rate": 9.92600045010369e-07, "loss": 0.0, "num_tokens": 41914560.0, "reward": 0.6137415766716003, "reward_std": 0.1978541612625122, "rewards/reward_len/mean": 0.6137415766716003, "rewards/reward_len/std": 0.3144802153110504, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.16775244299674266, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6334063410758972, "kl": 0.007197507657110691, "learning_rate": 9.92453157392964e-07, "loss": 0.0, "num_tokens": 42326624.0, "reward": 0.5691064596176147, "reward_std": 0.16554251313209534, "rewards/reward_len/mean": 0.56910640001297, "rewards/reward_len/std": 0.34978583455085754, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.16938110749185667, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6432600617408752, "kl": 0.007399100344628096, "learning_rate": 9.923048373062866e-07, "loss": 0.0, "num_tokens": 42736080.0, "reward": 0.6603549122810364, "reward_std": 0.21011045575141907, "rewards/reward_len/mean": 0.6603549122810364, "rewards/reward_len/std": 0.2799879312515259, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.17100977198697068, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5903961062431335, "kl": 0.007315165363252163, "learning_rate": 9.921550851817773e-07, "loss": 0.0, "num_tokens": 43148112.0, "reward": 0.5021611452102661, "reward_std": 0.18139612674713135, "rewards/reward_len/mean": 0.5021611452102661, "rewards/reward_len/std": 0.3371685743331909, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.17263843648208468, "frac_reward_zero_std": 0.0, "grad_norm": 0.6670516133308411, "kl": 0.007557717151939869, "learning_rate": 9.920039014550412e-07, "loss": 0.0, "num_tokens": 43558048.0, "reward": 0.564050555229187, "reward_std": 0.2051321566104889, "rewards/reward_len/mean": 0.564050555229187, "rewards/reward_len/std": 0.3417951166629791, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1742671009771987, "frac_reward_zero_std": 0.0, "grad_norm": 0.6020849943161011, "kl": 0.007600951008498669, "learning_rate": 9.918512865658482e-07, "loss": 0.0, "num_tokens": 43968960.0, "reward": 0.6772098541259766, "reward_std": 0.1740013062953949, "rewards/reward_len/mean": 0.6772099137306213, "rewards/reward_len/std": 0.3080049753189087, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1758957654723127, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6087831258773804, "kl": 0.00769856758415699, "learning_rate": 9.916972409581314e-07, "loss": 0.0, "num_tokens": 44380368.0, "reward": 0.59903883934021, "reward_std": 0.16960088908672333, "rewards/reward_len/mean": 0.59903883934021, "rewards/reward_len/std": 0.3122100532054901, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1775244299674267, "frac_reward_zero_std": 0.0, "grad_norm": 0.6422587037086487, "kl": 0.007322848774492741, "learning_rate": 9.915417650799853e-07, "loss": 0.0, "num_tokens": 44790544.0, "reward": 0.6461697816848755, "reward_std": 0.202808678150177, "rewards/reward_len/mean": 0.6461697816848755, "rewards/reward_len/std": 0.2867162823677063, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1791530944625407, "frac_reward_zero_std": 0.0, "grad_norm": 0.6052092909812927, "kl": 0.007601277437061071, "learning_rate": 9.913848593836648e-07, "loss": 0.0, "num_tokens": 45201616.0, "reward": 0.5911939144134521, "reward_std": 0.1958751529455185, "rewards/reward_len/mean": 0.5911939144134521, "rewards/reward_len/std": 0.31123557686805725, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.18078175895765472, "frac_reward_zero_std": 0.0, "grad_norm": 0.6361722946166992, "kl": 0.007858152501285076, "learning_rate": 9.912265243255842e-07, "loss": 0.0, "num_tokens": 45612672.0, "reward": 0.6554419994354248, "reward_std": 0.17834407091140747, "rewards/reward_len/mean": 0.6554419994354248, "rewards/reward_len/std": 0.30987924337387085, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.18241042345276873, "frac_reward_zero_std": 0.0, "grad_norm": 0.6548160910606384, "kl": 0.007977334782481194, "learning_rate": 9.910667603663153e-07, "loss": 0.0, "num_tokens": 46024560.0, "reward": 0.5865346193313599, "reward_std": 0.1887112706899643, "rewards/reward_len/mean": 0.5865346193313599, "rewards/reward_len/std": 0.3010815680027008, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.18403908794788273, "frac_reward_zero_std": 0.0, "grad_norm": 0.6236750483512878, "kl": 0.007689779624342918, "learning_rate": 9.909055679705868e-07, "loss": 0.0, "num_tokens": 46435856.0, "reward": 0.6239224672317505, "reward_std": 0.20650212466716766, "rewards/reward_len/mean": 0.6239224076271057, "rewards/reward_len/std": 0.29176461696624756, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.18566775244299674, "frac_reward_zero_std": 0.03125, "grad_norm": 0.643318772315979, "kl": 0.007676412351429462, "learning_rate": 9.907429476072817e-07, "loss": 0.0, "num_tokens": 46846752.0, "reward": 0.5815567970275879, "reward_std": 0.20014160871505737, "rewards/reward_len/mean": 0.5815567970275879, "rewards/reward_len/std": 0.321799635887146, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.18729641693811075, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6088619828224182, "kl": 0.00827750749886036, "learning_rate": 9.905788997494377e-07, "loss": 0.0, "num_tokens": 47256336.0, "reward": 0.6639087796211243, "reward_std": 0.2063475102186203, "rewards/reward_len/mean": 0.6639087796211243, "rewards/reward_len/std": 0.2963838279247284, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.18892508143322476, "frac_reward_zero_std": 0.0, "grad_norm": 0.6200522780418396, "kl": 0.00790533609688282, "learning_rate": 9.90413424874244e-07, "loss": 0.0, "num_tokens": 47667504.0, "reward": 0.589942216873169, "reward_std": 0.17946478724479675, "rewards/reward_len/mean": 0.5899421572685242, "rewards/reward_len/std": 0.303824782371521, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.19055374592833876, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6201689839363098, "kl": 0.008061718195676804, "learning_rate": 9.902465234630413e-07, "loss": 0.0, "num_tokens": 48078544.0, "reward": 0.6075823307037354, "reward_std": 0.1754363477230072, "rewards/reward_len/mean": 0.6075823307037354, "rewards/reward_len/std": 0.32260212302207947, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.19218241042345277, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6108475923538208, "kl": 0.008119920268654823, "learning_rate": 9.900781960013198e-07, "loss": 0.0, "num_tokens": 48488240.0, "reward": 0.6552397012710571, "reward_std": 0.17725440859794617, "rewards/reward_len/mean": 0.6552397012710571, "rewards/reward_len/std": 0.2871791422367096, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.19381107491856678, "frac_reward_zero_std": 0.0, "grad_norm": 0.6389063596725464, "kl": 0.008713550865650177, "learning_rate": 9.899084429787177e-07, "loss": 0.0, "num_tokens": 48898144.0, "reward": 0.632125735282898, "reward_std": 0.20129485428333282, "rewards/reward_len/mean": 0.632125735282898, "rewards/reward_len/std": 0.33194735646247864, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.19543973941368079, "frac_reward_zero_std": 0.0, "grad_norm": 0.6458228230476379, "kl": 0.008068377152085304, "learning_rate": 9.897372648890198e-07, "loss": 0.0, "num_tokens": 49310128.0, "reward": 0.6849644780158997, "reward_std": 0.1770927906036377, "rewards/reward_len/mean": 0.6849644184112549, "rewards/reward_len/std": 0.256657212972641, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1970684039087948, "frac_reward_zero_std": 0.0, "grad_norm": 0.6447536945343018, "kl": 0.00804546382278204, "learning_rate": 9.89564662230157e-07, "loss": 0.0, "num_tokens": 49721152.0, "reward": 0.621749758720398, "reward_std": 0.1881362795829773, "rewards/reward_len/mean": 0.621749758720398, "rewards/reward_len/std": 0.304176390171051, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1986970684039088, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6143417358398438, "kl": 0.008773395791649818, "learning_rate": 9.89390635504203e-07, "loss": 0.0, "num_tokens": 50132800.0, "reward": 0.6066019535064697, "reward_std": 0.16969190537929535, "rewards/reward_len/mean": 0.606601893901825, "rewards/reward_len/std": 0.3336600661277771, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2003257328990228, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6019123792648315, "kl": 0.008725378662347794, "learning_rate": 9.892151852173742e-07, "loss": 0.0, "num_tokens": 50544032.0, "reward": 0.6760438680648804, "reward_std": 0.15929467976093292, "rewards/reward_len/mean": 0.6760438680648804, "rewards/reward_len/std": 0.2833336293697357, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.20195439739413681, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6656358242034912, "kl": 0.00818290188908577, "learning_rate": 9.890383118800284e-07, "loss": 0.0, "num_tokens": 50955136.0, "reward": 0.6127406358718872, "reward_std": 0.16684389114379883, "rewards/reward_len/mean": 0.6127406358718872, "rewards/reward_len/std": 0.3196088373661041, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.20358306188925082, "frac_reward_zero_std": 0.09375, "grad_norm": 0.6187402009963989, "kl": 0.008824816904962063, "learning_rate": 9.888600160066625e-07, "loss": 0.0, "num_tokens": 51365968.0, "reward": 0.58031165599823, "reward_std": 0.18844164907932281, "rewards/reward_len/mean": 0.58031165599823, "rewards/reward_len/std": 0.35089778900146484, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.20521172638436483, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6334829926490784, "kl": 0.008974723517894745, "learning_rate": 9.886802981159112e-07, "loss": 0.0, "num_tokens": 51777120.0, "reward": 0.5420486330986023, "reward_std": 0.1663423478603363, "rewards/reward_len/mean": 0.5420486330986023, "rewards/reward_len/std": 0.34929925203323364, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.20684039087947884, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5904549360275269, "kl": 0.00882301852107048, "learning_rate": 9.884991587305459e-07, "loss": 0.0, "num_tokens": 52188336.0, "reward": 0.6326888799667358, "reward_std": 0.1853259801864624, "rewards/reward_len/mean": 0.6326888799667358, "rewards/reward_len/std": 0.334570050239563, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.20846905537459284, "frac_reward_zero_std": 0.0, "grad_norm": 0.6780713200569153, "kl": 0.008832816034555435, "learning_rate": 9.883165983774726e-07, "loss": 0.0, "num_tokens": 52599472.0, "reward": 0.665580153465271, "reward_std": 0.16355182230472565, "rewards/reward_len/mean": 0.665580153465271, "rewards/reward_len/std": 0.2602837383747101, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.21009771986970685, "frac_reward_zero_std": 0.09375, "grad_norm": 0.608444094657898, "kl": 0.00896189734339714, "learning_rate": 9.881326175877307e-07, "loss": 0.0, "num_tokens": 53010912.0, "reward": 0.6455395221710205, "reward_std": 0.17724718153476715, "rewards/reward_len/mean": 0.6455395221710205, "rewards/reward_len/std": 0.3458082675933838, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.21172638436482086, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6463170051574707, "kl": 0.009073005989193916, "learning_rate": 9.87947216896492e-07, "loss": 0.0, "num_tokens": 53421744.0, "reward": 0.5620007514953613, "reward_std": 0.2030048370361328, "rewards/reward_len/mean": 0.5620007514953613, "rewards/reward_len/std": 0.3591574728488922, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.21335504885993486, "frac_reward_zero_std": 0.0, "grad_norm": 0.6374853253364563, "kl": 0.008822174742817879, "learning_rate": 9.877603968430576e-07, "loss": 0.0, "num_tokens": 53833408.0, "reward": 0.6403973698616028, "reward_std": 0.1922074407339096, "rewards/reward_len/mean": 0.6403974294662476, "rewards/reward_len/std": 0.3219207525253296, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.21498371335504887, "frac_reward_zero_std": 0.0, "grad_norm": 0.6107245683670044, "kl": 0.009435772895812988, "learning_rate": 9.875721579708583e-07, "loss": 0.0, "num_tokens": 54245088.0, "reward": 0.696793794631958, "reward_std": 0.14370962977409363, "rewards/reward_len/mean": 0.696793794631958, "rewards/reward_len/std": 0.2734077274799347, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.21661237785016288, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5707911849021912, "kl": 0.009059187024831772, "learning_rate": 9.873825008274514e-07, "loss": 0.0, "num_tokens": 54655664.0, "reward": 0.6300230026245117, "reward_std": 0.16554559767246246, "rewards/reward_len/mean": 0.6300230026245117, "rewards/reward_len/std": 0.31014660000801086, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2182410423452769, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6054690480232239, "kl": 0.009222352877259254, "learning_rate": 9.871914259645198e-07, "loss": 0.0, "num_tokens": 55066944.0, "reward": 0.5656280517578125, "reward_std": 0.16571784019470215, "rewards/reward_len/mean": 0.5656280517578125, "rewards/reward_len/std": 0.3423321545124054, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.21986970684039087, "frac_reward_zero_std": 0.0, "grad_norm": 0.6033143401145935, "kl": 0.009392160922288895, "learning_rate": 9.869989339378705e-07, "loss": 0.0, "num_tokens": 55478272.0, "reward": 0.686661958694458, "reward_std": 0.18725591897964478, "rewards/reward_len/mean": 0.686661958694458, "rewards/reward_len/std": 0.2908850908279419, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.22149837133550487, "frac_reward_zero_std": 0.0, "grad_norm": 0.5985466837882996, "kl": 0.009702052921056747, "learning_rate": 9.868050253074327e-07, "loss": 0.0, "num_tokens": 55888352.0, "reward": 0.7308776378631592, "reward_std": 0.15523472428321838, "rewards/reward_len/mean": 0.7308776378631592, "rewards/reward_len/std": 0.24497191607952118, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.22312703583061888, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6210593581199646, "kl": 0.009618960320949554, "learning_rate": 9.866097006372564e-07, "loss": 0.0, "num_tokens": 56300624.0, "reward": 0.625930666923523, "reward_std": 0.20019039511680603, "rewards/reward_len/mean": 0.625930666923523, "rewards/reward_len/std": 0.28943905234336853, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2247557003257329, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6044394969940186, "kl": 0.009718693792819977, "learning_rate": 9.864129604955105e-07, "loss": 0.0, "num_tokens": 56711728.0, "reward": 0.6907694339752197, "reward_std": 0.15632738173007965, "rewards/reward_len/mean": 0.6907694339752197, "rewards/reward_len/std": 0.3043997883796692, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2263843648208469, "frac_reward_zero_std": 0.0, "grad_norm": 0.6191521286964417, "kl": 0.009601933881640434, "learning_rate": 9.86214805454481e-07, "loss": 0.0, "num_tokens": 57122976.0, "reward": 0.6476109623908997, "reward_std": 0.20551797747612, "rewards/reward_len/mean": 0.6476109623908997, "rewards/reward_len/std": 0.2880290448665619, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2280130293159609, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6180540323257446, "kl": 0.009662277065217495, "learning_rate": 9.860152360905704e-07, "loss": 0.0, "num_tokens": 57534464.0, "reward": 0.5354117751121521, "reward_std": 0.18517854809761047, "rewards/reward_len/mean": 0.5354117751121521, "rewards/reward_len/std": 0.3297685980796814, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2296416938110749, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6000733375549316, "kl": 0.0098715266212821, "learning_rate": 9.858142529842946e-07, "loss": 0.0, "num_tokens": 57947072.0, "reward": 0.6565795540809631, "reward_std": 0.1564558744430542, "rewards/reward_len/mean": 0.6565795540809631, "rewards/reward_len/std": 0.3178524971008301, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.23127035830618892, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6353108286857605, "kl": 0.010073553770780563, "learning_rate": 9.85611856720282e-07, "loss": 0.0, "num_tokens": 58357808.0, "reward": 0.6599307656288147, "reward_std": 0.19758054614067078, "rewards/reward_len/mean": 0.6599307656288147, "rewards/reward_len/std": 0.297515869140625, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.23289902280130292, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6348649859428406, "kl": 0.010214118286967278, "learning_rate": 9.854080478872719e-07, "loss": 0.0, "num_tokens": 58769216.0, "reward": 0.631576418876648, "reward_std": 0.1462429165840149, "rewards/reward_len/mean": 0.631576418876648, "rewards/reward_len/std": 0.33017945289611816, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.23452768729641693, "frac_reward_zero_std": 0.0, "grad_norm": 0.6653222441673279, "kl": 0.010085981339216232, "learning_rate": 9.85202827078112e-07, "loss": 0.0, "num_tokens": 59180048.0, "reward": 0.6415691375732422, "reward_std": 0.16816698014736176, "rewards/reward_len/mean": 0.6415691375732422, "rewards/reward_len/std": 0.32200145721435547, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.23615635179153094, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6372671723365784, "kl": 0.010778001509606838, "learning_rate": 9.84996194889758e-07, "loss": 0.0, "num_tokens": 59591184.0, "reward": 0.617824912071228, "reward_std": 0.1678849160671234, "rewards/reward_len/mean": 0.6178249716758728, "rewards/reward_len/std": 0.27699559926986694, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.23778501628664495, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6342593431472778, "kl": 0.010270815342664719, "learning_rate": 9.847881519232703e-07, "loss": 0.0, "num_tokens": 60002224.0, "reward": 0.5836004018783569, "reward_std": 0.16626442968845367, "rewards/reward_len/mean": 0.5836004018783569, "rewards/reward_len/std": 0.3156996965408325, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.23941368078175895, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7259546518325806, "kl": 0.010464068502187729, "learning_rate": 9.845786987838135e-07, "loss": 0.0, "num_tokens": 60413392.0, "reward": 0.6087343692779541, "reward_std": 0.145502969622612, "rewards/reward_len/mean": 0.6087343692779541, "rewards/reward_len/std": 0.32716965675354004, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.24104234527687296, "frac_reward_zero_std": 0.0, "grad_norm": 0.6601978540420532, "kl": 0.010719379410147667, "learning_rate": 9.84367836080654e-07, "loss": 0.0, "num_tokens": 60825920.0, "reward": 0.630610466003418, "reward_std": 0.17687691748142242, "rewards/reward_len/mean": 0.630610466003418, "rewards/reward_len/std": 0.313233882188797, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.24267100977198697, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6246123313903809, "kl": 0.010444914922118187, "learning_rate": 9.841555644271585e-07, "loss": 0.0, "num_tokens": 61237600.0, "reward": 0.5348638296127319, "reward_std": 0.17719021439552307, "rewards/reward_len/mean": 0.5348638296127319, "rewards/reward_len/std": 0.3361101746559143, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.24429967426710097, "frac_reward_zero_std": 0.0, "grad_norm": 0.6354151964187622, "kl": 0.011177919805049896, "learning_rate": 9.83941884440792e-07, "loss": 0.0, "num_tokens": 61648000.0, "reward": 0.6724332571029663, "reward_std": 0.21544736623764038, "rewards/reward_len/mean": 0.6724332571029663, "rewards/reward_len/std": 0.3136581778526306, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.24592833876221498, "frac_reward_zero_std": 0.0, "grad_norm": 0.6094197034835815, "kl": 0.01107245683670044, "learning_rate": 9.837267967431162e-07, "loss": 0.0, "num_tokens": 62058912.0, "reward": 0.6082909107208252, "reward_std": 0.1631653904914856, "rewards/reward_len/mean": 0.60829097032547, "rewards/reward_len/std": 0.30916011333465576, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.247557003257329, "frac_reward_zero_std": 0.0, "grad_norm": 0.6505522131919861, "kl": 0.011494511738419533, "learning_rate": 9.835103019597876e-07, "loss": 0.0, "num_tokens": 62470416.0, "reward": 0.6595715284347534, "reward_std": 0.17748546600341797, "rewards/reward_len/mean": 0.6595715284347534, "rewards/reward_len/std": 0.30635571479797363, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.249185667752443, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6204918622970581, "kl": 0.011286517605185509, "learning_rate": 9.832924007205558e-07, "loss": 0.0, "num_tokens": 62880752.0, "reward": 0.6521117091178894, "reward_std": 0.14894911646842957, "rewards/reward_len/mean": 0.6521117687225342, "rewards/reward_len/std": 0.29204171895980835, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.250814332247557, "frac_reward_zero_std": 0.0, "grad_norm": 0.6199658513069153, "kl": 0.011563939973711967, "learning_rate": 9.830730936592615e-07, "loss": 0.0, "num_tokens": 63292768.0, "reward": 0.6415520906448364, "reward_std": 0.20933926105499268, "rewards/reward_len/mean": 0.6415520906448364, "rewards/reward_len/std": 0.3468521237373352, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.252442996742671, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6061365008354187, "kl": 0.011912752874195576, "learning_rate": 9.828523814138344e-07, "loss": 0.0, "num_tokens": 63703392.0, "reward": 0.7171416878700256, "reward_std": 0.12377262860536575, "rewards/reward_len/mean": 0.7171416878700256, "rewards/reward_len/std": 0.2606699764728546, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.254071661237785, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6021035313606262, "kl": 0.011456501670181751, "learning_rate": 9.82630264626292e-07, "loss": 0.0, "num_tokens": 64115088.0, "reward": 0.6230396032333374, "reward_std": 0.18078678846359253, "rewards/reward_len/mean": 0.6230396032333374, "rewards/reward_len/std": 0.3157704472541809, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.255700325732899, "frac_reward_zero_std": 0.0, "grad_norm": 0.6132100820541382, "kl": 0.011573133990168571, "learning_rate": 9.824067439427372e-07, "loss": 0.0, "num_tokens": 64526224.0, "reward": 0.6648616790771484, "reward_std": 0.16295988857746124, "rewards/reward_len/mean": 0.6648616790771484, "rewards/reward_len/std": 0.30658408999443054, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.25732899022801303, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5895265936851501, "kl": 0.011768678203225136, "learning_rate": 9.821818200133573e-07, "loss": 0.0, "num_tokens": 64936544.0, "reward": 0.6949535608291626, "reward_std": 0.1708521544933319, "rewards/reward_len/mean": 0.6949535608291626, "rewards/reward_len/std": 0.26493096351623535, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.25895765472312704, "frac_reward_zero_std": 0.0, "grad_norm": 0.6073674559593201, "kl": 0.012065891176462173, "learning_rate": 9.819554934924204e-07, "loss": 0.0, "num_tokens": 65346832.0, "reward": 0.6573740839958191, "reward_std": 0.16907837986946106, "rewards/reward_len/mean": 0.6573740839958191, "rewards/reward_len/std": 0.27072393894195557, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.26058631921824105, "frac_reward_zero_std": 0.0, "grad_norm": 0.6313510537147522, "kl": 0.011999737471342087, "learning_rate": 9.81727765038275e-07, "loss": 0.0, "num_tokens": 65757552.0, "reward": 0.6308912038803101, "reward_std": 0.18727576732635498, "rewards/reward_len/mean": 0.6308912038803101, "rewards/reward_len/std": 0.3042423725128174, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.26221498371335505, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6245948672294617, "kl": 0.011749263852834702, "learning_rate": 9.814986353133476e-07, "loss": 0.0, "num_tokens": 66169040.0, "reward": 0.6632528901100159, "reward_std": 0.17808765172958374, "rewards/reward_len/mean": 0.6632528901100159, "rewards/reward_len/std": 0.315620094537735, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.26384364820846906, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6024624705314636, "kl": 0.011982088908553123, "learning_rate": 9.812681049841406e-07, "loss": 0.0, "num_tokens": 66580304.0, "reward": 0.6024689078330994, "reward_std": 0.18025335669517517, "rewards/reward_len/mean": 0.6024689078330994, "rewards/reward_len/std": 0.3394664525985718, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.26547231270358307, "frac_reward_zero_std": 0.0, "grad_norm": 0.616497814655304, "kl": 0.011596578173339367, "learning_rate": 9.810361747212311e-07, "loss": 0.0, "num_tokens": 66989952.0, "reward": 0.5894748568534851, "reward_std": 0.17968016862869263, "rewards/reward_len/mean": 0.5894748568534851, "rewards/reward_len/std": 0.32688283920288086, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2671009771986971, "frac_reward_zero_std": 0.0, "grad_norm": 0.663997232913971, "kl": 0.012265902943909168, "learning_rate": 9.80802845199268e-07, "loss": 0.0, "num_tokens": 67400896.0, "reward": 0.7036341428756714, "reward_std": 0.15333837270736694, "rewards/reward_len/mean": 0.7036341428756714, "rewards/reward_len/std": 0.25168442726135254, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2687296416938111, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6035407781600952, "kl": 0.012150991708040237, "learning_rate": 9.805681170969699e-07, "loss": 0.0, "num_tokens": 67811376.0, "reward": 0.6226850152015686, "reward_std": 0.16924408078193665, "rewards/reward_len/mean": 0.6226849555969238, "rewards/reward_len/std": 0.3205668032169342, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2703583061889251, "frac_reward_zero_std": 0.0, "grad_norm": 0.6398211121559143, "kl": 0.012442845851182938, "learning_rate": 9.803319910971247e-07, "loss": 0.0, "num_tokens": 68221776.0, "reward": 0.6575286388397217, "reward_std": 0.1947934776544571, "rewards/reward_len/mean": 0.6575286388397217, "rewards/reward_len/std": 0.2859870493412018, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2719869706840391, "frac_reward_zero_std": 0.0, "grad_norm": 0.6559879779815674, "kl": 0.012738541699945927, "learning_rate": 9.800944678865857e-07, "loss": 0.0, "num_tokens": 68632576.0, "reward": 0.624299168586731, "reward_std": 0.19270718097686768, "rewards/reward_len/mean": 0.6242991089820862, "rewards/reward_len/std": 0.2761988043785095, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2736156351791531, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6067648530006409, "kl": 0.012320881709456444, "learning_rate": 9.798555481562706e-07, "loss": 0.0, "num_tokens": 69043824.0, "reward": 0.6217173337936401, "reward_std": 0.1518239974975586, "rewards/reward_len/mean": 0.6217173337936401, "rewards/reward_len/std": 0.303622305393219, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2752442996742671, "frac_reward_zero_std": 0.0, "grad_norm": 0.6239898800849915, "kl": 0.012742712162435055, "learning_rate": 9.796152326011601e-07, "loss": 0.0, "num_tokens": 69455600.0, "reward": 0.6339670419692993, "reward_std": 0.17541497945785522, "rewards/reward_len/mean": 0.6339670419692993, "rewards/reward_len/std": 0.2837229371070862, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2768729641693811, "frac_reward_zero_std": 0.0, "grad_norm": 0.5951758623123169, "kl": 0.013272672891616821, "learning_rate": 9.793735219202944e-07, "loss": 0.0, "num_tokens": 69866848.0, "reward": 0.7000816464424133, "reward_std": 0.1867782026529312, "rewards/reward_len/mean": 0.7000815868377686, "rewards/reward_len/std": 0.2781500220298767, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2785016286644951, "frac_reward_zero_std": 0.0, "grad_norm": 0.6368116736412048, "kl": 0.013413756154477596, "learning_rate": 9.79130416816772e-07, "loss": 0.0, "num_tokens": 70276816.0, "reward": 0.7010715007781982, "reward_std": 0.18145397305488586, "rewards/reward_len/mean": 0.7010715007781982, "rewards/reward_len/std": 0.26762473583221436, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.28013029315960913, "frac_reward_zero_std": 0.0, "grad_norm": 0.6770243644714355, "kl": 0.01287594810128212, "learning_rate": 9.788859179977478e-07, "loss": 0.0, "num_tokens": 70688816.0, "reward": 0.6305222511291504, "reward_std": 0.1679653376340866, "rewards/reward_len/mean": 0.6305222511291504, "rewards/reward_len/std": 0.3208447992801666, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.28175895765472314, "frac_reward_zero_std": 0.0, "grad_norm": 0.6429850459098816, "kl": 0.01392514817416668, "learning_rate": 9.786400261744303e-07, "loss": 0.0, "num_tokens": 71099200.0, "reward": 0.7549091577529907, "reward_std": 0.17154988646507263, "rewards/reward_len/mean": 0.7549091577529907, "rewards/reward_len/std": 0.24389731884002686, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.28338762214983715, "frac_reward_zero_std": 0.03125, "grad_norm": 0.635195791721344, "kl": 0.013571104034781456, "learning_rate": 9.783927420620807e-07, "loss": 0.0, "num_tokens": 71508608.0, "reward": 0.6613365411758423, "reward_std": 0.1841423362493515, "rewards/reward_len/mean": 0.6613365411758423, "rewards/reward_len/std": 0.2925644516944885, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.28501628664495116, "frac_reward_zero_std": 0.0, "grad_norm": 0.6616284847259521, "kl": 0.013852416537702084, "learning_rate": 9.781440663800099e-07, "loss": 0.0, "num_tokens": 71920496.0, "reward": 0.7244380712509155, "reward_std": 0.15395182371139526, "rewards/reward_len/mean": 0.7244380712509155, "rewards/reward_len/std": 0.25383904576301575, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.28664495114006516, "frac_reward_zero_std": 0.0, "grad_norm": 0.6480233669281006, "kl": 0.013356141746044159, "learning_rate": 9.778939998515764e-07, "loss": 0.0, "num_tokens": 72331520.0, "reward": 0.6194564700126648, "reward_std": 0.19531486928462982, "rewards/reward_len/mean": 0.6194564700126648, "rewards/reward_len/std": 0.29249632358551025, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.28827361563517917, "frac_reward_zero_std": 0.0, "grad_norm": 0.6187188029289246, "kl": 0.014048028737306595, "learning_rate": 9.776425432041845e-07, "loss": 0.0, "num_tokens": 72742000.0, "reward": 0.6355763673782349, "reward_std": 0.17190659046173096, "rewards/reward_len/mean": 0.6355763673782349, "rewards/reward_len/std": 0.2891819179058075, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2899022801302932, "frac_reward_zero_std": 0.0625, "grad_norm": 0.585030198097229, "kl": 0.015147175639867783, "learning_rate": 9.773896971692827e-07, "loss": 0.0, "num_tokens": 73152880.0, "reward": 0.6415553092956543, "reward_std": 0.1858704388141632, "rewards/reward_len/mean": 0.6415552496910095, "rewards/reward_len/std": 0.3436688184738159, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2915309446254072, "frac_reward_zero_std": 0.0, "grad_norm": 0.65060955286026, "kl": 0.014544211328029633, "learning_rate": 9.771354624823603e-07, "loss": 0.0, "num_tokens": 73563152.0, "reward": 0.6809031367301941, "reward_std": 0.17404703795909882, "rewards/reward_len/mean": 0.6809031367301941, "rewards/reward_len/std": 0.3191207945346832, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2931596091205212, "frac_reward_zero_std": 0.0, "grad_norm": 0.6413788199424744, "kl": 0.013781026005744934, "learning_rate": 9.768798398829465e-07, "loss": 0.0, "num_tokens": 73973920.0, "reward": 0.6470574736595154, "reward_std": 0.1739807277917862, "rewards/reward_len/mean": 0.6470574736595154, "rewards/reward_len/std": 0.30589064955711365, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2947882736156352, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6200565099716187, "kl": 0.01426330953836441, "learning_rate": 9.766228301146073e-07, "loss": 0.0, "num_tokens": 74385168.0, "reward": 0.6820328831672668, "reward_std": 0.17893603444099426, "rewards/reward_len/mean": 0.6820328831672668, "rewards/reward_len/std": 0.31948041915893555, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2964169381107492, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6034367084503174, "kl": 0.015352954156696796, "learning_rate": 9.76364433924944e-07, "loss": 0.0, "num_tokens": 74797024.0, "reward": 0.6159738302230835, "reward_std": 0.17460820078849792, "rewards/reward_len/mean": 0.6159738302230835, "rewards/reward_len/std": 0.3234553039073944, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2980456026058632, "frac_reward_zero_std": 0.0, "grad_norm": 0.6007016897201538, "kl": 0.014845458790659904, "learning_rate": 9.761046520655906e-07, "loss": 0.0, "num_tokens": 75207840.0, "reward": 0.6585937738418579, "reward_std": 0.18330806493759155, "rewards/reward_len/mean": 0.6585937738418579, "rewards/reward_len/std": 0.31571176648139954, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.2996742671009772, "frac_reward_zero_std": 0.03125, "grad_norm": 0.593712329864502, "kl": 0.01548735424876213, "learning_rate": 9.758434852922123e-07, "loss": 0.0, "num_tokens": 75619088.0, "reward": 0.6735597252845764, "reward_std": 0.17292216420173645, "rewards/reward_len/mean": 0.6735597252845764, "rewards/reward_len/std": 0.28693899512290955, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.30130293159609123, "frac_reward_zero_std": 0.0, "grad_norm": 0.6343412399291992, "kl": 0.015218332409858704, "learning_rate": 9.75580934364502e-07, "loss": 0.0, "num_tokens": 76030208.0, "reward": 0.717574417591095, "reward_std": 0.1602172553539276, "rewards/reward_len/mean": 0.7175744771957397, "rewards/reward_len/std": 0.24876491725444794, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.30293159609120524, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5904422402381897, "kl": 0.015631649643182755, "learning_rate": 9.753170000461797e-07, "loss": 0.0, "num_tokens": 76441632.0, "reward": 0.6977579593658447, "reward_std": 0.17026583850383759, "rewards/reward_len/mean": 0.6977579593658447, "rewards/reward_len/std": 0.27446815371513367, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.30456026058631924, "frac_reward_zero_std": 0.0, "grad_norm": 0.6259761452674866, "kl": 0.01640273630619049, "learning_rate": 9.750516831049888e-07, "loss": 0.0, "num_tokens": 76852064.0, "reward": 0.6606364846229553, "reward_std": 0.17960289120674133, "rewards/reward_len/mean": 0.6606364846229553, "rewards/reward_len/std": 0.2865612208843231, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.30618892508143325, "frac_reward_zero_std": 0.0, "grad_norm": 0.6383422613143921, "kl": 0.01558994222432375, "learning_rate": 9.74784984312695e-07, "loss": 0.0, "num_tokens": 77261712.0, "reward": 0.6114181280136108, "reward_std": 0.2014351785182953, "rewards/reward_len/mean": 0.6114181280136108, "rewards/reward_len/std": 0.3481508195400238, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.30781758957654726, "frac_reward_zero_std": 0.0, "grad_norm": 0.6306859850883484, "kl": 0.01501611527055502, "learning_rate": 9.745169044450833e-07, "loss": 0.0, "num_tokens": 77673456.0, "reward": 0.7041455507278442, "reward_std": 0.1571076214313507, "rewards/reward_len/mean": 0.7041456699371338, "rewards/reward_len/std": 0.25986066460609436, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.30944625407166126, "frac_reward_zero_std": 0.0, "grad_norm": 0.5981801748275757, "kl": 0.01578598842024803, "learning_rate": 9.74247444281956e-07, "loss": 0.0, "num_tokens": 78085216.0, "reward": 0.6904189586639404, "reward_std": 0.17669713497161865, "rewards/reward_len/mean": 0.6904190182685852, "rewards/reward_len/std": 0.2786377966403961, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.31107491856677527, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6294738054275513, "kl": 0.016027355566620827, "learning_rate": 9.73976604607131e-07, "loss": 0.0, "num_tokens": 78496080.0, "reward": 0.6481107473373413, "reward_std": 0.16726157069206238, "rewards/reward_len/mean": 0.6481107473373413, "rewards/reward_len/std": 0.3358181118965149, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3127035830618892, "frac_reward_zero_std": 0.0, "grad_norm": 0.6311572194099426, "kl": 0.016798939555883408, "learning_rate": 9.737043862084382e-07, "loss": 0.0, "num_tokens": 78906928.0, "reward": 0.6790983080863953, "reward_std": 0.16296333074569702, "rewards/reward_len/mean": 0.6790983080863953, "rewards/reward_len/std": 0.25305941700935364, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.31433224755700323, "frac_reward_zero_std": 0.0, "grad_norm": 0.6043026447296143, "kl": 0.016381610184907913, "learning_rate": 9.734307898777185e-07, "loss": 0.0, "num_tokens": 79318016.0, "reward": 0.7687382698059082, "reward_std": 0.14754995703697205, "rewards/reward_len/mean": 0.7687382698059082, "rewards/reward_len/std": 0.21327175199985504, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.31596091205211724, "frac_reward_zero_std": 0.0, "grad_norm": 0.6162469983100891, "kl": 0.015077002346515656, "learning_rate": 9.731558164108208e-07, "loss": 0.0, "num_tokens": 79729424.0, "reward": 0.695016622543335, "reward_std": 0.1630645990371704, "rewards/reward_len/mean": 0.6950166821479797, "rewards/reward_len/std": 0.2939361333847046, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.31758957654723124, "frac_reward_zero_std": 0.0, "grad_norm": 0.5866138339042664, "kl": 0.016148962080478668, "learning_rate": 9.728794666076003e-07, "loss": 0.0, "num_tokens": 80140064.0, "reward": 0.6964005827903748, "reward_std": 0.1842053234577179, "rewards/reward_len/mean": 0.6964005827903748, "rewards/reward_len/std": 0.2662651538848877, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.31921824104234525, "frac_reward_zero_std": 0.0, "grad_norm": 0.5951958894729614, "kl": 0.016494538635015488, "learning_rate": 9.72601741271915e-07, "loss": 0.0, "num_tokens": 80550960.0, "reward": 0.7225896120071411, "reward_std": 0.16462863981723785, "rewards/reward_len/mean": 0.7225895524024963, "rewards/reward_len/std": 0.2461133450269699, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.32084690553745926, "frac_reward_zero_std": 0.0, "grad_norm": 0.6323500275611877, "kl": 0.01595374196767807, "learning_rate": 9.723226412116245e-07, "loss": 0.0, "num_tokens": 80963152.0, "reward": 0.6692620515823364, "reward_std": 0.18334975838661194, "rewards/reward_len/mean": 0.6692620515823364, "rewards/reward_len/std": 0.29623472690582275, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.32247557003257327, "frac_reward_zero_std": 0.0, "grad_norm": 0.6301946640014648, "kl": 0.016010403633117676, "learning_rate": 9.720421672385875e-07, "loss": 0.0, "num_tokens": 81374752.0, "reward": 0.6580832600593567, "reward_std": 0.17668747901916504, "rewards/reward_len/mean": 0.6580832004547119, "rewards/reward_len/std": 0.2969336211681366, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3241042345276873, "frac_reward_zero_std": 0.0, "grad_norm": 0.6035241484642029, "kl": 0.017409296706318855, "learning_rate": 9.717603201686589e-07, "loss": 0.0, "num_tokens": 81785552.0, "reward": 0.6888733506202698, "reward_std": 0.1796298623085022, "rewards/reward_len/mean": 0.6888733506202698, "rewards/reward_len/std": 0.2697688639163971, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3257328990228013, "frac_reward_zero_std": 0.0, "grad_norm": 0.6410253047943115, "kl": 0.017523568123579025, "learning_rate": 9.714771008216875e-07, "loss": 0.0, "num_tokens": 82197216.0, "reward": 0.7292085886001587, "reward_std": 0.18127143383026123, "rewards/reward_len/mean": 0.7292085886001587, "rewards/reward_len/std": 0.2392246127128601, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3273615635179153, "frac_reward_zero_std": 0.0, "grad_norm": 0.599948525428772, "kl": 0.01768646202981472, "learning_rate": 9.711925100215147e-07, "loss": 0.0, "num_tokens": 82607872.0, "reward": 0.6972005367279053, "reward_std": 0.1562381237745285, "rewards/reward_len/mean": 0.6972005367279053, "rewards/reward_len/std": 0.26906272768974304, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3289902280130293, "frac_reward_zero_std": 0.03125, "grad_norm": 0.652214765548706, "kl": 0.017771001905202866, "learning_rate": 9.709065485959698e-07, "loss": 0.0, "num_tokens": 83018800.0, "reward": 0.6678256988525391, "reward_std": 0.15717744827270508, "rewards/reward_len/mean": 0.6678256988525391, "rewards/reward_len/std": 0.3184298574924469, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3306188925081433, "frac_reward_zero_std": 0.0, "grad_norm": 0.6053181290626526, "kl": 0.017901360988616943, "learning_rate": 9.706192173768706e-07, "loss": 0.0, "num_tokens": 83429920.0, "reward": 0.7545305490493774, "reward_std": 0.1596936583518982, "rewards/reward_len/mean": 0.7545305490493774, "rewards/reward_len/std": 0.23869705200195312, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3322475570032573, "frac_reward_zero_std": 0.0, "grad_norm": 0.5862540602684021, "kl": 0.017794501036405563, "learning_rate": 9.703305172000185e-07, "loss": 0.0, "num_tokens": 83841904.0, "reward": 0.6626321077346802, "reward_std": 0.1390525847673416, "rewards/reward_len/mean": 0.6626321077346802, "rewards/reward_len/std": 0.28679072856903076, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3338762214983713, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6002835631370544, "kl": 0.018535004928708076, "learning_rate": 9.700404489051972e-07, "loss": 0.0, "num_tokens": 84251632.0, "reward": 0.7334213256835938, "reward_std": 0.16827449202537537, "rewards/reward_len/mean": 0.7334213256835938, "rewards/reward_len/std": 0.2800980806350708, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3355048859934853, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6214887499809265, "kl": 0.017622821033000946, "learning_rate": 9.6974901333617e-07, "loss": 0.0, "num_tokens": 84662928.0, "reward": 0.640983521938324, "reward_std": 0.18006670475006104, "rewards/reward_len/mean": 0.640983521938324, "rewards/reward_len/std": 0.29937002062797546, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.33713355048859933, "frac_reward_zero_std": 0.0, "grad_norm": 0.618092954158783, "kl": 0.018075430765748024, "learning_rate": 9.694562113406774e-07, "loss": 0.0, "num_tokens": 85073376.0, "reward": 0.6993112564086914, "reward_std": 0.16046598553657532, "rewards/reward_len/mean": 0.6993112564086914, "rewards/reward_len/std": 0.3013820946216583, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.33876221498371334, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6020038723945618, "kl": 0.01837369054555893, "learning_rate": 9.69162043770435e-07, "loss": 0.0, "num_tokens": 85483184.0, "reward": 0.6728264689445496, "reward_std": 0.15876099467277527, "rewards/reward_len/mean": 0.6728264093399048, "rewards/reward_len/std": 0.27313467860221863, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.34039087947882735, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5909225940704346, "kl": 0.018888413906097412, "learning_rate": 9.688665114811297e-07, "loss": 0.0, "num_tokens": 85894800.0, "reward": 0.6816571950912476, "reward_std": 0.14230996370315552, "rewards/reward_len/mean": 0.6816571950912476, "rewards/reward_len/std": 0.2813326120376587, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.34201954397394135, "frac_reward_zero_std": 0.0, "grad_norm": 0.6153008341789246, "kl": 0.01835223287343979, "learning_rate": 9.68569615332419e-07, "loss": 0.0, "num_tokens": 86306144.0, "reward": 0.7118955254554749, "reward_std": 0.1539526730775833, "rewards/reward_len/mean": 0.7118954658508301, "rewards/reward_len/std": 0.2359369546175003, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.34364820846905536, "frac_reward_zero_std": 0.0, "grad_norm": 0.6294032335281372, "kl": 0.019603785127401352, "learning_rate": 9.682713561879274e-07, "loss": 0.0, "num_tokens": 86717552.0, "reward": 0.7469916343688965, "reward_std": 0.15009596943855286, "rewards/reward_len/mean": 0.7469916343688965, "rewards/reward_len/std": 0.2171766310930252, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.34527687296416937, "frac_reward_zero_std": 0.0, "grad_norm": 0.5833908319473267, "kl": 0.018644198775291443, "learning_rate": 9.679717349152443e-07, "loss": 0.0, "num_tokens": 87127456.0, "reward": 0.6957895755767822, "reward_std": 0.16707700490951538, "rewards/reward_len/mean": 0.6957895755767822, "rewards/reward_len/std": 0.26105034351348877, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3469055374592834, "frac_reward_zero_std": 0.0, "grad_norm": 0.6319746375083923, "kl": 0.018804941326379776, "learning_rate": 9.67670752385921e-07, "loss": 0.0, "num_tokens": 87538240.0, "reward": 0.6285851001739502, "reward_std": 0.18671727180480957, "rewards/reward_len/mean": 0.628585159778595, "rewards/reward_len/std": 0.29717645049095154, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3485342019543974, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5897444486618042, "kl": 0.018841875717043877, "learning_rate": 9.673684094754685e-07, "loss": 0.0, "num_tokens": 87948736.0, "reward": 0.6720522046089172, "reward_std": 0.1700230836868286, "rewards/reward_len/mean": 0.6720522046089172, "rewards/reward_len/std": 0.318622350692749, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3501628664495114, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6089079976081848, "kl": 0.01944504864513874, "learning_rate": 9.670647070633553e-07, "loss": 0.0, "num_tokens": 88360160.0, "reward": 0.7809866070747375, "reward_std": 0.1241120994091034, "rewards/reward_len/mean": 0.7809865474700928, "rewards/reward_len/std": 0.18395255506038666, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3517915309446254, "frac_reward_zero_std": 0.0, "grad_norm": 0.6325971484184265, "kl": 0.01945400796830654, "learning_rate": 9.667596460330044e-07, "loss": 0.0, "num_tokens": 88769968.0, "reward": 0.6621564626693726, "reward_std": 0.1687772572040558, "rewards/reward_len/mean": 0.6621564626693726, "rewards/reward_len/std": 0.30315840244293213, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3534201954397394, "frac_reward_zero_std": 0.0, "grad_norm": 0.5953924655914307, "kl": 0.01986873894929886, "learning_rate": 9.6645322727179e-07, "loss": 0.0, "num_tokens": 89181616.0, "reward": 0.7215697765350342, "reward_std": 0.16879518330097198, "rewards/reward_len/mean": 0.7215697765350342, "rewards/reward_len/std": 0.27996912598609924, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3550488599348534, "frac_reward_zero_std": 0.0, "grad_norm": 0.6000797748565674, "kl": 0.020844094455242157, "learning_rate": 9.661454516710373e-07, "loss": 0.0, "num_tokens": 89593760.0, "reward": 0.6688927412033081, "reward_std": 0.17479372024536133, "rewards/reward_len/mean": 0.6688927412033081, "rewards/reward_len/std": 0.24873340129852295, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3566775244299674, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5959054231643677, "kl": 0.02134339138865471, "learning_rate": 9.658363201260167e-07, "loss": 0.0, "num_tokens": 90006016.0, "reward": 0.7279778122901917, "reward_std": 0.1381215900182724, "rewards/reward_len/mean": 0.7279777526855469, "rewards/reward_len/std": 0.266184538602829, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3583061889250814, "frac_reward_zero_std": 0.0, "grad_norm": 0.6038768887519836, "kl": 0.02094491384923458, "learning_rate": 9.655258335359437e-07, "loss": 0.0, "num_tokens": 90415904.0, "reward": 0.6463080644607544, "reward_std": 0.18193025887012482, "rewards/reward_len/mean": 0.6463080644607544, "rewards/reward_len/std": 0.30796530842781067, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.35993485342019543, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6048327684402466, "kl": 0.0205483790487051, "learning_rate": 9.652139928039755e-07, "loss": 0.0, "num_tokens": 90827232.0, "reward": 0.7107632160186768, "reward_std": 0.1416279673576355, "rewards/reward_len/mean": 0.710763156414032, "rewards/reward_len/std": 0.28092890977859497, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.36156351791530944, "frac_reward_zero_std": 0.0, "grad_norm": 0.6205201745033264, "kl": 0.021431881934404373, "learning_rate": 9.64900798837208e-07, "loss": 0.0, "num_tokens": 91237328.0, "reward": 0.694963276386261, "reward_std": 0.1939510554075241, "rewards/reward_len/mean": 0.694963276386261, "rewards/reward_len/std": 0.2706947922706604, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.36319218241042345, "frac_reward_zero_std": 0.0, "grad_norm": 0.6635458469390869, "kl": 0.01991262659430504, "learning_rate": 9.645862525466733e-07, "loss": 0.0, "num_tokens": 91648000.0, "reward": 0.6684845685958862, "reward_std": 0.173819899559021, "rewards/reward_len/mean": 0.6684845685958862, "rewards/reward_len/std": 0.29062509536743164, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.36482084690553745, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5998081564903259, "kl": 0.02012038603425026, "learning_rate": 9.642703548473375e-07, "loss": 0.0, "num_tokens": 92059904.0, "reward": 0.7063432931900024, "reward_std": 0.18050310015678406, "rewards/reward_len/mean": 0.7063432931900024, "rewards/reward_len/std": 0.30788129568099976, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.36644951140065146, "frac_reward_zero_std": 0.0, "grad_norm": 0.6145663857460022, "kl": 0.022772468626499176, "learning_rate": 9.639531066580977e-07, "loss": 0.0, "num_tokens": 92471056.0, "reward": 0.8073631525039673, "reward_std": 0.1399720013141632, "rewards/reward_len/mean": 0.8073631525039673, "rewards/reward_len/std": 0.1826782077550888, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.36807817589576547, "frac_reward_zero_std": 0.0, "grad_norm": 0.5936559438705444, "kl": 0.02164357900619507, "learning_rate": 9.636345089017792e-07, "loss": 0.0, "num_tokens": 92881808.0, "reward": 0.7391023635864258, "reward_std": 0.14301162958145142, "rewards/reward_len/mean": 0.7391023635864258, "rewards/reward_len/std": 0.23079432547092438, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3697068403908795, "frac_reward_zero_std": 0.0, "grad_norm": 0.6075495481491089, "kl": 0.022608838975429535, "learning_rate": 9.633145625051332e-07, "loss": 0.0, "num_tokens": 93293136.0, "reward": 0.735603928565979, "reward_std": 0.1497860997915268, "rewards/reward_len/mean": 0.7356039881706238, "rewards/reward_len/std": 0.29065263271331787, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3713355048859935, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6266068816184998, "kl": 0.022978611290454865, "learning_rate": 9.62993268398834e-07, "loss": 0.0, "num_tokens": 93703488.0, "reward": 0.6720352172851562, "reward_std": 0.15604108572006226, "rewards/reward_len/mean": 0.6720352172851562, "rewards/reward_len/std": 0.30634263157844543, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3729641693811075, "frac_reward_zero_std": 0.0, "grad_norm": 0.6069141626358032, "kl": 0.0213770791888237, "learning_rate": 9.626706275174753e-07, "loss": 0.0, "num_tokens": 94114336.0, "reward": 0.6400838494300842, "reward_std": 0.1942802518606186, "rewards/reward_len/mean": 0.6400838494300842, "rewards/reward_len/std": 0.3120456635951996, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3745928338762215, "frac_reward_zero_std": 0.03125, "grad_norm": 0.617175281047821, "kl": 0.02474941685795784, "learning_rate": 9.623466407995695e-07, "loss": 0.0, "num_tokens": 94524880.0, "reward": 0.7468463182449341, "reward_std": 0.15664824843406677, "rewards/reward_len/mean": 0.7468463182449341, "rewards/reward_len/std": 0.2534652352333069, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3762214983713355, "frac_reward_zero_std": 0.0, "grad_norm": 0.5980631113052368, "kl": 0.022127531468868256, "learning_rate": 9.62021309187543e-07, "loss": 0.0, "num_tokens": 94935632.0, "reward": 0.6589415669441223, "reward_std": 0.18368130922317505, "rewards/reward_len/mean": 0.6589416265487671, "rewards/reward_len/std": 0.2911505401134491, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3778501628664495, "frac_reward_zero_std": 0.0, "grad_norm": 0.6457543969154358, "kl": 0.023502029478549957, "learning_rate": 9.61694633627735e-07, "loss": 0.0, "num_tokens": 95345920.0, "reward": 0.6950472593307495, "reward_std": 0.15559107065200806, "rewards/reward_len/mean": 0.6950472593307495, "rewards/reward_len/std": 0.2609292268753052, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3794788273615635, "frac_reward_zero_std": 0.0, "grad_norm": 0.5897790193557739, "kl": 0.022376790642738342, "learning_rate": 9.613666150703935e-07, "loss": 0.0, "num_tokens": 95756336.0, "reward": 0.6718961000442505, "reward_std": 0.171420156955719, "rewards/reward_len/mean": 0.6718961000442505, "rewards/reward_len/std": 0.2885386645793915, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3811074918566775, "frac_reward_zero_std": 0.0, "grad_norm": 0.6240487098693848, "kl": 0.023646730929613113, "learning_rate": 9.61037254469673e-07, "loss": 0.0, "num_tokens": 96167392.0, "reward": 0.7086412310600281, "reward_std": 0.16626673936843872, "rewards/reward_len/mean": 0.7086412310600281, "rewards/reward_len/std": 0.27843889594078064, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.38273615635179153, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6021559238433838, "kl": 0.024272028356790543, "learning_rate": 9.607065527836321e-07, "loss": 0.0, "num_tokens": 96578640.0, "reward": 0.6827743053436279, "reward_std": 0.18389371037483215, "rewards/reward_len/mean": 0.6827743053436279, "rewards/reward_len/std": 0.3005993962287903, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.38436482084690554, "frac_reward_zero_std": 0.0, "grad_norm": 0.616551399230957, "kl": 0.024880219250917435, "learning_rate": 9.603745109742304e-07, "loss": 0.0, "num_tokens": 96988720.0, "reward": 0.7448611259460449, "reward_std": 0.1339092254638672, "rewards/reward_len/mean": 0.7448611259460449, "rewards/reward_len/std": 0.23326420783996582, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.38599348534201955, "frac_reward_zero_std": 0.0, "grad_norm": 0.6201000809669495, "kl": 0.02420281246304512, "learning_rate": 9.600411300073258e-07, "loss": 0.0, "num_tokens": 97399904.0, "reward": 0.7135722637176514, "reward_std": 0.16904202103614807, "rewards/reward_len/mean": 0.7135722637176514, "rewards/reward_len/std": 0.2599773108959198, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.38762214983713356, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5985442399978638, "kl": 0.02609558403491974, "learning_rate": 9.597064108526713e-07, "loss": 0.0, "num_tokens": 97810352.0, "reward": 0.7110322117805481, "reward_std": 0.15377730131149292, "rewards/reward_len/mean": 0.7110322713851929, "rewards/reward_len/std": 0.3193209171295166, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.38925081433224756, "frac_reward_zero_std": 0.0, "grad_norm": 0.6056707501411438, "kl": 0.023859834298491478, "learning_rate": 9.593703544839124e-07, "loss": 0.0, "num_tokens": 98221888.0, "reward": 0.7347882986068726, "reward_std": 0.16614553332328796, "rewards/reward_len/mean": 0.7347882390022278, "rewards/reward_len/std": 0.24717359244823456, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.39087947882736157, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6397944092750549, "kl": 0.025545623153448105, "learning_rate": 9.590329618785848e-07, "loss": 0.0, "num_tokens": 98633264.0, "reward": 0.6394548416137695, "reward_std": 0.16524696350097656, "rewards/reward_len/mean": 0.6394548416137695, "rewards/reward_len/std": 0.34341222047805786, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3925081433224756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5987816452980042, "kl": 0.02685512974858284, "learning_rate": 9.58694234018111e-07, "loss": 0.0, "num_tokens": 99044208.0, "reward": 0.7145577669143677, "reward_std": 0.1632346212863922, "rewards/reward_len/mean": 0.7145577669143677, "rewards/reward_len/std": 0.3004344403743744, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3941368078175896, "frac_reward_zero_std": 0.0, "grad_norm": 0.6529569029808044, "kl": 0.025826461613178253, "learning_rate": 9.58354171887797e-07, "loss": 0.0, "num_tokens": 99456112.0, "reward": 0.7061277031898499, "reward_std": 0.1827966868877411, "rewards/reward_len/mean": 0.7061277031898499, "rewards/reward_len/std": 0.28667983412742615, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3957654723127036, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6008878946304321, "kl": 0.027605362236499786, "learning_rate": 9.580127764768304e-07, "loss": 0.0, "num_tokens": 99867552.0, "reward": 0.712070107460022, "reward_std": 0.15890689194202423, "rewards/reward_len/mean": 0.712070107460022, "rewards/reward_len/std": 0.28034543991088867, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3973941368078176, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6072767972946167, "kl": 0.02527453377842903, "learning_rate": 9.576700487782773e-07, "loss": 0.0, "num_tokens": 100277696.0, "reward": 0.6811479926109314, "reward_std": 0.15893536806106567, "rewards/reward_len/mean": 0.6811479330062866, "rewards/reward_len/std": 0.26673653721809387, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.3990228013029316, "frac_reward_zero_std": 0.0, "grad_norm": 0.6054655909538269, "kl": 0.026035722345113754, "learning_rate": 9.573259897890792e-07, "loss": 0.0, "num_tokens": 100689936.0, "reward": 0.5806212425231934, "reward_std": 0.18700231611728668, "rewards/reward_len/mean": 0.5806211829185486, "rewards/reward_len/std": 0.32639798521995544, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4006514657980456, "frac_reward_zero_std": 0.0, "grad_norm": 0.5916962027549744, "kl": 0.027598140761256218, "learning_rate": 9.569806005100498e-07, "loss": 0.0, "num_tokens": 101100576.0, "reward": 0.7446458339691162, "reward_std": 0.1582031548023224, "rewards/reward_len/mean": 0.7446458339691162, "rewards/reward_len/std": 0.2543991208076477, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4022801302931596, "frac_reward_zero_std": 0.0, "grad_norm": 0.667644202709198, "kl": 0.02730438858270645, "learning_rate": 9.566338819458724e-07, "loss": 0.0, "num_tokens": 101511584.0, "reward": 0.6710622310638428, "reward_std": 0.18419399857521057, "rewards/reward_len/mean": 0.6710622310638428, "rewards/reward_len/std": 0.3116113841533661, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.40390879478827363, "frac_reward_zero_std": 0.0, "grad_norm": 0.6121068596839905, "kl": 0.027174565941095352, "learning_rate": 9.56285835105097e-07, "loss": 0.0, "num_tokens": 101923008.0, "reward": 0.7011467218399048, "reward_std": 0.17108871042728424, "rewards/reward_len/mean": 0.7011467218399048, "rewards/reward_len/std": 0.282052606344223, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.40553745928338764, "frac_reward_zero_std": 0.0, "grad_norm": 0.599940299987793, "kl": 0.030941378325223923, "learning_rate": 9.559364610001377e-07, "loss": 0.0, "num_tokens": 102333648.0, "reward": 0.7570875883102417, "reward_std": 0.15860940515995026, "rewards/reward_len/mean": 0.7570875883102417, "rewards/reward_len/std": 0.22960078716278076, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.40716612377850164, "frac_reward_zero_std": 0.0, "grad_norm": 0.6230548620223999, "kl": 0.028232954442501068, "learning_rate": 9.555857606472691e-07, "loss": 0.0, "num_tokens": 102744176.0, "reward": 0.7461468577384949, "reward_std": 0.13459302484989166, "rewards/reward_len/mean": 0.7461468577384949, "rewards/reward_len/std": 0.2544673979282379, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.40879478827361565, "frac_reward_zero_std": 0.0, "grad_norm": 0.6194649338722229, "kl": 0.02859567478299141, "learning_rate": 9.552337350666236e-07, "loss": 0.0, "num_tokens": 103155136.0, "reward": 0.6915513277053833, "reward_std": 0.17479729652404785, "rewards/reward_len/mean": 0.6915513277053833, "rewards/reward_len/std": 0.3049146831035614, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.41042345276872966, "frac_reward_zero_std": 0.0, "grad_norm": 0.6417128443717957, "kl": 0.028531357645988464, "learning_rate": 9.54880385282189e-07, "loss": 0.0, "num_tokens": 103566016.0, "reward": 0.6896210312843323, "reward_std": 0.1582648754119873, "rewards/reward_len/mean": 0.6896210312843323, "rewards/reward_len/std": 0.2972177565097809, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.41205211726384366, "frac_reward_zero_std": 0.0, "grad_norm": 0.6245321035385132, "kl": 0.02778390794992447, "learning_rate": 9.545257123218041e-07, "loss": 0.0, "num_tokens": 103976976.0, "reward": 0.723618745803833, "reward_std": 0.16919243335723877, "rewards/reward_len/mean": 0.723618745803833, "rewards/reward_len/std": 0.2767527997493744, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.41368078175895767, "frac_reward_zero_std": 0.03125, "grad_norm": 0.593848466873169, "kl": 0.03036929853260517, "learning_rate": 9.541697172171575e-07, "loss": 0.0, "num_tokens": 104387232.0, "reward": 0.7184895873069763, "reward_std": 0.15144765377044678, "rewards/reward_len/mean": 0.7184896469116211, "rewards/reward_len/std": 0.30557960271835327, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4153094462540717, "frac_reward_zero_std": 0.0, "grad_norm": 0.6051650047302246, "kl": 0.030943922698497772, "learning_rate": 9.538124010037831e-07, "loss": 0.0, "num_tokens": 104798320.0, "reward": 0.7578812837600708, "reward_std": 0.14389920234680176, "rewards/reward_len/mean": 0.7578812837600708, "rewards/reward_len/std": 0.2261507362127304, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4169381107491857, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6267803311347961, "kl": 0.028956089168787003, "learning_rate": 9.534537647210581e-07, "loss": 0.0, "num_tokens": 105208848.0, "reward": 0.6516220569610596, "reward_std": 0.19815342128276825, "rewards/reward_len/mean": 0.6516220569610596, "rewards/reward_len/std": 0.3117983043193817, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4185667752442997, "frac_reward_zero_std": 0.0, "grad_norm": 0.6216210126876831, "kl": 0.030665544793009758, "learning_rate": 9.530938094121991e-07, "loss": 0.0, "num_tokens": 105619360.0, "reward": 0.6586247682571411, "reward_std": 0.13617107272148132, "rewards/reward_len/mean": 0.6586247682571411, "rewards/reward_len/std": 0.3217509686946869, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4201954397394137, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6273130178451538, "kl": 0.030999887734651566, "learning_rate": 9.527325361242599e-07, "loss": 0.0, "num_tokens": 106029520.0, "reward": 0.7393351793289185, "reward_std": 0.15944461524486542, "rewards/reward_len/mean": 0.7393351793289185, "rewards/reward_len/std": 0.3060671091079712, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4218241042345277, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6074904203414917, "kl": 0.03175032511353493, "learning_rate": 9.523699459081285e-07, "loss": 0.0, "num_tokens": 106440240.0, "reward": 0.6874356269836426, "reward_std": 0.15634527802467346, "rewards/reward_len/mean": 0.6874356269836426, "rewards/reward_len/std": 0.2957800626754761, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4234527687296417, "frac_reward_zero_std": 0.0, "grad_norm": 0.6011847853660583, "kl": 0.03052489645779133, "learning_rate": 9.520060398185226e-07, "loss": 0.0, "num_tokens": 106850544.0, "reward": 0.660910427570343, "reward_std": 0.19087210297584534, "rewards/reward_len/mean": 0.660910427570343, "rewards/reward_len/std": 0.3132171630859375, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4250814332247557, "frac_reward_zero_std": 0.0, "grad_norm": 0.600227415561676, "kl": 0.030827732756733894, "learning_rate": 9.516408189139883e-07, "loss": 0.0, "num_tokens": 107261184.0, "reward": 0.6910867094993591, "reward_std": 0.160253643989563, "rewards/reward_len/mean": 0.6910867094993591, "rewards/reward_len/std": 0.3008071780204773, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.42671009771986973, "frac_reward_zero_std": 0.0, "grad_norm": 0.6229017376899719, "kl": 0.03164702653884888, "learning_rate": 9.512742842568963e-07, "loss": 0.0, "num_tokens": 107672544.0, "reward": 0.7197960615158081, "reward_std": 0.15801090002059937, "rewards/reward_len/mean": 0.7197960615158081, "rewards/reward_len/std": 0.24717429280281067, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.42833876221498374, "frac_reward_zero_std": 0.0, "grad_norm": 0.6571717858314514, "kl": 0.030545808374881744, "learning_rate": 9.509064369134386e-07, "loss": 0.0, "num_tokens": 108083616.0, "reward": 0.7181774377822876, "reward_std": 0.18022418022155762, "rewards/reward_len/mean": 0.7181774377822876, "rewards/reward_len/std": 0.2860702872276306, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.42996742671009774, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6330294013023376, "kl": 0.0310403760522604, "learning_rate": 9.505372779536253e-07, "loss": 0.0, "num_tokens": 108495904.0, "reward": 0.6785547733306885, "reward_std": 0.18975311517715454, "rewards/reward_len/mean": 0.6785547733306885, "rewards/reward_len/std": 0.3010791540145874, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.43159609120521175, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5954630374908447, "kl": 0.03245677053928375, "learning_rate": 9.501668084512826e-07, "loss": 0.0, "num_tokens": 108906880.0, "reward": 0.7554031610488892, "reward_std": 0.1605810821056366, "rewards/reward_len/mean": 0.7554031610488892, "rewards/reward_len/std": 0.22278310358524323, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.43322475570032576, "frac_reward_zero_std": 0.0, "grad_norm": 0.6117095947265625, "kl": 0.03354963660240173, "learning_rate": 9.49795029484048e-07, "loss": 0.0, "num_tokens": 109317280.0, "reward": 0.7424542307853699, "reward_std": 0.14807964861392975, "rewards/reward_len/mean": 0.7424541711807251, "rewards/reward_len/std": 0.2113611400127411, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.43485342019543977, "frac_reward_zero_std": 0.0, "grad_norm": 0.6157346963882446, "kl": 0.03335026651620865, "learning_rate": 9.494219421333687e-07, "loss": 0.0, "num_tokens": 109727968.0, "reward": 0.7334002256393433, "reward_std": 0.1636338233947754, "rewards/reward_len/mean": 0.7334002256393433, "rewards/reward_len/std": 0.23987358808517456, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4364820846905538, "frac_reward_zero_std": 0.0, "grad_norm": 0.646137535572052, "kl": 0.03510802984237671, "learning_rate": 9.490475474844975e-07, "loss": 0.0, "num_tokens": 110138288.0, "reward": 0.7085660696029663, "reward_std": 0.1637209802865982, "rewards/reward_len/mean": 0.7085660696029663, "rewards/reward_len/std": 0.26678866147994995, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4381107491856677, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6161985993385315, "kl": 0.034348227083683014, "learning_rate": 9.486718466264896e-07, "loss": 0.0, "num_tokens": 110550240.0, "reward": 0.7291827201843262, "reward_std": 0.14560602605342865, "rewards/reward_len/mean": 0.7291827201843262, "rewards/reward_len/std": 0.2765580713748932, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.43973941368078173, "frac_reward_zero_std": 0.0, "grad_norm": 0.6205320954322815, "kl": 0.03725236654281616, "learning_rate": 9.482948406522002e-07, "loss": 0.0, "num_tokens": 110961520.0, "reward": 0.7628772258758545, "reward_std": 0.13970954716205597, "rewards/reward_len/mean": 0.7628772258758545, "rewards/reward_len/std": 0.21154604852199554, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.44136807817589574, "frac_reward_zero_std": 0.0, "grad_norm": 0.6159811615943909, "kl": 0.03295423090457916, "learning_rate": 9.479165306582811e-07, "loss": 0.0, "num_tokens": 111371440.0, "reward": 0.716401994228363, "reward_std": 0.16333332657814026, "rewards/reward_len/mean": 0.716401994228363, "rewards/reward_len/std": 0.2840070128440857, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.44299674267100975, "frac_reward_zero_std": 0.0, "grad_norm": 0.667841374874115, "kl": 0.03356592357158661, "learning_rate": 9.475369177451764e-07, "loss": 0.0, "num_tokens": 111781904.0, "reward": 0.6633846163749695, "reward_std": 0.14591234922409058, "rewards/reward_len/mean": 0.6633846163749695, "rewards/reward_len/std": 0.2927442789077759, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.44462540716612375, "frac_reward_zero_std": 0.0, "grad_norm": 0.601255476474762, "kl": 0.033525485545396805, "learning_rate": 9.471560030171212e-07, "loss": 0.0, "num_tokens": 112193536.0, "reward": 0.6802637577056885, "reward_std": 0.1845138669013977, "rewards/reward_len/mean": 0.6802637577056885, "rewards/reward_len/std": 0.2888450026512146, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.44625407166123776, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5906575322151184, "kl": 0.03543892502784729, "learning_rate": 9.467737875821367e-07, "loss": 0.0, "num_tokens": 112603376.0, "reward": 0.6858876943588257, "reward_std": 0.18553665280342102, "rewards/reward_len/mean": 0.6858876943588257, "rewards/reward_len/std": 0.3400854766368866, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.44788273615635177, "frac_reward_zero_std": 0.0, "grad_norm": 0.6011810302734375, "kl": 0.036137379705905914, "learning_rate": 9.463902725520278e-07, "loss": 0.0, "num_tokens": 113013712.0, "reward": 0.7764402031898499, "reward_std": 0.11846504360437393, "rewards/reward_len/mean": 0.7764402031898499, "rewards/reward_len/std": 0.22058910131454468, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4495114006514658, "frac_reward_zero_std": 0.0, "grad_norm": 0.6028671264648438, "kl": 0.037830013781785965, "learning_rate": 9.460054590423801e-07, "loss": 0.0, "num_tokens": 113424320.0, "reward": 0.7337232232093811, "reward_std": 0.14220628142356873, "rewards/reward_len/mean": 0.7337232232093811, "rewards/reward_len/std": 0.25517138838768005, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4511400651465798, "frac_reward_zero_std": 0.0, "grad_norm": 0.6017957925796509, "kl": 0.03564239665865898, "learning_rate": 9.456193481725555e-07, "loss": 0.0, "num_tokens": 113835248.0, "reward": 0.7044472098350525, "reward_std": 0.17516377568244934, "rewards/reward_len/mean": 0.7044472098350525, "rewards/reward_len/std": 0.30733922123908997, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4527687296416938, "frac_reward_zero_std": 0.0, "grad_norm": 0.631058394908905, "kl": 0.037275105714797974, "learning_rate": 9.452319410656906e-07, "loss": 0.0, "num_tokens": 114246208.0, "reward": 0.7240762710571289, "reward_std": 0.15857647359371185, "rewards/reward_len/mean": 0.7240762710571289, "rewards/reward_len/std": 0.22791825234889984, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4543973941368078, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5903571844100952, "kl": 0.03861304372549057, "learning_rate": 9.448432388486917e-07, "loss": 0.0, "num_tokens": 114657552.0, "reward": 0.7175036668777466, "reward_std": 0.15163937211036682, "rewards/reward_len/mean": 0.7175036668777466, "rewards/reward_len/std": 0.30011558532714844, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4560260586319218, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5763086080551147, "kl": 0.03746167570352554, "learning_rate": 9.444532426522334e-07, "loss": 0.0, "num_tokens": 115068672.0, "reward": 0.6812793016433716, "reward_std": 0.14588338136672974, "rewards/reward_len/mean": 0.6812793016433716, "rewards/reward_len/std": 0.3330897092819214, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4576547231270358, "frac_reward_zero_std": 0.0, "grad_norm": 0.6616631746292114, "kl": 0.03897320479154587, "learning_rate": 9.44061953610753e-07, "loss": 0.0, "num_tokens": 115479168.0, "reward": 0.7193011045455933, "reward_std": 0.16309389472007751, "rewards/reward_len/mean": 0.7193011045455933, "rewards/reward_len/std": 0.2884176969528198, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4592833876221498, "frac_reward_zero_std": 0.0, "grad_norm": 0.6436671018600464, "kl": 0.03609654679894447, "learning_rate": 9.436693728624496e-07, "loss": 0.0, "num_tokens": 115891120.0, "reward": 0.7789444923400879, "reward_std": 0.13357174396514893, "rewards/reward_len/mean": 0.7789444923400879, "rewards/reward_len/std": 0.19188252091407776, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4609120521172638, "frac_reward_zero_std": 0.0, "grad_norm": 0.596704363822937, "kl": 0.04044201225042343, "learning_rate": 9.432755015492793e-07, "loss": 0.0, "num_tokens": 116302272.0, "reward": 0.7296749353408813, "reward_std": 0.15093553066253662, "rewards/reward_len/mean": 0.7296749949455261, "rewards/reward_len/std": 0.28094935417175293, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.46254071661237783, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5959630608558655, "kl": 0.041837941855192184, "learning_rate": 9.428803408169519e-07, "loss": 0.0, "num_tokens": 116712800.0, "reward": 0.6837965846061707, "reward_std": 0.16585439443588257, "rewards/reward_len/mean": 0.6837965846061707, "rewards/reward_len/std": 0.292587548494339, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.46416938110749184, "frac_reward_zero_std": 0.0, "grad_norm": 0.6140017509460449, "kl": 0.03815377131104469, "learning_rate": 9.424838918149284e-07, "loss": 0.0, "num_tokens": 117124576.0, "reward": 0.6881316900253296, "reward_std": 0.17004528641700745, "rewards/reward_len/mean": 0.6881316900253296, "rewards/reward_len/std": 0.29442742466926575, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.46579804560260585, "frac_reward_zero_std": 0.0, "grad_norm": 0.6183676719665527, "kl": 0.04253058508038521, "learning_rate": 9.420861556964168e-07, "loss": 0.0, "num_tokens": 117534320.0, "reward": 0.7576490044593811, "reward_std": 0.14911717176437378, "rewards/reward_len/mean": 0.7576490044593811, "rewards/reward_len/std": 0.22604867815971375, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.46742671009771986, "frac_reward_zero_std": 0.0, "grad_norm": 0.5734248161315918, "kl": 0.03939627856016159, "learning_rate": 9.416871336183696e-07, "loss": 0.0, "num_tokens": 117944736.0, "reward": 0.7424211502075195, "reward_std": 0.14195090532302856, "rewards/reward_len/mean": 0.7424211502075195, "rewards/reward_len/std": 0.2558203935623169, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.46905537459283386, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6206833720207214, "kl": 0.042343951761722565, "learning_rate": 9.412868267414796e-07, "loss": 0.0, "num_tokens": 118355584.0, "reward": 0.6923401355743408, "reward_std": 0.12070412188768387, "rewards/reward_len/mean": 0.6923401355743408, "rewards/reward_len/std": 0.33021312952041626, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.47068403908794787, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6101981997489929, "kl": 0.04635278135538101, "learning_rate": 9.408852362301766e-07, "loss": 0.0, "num_tokens": 118766448.0, "reward": 0.6821362972259521, "reward_std": 0.1363212764263153, "rewards/reward_len/mean": 0.6821362972259521, "rewards/reward_len/std": 0.3472834527492523, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4723127035830619, "frac_reward_zero_std": 0.0, "grad_norm": 0.6217485070228577, "kl": 0.04243188351392746, "learning_rate": 9.40482363252625e-07, "loss": 0.0, "num_tokens": 119178560.0, "reward": 0.741677463054657, "reward_std": 0.15116190910339355, "rewards/reward_len/mean": 0.7416774034500122, "rewards/reward_len/std": 0.26587268710136414, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4739413680781759, "frac_reward_zero_std": 0.0, "grad_norm": 0.6382496356964111, "kl": 0.04259657859802246, "learning_rate": 9.400782089807194e-07, "loss": 0.0, "num_tokens": 119589456.0, "reward": 0.7041587233543396, "reward_std": 0.1855737417936325, "rewards/reward_len/mean": 0.7041587233543396, "rewards/reward_len/std": 0.2886427044868469, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4755700325732899, "frac_reward_zero_std": 0.0, "grad_norm": 0.6232771277427673, "kl": 0.04171663522720337, "learning_rate": 9.39672774590081e-07, "loss": 0.0, "num_tokens": 120001712.0, "reward": 0.7238404750823975, "reward_std": 0.13766299188137054, "rewards/reward_len/mean": 0.7238404750823975, "rewards/reward_len/std": 0.28973668813705444, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4771986970684039, "frac_reward_zero_std": 0.0, "grad_norm": 0.6082167029380798, "kl": 0.042766693979501724, "learning_rate": 9.392660612600554e-07, "loss": 0.0, "num_tokens": 120412464.0, "reward": 0.6514843702316284, "reward_std": 0.16132752597332, "rewards/reward_len/mean": 0.6514843702316284, "rewards/reward_len/std": 0.3188502788543701, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4788273615635179, "frac_reward_zero_std": 0.03125, "grad_norm": 0.600487470626831, "kl": 0.043773356825113297, "learning_rate": 9.388580701737081e-07, "loss": 0.0, "num_tokens": 120822944.0, "reward": 0.681302547454834, "reward_std": 0.1478988230228424, "rewards/reward_len/mean": 0.681302547454834, "rewards/reward_len/std": 0.28718799352645874, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4804560260586319, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5856962203979492, "kl": 0.04625491052865982, "learning_rate": 9.384488025178212e-07, "loss": 0.0, "num_tokens": 121234848.0, "reward": 0.7115455269813538, "reward_std": 0.1436472088098526, "rewards/reward_len/mean": 0.711545467376709, "rewards/reward_len/std": 0.25870779156684875, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4820846905537459, "frac_reward_zero_std": 0.0, "grad_norm": 0.5992655754089355, "kl": 0.04558287560939789, "learning_rate": 9.380382594828906e-07, "loss": 0.0, "num_tokens": 121645600.0, "reward": 0.7658069133758545, "reward_std": 0.16196781396865845, "rewards/reward_len/mean": 0.7658069133758545, "rewards/reward_len/std": 0.25649750232696533, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.4837133550488599, "frac_reward_zero_std": 0.0, "grad_norm": 0.6334710121154785, "kl": 0.04034356027841568, "learning_rate": 9.376264422631215e-07, "loss": 0.0, "num_tokens": 122056784.0, "reward": 0.7164790630340576, "reward_std": 0.1639668494462967, "rewards/reward_len/mean": 0.7164790034294128, "rewards/reward_len/std": 0.2599491477012634, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.48534201954397393, "frac_reward_zero_std": 0.0, "grad_norm": 0.6415756940841675, "kl": 0.044457245618104935, "learning_rate": 9.372133520564263e-07, "loss": 0.0, "num_tokens": 122468496.0, "reward": 0.7212328910827637, "reward_std": 0.15443643927574158, "rewards/reward_len/mean": 0.7212328910827637, "rewards/reward_len/std": 0.24069413542747498, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.48697068403908794, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5830205678939819, "kl": 0.043671444058418274, "learning_rate": 9.367989900644194e-07, "loss": 0.0, "num_tokens": 122879856.0, "reward": 0.757603645324707, "reward_std": 0.12802964448928833, "rewards/reward_len/mean": 0.757603645324707, "rewards/reward_len/std": 0.2518541216850281, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.48859934853420195, "frac_reward_zero_std": 0.0, "grad_norm": 0.6280012726783752, "kl": 0.04717230051755905, "learning_rate": 9.363833574924153e-07, "loss": 0.0, "num_tokens": 123290400.0, "reward": 0.7440518140792847, "reward_std": 0.15437236428260803, "rewards/reward_len/mean": 0.7440518140792847, "rewards/reward_len/std": 0.2883242070674896, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.49022801302931596, "frac_reward_zero_std": 0.0, "grad_norm": 0.6179625391960144, "kl": 0.046696849167346954, "learning_rate": 9.359664555494242e-07, "loss": 0.0, "num_tokens": 123701312.0, "reward": 0.7464583516120911, "reward_std": 0.13598673045635223, "rewards/reward_len/mean": 0.7464583516120911, "rewards/reward_len/std": 0.28178679943084717, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.49185667752442996, "frac_reward_zero_std": 0.0, "grad_norm": 0.5963777899742126, "kl": 0.044872723519802094, "learning_rate": 9.355482854481488e-07, "loss": 0.0, "num_tokens": 124112352.0, "reward": 0.701640784740448, "reward_std": 0.1472214311361313, "rewards/reward_len/mean": 0.7016407251358032, "rewards/reward_len/std": 0.28239479660987854, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.49348534201954397, "frac_reward_zero_std": 0.0, "grad_norm": 0.6050562262535095, "kl": 0.04824438691139221, "learning_rate": 9.351288484049803e-07, "loss": 0.0, "num_tokens": 124523296.0, "reward": 0.7735297679901123, "reward_std": 0.13210155069828033, "rewards/reward_len/mean": 0.7735297679901123, "rewards/reward_len/std": 0.2219085991382599, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.495114006514658, "frac_reward_zero_std": 0.0, "grad_norm": 0.6163731217384338, "kl": 0.045793820172548294, "learning_rate": 9.347081456399957e-07, "loss": 0.0, "num_tokens": 124934272.0, "reward": 0.6519665122032166, "reward_std": 0.185672789812088, "rewards/reward_len/mean": 0.6519665122032166, "rewards/reward_len/std": 0.318587064743042, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.496742671009772, "frac_reward_zero_std": 0.0, "grad_norm": 0.5976883172988892, "kl": 0.047251466661691666, "learning_rate": 9.342861783769533e-07, "loss": 0.0, "num_tokens": 125345824.0, "reward": 0.6994962692260742, "reward_std": 0.14789395034313202, "rewards/reward_len/mean": 0.6994962692260742, "rewards/reward_len/std": 0.27003374695777893, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.498371335504886, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6159687042236328, "kl": 0.048346664756536484, "learning_rate": 9.338629478432905e-07, "loss": 0.0, "num_tokens": 125756880.0, "reward": 0.7341846227645874, "reward_std": 0.14676788449287415, "rewards/reward_len/mean": 0.7341846823692322, "rewards/reward_len/std": 0.23427489399909973, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.5738719701766968, "kl": 0.04922633618116379, "learning_rate": 9.334384552701183e-07, "loss": 0.0, "num_tokens": 126167248.0, "reward": 0.7341623306274414, "reward_std": 0.14974409341812134, "rewards/reward_len/mean": 0.7341623306274414, "rewards/reward_len/std": 0.2612164318561554, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.501628664495114, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6371157169342041, "kl": 0.04634123668074608, "learning_rate": 9.330127018922193e-07, "loss": 0.0, "num_tokens": 126579248.0, "reward": 0.6692814826965332, "reward_std": 0.1574224978685379, "rewards/reward_len/mean": 0.6692814826965332, "rewards/reward_len/std": 0.3024684488773346, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.503257328990228, "frac_reward_zero_std": 0.0, "grad_norm": 0.6166362762451172, "kl": 0.048391930758953094, "learning_rate": 9.325856889480437e-07, "loss": 0.0, "num_tokens": 126989808.0, "reward": 0.7370926737785339, "reward_std": 0.13657952845096588, "rewards/reward_len/mean": 0.7370926737785339, "rewards/reward_len/std": 0.22946441173553467, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.504885993485342, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6385065317153931, "kl": 0.04213615506887436, "learning_rate": 9.321574176797054e-07, "loss": 0.0, "num_tokens": 127400560.0, "reward": 0.5855362415313721, "reward_std": 0.1669701486825943, "rewards/reward_len/mean": 0.5855362415313721, "rewards/reward_len/std": 0.3325650095939636, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.506514657980456, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5817687511444092, "kl": 0.05152391642332077, "learning_rate": 9.317278893329785e-07, "loss": 0.0001, "num_tokens": 127810832.0, "reward": 0.8012572526931763, "reward_std": 0.12584200501441956, "rewards/reward_len/mean": 0.8012572526931763, "rewards/reward_len/std": 0.22912323474884033, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.50814332247557, "frac_reward_zero_std": 0.0, "grad_norm": 0.6141783595085144, "kl": 0.04951164871454239, "learning_rate": 9.312971051572938e-07, "loss": 0.0, "num_tokens": 128220928.0, "reward": 0.7547109723091125, "reward_std": 0.15900370478630066, "rewards/reward_len/mean": 0.7547109723091125, "rewards/reward_len/std": 0.25045108795166016, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.509771986970684, "frac_reward_zero_std": 0.0, "grad_norm": 0.5980782508850098, "kl": 0.05102244392037392, "learning_rate": 9.308650664057351e-07, "loss": 0.0001, "num_tokens": 128631408.0, "reward": 0.7863317728042603, "reward_std": 0.14327473938465118, "rewards/reward_len/mean": 0.786331832408905, "rewards/reward_len/std": 0.22384805977344513, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.511400651465798, "frac_reward_zero_std": 0.0, "grad_norm": 0.6301132440567017, "kl": 0.049494851380586624, "learning_rate": 9.304317743350358e-07, "loss": 0.0, "num_tokens": 129042464.0, "reward": 0.7171204090118408, "reward_std": 0.1513771116733551, "rewards/reward_len/mean": 0.717120349407196, "rewards/reward_len/std": 0.2839781939983368, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5130293159609121, "frac_reward_zero_std": 0.0, "grad_norm": 0.6630383133888245, "kl": 0.055470675230026245, "learning_rate": 9.299972302055749e-07, "loss": 0.0001, "num_tokens": 129452576.0, "reward": 0.769361138343811, "reward_std": 0.13244403898715973, "rewards/reward_len/mean": 0.769361138343811, "rewards/reward_len/std": 0.21908392012119293, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5146579804560261, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5843390226364136, "kl": 0.04714905470609665, "learning_rate": 9.29561435281373e-07, "loss": 0.0, "num_tokens": 129862864.0, "reward": 0.7347447872161865, "reward_std": 0.14706143736839294, "rewards/reward_len/mean": 0.7347447872161865, "rewards/reward_len/std": 0.2941683828830719, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5162866449511401, "frac_reward_zero_std": 0.0, "grad_norm": 0.6075944304466248, "kl": 0.05303880572319031, "learning_rate": 9.291243908300897e-07, "loss": 0.0001, "num_tokens": 130272944.0, "reward": 0.7739500403404236, "reward_std": 0.14035850763320923, "rewards/reward_len/mean": 0.7739500403404236, "rewards/reward_len/std": 0.2269812822341919, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5179153094462541, "frac_reward_zero_std": 0.03125, "grad_norm": 0.599514365196228, "kl": 0.0515330508351326, "learning_rate": 9.286860981230189e-07, "loss": 0.0001, "num_tokens": 130684288.0, "reward": 0.6463202834129333, "reward_std": 0.1797352135181427, "rewards/reward_len/mean": 0.6463202834129333, "rewards/reward_len/std": 0.3214765191078186, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5195439739413681, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6075353026390076, "kl": 0.05394282191991806, "learning_rate": 9.282465584350855e-07, "loss": 0.0001, "num_tokens": 131094352.0, "reward": 0.7814072370529175, "reward_std": 0.12339561432600021, "rewards/reward_len/mean": 0.7814072370529175, "rewards/reward_len/std": 0.252814382314682, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5211726384364821, "frac_reward_zero_std": 0.0, "grad_norm": 0.6401309370994568, "kl": 0.04900750890374184, "learning_rate": 9.27805773044842e-07, "loss": 0.0, "num_tokens": 131505312.0, "reward": 0.777639627456665, "reward_std": 0.14155060052871704, "rewards/reward_len/mean": 0.7776395678520203, "rewards/reward_len/std": 0.2082737237215042, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5228013029315961, "frac_reward_zero_std": 0.0, "grad_norm": 0.5773029327392578, "kl": 0.0538334846496582, "learning_rate": 9.273637432344642e-07, "loss": 0.0001, "num_tokens": 131915648.0, "reward": 0.7950667142868042, "reward_std": 0.13063904643058777, "rewards/reward_len/mean": 0.7950667142868042, "rewards/reward_len/std": 0.18140225112438202, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5244299674267101, "frac_reward_zero_std": 0.0, "grad_norm": 0.6019942164421082, "kl": 0.05286746099591255, "learning_rate": 9.269204702897475e-07, "loss": 0.0001, "num_tokens": 132327632.0, "reward": 0.760003924369812, "reward_std": 0.15350961685180664, "rewards/reward_len/mean": 0.7600038647651672, "rewards/reward_len/std": 0.21423211693763733, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5260586319218241, "frac_reward_zero_std": 0.0, "grad_norm": 0.5999805927276611, "kl": 0.055498600006103516, "learning_rate": 9.264759555001037e-07, "loss": 0.0001, "num_tokens": 132737568.0, "reward": 0.7513249516487122, "reward_std": 0.12792855501174927, "rewards/reward_len/mean": 0.7513249516487122, "rewards/reward_len/std": 0.2735826075077057, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5276872964169381, "frac_reward_zero_std": 0.0, "grad_norm": 0.5813001394271851, "kl": 0.05921367183327675, "learning_rate": 9.260302001585569e-07, "loss": 0.0001, "num_tokens": 133148368.0, "reward": 0.8034385442733765, "reward_std": 0.11608082801103592, "rewards/reward_len/mean": 0.8034385442733765, "rewards/reward_len/std": 0.18653573095798492, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5293159609120521, "frac_reward_zero_std": 0.0, "grad_norm": 0.5778250694274902, "kl": 0.058393292129039764, "learning_rate": 9.255832055617398e-07, "loss": 0.0001, "num_tokens": 133559424.0, "reward": 0.7292473316192627, "reward_std": 0.17201241850852966, "rewards/reward_len/mean": 0.7292473316192627, "rewards/reward_len/std": 0.288289874792099, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5309446254071661, "frac_reward_zero_std": 0.0, "grad_norm": 0.5873188972473145, "kl": 0.054494258016347885, "learning_rate": 9.251349730098896e-07, "loss": 0.0001, "num_tokens": 133969600.0, "reward": 0.7342892289161682, "reward_std": 0.16455183923244476, "rewards/reward_len/mean": 0.7342891693115234, "rewards/reward_len/std": 0.25500741600990295, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5325732899022801, "frac_reward_zero_std": 0.0, "grad_norm": 0.5744317770004272, "kl": 0.05659480765461922, "learning_rate": 9.246855038068449e-07, "loss": 0.0001, "num_tokens": 134381600.0, "reward": 0.7498740553855896, "reward_std": 0.13367252051830292, "rewards/reward_len/mean": 0.7498740553855896, "rewards/reward_len/std": 0.2810506522655487, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5342019543973942, "frac_reward_zero_std": 0.0, "grad_norm": 0.598916232585907, "kl": 0.062119994312524796, "learning_rate": 9.242347992600414e-07, "loss": 0.0001, "num_tokens": 134792480.0, "reward": 0.7571966648101807, "reward_std": 0.12904345989227295, "rewards/reward_len/mean": 0.7571966648101807, "rewards/reward_len/std": 0.24664659798145294, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5358306188925082, "frac_reward_zero_std": 0.0, "grad_norm": 0.5781741738319397, "kl": 0.05584922060370445, "learning_rate": 9.237828606805081e-07, "loss": 0.0001, "num_tokens": 135203920.0, "reward": 0.7190424203872681, "reward_std": 0.1471792459487915, "rewards/reward_len/mean": 0.7190424203872681, "rewards/reward_len/std": 0.2668127119541168, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5374592833876222, "frac_reward_zero_std": 0.0, "grad_norm": 0.6438809633255005, "kl": 0.0593525655567646, "learning_rate": 9.233296893828637e-07, "loss": 0.0001, "num_tokens": 135615216.0, "reward": 0.7210737466812134, "reward_std": 0.15299248695373535, "rewards/reward_len/mean": 0.7210737466812134, "rewards/reward_len/std": 0.29419559240341187, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5390879478827362, "frac_reward_zero_std": 0.0, "grad_norm": 0.6052411794662476, "kl": 0.05921280011534691, "learning_rate": 9.228752866853129e-07, "loss": 0.0001, "num_tokens": 136027312.0, "reward": 0.7527569532394409, "reward_std": 0.15777669847011566, "rewards/reward_len/mean": 0.7527569532394409, "rewards/reward_len/std": 0.23567594587802887, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5407166123778502, "frac_reward_zero_std": 0.0, "grad_norm": 0.6036425232887268, "kl": 0.06143692135810852, "learning_rate": 9.224196539096417e-07, "loss": 0.0001, "num_tokens": 136437328.0, "reward": 0.8388419151306152, "reward_std": 0.11621682345867157, "rewards/reward_len/mean": 0.8388419151306152, "rewards/reward_len/std": 0.15794359147548676, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5423452768729642, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6145183444023132, "kl": 0.0518437922000885, "learning_rate": 9.21962792381215e-07, "loss": 0.0001, "num_tokens": 136848320.0, "reward": 0.701367974281311, "reward_std": 0.15542176365852356, "rewards/reward_len/mean": 0.701367974281311, "rewards/reward_len/std": 0.290998250246048, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5439739413680782, "frac_reward_zero_std": 0.0, "grad_norm": 0.631406843662262, "kl": 0.05742830038070679, "learning_rate": 9.215047034289715e-07, "loss": 0.0001, "num_tokens": 137258928.0, "reward": 0.7552425861358643, "reward_std": 0.14436909556388855, "rewards/reward_len/mean": 0.7552425861358643, "rewards/reward_len/std": 0.28110471367836, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5456026058631922, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5940993428230286, "kl": 0.05576331168413162, "learning_rate": 9.210453883854203e-07, "loss": 0.0001, "num_tokens": 137671120.0, "reward": 0.7091786861419678, "reward_std": 0.12677699327468872, "rewards/reward_len/mean": 0.7091786861419678, "rewards/reward_len/std": 0.3073115348815918, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5472312703583062, "frac_reward_zero_std": 0.0, "grad_norm": 0.5896310210227966, "kl": 0.05467035621404648, "learning_rate": 9.20584848586637e-07, "loss": 0.0001, "num_tokens": 138082352.0, "reward": 0.8197794556617737, "reward_std": 0.11570490151643753, "rewards/reward_len/mean": 0.8197794556617737, "rewards/reward_len/std": 0.17111869156360626, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5488599348534202, "frac_reward_zero_std": 0.0, "grad_norm": 0.5534882545471191, "kl": 0.06567639112472534, "learning_rate": 9.201230853722602e-07, "loss": 0.0001, "num_tokens": 138493232.0, "reward": 0.8416069149971008, "reward_std": 0.11659200489521027, "rewards/reward_len/mean": 0.8416069149971008, "rewards/reward_len/std": 0.16321627795696259, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5504885993485342, "frac_reward_zero_std": 0.0, "grad_norm": 0.6000487804412842, "kl": 0.057271216064691544, "learning_rate": 9.196601000854865e-07, "loss": 0.0001, "num_tokens": 138904576.0, "reward": 0.7240789532661438, "reward_std": 0.18429461121559143, "rewards/reward_len/mean": 0.7240789532661438, "rewards/reward_len/std": 0.2604205012321472, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5521172638436482, "frac_reward_zero_std": 0.0, "grad_norm": 0.5917726755142212, "kl": 0.058289315551519394, "learning_rate": 9.191958940730682e-07, "loss": 0.0001, "num_tokens": 139315600.0, "reward": 0.6540768146514893, "reward_std": 0.13456660509109497, "rewards/reward_len/mean": 0.6540768146514893, "rewards/reward_len/std": 0.2893480658531189, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5537459283387622, "frac_reward_zero_std": 0.0, "grad_norm": 0.5934238433837891, "kl": 0.05863931030035019, "learning_rate": 9.187304686853076e-07, "loss": 0.0001, "num_tokens": 139724768.0, "reward": 0.7341583967208862, "reward_std": 0.15952306985855103, "rewards/reward_len/mean": 0.7341583967208862, "rewards/reward_len/std": 0.25145331025123596, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5553745928338762, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6047616600990295, "kl": 0.06634120643138885, "learning_rate": 9.182638252760548e-07, "loss": 0.0001, "num_tokens": 140135568.0, "reward": 0.7509185075759888, "reward_std": 0.13596314191818237, "rewards/reward_len/mean": 0.750918447971344, "rewards/reward_len/std": 0.27466651797294617, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5570032573289903, "frac_reward_zero_std": 0.0, "grad_norm": 0.5982652902603149, "kl": 0.06449463218450546, "learning_rate": 9.177959652027023e-07, "loss": 0.0001, "num_tokens": 140546368.0, "reward": 0.7936953902244568, "reward_std": 0.1344088315963745, "rewards/reward_len/mean": 0.7936953902244568, "rewards/reward_len/std": 0.19045329093933105, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5586319218241043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6061780452728271, "kl": 0.0612606555223465, "learning_rate": 9.173268898261821e-07, "loss": 0.0001, "num_tokens": 140958800.0, "reward": 0.695629358291626, "reward_std": 0.14014306664466858, "rewards/reward_len/mean": 0.695629358291626, "rewards/reward_len/std": 0.2525576055049896, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5602605863192183, "frac_reward_zero_std": 0.0, "grad_norm": 0.5824960470199585, "kl": 0.0672256201505661, "learning_rate": 9.16856600510961e-07, "loss": 0.0001, "num_tokens": 141370496.0, "reward": 0.7988513708114624, "reward_std": 0.13005611300468445, "rewards/reward_len/mean": 0.7988513708114624, "rewards/reward_len/std": 0.1820654273033142, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5618892508143323, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5812193155288696, "kl": 0.06858863681554794, "learning_rate": 9.163850986250373e-07, "loss": 0.0001, "num_tokens": 141781616.0, "reward": 0.7512689828872681, "reward_std": 0.15550675988197327, "rewards/reward_len/mean": 0.7512690424919128, "rewards/reward_len/std": 0.2668238878250122, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5635179153094463, "frac_reward_zero_std": 0.0, "grad_norm": 0.5940114855766296, "kl": 0.06105711683630943, "learning_rate": 9.159123855399362e-07, "loss": 0.0001, "num_tokens": 142193472.0, "reward": 0.7237535119056702, "reward_std": 0.13291773200035095, "rewards/reward_len/mean": 0.7237535119056702, "rewards/reward_len/std": 0.2884310185909271, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5651465798045603, "frac_reward_zero_std": 0.0, "grad_norm": 0.6048972010612488, "kl": 0.06588634103536606, "learning_rate": 9.154384626307062e-07, "loss": 0.0001, "num_tokens": 142603936.0, "reward": 0.7809648513793945, "reward_std": 0.10585615038871765, "rewards/reward_len/mean": 0.7809649109840393, "rewards/reward_len/std": 0.24428369104862213, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5667752442996743, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5885604619979858, "kl": 0.06833118200302124, "learning_rate": 9.149633312759148e-07, "loss": 0.0001, "num_tokens": 143013728.0, "reward": 0.7900633215904236, "reward_std": 0.11000075936317444, "rewards/reward_len/mean": 0.7900633215904236, "rewards/reward_len/std": 0.2379908412694931, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5684039087947883, "frac_reward_zero_std": 0.0, "grad_norm": 0.591217041015625, "kl": 0.06945382058620453, "learning_rate": 9.144869928576451e-07, "loss": 0.0001, "num_tokens": 143424432.0, "reward": 0.7812597751617432, "reward_std": 0.13433900475502014, "rewards/reward_len/mean": 0.7812597751617432, "rewards/reward_len/std": 0.2225147783756256, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5700325732899023, "frac_reward_zero_std": 0.0, "grad_norm": 0.5788832306861877, "kl": 0.0691261738538742, "learning_rate": 9.140094487614909e-07, "loss": 0.0001, "num_tokens": 143835200.0, "reward": 0.7568686008453369, "reward_std": 0.13936910033226013, "rewards/reward_len/mean": 0.7568686008453369, "rewards/reward_len/std": 0.28472137451171875, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5716612377850163, "frac_reward_zero_std": 0.0, "grad_norm": 0.5870538353919983, "kl": 0.07328937202692032, "learning_rate": 9.135307003765537e-07, "loss": 0.0001, "num_tokens": 144246240.0, "reward": 0.7025856971740723, "reward_std": 0.1609753966331482, "rewards/reward_len/mean": 0.7025856971740723, "rewards/reward_len/std": 0.2659897208213806, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5732899022801303, "frac_reward_zero_std": 0.0, "grad_norm": 0.5726879835128784, "kl": 0.07003065943717957, "learning_rate": 9.130507490954374e-07, "loss": 0.0001, "num_tokens": 144658400.0, "reward": 0.7533565163612366, "reward_std": 0.141841322183609, "rewards/reward_len/mean": 0.7533565759658813, "rewards/reward_len/std": 0.24973714351654053, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5749185667752443, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5959227085113525, "kl": 0.06951335072517395, "learning_rate": 9.125695963142452e-07, "loss": 0.0001, "num_tokens": 145069584.0, "reward": 0.726377010345459, "reward_std": 0.13836193084716797, "rewards/reward_len/mean": 0.726377010345459, "rewards/reward_len/std": 0.23552857339382172, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5765472312703583, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5852770805358887, "kl": 0.07476872950792313, "learning_rate": 9.120872434325757e-07, "loss": 0.0001, "num_tokens": 145480496.0, "reward": 0.7139648795127869, "reward_std": 0.15865987539291382, "rewards/reward_len/mean": 0.7139648199081421, "rewards/reward_len/std": 0.3113323450088501, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5781758957654723, "frac_reward_zero_std": 0.0, "grad_norm": 0.6028524041175842, "kl": 0.07726432383060455, "learning_rate": 9.11603691853518e-07, "loss": 0.0001, "num_tokens": 145891120.0, "reward": 0.7482445240020752, "reward_std": 0.12222065776586533, "rewards/reward_len/mean": 0.7482445240020752, "rewards/reward_len/std": 0.24338537454605103, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5798045602605864, "frac_reward_zero_std": 0.0, "grad_norm": 0.5881015062332153, "kl": 0.06589759886264801, "learning_rate": 9.111189429836477e-07, "loss": 0.0001, "num_tokens": 146301840.0, "reward": 0.7031121253967285, "reward_std": 0.1631229817867279, "rewards/reward_len/mean": 0.7031121253967285, "rewards/reward_len/std": 0.2648833394050598, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5814332247557004, "frac_reward_zero_std": 0.0, "grad_norm": 0.5999342203140259, "kl": 0.07326776534318924, "learning_rate": 9.106329982330237e-07, "loss": 0.0001, "num_tokens": 146712960.0, "reward": 0.7763391733169556, "reward_std": 0.13977250456809998, "rewards/reward_len/mean": 0.7763391733169556, "rewards/reward_len/std": 0.20563733577728271, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5830618892508144, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5775722861289978, "kl": 0.07648254930973053, "learning_rate": 9.101458590151836e-07, "loss": 0.0001, "num_tokens": 147123568.0, "reward": 0.8038909435272217, "reward_std": 0.12806916236877441, "rewards/reward_len/mean": 0.8038909435272217, "rewards/reward_len/std": 0.22247520089149475, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5846905537459284, "frac_reward_zero_std": 0.0, "grad_norm": 0.605060875415802, "kl": 0.07777705788612366, "learning_rate": 9.096575267471388e-07, "loss": 0.0001, "num_tokens": 147534688.0, "reward": 0.7844817638397217, "reward_std": 0.13608147203922272, "rewards/reward_len/mean": 0.7844817638397217, "rewards/reward_len/std": 0.2598394751548767, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5863192182410424, "frac_reward_zero_std": 0.0, "grad_norm": 0.5837016701698303, "kl": 0.07965699583292007, "learning_rate": 9.091680028493719e-07, "loss": 0.0001, "num_tokens": 147946384.0, "reward": 0.8058313727378845, "reward_std": 0.135996475815773, "rewards/reward_len/mean": 0.8058313727378845, "rewards/reward_len/std": 0.19533826410770416, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5879478827361564, "frac_reward_zero_std": 0.0, "grad_norm": 0.6138297915458679, "kl": 0.0705721527338028, "learning_rate": 9.086772887458313e-07, "loss": 0.0001, "num_tokens": 148357840.0, "reward": 0.7948979139328003, "reward_std": 0.1347978413105011, "rewards/reward_len/mean": 0.7948979139328003, "rewards/reward_len/std": 0.22740468382835388, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5895765472312704, "frac_reward_zero_std": 0.0, "grad_norm": 0.6106235980987549, "kl": 0.08201998472213745, "learning_rate": 9.081853858639274e-07, "loss": 0.0001, "num_tokens": 148768864.0, "reward": 0.7758291959762573, "reward_std": 0.12210247665643692, "rewards/reward_len/mean": 0.7758291959762573, "rewards/reward_len/std": 0.23948344588279724, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5912052117263844, "frac_reward_zero_std": 0.0, "grad_norm": 0.5814768671989441, "kl": 0.08090874552726746, "learning_rate": 9.076922956345293e-07, "loss": 0.0001, "num_tokens": 149179040.0, "reward": 0.8113039135932922, "reward_std": 0.1418144255876541, "rewards/reward_len/mean": 0.8113039135932922, "rewards/reward_len/std": 0.21140410006046295, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5928338762214984, "frac_reward_zero_std": 0.0, "grad_norm": 0.5794157981872559, "kl": 0.07644936442375183, "learning_rate": 9.07198019491959e-07, "loss": 0.0001, "num_tokens": 149588704.0, "reward": 0.777215838432312, "reward_std": 0.12448834627866745, "rewards/reward_len/mean": 0.777215838432312, "rewards/reward_len/std": 0.24366313219070435, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5944625407166124, "frac_reward_zero_std": 0.0, "grad_norm": 0.6151131987571716, "kl": 0.07359112799167633, "learning_rate": 9.067025588739888e-07, "loss": 0.0001, "num_tokens": 149999888.0, "reward": 0.7569351196289062, "reward_std": 0.15110310912132263, "rewards/reward_len/mean": 0.7569351196289062, "rewards/reward_len/std": 0.2737268805503845, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5960912052117264, "frac_reward_zero_std": 0.0, "grad_norm": 0.6016061902046204, "kl": 0.07231198251247406, "learning_rate": 9.062059152218361e-07, "loss": 0.0001, "num_tokens": 150411104.0, "reward": 0.772554337978363, "reward_std": 0.14863264560699463, "rewards/reward_len/mean": 0.772554337978363, "rewards/reward_len/std": 0.24347221851348877, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5977198697068404, "frac_reward_zero_std": 0.0, "grad_norm": 0.6224422454833984, "kl": 0.07687036693096161, "learning_rate": 9.057080899801596e-07, "loss": 0.0001, "num_tokens": 150822752.0, "reward": 0.7890732288360596, "reward_std": 0.10840784013271332, "rewards/reward_len/mean": 0.7890732288360596, "rewards/reward_len/std": 0.24023322761058807, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.5993485342019544, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5856410264968872, "kl": 0.08404499292373657, "learning_rate": 9.052090845970553e-07, "loss": 0.0001, "num_tokens": 151234592.0, "reward": 0.8001506924629211, "reward_std": 0.11971178650856018, "rewards/reward_len/mean": 0.8001506924629211, "rewards/reward_len/std": 0.20408526062965393, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6009771986970684, "frac_reward_zero_std": 0.0, "grad_norm": 0.5993021726608276, "kl": 0.06988793611526489, "learning_rate": 9.047089005240515e-07, "loss": 0.0001, "num_tokens": 151644832.0, "reward": 0.7698330879211426, "reward_std": 0.16766119003295898, "rewards/reward_len/mean": 0.7698330879211426, "rewards/reward_len/std": 0.25371289253234863, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6026058631921825, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5762929916381836, "kl": 0.07778153568506241, "learning_rate": 9.04207539216106e-07, "loss": 0.0001, "num_tokens": 152056912.0, "reward": 0.8033612966537476, "reward_std": 0.11196377873420715, "rewards/reward_len/mean": 0.8033612966537476, "rewards/reward_len/std": 0.21505923569202423, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6042345276872965, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5650218725204468, "kl": 0.07808873802423477, "learning_rate": 9.037050021316002e-07, "loss": 0.0001, "num_tokens": 152466640.0, "reward": 0.8252801895141602, "reward_std": 0.09713468700647354, "rewards/reward_len/mean": 0.8252801299095154, "rewards/reward_len/std": 0.19334393739700317, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6058631921824105, "frac_reward_zero_std": 0.0, "grad_norm": 0.6063220500946045, "kl": 0.07497432082891464, "learning_rate": 9.032012907323361e-07, "loss": 0.0001, "num_tokens": 152878096.0, "reward": 0.7351294755935669, "reward_std": 0.16792398691177368, "rewards/reward_len/mean": 0.7351294755935669, "rewards/reward_len/std": 0.29899710416793823, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6074918566775245, "frac_reward_zero_std": 0.0, "grad_norm": 0.5955592393875122, "kl": 0.0804833471775055, "learning_rate": 9.02696406483531e-07, "loss": 0.0001, "num_tokens": 153289680.0, "reward": 0.7245588302612305, "reward_std": 0.1445314586162567, "rewards/reward_len/mean": 0.7245588302612305, "rewards/reward_len/std": 0.2890779376029968, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6091205211726385, "frac_reward_zero_std": 0.0, "grad_norm": 0.5744503140449524, "kl": 0.09044265747070312, "learning_rate": 9.021903508538149e-07, "loss": 0.0001, "num_tokens": 153701872.0, "reward": 0.7645851373672485, "reward_std": 0.15662029385566711, "rewards/reward_len/mean": 0.7645851373672485, "rewards/reward_len/std": 0.24281540513038635, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6107491856677525, "frac_reward_zero_std": 0.0, "grad_norm": 0.57271808385849, "kl": 0.0946684330701828, "learning_rate": 9.016831253152241e-07, "loss": 0.0001, "num_tokens": 154113104.0, "reward": 0.8081185817718506, "reward_std": 0.1332307606935501, "rewards/reward_len/mean": 0.8081185221672058, "rewards/reward_len/std": 0.24621041119098663, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6123778501628665, "frac_reward_zero_std": 0.0, "grad_norm": 0.6060759425163269, "kl": 0.08469435572624207, "learning_rate": 9.011747313431988e-07, "loss": 0.0001, "num_tokens": 154524976.0, "reward": 0.7836043834686279, "reward_std": 0.15428897738456726, "rewards/reward_len/mean": 0.7836043834686279, "rewards/reward_len/std": 0.23578263819217682, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6140065146579805, "frac_reward_zero_std": 0.0, "grad_norm": 0.5663647651672363, "kl": 0.0789104551076889, "learning_rate": 9.00665170416577e-07, "loss": 0.0001, "num_tokens": 154936176.0, "reward": 0.7742336988449097, "reward_std": 0.1444513201713562, "rewards/reward_len/mean": 0.7742336988449097, "rewards/reward_len/std": 0.21864862740039825, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6156351791530945, "frac_reward_zero_std": 0.0, "grad_norm": 0.6240755915641785, "kl": 0.07666830718517303, "learning_rate": 9.001544440175926e-07, "loss": 0.0001, "num_tokens": 155347632.0, "reward": 0.7270491719245911, "reward_std": 0.14533397555351257, "rewards/reward_len/mean": 0.7270491719245911, "rewards/reward_len/std": 0.24876761436462402, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6172638436482085, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5991832613945007, "kl": 0.0865398421883583, "learning_rate": 8.996425536318682e-07, "loss": 0.0001, "num_tokens": 155758736.0, "reward": 0.7833793759346008, "reward_std": 0.10920429229736328, "rewards/reward_len/mean": 0.7833793759346008, "rewards/reward_len/std": 0.26109904050827026, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6188925081433225, "frac_reward_zero_std": 0.0, "grad_norm": 0.5930525660514832, "kl": 0.08736968040466309, "learning_rate": 8.991295007484131e-07, "loss": 0.0001, "num_tokens": 156169728.0, "reward": 0.8073899745941162, "reward_std": 0.1109134629368782, "rewards/reward_len/mean": 0.8073899745941162, "rewards/reward_len/std": 0.21363712847232819, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6205211726384365, "frac_reward_zero_std": 0.0, "grad_norm": 0.5729550123214722, "kl": 0.0846407487988472, "learning_rate": 8.986152868596182e-07, "loss": 0.0001, "num_tokens": 156581536.0, "reward": 0.7708488702774048, "reward_std": 0.1531340777873993, "rewards/reward_len/mean": 0.7708488702774048, "rewards/reward_len/std": 0.24337807297706604, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6221498371335505, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5809146761894226, "kl": 0.08077554404735565, "learning_rate": 8.98099913461251e-07, "loss": 0.0001, "num_tokens": 156992032.0, "reward": 0.7507623434066772, "reward_std": 0.12833991646766663, "rewards/reward_len/mean": 0.7507623434066772, "rewards/reward_len/std": 0.26911917328834534, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6237785016286646, "frac_reward_zero_std": 0.0, "grad_norm": 0.5935854911804199, "kl": 0.09335927665233612, "learning_rate": 8.975833820524526e-07, "loss": 0.0001, "num_tokens": 157403408.0, "reward": 0.7922131419181824, "reward_std": 0.11580611020326614, "rewards/reward_len/mean": 0.7922131419181824, "rewards/reward_len/std": 0.24398928880691528, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6254071661237784, "frac_reward_zero_std": 0.0, "grad_norm": 0.5681887269020081, "kl": 0.08456458151340485, "learning_rate": 8.970656941357318e-07, "loss": 0.0001, "num_tokens": 157814560.0, "reward": 0.7576636672019958, "reward_std": 0.13860659301280975, "rewards/reward_len/mean": 0.7576636672019958, "rewards/reward_len/std": 0.24341268837451935, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6270358306188925, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5694103837013245, "kl": 0.09461534768342972, "learning_rate": 8.965468512169618e-07, "loss": 0.0001, "num_tokens": 158227056.0, "reward": 0.7939473390579224, "reward_std": 0.11189018189907074, "rewards/reward_len/mean": 0.7939473390579224, "rewards/reward_len/std": 0.20945420861244202, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6286644951140065, "frac_reward_zero_std": 0.0, "grad_norm": 0.6208769679069519, "kl": 0.09567777812480927, "learning_rate": 8.960268548053757e-07, "loss": 0.0001, "num_tokens": 158638048.0, "reward": 0.7840002179145813, "reward_std": 0.1148892492055893, "rewards/reward_len/mean": 0.7840002179145813, "rewards/reward_len/std": 0.25691163539886475, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6302931596091205, "frac_reward_zero_std": 0.0, "grad_norm": 0.6145800352096558, "kl": 0.09094016253948212, "learning_rate": 8.955057064135619e-07, "loss": 0.0001, "num_tokens": 159049760.0, "reward": 0.7835336327552795, "reward_std": 0.17226988077163696, "rewards/reward_len/mean": 0.7835336327552795, "rewards/reward_len/std": 0.25538545846939087, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6319218241042345, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5841229557991028, "kl": 0.09584644436836243, "learning_rate": 8.949834075574594e-07, "loss": 0.0001, "num_tokens": 159461024.0, "reward": 0.7771385908126831, "reward_std": 0.14888793230056763, "rewards/reward_len/mean": 0.7771385908126831, "rewards/reward_len/std": 0.25033658742904663, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6335504885993485, "frac_reward_zero_std": 0.0, "grad_norm": 0.6108843088150024, "kl": 0.09244780242443085, "learning_rate": 8.944599597563542e-07, "loss": 0.0001, "num_tokens": 159870512.0, "reward": 0.7742296457290649, "reward_std": 0.1333933174610138, "rewards/reward_len/mean": 0.7742296457290649, "rewards/reward_len/std": 0.21872836351394653, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6351791530944625, "frac_reward_zero_std": 0.0, "grad_norm": 0.5840979814529419, "kl": 0.09272357821464539, "learning_rate": 8.93935364532874e-07, "loss": 0.0001, "num_tokens": 160281504.0, "reward": 0.7553146481513977, "reward_std": 0.1384490579366684, "rewards/reward_len/mean": 0.7553146481513977, "rewards/reward_len/std": 0.25414231419563293, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6368078175895765, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5949684977531433, "kl": 0.08852748572826385, "learning_rate": 8.934096234129842e-07, "loss": 0.0001, "num_tokens": 160692672.0, "reward": 0.7771260738372803, "reward_std": 0.12060913443565369, "rewards/reward_len/mean": 0.7771260738372803, "rewards/reward_len/std": 0.2595462203025818, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6384364820846905, "frac_reward_zero_std": 0.0, "grad_norm": 0.6146441102027893, "kl": 0.08975489437580109, "learning_rate": 8.928827379259837e-07, "loss": 0.0001, "num_tokens": 161103760.0, "reward": 0.7597358226776123, "reward_std": 0.12829402089118958, "rewards/reward_len/mean": 0.7597358226776123, "rewards/reward_len/std": 0.23752695322036743, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6400651465798045, "frac_reward_zero_std": 0.0, "grad_norm": 0.5911963582038879, "kl": 0.09903927147388458, "learning_rate": 8.923547096045003e-07, "loss": 0.0001, "num_tokens": 161515696.0, "reward": 0.8048355579376221, "reward_std": 0.13012410700321198, "rewards/reward_len/mean": 0.8048355579376221, "rewards/reward_len/std": 0.19108065962791443, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6416938110749185, "frac_reward_zero_std": 0.0, "grad_norm": 0.6213447451591492, "kl": 0.08844126760959625, "learning_rate": 8.918255399844853e-07, "loss": 0.0001, "num_tokens": 161926160.0, "reward": 0.7134773135185242, "reward_std": 0.13982728123664856, "rewards/reward_len/mean": 0.7134773135185242, "rewards/reward_len/std": 0.2717779278755188, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6433224755700325, "frac_reward_zero_std": 0.0, "grad_norm": 0.5739477872848511, "kl": 0.09034094214439392, "learning_rate": 8.912952306052108e-07, "loss": 0.0001, "num_tokens": 162336624.0, "reward": 0.7774125933647156, "reward_std": 0.12078971415758133, "rewards/reward_len/mean": 0.7774125933647156, "rewards/reward_len/std": 0.2069108486175537, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6449511400651465, "frac_reward_zero_std": 0.0, "grad_norm": 0.5881010293960571, "kl": 0.10058103501796722, "learning_rate": 8.907637830092637e-07, "loss": 0.0001, "num_tokens": 162748032.0, "reward": 0.796068549156189, "reward_std": 0.1321488916873932, "rewards/reward_len/mean": 0.796068549156189, "rewards/reward_len/std": 0.19068817794322968, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6465798045602605, "frac_reward_zero_std": 0.0, "grad_norm": 0.583114504814148, "kl": 0.09957069903612137, "learning_rate": 8.902311987425422e-07, "loss": 0.0001, "num_tokens": 163160240.0, "reward": 0.7572696208953857, "reward_std": 0.11802658438682556, "rewards/reward_len/mean": 0.7572696805000305, "rewards/reward_len/std": 0.25235462188720703, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6482084690553745, "frac_reward_zero_std": 0.0, "grad_norm": 0.5808195471763611, "kl": 0.10059425979852676, "learning_rate": 8.896974793542506e-07, "loss": 0.0001, "num_tokens": 163571280.0, "reward": 0.8227297067642212, "reward_std": 0.13211947679519653, "rewards/reward_len/mean": 0.8227297067642212, "rewards/reward_len/std": 0.17803537845611572, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6498371335504886, "frac_reward_zero_std": 0.0, "grad_norm": 0.5985670685768127, "kl": 0.09830187261104584, "learning_rate": 8.891626263968951e-07, "loss": 0.0001, "num_tokens": 163982336.0, "reward": 0.8194187879562378, "reward_std": 0.12716972827911377, "rewards/reward_len/mean": 0.8194187879562378, "rewards/reward_len/std": 0.19987310469150543, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6514657980456026, "frac_reward_zero_std": 0.0, "grad_norm": 0.5833403468132019, "kl": 0.10634177923202515, "learning_rate": 8.886266414262795e-07, "loss": 0.0001, "num_tokens": 164392304.0, "reward": 0.8108314275741577, "reward_std": 0.13183222711086273, "rewards/reward_len/mean": 0.8108313679695129, "rewards/reward_len/std": 0.23649773001670837, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6530944625407166, "frac_reward_zero_std": 0.0, "grad_norm": 0.5939474701881409, "kl": 0.10624538362026215, "learning_rate": 8.880895260015004e-07, "loss": 0.0001, "num_tokens": 164803168.0, "reward": 0.8095043897628784, "reward_std": 0.11450248211622238, "rewards/reward_len/mean": 0.8095043897628784, "rewards/reward_len/std": 0.21113577485084534, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6547231270358306, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5962221026420593, "kl": 0.09379318356513977, "learning_rate": 8.875512816849425e-07, "loss": 0.0001, "num_tokens": 165214416.0, "reward": 0.7194601893424988, "reward_std": 0.14855524897575378, "rewards/reward_len/mean": 0.7194601893424988, "rewards/reward_len/std": 0.28869903087615967, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6563517915309446, "frac_reward_zero_std": 0.0, "grad_norm": 0.6215980052947998, "kl": 0.09863665699958801, "learning_rate": 8.870119100422742e-07, "loss": 0.0001, "num_tokens": 165624608.0, "reward": 0.7820011973381042, "reward_std": 0.1368878185749054, "rewards/reward_len/mean": 0.782001256942749, "rewards/reward_len/std": 0.22658748924732208, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6579804560260586, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6018466949462891, "kl": 0.09994544088840485, "learning_rate": 8.864714126424435e-07, "loss": 0.0001, "num_tokens": 166036720.0, "reward": 0.7895143032073975, "reward_std": 0.12618517875671387, "rewards/reward_len/mean": 0.7895143032073975, "rewards/reward_len/std": 0.221194788813591, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6596091205211726, "frac_reward_zero_std": 0.0, "grad_norm": 0.585871160030365, "kl": 0.08255403488874435, "learning_rate": 8.85929791057673e-07, "loss": 0.0001, "num_tokens": 166448160.0, "reward": 0.7443087100982666, "reward_std": 0.13741180300712585, "rewards/reward_len/mean": 0.7443087100982666, "rewards/reward_len/std": 0.26658740639686584, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6612377850162866, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6163228750228882, "kl": 0.11253806948661804, "learning_rate": 8.853870468634553e-07, "loss": 0.0001, "num_tokens": 166858016.0, "reward": 0.7785642743110657, "reward_std": 0.10769744217395782, "rewards/reward_len/mean": 0.7785642743110657, "rewards/reward_len/std": 0.2415749877691269, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6628664495114006, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6267464756965637, "kl": 0.09660948067903519, "learning_rate": 8.84843181638548e-07, "loss": 0.0001, "num_tokens": 167268848.0, "reward": 0.8183515071868896, "reward_std": 0.1280132234096527, "rewards/reward_len/mean": 0.8183515071868896, "rewards/reward_len/std": 0.19966894388198853, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6644951140065146, "frac_reward_zero_std": 0.0, "grad_norm": 0.5809414386749268, "kl": 0.10618536919355392, "learning_rate": 8.842981969649703e-07, "loss": 0.0001, "num_tokens": 167679760.0, "reward": 0.7694718241691589, "reward_std": 0.12130364775657654, "rewards/reward_len/mean": 0.7694717645645142, "rewards/reward_len/std": 0.23205003142356873, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6661237785016286, "frac_reward_zero_std": 0.0, "grad_norm": 0.6134147047996521, "kl": 0.10746574401855469, "learning_rate": 8.837520944279975e-07, "loss": 0.0001, "num_tokens": 168091184.0, "reward": 0.7828845977783203, "reward_std": 0.14139732718467712, "rewards/reward_len/mean": 0.7828845977783203, "rewards/reward_len/std": 0.21137230098247528, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6677524429967426, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5955663323402405, "kl": 0.104432612657547, "learning_rate": 8.832048756161565e-07, "loss": 0.0001, "num_tokens": 168502352.0, "reward": 0.7787143588066101, "reward_std": 0.13311436772346497, "rewards/reward_len/mean": 0.7787144184112549, "rewards/reward_len/std": 0.2644841969013214, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6693811074918566, "frac_reward_zero_std": 0.0, "grad_norm": 0.5783955454826355, "kl": 0.11128406971693039, "learning_rate": 8.826565421212214e-07, "loss": 0.0001, "num_tokens": 168911664.0, "reward": 0.8409875631332397, "reward_std": 0.09756261855363846, "rewards/reward_len/mean": 0.8409875631332397, "rewards/reward_len/std": 0.15475183725357056, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6710097719869706, "frac_reward_zero_std": 0.0, "grad_norm": 0.6052920818328857, "kl": 0.10144831240177155, "learning_rate": 8.821070955382081e-07, "loss": 0.0001, "num_tokens": 169321712.0, "reward": 0.785731315612793, "reward_std": 0.12273804843425751, "rewards/reward_len/mean": 0.785731315612793, "rewards/reward_len/std": 0.2420249730348587, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6726384364820847, "frac_reward_zero_std": 0.0, "grad_norm": 0.5902113914489746, "kl": 0.10519538819789886, "learning_rate": 8.815565374653717e-07, "loss": 0.0001, "num_tokens": 169733424.0, "reward": 0.7852585315704346, "reward_std": 0.13035501539707184, "rewards/reward_len/mean": 0.7852585315704346, "rewards/reward_len/std": 0.22395460307598114, "step": 413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6742671009771987, "frac_reward_zero_std": 0.0, "grad_norm": 0.5868922472000122, "kl": 0.10178539156913757, "learning_rate": 8.810048695041989e-07, "loss": 0.0001, "num_tokens": 170144560.0, "reward": 0.7585973143577576, "reward_std": 0.14254379272460938, "rewards/reward_len/mean": 0.7585973143577576, "rewards/reward_len/std": 0.2512659430503845, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6758957654723127, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5495643615722656, "kl": 0.11235459893941879, "learning_rate": 8.804520932594059e-07, "loss": 0.0001, "num_tokens": 170555168.0, "reward": 0.8173264265060425, "reward_std": 0.11670033633708954, "rewards/reward_len/mean": 0.8173264265060425, "rewards/reward_len/std": 0.23235267400741577, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6775244299674267, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5510718822479248, "kl": 0.09196776896715164, "learning_rate": 8.798982103389324e-07, "loss": 0.0001, "num_tokens": 170966128.0, "reward": 0.821022629737854, "reward_std": 0.1333070993423462, "rewards/reward_len/mean": 0.821022629737854, "rewards/reward_len/std": 0.1963782012462616, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6791530944625407, "frac_reward_zero_std": 0.0, "grad_norm": 0.5776665210723877, "kl": 0.10576188564300537, "learning_rate": 8.793432223539371e-07, "loss": 0.0001, "num_tokens": 171377344.0, "reward": 0.7402915954589844, "reward_std": 0.11432945728302002, "rewards/reward_len/mean": 0.7402915954589844, "rewards/reward_len/std": 0.27431392669677734, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6807817589576547, "frac_reward_zero_std": 0.0, "grad_norm": 0.5759279727935791, "kl": 0.1052844375371933, "learning_rate": 8.787871309187935e-07, "loss": 0.0001, "num_tokens": 171789072.0, "reward": 0.7908248901367188, "reward_std": 0.133116215467453, "rewards/reward_len/mean": 0.7908248901367188, "rewards/reward_len/std": 0.2511596381664276, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6824104234527687, "frac_reward_zero_std": 0.0, "grad_norm": 0.5624917149543762, "kl": 0.11133471876382828, "learning_rate": 8.782299376510845e-07, "loss": 0.0001, "num_tokens": 172201440.0, "reward": 0.7782732248306274, "reward_std": 0.12209971249103546, "rewards/reward_len/mean": 0.7782732248306274, "rewards/reward_len/std": 0.17959067225456238, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6840390879478827, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5497819185256958, "kl": 0.11223059892654419, "learning_rate": 8.776716441715983e-07, "loss": 0.0001, "num_tokens": 172611168.0, "reward": 0.7660539746284485, "reward_std": 0.1248738020658493, "rewards/reward_len/mean": 0.7660539746284485, "rewards/reward_len/std": 0.24719123542308807, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6856677524429967, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5861281752586365, "kl": 0.10718585550785065, "learning_rate": 8.771122521043233e-07, "loss": 0.0001, "num_tokens": 173023264.0, "reward": 0.7825385332107544, "reward_std": 0.1109902486205101, "rewards/reward_len/mean": 0.7825385332107544, "rewards/reward_len/std": 0.22612184286117554, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6872964169381107, "frac_reward_zero_std": 0.0, "grad_norm": 0.6081984043121338, "kl": 0.09906178712844849, "learning_rate": 8.765517630764438e-07, "loss": 0.0001, "num_tokens": 173434944.0, "reward": 0.7231422662734985, "reward_std": 0.1420833319425583, "rewards/reward_len/mean": 0.7231422662734985, "rewards/reward_len/std": 0.2680956721305847, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6889250814332247, "frac_reward_zero_std": 0.0, "grad_norm": 0.5906179547309875, "kl": 0.10764866322278976, "learning_rate": 8.759901787183348e-07, "loss": 0.0001, "num_tokens": 173847264.0, "reward": 0.7379202842712402, "reward_std": 0.1490502655506134, "rewards/reward_len/mean": 0.7379202842712402, "rewards/reward_len/std": 0.25693050026893616, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6905537459283387, "frac_reward_zero_std": 0.03125, "grad_norm": 0.557159423828125, "kl": 0.11107298731803894, "learning_rate": 8.754275006635572e-07, "loss": 0.0001, "num_tokens": 174257536.0, "reward": 0.7926204204559326, "reward_std": 0.1249522864818573, "rewards/reward_len/mean": 0.7926204204559326, "rewards/reward_len/std": 0.22662484645843506, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6921824104234527, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5702197551727295, "kl": 0.11641949415206909, "learning_rate": 8.748637305488535e-07, "loss": 0.0001, "num_tokens": 174667344.0, "reward": 0.7819067239761353, "reward_std": 0.11814748495817184, "rewards/reward_len/mean": 0.7819067239761353, "rewards/reward_len/std": 0.21721284091472626, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6938110749185668, "frac_reward_zero_std": 0.0, "grad_norm": 0.5960256457328796, "kl": 0.11333009600639343, "learning_rate": 8.742988700141432e-07, "loss": 0.0001, "num_tokens": 175077920.0, "reward": 0.8410940766334534, "reward_std": 0.10268469899892807, "rewards/reward_len/mean": 0.8410941362380981, "rewards/reward_len/std": 0.16574636101722717, "step": 426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6954397394136808, "frac_reward_zero_std": 0.0, "grad_norm": 0.5898431539535522, "kl": 0.12271983176469803, "learning_rate": 8.73732920702517e-07, "loss": 0.0001, "num_tokens": 175489104.0, "reward": 0.8530140519142151, "reward_std": 0.11132901906967163, "rewards/reward_len/mean": 0.8530140519142151, "rewards/reward_len/std": 0.1661042422056198, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6970684039087948, "frac_reward_zero_std": 0.0, "grad_norm": 0.5757982134819031, "kl": 0.10300230979919434, "learning_rate": 8.731658842602331e-07, "loss": 0.0001, "num_tokens": 175900784.0, "reward": 0.7357348799705505, "reward_std": 0.14194801449775696, "rewards/reward_len/mean": 0.7357349395751953, "rewards/reward_len/std": 0.28323277831077576, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.6986970684039088, "frac_reward_zero_std": 0.0, "grad_norm": 0.5865767598152161, "kl": 0.10923118889331818, "learning_rate": 8.725977623367118e-07, "loss": 0.0001, "num_tokens": 176311152.0, "reward": 0.7812142968177795, "reward_std": 0.12614960968494415, "rewards/reward_len/mean": 0.7812142372131348, "rewards/reward_len/std": 0.2301645129919052, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7003257328990228, "frac_reward_zero_std": 0.0, "grad_norm": 0.5929745435714722, "kl": 0.11444348096847534, "learning_rate": 8.720285565845312e-07, "loss": 0.0001, "num_tokens": 176723040.0, "reward": 0.7990192174911499, "reward_std": 0.12872940301895142, "rewards/reward_len/mean": 0.7990192174911499, "rewards/reward_len/std": 0.22529862821102142, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7019543973941368, "frac_reward_zero_std": 0.0, "grad_norm": 0.5779286623001099, "kl": 0.1302841603755951, "learning_rate": 8.714582686594219e-07, "loss": 0.0001, "num_tokens": 177134976.0, "reward": 0.8014757633209229, "reward_std": 0.13612660765647888, "rewards/reward_len/mean": 0.8014757037162781, "rewards/reward_len/std": 0.2212119847536087, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7035830618892508, "frac_reward_zero_std": 0.0, "grad_norm": 0.5901995897293091, "kl": 0.12685349583625793, "learning_rate": 8.708869002202619e-07, "loss": 0.0001, "num_tokens": 177546496.0, "reward": 0.830653190612793, "reward_std": 0.1046239584684372, "rewards/reward_len/mean": 0.830653190612793, "rewards/reward_len/std": 0.14980430901050568, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7052117263843648, "frac_reward_zero_std": 0.0, "grad_norm": 0.5733382105827332, "kl": 0.11945892870426178, "learning_rate": 8.703144529290731e-07, "loss": 0.0001, "num_tokens": 177957312.0, "reward": 0.8157458305358887, "reward_std": 0.12362077832221985, "rewards/reward_len/mean": 0.8157458305358887, "rewards/reward_len/std": 0.17719817161560059, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7068403908794788, "frac_reward_zero_std": 0.0, "grad_norm": 0.5709400177001953, "kl": 0.11691855639219284, "learning_rate": 8.697409284510151e-07, "loss": 0.0001, "num_tokens": 178367168.0, "reward": 0.7538958787918091, "reward_std": 0.12712571024894714, "rewards/reward_len/mean": 0.7538958787918091, "rewards/reward_len/std": 0.28419211506843567, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7084690553745928, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5932132601737976, "kl": 0.12882572412490845, "learning_rate": 8.691663284543811e-07, "loss": 0.0001, "num_tokens": 178777712.0, "reward": 0.8013760447502136, "reward_std": 0.09759823977947235, "rewards/reward_len/mean": 0.8013759851455688, "rewards/reward_len/std": 0.22031283378601074, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7100977198697068, "frac_reward_zero_std": 0.0, "grad_norm": 0.5892054438591003, "kl": 0.13903558254241943, "learning_rate": 8.685906546105924e-07, "loss": 0.0001, "num_tokens": 179188784.0, "reward": 0.8143337965011597, "reward_std": 0.10239320993423462, "rewards/reward_len/mean": 0.8143337965011597, "rewards/reward_len/std": 0.21787680685520172, "step": 436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7117263843648208, "frac_reward_zero_std": 0.0, "grad_norm": 0.6027694940567017, "kl": 0.12912043929100037, "learning_rate": 8.680139085941944e-07, "loss": 0.0001, "num_tokens": 179599872.0, "reward": 0.7448327541351318, "reward_std": 0.12132882326841354, "rewards/reward_len/mean": 0.7448327541351318, "rewards/reward_len/std": 0.25053122639656067, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7133550488599348, "frac_reward_zero_std": 0.0, "grad_norm": 0.5791998505592346, "kl": 0.13381695747375488, "learning_rate": 8.674360920828514e-07, "loss": 0.0001, "num_tokens": 180011376.0, "reward": 0.798336386680603, "reward_std": 0.12276440113782883, "rewards/reward_len/mean": 0.798336386680603, "rewards/reward_len/std": 0.2363118827342987, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7149837133550488, "frac_reward_zero_std": 0.0, "grad_norm": 0.576396107673645, "kl": 0.12258187681436539, "learning_rate": 8.668572067573407e-07, "loss": 0.0001, "num_tokens": 180423680.0, "reward": 0.8130970597267151, "reward_std": 0.1226259097456932, "rewards/reward_len/mean": 0.8130970001220703, "rewards/reward_len/std": 0.20068968832492828, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7166123778501629, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6098994612693787, "kl": 0.13630448281764984, "learning_rate": 8.662772543015498e-07, "loss": 0.0001, "num_tokens": 180834672.0, "reward": 0.8356354236602783, "reward_std": 0.10911936312913895, "rewards/reward_len/mean": 0.8356354236602783, "rewards/reward_len/std": 0.15676860511302948, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7182410423452769, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5501875281333923, "kl": 0.13201114535331726, "learning_rate": 8.656962364024695e-07, "loss": 0.0001, "num_tokens": 181246208.0, "reward": 0.7747049331665039, "reward_std": 0.12334855645895004, "rewards/reward_len/mean": 0.7747049331665039, "rewards/reward_len/std": 0.19507616758346558, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7198697068403909, "frac_reward_zero_std": 0.0, "grad_norm": 0.6097755432128906, "kl": 0.13455688953399658, "learning_rate": 8.651141547501903e-07, "loss": 0.0001, "num_tokens": 181656816.0, "reward": 0.8332780599594116, "reward_std": 0.12307107448577881, "rewards/reward_len/mean": 0.8332780599594116, "rewards/reward_len/std": 0.17460539937019348, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7214983713355049, "frac_reward_zero_std": 0.0, "grad_norm": 0.5915091633796692, "kl": 0.12223771214485168, "learning_rate": 8.645310110378962e-07, "loss": 0.0001, "num_tokens": 182068720.0, "reward": 0.7409653663635254, "reward_std": 0.13550224900245667, "rewards/reward_len/mean": 0.7409653663635254, "rewards/reward_len/std": 0.30138155817985535, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7231270358306189, "frac_reward_zero_std": 0.0, "grad_norm": 0.5735060572624207, "kl": 0.13147825002670288, "learning_rate": 8.639468069618619e-07, "loss": 0.0001, "num_tokens": 182479328.0, "reward": 0.8389967679977417, "reward_std": 0.08827562630176544, "rewards/reward_len/mean": 0.8389967679977417, "rewards/reward_len/std": 0.2029535174369812, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7247557003257329, "frac_reward_zero_std": 0.0, "grad_norm": 0.5916714072227478, "kl": 0.1305907815694809, "learning_rate": 8.633615442214451e-07, "loss": 0.0001, "num_tokens": 182890176.0, "reward": 0.8382672667503357, "reward_std": 0.129972904920578, "rewards/reward_len/mean": 0.8382672667503357, "rewards/reward_len/std": 0.24508224427700043, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7263843648208469, "frac_reward_zero_std": 0.0, "grad_norm": 0.5951201319694519, "kl": 0.1337270587682724, "learning_rate": 8.627752245190841e-07, "loss": 0.0001, "num_tokens": 183301936.0, "reward": 0.8253979682922363, "reward_std": 0.11459222435951233, "rewards/reward_len/mean": 0.8253979682922363, "rewards/reward_len/std": 0.16881637275218964, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7280130293159609, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5395971536636353, "kl": 0.14734810590744019, "learning_rate": 8.621878495602909e-07, "loss": 0.0001, "num_tokens": 183713248.0, "reward": 0.8136435151100159, "reward_std": 0.10936659574508667, "rewards/reward_len/mean": 0.8136435151100159, "rewards/reward_len/std": 0.23644913733005524, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7296416938110749, "frac_reward_zero_std": 0.0625, "grad_norm": 0.546752393245697, "kl": 0.1264379918575287, "learning_rate": 8.615994210536477e-07, "loss": 0.0001, "num_tokens": 184124432.0, "reward": 0.7822479009628296, "reward_std": 0.12406748533248901, "rewards/reward_len/mean": 0.7822479009628296, "rewards/reward_len/std": 0.23618721961975098, "step": 448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7312703583061889, "frac_reward_zero_std": 0.0, "grad_norm": 0.5875090956687927, "kl": 0.1281033605337143, "learning_rate": 8.610099407108009e-07, "loss": 0.0001, "num_tokens": 184536064.0, "reward": 0.7947559952735901, "reward_std": 0.12489216774702072, "rewards/reward_len/mean": 0.7947559952735901, "rewards/reward_len/std": 0.18472075462341309, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7328990228013029, "frac_reward_zero_std": 0.0, "grad_norm": 0.5790687799453735, "kl": 0.13274456560611725, "learning_rate": 8.60419410246457e-07, "loss": 0.0001, "num_tokens": 184947216.0, "reward": 0.7880150675773621, "reward_std": 0.11703311651945114, "rewards/reward_len/mean": 0.7880150675773621, "rewards/reward_len/std": 0.2537062466144562, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7345276872964169, "frac_reward_zero_std": 0.0, "grad_norm": 0.5971469879150391, "kl": 0.13060902059078217, "learning_rate": 8.598278313783765e-07, "loss": 0.0001, "num_tokens": 185359104.0, "reward": 0.7758266925811768, "reward_std": 0.12178370356559753, "rewards/reward_len/mean": 0.7758266925811768, "rewards/reward_len/std": 0.24848511815071106, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7361563517915309, "frac_reward_zero_std": 0.0, "grad_norm": 0.5733652114868164, "kl": 0.13622987270355225, "learning_rate": 8.5923520582737e-07, "loss": 0.0001, "num_tokens": 185769648.0, "reward": 0.8159459829330444, "reward_std": 0.11432816088199615, "rewards/reward_len/mean": 0.8159459829330444, "rewards/reward_len/std": 0.16467125713825226, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.737785016286645, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6230970025062561, "kl": 0.1268952488899231, "learning_rate": 8.586415353172926e-07, "loss": 0.0001, "num_tokens": 186179920.0, "reward": 0.8192929029464722, "reward_std": 0.1323130577802658, "rewards/reward_len/mean": 0.8192929029464722, "rewards/reward_len/std": 0.22075478732585907, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.739413680781759, "frac_reward_zero_std": 0.0, "grad_norm": 0.6524646282196045, "kl": 0.13692700862884521, "learning_rate": 8.580468215750391e-07, "loss": 0.0001, "num_tokens": 186590976.0, "reward": 0.8045700788497925, "reward_std": 0.13630765676498413, "rewards/reward_len/mean": 0.8045700788497925, "rewards/reward_len/std": 0.23439563810825348, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.741042345276873, "frac_reward_zero_std": 0.0, "grad_norm": 0.5980470180511475, "kl": 0.14135509729385376, "learning_rate": 8.574510663305387e-07, "loss": 0.0001, "num_tokens": 187002304.0, "reward": 0.7421839833259583, "reward_std": 0.18687787652015686, "rewards/reward_len/mean": 0.7421839833259583, "rewards/reward_len/std": 0.3260200023651123, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.742671009771987, "frac_reward_zero_std": 0.0, "grad_norm": 0.5719560980796814, "kl": 0.132034569978714, "learning_rate": 8.568542713167504e-07, "loss": 0.0001, "num_tokens": 187412448.0, "reward": 0.7899684906005859, "reward_std": 0.11732921749353409, "rewards/reward_len/mean": 0.7899684906005859, "rewards/reward_len/std": 0.2362440973520279, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.744299674267101, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5599376559257507, "kl": 0.14320048689842224, "learning_rate": 8.562564382696576e-07, "loss": 0.0001, "num_tokens": 187823632.0, "reward": 0.7951593399047852, "reward_std": 0.12860172986984253, "rewards/reward_len/mean": 0.7951593399047852, "rewards/reward_len/std": 0.24920953810214996, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.745928338762215, "frac_reward_zero_std": 0.0, "grad_norm": 0.5595481991767883, "kl": 0.14932911098003387, "learning_rate": 8.556575689282632e-07, "loss": 0.0001, "num_tokens": 188235696.0, "reward": 0.7754160165786743, "reward_std": 0.10718704760074615, "rewards/reward_len/mean": 0.7754160165786743, "rewards/reward_len/std": 0.28296056389808655, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.747557003257329, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6090998649597168, "kl": 0.1443972885608673, "learning_rate": 8.550576650345846e-07, "loss": 0.0001, "num_tokens": 188647920.0, "reward": 0.7364503145217896, "reward_std": 0.1316375732421875, "rewards/reward_len/mean": 0.7364503145217896, "rewards/reward_len/std": 0.2699930667877197, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.749185667752443, "frac_reward_zero_std": 0.09375, "grad_norm": 0.5848806500434875, "kl": 0.13956652581691742, "learning_rate": 8.544567283336484e-07, "loss": 0.0001, "num_tokens": 189059168.0, "reward": 0.7331588268280029, "reward_std": 0.13426969945430756, "rewards/reward_len/mean": 0.7331588864326477, "rewards/reward_len/std": 0.3083712160587311, "step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.750814332247557, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5686269402503967, "kl": 0.14727777242660522, "learning_rate": 8.538547605734853e-07, "loss": 0.0001, "num_tokens": 189470864.0, "reward": 0.7845765352249146, "reward_std": 0.10369409620761871, "rewards/reward_len/mean": 0.7845765948295593, "rewards/reward_len/std": 0.2057599276304245, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.752442996742671, "frac_reward_zero_std": 0.0, "grad_norm": 0.5619252324104309, "kl": 0.15922009944915771, "learning_rate": 8.532517635051258e-07, "loss": 0.0002, "num_tokens": 189880928.0, "reward": 0.8215411901473999, "reward_std": 0.14604783058166504, "rewards/reward_len/mean": 0.8215411305427551, "rewards/reward_len/std": 0.20852868258953094, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.754071661237785, "frac_reward_zero_std": 0.0, "grad_norm": 0.5982549786567688, "kl": 0.1438244730234146, "learning_rate": 8.52647738882594e-07, "loss": 0.0001, "num_tokens": 190291504.0, "reward": 0.8302823305130005, "reward_std": 0.1036105751991272, "rewards/reward_len/mean": 0.8302823305130005, "rewards/reward_len/std": 0.18722257018089294, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.755700325732899, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5594577193260193, "kl": 0.15446290373802185, "learning_rate": 8.520426884629032e-07, "loss": 0.0002, "num_tokens": 190703680.0, "reward": 0.8226549625396729, "reward_std": 0.11641600728034973, "rewards/reward_len/mean": 0.8226549625396729, "rewards/reward_len/std": 0.20946446061134338, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.757328990228013, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5540525317192078, "kl": 0.15530972182750702, "learning_rate": 8.514366140060503e-07, "loss": 0.0002, "num_tokens": 191114640.0, "reward": 0.840175449848175, "reward_std": 0.09858464449644089, "rewards/reward_len/mean": 0.8401753902435303, "rewards/reward_len/std": 0.18673868477344513, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.758957654723127, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5859994888305664, "kl": 0.1565239131450653, "learning_rate": 8.508295172750115e-07, "loss": 0.0002, "num_tokens": 191524000.0, "reward": 0.7783811092376709, "reward_std": 0.13818703591823578, "rewards/reward_len/mean": 0.7783811092376709, "rewards/reward_len/std": 0.2528523802757263, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.760586319218241, "frac_reward_zero_std": 0.0, "grad_norm": 0.5798895359039307, "kl": 0.15062794089317322, "learning_rate": 8.502214000357359e-07, "loss": 0.0002, "num_tokens": 191936080.0, "reward": 0.7850157022476196, "reward_std": 0.12458137422800064, "rewards/reward_len/mean": 0.7850157022476196, "rewards/reward_len/std": 0.22482797503471375, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.762214983713355, "frac_reward_zero_std": 0.0, "grad_norm": 0.5509704351425171, "kl": 0.15758123993873596, "learning_rate": 8.496122640571419e-07, "loss": 0.0002, "num_tokens": 192346464.0, "reward": 0.7913236618041992, "reward_std": 0.12255415320396423, "rewards/reward_len/mean": 0.7913236618041992, "rewards/reward_len/std": 0.2546813488006592, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7638436482084691, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5787160992622375, "kl": 0.17287278175354004, "learning_rate": 8.490021111111108e-07, "loss": 0.0002, "num_tokens": 192757648.0, "reward": 0.8166881799697876, "reward_std": 0.12223942577838898, "rewards/reward_len/mean": 0.8166881799697876, "rewards/reward_len/std": 0.19747667014598846, "step": 469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7654723127035831, "frac_reward_zero_std": 0.0, "grad_norm": 0.5725114345550537, "kl": 0.1600470244884491, "learning_rate": 8.483909429724818e-07, "loss": 0.0002, "num_tokens": 193167776.0, "reward": 0.7770369052886963, "reward_std": 0.12423050403594971, "rewards/reward_len/mean": 0.7770368456840515, "rewards/reward_len/std": 0.22924190759658813, "step": 470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7671009771986971, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5526883006095886, "kl": 0.1462673544883728, "learning_rate": 8.477787614190479e-07, "loss": 0.0001, "num_tokens": 193578624.0, "reward": 0.8561433553695679, "reward_std": 0.10392145812511444, "rewards/reward_len/mean": 0.8561433553695679, "rewards/reward_len/std": 0.1491996943950653, "step": 471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7687296416938111, "frac_reward_zero_std": 0.0, "grad_norm": 0.5516891479492188, "kl": 0.15460586547851562, "learning_rate": 8.471655682315494e-07, "loss": 0.0002, "num_tokens": 193990160.0, "reward": 0.7838057279586792, "reward_std": 0.10953861474990845, "rewards/reward_len/mean": 0.7838057279586792, "rewards/reward_len/std": 0.22799327969551086, "step": 472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7703583061889251, "frac_reward_zero_std": 0.0, "grad_norm": 0.6277132034301758, "kl": 0.15746784210205078, "learning_rate": 8.465513651936696e-07, "loss": 0.0002, "num_tokens": 194401424.0, "reward": 0.7535176873207092, "reward_std": 0.12785764038562775, "rewards/reward_len/mean": 0.7535176873207092, "rewards/reward_len/std": 0.23820671439170837, "step": 473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7719869706840391, "frac_reward_zero_std": 0.0, "grad_norm": 0.5718030333518982, "kl": 0.13082030415534973, "learning_rate": 8.459361540920288e-07, "loss": 0.0001, "num_tokens": 194811856.0, "reward": 0.7733500599861145, "reward_std": 0.1735813021659851, "rewards/reward_len/mean": 0.7733500599861145, "rewards/reward_len/std": 0.2606576681137085, "step": 474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7736156351791531, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5571656227111816, "kl": 0.16955536603927612, "learning_rate": 8.453199367161803e-07, "loss": 0.0002, "num_tokens": 195222816.0, "reward": 0.8059613108634949, "reward_std": 0.11321564018726349, "rewards/reward_len/mean": 0.8059613704681396, "rewards/reward_len/std": 0.2349226474761963, "step": 475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7752442996742671, "frac_reward_zero_std": 0.0, "grad_norm": 0.5650033950805664, "kl": 0.14363032579421997, "learning_rate": 8.44702714858604e-07, "loss": 0.0001, "num_tokens": 195633440.0, "reward": 0.7653983235359192, "reward_std": 0.12186998873949051, "rewards/reward_len/mean": 0.7653983235359192, "rewards/reward_len/std": 0.24370300769805908, "step": 476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7768729641693811, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5526018142700195, "kl": 0.15929408371448517, "learning_rate": 8.440844903147017e-07, "loss": 0.0002, "num_tokens": 196043824.0, "reward": 0.8417167663574219, "reward_std": 0.10610156506299973, "rewards/reward_len/mean": 0.8417167663574219, "rewards/reward_len/std": 0.18754689395427704, "step": 477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7785016286644951, "frac_reward_zero_std": 0.0, "grad_norm": 0.5756915211677551, "kl": 0.1677446961402893, "learning_rate": 8.434652648827924e-07, "loss": 0.0002, "num_tokens": 196454704.0, "reward": 0.7863709330558777, "reward_std": 0.13557380437850952, "rewards/reward_len/mean": 0.7863709330558777, "rewards/reward_len/std": 0.2168845385313034, "step": 478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7801302931596091, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5712050199508667, "kl": 0.15916121006011963, "learning_rate": 8.428450403641056e-07, "loss": 0.0002, "num_tokens": 196866288.0, "reward": 0.7629751563072205, "reward_std": 0.11996892839670181, "rewards/reward_len/mean": 0.7629752159118652, "rewards/reward_len/std": 0.24230428040027618, "step": 479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7817589576547231, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5585920214653015, "kl": 0.18826574087142944, "learning_rate": 8.422238185627778e-07, "loss": 0.0002, "num_tokens": 197276464.0, "reward": 0.8396924138069153, "reward_std": 0.12119720876216888, "rewards/reward_len/mean": 0.8396924138069153, "rewards/reward_len/std": 0.21000023186206818, "step": 480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7833876221498371, "frac_reward_zero_std": 0.0, "grad_norm": 0.5845147967338562, "kl": 0.16829133033752441, "learning_rate": 8.41601601285846e-07, "loss": 0.0002, "num_tokens": 197687552.0, "reward": 0.7804012298583984, "reward_std": 0.1327204704284668, "rewards/reward_len/mean": 0.7804012298583984, "rewards/reward_len/std": 0.23383314907550812, "step": 481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7850162866449512, "frac_reward_zero_std": 0.0, "grad_norm": 0.5911273956298828, "kl": 0.1565122753381729, "learning_rate": 8.40978390343243e-07, "loss": 0.0002, "num_tokens": 198098416.0, "reward": 0.7936370372772217, "reward_std": 0.13018430769443512, "rewards/reward_len/mean": 0.7936370372772217, "rewards/reward_len/std": 0.23527055978775024, "step": 482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7866449511400652, "frac_reward_zero_std": 0.0, "grad_norm": 0.5549505949020386, "kl": 0.1538291722536087, "learning_rate": 8.403541875477922e-07, "loss": 0.0002, "num_tokens": 198509440.0, "reward": 0.8170169591903687, "reward_std": 0.11064761877059937, "rewards/reward_len/mean": 0.8170169591903687, "rewards/reward_len/std": 0.18760864436626434, "step": 483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7882736156351792, "frac_reward_zero_std": 0.0, "grad_norm": 0.5482110381126404, "kl": 0.18028521537780762, "learning_rate": 8.39728994715202e-07, "loss": 0.0002, "num_tokens": 198920256.0, "reward": 0.7810341119766235, "reward_std": 0.14160411059856415, "rewards/reward_len/mean": 0.7810341715812683, "rewards/reward_len/std": 0.25613388419151306, "step": 484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7899022801302932, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5934019088745117, "kl": 0.16072002053260803, "learning_rate": 8.391028136640603e-07, "loss": 0.0002, "num_tokens": 199331968.0, "reward": 0.7785548567771912, "reward_std": 0.12226171791553497, "rewards/reward_len/mean": 0.7785549163818359, "rewards/reward_len/std": 0.2489868551492691, "step": 485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7915309446254072, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5655373930931091, "kl": 0.18316742777824402, "learning_rate": 8.384756462158301e-07, "loss": 0.0002, "num_tokens": 199742800.0, "reward": 0.8092637658119202, "reward_std": 0.11551668494939804, "rewards/reward_len/mean": 0.8092637658119202, "rewards/reward_len/std": 0.20244869589805603, "step": 486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7931596091205212, "frac_reward_zero_std": 0.0, "grad_norm": 0.5961812138557434, "kl": 0.17301158607006073, "learning_rate": 8.378474941948437e-07, "loss": 0.0002, "num_tokens": 200153536.0, "reward": 0.8363718390464783, "reward_std": 0.11982633173465729, "rewards/reward_len/mean": 0.8363718390464783, "rewards/reward_len/std": 0.16390956938266754, "step": 487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7947882736156352, "frac_reward_zero_std": 0.0, "grad_norm": 0.58428955078125, "kl": 0.16865625977516174, "learning_rate": 8.372183594282969e-07, "loss": 0.0002, "num_tokens": 200564816.0, "reward": 0.8343746662139893, "reward_std": 0.09625723958015442, "rewards/reward_len/mean": 0.8343746662139893, "rewards/reward_len/std": 0.1533888280391693, "step": 488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7964169381107492, "frac_reward_zero_std": 0.0, "grad_norm": 0.5565808415412903, "kl": 0.1446206420660019, "learning_rate": 8.365882437462444e-07, "loss": 0.0001, "num_tokens": 200976576.0, "reward": 0.8058735132217407, "reward_std": 0.13266970217227936, "rewards/reward_len/mean": 0.8058735132217407, "rewards/reward_len/std": 0.19025342166423798, "step": 489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7980456026058632, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5311558842658997, "kl": 0.17097198963165283, "learning_rate": 8.359571489815944e-07, "loss": 0.0002, "num_tokens": 201388080.0, "reward": 0.823564887046814, "reward_std": 0.11252007633447647, "rewards/reward_len/mean": 0.823564887046814, "rewards/reward_len/std": 0.1931627094745636, "step": 490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.7996742671009772, "frac_reward_zero_std": 0.0, "grad_norm": 0.5602233409881592, "kl": 0.18204644322395325, "learning_rate": 8.35325076970103e-07, "loss": 0.0002, "num_tokens": 201799504.0, "reward": 0.829447865486145, "reward_std": 0.11486338078975677, "rewards/reward_len/mean": 0.829447865486145, "rewards/reward_len/std": 0.17914651334285736, "step": 491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8013029315960912, "frac_reward_zero_std": 0.0, "grad_norm": 0.5632943511009216, "kl": 0.16917476058006287, "learning_rate": 8.346920295503686e-07, "loss": 0.0002, "num_tokens": 202210656.0, "reward": 0.8137599229812622, "reward_std": 0.10652235150337219, "rewards/reward_len/mean": 0.8137599229812622, "rewards/reward_len/std": 0.205753356218338, "step": 492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8029315960912052, "frac_reward_zero_std": 0.0, "grad_norm": 0.6586654186248779, "kl": 0.1646963208913803, "learning_rate": 8.340580085638274e-07, "loss": 0.0002, "num_tokens": 202621360.0, "reward": 0.7825111150741577, "reward_std": 0.15102612972259521, "rewards/reward_len/mean": 0.7825111746788025, "rewards/reward_len/std": 0.28390392661094666, "step": 493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8045602605863192, "frac_reward_zero_std": 0.0, "grad_norm": 0.5823910236358643, "kl": 0.17689891159534454, "learning_rate": 8.334230158547474e-07, "loss": 0.0002, "num_tokens": 203032112.0, "reward": 0.8364421129226685, "reward_std": 0.1135559007525444, "rewards/reward_len/mean": 0.8364421129226685, "rewards/reward_len/std": 0.18459609150886536, "step": 494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8061889250814332, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6518563032150269, "kl": 0.16305121779441833, "learning_rate": 8.32787053270223e-07, "loss": 0.0002, "num_tokens": 203442560.0, "reward": 0.7903091907501221, "reward_std": 0.13649149239063263, "rewards/reward_len/mean": 0.7903091907501221, "rewards/reward_len/std": 0.20747913420200348, "step": 495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8078175895765473, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5546198487281799, "kl": 0.170706644654274, "learning_rate": 8.321501226601701e-07, "loss": 0.0002, "num_tokens": 203853856.0, "reward": 0.8407742977142334, "reward_std": 0.10212992131710052, "rewards/reward_len/mean": 0.8407742977142334, "rewards/reward_len/std": 0.16690902411937714, "step": 496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8094462540716613, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6034122109413147, "kl": 0.17934556305408478, "learning_rate": 8.315122258773204e-07, "loss": 0.0002, "num_tokens": 204264352.0, "reward": 0.8576254844665527, "reward_std": 0.0906946212053299, "rewards/reward_len/mean": 0.8576254844665527, "rewards/reward_len/std": 0.1353364884853363, "step": 497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8110749185667753, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5759266018867493, "kl": 0.16913840174674988, "learning_rate": 8.308733647772157e-07, "loss": 0.0002, "num_tokens": 204675184.0, "reward": 0.7953755855560303, "reward_std": 0.11789268255233765, "rewards/reward_len/mean": 0.7953755855560303, "rewards/reward_len/std": 0.25778037309646606, "step": 498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8127035830618893, "frac_reward_zero_std": 0.0, "grad_norm": 0.5783718824386597, "kl": 0.1711007058620453, "learning_rate": 8.302335412182033e-07, "loss": 0.0002, "num_tokens": 205085536.0, "reward": 0.8119986653327942, "reward_std": 0.08623704314231873, "rewards/reward_len/mean": 0.8119986653327942, "rewards/reward_len/std": 0.21778586506843567, "step": 499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8143322475570033, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5736299157142639, "kl": 0.17814481258392334, "learning_rate": 8.295927570614297e-07, "loss": 0.0002, "num_tokens": 205495680.0, "reward": 0.7777491807937622, "reward_std": 0.12676382064819336, "rewards/reward_len/mean": 0.7777491807937622, "rewards/reward_len/std": 0.24927373230457306, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8159609120521173, "frac_reward_zero_std": 0.0, "grad_norm": 0.5665991306304932, "kl": 0.17597228288650513, "learning_rate": 8.289510141708362e-07, "loss": 0.0002, "num_tokens": 205907216.0, "reward": 0.8201221227645874, "reward_std": 0.11074990034103394, "rewards/reward_len/mean": 0.8201221227645874, "rewards/reward_len/std": 0.16996964812278748, "step": 501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8175895765472313, "frac_reward_zero_std": 0.0, "grad_norm": 0.7003419399261475, "kl": 0.16043689846992493, "learning_rate": 8.283083144131521e-07, "loss": 0.0002, "num_tokens": 206318320.0, "reward": 0.7682294845581055, "reward_std": 0.14918407797813416, "rewards/reward_len/mean": 0.7682294249534607, "rewards/reward_len/std": 0.26132458448410034, "step": 502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8192182410423453, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5445291996002197, "kl": 0.17058449983596802, "learning_rate": 8.276646596578911e-07, "loss": 0.0002, "num_tokens": 206729072.0, "reward": 0.7933437824249268, "reward_std": 0.12533068656921387, "rewards/reward_len/mean": 0.7933437824249268, "rewards/reward_len/std": 0.2025441825389862, "step": 503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8208469055374593, "frac_reward_zero_std": 0.0, "grad_norm": 0.5906601548194885, "kl": 0.1766405701637268, "learning_rate": 8.27020051777344e-07, "loss": 0.0002, "num_tokens": 207140336.0, "reward": 0.8031889200210571, "reward_std": 0.103009432554245, "rewards/reward_len/mean": 0.8031889200210571, "rewards/reward_len/std": 0.21884943544864655, "step": 504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8224755700325733, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5701841711997986, "kl": 0.19577057659626007, "learning_rate": 8.263744926465743e-07, "loss": 0.0002, "num_tokens": 207551200.0, "reward": 0.6989920139312744, "reward_std": 0.10644520819187164, "rewards/reward_len/mean": 0.6989920139312744, "rewards/reward_len/std": 0.3138832747936249, "step": 505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8241042345276873, "frac_reward_zero_std": 0.0, "grad_norm": 0.6286419034004211, "kl": 0.17838096618652344, "learning_rate": 8.257279841434128e-07, "loss": 0.0002, "num_tokens": 207962224.0, "reward": 0.7576998472213745, "reward_std": 0.13084536790847778, "rewards/reward_len/mean": 0.7576998472213745, "rewards/reward_len/std": 0.2800511121749878, "step": 506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8257328990228013, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5618635416030884, "kl": 0.17235668003559113, "learning_rate": 8.250805281484517e-07, "loss": 0.0002, "num_tokens": 208373120.0, "reward": 0.7394469380378723, "reward_std": 0.1380496323108673, "rewards/reward_len/mean": 0.7394469976425171, "rewards/reward_len/std": 0.2766292691230774, "step": 507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8273615635179153, "frac_reward_zero_std": 0.0, "grad_norm": 0.5179818868637085, "kl": 0.19992780685424805, "learning_rate": 8.24432126545039e-07, "loss": 0.0002, "num_tokens": 208783968.0, "reward": 0.7931818962097168, "reward_std": 0.10573914647102356, "rewards/reward_len/mean": 0.793181836605072, "rewards/reward_len/std": 0.18478284776210785, "step": 508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8289902280130294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5702431201934814, "kl": 0.16734257340431213, "learning_rate": 8.23782781219274e-07, "loss": 0.0002, "num_tokens": 209194800.0, "reward": 0.8225494623184204, "reward_std": 0.11498615145683289, "rewards/reward_len/mean": 0.8225494623184204, "rewards/reward_len/std": 0.18341223895549774, "step": 509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8306188925081434, "frac_reward_zero_std": 0.03125, "grad_norm": 0.576018214225769, "kl": 0.16481716930866241, "learning_rate": 8.231324940600007e-07, "loss": 0.0002, "num_tokens": 209605888.0, "reward": 0.8122807741165161, "reward_std": 0.11140325665473938, "rewards/reward_len/mean": 0.8122807741165161, "rewards/reward_len/std": 0.2070787101984024, "step": 510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8322475570032574, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5980122685432434, "kl": 0.1764886975288391, "learning_rate": 8.224812669588026e-07, "loss": 0.0002, "num_tokens": 210017728.0, "reward": 0.7501088976860046, "reward_std": 0.1211383044719696, "rewards/reward_len/mean": 0.7501088976860046, "rewards/reward_len/std": 0.28208550810813904, "step": 511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8338762214983714, "frac_reward_zero_std": 0.0, "grad_norm": 0.6519628167152405, "kl": 0.204448401927948, "learning_rate": 8.218291018099978e-07, "loss": 0.0002, "num_tokens": 210427824.0, "reward": 0.7726846933364868, "reward_std": 0.126771479845047, "rewards/reward_len/mean": 0.7726846933364868, "rewards/reward_len/std": 0.24338746070861816, "step": 512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8355048859934854, "frac_reward_zero_std": 0.0, "grad_norm": 0.5967844724655151, "kl": 0.1990518718957901, "learning_rate": 8.211760005106327e-07, "loss": 0.0002, "num_tokens": 210838256.0, "reward": 0.808558464050293, "reward_std": 0.12622278928756714, "rewards/reward_len/mean": 0.808558464050293, "rewards/reward_len/std": 0.23035851120948792, "step": 513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8371335504885994, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5958476066589355, "kl": 0.20830781757831573, "learning_rate": 8.20521964960477e-07, "loss": 0.0002, "num_tokens": 211250624.0, "reward": 0.7726049423217773, "reward_std": 0.11904563009738922, "rewards/reward_len/mean": 0.7726049423217773, "rewards/reward_len/std": 0.25383150577545166, "step": 514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8387622149837134, "frac_reward_zero_std": 0.0, "grad_norm": 0.5591326355934143, "kl": 0.20323355495929718, "learning_rate": 8.198669970620176e-07, "loss": 0.0002, "num_tokens": 211661232.0, "reward": 0.8002609014511108, "reward_std": 0.13032686710357666, "rewards/reward_len/mean": 0.8002609014511108, "rewards/reward_len/std": 0.20568270981311798, "step": 515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8403908794788274, "frac_reward_zero_std": 0.0, "grad_norm": 0.5937811732292175, "kl": 0.19927993416786194, "learning_rate": 8.192110987204541e-07, "loss": 0.0002, "num_tokens": 212072336.0, "reward": 0.8262019753456116, "reward_std": 0.10930497944355011, "rewards/reward_len/mean": 0.8262020349502563, "rewards/reward_len/std": 0.15890195965766907, "step": 516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8420195439739414, "frac_reward_zero_std": 0.0, "grad_norm": 0.6359495520591736, "kl": 0.20484387874603271, "learning_rate": 8.185542718436922e-07, "loss": 0.0002, "num_tokens": 212483712.0, "reward": 0.8098372220993042, "reward_std": 0.1264970898628235, "rewards/reward_len/mean": 0.8098372220993042, "rewards/reward_len/std": 0.22233250737190247, "step": 517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8436482084690554, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5653404593467712, "kl": 0.2222657948732376, "learning_rate": 8.178965183423385e-07, "loss": 0.0002, "num_tokens": 212895280.0, "reward": 0.8009644746780396, "reward_std": 0.12843981385231018, "rewards/reward_len/mean": 0.8009644746780396, "rewards/reward_len/std": 0.23441261053085327, "step": 518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8452768729641694, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5550560355186462, "kl": 0.2311854213476181, "learning_rate": 8.172378401296952e-07, "loss": 0.0002, "num_tokens": 213306208.0, "reward": 0.8569848537445068, "reward_std": 0.10135232657194138, "rewards/reward_len/mean": 0.8569848537445068, "rewards/reward_len/std": 0.17825044691562653, "step": 519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8469055374592834, "frac_reward_zero_std": 0.0, "grad_norm": 0.5969561338424683, "kl": 0.2227363884449005, "learning_rate": 8.165782391217543e-07, "loss": 0.0002, "num_tokens": 213717168.0, "reward": 0.8232470750808716, "reward_std": 0.11080748587846756, "rewards/reward_len/mean": 0.8232470750808716, "rewards/reward_len/std": 0.17999213933944702, "step": 520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8485342019543974, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5459310412406921, "kl": 0.2240394651889801, "learning_rate": 8.159177172371921e-07, "loss": 0.0002, "num_tokens": 214127472.0, "reward": 0.8124122619628906, "reward_std": 0.1039663776755333, "rewards/reward_len/mean": 0.8124121427536011, "rewards/reward_len/std": 0.2304028868675232, "step": 521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8501628664495114, "frac_reward_zero_std": 0.0, "grad_norm": 0.5552517175674438, "kl": 0.20736998319625854, "learning_rate": 8.152562763973634e-07, "loss": 0.0002, "num_tokens": 214538816.0, "reward": 0.7806540727615356, "reward_std": 0.110677570104599, "rewards/reward_len/mean": 0.7806540727615356, "rewards/reward_len/std": 0.24112695455551147, "step": 522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8517915309446255, "frac_reward_zero_std": 0.0, "grad_norm": 0.5535444617271423, "kl": 0.1988353580236435, "learning_rate": 8.145939185262963e-07, "loss": 0.0002, "num_tokens": 214950272.0, "reward": 0.8265327215194702, "reward_std": 0.12541432678699493, "rewards/reward_len/mean": 0.8265327215194702, "rewards/reward_len/std": 0.230035662651062, "step": 523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8534201954397395, "frac_reward_zero_std": 0.0, "grad_norm": 0.6231321096420288, "kl": 0.2030281126499176, "learning_rate": 8.139306455506862e-07, "loss": 0.0002, "num_tokens": 215361936.0, "reward": 0.8227354288101196, "reward_std": 0.12941136956214905, "rewards/reward_len/mean": 0.8227354288101196, "rewards/reward_len/std": 0.24100038409233093, "step": 524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8550488599348535, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6084173321723938, "kl": 0.21848607063293457, "learning_rate": 8.132664593998908e-07, "loss": 0.0002, "num_tokens": 215773584.0, "reward": 0.8472744226455688, "reward_std": 0.09557215869426727, "rewards/reward_len/mean": 0.8472744226455688, "rewards/reward_len/std": 0.1558576226234436, "step": 525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8566775244299675, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5858998894691467, "kl": 0.22989630699157715, "learning_rate": 8.126013620059235e-07, "loss": 0.0002, "num_tokens": 216184144.0, "reward": 0.7918598651885986, "reward_std": 0.11895906180143356, "rewards/reward_len/mean": 0.7918598651885986, "rewards/reward_len/std": 0.2263217568397522, "step": 526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8583061889250815, "frac_reward_zero_std": 0.0, "grad_norm": 0.5546427965164185, "kl": 0.24563206732273102, "learning_rate": 8.119353553034492e-07, "loss": 0.0002, "num_tokens": 216594672.0, "reward": 0.8324130773544312, "reward_std": 0.09443847835063934, "rewards/reward_len/mean": 0.8324130773544312, "rewards/reward_len/std": 0.14958862960338593, "step": 527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8599348534201955, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5994703769683838, "kl": 0.20814752578735352, "learning_rate": 8.112684412297767e-07, "loss": 0.0002, "num_tokens": 217006144.0, "reward": 0.7844259142875671, "reward_std": 0.11800087243318558, "rewards/reward_len/mean": 0.7844259142875671, "rewards/reward_len/std": 0.24926894903182983, "step": 528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8615635179153095, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6208709478378296, "kl": 0.21055467426776886, "learning_rate": 8.106006217248551e-07, "loss": 0.0002, "num_tokens": 217416960.0, "reward": 0.7672970294952393, "reward_std": 0.11811448633670807, "rewards/reward_len/mean": 0.7672970294952393, "rewards/reward_len/std": 0.2783321440219879, "step": 529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8631921824104235, "frac_reward_zero_std": 0.0, "grad_norm": 0.5731842517852783, "kl": 0.2540382146835327, "learning_rate": 8.099318987312669e-07, "loss": 0.0003, "num_tokens": 217828864.0, "reward": 0.8180111646652222, "reward_std": 0.10769002884626389, "rewards/reward_len/mean": 0.8180111646652222, "rewards/reward_len/std": 0.17162969708442688, "step": 530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8648208469055375, "frac_reward_zero_std": 0.0, "grad_norm": 0.5568376779556274, "kl": 0.21354849636554718, "learning_rate": 8.092622741942227e-07, "loss": 0.0002, "num_tokens": 218240784.0, "reward": 0.7851330041885376, "reward_std": 0.1345355212688446, "rewards/reward_len/mean": 0.7851329445838928, "rewards/reward_len/std": 0.25593096017837524, "step": 531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8664495114006515, "frac_reward_zero_std": 0.0, "grad_norm": 0.5922732353210449, "kl": 0.2319929152727127, "learning_rate": 8.08591750061556e-07, "loss": 0.0002, "num_tokens": 218651968.0, "reward": 0.8277503848075867, "reward_std": 0.1193704903125763, "rewards/reward_len/mean": 0.8277503848075867, "rewards/reward_len/std": 0.18959197402000427, "step": 532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8680781758957655, "frac_reward_zero_std": 0.0, "grad_norm": 0.5595585703849792, "kl": 0.2004418969154358, "learning_rate": 8.079203282837164e-07, "loss": 0.0002, "num_tokens": 219062576.0, "reward": 0.796823263168335, "reward_std": 0.13372869789600372, "rewards/reward_len/mean": 0.796823263168335, "rewards/reward_len/std": 0.218483105301857, "step": 533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8697068403908795, "frac_reward_zero_std": 0.0, "grad_norm": 0.5785707831382751, "kl": 0.1948273777961731, "learning_rate": 8.072480108137648e-07, "loss": 0.0002, "num_tokens": 219472512.0, "reward": 0.7782167792320251, "reward_std": 0.14345575869083405, "rewards/reward_len/mean": 0.7782168388366699, "rewards/reward_len/std": 0.2546707093715668, "step": 534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8713355048859935, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6082727313041687, "kl": 0.18578749895095825, "learning_rate": 8.065747996073679e-07, "loss": 0.0002, "num_tokens": 219881840.0, "reward": 0.7984079122543335, "reward_std": 0.135448157787323, "rewards/reward_len/mean": 0.7984079718589783, "rewards/reward_len/std": 0.2503807246685028, "step": 535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8729641693811075, "frac_reward_zero_std": 0.0, "grad_norm": 0.6113690137863159, "kl": 0.24310722947120667, "learning_rate": 8.059006966227918e-07, "loss": 0.0002, "num_tokens": 220294928.0, "reward": 0.8211491107940674, "reward_std": 0.11638212949037552, "rewards/reward_len/mean": 0.8211489915847778, "rewards/reward_len/std": 0.20360049605369568, "step": 536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8745928338762216, "frac_reward_zero_std": 0.03125, "grad_norm": 0.565271258354187, "kl": 0.22464409470558167, "learning_rate": 8.052257038208967e-07, "loss": 0.0002, "num_tokens": 220704512.0, "reward": 0.8198528289794922, "reward_std": 0.1038401797413826, "rewards/reward_len/mean": 0.8198528289794922, "rewards/reward_len/std": 0.1817854791879654, "step": 537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8762214983713354, "frac_reward_zero_std": 0.03125, "grad_norm": 0.674662709236145, "kl": 0.23605573177337646, "learning_rate": 8.045498231651313e-07, "loss": 0.0002, "num_tokens": 221116320.0, "reward": 0.8799976110458374, "reward_std": 0.10311263799667358, "rewards/reward_len/mean": 0.8799976110458374, "rewards/reward_len/std": 0.15159542858600616, "step": 538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8778501628664495, "frac_reward_zero_std": 0.0, "grad_norm": 0.5567765235900879, "kl": 0.2074781060218811, "learning_rate": 8.038730566215266e-07, "loss": 0.0002, "num_tokens": 221528544.0, "reward": 0.8332087993621826, "reward_std": 0.10657960921525955, "rewards/reward_len/mean": 0.8332087993621826, "rewards/reward_len/std": 0.16370432078838348, "step": 539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8794788273615635, "frac_reward_zero_std": 0.0, "grad_norm": 0.6347450613975525, "kl": 0.2525138258934021, "learning_rate": 8.031954061586908e-07, "loss": 0.0003, "num_tokens": 221940384.0, "reward": 0.8505464792251587, "reward_std": 0.08386710286140442, "rewards/reward_len/mean": 0.8505464792251587, "rewards/reward_len/std": 0.12247441709041595, "step": 540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8811074918566775, "frac_reward_zero_std": 0.0, "grad_norm": 0.5701805949211121, "kl": 0.2548903524875641, "learning_rate": 8.025168737478033e-07, "loss": 0.0003, "num_tokens": 222351664.0, "reward": 0.8428469300270081, "reward_std": 0.08745795488357544, "rewards/reward_len/mean": 0.8428469896316528, "rewards/reward_len/std": 0.15242761373519897, "step": 541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8827361563517915, "frac_reward_zero_std": 0.0, "grad_norm": 0.574216902256012, "kl": 0.2634994685649872, "learning_rate": 8.018374613626087e-07, "loss": 0.0003, "num_tokens": 222762176.0, "reward": 0.8252073526382446, "reward_std": 0.11821819841861725, "rewards/reward_len/mean": 0.8252073526382446, "rewards/reward_len/std": 0.19583462178707123, "step": 542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8843648208469055, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5950250625610352, "kl": 0.23944808542728424, "learning_rate": 8.011571709794114e-07, "loss": 0.0002, "num_tokens": 223172128.0, "reward": 0.8501109480857849, "reward_std": 0.08798745274543762, "rewards/reward_len/mean": 0.8501109480857849, "rewards/reward_len/std": 0.14654821157455444, "step": 543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8859934853420195, "frac_reward_zero_std": 0.0, "grad_norm": 0.5856684446334839, "kl": 0.2641671895980835, "learning_rate": 8.0047600457707e-07, "loss": 0.0003, "num_tokens": 223583120.0, "reward": 0.8454554080963135, "reward_std": 0.10821234434843063, "rewards/reward_len/mean": 0.8454554080963135, "rewards/reward_len/std": 0.1513986736536026, "step": 544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8876221498371335, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5666660666465759, "kl": 0.24348625540733337, "learning_rate": 7.997939641369908e-07, "loss": 0.0002, "num_tokens": 223994512.0, "reward": 0.849653422832489, "reward_std": 0.08941784501075745, "rewards/reward_len/mean": 0.849653422832489, "rewards/reward_len/std": 0.17270545661449432, "step": 545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8892508143322475, "frac_reward_zero_std": 0.0, "grad_norm": 0.5714840292930603, "kl": 0.21954554319381714, "learning_rate": 7.991110516431232e-07, "loss": 0.0002, "num_tokens": 224405552.0, "reward": 0.7996347546577454, "reward_std": 0.10156466066837311, "rewards/reward_len/mean": 0.7996347546577454, "rewards/reward_len/std": 0.16829070448875427, "step": 546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8908794788273615, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5555116534233093, "kl": 0.23673126101493835, "learning_rate": 7.98427269081953e-07, "loss": 0.0002, "num_tokens": 224815680.0, "reward": 0.8331848382949829, "reward_std": 0.10590057075023651, "rewards/reward_len/mean": 0.8331848382949829, "rewards/reward_len/std": 0.22235199809074402, "step": 547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8925081433224755, "frac_reward_zero_std": 0.0, "grad_norm": 0.5519442558288574, "kl": 0.25686562061309814, "learning_rate": 7.977426184424962e-07, "loss": 0.0003, "num_tokens": 225227104.0, "reward": 0.8243204951286316, "reward_std": 0.10782928764820099, "rewards/reward_len/mean": 0.8243205547332764, "rewards/reward_len/std": 0.19643281400203705, "step": 548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8941368078175895, "frac_reward_zero_std": 0.0, "grad_norm": 0.5900027751922607, "kl": 0.24186165630817413, "learning_rate": 7.970571017162949e-07, "loss": 0.0002, "num_tokens": 225637888.0, "reward": 0.8260592222213745, "reward_std": 0.1007460504770279, "rewards/reward_len/mean": 0.8260592222213745, "rewards/reward_len/std": 0.20925332605838776, "step": 549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8957654723127035, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5959728956222534, "kl": 0.2492484152317047, "learning_rate": 7.963707208974102e-07, "loss": 0.0002, "num_tokens": 226048816.0, "reward": 0.7824039459228516, "reward_std": 0.11140377819538116, "rewards/reward_len/mean": 0.7824040055274963, "rewards/reward_len/std": 0.24756908416748047, "step": 550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8973941368078175, "frac_reward_zero_std": 0.0, "grad_norm": 0.5421157479286194, "kl": 0.26949411630630493, "learning_rate": 7.956834779824166e-07, "loss": 0.0003, "num_tokens": 226459680.0, "reward": 0.8600819706916809, "reward_std": 0.09554202854633331, "rewards/reward_len/mean": 0.8600819706916809, "rewards/reward_len/std": 0.15308891236782074, "step": 551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.8990228013029316, "frac_reward_zero_std": 0.0, "grad_norm": 0.606564998626709, "kl": 0.27292361855506897, "learning_rate": 7.949953749703962e-07, "loss": 0.0003, "num_tokens": 226870080.0, "reward": 0.7963042855262756, "reward_std": 0.10450835525989532, "rewards/reward_len/mean": 0.7963042855262756, "rewards/reward_len/std": 0.19963876903057098, "step": 552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9006514657980456, "frac_reward_zero_std": 0.03125, "grad_norm": 0.555708110332489, "kl": 0.27348271012306213, "learning_rate": 7.943064138629331e-07, "loss": 0.0003, "num_tokens": 227281648.0, "reward": 0.8263704180717468, "reward_std": 0.11014839261770248, "rewards/reward_len/mean": 0.8263704180717468, "rewards/reward_len/std": 0.2328416407108307, "step": 553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9022801302931596, "frac_reward_zero_std": 0.03125, "grad_norm": 0.569821834564209, "kl": 0.2500215172767639, "learning_rate": 7.936165966641077e-07, "loss": 0.0003, "num_tokens": 227692128.0, "reward": 0.8416356444358826, "reward_std": 0.12257122248411179, "rewards/reward_len/mean": 0.8416356444358826, "rewards/reward_len/std": 0.19831472635269165, "step": 554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9039087947882736, "frac_reward_zero_std": 0.0, "grad_norm": 0.5520820617675781, "kl": 0.24895983934402466, "learning_rate": 7.929259253804901e-07, "loss": 0.0002, "num_tokens": 228101984.0, "reward": 0.8259826898574829, "reward_std": 0.09697411954402924, "rewards/reward_len/mean": 0.8259826898574829, "rewards/reward_len/std": 0.15188974142074585, "step": 555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9055374592833876, "frac_reward_zero_std": 0.0, "grad_norm": 0.5634334683418274, "kl": 0.2566365599632263, "learning_rate": 7.922344020211356e-07, "loss": 0.0003, "num_tokens": 228513056.0, "reward": 0.8600718975067139, "reward_std": 0.1176261380314827, "rewards/reward_len/mean": 0.8600719571113586, "rewards/reward_len/std": 0.14862403273582458, "step": 556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9071661237785016, "frac_reward_zero_std": 0.09375, "grad_norm": 0.5217992067337036, "kl": 0.24115225672721863, "learning_rate": 7.915420285975771e-07, "loss": 0.0002, "num_tokens": 228923792.0, "reward": 0.802271842956543, "reward_std": 0.1151605024933815, "rewards/reward_len/mean": 0.802271842956543, "rewards/reward_len/std": 0.2613186836242676, "step": 557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9087947882736156, "frac_reward_zero_std": 0.0, "grad_norm": 0.6246284246444702, "kl": 0.2284250557422638, "learning_rate": 7.90848807123821e-07, "loss": 0.0002, "num_tokens": 229335616.0, "reward": 0.769812822341919, "reward_std": 0.11429795622825623, "rewards/reward_len/mean": 0.769812822341919, "rewards/reward_len/std": 0.2424582540988922, "step": 558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9104234527687296, "frac_reward_zero_std": 0.0, "grad_norm": 0.6257683634757996, "kl": 0.2438315749168396, "learning_rate": 7.901547396163399e-07, "loss": 0.0002, "num_tokens": 229745456.0, "reward": 0.8102747201919556, "reward_std": 0.1216910183429718, "rewards/reward_len/mean": 0.8102747201919556, "rewards/reward_len/std": 0.22713012993335724, "step": 559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9120521172638436, "frac_reward_zero_std": 0.03125, "grad_norm": 0.510200023651123, "kl": 0.2890588939189911, "learning_rate": 7.894598280940681e-07, "loss": 0.0003, "num_tokens": 230156704.0, "reward": 0.8746533393859863, "reward_std": 0.07875138521194458, "rewards/reward_len/mean": 0.8746533393859863, "rewards/reward_len/std": 0.10755586624145508, "step": 560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9136807817589576, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5616955757141113, "kl": 0.27124109864234924, "learning_rate": 7.887640745783943e-07, "loss": 0.0003, "num_tokens": 230567760.0, "reward": 0.8365346193313599, "reward_std": 0.10781961679458618, "rewards/reward_len/mean": 0.8365346193313599, "rewards/reward_len/std": 0.16866962611675262, "step": 561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9153094462540716, "frac_reward_zero_std": 0.0, "grad_norm": 0.5780808329582214, "kl": 0.24212706089019775, "learning_rate": 7.880674810931571e-07, "loss": 0.0002, "num_tokens": 230980000.0, "reward": 0.8335253596305847, "reward_std": 0.11259264498949051, "rewards/reward_len/mean": 0.8335253596305847, "rewards/reward_len/std": 0.18783952295780182, "step": 562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9169381107491856, "frac_reward_zero_std": 0.0, "grad_norm": 0.6028104424476624, "kl": 0.2639290392398834, "learning_rate": 7.873700496646377e-07, "loss": 0.0003, "num_tokens": 231392016.0, "reward": 0.7777013778686523, "reward_std": 0.1427796483039856, "rewards/reward_len/mean": 0.7777013778686523, "rewards/reward_len/std": 0.2544487416744232, "step": 563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9185667752442996, "frac_reward_zero_std": 0.0, "grad_norm": 0.5863448977470398, "kl": 0.24887919425964355, "learning_rate": 7.866717823215552e-07, "loss": 0.0002, "num_tokens": 231802208.0, "reward": 0.8377010822296143, "reward_std": 0.11942918598651886, "rewards/reward_len/mean": 0.8377010822296143, "rewards/reward_len/std": 0.16433678567409515, "step": 564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9201954397394136, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6063240766525269, "kl": 0.2418881058692932, "learning_rate": 7.859726810950605e-07, "loss": 0.0002, "num_tokens": 232212112.0, "reward": 0.8532135486602783, "reward_std": 0.10409578680992126, "rewards/reward_len/mean": 0.8532135486602783, "rewards/reward_len/std": 0.1515282392501831, "step": 565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9218241042345277, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5815527439117432, "kl": 0.24233436584472656, "learning_rate": 7.852727480187293e-07, "loss": 0.0002, "num_tokens": 232622032.0, "reward": 0.8299627304077148, "reward_std": 0.10316605865955353, "rewards/reward_len/mean": 0.8299627304077148, "rewards/reward_len/std": 0.18515600264072418, "step": 566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9234527687296417, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5352641344070435, "kl": 0.3272578716278076, "learning_rate": 7.845719851285579e-07, "loss": 0.0003, "num_tokens": 233033280.0, "reward": 0.8300646543502808, "reward_std": 0.11096519976854324, "rewards/reward_len/mean": 0.8300646543502808, "rewards/reward_len/std": 0.1952357143163681, "step": 567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9250814332247557, "frac_reward_zero_std": 0.0, "grad_norm": 0.6146264672279358, "kl": 0.27137231826782227, "learning_rate": 7.838703944629559e-07, "loss": 0.0003, "num_tokens": 233444368.0, "reward": 0.8303281664848328, "reward_std": 0.10820023715496063, "rewards/reward_len/mean": 0.830328106880188, "rewards/reward_len/std": 0.16442552208900452, "step": 568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9267100977198697, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5749176144599915, "kl": 0.25724995136260986, "learning_rate": 7.831679780627411e-07, "loss": 0.0003, "num_tokens": 233855888.0, "reward": 0.842593252658844, "reward_std": 0.09219968318939209, "rewards/reward_len/mean": 0.8425933122634888, "rewards/reward_len/std": 0.16973188519477844, "step": 569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9283387622149837, "frac_reward_zero_std": 0.0, "grad_norm": 0.5842222571372986, "kl": 0.27163487672805786, "learning_rate": 7.824647379711327e-07, "loss": 0.0003, "num_tokens": 234266720.0, "reward": 0.8053985834121704, "reward_std": 0.12647953629493713, "rewards/reward_len/mean": 0.8053985834121704, "rewards/reward_len/std": 0.24858151376247406, "step": 570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9299674267100977, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5662351250648499, "kl": 0.311867892742157, "learning_rate": 7.817606762337463e-07, "loss": 0.0003, "num_tokens": 234677632.0, "reward": 0.8743805289268494, "reward_std": 0.09253264963626862, "rewards/reward_len/mean": 0.8743805289268494, "rewards/reward_len/std": 0.1344861090183258, "step": 571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9315960912052117, "frac_reward_zero_std": 0.0, "grad_norm": 0.5671836733818054, "kl": 0.28847360610961914, "learning_rate": 7.810557948985877e-07, "loss": 0.0003, "num_tokens": 235088976.0, "reward": 0.8413286209106445, "reward_std": 0.12161331623792648, "rewards/reward_len/mean": 0.8413286209106445, "rewards/reward_len/std": 0.1650070995092392, "step": 572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9332247557003257, "frac_reward_zero_std": 0.0, "grad_norm": 0.5895443558692932, "kl": 0.29549476504325867, "learning_rate": 7.803500960160463e-07, "loss": 0.0003, "num_tokens": 235500080.0, "reward": 0.8584872484207153, "reward_std": 0.09844532608985901, "rewards/reward_len/mean": 0.8584872484207153, "rewards/reward_len/std": 0.1339057832956314, "step": 573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9348534201954397, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5601882338523865, "kl": 0.27787572145462036, "learning_rate": 7.796435816388898e-07, "loss": 0.0003, "num_tokens": 235910784.0, "reward": 0.8277124166488647, "reward_std": 0.11597315967082977, "rewards/reward_len/mean": 0.8277124166488647, "rewards/reward_len/std": 0.17702212929725647, "step": 574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9364820846905537, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5327553749084473, "kl": 0.2847519814968109, "learning_rate": 7.789362538222584e-07, "loss": 0.0003, "num_tokens": 236320928.0, "reward": 0.8437840938568115, "reward_std": 0.08028748631477356, "rewards/reward_len/mean": 0.843783974647522, "rewards/reward_len/std": 0.20275568962097168, "step": 575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9381107491856677, "frac_reward_zero_std": 0.03125, "grad_norm": 0.598971962928772, "kl": 0.30782389640808105, "learning_rate": 7.78228114623658e-07, "loss": 0.0003, "num_tokens": 236731584.0, "reward": 0.8677238821983337, "reward_std": 0.09527072310447693, "rewards/reward_len/mean": 0.8677238821983337, "rewards/reward_len/std": 0.13116368651390076, "step": 576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9397394136807817, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5856978893280029, "kl": 0.2798788547515869, "learning_rate": 7.77519166102955e-07, "loss": 0.0003, "num_tokens": 237142880.0, "reward": 0.7955989837646484, "reward_std": 0.0993552878499031, "rewards/reward_len/mean": 0.7955989837646484, "rewards/reward_len/std": 0.2393493503332138, "step": 577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9413680781758957, "frac_reward_zero_std": 0.0, "grad_norm": 0.7539804577827454, "kl": 0.3084201216697693, "learning_rate": 7.768094103223695e-07, "loss": 0.0003, "num_tokens": 237553968.0, "reward": 0.8020424842834473, "reward_std": 0.1332278698682785, "rewards/reward_len/mean": 0.8020424842834473, "rewards/reward_len/std": 0.21087251603603363, "step": 578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9429967426710097, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6168598532676697, "kl": 0.308039128780365, "learning_rate": 7.760988493464704e-07, "loss": 0.0003, "num_tokens": 237964272.0, "reward": 0.8498693704605103, "reward_std": 0.10567611455917358, "rewards/reward_len/mean": 0.8498693704605103, "rewards/reward_len/std": 0.169338196516037, "step": 579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9446254071661238, "frac_reward_zero_std": 0.0, "grad_norm": 0.5650925040245056, "kl": 0.2860453724861145, "learning_rate": 7.753874852421685e-07, "loss": 0.0003, "num_tokens": 238375104.0, "reward": 0.837483286857605, "reward_std": 0.11252100765705109, "rewards/reward_len/mean": 0.837483286857605, "rewards/reward_len/std": 0.19000345468521118, "step": 580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9462540716612378, "frac_reward_zero_std": 0.0, "grad_norm": 0.5326634049415588, "kl": 0.3409695625305176, "learning_rate": 7.746753200787109e-07, "loss": 0.0003, "num_tokens": 238786976.0, "reward": 0.8365786671638489, "reward_std": 0.09253674745559692, "rewards/reward_len/mean": 0.8365786671638489, "rewards/reward_len/std": 0.19340579211711884, "step": 581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9478827361563518, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5388736724853516, "kl": 0.29505455493927, "learning_rate": 7.739623559276746e-07, "loss": 0.0003, "num_tokens": 239197952.0, "reward": 0.8279705047607422, "reward_std": 0.11050303280353546, "rewards/reward_len/mean": 0.8279705047607422, "rewards/reward_len/std": 0.19504782557487488, "step": 582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9495114006514658, "frac_reward_zero_std": 0.0, "grad_norm": 0.5844742655754089, "kl": 0.30114972591400146, "learning_rate": 7.732485948629609e-07, "loss": 0.0003, "num_tokens": 239608384.0, "reward": 0.8285408020019531, "reward_std": 0.11097884178161621, "rewards/reward_len/mean": 0.8285407423973083, "rewards/reward_len/std": 0.18373934924602509, "step": 583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9511400651465798, "frac_reward_zero_std": 0.0, "grad_norm": 0.6857519745826721, "kl": 0.29866838455200195, "learning_rate": 7.725340389607893e-07, "loss": 0.0003, "num_tokens": 240020208.0, "reward": 0.8053740859031677, "reward_std": 0.1288485825061798, "rewards/reward_len/mean": 0.8053740859031677, "rewards/reward_len/std": 0.1959168165922165, "step": 584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9527687296416938, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5657849907875061, "kl": 0.3421083390712738, "learning_rate": 7.718186902996912e-07, "loss": 0.0003, "num_tokens": 240432544.0, "reward": 0.8726158142089844, "reward_std": 0.07607116550207138, "rewards/reward_len/mean": 0.8726158142089844, "rewards/reward_len/std": 0.1300937980413437, "step": 585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9543973941368078, "frac_reward_zero_std": 0.0, "grad_norm": 0.5389795303344727, "kl": 0.356436550617218, "learning_rate": 7.711025509605041e-07, "loss": 0.0004, "num_tokens": 240842768.0, "reward": 0.8239251375198364, "reward_std": 0.1176847442984581, "rewards/reward_len/mean": 0.8239251375198364, "rewards/reward_len/std": 0.22845831513404846, "step": 586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9560260586319218, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5319870114326477, "kl": 0.30979156494140625, "learning_rate": 7.703856230263651e-07, "loss": 0.0003, "num_tokens": 241253568.0, "reward": 0.8483954071998596, "reward_std": 0.10359126329421997, "rewards/reward_len/mean": 0.8483954668045044, "rewards/reward_len/std": 0.14918607473373413, "step": 587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9576547231270358, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6905359029769897, "kl": 0.40442126989364624, "learning_rate": 7.696679085827058e-07, "loss": 0.0004, "num_tokens": 241664272.0, "reward": 0.8668676018714905, "reward_std": 0.08843831717967987, "rewards/reward_len/mean": 0.8668676018714905, "rewards/reward_len/std": 0.1617080718278885, "step": 588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9592833876221498, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5526609420776367, "kl": 0.3590185046195984, "learning_rate": 7.689494097172455e-07, "loss": 0.0004, "num_tokens": 242074544.0, "reward": 0.8545811176300049, "reward_std": 0.11664096266031265, "rewards/reward_len/mean": 0.8545811176300049, "rewards/reward_len/std": 0.20078115165233612, "step": 589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9609120521172638, "frac_reward_zero_std": 0.0, "grad_norm": 0.6310095191001892, "kl": 0.3420923948287964, "learning_rate": 7.682301285199849e-07, "loss": 0.0003, "num_tokens": 242485296.0, "reward": 0.8467482328414917, "reward_std": 0.11450427025556564, "rewards/reward_len/mean": 0.8467482328414917, "rewards/reward_len/std": 0.17244930565357208, "step": 590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9625407166123778, "frac_reward_zero_std": 0.0, "grad_norm": 0.611204981803894, "kl": 0.36274588108062744, "learning_rate": 7.675100670832006e-07, "loss": 0.0004, "num_tokens": 242897280.0, "reward": 0.838079571723938, "reward_std": 0.09957221150398254, "rewards/reward_len/mean": 0.8380795121192932, "rewards/reward_len/std": 0.16554220020771027, "step": 591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9641693811074918, "frac_reward_zero_std": 0.0, "grad_norm": 0.5560952425003052, "kl": 0.3546805679798126, "learning_rate": 7.667892275014388e-07, "loss": 0.0004, "num_tokens": 243308096.0, "reward": 0.8303995132446289, "reward_std": 0.10593730211257935, "rewards/reward_len/mean": 0.8303995132446289, "rewards/reward_len/std": 0.162358820438385, "step": 592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9657980456026058, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5180210471153259, "kl": 0.3513214588165283, "learning_rate": 7.660676118715091e-07, "loss": 0.0004, "num_tokens": 243719392.0, "reward": 0.8927605152130127, "reward_std": 0.10449770092964172, "rewards/reward_len/mean": 0.8927605152130127, "rewards/reward_len/std": 0.14578671753406525, "step": 593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9674267100977199, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5956279039382935, "kl": 0.3604390323162079, "learning_rate": 7.653452222924789e-07, "loss": 0.0004, "num_tokens": 244129744.0, "reward": 0.8500046730041504, "reward_std": 0.1100105345249176, "rewards/reward_len/mean": 0.8500046730041504, "rewards/reward_len/std": 0.15718653798103333, "step": 594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9690553745928339, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5703727006912231, "kl": 0.32397913932800293, "learning_rate": 7.646220608656661e-07, "loss": 0.0003, "num_tokens": 244539104.0, "reward": 0.822211503982544, "reward_std": 0.11452198028564453, "rewards/reward_len/mean": 0.822211503982544, "rewards/reward_len/std": 0.21146734058856964, "step": 595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9706840390879479, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5250281095504761, "kl": 0.35088035464286804, "learning_rate": 7.638981296946348e-07, "loss": 0.0004, "num_tokens": 244948704.0, "reward": 0.84168541431427, "reward_std": 0.08734993636608124, "rewards/reward_len/mean": 0.84168541431427, "rewards/reward_len/std": 0.17754623293876648, "step": 596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9723127035830619, "frac_reward_zero_std": 0.0, "grad_norm": 0.563444197177887, "kl": 0.3669469952583313, "learning_rate": 7.631734308851871e-07, "loss": 0.0004, "num_tokens": 245360320.0, "reward": 0.8195729851722717, "reward_std": 0.12193629890680313, "rewards/reward_len/mean": 0.819572925567627, "rewards/reward_len/std": 0.2042885720729828, "step": 597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9739413680781759, "frac_reward_zero_std": 0.0, "grad_norm": 0.5756043791770935, "kl": 0.3391907811164856, "learning_rate": 7.624479665453591e-07, "loss": 0.0003, "num_tokens": 245771568.0, "reward": 0.7864437103271484, "reward_std": 0.11826467514038086, "rewards/reward_len/mean": 0.7864437103271484, "rewards/reward_len/std": 0.22963441908359528, "step": 598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9755700325732899, "frac_reward_zero_std": 0.0, "grad_norm": 0.5772921442985535, "kl": 0.3379092812538147, "learning_rate": 7.617217387854129e-07, "loss": 0.0003, "num_tokens": 246183024.0, "reward": 0.8582953214645386, "reward_std": 0.09808305650949478, "rewards/reward_len/mean": 0.8582953214645386, "rewards/reward_len/std": 0.15152522921562195, "step": 599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9771986970684039, "frac_reward_zero_std": 0.09375, "grad_norm": 0.49730271100997925, "kl": 0.37573716044425964, "learning_rate": 7.609947497178315e-07, "loss": 0.0004, "num_tokens": 246595568.0, "reward": 0.8581952452659607, "reward_std": 0.07612589746713638, "rewards/reward_len/mean": 0.8581951856613159, "rewards/reward_len/std": 0.1873202919960022, "step": 600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9788273615635179, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5065875053405762, "kl": 0.36832743883132935, "learning_rate": 7.602670014573127e-07, "loss": 0.0004, "num_tokens": 247006656.0, "reward": 0.849418044090271, "reward_std": 0.09987343847751617, "rewards/reward_len/mean": 0.849418044090271, "rewards/reward_len/std": 0.17614474892616272, "step": 601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9804560260586319, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5368043780326843, "kl": 0.37028101086616516, "learning_rate": 7.595384961207622e-07, "loss": 0.0004, "num_tokens": 247418064.0, "reward": 0.8080271482467651, "reward_std": 0.10241306573152542, "rewards/reward_len/mean": 0.8080271482467651, "rewards/reward_len/std": 0.1684727668762207, "step": 602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9820846905537459, "frac_reward_zero_std": 0.0, "grad_norm": 0.6910271048545837, "kl": 0.3602588176727295, "learning_rate": 7.588092358272884e-07, "loss": 0.0004, "num_tokens": 247828576.0, "reward": 0.7754114866256714, "reward_std": 0.11364054679870605, "rewards/reward_len/mean": 0.7754114270210266, "rewards/reward_len/std": 0.22544024884700775, "step": 603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9837133550488599, "frac_reward_zero_std": 0.0, "grad_norm": 0.5350521206855774, "kl": 0.3361983299255371, "learning_rate": 7.580792226981954e-07, "loss": 0.0003, "num_tokens": 248239200.0, "reward": 0.8500527143478394, "reward_std": 0.08158345520496368, "rewards/reward_len/mean": 0.8500527143478394, "rewards/reward_len/std": 0.19452986121177673, "step": 604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9853420195439739, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5994756817817688, "kl": 0.39164984226226807, "learning_rate": 7.573484588569774e-07, "loss": 0.0004, "num_tokens": 248650400.0, "reward": 0.8673146963119507, "reward_std": 0.0917695090174675, "rewards/reward_len/mean": 0.8673146963119507, "rewards/reward_len/std": 0.19410021603107452, "step": 605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.9869706840390879, "frac_reward_zero_std": 0.0, "grad_norm": 0.5842595100402832, "kl": 0.3062869906425476, "learning_rate": 7.566169464293119e-07, "loss": 0.0003, "num_tokens": 249060848.0, "reward": 0.8280701637268066, "reward_std": 0.10828258097171783, "rewards/reward_len/mean": 0.8280701637268066, "rewards/reward_len/std": 0.2158779799938202, "step": 606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.988599348534202, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5362155437469482, "kl": 0.344526469707489, "learning_rate": 7.558846875430547e-07, "loss": 0.0003, "num_tokens": 249471760.0, "reward": 0.8611372709274292, "reward_std": 0.09540245682001114, "rewards/reward_len/mean": 0.8611372709274292, "rewards/reward_len/std": 0.17043142020702362, "step": 607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.990228013029316, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5807381272315979, "kl": 0.31067898869514465, "learning_rate": 7.551516843282322e-07, "loss": 0.0003, "num_tokens": 249882480.0, "reward": 0.78212571144104, "reward_std": 0.12615013122558594, "rewards/reward_len/mean": 0.78212571144104, "rewards/reward_len/std": 0.2607714831829071, "step": 608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.99185667752443, "frac_reward_zero_std": 0.0, "grad_norm": 0.6229561567306519, "kl": 0.3591206669807434, "learning_rate": 7.544179389170361e-07, "loss": 0.0004, "num_tokens": 250292720.0, "reward": 0.8561010956764221, "reward_std": 0.10623371601104736, "rewards/reward_len/mean": 0.8561011552810669, "rewards/reward_len/std": 0.16192689538002014, "step": 609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.993485342019544, "frac_reward_zero_std": 0.0, "grad_norm": 0.5762795209884644, "kl": 0.40383386611938477, "learning_rate": 7.536834534438173e-07, "loss": 0.0004, "num_tokens": 250703744.0, "reward": 0.8357879519462585, "reward_std": 0.12551359832286835, "rewards/reward_len/mean": 0.8357880115509033, "rewards/reward_len/std": 0.16290920972824097, "step": 610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.995114006514658, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5259905457496643, "kl": 0.36576738953590393, "learning_rate": 7.529482300450792e-07, "loss": 0.0004, "num_tokens": 251114224.0, "reward": 0.8629857897758484, "reward_std": 0.09768620878458023, "rewards/reward_len/mean": 0.8629857897758484, "rewards/reward_len/std": 0.15774208307266235, "step": 611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.996742671009772, "frac_reward_zero_std": 0.0, "grad_norm": 0.6200954914093018, "kl": 0.3864758014678955, "learning_rate": 7.522122708594719e-07, "loss": 0.0004, "num_tokens": 251525888.0, "reward": 0.813705563545227, "reward_std": 0.12580710649490356, "rewards/reward_len/mean": 0.813705563545227, "rewards/reward_len/std": 0.2173054963350296, "step": 612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.998371335504886, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5614097118377686, "kl": 0.3732612133026123, "learning_rate": 7.514755780277853e-07, "loss": 0.0004, "num_tokens": 251936864.0, "reward": 0.8726152181625366, "reward_std": 0.0858200341463089, "rewards/reward_len/mean": 0.8726152181625366, "rewards/reward_len/std": 0.11952804774045944, "step": 613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.5531923770904541, "kl": 0.3738713562488556, "learning_rate": 7.507381536929439e-07, "loss": 0.0004, "num_tokens": 252348720.0, "reward": 0.830078125, "reward_std": 0.09806280583143234, "rewards/reward_len/mean": 0.830078125, "rewards/reward_len/std": 0.19350625574588776, "step": 614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.001628664495114, "frac_reward_zero_std": 0.0, "grad_norm": 0.5552987456321716, "kl": 0.3316299319267273, "learning_rate": 7.5e-07, "loss": 0.0003, "num_tokens": 252759824.0, "reward": 0.8196622133255005, "reward_std": 0.11807513236999512, "rewards/reward_len/mean": 0.8196621537208557, "rewards/reward_len/std": 0.19745850563049316, "step": 615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.003257328990228, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6796162724494934, "kl": 0.38460442423820496, "learning_rate": 7.492611190961271e-07, "loss": 0.0004, "num_tokens": 253171040.0, "reward": 0.8658927083015442, "reward_std": 0.1057574525475502, "rewards/reward_len/mean": 0.8658927083015442, "rewards/reward_len/std": 0.15178290009498596, "step": 616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.004885993485342, "frac_reward_zero_std": 0.0, "grad_norm": 0.5390791296958923, "kl": 0.4506666660308838, "learning_rate": 7.485215131306145e-07, "loss": 0.0005, "num_tokens": 253582560.0, "reward": 0.8585485219955444, "reward_std": 0.08712103217840195, "rewards/reward_len/mean": 0.8585485219955444, "rewards/reward_len/std": 0.18704335391521454, "step": 617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.006514657980456, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5743985772132874, "kl": 0.400398850440979, "learning_rate": 7.477811842548601e-07, "loss": 0.0004, "num_tokens": 253994560.0, "reward": 0.8647803664207458, "reward_std": 0.09316179156303406, "rewards/reward_len/mean": 0.8647804260253906, "rewards/reward_len/std": 0.1435493677854538, "step": 618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.00814332247557, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6208407282829285, "kl": 0.4309912919998169, "learning_rate": 7.470401346223653e-07, "loss": 0.0004, "num_tokens": 254404704.0, "reward": 0.8739265203475952, "reward_std": 0.08483880013227463, "rewards/reward_len/mean": 0.8739265203475952, "rewards/reward_len/std": 0.1547812670469284, "step": 619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.009771986970684, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5944214463233948, "kl": 0.41658148169517517, "learning_rate": 7.462983663887271e-07, "loss": 0.0004, "num_tokens": 254816352.0, "reward": 0.8305535316467285, "reward_std": 0.10325829684734344, "rewards/reward_len/mean": 0.8305535316467285, "rewards/reward_len/std": 0.20945893228054047, "step": 620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.011400651465798, "frac_reward_zero_std": 0.0, "grad_norm": 0.5691332817077637, "kl": 0.4978486895561218, "learning_rate": 7.455558817116338e-07, "loss": 0.0005, "num_tokens": 255229136.0, "reward": 0.8484501242637634, "reward_std": 0.09672130644321442, "rewards/reward_len/mean": 0.8484500646591187, "rewards/reward_len/std": 0.2131289392709732, "step": 621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.013029315960912, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5246078372001648, "kl": 0.4545177221298218, "learning_rate": 7.448126827508572e-07, "loss": 0.0005, "num_tokens": 255639776.0, "reward": 0.8319501876831055, "reward_std": 0.11982402205467224, "rewards/reward_len/mean": 0.8319501876831055, "rewards/reward_len/std": 0.19661396741867065, "step": 622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.014657980456026, "frac_reward_zero_std": 0.0, "grad_norm": 0.5510538220405579, "kl": 0.5204042196273804, "learning_rate": 7.440687716682467e-07, "loss": 0.0005, "num_tokens": 256051552.0, "reward": 0.8570336699485779, "reward_std": 0.09731556475162506, "rewards/reward_len/mean": 0.8570336699485779, "rewards/reward_len/std": 0.18008731305599213, "step": 623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.01628664495114, "frac_reward_zero_std": 0.03125, "grad_norm": 0.583458423614502, "kl": 0.37263137102127075, "learning_rate": 7.433241506277238e-07, "loss": 0.0004, "num_tokens": 256462976.0, "reward": 0.8109834790229797, "reward_std": 0.11518511176109314, "rewards/reward_len/mean": 0.8109834790229797, "rewards/reward_len/std": 0.22808846831321716, "step": 624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.017915309446254, "frac_reward_zero_std": 0.09375, "grad_norm": 0.5040473937988281, "kl": 0.4053654074668884, "learning_rate": 7.425788217952743e-07, "loss": 0.0004, "num_tokens": 256873184.0, "reward": 0.8378493189811707, "reward_std": 0.10683667659759521, "rewards/reward_len/mean": 0.8378493189811707, "rewards/reward_len/std": 0.20308978855609894, "step": 625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.019543973941368, "frac_reward_zero_std": 0.0, "grad_norm": 0.668716549873352, "kl": 0.46982792019844055, "learning_rate": 7.418327873389436e-07, "loss": 0.0005, "num_tokens": 257285136.0, "reward": 0.7864384055137634, "reward_std": 0.12523245811462402, "rewards/reward_len/mean": 0.7864383459091187, "rewards/reward_len/std": 0.2567039132118225, "step": 626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.021172638436482, "frac_reward_zero_std": 0.0, "grad_norm": 0.5947088599205017, "kl": 0.3952181935310364, "learning_rate": 7.410860494288292e-07, "loss": 0.0004, "num_tokens": 257695840.0, "reward": 0.8205632567405701, "reward_std": 0.10346122086048126, "rewards/reward_len/mean": 0.8205632567405701, "rewards/reward_len/std": 0.22779974341392517, "step": 627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.022801302931596, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5791059136390686, "kl": 0.3634698987007141, "learning_rate": 7.40338610237075e-07, "loss": 0.0004, "num_tokens": 258106288.0, "reward": 0.806199312210083, "reward_std": 0.10396707057952881, "rewards/reward_len/mean": 0.8061993718147278, "rewards/reward_len/std": 0.255084365606308, "step": 628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.02442996742671, "frac_reward_zero_std": 0.03125, "grad_norm": 0.550311803817749, "kl": 0.47054311633110046, "learning_rate": 7.395904719378648e-07, "loss": 0.0005, "num_tokens": 258516608.0, "reward": 0.8228127956390381, "reward_std": 0.10350769758224487, "rewards/reward_len/mean": 0.8228127956390381, "rewards/reward_len/std": 0.16354136168956757, "step": 629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0260586319218241, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4968714118003845, "kl": 0.5364499092102051, "learning_rate": 7.388416367074158e-07, "loss": 0.0005, "num_tokens": 258927184.0, "reward": 0.855431318283081, "reward_std": 0.07860717922449112, "rewards/reward_len/mean": 0.8554313778877258, "rewards/reward_len/std": 0.1253942996263504, "step": 630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0276872964169381, "frac_reward_zero_std": 0.0, "grad_norm": 0.5554788708686829, "kl": 0.5350141525268555, "learning_rate": 7.380921067239731e-07, "loss": 0.0005, "num_tokens": 259337504.0, "reward": 0.8577651977539062, "reward_std": 0.09476503729820251, "rewards/reward_len/mean": 0.8577651381492615, "rewards/reward_len/std": 0.14974445104599, "step": 631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0293159609120521, "frac_reward_zero_std": 0.0, "grad_norm": 0.542684018611908, "kl": 0.5427294969558716, "learning_rate": 7.373418841678019e-07, "loss": 0.0005, "num_tokens": 259748672.0, "reward": 0.8402842283248901, "reward_std": 0.10526715219020844, "rewards/reward_len/mean": 0.8402842283248901, "rewards/reward_len/std": 0.15123547613620758, "step": 632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0309446254071661, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5783001184463501, "kl": 0.42869892716407776, "learning_rate": 7.365909712211826e-07, "loss": 0.0004, "num_tokens": 260159184.0, "reward": 0.8216488361358643, "reward_std": 0.10180453956127167, "rewards/reward_len/mean": 0.8216488361358643, "rewards/reward_len/std": 0.20646095275878906, "step": 633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0325732899022801, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5973176956176758, "kl": 0.5131453275680542, "learning_rate": 7.358393700684032e-07, "loss": 0.0005, "num_tokens": 260569600.0, "reward": 0.8411867618560791, "reward_std": 0.09519438445568085, "rewards/reward_len/mean": 0.8411867618560791, "rewards/reward_len/std": 0.2187321037054062, "step": 634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0342019543973942, "frac_reward_zero_std": 0.0, "grad_norm": 0.6703708171844482, "kl": 0.4974726140499115, "learning_rate": 7.350870828957546e-07, "loss": 0.0005, "num_tokens": 260980800.0, "reward": 0.8186280727386475, "reward_std": 0.12627574801445007, "rewards/reward_len/mean": 0.8186280727386475, "rewards/reward_len/std": 0.2290801852941513, "step": 635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0358306188925082, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6944183111190796, "kl": 0.48783761262893677, "learning_rate": 7.343341118915222e-07, "loss": 0.0005, "num_tokens": 261393456.0, "reward": 0.7593145966529846, "reward_std": 0.14245182275772095, "rewards/reward_len/mean": 0.7593146562576294, "rewards/reward_len/std": 0.29970037937164307, "step": 636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0374592833876222, "frac_reward_zero_std": 0.0, "grad_norm": 0.5505980849266052, "kl": 0.6157242059707642, "learning_rate": 7.33580459245981e-07, "loss": 0.0006, "num_tokens": 261804608.0, "reward": 0.8514288067817688, "reward_std": 0.09641700983047485, "rewards/reward_len/mean": 0.8514288067817688, "rewards/reward_len/std": 0.15619535744190216, "step": 637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0390879478827362, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5661560297012329, "kl": 0.4941999912261963, "learning_rate": 7.328261271513889e-07, "loss": 0.0005, "num_tokens": 262214528.0, "reward": 0.840693473815918, "reward_std": 0.10402005910873413, "rewards/reward_len/mean": 0.840693473815918, "rewards/reward_len/std": 0.16605763137340546, "step": 638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0407166123778502, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6257279515266418, "kl": 0.5717203617095947, "learning_rate": 7.320711178019801e-07, "loss": 0.0006, "num_tokens": 262626224.0, "reward": 0.8054149746894836, "reward_std": 0.09906928986310959, "rewards/reward_len/mean": 0.8054150342941284, "rewards/reward_len/std": 0.24368929862976074, "step": 639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0423452768729642, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5667241215705872, "kl": 0.5487487316131592, "learning_rate": 7.313154333939586e-07, "loss": 0.0005, "num_tokens": 263036080.0, "reward": 0.8850383162498474, "reward_std": 0.07264743745326996, "rewards/reward_len/mean": 0.8850383162498474, "rewards/reward_len/std": 0.1247360110282898, "step": 640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0439739413680782, "frac_reward_zero_std": 0.0, "grad_norm": 0.5240339040756226, "kl": 0.5893254280090332, "learning_rate": 7.305590761254923e-07, "loss": 0.0006, "num_tokens": 263447728.0, "reward": 0.8475768566131592, "reward_std": 0.09322605282068253, "rewards/reward_len/mean": 0.8475768566131592, "rewards/reward_len/std": 0.19681315124034882, "step": 641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0456026058631922, "frac_reward_zero_std": 0.0, "grad_norm": 0.5666318535804749, "kl": 0.5332951545715332, "learning_rate": 7.298020481967065e-07, "loss": 0.0005, "num_tokens": 263858496.0, "reward": 0.829300045967102, "reward_std": 0.10281890630722046, "rewards/reward_len/mean": 0.829300045967102, "rewards/reward_len/std": 0.19059991836547852, "step": 642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0472312703583062, "frac_reward_zero_std": 0.0, "grad_norm": 0.5793806910514832, "kl": 0.5877683758735657, "learning_rate": 7.290443518096769e-07, "loss": 0.0006, "num_tokens": 264269536.0, "reward": 0.8446130752563477, "reward_std": 0.11716818064451218, "rewards/reward_len/mean": 0.8446130752563477, "rewards/reward_len/std": 0.18927079439163208, "step": 643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0488599348534202, "frac_reward_zero_std": 0.0, "grad_norm": 0.5846208333969116, "kl": 0.5963468551635742, "learning_rate": 7.282859891684239e-07, "loss": 0.0006, "num_tokens": 264680688.0, "reward": 0.839897096157074, "reward_std": 0.09905301034450531, "rewards/reward_len/mean": 0.839897096157074, "rewards/reward_len/std": 0.1627429872751236, "step": 644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0504885993485342, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5442854762077332, "kl": 0.5723192095756531, "learning_rate": 7.275269624789059e-07, "loss": 0.0006, "num_tokens": 265090528.0, "reward": 0.8441863059997559, "reward_std": 0.1180189847946167, "rewards/reward_len/mean": 0.8441863059997559, "rewards/reward_len/std": 0.1899232119321823, "step": 645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0521172638436482, "frac_reward_zero_std": 0.03125, "grad_norm": 0.586575984954834, "kl": 0.6454929113388062, "learning_rate": 7.267672739490129e-07, "loss": 0.0006, "num_tokens": 265499920.0, "reward": 0.8134720921516418, "reward_std": 0.13857948780059814, "rewards/reward_len/mean": 0.8134720921516418, "rewards/reward_len/std": 0.22875618934631348, "step": 646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0537459283387622, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5722486972808838, "kl": 0.6286854147911072, "learning_rate": 7.260069257885604e-07, "loss": 0.0006, "num_tokens": 265912032.0, "reward": 0.8641586899757385, "reward_std": 0.08157707750797272, "rewards/reward_len/mean": 0.8641586899757385, "rewards/reward_len/std": 0.11922919750213623, "step": 647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0553745928338762, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5345008373260498, "kl": 0.5601773858070374, "learning_rate": 7.25245920209282e-07, "loss": 0.0006, "num_tokens": 266322208.0, "reward": 0.8468251824378967, "reward_std": 0.07428397983312607, "rewards/reward_len/mean": 0.8468251824378967, "rewards/reward_len/std": 0.18750153481960297, "step": 648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0570032573289903, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5925971865653992, "kl": 0.5676771402359009, "learning_rate": 7.244842594248243e-07, "loss": 0.0006, "num_tokens": 266733024.0, "reward": 0.8550490736961365, "reward_std": 0.08532895147800446, "rewards/reward_len/mean": 0.8550491333007812, "rewards/reward_len/std": 0.14064213633537292, "step": 649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0586319218241043, "frac_reward_zero_std": 0.0, "grad_norm": 0.578087329864502, "kl": 0.7336572408676147, "learning_rate": 7.23721945650739e-07, "loss": 0.0007, "num_tokens": 267144672.0, "reward": 0.8530446887016296, "reward_std": 0.09697538614273071, "rewards/reward_len/mean": 0.8530446887016296, "rewards/reward_len/std": 0.14767959713935852, "step": 650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0602605863192183, "frac_reward_zero_std": 0.0, "grad_norm": 0.5517903566360474, "kl": 0.6277514696121216, "learning_rate": 7.229589811044784e-07, "loss": 0.0006, "num_tokens": 267555760.0, "reward": 0.8282729387283325, "reward_std": 0.09742507338523865, "rewards/reward_len/mean": 0.8282729387283325, "rewards/reward_len/std": 0.2149767130613327, "step": 651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0618892508143323, "frac_reward_zero_std": 0.0625, "grad_norm": 0.477424293756485, "kl": 0.7317923307418823, "learning_rate": 7.221953680053866e-07, "loss": 0.0007, "num_tokens": 267967232.0, "reward": 0.8444876670837402, "reward_std": 0.09351719170808792, "rewards/reward_len/mean": 0.8444876670837402, "rewards/reward_len/std": 0.21039189398288727, "step": 652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0635179153094463, "frac_reward_zero_std": 0.0, "grad_norm": 0.5747439861297607, "kl": 0.64033043384552, "learning_rate": 7.214311085746949e-07, "loss": 0.0006, "num_tokens": 268378528.0, "reward": 0.8389354944229126, "reward_std": 0.10830576717853546, "rewards/reward_len/mean": 0.8389354944229126, "rewards/reward_len/std": 0.20537051558494568, "step": 653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0651465798045603, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5664095878601074, "kl": 0.7609057426452637, "learning_rate": 7.206662050355147e-07, "loss": 0.0008, "num_tokens": 268790144.0, "reward": 0.8324373960494995, "reward_std": 0.11176201701164246, "rewards/reward_len/mean": 0.8324373364448547, "rewards/reward_len/std": 0.20510922372341156, "step": 654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0667752442996743, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5125067830085754, "kl": 0.6649645566940308, "learning_rate": 7.199006596128306e-07, "loss": 0.0007, "num_tokens": 269200976.0, "reward": 0.8753788471221924, "reward_std": 0.0900731235742569, "rewards/reward_len/mean": 0.8753788471221924, "rewards/reward_len/std": 0.14599086344242096, "step": 655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0684039087947883, "frac_reward_zero_std": 0.03125, "grad_norm": 0.554685652256012, "kl": 0.7326669692993164, "learning_rate": 7.191344745334948e-07, "loss": 0.0007, "num_tokens": 269611872.0, "reward": 0.8379058241844177, "reward_std": 0.11248129606246948, "rewards/reward_len/mean": 0.837905764579773, "rewards/reward_len/std": 0.1805448979139328, "step": 656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0700325732899023, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5456286668777466, "kl": 0.6794421076774597, "learning_rate": 7.1836765202622e-07, "loss": 0.0007, "num_tokens": 270022640.0, "reward": 0.8431286811828613, "reward_std": 0.12385959178209305, "rewards/reward_len/mean": 0.8431286811828613, "rewards/reward_len/std": 0.2109021246433258, "step": 657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0716612377850163, "frac_reward_zero_std": 0.0, "grad_norm": 0.5899454355239868, "kl": 0.6936199069023132, "learning_rate": 7.176001943215729e-07, "loss": 0.0007, "num_tokens": 270434640.0, "reward": 0.8277610540390015, "reward_std": 0.09654328227043152, "rewards/reward_len/mean": 0.8277610540390015, "rewards/reward_len/std": 0.15865086019039154, "step": 658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0732899022801303, "frac_reward_zero_std": 0.0, "grad_norm": 0.6282787919044495, "kl": 0.6315916776657104, "learning_rate": 7.168321036519678e-07, "loss": 0.0006, "num_tokens": 270844592.0, "reward": 0.8010393381118774, "reward_std": 0.14550721645355225, "rewards/reward_len/mean": 0.8010393381118774, "rewards/reward_len/std": 0.22441506385803223, "step": 659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0749185667752443, "frac_reward_zero_std": 0.0, "grad_norm": 0.518451452255249, "kl": 0.7201743125915527, "learning_rate": 7.160633822516607e-07, "loss": 0.0007, "num_tokens": 271255648.0, "reward": 0.8692667484283447, "reward_std": 0.08767472207546234, "rewards/reward_len/mean": 0.8692667484283447, "rewards/reward_len/std": 0.14743219316005707, "step": 660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0765472312703583, "frac_reward_zero_std": 0.0, "grad_norm": 0.5512977242469788, "kl": 0.7426429986953735, "learning_rate": 7.152940323567422e-07, "loss": 0.0007, "num_tokens": 271667136.0, "reward": 0.8568596839904785, "reward_std": 0.09527057409286499, "rewards/reward_len/mean": 0.8568596839904785, "rewards/reward_len/std": 0.13675197958946228, "step": 661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0781758957654723, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5253465175628662, "kl": 0.6418797969818115, "learning_rate": 7.145240562051304e-07, "loss": 0.0006, "num_tokens": 272078512.0, "reward": 0.8507390022277832, "reward_std": 0.09361854940652847, "rewards/reward_len/mean": 0.8507390022277832, "rewards/reward_len/std": 0.15769143402576447, "step": 662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0798045602605864, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5715615749359131, "kl": 0.6426703333854675, "learning_rate": 7.137534560365661e-07, "loss": 0.0006, "num_tokens": 272489776.0, "reward": 0.8378469944000244, "reward_std": 0.09789557754993439, "rewards/reward_len/mean": 0.8378469944000244, "rewards/reward_len/std": 0.168008491396904, "step": 663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0814332247557004, "frac_reward_zero_std": 0.0, "grad_norm": 0.654187798500061, "kl": 0.6659982204437256, "learning_rate": 7.129822340926043e-07, "loss": 0.0007, "num_tokens": 272901728.0, "reward": 0.8406924605369568, "reward_std": 0.10362594574689865, "rewards/reward_len/mean": 0.8406925201416016, "rewards/reward_len/std": 0.17524506151676178, "step": 664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0830618892508144, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5292242765426636, "kl": 0.683131217956543, "learning_rate": 7.122103926166095e-07, "loss": 0.0007, "num_tokens": 273313568.0, "reward": 0.8270142674446106, "reward_std": 0.10374678671360016, "rewards/reward_len/mean": 0.8270143270492554, "rewards/reward_len/std": 0.17588235437870026, "step": 665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0846905537459284, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5040188431739807, "kl": 0.6801507472991943, "learning_rate": 7.114379338537477e-07, "loss": 0.0007, "num_tokens": 273724016.0, "reward": 0.8485740423202515, "reward_std": 0.0888650119304657, "rewards/reward_len/mean": 0.8485740423202515, "rewards/reward_len/std": 0.21085575222969055, "step": 666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0863192182410424, "frac_reward_zero_std": 0.03125, "grad_norm": 0.576927661895752, "kl": 0.6286316514015198, "learning_rate": 7.106648600509808e-07, "loss": 0.0006, "num_tokens": 274135136.0, "reward": 0.8502990007400513, "reward_std": 0.10106787085533142, "rewards/reward_len/mean": 0.8502990007400513, "rewards/reward_len/std": 0.16696909070014954, "step": 667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0879478827361564, "frac_reward_zero_std": 0.0, "grad_norm": 0.5749943852424622, "kl": 0.668147087097168, "learning_rate": 7.098911734570596e-07, "loss": 0.0007, "num_tokens": 274546096.0, "reward": 0.8244757652282715, "reward_std": 0.1058870330452919, "rewards/reward_len/mean": 0.8244757652282715, "rewards/reward_len/std": 0.2128688246011734, "step": 668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0895765472312704, "frac_reward_zero_std": 0.0, "grad_norm": 0.5485692024230957, "kl": 0.6418262720108032, "learning_rate": 7.091168763225175e-07, "loss": 0.0006, "num_tokens": 274957664.0, "reward": 0.8350225687026978, "reward_std": 0.09799966216087341, "rewards/reward_len/mean": 0.835022509098053, "rewards/reward_len/std": 0.19884252548217773, "step": 669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0912052117263844, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5709507465362549, "kl": 0.786740779876709, "learning_rate": 7.083419708996639e-07, "loss": 0.0008, "num_tokens": 275368896.0, "reward": 0.8359753489494324, "reward_std": 0.09963221102952957, "rewards/reward_len/mean": 0.8359753489494324, "rewards/reward_len/std": 0.17691440880298615, "step": 670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0928338762214984, "frac_reward_zero_std": 0.0, "grad_norm": 0.6520395278930664, "kl": 0.7848333120346069, "learning_rate": 7.075664594425777e-07, "loss": 0.0008, "num_tokens": 275781168.0, "reward": 0.846674382686615, "reward_std": 0.09921238571405411, "rewards/reward_len/mean": 0.846674382686615, "rewards/reward_len/std": 0.16839255392551422, "step": 671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0944625407166124, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5185254812240601, "kl": 0.7707849740982056, "learning_rate": 7.067903442071002e-07, "loss": 0.0008, "num_tokens": 276191232.0, "reward": 0.8708393573760986, "reward_std": 0.08904929459095001, "rewards/reward_len/mean": 0.8708393573760986, "rewards/reward_len/std": 0.16724491119384766, "step": 672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0960912052117264, "frac_reward_zero_std": 0.0, "grad_norm": 0.5192785263061523, "kl": 0.7745413780212402, "learning_rate": 7.060136274508295e-07, "loss": 0.0008, "num_tokens": 276602784.0, "reward": 0.8112930655479431, "reward_std": 0.12126889079809189, "rewards/reward_len/mean": 0.8112930655479431, "rewards/reward_len/std": 0.22560812532901764, "step": 673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0977198697068404, "frac_reward_zero_std": 0.0, "grad_norm": 0.6215066909790039, "kl": 0.7823954224586487, "learning_rate": 7.052363114331134e-07, "loss": 0.0008, "num_tokens": 277015120.0, "reward": 0.8441939949989319, "reward_std": 0.11136381328105927, "rewards/reward_len/mean": 0.8441939949989319, "rewards/reward_len/std": 0.16424837708473206, "step": 674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.0993485342019544, "frac_reward_zero_std": 0.0, "grad_norm": 0.6163678169250488, "kl": 0.7326565980911255, "learning_rate": 7.044583984150424e-07, "loss": 0.0007, "num_tokens": 277426608.0, "reward": 0.7887044548988342, "reward_std": 0.12219779193401337, "rewards/reward_len/mean": 0.788704514503479, "rewards/reward_len/std": 0.24020636081695557, "step": 675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1009771986970684, "frac_reward_zero_std": 0.0, "grad_norm": 0.5312530994415283, "kl": 0.6735379695892334, "learning_rate": 7.036798906594441e-07, "loss": 0.0007, "num_tokens": 277837520.0, "reward": 0.8394418358802795, "reward_std": 0.11619296669960022, "rewards/reward_len/mean": 0.8394417762756348, "rewards/reward_len/std": 0.2167881429195404, "step": 676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1026058631921825, "frac_reward_zero_std": 0.0, "grad_norm": 0.5484333634376526, "kl": 0.8347951769828796, "learning_rate": 7.02900790430876e-07, "loss": 0.0008, "num_tokens": 278246288.0, "reward": 0.8515006303787231, "reward_std": 0.09412658214569092, "rewards/reward_len/mean": 0.8515006303787231, "rewards/reward_len/std": 0.17183060944080353, "step": 677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1042345276872965, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5886895060539246, "kl": 0.7526677250862122, "learning_rate": 7.021210999956186e-07, "loss": 0.0008, "num_tokens": 278656624.0, "reward": 0.876108705997467, "reward_std": 0.07428349554538727, "rewards/reward_len/mean": 0.8761086463928223, "rewards/reward_len/std": 0.13269944489002228, "step": 678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1058631921824105, "frac_reward_zero_std": 0.0, "grad_norm": 0.6842377781867981, "kl": 0.776659369468689, "learning_rate": 7.013408216216699e-07, "loss": 0.0008, "num_tokens": 279068304.0, "reward": 0.831853985786438, "reward_std": 0.11019986867904663, "rewards/reward_len/mean": 0.831853985786438, "rewards/reward_len/std": 0.17557862401008606, "step": 679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1074918566775245, "frac_reward_zero_std": 0.03125, "grad_norm": 0.502957284450531, "kl": 0.802336573600769, "learning_rate": 7.005599575787373e-07, "loss": 0.0008, "num_tokens": 279480160.0, "reward": 0.8427343964576721, "reward_std": 0.1146739274263382, "rewards/reward_len/mean": 0.8427343964576721, "rewards/reward_len/std": 0.2060668021440506, "step": 680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1091205211726385, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5483909845352173, "kl": 0.815475583076477, "learning_rate": 6.997785101382327e-07, "loss": 0.0008, "num_tokens": 279891520.0, "reward": 0.8523061275482178, "reward_std": 0.10607977956533432, "rewards/reward_len/mean": 0.8523061275482178, "rewards/reward_len/std": 0.16366834938526154, "step": 681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1107491856677525, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5042523145675659, "kl": 0.8771810531616211, "learning_rate": 6.989964815732642e-07, "loss": 0.0009, "num_tokens": 280302176.0, "reward": 0.8826267719268799, "reward_std": 0.09586424380540848, "rewards/reward_len/mean": 0.8826267719268799, "rewards/reward_len/std": 0.15042781829833984, "step": 682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1123778501628665, "frac_reward_zero_std": 0.0, "grad_norm": 0.559181272983551, "kl": 0.741096019744873, "learning_rate": 6.982138741586308e-07, "loss": 0.0007, "num_tokens": 280713200.0, "reward": 0.834382951259613, "reward_std": 0.10931536555290222, "rewards/reward_len/mean": 0.8343830108642578, "rewards/reward_len/std": 0.220307394862175, "step": 683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1140065146579805, "frac_reward_zero_std": 0.0, "grad_norm": 0.5810458064079285, "kl": 0.7843657732009888, "learning_rate": 6.974306901708155e-07, "loss": 0.0008, "num_tokens": 281123520.0, "reward": 0.8645069003105164, "reward_std": 0.09088663756847382, "rewards/reward_len/mean": 0.8645069003105164, "rewards/reward_len/std": 0.1261521428823471, "step": 684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1156351791530945, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4713175296783447, "kl": 0.8401117324829102, "learning_rate": 6.966469318879776e-07, "loss": 0.0008, "num_tokens": 281533664.0, "reward": 0.880810022354126, "reward_std": 0.08933113515377045, "rewards/reward_len/mean": 0.880810022354126, "rewards/reward_len/std": 0.13730093836784363, "step": 685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1172638436482085, "frac_reward_zero_std": 0.0, "grad_norm": 0.544887900352478, "kl": 0.7256201505661011, "learning_rate": 6.958626015899478e-07, "loss": 0.0007, "num_tokens": 281945488.0, "reward": 0.856864869594574, "reward_std": 0.0916634127497673, "rewards/reward_len/mean": 0.856864869594574, "rewards/reward_len/std": 0.1464300900697708, "step": 686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1188925081433225, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5682569146156311, "kl": 0.7737596035003662, "learning_rate": 6.950777015582201e-07, "loss": 0.0008, "num_tokens": 282356464.0, "reward": 0.8527705073356628, "reward_std": 0.10637734830379486, "rewards/reward_len/mean": 0.8527705073356628, "rewards/reward_len/std": 0.17249314486980438, "step": 687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1205211726384365, "frac_reward_zero_std": 0.0, "grad_norm": 0.5633009672164917, "kl": 0.7966817617416382, "learning_rate": 6.942922340759464e-07, "loss": 0.0008, "num_tokens": 282767248.0, "reward": 0.8674049377441406, "reward_std": 0.09627127647399902, "rewards/reward_len/mean": 0.8674049377441406, "rewards/reward_len/std": 0.15022093057632446, "step": 688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1221498371335505, "frac_reward_zero_std": 0.0, "grad_norm": 0.5640177726745605, "kl": 0.7774369716644287, "learning_rate": 6.935062014279284e-07, "loss": 0.0008, "num_tokens": 283178528.0, "reward": 0.8250942230224609, "reward_std": 0.10866741836071014, "rewards/reward_len/mean": 0.8250942230224609, "rewards/reward_len/std": 0.21154895424842834, "step": 689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1237785016286646, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5115018486976624, "kl": 0.8634730577468872, "learning_rate": 6.927196059006124e-07, "loss": 0.0009, "num_tokens": 283590080.0, "reward": 0.8917848467826843, "reward_std": 0.09830035269260406, "rewards/reward_len/mean": 0.8917848467826843, "rewards/reward_len/std": 0.14109288156032562, "step": 690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1254071661237786, "frac_reward_zero_std": 0.0, "grad_norm": 0.5952408313751221, "kl": 0.7599347829818726, "learning_rate": 6.919324497820822e-07, "loss": 0.0008, "num_tokens": 284001312.0, "reward": 0.8443737626075745, "reward_std": 0.1096007227897644, "rewards/reward_len/mean": 0.8443737626075745, "rewards/reward_len/std": 0.1583825796842575, "step": 691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1270358306188926, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5920707583427429, "kl": 0.6207900643348694, "learning_rate": 6.911447353620514e-07, "loss": 0.0006, "num_tokens": 284411856.0, "reward": 0.8018007278442383, "reward_std": 0.1480489820241928, "rewards/reward_len/mean": 0.8018007278442383, "rewards/reward_len/std": 0.23233598470687866, "step": 692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1286644951140066, "frac_reward_zero_std": 0.0, "grad_norm": 0.5058006644248962, "kl": 0.7666248083114624, "learning_rate": 6.903564649318585e-07, "loss": 0.0008, "num_tokens": 284822160.0, "reward": 0.8484981656074524, "reward_std": 0.1132572591304779, "rewards/reward_len/mean": 0.8484981060028076, "rewards/reward_len/std": 0.17484301328659058, "step": 693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1302931596091206, "frac_reward_zero_std": 0.0, "grad_norm": 0.5850920081138611, "kl": 0.7800534963607788, "learning_rate": 6.895676407844586e-07, "loss": 0.0008, "num_tokens": 285233456.0, "reward": 0.8291453123092651, "reward_std": 0.11464401334524155, "rewards/reward_len/mean": 0.8291453123092651, "rewards/reward_len/std": 0.2187841534614563, "step": 694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1319218241042346, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5572689771652222, "kl": 0.7347657680511475, "learning_rate": 6.887782652144185e-07, "loss": 0.0007, "num_tokens": 285643968.0, "reward": 0.8482582569122314, "reward_std": 0.08944028615951538, "rewards/reward_len/mean": 0.8482583165168762, "rewards/reward_len/std": 0.19170795381069183, "step": 695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1335504885993486, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6033066511154175, "kl": 0.8608733415603638, "learning_rate": 6.879883405179077e-07, "loss": 0.0009, "num_tokens": 286055264.0, "reward": 0.8635010719299316, "reward_std": 0.09395352005958557, "rewards/reward_len/mean": 0.8635010719299316, "rewards/reward_len/std": 0.12558764219284058, "step": 696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1351791530944626, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5129512548446655, "kl": 0.8482673168182373, "learning_rate": 6.871978689926942e-07, "loss": 0.0008, "num_tokens": 286466048.0, "reward": 0.8326910734176636, "reward_std": 0.10863424837589264, "rewards/reward_len/mean": 0.8326910734176636, "rewards/reward_len/std": 0.16504493355751038, "step": 697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1368078175895766, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5410494804382324, "kl": 0.9499621987342834, "learning_rate": 6.864068529381358e-07, "loss": 0.0009, "num_tokens": 286877776.0, "reward": 0.8595211505889893, "reward_std": 0.09644569456577301, "rewards/reward_len/mean": 0.8595211505889893, "rewards/reward_len/std": 0.16654780507087708, "step": 698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1384364820846906, "frac_reward_zero_std": 0.0, "grad_norm": 0.4891115427017212, "kl": 0.8184199333190918, "learning_rate": 6.856152946551745e-07, "loss": 0.0008, "num_tokens": 287287456.0, "reward": 0.8351621031761169, "reward_std": 0.10317918658256531, "rewards/reward_len/mean": 0.8351620435714722, "rewards/reward_len/std": 0.22763212025165558, "step": 699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1400651465798046, "frac_reward_zero_std": 0.0, "grad_norm": 0.5755412578582764, "kl": 0.6899369955062866, "learning_rate": 6.8482319644633e-07, "loss": 0.0007, "num_tokens": 287697264.0, "reward": 0.8875812292098999, "reward_std": 0.08349711447954178, "rewards/reward_len/mean": 0.8875812292098999, "rewards/reward_len/std": 0.12222469598054886, "step": 700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1416938110749186, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5549267530441284, "kl": 0.7097915410995483, "learning_rate": 6.84030560615692e-07, "loss": 0.0007, "num_tokens": 288107920.0, "reward": 0.8094353675842285, "reward_std": 0.11651721596717834, "rewards/reward_len/mean": 0.8094353675842285, "rewards/reward_len/std": 0.19146861135959625, "step": 701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1433224755700326, "frac_reward_zero_std": 0.0625, "grad_norm": 0.49200621247291565, "kl": 0.8639323115348816, "learning_rate": 6.832373894689143e-07, "loss": 0.0009, "num_tokens": 288518608.0, "reward": 0.8686662912368774, "reward_std": 0.07750722765922546, "rewards/reward_len/mean": 0.8686662912368774, "rewards/reward_len/std": 0.13216865062713623, "step": 702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1449511400651466, "frac_reward_zero_std": 0.0, "grad_norm": 0.6452897787094116, "kl": 0.8095723390579224, "learning_rate": 6.824436853132079e-07, "loss": 0.0008, "num_tokens": 288930944.0, "reward": 0.8534454107284546, "reward_std": 0.09665780514478683, "rewards/reward_len/mean": 0.8534454703330994, "rewards/reward_len/std": 0.15782180428504944, "step": 703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1465798045602607, "frac_reward_zero_std": 0.0, "grad_norm": 0.5241583585739136, "kl": 0.8133851289749146, "learning_rate": 6.816494504573338e-07, "loss": 0.0008, "num_tokens": 289342096.0, "reward": 0.8588999509811401, "reward_std": 0.11074741184711456, "rewards/reward_len/mean": 0.8588999509811401, "rewards/reward_len/std": 0.18233439326286316, "step": 704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1482084690553747, "frac_reward_zero_std": 0.0, "grad_norm": 0.5026209354400635, "kl": 0.8887513279914856, "learning_rate": 6.808546872115975e-07, "loss": 0.0009, "num_tokens": 289752512.0, "reward": 0.8307567834854126, "reward_std": 0.08807453513145447, "rewards/reward_len/mean": 0.8307567834854126, "rewards/reward_len/std": 0.20738089084625244, "step": 705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1498371335504887, "frac_reward_zero_std": 0.0, "grad_norm": 0.6264156103134155, "kl": 0.7997729182243347, "learning_rate": 6.800593978878406e-07, "loss": 0.0008, "num_tokens": 290163216.0, "reward": 0.8399975299835205, "reward_std": 0.09493061155080795, "rewards/reward_len/mean": 0.8399975895881653, "rewards/reward_len/std": 0.17954881489276886, "step": 706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1514657980456027, "frac_reward_zero_std": 0.0, "grad_norm": 0.521577775478363, "kl": 0.9242134094238281, "learning_rate": 6.792635847994359e-07, "loss": 0.0009, "num_tokens": 290574608.0, "reward": 0.8764218688011169, "reward_std": 0.1008247435092926, "rewards/reward_len/mean": 0.8764218688011169, "rewards/reward_len/std": 0.17119991779327393, "step": 707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1530944625407167, "frac_reward_zero_std": 0.0, "grad_norm": 0.5960517525672913, "kl": 0.7303497195243835, "learning_rate": 6.78467250261279e-07, "loss": 0.0007, "num_tokens": 290985872.0, "reward": 0.8411180973052979, "reward_std": 0.10857447981834412, "rewards/reward_len/mean": 0.8411180973052979, "rewards/reward_len/std": 0.16673539578914642, "step": 708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1547231270358307, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4822656214237213, "kl": 0.7635250091552734, "learning_rate": 6.77670396589783e-07, "loss": 0.0008, "num_tokens": 291396960.0, "reward": 0.8771968483924866, "reward_std": 0.08152635395526886, "rewards/reward_len/mean": 0.8771968483924866, "rewards/reward_len/std": 0.13104671239852905, "step": 709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1563517915309447, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6134892106056213, "kl": 0.6891052722930908, "learning_rate": 6.768730261028702e-07, "loss": 0.0007, "num_tokens": 291808624.0, "reward": 0.8360602855682373, "reward_std": 0.09213851392269135, "rewards/reward_len/mean": 0.8360602855682373, "rewards/reward_len/std": 0.23369985818862915, "step": 710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1579804560260587, "frac_reward_zero_std": 0.0, "grad_norm": 0.5471806526184082, "kl": 0.7760848999023438, "learning_rate": 6.76075141119967e-07, "loss": 0.0008, "num_tokens": 292219200.0, "reward": 0.8379178643226624, "reward_std": 0.11594150960445404, "rewards/reward_len/mean": 0.8379178643226624, "rewards/reward_len/std": 0.19659440219402313, "step": 711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1596091205211727, "frac_reward_zero_std": 0.0, "grad_norm": 0.6589335203170776, "kl": 0.8194169998168945, "learning_rate": 6.752767439619961e-07, "loss": 0.0008, "num_tokens": 292630240.0, "reward": 0.8834969401359558, "reward_std": 0.07830050587654114, "rewards/reward_len/mean": 0.8834969401359558, "rewards/reward_len/std": 0.10758334398269653, "step": 712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1612377850162867, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5133358836174011, "kl": 0.7639867663383484, "learning_rate": 6.7447783695137e-07, "loss": 0.0008, "num_tokens": 293041440.0, "reward": 0.8759547472000122, "reward_std": 0.08565741777420044, "rewards/reward_len/mean": 0.8759547472000122, "rewards/reward_len/std": 0.16677983105182648, "step": 713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1628664495114007, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5200150609016418, "kl": 0.788211464881897, "learning_rate": 6.736784224119845e-07, "loss": 0.0008, "num_tokens": 293452640.0, "reward": 0.8495235443115234, "reward_std": 0.08694353699684143, "rewards/reward_len/mean": 0.8495235443115234, "rewards/reward_len/std": 0.17488935589790344, "step": 714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1644951140065147, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4637974500656128, "kl": 0.7905449867248535, "learning_rate": 6.728785026692112e-07, "loss": 0.0008, "num_tokens": 293863200.0, "reward": 0.8664193153381348, "reward_std": 0.08340287208557129, "rewards/reward_len/mean": 0.8664193153381348, "rewards/reward_len/std": 0.1544601023197174, "step": 715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1661237785016287, "frac_reward_zero_std": 0.0, "grad_norm": 0.5272719860076904, "kl": 0.7992947101593018, "learning_rate": 6.72078080049892e-07, "loss": 0.0008, "num_tokens": 294273776.0, "reward": 0.862074077129364, "reward_std": 0.09220850467681885, "rewards/reward_len/mean": 0.862074077129364, "rewards/reward_len/std": 0.1401219367980957, "step": 716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1677524429967427, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4871240258216858, "kl": 0.8800046443939209, "learning_rate": 6.712771568823311e-07, "loss": 0.0009, "num_tokens": 294685536.0, "reward": 0.8901078701019287, "reward_std": 0.07698018848896027, "rewards/reward_len/mean": 0.8901078701019287, "rewards/reward_len/std": 0.10137566179037094, "step": 717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1693811074918568, "frac_reward_zero_std": 0.0, "grad_norm": 0.5077803134918213, "kl": 0.8025038242340088, "learning_rate": 6.704757354962888e-07, "loss": 0.0008, "num_tokens": 295097904.0, "reward": 0.8276467323303223, "reward_std": 0.10241970419883728, "rewards/reward_len/mean": 0.8276467323303223, "rewards/reward_len/std": 0.15808682143688202, "step": 718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1710097719869708, "frac_reward_zero_std": 0.0, "grad_norm": 0.5287492871284485, "kl": 0.8027377128601074, "learning_rate": 6.696738182229746e-07, "loss": 0.0008, "num_tokens": 295510016.0, "reward": 0.8410789966583252, "reward_std": 0.10978923738002777, "rewards/reward_len/mean": 0.8410789966583252, "rewards/reward_len/std": 0.17054344713687897, "step": 719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1726384364820848, "frac_reward_zero_std": 0.0, "grad_norm": 0.5840542912483215, "kl": 0.7137171030044556, "learning_rate": 6.688714073950404e-07, "loss": 0.0007, "num_tokens": 295921552.0, "reward": 0.8505754470825195, "reward_std": 0.09220869094133377, "rewards/reward_len/mean": 0.8505754470825195, "rewards/reward_len/std": 0.15851642191410065, "step": 720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1742671009771988, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5352020263671875, "kl": 0.841063916683197, "learning_rate": 6.680685053465742e-07, "loss": 0.0008, "num_tokens": 296332336.0, "reward": 0.8749523162841797, "reward_std": 0.09912378340959549, "rewards/reward_len/mean": 0.8749523162841797, "rewards/reward_len/std": 0.14295020699501038, "step": 721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1758957654723128, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5368694067001343, "kl": 0.8727166652679443, "learning_rate": 6.672651144130924e-07, "loss": 0.0009, "num_tokens": 296744176.0, "reward": 0.8165568113327026, "reward_std": 0.1090473085641861, "rewards/reward_len/mean": 0.8165567517280579, "rewards/reward_len/std": 0.19287848472595215, "step": 722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1775244299674268, "frac_reward_zero_std": 0.0, "grad_norm": 0.5525974035263062, "kl": 0.8312952518463135, "learning_rate": 6.664612369315338e-07, "loss": 0.0008, "num_tokens": 297155312.0, "reward": 0.8543537855148315, "reward_std": 0.1273651123046875, "rewards/reward_len/mean": 0.8543538451194763, "rewards/reward_len/std": 0.18210040032863617, "step": 723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1791530944625408, "frac_reward_zero_std": 0.0, "grad_norm": 0.5253997445106506, "kl": 0.8182494640350342, "learning_rate": 6.656568752402521e-07, "loss": 0.0008, "num_tokens": 297566800.0, "reward": 0.8894156813621521, "reward_std": 0.08204472064971924, "rewards/reward_len/mean": 0.8894156813621521, "rewards/reward_len/std": 0.12072575837373734, "step": 724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1807817589576548, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5065537691116333, "kl": 0.8659633994102478, "learning_rate": 6.6485203167901e-07, "loss": 0.0009, "num_tokens": 297977328.0, "reward": 0.8214618563652039, "reward_std": 0.09849168360233307, "rewards/reward_len/mean": 0.8214618563652039, "rewards/reward_len/std": 0.19713866710662842, "step": 725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1824104234527688, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5032474994659424, "kl": 0.8407553434371948, "learning_rate": 6.640467085889715e-07, "loss": 0.0008, "num_tokens": 298386000.0, "reward": 0.8702253699302673, "reward_std": 0.08892545104026794, "rewards/reward_len/mean": 0.8702253699302673, "rewards/reward_len/std": 0.1703691929578781, "step": 726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1840390879478828, "frac_reward_zero_std": 0.03125, "grad_norm": 0.49118921160697937, "kl": 0.8426001071929932, "learning_rate": 6.632409083126958e-07, "loss": 0.0008, "num_tokens": 298797824.0, "reward": 0.8250724077224731, "reward_std": 0.09823055565357208, "rewards/reward_len/mean": 0.8250724077224731, "rewards/reward_len/std": 0.21178792417049408, "step": 727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1856677524429968, "frac_reward_zero_std": 0.0, "grad_norm": 0.6942557096481323, "kl": 0.7334844470024109, "learning_rate": 6.624346331941295e-07, "loss": 0.0007, "num_tokens": 299209344.0, "reward": 0.8263984322547913, "reward_std": 0.11053269356489182, "rewards/reward_len/mean": 0.826398491859436, "rewards/reward_len/std": 0.1840546727180481, "step": 728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1872964169381108, "frac_reward_zero_std": 0.0, "grad_norm": 0.5597640872001648, "kl": 0.704264223575592, "learning_rate": 6.616278855786015e-07, "loss": 0.0007, "num_tokens": 299620352.0, "reward": 0.8641760945320129, "reward_std": 0.08506888151168823, "rewards/reward_len/mean": 0.8641760945320129, "rewards/reward_len/std": 0.12297151982784271, "step": 729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1889250814332248, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7794180512428284, "kl": 0.7050420641899109, "learning_rate": 6.608206678128142e-07, "loss": 0.0007, "num_tokens": 300031328.0, "reward": 0.8291777968406677, "reward_std": 0.10094847530126572, "rewards/reward_len/mean": 0.8291777968406677, "rewards/reward_len/std": 0.23308482766151428, "step": 730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1905537459283388, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5412080883979797, "kl": 0.7289453744888306, "learning_rate": 6.600129822448381e-07, "loss": 0.0007, "num_tokens": 300442976.0, "reward": 0.8508661985397339, "reward_std": 0.10258449614048004, "rewards/reward_len/mean": 0.8508661985397339, "rewards/reward_len/std": 0.17248402535915375, "step": 731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1921824104234529, "frac_reward_zero_std": 0.0, "grad_norm": 0.5575272440910339, "kl": 0.7881931662559509, "learning_rate": 6.592048312241041e-07, "loss": 0.0008, "num_tokens": 300853344.0, "reward": 0.8183953762054443, "reward_std": 0.12189134955406189, "rewards/reward_len/mean": 0.8183953762054443, "rewards/reward_len/std": 0.22876103222370148, "step": 732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1938110749185669, "frac_reward_zero_std": 0.0, "grad_norm": 0.5377393960952759, "kl": 0.813199520111084, "learning_rate": 6.583962171013972e-07, "loss": 0.0008, "num_tokens": 301264768.0, "reward": 0.8629642724990845, "reward_std": 0.08810026943683624, "rewards/reward_len/mean": 0.8629642724990845, "rewards/reward_len/std": 0.14522507786750793, "step": 733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1954397394136809, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5424534678459167, "kl": 0.8531215190887451, "learning_rate": 6.575871422288497e-07, "loss": 0.0009, "num_tokens": 301676000.0, "reward": 0.8605961799621582, "reward_std": 0.10292813181877136, "rewards/reward_len/mean": 0.8605961799621582, "rewards/reward_len/std": 0.17068244516849518, "step": 734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1970684039087949, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4781990647315979, "kl": 0.9149599075317383, "learning_rate": 6.567776089599339e-07, "loss": 0.0009, "num_tokens": 302087888.0, "reward": 0.8941779136657715, "reward_std": 0.07161632180213928, "rewards/reward_len/mean": 0.8941779136657715, "rewards/reward_len/std": 0.10391388833522797, "step": 735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.1986970684039089, "frac_reward_zero_std": 0.0, "grad_norm": 0.5476263761520386, "kl": 0.8076697587966919, "learning_rate": 6.559676196494554e-07, "loss": 0.0008, "num_tokens": 302499120.0, "reward": 0.8343234062194824, "reward_std": 0.10283831506967545, "rewards/reward_len/mean": 0.8343234062194824, "rewards/reward_len/std": 0.1951364278793335, "step": 736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.200325732899023, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5348288416862488, "kl": 0.8322635889053345, "learning_rate": 6.551571766535469e-07, "loss": 0.0008, "num_tokens": 302909856.0, "reward": 0.837523341178894, "reward_std": 0.12098918855190277, "rewards/reward_len/mean": 0.837523341178894, "rewards/reward_len/std": 0.20404520630836487, "step": 737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.201954397394137, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6151106357574463, "kl": 0.7637999653816223, "learning_rate": 6.543462823296599e-07, "loss": 0.0008, "num_tokens": 303320080.0, "reward": 0.8375487327575684, "reward_std": 0.09235025942325592, "rewards/reward_len/mean": 0.8375487327575684, "rewards/reward_len/std": 0.226637065410614, "step": 738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.203583061889251, "frac_reward_zero_std": 0.0, "grad_norm": 0.5055100917816162, "kl": 0.9053730964660645, "learning_rate": 6.535349390365596e-07, "loss": 0.0009, "num_tokens": 303731248.0, "reward": 0.835732102394104, "reward_std": 0.09293640404939651, "rewards/reward_len/mean": 0.835732102394104, "rewards/reward_len/std": 0.18400031328201294, "step": 739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.205211726384365, "frac_reward_zero_std": 0.0, "grad_norm": 0.7016081213951111, "kl": 0.9074224233627319, "learning_rate": 6.527231491343168e-07, "loss": 0.0009, "num_tokens": 304142992.0, "reward": 0.8516442775726318, "reward_std": 0.06905101239681244, "rewards/reward_len/mean": 0.8516442179679871, "rewards/reward_len/std": 0.10749334841966629, "step": 740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.206840390879479, "frac_reward_zero_std": 0.0, "grad_norm": 0.6099837422370911, "kl": 0.917671799659729, "learning_rate": 6.519109149843014e-07, "loss": 0.0009, "num_tokens": 304554592.0, "reward": 0.848659873008728, "reward_std": 0.09825589507818222, "rewards/reward_len/mean": 0.8486599326133728, "rewards/reward_len/std": 0.21612964570522308, "step": 741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.208469055374593, "frac_reward_zero_std": 0.125, "grad_norm": 0.5550546050071716, "kl": 0.8614305257797241, "learning_rate": 6.510982389491756e-07, "loss": 0.0009, "num_tokens": 304965504.0, "reward": 0.8777759075164795, "reward_std": 0.07905925065279007, "rewards/reward_len/mean": 0.8777759075164795, "rewards/reward_len/std": 0.13357146084308624, "step": 742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.210097719869707, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5442739129066467, "kl": 0.8850230574607849, "learning_rate": 6.502851233928871e-07, "loss": 0.0009, "num_tokens": 305376560.0, "reward": 0.891995370388031, "reward_std": 0.08127345144748688, "rewards/reward_len/mean": 0.8919954299926758, "rewards/reward_len/std": 0.1261700540781021, "step": 743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.211726384364821, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5422216057777405, "kl": 0.9487099647521973, "learning_rate": 6.494715706806617e-07, "loss": 0.0009, "num_tokens": 305789312.0, "reward": 0.8632618188858032, "reward_std": 0.10273496061563492, "rewards/reward_len/mean": 0.8632618188858032, "rewards/reward_len/std": 0.16330909729003906, "step": 744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.213355048859935, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5515996217727661, "kl": 0.6991128921508789, "learning_rate": 6.486575831789973e-07, "loss": 0.0007, "num_tokens": 306200544.0, "reward": 0.8254920244216919, "reward_std": 0.118329256772995, "rewards/reward_len/mean": 0.8254920244216919, "rewards/reward_len/std": 0.20838294923305511, "step": 745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.214983713355049, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5506951808929443, "kl": 0.9139054417610168, "learning_rate": 6.478431632556564e-07, "loss": 0.0009, "num_tokens": 306612080.0, "reward": 0.8896857500076294, "reward_std": 0.08320879936218262, "rewards/reward_len/mean": 0.8896857500076294, "rewards/reward_len/std": 0.11854236572980881, "step": 746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.216612377850163, "frac_reward_zero_std": 0.0, "grad_norm": 0.5367873907089233, "kl": 0.9769244194030762, "learning_rate": 6.470283132796591e-07, "loss": 0.001, "num_tokens": 307022912.0, "reward": 0.8457486629486084, "reward_std": 0.12394586950540543, "rewards/reward_len/mean": 0.8457486629486084, "rewards/reward_len/std": 0.19969245791435242, "step": 747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.218241042345277, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5910618305206299, "kl": 0.7573522329330444, "learning_rate": 6.462130356212768e-07, "loss": 0.0008, "num_tokens": 307435072.0, "reward": 0.8589292764663696, "reward_std": 0.08983289450407028, "rewards/reward_len/mean": 0.8589292764663696, "rewards/reward_len/std": 0.18211866915225983, "step": 748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2198697068403908, "frac_reward_zero_std": 0.0, "grad_norm": 0.5429965257644653, "kl": 0.905631959438324, "learning_rate": 6.453973326520244e-07, "loss": 0.0009, "num_tokens": 307845632.0, "reward": 0.8624215722084045, "reward_std": 0.09564337134361267, "rewards/reward_len/mean": 0.8624215722084045, "rewards/reward_len/std": 0.17887438833713531, "step": 749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.221498371335505, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5832347273826599, "kl": 0.8132959008216858, "learning_rate": 6.44581206744655e-07, "loss": 0.0008, "num_tokens": 308256528.0, "reward": 0.8787779808044434, "reward_std": 0.07330642640590668, "rewards/reward_len/mean": 0.8787779808044434, "rewards/reward_len/std": 0.13901416957378387, "step": 750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2231270358306188, "frac_reward_zero_std": 0.0, "grad_norm": 0.4982871413230896, "kl": 0.918438196182251, "learning_rate": 6.437646602731508e-07, "loss": 0.0009, "num_tokens": 308667424.0, "reward": 0.8543879389762878, "reward_std": 0.10249063372612, "rewards/reward_len/mean": 0.8543879985809326, "rewards/reward_len/std": 0.1851317584514618, "step": 751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.224755700325733, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5514604449272156, "kl": 0.8463090658187866, "learning_rate": 6.42947695612718e-07, "loss": 0.0008, "num_tokens": 309078608.0, "reward": 0.8928946852684021, "reward_std": 0.07070891559123993, "rewards/reward_len/mean": 0.8928946852684021, "rewards/reward_len/std": 0.1580275446176529, "step": 752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2263843648208468, "frac_reward_zero_std": 0.0, "grad_norm": 0.6721744537353516, "kl": 0.9521299004554749, "learning_rate": 6.421303151397792e-07, "loss": 0.001, "num_tokens": 309488432.0, "reward": 0.846250057220459, "reward_std": 0.0883135199546814, "rewards/reward_len/mean": 0.846250057220459, "rewards/reward_len/std": 0.1875513643026352, "step": 753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.228013029315961, "frac_reward_zero_std": 0.0, "grad_norm": 0.5153366327285767, "kl": 0.905552864074707, "learning_rate": 6.413125212319663e-07, "loss": 0.0009, "num_tokens": 309900176.0, "reward": 0.8697444200515747, "reward_std": 0.08848260343074799, "rewards/reward_len/mean": 0.8697444200515747, "rewards/reward_len/std": 0.12877629697322845, "step": 754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2296416938110748, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8473029732704163, "kl": 1.0626804828643799, "learning_rate": 6.404943162681144e-07, "loss": 0.0011, "num_tokens": 310311824.0, "reward": 0.8691158294677734, "reward_std": 0.09486698359251022, "rewards/reward_len/mean": 0.8691158294677734, "rewards/reward_len/std": 0.14626090228557587, "step": 755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.231270358306189, "frac_reward_zero_std": 0.0, "grad_norm": 0.70387202501297, "kl": 0.8879956007003784, "learning_rate": 6.396757026282532e-07, "loss": 0.0009, "num_tokens": 310722208.0, "reward": 0.8461308479309082, "reward_std": 0.08914180845022202, "rewards/reward_len/mean": 0.8461308479309082, "rewards/reward_len/std": 0.18698877096176147, "step": 756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2328990228013028, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5079204440116882, "kl": 0.9683883786201477, "learning_rate": 6.388566826936024e-07, "loss": 0.001, "num_tokens": 311132560.0, "reward": 0.8371248245239258, "reward_std": 0.08711405843496323, "rewards/reward_len/mean": 0.8371248245239258, "rewards/reward_len/std": 0.23504503071308136, "step": 757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.234527687296417, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5931830406188965, "kl": 0.9022970795631409, "learning_rate": 6.380372588465627e-07, "loss": 0.0009, "num_tokens": 311543296.0, "reward": 0.8208324909210205, "reward_std": 0.09903820604085922, "rewards/reward_len/mean": 0.8208324909210205, "rewards/reward_len/std": 0.21744659543037415, "step": 758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2361563517915308, "frac_reward_zero_std": 0.03125, "grad_norm": 0.456917405128479, "kl": 0.8168489933013916, "learning_rate": 6.372174334707101e-07, "loss": 0.0008, "num_tokens": 311953248.0, "reward": 0.8723276257514954, "reward_std": 0.09049741923809052, "rewards/reward_len/mean": 0.8723275661468506, "rewards/reward_len/std": 0.1356828510761261, "step": 759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.237785016286645, "frac_reward_zero_std": 0.0, "grad_norm": 0.5612416863441467, "kl": 1.0238416194915771, "learning_rate": 6.363972089507885e-07, "loss": 0.001, "num_tokens": 312365360.0, "reward": 0.8378081917762756, "reward_std": 0.11028949916362762, "rewards/reward_len/mean": 0.8378081917762756, "rewards/reward_len/std": 0.20265884697437286, "step": 760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2394136807817588, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5318880081176758, "kl": 0.9725499153137207, "learning_rate": 6.355765876727028e-07, "loss": 0.001, "num_tokens": 312775616.0, "reward": 0.864140510559082, "reward_std": 0.08970316499471664, "rewards/reward_len/mean": 0.864140510559082, "rewards/reward_len/std": 0.1635420024394989, "step": 761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.241042345276873, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4799724817276001, "kl": 1.0898746252059937, "learning_rate": 6.347555720235121e-07, "loss": 0.0011, "num_tokens": 313185904.0, "reward": 0.8833499550819397, "reward_std": 0.08443745970726013, "rewards/reward_len/mean": 0.8833498954772949, "rewards/reward_len/std": 0.1574016809463501, "step": 762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2426710097719869, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5300341248512268, "kl": 1.0349605083465576, "learning_rate": 6.339341643914224e-07, "loss": 0.001, "num_tokens": 313596352.0, "reward": 0.8848140239715576, "reward_std": 0.06851042807102203, "rewards/reward_len/mean": 0.8848140239715576, "rewards/reward_len/std": 0.12050186842679977, "step": 763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.244299674267101, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6392943859100342, "kl": 0.7318077683448792, "learning_rate": 6.331123671657805e-07, "loss": 0.0007, "num_tokens": 314007408.0, "reward": 0.8389783501625061, "reward_std": 0.09373115003108978, "rewards/reward_len/mean": 0.8389783501625061, "rewards/reward_len/std": 0.15636248886585236, "step": 764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2459283387622149, "frac_reward_zero_std": 0.0, "grad_norm": 0.5408647656440735, "kl": 0.9406147003173828, "learning_rate": 6.322901827370658e-07, "loss": 0.0009, "num_tokens": 314417552.0, "reward": 0.8666777610778809, "reward_std": 0.09641110152006149, "rewards/reward_len/mean": 0.8666777610778809, "rewards/reward_len/std": 0.1710299849510193, "step": 765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.247557003257329, "frac_reward_zero_std": 0.0, "grad_norm": 0.5843323469161987, "kl": 0.9401869177818298, "learning_rate": 6.314676134968844e-07, "loss": 0.0009, "num_tokens": 314828544.0, "reward": 0.8665501475334167, "reward_std": 0.08884118497371674, "rewards/reward_len/mean": 0.8665501475334167, "rewards/reward_len/std": 0.1618151068687439, "step": 766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2491856677524429, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5012809038162231, "kl": 0.9538995027542114, "learning_rate": 6.306446618379618e-07, "loss": 0.001, "num_tokens": 315239760.0, "reward": 0.8516203761100769, "reward_std": 0.0981706902384758, "rewards/reward_len/mean": 0.8516203761100769, "rewards/reward_len/std": 0.19749927520751953, "step": 767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2508143322475571, "frac_reward_zero_std": 0.03125, "grad_norm": 0.519517719745636, "kl": 0.9311209917068481, "learning_rate": 6.298213301541355e-07, "loss": 0.0009, "num_tokens": 315650944.0, "reward": 0.8838056325912476, "reward_std": 0.08590787649154663, "rewards/reward_len/mean": 0.8838056325912476, "rewards/reward_len/std": 0.1379007250070572, "step": 768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.252442996742671, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4754287004470825, "kl": 0.8782882690429688, "learning_rate": 6.289976208403489e-07, "loss": 0.0009, "num_tokens": 316059952.0, "reward": 0.8372328877449036, "reward_std": 0.12223777920007706, "rewards/reward_len/mean": 0.8372328877449036, "rewards/reward_len/std": 0.23015186190605164, "step": 769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2540716612377851, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5328430533409119, "kl": 0.9132922887802124, "learning_rate": 6.281735362926435e-07, "loss": 0.0009, "num_tokens": 316469760.0, "reward": 0.8748418092727661, "reward_std": 0.08908462524414062, "rewards/reward_len/mean": 0.8748418092727661, "rewards/reward_len/std": 0.1335860639810562, "step": 770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.255700325732899, "frac_reward_zero_std": 0.0, "grad_norm": 0.49530285596847534, "kl": 0.9584683179855347, "learning_rate": 6.273490789081528e-07, "loss": 0.001, "num_tokens": 316879312.0, "reward": 0.8460965156555176, "reward_std": 0.10322994738817215, "rewards/reward_len/mean": 0.8460965156555176, "rewards/reward_len/std": 0.18959370255470276, "step": 771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2573289902280131, "frac_reward_zero_std": 0.0, "grad_norm": 0.4842894673347473, "kl": 0.9760171175003052, "learning_rate": 6.26524251085094e-07, "loss": 0.001, "num_tokens": 317288896.0, "reward": 0.8545916676521301, "reward_std": 0.08848881721496582, "rewards/reward_len/mean": 0.8545916676521301, "rewards/reward_len/std": 0.1577739119529724, "step": 772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.258957654723127, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5061283707618713, "kl": 0.9532848596572876, "learning_rate": 6.256990552227626e-07, "loss": 0.001, "num_tokens": 317699312.0, "reward": 0.8028972148895264, "reward_std": 0.11814016103744507, "rewards/reward_len/mean": 0.8028972148895264, "rewards/reward_len/std": 0.28420794010162354, "step": 773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2605863192182412, "frac_reward_zero_std": 0.0, "grad_norm": 0.4831736385822296, "kl": 1.0683679580688477, "learning_rate": 6.248734937215244e-07, "loss": 0.0011, "num_tokens": 318110976.0, "reward": 0.8526128530502319, "reward_std": 0.09033186733722687, "rewards/reward_len/mean": 0.8526127934455872, "rewards/reward_len/std": 0.1585368812084198, "step": 774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.262214983713355, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5050777792930603, "kl": 0.9824758768081665, "learning_rate": 6.240475689828087e-07, "loss": 0.001, "num_tokens": 318521600.0, "reward": 0.8657810091972351, "reward_std": 0.07592863589525223, "rewards/reward_len/mean": 0.8657810091972351, "rewards/reward_len/std": 0.1326105296611786, "step": 775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2638436482084692, "frac_reward_zero_std": 0.125, "grad_norm": 0.6611026525497437, "kl": 0.9637758731842041, "learning_rate": 6.232212834091016e-07, "loss": 0.001, "num_tokens": 318932944.0, "reward": 0.8710272312164307, "reward_std": 0.07657058537006378, "rewards/reward_len/mean": 0.8710272312164307, "rewards/reward_len/std": 0.15136153995990753, "step": 776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.265472312703583, "frac_reward_zero_std": 0.0, "grad_norm": 0.5306569337844849, "kl": 0.933874785900116, "learning_rate": 6.223946394039386e-07, "loss": 0.0009, "num_tokens": 319343568.0, "reward": 0.8118324875831604, "reward_std": 0.10793709754943848, "rewards/reward_len/mean": 0.8118324875831604, "rewards/reward_len/std": 0.22982069849967957, "step": 777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2671009771986972, "frac_reward_zero_std": 0.0, "grad_norm": 0.6259944438934326, "kl": 1.0169353485107422, "learning_rate": 6.215676393718979e-07, "loss": 0.001, "num_tokens": 319754864.0, "reward": 0.8239790201187134, "reward_std": 0.0991220474243164, "rewards/reward_len/mean": 0.8239790201187134, "rewards/reward_len/std": 0.13595837354660034, "step": 778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.268729641693811, "frac_reward_zero_std": 0.03125, "grad_norm": 0.548584520816803, "kl": 0.9929777383804321, "learning_rate": 6.207402857185934e-07, "loss": 0.001, "num_tokens": 320167440.0, "reward": 0.8647435903549194, "reward_std": 0.09828947484493256, "rewards/reward_len/mean": 0.8647435903549194, "rewards/reward_len/std": 0.1461469829082489, "step": 779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2703583061889252, "frac_reward_zero_std": 0.03125, "grad_norm": 0.605322003364563, "kl": 0.9966015815734863, "learning_rate": 6.199125808506677e-07, "loss": 0.001, "num_tokens": 320578624.0, "reward": 0.854337751865387, "reward_std": 0.08750371634960175, "rewards/reward_len/mean": 0.8543376922607422, "rewards/reward_len/std": 0.15190161764621735, "step": 780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.271986970684039, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5148822069168091, "kl": 1.046364188194275, "learning_rate": 6.190845271757846e-07, "loss": 0.001, "num_tokens": 320989168.0, "reward": 0.8599001169204712, "reward_std": 0.09277479350566864, "rewards/reward_len/mean": 0.8599001169204712, "rewards/reward_len/std": 0.14915044605731964, "step": 781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2736156351791532, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6700528264045715, "kl": 0.9485293626785278, "learning_rate": 6.18256127102623e-07, "loss": 0.0009, "num_tokens": 321399584.0, "reward": 0.8495761752128601, "reward_std": 0.09913314133882523, "rewards/reward_len/mean": 0.8495761752128601, "rewards/reward_len/std": 0.16737522184848785, "step": 782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.275244299674267, "frac_reward_zero_std": 0.0, "grad_norm": 0.5867993831634521, "kl": 0.9072176814079285, "learning_rate": 6.174273830408693e-07, "loss": 0.0009, "num_tokens": 321810784.0, "reward": 0.8238231539726257, "reward_std": 0.11934762448072433, "rewards/reward_len/mean": 0.823823094367981, "rewards/reward_len/std": 0.18061968684196472, "step": 783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2768729641693812, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4624544084072113, "kl": 1.07521653175354, "learning_rate": 6.165982974012104e-07, "loss": 0.0011, "num_tokens": 322222768.0, "reward": 0.8622351884841919, "reward_std": 0.08240460604429245, "rewards/reward_len/mean": 0.8622351288795471, "rewards/reward_len/std": 0.13946667313575745, "step": 784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.278501628664495, "frac_reward_zero_std": 0.0, "grad_norm": 0.528731644153595, "kl": 0.9490184187889099, "learning_rate": 6.157688725953269e-07, "loss": 0.0009, "num_tokens": 322635008.0, "reward": 0.8365463614463806, "reward_std": 0.1069311574101448, "rewards/reward_len/mean": 0.8365463614463806, "rewards/reward_len/std": 0.24826325476169586, "step": 785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2801302931596092, "frac_reward_zero_std": 0.0, "grad_norm": 0.667375922203064, "kl": 1.0421568155288696, "learning_rate": 6.149391110358859e-07, "loss": 0.001, "num_tokens": 323045632.0, "reward": 0.8804180026054382, "reward_std": 0.08776363730430603, "rewards/reward_len/mean": 0.880418062210083, "rewards/reward_len/std": 0.13504837453365326, "step": 786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.281758957654723, "frac_reward_zero_std": 0.0, "grad_norm": 0.6649826765060425, "kl": 0.9824499487876892, "learning_rate": 6.14109015136534e-07, "loss": 0.001, "num_tokens": 323457872.0, "reward": 0.8597849607467651, "reward_std": 0.10986722260713577, "rewards/reward_len/mean": 0.8597849607467651, "rewards/reward_len/std": 0.1812620759010315, "step": 787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2833876221498373, "frac_reward_zero_std": 0.0, "grad_norm": 0.5053057670593262, "kl": 0.8942596912384033, "learning_rate": 6.132785873118904e-07, "loss": 0.0009, "num_tokens": 323869248.0, "reward": 0.8864105939865112, "reward_std": 0.09204859286546707, "rewards/reward_len/mean": 0.8864105343818665, "rewards/reward_len/std": 0.13000133633613586, "step": 788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.285016286644951, "frac_reward_zero_std": 0.03125, "grad_norm": 0.672671377658844, "kl": 0.9473615884780884, "learning_rate": 6.124478299775402e-07, "loss": 0.0009, "num_tokens": 324280624.0, "reward": 0.8628567457199097, "reward_std": 0.09368512034416199, "rewards/reward_len/mean": 0.8628567457199097, "rewards/reward_len/std": 0.15331590175628662, "step": 789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2866449511400653, "frac_reward_zero_std": 0.03125, "grad_norm": 0.49875590205192566, "kl": 1.230309247970581, "learning_rate": 6.116167455500264e-07, "loss": 0.0012, "num_tokens": 324691712.0, "reward": 0.9087775945663452, "reward_std": 0.06528870016336441, "rewards/reward_len/mean": 0.9087775945663452, "rewards/reward_len/std": 0.099493607878685, "step": 790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.288273615635179, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5536404252052307, "kl": 1.0226893424987793, "learning_rate": 6.107853364468438e-07, "loss": 0.001, "num_tokens": 325101952.0, "reward": 0.8819142580032349, "reward_std": 0.08658333122730255, "rewards/reward_len/mean": 0.8819142580032349, "rewards/reward_len/std": 0.12916278839111328, "step": 791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2899022801302933, "frac_reward_zero_std": 0.0, "grad_norm": 0.632254958152771, "kl": 1.0717904567718506, "learning_rate": 6.099536050864314e-07, "loss": 0.0011, "num_tokens": 325512416.0, "reward": 0.8421878814697266, "reward_std": 0.104166179895401, "rewards/reward_len/mean": 0.842187762260437, "rewards/reward_len/std": 0.17671199142932892, "step": 792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.291530944625407, "frac_reward_zero_std": 0.0, "grad_norm": 0.5571500062942505, "kl": 0.890981137752533, "learning_rate": 6.091215538881658e-07, "loss": 0.0009, "num_tokens": 325923424.0, "reward": 0.8677847385406494, "reward_std": 0.09834159910678864, "rewards/reward_len/mean": 0.8677847385406494, "rewards/reward_len/std": 0.14237581193447113, "step": 793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2931596091205213, "frac_reward_zero_std": 0.0, "grad_norm": 0.46003010869026184, "kl": 1.040059208869934, "learning_rate": 6.08289185272354e-07, "loss": 0.001, "num_tokens": 326334784.0, "reward": 0.882177472114563, "reward_std": 0.0721481442451477, "rewards/reward_len/mean": 0.882177472114563, "rewards/reward_len/std": 0.1114688441157341, "step": 794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.294788273615635, "frac_reward_zero_std": 0.0, "grad_norm": 0.5274245142936707, "kl": 1.0262317657470703, "learning_rate": 6.074565016602263e-07, "loss": 0.001, "num_tokens": 326745344.0, "reward": 0.8824129700660706, "reward_std": 0.07719889283180237, "rewards/reward_len/mean": 0.8824130296707153, "rewards/reward_len/std": 0.14207780361175537, "step": 795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2964169381107493, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5568190813064575, "kl": 0.9820303916931152, "learning_rate": 6.066235054739288e-07, "loss": 0.001, "num_tokens": 327156944.0, "reward": 0.8934600353240967, "reward_std": 0.09114055335521698, "rewards/reward_len/mean": 0.8934600353240967, "rewards/reward_len/std": 0.18184977769851685, "step": 796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.298045602605863, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5187119245529175, "kl": 0.8768844604492188, "learning_rate": 6.057901991365175e-07, "loss": 0.0009, "num_tokens": 327567744.0, "reward": 0.8502852916717529, "reward_std": 0.09909721463918686, "rewards/reward_len/mean": 0.8502852916717529, "rewards/reward_len/std": 0.18013259768486023, "step": 797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.2996742671009773, "frac_reward_zero_std": 0.0, "grad_norm": 0.7224984765052795, "kl": 0.9552005529403687, "learning_rate": 6.049565850719504e-07, "loss": 0.001, "num_tokens": 327977904.0, "reward": 0.8691169023513794, "reward_std": 0.08267006278038025, "rewards/reward_len/mean": 0.8691169023513794, "rewards/reward_len/std": 0.1467570811510086, "step": 798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.3013029315960911, "frac_reward_zero_std": 0.0, "grad_norm": 0.5344515442848206, "kl": 0.9172037839889526, "learning_rate": 6.041226657050803e-07, "loss": 0.0009, "num_tokens": 328388464.0, "reward": 0.877011775970459, "reward_std": 0.09934454411268234, "rewards/reward_len/mean": 0.877011775970459, "rewards/reward_len/std": 0.14581261575222015, "step": 799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 1.3029315960912053, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5337148308753967, "kl": 0.9092113971710205, "learning_rate": 6.032884434616484e-07, "loss": 0.0009, "num_tokens": 328800208.0, "reward": 0.8178131580352783, "reward_std": 0.10676635801792145, "rewards/reward_len/mean": 0.8178131580352783, "rewards/reward_len/std": 0.19412481784820557, "step": 800 } ], "logging_steps": 1, "max_steps": 1842, "num_input_tokens_seen": 328800208, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }