{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0017699115044247, "eval_steps": 500, "global_step": 566, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 116.921875, "completions/mean_terminated_length": 116.921875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.20697785913944244, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 17147.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6091711521148682, "sampling/importance_sampling_ratio/mean": 1.0006325244903564, "sampling/importance_sampling_ratio/min": 0.20119503140449524, "sampling/sampling_logp_difference/max": 1.603480577468872, "sampling/sampling_logp_difference/mean": 0.017017420381307602, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.19342172145843506, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.0923920946256687, "kl": 0.0, "learning_rate": 8.849557522123893e-09, "loss": 0.0044, "num_tokens": 42739.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001102328300476, "sampling/importance_sampling_ratio/min": 0.5267084240913391, "sampling/sampling_logp_difference/max": 0.8144505023956299, "sampling/sampling_logp_difference/mean": 0.014691345393657684, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 329.578125, "completions/mean_terminated_length": 329.578125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.25077879428863525, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.5, "grad_norm": 1.3080871009016275, "kl": 0.000654924544505775, "learning_rate": 1.7699115044247786e-08, "loss": -0.0859, "num_tokens": 77512.0, "reward": -0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995365738868713, "sampling/importance_sampling_ratio/min": 0.41745737195014954, "sampling/sampling_logp_difference/max": 0.873572826385498, "sampling/sampling_logp_difference/mean": 0.014907658100128174, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.24211668968200684, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.099957534131909, "kl": 0.0007418019231408834, "learning_rate": 2.654867256637168e-08, "loss": -0.0251, "num_tokens": 104456.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994640350341797, "sampling/importance_sampling_ratio/min": 0.4866437315940857, "sampling/sampling_logp_difference/max": 0.787409782409668, "sampling/sampling_logp_difference/mean": 0.01581917703151703, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 119.796875, "completions/mean_terminated_length": 119.796875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.17493584752082825, "epoch": 0.008849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.008946367648204409, "kl": 0.0008182815508916974, "learning_rate": 3.539823008849557e-08, "loss": 0.0, "num_tokens": 123563.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005104541778564, "sampling/importance_sampling_ratio/min": 0.45969533920288086, "sampling/sampling_logp_difference/max": 1.0377099514007568, "sampling/sampling_logp_difference/mean": 0.015085182152688503, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 138.96875, "completions/mean_terminated_length": 138.96875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.1379530429840088, "epoch": 0.010619469026548672, "frac_reward_zero_std": 0.75, "grad_norm": 1.6718709011918418, "kl": 0.0010723145678639412, "learning_rate": 4.424778761061947e-08, "loss": -0.0098, "num_tokens": 143353.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003252029418945, "sampling/importance_sampling_ratio/min": 0.47668424248695374, "sampling/sampling_logp_difference/max": 0.8771946430206299, "sampling/sampling_logp_difference/mean": 0.015779944136738777, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 127.46875, "completions/mean_terminated_length": 127.46875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.21951116621494293, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 1.546570105499669, "kl": 0.0012489594519138336, "learning_rate": 5.309734513274336e-08, "loss": -0.0063, "num_tokens": 166023.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997199773788452, "sampling/importance_sampling_ratio/min": 0.48152080178260803, "sampling/sampling_logp_difference/max": 0.7308058738708496, "sampling/sampling_logp_difference/mean": 0.018544606864452362, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.27145683765411377, "epoch": 0.01415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.7977693290779326, "kl": 0.0009276042110286653, "learning_rate": 6.194690265486725e-08, "loss": 0.0187, "num_tokens": 187703.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8794910907745361, "sampling/importance_sampling_ratio/mean": 1.0002965927124023, "sampling/importance_sampling_ratio/min": 0.5894622802734375, "sampling/sampling_logp_difference/max": 0.6310009956359863, "sampling/sampling_logp_difference/mean": 0.016642892733216286, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 98.75, "completions/mean_terminated_length": 98.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.11876099556684494, "epoch": 0.01592920353982301, "frac_reward_zero_std": 1.0, "grad_norm": 0.031121728957351182, "kl": 0.0012574833817780018, "learning_rate": 7.079646017699114e-08, "loss": 0.0, "num_tokens": 203831.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001816987991333, "sampling/importance_sampling_ratio/min": 0.5172110199928284, "sampling/sampling_logp_difference/max": 0.8154864311218262, "sampling/sampling_logp_difference/mean": 0.013149796985089779, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 156.15625, "completions/mean_terminated_length": 156.15625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.16917940974235535, "epoch": 0.017699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 1.5171070116726488, "kl": 0.0007576899952255189, "learning_rate": 7.964601769911503e-08, "loss": -0.0085, "num_tokens": 225505.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999789834022522, "sampling/importance_sampling_ratio/min": 0.3941565454006195, "sampling/sampling_logp_difference/max": 0.9310071468353271, "sampling/sampling_logp_difference/mean": 0.013611899688839912, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 183.421875, "completions/mean_terminated_length": 183.421875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.19869819283485413, "epoch": 0.019469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.011400731068222064, "kl": 0.000893578224349767, "learning_rate": 8.849557522123894e-08, "loss": 0.0, "num_tokens": 247228.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006699562072754, "sampling/importance_sampling_ratio/min": 0.3722725808620453, "sampling/sampling_logp_difference/max": 0.9881290197372437, "sampling/sampling_logp_difference/mean": 0.015749048441648483, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.19851046800613403, "epoch": 0.021238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.011807262567201954, "kl": 0.0010877617169171572, "learning_rate": 9.734513274336283e-08, "loss": 0.0, "num_tokens": 265432.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8060849905014038, "sampling/importance_sampling_ratio/mean": 1.000178575515747, "sampling/importance_sampling_ratio/min": 0.4823702871799469, "sampling/sampling_logp_difference/max": 0.7290432453155518, "sampling/sampling_logp_difference/mean": 0.016146667301654816, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 196.234375, "completions/mean_terminated_length": 196.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17349953949451447, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.25, "grad_norm": 2.3602637957309476, "kl": 0.0010178176453337073, "learning_rate": 1.0619469026548672e-07, "loss": -0.1324, "num_tokens": 289351.0, "reward": 0.53125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000556230545044, "sampling/importance_sampling_ratio/min": 0.3969041407108307, "sampling/sampling_logp_difference/max": 0.9240605235099792, "sampling/sampling_logp_difference/mean": 0.014139760285615921, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 220.71875, "completions/mean_terminated_length": 220.71875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3022480010986328, "epoch": 0.024778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.007325421749856517, "kl": 0.0008525942103005946, "learning_rate": 1.1504424778761061e-07, "loss": 0.0, "num_tokens": 316933.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.750264048576355, "sampling/importance_sampling_ratio/mean": 1.0002539157867432, "sampling/importance_sampling_ratio/min": 0.38214367628097534, "sampling/sampling_logp_difference/max": 0.961958646774292, "sampling/sampling_logp_difference/mean": 0.019138658419251442, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2377316802740097, "epoch": 0.02654867256637168, "frac_reward_zero_std": 0.25, "grad_norm": 2.668334625561077, "kl": 0.0009435678948648274, "learning_rate": 1.238938053097345e-07, "loss": 0.1223, "num_tokens": 342573.0, "reward": 0.5, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005204677581787, "sampling/importance_sampling_ratio/min": 0.46006086468696594, "sampling/sampling_logp_difference/max": 0.8165304660797119, "sampling/sampling_logp_difference/mean": 0.017045902088284492, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.13470181822776794, "epoch": 0.02831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.008739180862971822, "kl": 0.0007763542234897614, "learning_rate": 1.327433628318584e-07, "loss": 0.0, "num_tokens": 367021.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000870943069458, "sampling/importance_sampling_ratio/min": 0.5115542411804199, "sampling/sampling_logp_difference/max": 1.010016679763794, "sampling/sampling_logp_difference/mean": 0.011964352801442146, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 301.40625, "completions/mean_terminated_length": 301.40625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.18349219858646393, "epoch": 0.03008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 0.6309668871529117, "kl": 0.0006560541223734617, "learning_rate": 1.4159292035398229e-07, "loss": -0.1004, "num_tokens": 396855.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000659227371216, "sampling/importance_sampling_ratio/min": 0.31769227981567383, "sampling/sampling_logp_difference/max": 1.146672010421753, "sampling/sampling_logp_difference/mean": 0.01168097835034132, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 245.03125, "completions/mean_terminated_length": 245.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15564164519309998, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.25, "grad_norm": 2.0645102838982945, "kl": 0.0008038764353841543, "learning_rate": 1.504424778761062e-07, "loss": -0.0329, "num_tokens": 422921.0, "reward": 0.53125, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6298564672470093, "sampling/importance_sampling_ratio/mean": 0.9991835355758667, "sampling/importance_sampling_ratio/min": 0.3150845766067505, "sampling/sampling_logp_difference/max": 1.154914140701294, "sampling/sampling_logp_difference/mean": 0.012903538532555103, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 172.984375, "completions/mean_terminated_length": 172.984375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.20079758763313293, "epoch": 0.033628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 1.26169033794228, "kl": 0.0008508848259225488, "learning_rate": 1.5929203539823007e-07, "loss": -0.084, "num_tokens": 445640.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6069133281707764, "sampling/importance_sampling_ratio/mean": 0.9999054670333862, "sampling/importance_sampling_ratio/min": 0.5909508466720581, "sampling/sampling_logp_difference/max": 0.5260224342346191, "sampling/sampling_logp_difference/mean": 0.014038577675819397, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 128.09375, "completions/mean_terminated_length": 128.09375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1756119430065155, "epoch": 0.035398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 3.682414213657115, "kl": 0.0014270239043980837, "learning_rate": 1.68141592920354e-07, "loss": 0.0296, "num_tokens": 463774.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000065803527832, "sampling/importance_sampling_ratio/min": 0.41633790731430054, "sampling/sampling_logp_difference/max": 0.8762580156326294, "sampling/sampling_logp_difference/mean": 0.014868613332509995, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 229.59375, "completions/mean_terminated_length": 229.59375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1500265896320343, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.5, "grad_norm": 2.9487383928866655, "kl": 0.0009082312462851405, "learning_rate": 1.7699115044247788e-07, "loss": -0.0017, "num_tokens": 488820.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008652210235596, "sampling/importance_sampling_ratio/min": 0.39089053869247437, "sampling/sampling_logp_difference/max": 1.2574412822723389, "sampling/sampling_logp_difference/mean": 0.012858321890234947, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 245.140625, "completions/mean_terminated_length": 245.140625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2070005238056183, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 0.9258937309354511, "kl": 0.000740000803489238, "learning_rate": 1.8584070796460178e-07, "loss": 0.0165, "num_tokens": 517197.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.992316722869873, "sampling/importance_sampling_ratio/mean": 1.000396490097046, "sampling/importance_sampling_ratio/min": 0.29496005177497864, "sampling/sampling_logp_difference/max": 1.2209153175354004, "sampling/sampling_logp_difference/mean": 0.012652462348341942, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 193.9375, "completions/mean_terminated_length": 193.9375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.21532157063484192, "epoch": 0.04070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 1.2640275685209335, "kl": 0.0009581141057424247, "learning_rate": 1.9469026548672566e-07, "loss": 0.0327, "num_tokens": 539977.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997631907463074, "sampling/importance_sampling_ratio/min": 0.5373528003692627, "sampling/sampling_logp_difference/max": 1.105118751525879, "sampling/sampling_logp_difference/mean": 0.01637793332338333, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 209.734375, "completions/mean_terminated_length": 209.734375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1488220989704132, "epoch": 0.04247787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.010966285441791441, "kl": 0.0011915145441889763, "learning_rate": 2.0353982300884956e-07, "loss": 0.0, "num_tokens": 564280.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994518756866455, "sampling/importance_sampling_ratio/min": 0.34632962942123413, "sampling/sampling_logp_difference/max": 1.0603642463684082, "sampling/sampling_logp_difference/mean": 0.01427546702325344, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 179.703125, "completions/mean_terminated_length": 179.703125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2321864366531372, "epoch": 0.04424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 1.8353446276240468, "kl": 0.0010274217929691076, "learning_rate": 2.1238938053097344e-07, "loss": 0.0217, "num_tokens": 588213.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 0.9990208745002747, "sampling/importance_sampling_ratio/min": 0.3873034119606018, "sampling/sampling_logp_difference/max": 0.9485468864440918, "sampling/sampling_logp_difference/mean": 0.016579966992139816, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 218.15625, "completions/mean_terminated_length": 218.15625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.19337984919548035, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.5, "grad_norm": 1.543588826777854, "kl": 0.0007166718132793903, "learning_rate": 2.2123893805309735e-07, "loss": 0.024, "num_tokens": 613391.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8932172060012817, "sampling/importance_sampling_ratio/mean": 0.9997749328613281, "sampling/importance_sampling_ratio/min": 0.41680893301963806, "sampling/sampling_logp_difference/max": 0.8751273155212402, "sampling/sampling_logp_difference/mean": 0.01402277685701847, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 134.890625, "completions/mean_terminated_length": 134.890625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.16752539575099945, "epoch": 0.047787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.008176515429668869, "kl": 0.0007071521831676364, "learning_rate": 2.3008849557522122e-07, "loss": 0.0, "num_tokens": 632104.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006349086761475, "sampling/importance_sampling_ratio/min": 0.5376253128051758, "sampling/sampling_logp_difference/max": 1.0281221866607666, "sampling/sampling_logp_difference/mean": 0.014347018674015999, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 240.3125, "completions/mean_terminated_length": 240.3125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.17694805562496185, "epoch": 0.049557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.007210979472460295, "kl": 0.0007357672438956797, "learning_rate": 2.3893805309734513e-07, "loss": 0.0, "num_tokens": 658684.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000518798828125, "sampling/importance_sampling_ratio/min": 0.4609917104244232, "sampling/sampling_logp_difference/max": 0.8144285678863525, "sampling/sampling_logp_difference/mean": 0.013953857123851776, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 130.765625, "completions/mean_terminated_length": 130.765625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.19520853459835052, "epoch": 0.05132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 1.6378471485121089, "kl": 0.0009881139267235994, "learning_rate": 2.47787610619469e-07, "loss": 0.0099, "num_tokens": 677485.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000996589660645, "sampling/importance_sampling_ratio/min": 0.490934818983078, "sampling/sampling_logp_difference/max": 0.7196002006530762, "sampling/sampling_logp_difference/mean": 0.01571294665336609, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 191.203125, "completions/mean_terminated_length": 191.203125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.27703967690467834, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.3669579865235388, "kl": 0.0009267547866329551, "learning_rate": 2.5663716814159294e-07, "loss": 0.0031, "num_tokens": 701802.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5745229721069336, "sampling/importance_sampling_ratio/mean": 0.9990708231925964, "sampling/importance_sampling_ratio/min": 0.5164141654968262, "sampling/sampling_logp_difference/max": 0.6608462333679199, "sampling/sampling_logp_difference/mean": 0.018282253295183182, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 316.34375, "completions/mean_terminated_length": 316.34375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.14983418583869934, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.25, "grad_norm": 1.588680969129259, "kl": 0.0005144693423062563, "learning_rate": 2.654867256637168e-07, "loss": -0.0748, "num_tokens": 734048.0, "reward": 0.53125, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6938190460205078, "sampling/importance_sampling_ratio/mean": 0.9997175931930542, "sampling/importance_sampling_ratio/min": 0.49723783135414124, "sampling/sampling_logp_difference/max": 0.6986868381500244, "sampling/sampling_logp_difference/mean": 0.010903030633926392, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 236.21875, "completions/mean_terminated_length": 236.21875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2014048993587494, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 1.1165569594013665, "kl": 0.0008729900000616908, "learning_rate": 2.743362831858407e-07, "loss": 0.0135, "num_tokens": 761070.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011087656021118, "sampling/importance_sampling_ratio/min": 0.5260618925094604, "sampling/sampling_logp_difference/max": 0.7560343742370605, "sampling/sampling_logp_difference/mean": 0.014054338447749615, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 184.171875, "completions/mean_terminated_length": 184.171875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.18755163252353668, "epoch": 0.0584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.01348215804731975, "kl": 0.0011707344092428684, "learning_rate": 2.8318584070796457e-07, "loss": 0.0, "num_tokens": 784009.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.807448148727417, "sampling/importance_sampling_ratio/mean": 1.0004781484603882, "sampling/importance_sampling_ratio/min": 0.3858293294906616, "sampling/sampling_logp_difference/max": 0.9523601531982422, "sampling/sampling_logp_difference/mean": 0.014409128576517105, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 162.046875, "completions/mean_terminated_length": 162.046875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.20699694752693176, "epoch": 0.06017699115044248, "frac_reward_zero_std": 0.5, "grad_norm": 2.5364202111347947, "kl": 0.001438008388504386, "learning_rate": 2.920353982300885e-07, "loss": 0.0114, "num_tokens": 805852.0, "reward": 0.65625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002546310424805, "sampling/importance_sampling_ratio/min": 0.3732570707798004, "sampling/sampling_logp_difference/max": 0.9854879379272461, "sampling/sampling_logp_difference/mean": 0.017536424100399017, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 171.40625, "completions/mean_terminated_length": 171.40625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.14141812920570374, "epoch": 0.061946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.011702442983698658, "kl": 0.001038202317431569, "learning_rate": 3.008849557522124e-07, "loss": 0.0, "num_tokens": 830262.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8767281770706177, "sampling/importance_sampling_ratio/mean": 1.0000505447387695, "sampling/importance_sampling_ratio/min": 0.5362989902496338, "sampling/sampling_logp_difference/max": 0.6295299530029297, "sampling/sampling_logp_difference/mean": 0.013316100463271141, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.11798705160617828, "epoch": 0.06371681415929203, "frac_reward_zero_std": 1.0, "grad_norm": 0.018761091221181157, "kl": 0.0011536427773535252, "learning_rate": 3.0973451327433626e-07, "loss": 0.0, "num_tokens": 845566.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001306533813477, "sampling/importance_sampling_ratio/min": 0.3007712662220001, "sampling/sampling_logp_difference/max": 1.2014052867889404, "sampling/sampling_logp_difference/mean": 0.014341464266180992, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.24642238020896912, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.75, "grad_norm": 2.308006414324892, "kl": 0.00105126085691154, "learning_rate": 3.1858407079646014e-07, "loss": 0.0366, "num_tokens": 871302.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.849735975265503, "sampling/importance_sampling_ratio/mean": 1.001230239868164, "sampling/importance_sampling_ratio/min": 0.3682939410209656, "sampling/sampling_logp_difference/max": 0.9988739490509033, "sampling/sampling_logp_difference/mean": 0.01865249313414097, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 141.96875, "completions/mean_terminated_length": 141.96875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16303592920303345, "epoch": 0.06725663716814159, "frac_reward_zero_std": 0.5, "grad_norm": 2.5431876025675364, "kl": 0.0014443504624068737, "learning_rate": 3.2743362831858407e-07, "loss": 0.0215, "num_tokens": 890196.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9876481294631958, "sampling/importance_sampling_ratio/mean": 1.0002636909484863, "sampling/importance_sampling_ratio/min": 0.5389053225517273, "sampling/sampling_logp_difference/max": 0.6869521141052246, "sampling/sampling_logp_difference/mean": 0.013612573966383934, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 126.921875, "completions/mean_terminated_length": 126.921875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.14303557574748993, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 2.223847687129557, "kl": 0.0010769811924546957, "learning_rate": 3.36283185840708e-07, "loss": -0.1026, "num_tokens": 907711.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9390095472335815, "sampling/importance_sampling_ratio/mean": 0.9995125532150269, "sampling/importance_sampling_ratio/min": 0.4159519672393799, "sampling/sampling_logp_difference/max": 0.8771854639053345, "sampling/sampling_logp_difference/mean": 0.014646771363914013, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 141.765625, "completions/mean_terminated_length": 141.765625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.17122280597686768, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.6144760765821782, "kl": 0.0009360772091895342, "learning_rate": 3.451327433628318e-07, "loss": 0.0059, "num_tokens": 928032.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8118932247161865, "sampling/importance_sampling_ratio/mean": 0.9990648627281189, "sampling/importance_sampling_ratio/min": 0.41042402386665344, "sampling/sampling_logp_difference/max": 0.8905644416809082, "sampling/sampling_logp_difference/mean": 0.013022142462432384, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 233.796875, "completions/mean_terminated_length": 233.796875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2843101918697357, "epoch": 0.07256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.005713047557056584, "kl": 0.0008049719035625458, "learning_rate": 3.5398230088495575e-07, "loss": 0.0, "num_tokens": 956371.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997782707214355, "sampling/importance_sampling_ratio/min": 0.6243152618408203, "sampling/sampling_logp_difference/max": 0.7081446647644043, "sampling/sampling_logp_difference/mean": 0.014828085899353027, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 228.84375, "completions/mean_terminated_length": 228.84375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2109835147857666, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 1.0713341064905801, "kl": 0.0010454637231305242, "learning_rate": 3.6283185840707963e-07, "loss": -0.0051, "num_tokens": 982617.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003876686096191, "sampling/importance_sampling_ratio/min": 0.6008963584899902, "sampling/sampling_logp_difference/max": 0.8771946430206299, "sampling/sampling_logp_difference/mean": 0.013687508180737495, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.24477946758270264, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.75, "grad_norm": 1.25107031369608, "kl": 0.001167207956314087, "learning_rate": 3.7168141592920356e-07, "loss": -0.0047, "num_tokens": 1005709.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998118281364441, "sampling/importance_sampling_ratio/min": 0.5384308099746704, "sampling/sampling_logp_difference/max": 1.4401640892028809, "sampling/sampling_logp_difference/mean": 0.016788320615887642, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 178.484375, "completions/mean_terminated_length": 178.484375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.18052473664283752, "epoch": 0.07787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 2.0020955822357904, "kl": 0.0009459182037971914, "learning_rate": 3.805309734513274e-07, "loss": -0.0177, "num_tokens": 1029100.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.85381281375885, "sampling/importance_sampling_ratio/mean": 0.9988968372344971, "sampling/importance_sampling_ratio/min": 0.37974247336387634, "sampling/sampling_logp_difference/max": 0.9682619571685791, "sampling/sampling_logp_difference/mean": 0.01423648837953806, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 107.96875, "completions/mean_terminated_length": 107.96875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.12751907110214233, "epoch": 0.07964601769911504, "frac_reward_zero_std": 0.75, "grad_norm": 3.5654891226520005, "kl": 0.0009743129485286772, "learning_rate": 3.893805309734513e-07, "loss": 0.0045, "num_tokens": 1045066.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000836849212646, "sampling/importance_sampling_ratio/min": 0.49824559688568115, "sampling/sampling_logp_difference/max": 0.7048590183258057, "sampling/sampling_logp_difference/mean": 0.012781774625182152, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 234.5625, "completions/mean_terminated_length": 234.5625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1693953573703766, "epoch": 0.08141592920353982, "frac_reward_zero_std": 0.5, "grad_norm": 1.819386344945796, "kl": 0.0008317027823068202, "learning_rate": 3.982300884955752e-07, "loss": 0.02, "num_tokens": 1070862.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6261541843414307, "sampling/importance_sampling_ratio/mean": 0.9997702836990356, "sampling/importance_sampling_ratio/min": 0.32852938771247864, "sampling/sampling_logp_difference/max": 1.1131290197372437, "sampling/sampling_logp_difference/mean": 0.012345898896455765, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 270.046875, "completions/mean_terminated_length": 270.046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.1756407618522644, "epoch": 0.0831858407079646, "frac_reward_zero_std": 0.25, "grad_norm": 1.9758450394265588, "kl": 0.0014370502904057503, "learning_rate": 4.0707964601769913e-07, "loss": -0.2443, "num_tokens": 1098737.0, "reward": 0.125, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8557589054107666, "sampling/importance_sampling_ratio/mean": 1.0001187324523926, "sampling/importance_sampling_ratio/min": 0.3232802748680115, "sampling/sampling_logp_difference/max": 1.1292356252670288, "sampling/sampling_logp_difference/mean": 0.014051987789571285, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 228.34375, "completions/mean_terminated_length": 228.34375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.15174150466918945, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.2476166953069376, "kl": 0.000813533435575664, "learning_rate": 4.1592920353982295e-07, "loss": 0.0001, "num_tokens": 1123303.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997676610946655, "sampling/importance_sampling_ratio/min": 0.4360385239124298, "sampling/sampling_logp_difference/max": 0.8300247192382812, "sampling/sampling_logp_difference/mean": 0.011012416332960129, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 150.421875, "completions/mean_terminated_length": 150.421875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.20299126207828522, "epoch": 0.08672566371681416, "frac_reward_zero_std": 0.5, "grad_norm": 2.2230779860014236, "kl": 0.0013882017228752375, "learning_rate": 4.247787610619469e-07, "loss": 0.0192, "num_tokens": 1142930.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9442600011825562, "sampling/importance_sampling_ratio/mean": 0.9987527132034302, "sampling/importance_sampling_ratio/min": 0.5047078728675842, "sampling/sampling_logp_difference/max": 0.6837754249572754, "sampling/sampling_logp_difference/mean": 0.014624504372477531, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 179.4375, "completions/mean_terminated_length": 179.4375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.18069413304328918, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 2.1473394614744885, "kl": 0.0016161503735929728, "learning_rate": 4.3362831858407076e-07, "loss": -0.0276, "num_tokens": 1164894.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998122453689575, "sampling/importance_sampling_ratio/min": 0.39724382758140564, "sampling/sampling_logp_difference/max": 2.5610384941101074, "sampling/sampling_logp_difference/mean": 0.01448934618383646, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 156.359375, "completions/mean_terminated_length": 156.359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.15276214480400085, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 1.5627306046078462, "kl": 0.0018083984032273293, "learning_rate": 4.424778761061947e-07, "loss": -0.018, "num_tokens": 1185253.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8102161884307861, "sampling/importance_sampling_ratio/mean": 1.0000556707382202, "sampling/importance_sampling_ratio/min": 0.2618989944458008, "sampling/sampling_logp_difference/max": 1.3397963047027588, "sampling/sampling_logp_difference/mean": 0.013555309735238552, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 194.90625, "completions/mean_terminated_length": 194.90625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.21800324320793152, "epoch": 0.0920353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 1.868643360743444, "kl": 0.0017419452778995037, "learning_rate": 4.5132743362831857e-07, "loss": 0.0648, "num_tokens": 1212047.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011686086654663, "sampling/importance_sampling_ratio/min": 0.6059046983718872, "sampling/sampling_logp_difference/max": 0.8144683837890625, "sampling/sampling_logp_difference/mean": 0.015190355479717255, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 255.09375, "completions/mean_terminated_length": 255.09375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.2503582537174225, "epoch": 0.09380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.01249984415815557, "kl": 0.0009948830120265484, "learning_rate": 4.6017699115044245e-07, "loss": 0.0, "num_tokens": 1240709.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.837790608406067, "sampling/importance_sampling_ratio/mean": 1.0002087354660034, "sampling/importance_sampling_ratio/min": 0.5682967305183411, "sampling/sampling_logp_difference/max": 0.6085641384124756, "sampling/sampling_logp_difference/mean": 0.01405000314116478, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.22767794132232666, "epoch": 0.09557522123893805, "frac_reward_zero_std": 0.75, "grad_norm": 0.8593297415362304, "kl": 0.002414063084870577, "learning_rate": 4.690265486725664e-07, "loss": -0.0086, "num_tokens": 1265989.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6337288618087769, "sampling/importance_sampling_ratio/mean": 1.0001916885375977, "sampling/importance_sampling_ratio/min": 0.12030123919248581, "sampling/sampling_logp_difference/max": 2.1177563667297363, "sampling/sampling_logp_difference/mean": 0.012822871096432209, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 131.15625, "completions/mean_terminated_length": 131.15625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.1695953905582428, "epoch": 0.09734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.01605816696443317, "kl": 0.0016652561025694013, "learning_rate": 4.778761061946903e-07, "loss": 0.0, "num_tokens": 1285103.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6399857997894287, "sampling/importance_sampling_ratio/mean": 1.000382900238037, "sampling/importance_sampling_ratio/min": 0.6125958561897278, "sampling/sampling_logp_difference/max": 0.494687557220459, "sampling/sampling_logp_difference/mean": 0.014917861670255661, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1710292547941208, "epoch": 0.09911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 2.016599467431796, "kl": 0.002824608236551285, "learning_rate": 4.867256637168141e-07, "loss": 0.0187, "num_tokens": 1310031.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.9443155527114868, "sampling/importance_sampling_ratio/mean": 1.0003931522369385, "sampling/importance_sampling_ratio/min": 0.46123114228248596, "sampling/sampling_logp_difference/max": 0.7738559246063232, "sampling/sampling_logp_difference/mean": 0.01424512267112732, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 140.046875, "completions/mean_terminated_length": 140.046875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.14154188334941864, "epoch": 0.10088495575221239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0361787094364151, "kl": 0.002959079574793577, "learning_rate": 4.95575221238938e-07, "loss": 0.0, "num_tokens": 1328722.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9627652168273926, "sampling/importance_sampling_ratio/mean": 1.000194787979126, "sampling/importance_sampling_ratio/min": 0.39854058623313904, "sampling/sampling_logp_difference/max": 0.9199459552764893, "sampling/sampling_logp_difference/mean": 0.014579495415091515, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11697734892368317, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 0.8989127695996973, "kl": 0.002100049750879407, "learning_rate": 5.044247787610619e-07, "loss": -0.0076, "num_tokens": 1356730.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993403553962708, "sampling/importance_sampling_ratio/min": 0.5052147507667542, "sampling/sampling_logp_difference/max": 0.8105251789093018, "sampling/sampling_logp_difference/mean": 0.01117606833577156, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 301.1875, "completions/mean_terminated_length": 301.1875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.26915013790130615, "epoch": 0.10442477876106195, "frac_reward_zero_std": 0.5, "grad_norm": 1.2147300319268985, "kl": 0.0015006400644779205, "learning_rate": 5.132743362831859e-07, "loss": 0.0107, "num_tokens": 1389974.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6589486598968506, "sampling/importance_sampling_ratio/mean": 0.9998583197593689, "sampling/importance_sampling_ratio/min": 0.49761274456977844, "sampling/sampling_logp_difference/max": 0.6979331970214844, "sampling/sampling_logp_difference/mean": 0.014590919017791748, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 235.8125, "completions/mean_terminated_length": 235.8125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19573962688446045, "epoch": 0.10619469026548672, "frac_reward_zero_std": 0.5, "grad_norm": 1.558080662483034, "kl": 0.0016793024260550737, "learning_rate": 5.221238938053097e-07, "loss": -0.0908, "num_tokens": 1415802.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.978001356124878, "sampling/importance_sampling_ratio/mean": 1.0002704858779907, "sampling/importance_sampling_ratio/min": 0.3906334936618805, "sampling/sampling_logp_difference/max": 0.9399855136871338, "sampling/sampling_logp_difference/mean": 0.013243768364191055, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 371.6875, "completions/mean_terminated_length": 371.6875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17582863569259644, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.25, "grad_norm": 1.6357971919009373, "kl": 0.0019237755332142115, "learning_rate": 5.309734513274336e-07, "loss": 0.0322, "num_tokens": 1453366.0, "reward": -0.0625, "reward_std": 0.6813369989395142, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998849630355835, "sampling/importance_sampling_ratio/min": 0.4624379277229309, "sampling/sampling_logp_difference/max": 0.7783390283584595, "sampling/sampling_logp_difference/mean": 0.012779934331774712, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 107.96875, "completions/mean_terminated_length": 107.96875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.16164499521255493, "epoch": 0.10973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 2.409640912068371, "kl": 0.0036677056923508644, "learning_rate": 5.398230088495575e-07, "loss": -0.0131, "num_tokens": 1470244.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003435611724854, "sampling/importance_sampling_ratio/min": 0.4530305564403534, "sampling/sampling_logp_difference/max": 1.4775214195251465, "sampling/sampling_logp_difference/mean": 0.014469078741967678, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 191.828125, "completions/mean_terminated_length": 191.828125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.14933156967163086, "epoch": 0.11150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 1.145193689979528, "kl": 0.0032071906607598066, "learning_rate": 5.486725663716814e-07, "loss": 0.024, "num_tokens": 1493577.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6727534532546997, "sampling/importance_sampling_ratio/mean": 0.9998900294303894, "sampling/importance_sampling_ratio/min": 0.3966847360134125, "sampling/sampling_logp_difference/max": 0.9246134757995605, "sampling/sampling_logp_difference/mean": 0.011439943686127663, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2044827789068222, "epoch": 0.11327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 1.2166201611870002, "kl": 0.002989033702760935, "learning_rate": 5.575221238938052e-07, "loss": 0.025, "num_tokens": 1519241.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999023675918579, "sampling/importance_sampling_ratio/min": 0.5040412545204163, "sampling/sampling_logp_difference/max": 0.7373228073120117, "sampling/sampling_logp_difference/mean": 0.013392649590969086, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 180.796875, "completions/mean_terminated_length": 180.796875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2071433663368225, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.5540011850417457, "kl": 0.0039022755809128284, "learning_rate": 5.663716814159291e-07, "loss": -0.0462, "num_tokens": 1540892.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9989885091781616, "sampling/importance_sampling_ratio/min": 0.3821437656879425, "sampling/sampling_logp_difference/max": 0.9619584083557129, "sampling/sampling_logp_difference/mean": 0.016249412670731544, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 79.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.13125570118427277, "epoch": 0.1168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.08955128693633796, "kl": 0.00606576818972826, "learning_rate": 5.752212389380531e-07, "loss": 0.0001, "num_tokens": 1555660.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6711188554763794, "sampling/importance_sampling_ratio/mean": 0.9981821775436401, "sampling/importance_sampling_ratio/min": 0.47878360748291016, "sampling/sampling_logp_difference/max": 0.7365065813064575, "sampling/sampling_logp_difference/mean": 0.01813593879342079, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 188.734375, "completions/mean_terminated_length": 188.734375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.1586710810661316, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.9592696570834163, "kl": 0.003037875285372138, "learning_rate": 5.84070796460177e-07, "loss": -0.0135, "num_tokens": 1577467.0, "reward": 0.0, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6682101488113403, "sampling/importance_sampling_ratio/mean": 0.9991528391838074, "sampling/importance_sampling_ratio/min": 0.508482038974762, "sampling/sampling_logp_difference/max": 0.6763253211975098, "sampling/sampling_logp_difference/mean": 0.013168432749807835, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.17116700112819672, "epoch": 0.12035398230088495, "frac_reward_zero_std": 0.75, "grad_norm": 1.1025685863887689, "kl": 0.0025802820455282927, "learning_rate": 5.929203539823009e-07, "loss": 0.0909, "num_tokens": 1603435.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005784034729004, "sampling/importance_sampling_ratio/min": 0.3821813762187958, "sampling/sampling_logp_difference/max": 0.961859941482544, "sampling/sampling_logp_difference/mean": 0.014102600514888763, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 188.359375, "completions/mean_terminated_length": 188.359375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.17217673361301422, "epoch": 0.12212389380530973, "frac_reward_zero_std": 0.5, "grad_norm": 2.2102508538355283, "kl": 0.005149882286787033, "learning_rate": 6.017699115044248e-07, "loss": 0.0036, "num_tokens": 1626082.0, "reward": 0.4375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000403642654419, "sampling/importance_sampling_ratio/min": 0.4615646004676819, "sampling/sampling_logp_difference/max": 0.7731332778930664, "sampling/sampling_logp_difference/mean": 0.014090755954384804, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.15563452243804932, "epoch": 0.12389380530973451, "frac_reward_zero_std": 1.0, "grad_norm": 0.04176400561340745, "kl": 0.003772917203605175, "learning_rate": 6.106194690265486e-07, "loss": 0.0, "num_tokens": 1649090.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8104203939437866, "sampling/importance_sampling_ratio/mean": 0.9992653131484985, "sampling/importance_sampling_ratio/min": 0.39965805411338806, "sampling/sampling_logp_difference/max": 0.9171459674835205, "sampling/sampling_logp_difference/mean": 0.011007951572537422, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 153.046875, "completions/mean_terminated_length": 153.046875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.20204734802246094, "epoch": 0.1256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.7098319766421397, "kl": 0.0053355395793914795, "learning_rate": 6.194690265486725e-07, "loss": -0.0411, "num_tokens": 1669109.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003011226654053, "sampling/importance_sampling_ratio/min": 0.46321749687194824, "sampling/sampling_logp_difference/max": 1.037759780883789, "sampling/sampling_logp_difference/mean": 0.016230512410402298, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 187.8125, "completions/mean_terminated_length": 187.8125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.23089121282100677, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 1.8686856487814465, "kl": 0.004242965951561928, "learning_rate": 6.283185840707964e-07, "loss": 0.052, "num_tokens": 1691513.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 0.9995931386947632, "sampling/importance_sampling_ratio/min": 0.3913111686706543, "sampling/sampling_logp_difference/max": 0.9382522106170654, "sampling/sampling_logp_difference/mean": 0.015800392255187035, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 102.6875, "completions/mean_terminated_length": 102.6875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.1164252832531929, "epoch": 0.12920353982300886, "frac_reward_zero_std": 0.75, "grad_norm": 2.015117906062716, "kl": 0.004554741084575653, "learning_rate": 6.371681415929203e-07, "loss": 0.0095, "num_tokens": 1707813.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004181861877441, "sampling/importance_sampling_ratio/min": 0.4934388995170593, "sampling/sampling_logp_difference/max": 0.7287571430206299, "sampling/sampling_logp_difference/mean": 0.012399572879076004, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 234.734375, "completions/mean_terminated_length": 234.734375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.20000587403774261, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.5, "grad_norm": 1.666314979307278, "kl": 0.005966211669147015, "learning_rate": 6.460176991150442e-07, "loss": 0.0276, "num_tokens": 1732820.0, "reward": -0.1875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7521334886550903, "sampling/importance_sampling_ratio/mean": 0.9992698431015015, "sampling/importance_sampling_ratio/min": 0.22432927787303925, "sampling/sampling_logp_difference/max": 1.4946403503417969, "sampling/sampling_logp_difference/mean": 0.015472994185984135, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 265.640625, "completions/mean_terminated_length": 265.640625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.20081549882888794, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.5, "grad_norm": 1.556101511143104, "kl": 0.003977864049375057, "learning_rate": 6.548672566371681e-07, "loss": -0.0025, "num_tokens": 1760301.0, "reward": -0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6007779836654663, "sampling/importance_sampling_ratio/mean": 1.0001752376556396, "sampling/importance_sampling_ratio/min": 0.4545961916446686, "sampling/sampling_logp_difference/max": 0.7883458137512207, "sampling/sampling_logp_difference/mean": 0.012318181805312634, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 136.828125, "completions/mean_terminated_length": 136.828125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19759991765022278, "epoch": 0.13451327433628318, "frac_reward_zero_std": 0.75, "grad_norm": 1.6551037849214671, "kl": 0.004933338612318039, "learning_rate": 6.637168141592921e-07, "loss": -0.0016, "num_tokens": 1780146.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004138946533203, "sampling/importance_sampling_ratio/min": 0.5497366189956665, "sampling/sampling_logp_difference/max": 0.7718650102615356, "sampling/sampling_logp_difference/mean": 0.015339044854044914, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 156.171875, "completions/mean_terminated_length": 156.171875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1673189103603363, "epoch": 0.13628318584070798, "frac_reward_zero_std": 1.0, "grad_norm": 0.05009280592584467, "kl": 0.0052584391087293625, "learning_rate": 6.72566371681416e-07, "loss": 0.0001, "num_tokens": 1799453.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6283557415008545, "sampling/importance_sampling_ratio/mean": 1.0006399154663086, "sampling/importance_sampling_ratio/min": 0.609236478805542, "sampling/sampling_logp_difference/max": 0.49554872512817383, "sampling/sampling_logp_difference/mean": 0.012239392846822739, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 132.53125, "completions/mean_terminated_length": 132.53125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.19785723090171814, "epoch": 0.13805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 1.8083721674911888, "kl": 0.006302251480519772, "learning_rate": 6.814159292035397e-07, "loss": -0.0101, "num_tokens": 1822351.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006425380706787, "sampling/importance_sampling_ratio/min": 0.4882376492023468, "sampling/sampling_logp_difference/max": 0.9235873222351074, "sampling/sampling_logp_difference/mean": 0.0152043541893363, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 156.90625, "completions/mean_terminated_length": 156.90625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.14425361156463623, "epoch": 0.13982300884955753, "frac_reward_zero_std": 0.75, "grad_norm": 1.6947524920357688, "kl": 0.006576927844434977, "learning_rate": 6.902654867256636e-07, "loss": -0.111, "num_tokens": 1842521.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000072717666626, "sampling/importance_sampling_ratio/min": 0.315999299287796, "sampling/sampling_logp_difference/max": 1.5260412693023682, "sampling/sampling_logp_difference/mean": 0.015954207628965378, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 173.671875, "completions/mean_terminated_length": 173.671875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2206019163131714, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.1640461520707954, "kl": 0.004117241129279137, "learning_rate": 6.991150442477876e-07, "loss": 0.0023, "num_tokens": 1866148.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8179765939712524, "sampling/importance_sampling_ratio/mean": 1.0002185106277466, "sampling/importance_sampling_ratio/min": 0.6148074865341187, "sampling/sampling_logp_difference/max": 0.5977240800857544, "sampling/sampling_logp_difference/mean": 0.01679387502372265, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 196.78125, "completions/mean_terminated_length": 196.78125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2194685935974121, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.25, "grad_norm": 2.2751658856239545, "kl": 0.0050181858241558075, "learning_rate": 7.079646017699115e-07, "loss": -0.0509, "num_tokens": 1890022.0, "reward": 0.34375, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9876788854599, "sampling/importance_sampling_ratio/mean": 1.0003325939178467, "sampling/importance_sampling_ratio/min": 0.5160473585128784, "sampling/sampling_logp_difference/max": 0.6869676113128662, "sampling/sampling_logp_difference/mean": 0.014776955358684063, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 189.15625, "completions/mean_terminated_length": 189.15625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.17521485686302185, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 2.0452976823893256, "kl": 0.0038846852257847786, "learning_rate": 7.168141592920353e-07, "loss": -0.03, "num_tokens": 1914560.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.836782693862915, "sampling/importance_sampling_ratio/mean": 1.0010371208190918, "sampling/importance_sampling_ratio/min": 0.4904239773750305, "sampling/sampling_logp_difference/max": 0.7124849557876587, "sampling/sampling_logp_difference/mean": 0.012331612408161163, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 161.40625, "completions/mean_terminated_length": 161.40625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2375316470861435, "epoch": 0.14690265486725665, "frac_reward_zero_std": 0.75, "grad_norm": 1.6347396300226236, "kl": 0.008128422312438488, "learning_rate": 7.256637168141593e-07, "loss": 0.0091, "num_tokens": 1937066.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001358151435852, "sampling/importance_sampling_ratio/min": 0.5486244559288025, "sampling/sampling_logp_difference/max": 0.7371139526367188, "sampling/sampling_logp_difference/mean": 0.016400588676333427, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 173.0625, "completions/mean_terminated_length": 173.0625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.22229667007923126, "epoch": 0.1486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 2.191412573003673, "kl": 0.008657586760818958, "learning_rate": 7.345132743362832e-07, "loss": -0.0398, "num_tokens": 1959278.0, "reward": 0.59375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8718520402908325, "sampling/importance_sampling_ratio/mean": 0.9982494115829468, "sampling/importance_sampling_ratio/min": 0.3842117488384247, "sampling/sampling_logp_difference/max": 0.9565615653991699, "sampling/sampling_logp_difference/mean": 0.016396623104810715, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 328.4375, "completions/mean_terminated_length": 328.4375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1696602702140808, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.25, "grad_norm": 1.682507894556066, "kl": 0.008182156831026077, "learning_rate": 7.433628318584071e-07, "loss": -0.2055, "num_tokens": 1990922.0, "reward": -0.0625, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6045234203338623, "sampling/importance_sampling_ratio/mean": 1.0003488063812256, "sampling/importance_sampling_ratio/min": 0.5005572438240051, "sampling/sampling_logp_difference/max": 0.6920332908630371, "sampling/sampling_logp_difference/mean": 0.01261892355978489, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 107.046875, "completions/mean_terminated_length": 107.046875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.15166431665420532, "epoch": 0.15221238938053097, "frac_reward_zero_std": 0.75, "grad_norm": 2.0490806735858635, "kl": 0.012742295861244202, "learning_rate": 7.522123893805308e-07, "loss": -0.011, "num_tokens": 2007901.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991889595985413, "sampling/importance_sampling_ratio/min": 0.4056454002857208, "sampling/sampling_logp_difference/max": 0.9235873222351074, "sampling/sampling_logp_difference/mean": 0.01681956648826599, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 164.421875, "completions/mean_terminated_length": 164.421875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.15561005473136902, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.75, "grad_norm": 1.814574657238888, "kl": 0.01124626025557518, "learning_rate": 7.610619469026548e-07, "loss": -0.009, "num_tokens": 2028248.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8092561960220337, "sampling/importance_sampling_ratio/mean": 1.0002126693725586, "sampling/importance_sampling_ratio/min": 0.4373324513435364, "sampling/sampling_logp_difference/max": 0.827061653137207, "sampling/sampling_logp_difference/mean": 0.014207551255822182, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 205.34375, "completions/mean_terminated_length": 205.34375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.19017726182937622, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.5, "grad_norm": 1.7869045466627012, "kl": 0.012329556979238987, "learning_rate": 7.699115044247787e-07, "loss": -0.041, "num_tokens": 2052286.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9104565382003784, "sampling/importance_sampling_ratio/mean": 0.9999362826347351, "sampling/importance_sampling_ratio/min": 0.5039128065109253, "sampling/sampling_logp_difference/max": 0.685352087020874, "sampling/sampling_logp_difference/mean": 0.013883713632822037, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.17904934287071228, "epoch": 0.15752212389380532, "frac_reward_zero_std": 0.75, "grad_norm": 1.80083882427442, "kl": 0.010359562933444977, "learning_rate": 7.787610619469026e-07, "loss": -0.0095, "num_tokens": 2072166.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7553316354751587, "sampling/importance_sampling_ratio/mean": 0.9999127984046936, "sampling/importance_sampling_ratio/min": 0.3858296275138855, "sampling/sampling_logp_difference/max": 0.9523594379425049, "sampling/sampling_logp_difference/mean": 0.015048828907310963, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 221.359375, "completions/mean_terminated_length": 221.359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.17193087935447693, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.5, "grad_norm": 1.5296040306822714, "kl": 0.020144592970609665, "learning_rate": 7.876106194690266e-07, "loss": -0.0078, "num_tokens": 2100029.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6439993381500244, "sampling/importance_sampling_ratio/mean": 1.0004019737243652, "sampling/importance_sampling_ratio/min": 0.4836675524711609, "sampling/sampling_logp_difference/max": 0.7263574600219727, "sampling/sampling_logp_difference/mean": 0.012492422014474869, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 278.71875, "completions/mean_terminated_length": 278.71875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.29132896661758423, "epoch": 0.16106194690265488, "frac_reward_zero_std": 0.0, "grad_norm": 2.34197496288831, "kl": 0.018009379506111145, "learning_rate": 7.964601769911504e-07, "loss": -0.1033, "num_tokens": 2134299.0, "reward": -0.0625, "reward_std": 0.8083621263504028, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998729228973389, "sampling/importance_sampling_ratio/min": 0.18368913233280182, "sampling/sampling_logp_difference/max": 1.6945104598999023, "sampling/sampling_logp_difference/mean": 0.016460083425045013, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 219.671875, "completions/mean_terminated_length": 219.671875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.19972282648086548, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.75, "grad_norm": 1.3951160912154388, "kl": 0.02044733241200447, "learning_rate": 8.053097345132743e-07, "loss": 0.015, "num_tokens": 2161718.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999643564224243, "sampling/importance_sampling_ratio/min": 0.42877888679504395, "sampling/sampling_logp_difference/max": 0.8468139171600342, "sampling/sampling_logp_difference/mean": 0.01374647207558155, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 160.828125, "completions/mean_terminated_length": 160.828125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.14671990275382996, "epoch": 0.16460176991150444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11922689368433426, "kl": 0.028555843979120255, "learning_rate": 8.141592920353983e-07, "loss": 0.0002, "num_tokens": 2183067.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7233573198318481, "sampling/importance_sampling_ratio/mean": 1.0006091594696045, "sampling/importance_sampling_ratio/min": 0.3682604730129242, "sampling/sampling_logp_difference/max": 0.998964786529541, "sampling/sampling_logp_difference/mean": 0.013144141063094139, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 222.984375, "completions/mean_terminated_length": 222.984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17110270261764526, "epoch": 0.1663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 1.2327153616480566, "kl": 0.015377798117697239, "learning_rate": 8.230088495575221e-07, "loss": 0.0137, "num_tokens": 2207194.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7553249597549438, "sampling/importance_sampling_ratio/mean": 0.9994000792503357, "sampling/importance_sampling_ratio/min": 0.5353015065193176, "sampling/sampling_logp_difference/max": 0.6249251365661621, "sampling/sampling_logp_difference/mean": 0.011808395385742188, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 214.0625, "completions/mean_terminated_length": 214.0625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2683055102825165, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.25, "grad_norm": 2.055100287121122, "kl": 0.041006509214639664, "learning_rate": 8.318584070796459e-07, "loss": 0.0075, "num_tokens": 2233070.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.0000818967819214, "sampling/importance_sampling_ratio/min": 0.4624044597148895, "sampling/sampling_logp_difference/max": 0.771315336227417, "sampling/sampling_logp_difference/mean": 0.01648646593093872, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 181.84375, "completions/mean_terminated_length": 181.84375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.1469574123620987, "epoch": 0.16991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.09259586437219178, "kl": 0.030910933390259743, "learning_rate": 8.407079646017698e-07, "loss": 0.0003, "num_tokens": 2254148.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000370740890503, "sampling/importance_sampling_ratio/min": 0.6114804744720459, "sampling/sampling_logp_difference/max": 0.8979051113128662, "sampling/sampling_logp_difference/mean": 0.010515779256820679, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.13294535875320435, "epoch": 0.17168141592920355, "frac_reward_zero_std": 1.0, "grad_norm": 0.08325716840133078, "kl": 0.031313274055719376, "learning_rate": 8.495575221238938e-07, "loss": 0.0003, "num_tokens": 2273796.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6257354021072388, "sampling/importance_sampling_ratio/mean": 1.0005269050598145, "sampling/importance_sampling_ratio/min": 0.5266066789627075, "sampling/sampling_logp_difference/max": 0.6413013935089111, "sampling/sampling_logp_difference/mean": 0.010202358476817608, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1133.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 293.65625, "completions/mean_terminated_length": 293.65625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3294614553451538, "epoch": 0.17345132743362832, "frac_reward_zero_std": 0.0, "grad_norm": 1.52702243882002, "kl": 0.03418732434511185, "learning_rate": 8.584070796460177e-07, "loss": -0.0118, "num_tokens": 2305486.0, "reward": 0.53125, "reward_std": 0.7535127401351929, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6576597690582275, "sampling/importance_sampling_ratio/mean": 1.0000965595245361, "sampling/importance_sampling_ratio/min": 0.527029812335968, "sampling/sampling_logp_difference/max": 0.640498161315918, "sampling/sampling_logp_difference/mean": 0.01520543359220028, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.20407316088676453, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.704507044362501, "kl": 0.04630480706691742, "learning_rate": 8.672566371681415e-07, "loss": 0.0126, "num_tokens": 2327734.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8747789859771729, "sampling/importance_sampling_ratio/mean": 1.0002317428588867, "sampling/importance_sampling_ratio/min": 0.3969099521636963, "sampling/sampling_logp_difference/max": 0.9240458607673645, "sampling/sampling_logp_difference/mean": 0.014635585248470306, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 237.640625, "completions/mean_terminated_length": 237.640625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1891111135482788, "epoch": 0.17699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.06537608966488662, "kl": 0.037952832877635956, "learning_rate": 8.761061946902655e-07, "loss": 0.0003, "num_tokens": 2358335.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.556389570236206, "sampling/importance_sampling_ratio/mean": 0.9997446537017822, "sampling/importance_sampling_ratio/min": 0.4787577688694, "sampling/sampling_logp_difference/max": 0.7365604639053345, "sampling/sampling_logp_difference/mean": 0.013384765014052391, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3202126622200012, "epoch": 0.17876106194690267, "frac_reward_zero_std": 0.5, "grad_norm": 1.2989603363464788, "kl": 0.03548089414834976, "learning_rate": 8.849557522123894e-07, "loss": -0.0198, "num_tokens": 2389799.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.723695158958435, "sampling/importance_sampling_ratio/mean": 0.9999679327011108, "sampling/importance_sampling_ratio/min": 0.40947872400283813, "sampling/sampling_logp_difference/max": 0.892870306968689, "sampling/sampling_logp_difference/mean": 0.013907065615057945, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 226.140625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.1599074900150299, "epoch": 0.18053097345132743, "frac_reward_zero_std": 1.0, "grad_norm": 0.08742824634152596, "kl": 0.05473973602056503, "learning_rate": 8.938053097345132e-07, "loss": 0.0005, "num_tokens": 2413760.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.578529715538025, "sampling/importance_sampling_ratio/mean": 0.9995525479316711, "sampling/importance_sampling_ratio/min": 0.3840782642364502, "sampling/sampling_logp_difference/max": 0.9569089412689209, "sampling/sampling_logp_difference/mean": 0.011220966465771198, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 152.984375, "completions/mean_terminated_length": 152.984375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.22016102075576782, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.5, "grad_norm": 2.585278634243758, "kl": 0.061252959072589874, "learning_rate": 9.026548672566371e-07, "loss": 0.0071, "num_tokens": 2434223.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5973830223083496, "sampling/importance_sampling_ratio/mean": 0.9994176030158997, "sampling/importance_sampling_ratio/min": 0.0669156089425087, "sampling/sampling_logp_difference/max": 2.7043230533599854, "sampling/sampling_logp_difference/mean": 0.015180688351392746, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 181.265625, "completions/mean_terminated_length": 181.265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2178543508052826, "epoch": 0.184070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.2444956747282565, "kl": 0.06331880390644073, "learning_rate": 9.11504424778761e-07, "loss": -0.0173, "num_tokens": 2455616.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8702857494354248, "sampling/importance_sampling_ratio/mean": 0.9998196959495544, "sampling/importance_sampling_ratio/min": 0.5524421334266663, "sampling/sampling_logp_difference/max": 0.6260912418365479, "sampling/sampling_logp_difference/mean": 0.01474955677986145, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 231.8125, "completions/mean_terminated_length": 231.8125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.2739162743091583, "epoch": 0.18584070796460178, "frac_reward_zero_std": 0.25, "grad_norm": 1.6793493512931617, "kl": 0.08697757869958878, "learning_rate": 9.203539823008849e-07, "loss": 0.0646, "num_tokens": 2485460.0, "reward": -0.125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6217986345291138, "sampling/importance_sampling_ratio/mean": 0.9996331930160522, "sampling/importance_sampling_ratio/min": 0.049657661467790604, "sampling/sampling_logp_difference/max": 3.0026025772094727, "sampling/sampling_logp_difference/mean": 0.015606374479830265, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 268.546875, "completions/mean_terminated_length": 268.546875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3272721469402313, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.5, "grad_norm": 1.1741161864785092, "kl": 0.04569275304675102, "learning_rate": 9.292035398230088e-07, "loss": 0.027, "num_tokens": 2515783.0, "reward": -0.15625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.9877281188964844, "sampling/importance_sampling_ratio/mean": 1.0006380081176758, "sampling/importance_sampling_ratio/min": 0.60909503698349, "sampling/sampling_logp_difference/max": 0.6869924068450928, "sampling/sampling_logp_difference/mean": 0.014596743509173393, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 207.40625, "completions/mean_terminated_length": 207.40625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.23827733099460602, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.75, "grad_norm": 1.3147062156970355, "kl": 0.07311700284481049, "learning_rate": 9.380530973451328e-07, "loss": -0.007, "num_tokens": 2540801.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004336833953857, "sampling/importance_sampling_ratio/min": 0.4764232635498047, "sampling/sampling_logp_difference/max": 0.7414486408233643, "sampling/sampling_logp_difference/mean": 0.014896559529006481, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 240.671875, "completions/mean_terminated_length": 240.671875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.23883236944675446, "epoch": 0.1911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.06659212904834758, "kl": 0.0521988570690155, "learning_rate": 9.469026548672566e-07, "loss": 0.0005, "num_tokens": 2567100.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 1.000476598739624, "sampling/importance_sampling_ratio/min": 0.6124340891838074, "sampling/sampling_logp_difference/max": 0.4985384941101074, "sampling/sampling_logp_difference/mean": 0.014996771700680256, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 216.78125, "completions/mean_terminated_length": 216.78125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.236363023519516, "epoch": 0.1929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.09059580829032168, "kl": 0.06687293946743011, "learning_rate": 9.557522123893805e-07, "loss": 0.0005, "num_tokens": 2591502.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6978429555892944, "sampling/importance_sampling_ratio/mean": 0.9998354315757751, "sampling/importance_sampling_ratio/min": 0.5478631854057312, "sampling/sampling_logp_difference/max": 0.6017296314239502, "sampling/sampling_logp_difference/mean": 0.013860290870070457, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 204.734375, "completions/mean_terminated_length": 204.734375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19048988819122314, "epoch": 0.19469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.20653801080919615, "kl": 0.07563050091266632, "learning_rate": 9.646017699115042e-07, "loss": 0.0007, "num_tokens": 2614733.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0013582706451416, "sampling/importance_sampling_ratio/min": 0.49583864212036133, "sampling/sampling_logp_difference/max": 1.1189532279968262, "sampling/sampling_logp_difference/mean": 0.015409222804009914, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 259.828125, "completions/mean_terminated_length": 259.828125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2620220184326172, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.5, "grad_norm": 1.4793746510900267, "kl": 0.05652560293674469, "learning_rate": 9.734513274336282e-07, "loss": 0.0691, "num_tokens": 2645458.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001739263534546, "sampling/importance_sampling_ratio/min": 0.47517821192741394, "sampling/sampling_logp_difference/max": 0.8876917362213135, "sampling/sampling_logp_difference/mean": 0.01437767967581749, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 213.15625, "completions/mean_terminated_length": 213.15625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.2658623158931732, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.870100142500517, "kl": 0.04823882505297661, "learning_rate": 9.82300884955752e-07, "loss": -0.0179, "num_tokens": 2677676.0, "reward": 0.375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8722188472747803, "sampling/importance_sampling_ratio/mean": 0.9997239112854004, "sampling/importance_sampling_ratio/min": 0.30714571475982666, "sampling/sampling_logp_difference/max": 1.1804330348968506, "sampling/sampling_logp_difference/mean": 0.017588764429092407, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 173.734375, "completions/mean_terminated_length": 173.734375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2426486611366272, "epoch": 0.2, "frac_reward_zero_std": 0.5, "grad_norm": 1.9514831071692837, "kl": 0.05591709166765213, "learning_rate": 9.91150442477876e-07, "loss": 0.0022, "num_tokens": 2703147.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5589383840560913, "sampling/importance_sampling_ratio/mean": 0.9995216131210327, "sampling/importance_sampling_ratio/min": 0.49919456243515015, "sampling/sampling_logp_difference/max": 0.6947593688964844, "sampling/sampling_logp_difference/mean": 0.014397826045751572, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2557012736797333, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.6716524343673684, "kl": 0.08788169920444489, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 2723371.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004115104675293, "sampling/importance_sampling_ratio/min": 0.3945864140987396, "sampling/sampling_logp_difference/max": 0.9299170970916748, "sampling/sampling_logp_difference/mean": 0.01942257210612297, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.21108892560005188, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.75, "grad_norm": 1.0196786022825157, "kl": 0.033624231815338135, "learning_rate": 9.99997614400677e-07, "loss": 0.0169, "num_tokens": 2749723.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9876788854599, "sampling/importance_sampling_ratio/mean": 0.9999074935913086, "sampling/importance_sampling_ratio/min": 0.5111927390098572, "sampling/sampling_logp_difference/max": 0.6869676113128662, "sampling/sampling_logp_difference/mean": 0.012366114184260368, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 246.078125, "completions/mean_terminated_length": 246.078125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2875988483428955, "epoch": 0.20530973451327433, "frac_reward_zero_std": 0.75, "grad_norm": 1.0129447711376403, "kl": 0.05211128666996956, "learning_rate": 9.999904576254724e-07, "loss": -0.0072, "num_tokens": 2779888.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9099901914596558, "sampling/importance_sampling_ratio/mean": 1.0008246898651123, "sampling/importance_sampling_ratio/min": 0.6159173846244812, "sampling/sampling_logp_difference/max": 0.6470980644226074, "sampling/sampling_logp_difference/mean": 0.015564092434942722, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 182.703125, "completions/mean_terminated_length": 182.703125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.26912015676498413, "epoch": 0.20707964601769913, "frac_reward_zero_std": 0.5, "grad_norm": 1.9767072934837584, "kl": 0.09913481026887894, "learning_rate": 9.999785297426788e-07, "loss": 0.0642, "num_tokens": 2803277.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4126489162445068, "sampling/importance_sampling_ratio/mean": 0.9992721080780029, "sampling/importance_sampling_ratio/min": 0.4596952497959137, "sampling/sampling_logp_difference/max": 0.7771915197372437, "sampling/sampling_logp_difference/mean": 0.014721906743943691, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.2546720504760742, "epoch": 0.2088495575221239, "frac_reward_zero_std": 0.5, "grad_norm": 1.7649745716762282, "kl": 0.05629459023475647, "learning_rate": 9.999618308661168e-07, "loss": 0.0017, "num_tokens": 2825353.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6095545291900635, "sampling/importance_sampling_ratio/mean": 0.9999802708625793, "sampling/importance_sampling_ratio/min": 0.09610749036073685, "sampling/sampling_logp_difference/max": 2.342288017272949, "sampling/sampling_logp_difference/mean": 0.015184259973466396, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 167.0625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17413076758384705, "epoch": 0.21061946902654868, "frac_reward_zero_std": 0.75, "grad_norm": 1.1274353356418567, "kl": 0.06079719215631485, "learning_rate": 9.99940361155134e-07, "loss": -0.064, "num_tokens": 2845261.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001338720321655, "sampling/importance_sampling_ratio/min": 0.49466392397880554, "sampling/sampling_logp_difference/max": 0.8144795894622803, "sampling/sampling_logp_difference/mean": 0.013794423080980778, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 245.046875, "completions/mean_terminated_length": 245.046875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3485606908798218, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 1.0795980321006873, "kl": 0.07146261632442474, "learning_rate": 9.999141208146027e-07, "loss": 0.0148, "num_tokens": 2872800.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5975104570388794, "sampling/importance_sampling_ratio/mean": 0.9996150732040405, "sampling/importance_sampling_ratio/min": 0.29496005177497864, "sampling/sampling_logp_difference/max": 1.2209153175354004, "sampling/sampling_logp_difference/mean": 0.017085013911128044, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 172.234375, "completions/mean_terminated_length": 172.234375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2647247314453125, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.75, "grad_norm": 1.77772041815243, "kl": 0.05289836972951889, "learning_rate": 9.998831100949186e-07, "loss": -0.0088, "num_tokens": 2897935.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8765528202056885, "sampling/importance_sampling_ratio/mean": 1.0002882480621338, "sampling/importance_sampling_ratio/min": 0.4817925989627838, "sampling/sampling_logp_difference/max": 0.7302415370941162, "sampling/sampling_logp_difference/mean": 0.015579603612422943, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 157.375, "completions/mean_terminated_length": 157.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.20127081871032715, "epoch": 0.215929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.07997509442496996, "kl": 0.06404447555541992, "learning_rate": 9.998473292919985e-07, "loss": 0.0006, "num_tokens": 2920935.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6554591655731201, "sampling/importance_sampling_ratio/mean": 0.9998010993003845, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.5259230136871338, "sampling/sampling_logp_difference/mean": 0.014640753157436848, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 81.4375, "completions/mean_terminated_length": 81.4375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.16063877940177917, "epoch": 0.2176991150442478, "frac_reward_zero_std": 1.0, "grad_norm": 0.22264701748554352, "kl": 0.05465121939778328, "learning_rate": 9.99806778747277e-07, "loss": 0.0005, "num_tokens": 2935779.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.831132173538208, "sampling/importance_sampling_ratio/mean": 0.9984936714172363, "sampling/importance_sampling_ratio/min": 0.5384897589683533, "sampling/sampling_logp_difference/max": 0.6189867258071899, "sampling/sampling_logp_difference/mean": 0.01394605077803135, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 83.890625, "completions/mean_terminated_length": 83.890625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.14302532374858856, "epoch": 0.21946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.1311563129801875, "kl": 0.0500192865729332, "learning_rate": 9.997614588477033e-07, "loss": 0.0005, "num_tokens": 2951260.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8573094606399536, "sampling/importance_sampling_ratio/mean": 0.9990832805633545, "sampling/importance_sampling_ratio/min": 0.4955776035785675, "sampling/sampling_logp_difference/max": 0.7020313739776611, "sampling/sampling_logp_difference/mean": 0.013651306740939617, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.1853649616241455, "epoch": 0.22123893805309736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0641921562695447, "kl": 0.04823670536279678, "learning_rate": 9.99711370025738e-07, "loss": 0.0004, "num_tokens": 2974316.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467828512191772, "sampling/importance_sampling_ratio/mean": 1.000376582145691, "sampling/importance_sampling_ratio/min": 0.5576075911521912, "sampling/sampling_logp_difference/max": 0.5840997695922852, "sampling/sampling_logp_difference/mean": 0.011807742528617382, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 171.828125, "completions/mean_terminated_length": 171.828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.19575996696949005, "epoch": 0.22300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 1.3807434042307594, "kl": 0.039622049778699875, "learning_rate": 9.996565127593489e-07, "loss": -0.0258, "num_tokens": 2994993.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6060998439788818, "sampling/importance_sampling_ratio/mean": 1.0007834434509277, "sampling/importance_sampling_ratio/min": 0.4978904724121094, "sampling/sampling_logp_difference/max": 0.6973751783370972, "sampling/sampling_logp_difference/mean": 0.01246834360063076, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.21818499267101288, "epoch": 0.2247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 2.1266328310549576, "kl": 0.04369910806417465, "learning_rate": 9.995968875720051e-07, "loss": 0.0205, "num_tokens": 3017073.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.9486087560653687, "sampling/importance_sampling_ratio/mean": 0.9993820190429688, "sampling/importance_sampling_ratio/min": 0.5328285694122314, "sampling/sampling_logp_difference/max": 0.6671156883239746, "sampling/sampling_logp_difference/mean": 0.014706995338201523, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 175.53125, "completions/mean_terminated_length": 175.53125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.20195582509040833, "epoch": 0.22654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 1.4259870535676444, "kl": 0.04825274646282196, "learning_rate": 9.995324950326745e-07, "loss": 0.0107, "num_tokens": 3039651.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997382164001465, "sampling/importance_sampling_ratio/min": 0.29544302821159363, "sampling/sampling_logp_difference/max": 1.2192792892456055, "sampling/sampling_logp_difference/mean": 0.013305413536727428, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.22373376786708832, "epoch": 0.22831858407079647, "frac_reward_zero_std": 1.0, "grad_norm": 0.07069851761974077, "kl": 0.06062999367713928, "learning_rate": 9.994633357558158e-07, "loss": 0.0006, "num_tokens": 3060795.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007473230361938, "sampling/importance_sampling_ratio/min": 0.6056891679763794, "sampling/sampling_logp_difference/max": 0.8462793827056885, "sampling/sampling_logp_difference/mean": 0.013714168220758438, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 135.609375, "completions/mean_terminated_length": 135.609375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.27379995584487915, "epoch": 0.23008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.08953847298805089, "kl": 0.05032063275575638, "learning_rate": 9.993894104013746e-07, "loss": 0.0005, "num_tokens": 3079842.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7552759647369385, "sampling/importance_sampling_ratio/mean": 1.0004489421844482, "sampling/importance_sampling_ratio/min": 0.5583280920982361, "sampling/sampling_logp_difference/max": 0.5828084945678711, "sampling/sampling_logp_difference/mean": 0.01699213497340679, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 127.859375, "completions/mean_terminated_length": 127.859375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18072837591171265, "epoch": 0.23185840707964603, "frac_reward_zero_std": 0.75, "grad_norm": 1.7145470840045622, "kl": 0.03703214228153229, "learning_rate": 9.993107196747758e-07, "loss": 0.0028, "num_tokens": 3097737.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012195110321045, "sampling/importance_sampling_ratio/min": 0.503804087638855, "sampling/sampling_logp_difference/max": 0.7053120136260986, "sampling/sampling_logp_difference/mean": 0.014043048024177551, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 186.203125, "completions/mean_terminated_length": 186.203125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.323123574256897, "epoch": 0.2336283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 1.800959872626628, "kl": 0.04085113853216171, "learning_rate": 9.99227264326918e-07, "loss": -0.0454, "num_tokens": 3121702.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9997234344482422, "sampling/importance_sampling_ratio/mean": 0.9996789693832397, "sampling/importance_sampling_ratio/min": 0.4441255033016205, "sampling/sampling_logp_difference/max": 0.8116481304168701, "sampling/sampling_logp_difference/mean": 0.016510576009750366, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 158.421875, "completions/mean_terminated_length": 158.421875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.33236420154571533, "epoch": 0.23539823008849559, "frac_reward_zero_std": 0.5, "grad_norm": 1.9055793451547123, "kl": 0.05743710696697235, "learning_rate": 9.991390451541648e-07, "loss": -0.0634, "num_tokens": 3145569.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995548129081726, "sampling/importance_sampling_ratio/min": 0.5193193554878235, "sampling/sampling_logp_difference/max": 0.7912988662719727, "sampling/sampling_logp_difference/mean": 0.01805954799056053, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3111136555671692, "epoch": 0.23716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 1.311158125425309, "kl": 0.04814206063747406, "learning_rate": 9.990460629983388e-07, "loss": 0.0205, "num_tokens": 3168569.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011968612670898, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.7374453544616699, "sampling/sampling_logp_difference/mean": 0.016848810017108917, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 140.96875, "completions/mean_terminated_length": 140.96875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2111259400844574, "epoch": 0.23893805309734514, "frac_reward_zero_std": 0.75, "grad_norm": 5.3230060796163166, "kl": 0.03357328847050667, "learning_rate": 9.989483187467125e-07, "loss": 0.0245, "num_tokens": 3188871.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.621081829071045, "sampling/importance_sampling_ratio/mean": 1.001030445098877, "sampling/importance_sampling_ratio/min": 0.41956886649131775, "sampling/sampling_logp_difference/max": 0.8685276508331299, "sampling/sampling_logp_difference/mean": 0.015535259619355202, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 106.546875, "completions/mean_terminated_length": 106.546875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.18051879107952118, "epoch": 0.2407079646017699, "frac_reward_zero_std": 1.0, "grad_norm": 0.08539384287438397, "kl": 0.04960373789072037, "learning_rate": 9.988458133320008e-07, "loss": 0.0005, "num_tokens": 3205514.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991469979286194, "sampling/importance_sampling_ratio/min": 0.23222419619560242, "sampling/sampling_logp_difference/max": 1.4600520133972168, "sampling/sampling_logp_difference/mean": 0.015370067209005356, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 204.90625, "completions/mean_terminated_length": 204.90625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2886112332344055, "epoch": 0.2424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 1.5977298716689574, "kl": 0.043666113168001175, "learning_rate": 9.987385477323506e-07, "loss": -0.0104, "num_tokens": 3228660.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6629879474639893, "sampling/importance_sampling_ratio/mean": 1.0003365278244019, "sampling/importance_sampling_ratio/min": 0.5337882041931152, "sampling/sampling_logp_difference/max": 0.6277561187744141, "sampling/sampling_logp_difference/mean": 0.016267741098999977, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 142.609375, "completions/mean_terminated_length": 142.609375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.22132322192192078, "epoch": 0.24424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 2.470226000179731, "kl": 0.03379517421126366, "learning_rate": 9.98626522971333e-07, "loss": 0.0857, "num_tokens": 3249611.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000138759613037, "sampling/importance_sampling_ratio/min": 0.5147907733917236, "sampling/sampling_logp_difference/max": 0.7252029180526733, "sampling/sampling_logp_difference/mean": 0.015405265614390373, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 132.46875, "completions/mean_terminated_length": 132.46875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.20629651844501495, "epoch": 0.24601769911504426, "frac_reward_zero_std": 0.75, "grad_norm": 1.7363424852428704, "kl": 0.046867236495018005, "learning_rate": 9.985097401179333e-07, "loss": -0.0498, "num_tokens": 3268137.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.986655592918396, "sampling/importance_sampling_ratio/mean": 1.000293254852295, "sampling/importance_sampling_ratio/min": 0.5087478756904602, "sampling/sampling_logp_difference/max": 0.6864526271820068, "sampling/sampling_logp_difference/mean": 0.016203537583351135, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 264.859375, "completions/mean_terminated_length": 264.859375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.17257320880889893, "epoch": 0.24778761061946902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9239956513635336, "kl": 0.030428174883127213, "learning_rate": 9.98388200286539e-07, "loss": -0.3853, "num_tokens": 3294848.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6448955535888672, "sampling/importance_sampling_ratio/mean": 0.998746931552887, "sampling/importance_sampling_ratio/min": 0.471780002117157, "sampling/sampling_logp_difference/max": 0.7512425780296326, "sampling/sampling_logp_difference/mean": 0.011864934116601944, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 234.8125, "completions/mean_terminated_length": 234.8125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3245481550693512, "epoch": 0.24955752212389382, "frac_reward_zero_std": 0.5, "grad_norm": 1.4947412678195762, "kl": 0.04162580892443657, "learning_rate": 9.98261904636932e-07, "loss": -0.0037, "num_tokens": 3319988.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6176438331604004, "sampling/importance_sampling_ratio/mean": 1.0005638599395752, "sampling/importance_sampling_ratio/min": 0.6266666054725647, "sampling/sampling_logp_difference/max": 0.4809706211090088, "sampling/sampling_logp_difference/mean": 0.014213096350431442, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.37478187680244446, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 0.8110948287864098, "kl": 0.03772830218076706, "learning_rate": 9.981308543742756e-07, "loss": 0.037, "num_tokens": 3352332.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.645712971687317, "sampling/importance_sampling_ratio/mean": 1.0003209114074707, "sampling/importance_sampling_ratio/min": 0.6463695168495178, "sampling/sampling_logp_difference/max": 0.49817371368408203, "sampling/sampling_logp_difference/mean": 0.015004015527665615, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 129.234375, "completions/mean_terminated_length": 129.234375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.202674001455307, "epoch": 0.25309734513274335, "frac_reward_zero_std": 0.75, "grad_norm": 1.7450249167045533, "kl": 0.03424923121929169, "learning_rate": 9.979950507491033e-07, "loss": -0.0068, "num_tokens": 3370635.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 0.9997340440750122, "sampling/importance_sampling_ratio/min": 0.6368716359138489, "sampling/sampling_logp_difference/max": 0.49199533462524414, "sampling/sampling_logp_difference/mean": 0.012455548159778118, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 161.453125, "completions/mean_terminated_length": 161.453125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.1879473626613617, "epoch": 0.25486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 1.4572434561974212, "kl": 0.031219318509101868, "learning_rate": 9.978544950573073e-07, "loss": 0.0159, "num_tokens": 3389448.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005161762237549, "sampling/importance_sampling_ratio/min": 0.56766277551651, "sampling/sampling_logp_difference/max": 0.7144360542297363, "sampling/sampling_logp_difference/mean": 0.012587983161211014, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1761074811220169, "epoch": 0.25663716814159293, "frac_reward_zero_std": 0.75, "grad_norm": 1.4888018866238095, "kl": 0.036779992282390594, "learning_rate": 9.97709188640126e-07, "loss": 0.0276, "num_tokens": 3409320.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.844468593597412, "sampling/importance_sampling_ratio/mean": 0.9997842311859131, "sampling/importance_sampling_ratio/min": 0.5999211668968201, "sampling/sampling_logp_difference/max": 0.6121912002563477, "sampling/sampling_logp_difference/mean": 0.012412301264703274, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 223.203125, "completions/mean_terminated_length": 223.203125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.29057860374450684, "epoch": 0.2584070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.6319849425097723, "kl": 0.03959917277097702, "learning_rate": 9.975591328841304e-07, "loss": -0.0659, "num_tokens": 3436117.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.862831711769104, "sampling/importance_sampling_ratio/mean": 0.9997316002845764, "sampling/importance_sampling_ratio/min": 0.4762316644191742, "sampling/sampling_logp_difference/max": 0.7418508529663086, "sampling/sampling_logp_difference/mean": 0.017581796273589134, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 147.046875, "completions/mean_terminated_length": 147.046875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3049260079860687, "epoch": 0.26017699115044246, "frac_reward_zero_std": 0.75, "grad_norm": 1.3988666029596748, "kl": 0.038446083664894104, "learning_rate": 9.974043292212127e-07, "loss": 0.0172, "num_tokens": 3462616.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994754791259766, "sampling/importance_sampling_ratio/min": 0.5798601508140564, "sampling/sampling_logp_difference/max": 0.7348442077636719, "sampling/sampling_logp_difference/mean": 0.015258293598890305, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 259.421875, "completions/mean_terminated_length": 259.421875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.34669095277786255, "epoch": 0.26194690265486725, "frac_reward_zero_std": 0.25, "grad_norm": 1.5683862448049795, "kl": 0.046381495893001556, "learning_rate": 9.97244779128571e-07, "loss": 0.0047, "num_tokens": 3492099.0, "reward": -0.09375, "reward_std": 0.6769562363624573, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.601962685585022, "sampling/importance_sampling_ratio/mean": 1.00082266330719, "sampling/importance_sampling_ratio/min": 0.5347745418548584, "sampling/sampling_logp_difference/max": 0.6259100437164307, "sampling/sampling_logp_difference/mean": 0.015078282915055752, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 171.6875, "completions/mean_terminated_length": 171.6875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.27948006987571716, "epoch": 0.26371681415929205, "frac_reward_zero_std": 0.5, "grad_norm": 1.8030906659466106, "kl": 0.04792959988117218, "learning_rate": 9.970804841286953e-07, "loss": 0.0178, "num_tokens": 3513663.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5294251441955566, "sampling/importance_sampling_ratio/mean": 0.9999539852142334, "sampling/importance_sampling_ratio/min": 0.6177488565444946, "sampling/sampling_logp_difference/max": 0.4816732406616211, "sampling/sampling_logp_difference/mean": 0.01500643976032734, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 207.328125, "completions/mean_terminated_length": 207.328125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2452256977558136, "epoch": 0.26548672566371684, "frac_reward_zero_std": 0.75, "grad_norm": 1.1505435654463292, "kl": 0.060367733240127563, "learning_rate": 9.969114457893539e-07, "loss": -0.0143, "num_tokens": 3539508.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.507159948348999, "sampling/importance_sampling_ratio/mean": 0.9999010562896729, "sampling/importance_sampling_ratio/min": 0.5131088495254517, "sampling/sampling_logp_difference/max": 0.6672673225402832, "sampling/sampling_logp_difference/mean": 0.012939774431288242, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 144.453125, "completions/mean_terminated_length": 144.453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2799568772315979, "epoch": 0.2672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 1.570090336354742, "kl": 0.04856320470571518, "learning_rate": 9.967376657235778e-07, "loss": -0.0072, "num_tokens": 3559137.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.612134575843811, "sampling/importance_sampling_ratio/mean": 0.9999405741691589, "sampling/importance_sampling_ratio/min": 0.4382757246494293, "sampling/sampling_logp_difference/max": 0.8249070644378662, "sampling/sampling_logp_difference/mean": 0.016361702233552933, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 159.40625, "completions/mean_terminated_length": 159.40625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2526318430900574, "epoch": 0.26902654867256637, "frac_reward_zero_std": 0.5, "grad_norm": 1.972757595252805, "kl": 0.05048777163028717, "learning_rate": 9.965591455896455e-07, "loss": -0.0239, "num_tokens": 3581051.0, "reward": 0.59375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5939339399337769, "sampling/importance_sampling_ratio/mean": 1.0001716613769531, "sampling/importance_sampling_ratio/min": 0.5696070790290833, "sampling/sampling_logp_difference/max": 0.5628085136413574, "sampling/sampling_logp_difference/mean": 0.016113288700580597, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 339.234375, "completions/mean_terminated_length": 339.234375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16680997610092163, "epoch": 0.27079646017699116, "frac_reward_zero_std": 0.75, "grad_norm": 0.8429714517674914, "kl": 0.05585601180791855, "learning_rate": 9.96375887091067e-07, "loss": -0.0521, "num_tokens": 3615466.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.556276798248291, "sampling/importance_sampling_ratio/mean": 0.9997667074203491, "sampling/importance_sampling_ratio/min": 0.5495362877845764, "sampling/sampling_logp_difference/max": 0.5986804962158203, "sampling/sampling_logp_difference/mean": 0.010948631912469864, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 249.71875, "completions/mean_terminated_length": 249.71875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.20812052488327026, "epoch": 0.27256637168141595, "frac_reward_zero_std": 0.75, "grad_norm": 1.0031975806921591, "kl": 0.04731081426143646, "learning_rate": 9.961878919765677e-07, "loss": -0.0147, "num_tokens": 3644616.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6430031061172485, "sampling/importance_sampling_ratio/mean": 1.0005505084991455, "sampling/importance_sampling_ratio/min": 0.6268644332885742, "sampling/sampling_logp_difference/max": 0.49652576446533203, "sampling/sampling_logp_difference/mean": 0.012204162776470184, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 107.421875, "completions/mean_terminated_length": 107.421875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.18409234285354614, "epoch": 0.2743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 2.571020758734286, "kl": 0.07421048730611801, "learning_rate": 9.959951620400718e-07, "loss": -0.0398, "num_tokens": 3660979.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5522903203964233, "sampling/importance_sampling_ratio/mean": 0.9994438290596008, "sampling/importance_sampling_ratio/min": 0.3204302191734314, "sampling/sampling_logp_difference/max": 1.13809072971344, "sampling/sampling_logp_difference/mean": 0.015559653751552105, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 208.859375, "completions/mean_terminated_length": 208.859375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2387533336877823, "epoch": 0.2761061946902655, "frac_reward_zero_std": 0.75, "grad_norm": 1.2396098839771237, "kl": 0.04698648303747177, "learning_rate": 9.957976991206845e-07, "loss": -0.0115, "num_tokens": 3685194.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.57460618019104, "sampling/importance_sampling_ratio/mean": 1.0006206035614014, "sampling/importance_sampling_ratio/min": 0.3511415719985962, "sampling/sampling_logp_difference/max": 1.0465657711029053, "sampling/sampling_logp_difference/mean": 0.015074488706886768, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 175.203125, "completions/mean_terminated_length": 175.203125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.27575457096099854, "epoch": 0.2778761061946903, "frac_reward_zero_std": 0.5, "grad_norm": 1.976355942606349, "kl": 0.06410539150238037, "learning_rate": 9.955955051026758e-07, "loss": 0.01, "num_tokens": 3708583.0, "reward": 0.40625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8182222843170166, "sampling/importance_sampling_ratio/mean": 1.0003331899642944, "sampling/importance_sampling_ratio/min": 0.37312787771224976, "sampling/sampling_logp_difference/max": 0.9858341217041016, "sampling/sampling_logp_difference/mean": 0.016387388110160828, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 171.15625, "completions/mean_terminated_length": 171.15625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.18562407791614532, "epoch": 0.27964601769911507, "frac_reward_zero_std": 1.0, "grad_norm": 0.09961459968127882, "kl": 0.053850047290325165, "learning_rate": 9.953885819154614e-07, "loss": 0.0005, "num_tokens": 3730241.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5033448934555054, "sampling/importance_sampling_ratio/mean": 0.9995132684707642, "sampling/importance_sampling_ratio/min": 0.6132686734199524, "sampling/sampling_logp_difference/max": 0.4889521598815918, "sampling/sampling_logp_difference/mean": 0.011579981073737144, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 170.03125, "completions/mean_terminated_length": 170.03125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.250222384929657, "epoch": 0.2814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.3992419064793753, "kl": 0.11093031615018845, "learning_rate": 9.951769315335843e-07, "loss": -0.0026, "num_tokens": 3752179.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5764904022216797, "sampling/importance_sampling_ratio/mean": 1.0005743503570557, "sampling/importance_sampling_ratio/min": 0.6147308349609375, "sampling/sampling_logp_difference/max": 0.4865708351135254, "sampling/sampling_logp_difference/mean": 0.014710028655827045, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 184.484375, "completions/mean_terminated_length": 184.484375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.24355477094650269, "epoch": 0.2831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 1.3486923650665052, "kl": 0.08063840866088867, "learning_rate": 9.949605559766967e-07, "loss": 0.0059, "num_tokens": 3775394.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6289809942245483, "sampling/importance_sampling_ratio/mean": 0.9995386600494385, "sampling/importance_sampling_ratio/min": 0.6168261170387268, "sampling/sampling_logp_difference/max": 0.48795461654663086, "sampling/sampling_logp_difference/mean": 0.01415097527205944, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 173.015625, "completions/mean_terminated_length": 173.015625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2436259686946869, "epoch": 0.2849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 1.3425878385707986, "kl": 0.08088184148073196, "learning_rate": 9.947394573095402e-07, "loss": -0.0146, "num_tokens": 3797155.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 0.9999350905418396, "sampling/importance_sampling_ratio/min": 0.6379815340042114, "sampling/sampling_logp_difference/max": 0.6648273468017578, "sampling/sampling_logp_difference/mean": 0.014972914941608906, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.17000535130500793, "epoch": 0.2867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.11828672964775479, "kl": 0.06619326770305634, "learning_rate": 9.945136376419258e-07, "loss": 0.0008, "num_tokens": 3815971.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5745176076889038, "sampling/importance_sampling_ratio/mean": 0.9997663497924805, "sampling/importance_sampling_ratio/min": 0.60896235704422, "sampling/sampling_logp_difference/max": 0.4959988594055176, "sampling/sampling_logp_difference/mean": 0.014581337571144104, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.21023070812225342, "epoch": 0.2884955752212389, "frac_reward_zero_std": 1.0, "grad_norm": 0.1885860391975678, "kl": 0.08915962278842926, "learning_rate": 9.942830991287149e-07, "loss": 0.0008, "num_tokens": 3839047.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999798536300659, "sampling/importance_sampling_ratio/min": 0.48880404233932495, "sampling/sampling_logp_difference/max": 0.7188894748687744, "sampling/sampling_logp_difference/mean": 0.015480308793485165, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1358480155467987, "epoch": 0.2902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.06586042045823841, "kl": 0.05471270903944969, "learning_rate": 9.940478439697972e-07, "loss": 0.0006, "num_tokens": 3857234.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5569746494293213, "sampling/importance_sampling_ratio/mean": 0.9994601607322693, "sampling/importance_sampling_ratio/min": 0.6134350895881653, "sampling/sampling_logp_difference/max": 0.4886808395385742, "sampling/sampling_logp_difference/mean": 0.010513011366128922, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 224.109375, "completions/mean_terminated_length": 224.109375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.29118549823760986, "epoch": 0.2920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 1.639661615975499, "kl": 0.04187390208244324, "learning_rate": 9.93807874410071e-07, "loss": -0.0555, "num_tokens": 3884105.0, "reward": 0.0, "reward_std": 0.5163977742195129, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.867973804473877, "sampling/importance_sampling_ratio/mean": 0.9997695088386536, "sampling/importance_sampling_ratio/min": 0.6369844079017639, "sampling/sampling_logp_difference/max": 0.624854326248169, "sampling/sampling_logp_difference/mean": 0.014386842027306557, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 182.390625, "completions/mean_terminated_length": 182.390625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.22859393060207367, "epoch": 0.2938053097345133, "frac_reward_zero_std": 0.5, "grad_norm": 1.8322476374176873, "kl": 0.06407855451107025, "learning_rate": 9.935631927394214e-07, "loss": 0.0324, "num_tokens": 3907410.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8200724124908447, "sampling/importance_sampling_ratio/mean": 1.00089430809021, "sampling/importance_sampling_ratio/min": 0.5937168598175049, "sampling/sampling_logp_difference/max": 0.5988762378692627, "sampling/sampling_logp_difference/mean": 0.015347898006439209, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.14640483260154724, "epoch": 0.29557522123893804, "frac_reward_zero_std": 0.75, "grad_norm": 2.1273544806872167, "kl": 0.05889119207859039, "learning_rate": 9.93313801292698e-07, "loss": -0.0294, "num_tokens": 3927426.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5745105743408203, "sampling/importance_sampling_ratio/mean": 1.000089168548584, "sampling/importance_sampling_ratio/min": 0.4313249886035919, "sampling/sampling_logp_difference/max": 0.8408935070037842, "sampling/sampling_logp_difference/mean": 0.012749769724905491, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 177.734375, "completions/mean_terminated_length": 177.734375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.12378593534231186, "epoch": 0.2973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.06522292186516773, "kl": 0.03565538674592972, "learning_rate": 9.93059702449693e-07, "loss": 0.0003, "num_tokens": 3948433.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9125655889511108, "sampling/importance_sampling_ratio/mean": 0.999817967414856, "sampling/importance_sampling_ratio/min": 0.4954244792461395, "sampling/sampling_logp_difference/max": 0.7023403644561768, "sampling/sampling_logp_difference/mean": 0.01145128719508648, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 135.921875, "completions/mean_terminated_length": 135.921875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.20304086804389954, "epoch": 0.2991150442477876, "frac_reward_zero_std": 0.75, "grad_norm": 1.8437021366194568, "kl": 0.05810399353504181, "learning_rate": 9.928008986351186e-07, "loss": 0.0176, "num_tokens": 3967660.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6980632543563843, "sampling/importance_sampling_ratio/mean": 0.9992431402206421, "sampling/importance_sampling_ratio/min": 0.29765835404396057, "sampling/sampling_logp_difference/max": 1.2118089199066162, "sampling/sampling_logp_difference/mean": 0.016240648925304413, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 206.46875, "completions/mean_terminated_length": 206.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2700306177139282, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 1.0896342469618259, "kl": 0.0563848614692688, "learning_rate": 9.925373923185834e-07, "loss": -0.0158, "num_tokens": 3994986.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.000365138053894, "sampling/importance_sampling_ratio/min": 0.6099445819854736, "sampling/sampling_logp_difference/max": 0.494387149810791, "sampling/sampling_logp_difference/mean": 0.014883225783705711, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 234.84375, "completions/mean_terminated_length": 234.84375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.20097064971923828, "epoch": 0.30265486725663715, "frac_reward_zero_std": 1.0, "grad_norm": 0.07623516286868408, "kl": 0.05854497849941254, "learning_rate": 9.922691860145696e-07, "loss": 0.0005, "num_tokens": 4025696.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9877879619598389, "sampling/importance_sampling_ratio/mean": 1.0005898475646973, "sampling/importance_sampling_ratio/min": 0.4729013741016388, "sampling/sampling_logp_difference/max": 0.748868465423584, "sampling/sampling_logp_difference/mean": 0.016499243676662445, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 224.390625, "completions/mean_terminated_length": 224.390625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.17945589125156403, "epoch": 0.30442477876106194, "frac_reward_zero_std": 1.0, "grad_norm": 0.06972067160311254, "kl": 0.056893207132816315, "learning_rate": 9.919962822824083e-07, "loss": 0.0005, "num_tokens": 4055849.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7722561359405518, "sampling/importance_sampling_ratio/mean": 0.9997329711914062, "sampling/importance_sampling_ratio/min": 0.5239052772521973, "sampling/sampling_logp_difference/max": 0.6464443802833557, "sampling/sampling_logp_difference/mean": 0.01286320574581623, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 90.96875, "completions/mean_terminated_length": 90.96875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.1225813776254654, "epoch": 0.30619469026548674, "frac_reward_zero_std": 1.0, "grad_norm": 0.22576671565264844, "kl": 0.08146051317453384, "learning_rate": 9.91718683726255e-07, "loss": 0.0008, "num_tokens": 4070775.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008974075317383, "sampling/importance_sampling_ratio/min": 0.22563625872135162, "sampling/sampling_logp_difference/max": 1.4888310432434082, "sampling/sampling_logp_difference/mean": 0.01603073626756668, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 239.21875, "completions/mean_terminated_length": 239.21875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1834007203578949, "epoch": 0.30796460176991153, "frac_reward_zero_std": 0.75, "grad_norm": 0.9959408178654533, "kl": 0.07882221788167953, "learning_rate": 9.914363929950657e-07, "loss": 0.0004, "num_tokens": 4096277.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.98067045211792, "sampling/importance_sampling_ratio/mean": 0.9997706413269043, "sampling/importance_sampling_ratio/min": 0.48638439178466797, "sampling/sampling_logp_difference/max": 0.7207560539245605, "sampling/sampling_logp_difference/mean": 0.013324819505214691, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 167.703125, "completions/mean_terminated_length": 167.703125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.11181046068668365, "epoch": 0.30973451327433627, "frac_reward_zero_std": 1.0, "grad_norm": 0.09811936833783601, "kl": 0.049999725073575974, "learning_rate": 9.91149412782571e-07, "loss": 0.0004, "num_tokens": 4114946.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6284534931182861, "sampling/importance_sampling_ratio/mean": 0.9999029636383057, "sampling/importance_sampling_ratio/min": 0.3750702738761902, "sampling/sampling_logp_difference/max": 0.9806418418884277, "sampling/sampling_logp_difference/mean": 0.010283855721354485, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 248.484375, "completions/mean_terminated_length": 248.484375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.22917324304580688, "epoch": 0.31150442477876106, "frac_reward_zero_std": 0.25, "grad_norm": 2.1256515127134845, "kl": 0.05161648988723755, "learning_rate": 9.908577458272495e-07, "loss": 0.0653, "num_tokens": 4142465.0, "reward": 0.1875, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5810072422027588, "sampling/importance_sampling_ratio/mean": 0.9997451305389404, "sampling/importance_sampling_ratio/min": 0.41700080037117004, "sampling/sampling_logp_difference/max": 0.8746671676635742, "sampling/sampling_logp_difference/mean": 0.014387201517820358, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 185.015625, "completions/mean_terminated_length": 185.015625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.15055951476097107, "epoch": 0.31327433628318585, "frac_reward_zero_std": 1.0, "grad_norm": 0.09743484755765842, "kl": 0.053565364331007004, "learning_rate": 9.905613949123034e-07, "loss": 0.0004, "num_tokens": 4166482.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003902912139893, "sampling/importance_sampling_ratio/min": 0.30403104424476624, "sampling/sampling_logp_difference/max": 1.1906254291534424, "sampling/sampling_logp_difference/mean": 0.012004513293504715, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 311.296875, "completions/mean_terminated_length": 311.296875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2245781421661377, "epoch": 0.31504424778761064, "frac_reward_zero_std": 0.5, "grad_norm": 1.2270942724955192, "kl": 0.06160787120461464, "learning_rate": 9.902603628656311e-07, "loss": 0.0148, "num_tokens": 4196757.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994308948516846, "sampling/importance_sampling_ratio/min": 0.521935760974884, "sampling/sampling_logp_difference/max": 0.8979051113128662, "sampling/sampling_logp_difference/mean": 0.012811796739697456, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 223.828125, "completions/mean_terminated_length": 223.828125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.18032121658325195, "epoch": 0.3168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 2.168402021965858, "kl": 0.22441846132278442, "learning_rate": 9.899546525597997e-07, "loss": -0.0458, "num_tokens": 4221610.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6514854431152344, "sampling/importance_sampling_ratio/mean": 0.9995503425598145, "sampling/importance_sampling_ratio/min": 0.48067256808280945, "sampling/sampling_logp_difference/max": 0.7325689792633057, "sampling/sampling_logp_difference/mean": 0.013849745504558086, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 215.09375, "completions/mean_terminated_length": 215.09375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.24132972955703735, "epoch": 0.3185840707964602, "frac_reward_zero_std": 0.5, "grad_norm": 1.7294739584199394, "kl": 0.07939330488443375, "learning_rate": 9.896442669120187e-07, "loss": -0.0004, "num_tokens": 4247488.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999198913574219, "sampling/importance_sampling_ratio/min": 0.5291028022766113, "sampling/sampling_logp_difference/max": 0.7101559638977051, "sampling/sampling_logp_difference/mean": 0.014859667047858238, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.16199669241905212, "epoch": 0.32035398230088497, "frac_reward_zero_std": 1.0, "grad_norm": 0.12516728492593937, "kl": 0.07084304094314575, "learning_rate": 9.893292088841108e-07, "loss": 0.0008, "num_tokens": 4267044.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6293762922286987, "sampling/importance_sampling_ratio/mean": 1.0001921653747559, "sampling/importance_sampling_ratio/min": 0.4992993474006653, "sampling/sampling_logp_difference/max": 0.6945494413375854, "sampling/sampling_logp_difference/mean": 0.011925876140594482, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.18908515572547913, "epoch": 0.32212389380530976, "frac_reward_zero_std": 1.0, "grad_norm": 0.05054152150733385, "kl": 0.03658327832818031, "learning_rate": 9.890094814824852e-07, "loss": 0.0003, "num_tokens": 4300644.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9099600315093994, "sampling/importance_sampling_ratio/mean": 1.0004087686538696, "sampling/importance_sampling_ratio/min": 0.472702294588089, "sampling/sampling_logp_difference/max": 0.7492895126342773, "sampling/sampling_logp_difference/mean": 0.012418387457728386, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.16997909545898438, "epoch": 0.3238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.13294129984576183, "kl": 0.05893271043896675, "learning_rate": 9.886850877581078e-07, "loss": 0.0006, "num_tokens": 4318972.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6606510877609253, "sampling/importance_sampling_ratio/mean": 0.9994032382965088, "sampling/importance_sampling_ratio/min": 0.3170143663883209, "sampling/sampling_logp_difference/max": 1.148808240890503, "sampling/sampling_logp_difference/mean": 0.014631889760494232, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 128.21875, "completions/mean_terminated_length": 128.21875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.18240663409233093, "epoch": 0.3256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.977835734886894, "kl": 0.05208232253789902, "learning_rate": 9.883560308064722e-07, "loss": 0.0339, "num_tokens": 4337402.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6580424308776855, "sampling/importance_sampling_ratio/mean": 0.9993151426315308, "sampling/importance_sampling_ratio/min": 0.4951190948486328, "sampling/sampling_logp_difference/max": 0.7029569149017334, "sampling/sampling_logp_difference/mean": 0.015012145042419434, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 168.390625, "completions/mean_terminated_length": 168.390625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.17180198431015015, "epoch": 0.3274336283185841, "frac_reward_zero_std": 0.75, "grad_norm": 1.4114473232024334, "kl": 0.028212890028953552, "learning_rate": 9.880223137675707e-07, "loss": 0.017, "num_tokens": 4358195.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.0003812313079834, "sampling/importance_sampling_ratio/min": 0.4882410168647766, "sampling/sampling_logp_difference/max": 0.7169461250305176, "sampling/sampling_logp_difference/mean": 0.012684261426329613, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 135.90625, "completions/mean_terminated_length": 135.90625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.12705272436141968, "epoch": 0.3292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.06703013106973879, "kl": 0.026837332174181938, "learning_rate": 9.876839398258639e-07, "loss": 0.0002, "num_tokens": 4378045.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000888705253601, "sampling/importance_sampling_ratio/min": 0.495414674282074, "sampling/sampling_logp_difference/max": 0.8316564559936523, "sampling/sampling_logp_difference/mean": 0.011435680091381073, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.17391975224018097, "epoch": 0.3309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.9846633506285223, "kl": 0.04173770546913147, "learning_rate": 9.873409122102503e-07, "loss": 0.0153, "num_tokens": 4396517.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999850392341614, "sampling/importance_sampling_ratio/min": 0.291415810585022, "sampling/sampling_logp_difference/max": 1.233004093170166, "sampling/sampling_logp_difference/mean": 0.013772734440863132, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 211.03125, "completions/mean_terminated_length": 211.03125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.2068071812391281, "epoch": 0.3327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 1.6782416670722644, "kl": 0.0339084193110466, "learning_rate": 9.869932341940358e-07, "loss": 0.0611, "num_tokens": 4421015.0, "reward": 0.59375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9034979343414307, "sampling/importance_sampling_ratio/mean": 1.0002851486206055, "sampling/importance_sampling_ratio/min": 0.49048733711242676, "sampling/sampling_logp_difference/max": 0.7123558521270752, "sampling/sampling_logp_difference/mean": 0.013064769096672535, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.10757587850093842, "epoch": 0.3345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.058082185866876204, "kl": 0.03402823954820633, "learning_rate": 9.86640909094902e-07, "loss": 0.0002, "num_tokens": 4441279.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6581681966781616, "sampling/importance_sampling_ratio/mean": 0.9989020228385925, "sampling/importance_sampling_ratio/min": 0.43191245198249817, "sampling/sampling_logp_difference/max": 0.8395324349403381, "sampling/sampling_logp_difference/mean": 0.010764717124402523, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 220.40625, "completions/mean_terminated_length": 220.40625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23897498846054077, "epoch": 0.336283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 1.656888243072091, "kl": 0.060964662581682205, "learning_rate": 9.862839402748753e-07, "loss": -0.0778, "num_tokens": 4466601.0, "reward": -0.03125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6230909824371338, "sampling/importance_sampling_ratio/mean": 0.9998329281806946, "sampling/importance_sampling_ratio/min": 0.5860807299613953, "sampling/sampling_logp_difference/max": 0.5342977046966553, "sampling/sampling_logp_difference/mean": 0.013494778424501419, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 146.921875, "completions/mean_terminated_length": 146.921875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.13072966039180756, "epoch": 0.3380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.15004060781854595, "kl": 0.05056352540850639, "learning_rate": 9.859223311402936e-07, "loss": 0.0004, "num_tokens": 4486196.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9989742636680603, "sampling/importance_sampling_ratio/min": 0.2345745861530304, "sampling/sampling_logp_difference/max": 1.7880442142486572, "sampling/sampling_logp_difference/mean": 0.014112485572695732, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.21191146969795227, "epoch": 0.3398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.7410160891070305, "kl": 0.05065145716071129, "learning_rate": 9.85556085141775e-07, "loss": -0.0278, "num_tokens": 4507350.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8028171062469482, "sampling/importance_sampling_ratio/mean": 1.0001394748687744, "sampling/importance_sampling_ratio/min": 0.6012518405914307, "sampling/sampling_logp_difference/max": 0.5893504619598389, "sampling/sampling_logp_difference/mean": 0.013851691968739033, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1579647958278656, "epoch": 0.3415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.09011935268067262, "kl": 0.03978281468153, "learning_rate": 9.851852057741844e-07, "loss": 0.0003, "num_tokens": 4530706.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6599884033203125, "sampling/importance_sampling_ratio/mean": 0.9998695850372314, "sampling/importance_sampling_ratio/min": 0.34727928042411804, "sampling/sampling_logp_difference/max": 1.0576260089874268, "sampling/sampling_logp_difference/mean": 0.012382281012833118, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 123.328125, "completions/mean_terminated_length": 123.328125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.2513120174407959, "epoch": 0.3433628318584071, "frac_reward_zero_std": 0.5, "grad_norm": 2.5744351481506746, "kl": 0.0662069320678711, "learning_rate": 9.848096965766002e-07, "loss": -0.0039, "num_tokens": 4549367.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9164477586746216, "sampling/importance_sampling_ratio/mean": 1.0002379417419434, "sampling/importance_sampling_ratio/min": 0.5208438038825989, "sampling/sampling_logp_difference/max": 0.6523051261901855, "sampling/sampling_logp_difference/mean": 0.015936490148305893, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 130.203125, "completions/mean_terminated_length": 130.203125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17120885848999023, "epoch": 0.34513274336283184, "frac_reward_zero_std": 0.75, "grad_norm": 2.3317327398359406, "kl": 0.0669703334569931, "learning_rate": 9.844295611322803e-07, "loss": -0.031, "num_tokens": 4567492.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9755021333694458, "sampling/importance_sampling_ratio/mean": 1.0001695156097412, "sampling/importance_sampling_ratio/min": 0.44553276896476746, "sampling/sampling_logp_difference/max": 0.8084845542907715, "sampling/sampling_logp_difference/mean": 0.013394741341471672, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 147.046875, "completions/mean_terminated_length": 147.046875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.21925824880599976, "epoch": 0.34690265486725663, "frac_reward_zero_std": 0.5, "grad_norm": 1.8283170952427603, "kl": 0.06531023234128952, "learning_rate": 9.84044803068628e-07, "loss": -0.0142, "num_tokens": 4587543.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000981092453003, "sampling/importance_sampling_ratio/min": 0.4891893267631531, "sampling/sampling_logp_difference/max": 0.71500563621521, "sampling/sampling_logp_difference/mean": 0.016116643324494362, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19096097350120544, "epoch": 0.3486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 1.977272435757514, "kl": 0.047299645841121674, "learning_rate": 9.836554260571577e-07, "loss": 0.0303, "num_tokens": 4606611.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5277752876281738, "sampling/importance_sampling_ratio/mean": 0.9999310970306396, "sampling/importance_sampling_ratio/min": 0.6236317753791809, "sampling/sampling_logp_difference/max": 0.4721951484680176, "sampling/sampling_logp_difference/mean": 0.01185104064643383, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 86.96875, "completions/mean_terminated_length": 86.96875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16540777683258057, "epoch": 0.3504424778761062, "frac_reward_zero_std": 1.0, "grad_norm": 0.19429851234166587, "kl": 0.05975417420268059, "learning_rate": 9.832614338134595e-07, "loss": 0.0006, "num_tokens": 4622241.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996644258499146, "sampling/importance_sampling_ratio/min": 0.4954211413860321, "sampling/sampling_logp_difference/max": 1.0052356719970703, "sampling/sampling_logp_difference/mean": 0.014910046942532063, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18113969266414642, "epoch": 0.35221238938053095, "frac_reward_zero_std": 1.0, "grad_norm": 0.057748064152879995, "kl": 0.04518220201134682, "learning_rate": 9.828628300971638e-07, "loss": 0.0004, "num_tokens": 4644521.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.620740294456482, "sampling/importance_sampling_ratio/mean": 1.00044846534729, "sampling/importance_sampling_ratio/min": 0.45969733595848083, "sampling/sampling_logp_difference/max": 0.7771869897842407, "sampling/sampling_logp_difference/mean": 0.012886783108115196, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 193.328125, "completions/mean_terminated_length": 193.328125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.348585844039917, "epoch": 0.35398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 1.9928507367903996, "kl": 0.06873473525047302, "learning_rate": 9.82459618711906e-07, "loss": -0.0041, "num_tokens": 4674574.0, "reward": 0.0, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.625244140625, "sampling/importance_sampling_ratio/mean": 1.0007238388061523, "sampling/importance_sampling_ratio/min": 0.6102320551872253, "sampling/sampling_logp_difference/max": 0.49391603469848633, "sampling/sampling_logp_difference/mean": 0.01781415194272995, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 418.8125, "completions/mean_terminated_length": 418.8125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2606131434440613, "epoch": 0.35575221238938054, "frac_reward_zero_std": 0.25, "grad_norm": 1.0487955310186723, "kl": 0.041200071573257446, "learning_rate": 9.820518035052889e-07, "loss": -0.061, "num_tokens": 4711314.0, "reward": 0.75, "reward_std": 0.5738953948020935, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995898008346558, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.7226977348327637, "sampling/sampling_logp_difference/mean": 0.012698698788881302, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 255.90625, "completions/mean_terminated_length": 255.90625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2646016478538513, "epoch": 0.35752212389380533, "frac_reward_zero_std": 0.25, "grad_norm": 1.8647569432702575, "kl": 0.05322111397981644, "learning_rate": 9.816393883688475e-07, "loss": -0.021, "num_tokens": 4740540.0, "reward": 0.5625, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6239689588546753, "sampling/importance_sampling_ratio/mean": 0.9998320937156677, "sampling/importance_sampling_ratio/min": 0.5038187503814697, "sampling/sampling_logp_difference/max": 0.6855387687683105, "sampling/sampling_logp_difference/mean": 0.014325067400932312, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 309.171875, "completions/mean_terminated_length": 309.171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19423788785934448, "epoch": 0.35929203539823007, "frac_reward_zero_std": 0.25, "grad_norm": 1.342487793635873, "kl": 0.033892955631017685, "learning_rate": 9.812223772380105e-07, "loss": -0.0134, "num_tokens": 4770247.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 1.0003530979156494, "sampling/importance_sampling_ratio/min": 0.46050336956977844, "sampling/sampling_logp_difference/max": 0.7754350900650024, "sampling/sampling_logp_difference/mean": 0.013265022076666355, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 347.09375, "completions/mean_terminated_length": 347.09375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.20393873751163483, "epoch": 0.36106194690265486, "frac_reward_zero_std": 0.75, "grad_norm": 0.7707261427135795, "kl": 0.03650056570768356, "learning_rate": 9.808007740920645e-07, "loss": -0.0112, "num_tokens": 4805805.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6640413999557495, "sampling/importance_sampling_ratio/mean": 1.0003085136413574, "sampling/importance_sampling_ratio/min": 0.5661108493804932, "sampling/sampling_logp_difference/max": 0.5689653158187866, "sampling/sampling_logp_difference/mean": 0.011536139994859695, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 235.484375, "completions/mean_terminated_length": 235.484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.24759899079799652, "epoch": 0.36283185840707965, "frac_reward_zero_std": 0.75, "grad_norm": 1.2224063094177127, "kl": 0.0491972491145134, "learning_rate": 9.803745829541137e-07, "loss": 0.0195, "num_tokens": 4833820.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997979402542114, "sampling/importance_sampling_ratio/min": 0.3649396598339081, "sampling/sampling_logp_difference/max": 1.0080232620239258, "sampling/sampling_logp_difference/mean": 0.01325822714716196, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 282.40625, "completions/mean_terminated_length": 282.40625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.16729769110679626, "epoch": 0.36460176991150445, "frac_reward_zero_std": 0.75, "grad_norm": 0.9437107505995642, "kl": 0.03193018585443497, "learning_rate": 9.799438078910432e-07, "loss": -0.0826, "num_tokens": 4862294.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.888427734375, "sampling/importance_sampling_ratio/mean": 1.0001298189163208, "sampling/importance_sampling_ratio/min": 0.645884096622467, "sampling/sampling_logp_difference/max": 0.635744571685791, "sampling/sampling_logp_difference/mean": 0.010878627188503742, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 179.46875, "completions/mean_terminated_length": 179.46875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20578967034816742, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 1.0933950804667607, "kl": 0.06878609210252762, "learning_rate": 9.7950845301348e-07, "loss": -0.0121, "num_tokens": 4883876.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7457889318466187, "sampling/importance_sampling_ratio/mean": 0.9998757243156433, "sampling/importance_sampling_ratio/min": 0.614746630191803, "sampling/sampling_logp_difference/max": 0.5572065114974976, "sampling/sampling_logp_difference/mean": 0.0146666020154953, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 215.71875, "completions/mean_terminated_length": 215.71875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.22424697875976562, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.4120096656586418, "kl": 0.04635723680257797, "learning_rate": 9.790685224757532e-07, "loss": -0.0117, "num_tokens": 4908018.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.574514627456665, "sampling/importance_sampling_ratio/mean": 1.0002131462097168, "sampling/importance_sampling_ratio/min": 0.6269536018371582, "sampling/sampling_logp_difference/max": 0.46688270568847656, "sampling/sampling_logp_difference/mean": 0.01408640667796135, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.13033784925937653, "epoch": 0.36991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.15616098497528982, "kl": 0.046197690069675446, "learning_rate": 9.786240204758552e-07, "loss": 0.0004, "num_tokens": 4924098.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9987190961837769, "sampling/importance_sampling_ratio/min": 0.5300405025482178, "sampling/sampling_logp_difference/max": 0.7694576978683472, "sampling/sampling_logp_difference/mean": 0.012509491294622421, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 479.515625, "completions/mean_terminated_length": 479.515625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.26497337222099304, "epoch": 0.37168141592920356, "frac_reward_zero_std": 0.5, "grad_norm": 0.7218964077729115, "kl": 0.05141793191432953, "learning_rate": 9.781749512553998e-07, "loss": 0.0191, "num_tokens": 4967139.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994410276412964, "sampling/importance_sampling_ratio/min": 0.3912365138530731, "sampling/sampling_logp_difference/max": 0.9725255966186523, "sampling/sampling_logp_difference/mean": 0.013990292325615883, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 262.5625, "completions/mean_terminated_length": 262.5625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.27379727363586426, "epoch": 0.3734513274336283, "frac_reward_zero_std": 0.75, "grad_norm": 0.9086973998853466, "kl": 0.05838555842638016, "learning_rate": 9.777213190995847e-07, "loss": -0.0094, "num_tokens": 4995927.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.625897765159607, "sampling/importance_sampling_ratio/mean": 1.0003423690795898, "sampling/importance_sampling_ratio/min": 0.5373389720916748, "sampling/sampling_logp_difference/max": 0.6211261749267578, "sampling/sampling_logp_difference/mean": 0.012926041148602962, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 224.09375, "completions/mean_terminated_length": 224.09375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.206679105758667, "epoch": 0.3752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.9843484734708137, "kl": 0.042788729071617126, "learning_rate": 9.77263128337148e-07, "loss": -0.032, "num_tokens": 5022269.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000333309173584, "sampling/importance_sampling_ratio/min": 0.5870134830474854, "sampling/sampling_logp_difference/max": 1.0005664825439453, "sampling/sampling_logp_difference/mean": 0.011488407850265503, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 236.3125, "completions/mean_terminated_length": 236.3125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2779295742511749, "epoch": 0.3769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 1.0494655824896466, "kl": 0.061157867312431335, "learning_rate": 9.768003833403276e-07, "loss": -0.0377, "num_tokens": 5050049.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6470122337341309, "sampling/importance_sampling_ratio/mean": 1.0007336139678955, "sampling/importance_sampling_ratio/min": 0.5155584812164307, "sampling/sampling_logp_difference/max": 0.6625045537948608, "sampling/sampling_logp_difference/mean": 0.015172529965639114, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 81.3125, "completions/mean_terminated_length": 81.3125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.13287663459777832, "epoch": 0.3787610619469027, "frac_reward_zero_std": 1.0, "grad_norm": 0.1660401254751755, "kl": 0.052816472947597504, "learning_rate": 9.763330885248204e-07, "loss": 0.0005, "num_tokens": 5065109.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.75422203540802, "sampling/importance_sampling_ratio/mean": 1.0004856586456299, "sampling/importance_sampling_ratio/min": 0.5910776853561401, "sampling/sampling_logp_difference/max": 0.5620254278182983, "sampling/sampling_logp_difference/mean": 0.013867111876606941, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 126.03125, "completions/mean_terminated_length": 126.03125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.18541954457759857, "epoch": 0.3805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.2340989266692736, "kl": 0.0761493444442749, "learning_rate": 9.758612483497394e-07, "loss": 0.0008, "num_tokens": 5083591.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6521116495132446, "sampling/importance_sampling_ratio/mean": 1.0000619888305664, "sampling/importance_sampling_ratio/min": 0.6115787625312805, "sampling/sampling_logp_difference/max": 0.5020542144775391, "sampling/sampling_logp_difference/mean": 0.013983199372887611, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 226.40625, "completions/mean_terminated_length": 226.40625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.19383396208286285, "epoch": 0.3823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0647874820866648, "kl": 0.0444425493478775, "learning_rate": 9.753848673175707e-07, "loss": 0.0004, "num_tokens": 5109505.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996417760848999, "sampling/importance_sampling_ratio/min": 0.6119229197502136, "sampling/sampling_logp_difference/max": 0.7099208831787109, "sampling/sampling_logp_difference/mean": 0.011938889510929585, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 172.15625, "completions/mean_terminated_length": 172.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.14755362272262573, "epoch": 0.384070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.128018282440456, "kl": 0.05704216659069061, "learning_rate": 9.74903949974131e-07, "loss": 0.0005, "num_tokens": 5130907.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.576133370399475, "sampling/importance_sampling_ratio/mean": 0.999180793762207, "sampling/importance_sampling_ratio/min": 0.6438745260238647, "sampling/sampling_logp_difference/max": 0.4549746513366699, "sampling/sampling_logp_difference/mean": 0.010835966095328331, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 262.359375, "completions/mean_terminated_length": 262.359375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.26924920082092285, "epoch": 0.3858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.1115807412125334, "kl": 0.05774557963013649, "learning_rate": 9.744185009085256e-07, "loss": -0.0043, "num_tokens": 5157970.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6274701356887817, "sampling/importance_sampling_ratio/mean": 0.9999144673347473, "sampling/importance_sampling_ratio/min": 0.5483725070953369, "sampling/sampling_logp_difference/max": 0.6008003950119019, "sampling/sampling_logp_difference/mean": 0.01384336780756712, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 198.796875, "completions/mean_terminated_length": 198.796875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.22300668060779572, "epoch": 0.38761061946902653, "frac_reward_zero_std": 0.75, "grad_norm": 1.2314270391122017, "kl": 0.04785351827740669, "learning_rate": 9.739285247531017e-07, "loss": 0.0041, "num_tokens": 5181637.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6362472772598267, "sampling/importance_sampling_ratio/mean": 0.9997662901878357, "sampling/importance_sampling_ratio/min": 0.4128779470920563, "sampling/sampling_logp_difference/max": 0.8846032619476318, "sampling/sampling_logp_difference/mean": 0.01489302422851324, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 270.984375, "completions/mean_terminated_length": 270.984375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2795213460922241, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.25, "grad_norm": 1.5356576919055795, "kl": 0.07295779883861542, "learning_rate": 9.734340261834066e-07, "loss": 0.1157, "num_tokens": 5209940.0, "reward": -0.21875, "reward_std": 0.7211624383926392, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.8276463747024536, "sampling/importance_sampling_ratio/mean": 1.0001741647720337, "sampling/importance_sampling_ratio/min": 0.48236799240112305, "sampling/sampling_logp_difference/max": 0.7290480136871338, "sampling/sampling_logp_difference/mean": 0.013475652784109116, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 393.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.26876699924468994, "epoch": 0.3911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.0357695455601577, "kl": 0.04203413426876068, "learning_rate": 9.729350099181419e-07, "loss": 0.0059, "num_tokens": 5246820.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995237588882446, "sampling/importance_sampling_ratio/min": 0.4764953851699829, "sampling/sampling_logp_difference/max": 0.9559886455535889, "sampling/sampling_logp_difference/mean": 0.013396305032074451, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 191.03125, "completions/mean_terminated_length": 191.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1847516894340515, "epoch": 0.3929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.09790467198863977, "kl": 0.05971777066588402, "learning_rate": 9.724314807191196e-07, "loss": 0.0006, "num_tokens": 5270150.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9967682361602783, "sampling/importance_sampling_ratio/mean": 1.0000227689743042, "sampling/importance_sampling_ratio/min": 0.510474681854248, "sampling/sampling_logp_difference/max": 0.6915299892425537, "sampling/sampling_logp_difference/mean": 0.014638702385127544, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 299.734375, "completions/mean_terminated_length": 299.734375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.32936397194862366, "epoch": 0.39469026548672564, "frac_reward_zero_std": 0.25, "grad_norm": 1.4209148870844486, "kl": 0.055370960384607315, "learning_rate": 9.719234433912146e-07, "loss": -0.0494, "num_tokens": 5301845.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5685840845108032, "sampling/importance_sampling_ratio/mean": 1.0001795291900635, "sampling/importance_sampling_ratio/min": 0.6057937741279602, "sampling/sampling_logp_difference/max": 0.5012156963348389, "sampling/sampling_logp_difference/mean": 0.013880166225135326, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 187.4375, "completions/mean_terminated_length": 187.4375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.21585038304328918, "epoch": 0.39646017699115044, "frac_reward_zero_std": 0.75, "grad_norm": 1.4560432969983188, "kl": 0.056045565754175186, "learning_rate": 9.714109027823216e-07, "loss": 0.027, "num_tokens": 5325409.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5087573528289795, "sampling/importance_sampling_ratio/mean": 1.000249981880188, "sampling/importance_sampling_ratio/min": 0.3906335234642029, "sampling/sampling_logp_difference/max": 0.939985454082489, "sampling/sampling_logp_difference/mean": 0.014913519844412804, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 145.0625, "completions/mean_terminated_length": 145.0625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.22830285131931305, "epoch": 0.39823008849557523, "frac_reward_zero_std": 0.75, "grad_norm": 1.4378168565618796, "kl": 0.061168670654296875, "learning_rate": 9.708938637833064e-07, "loss": -0.0104, "num_tokens": 5344837.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.575305461883545, "sampling/importance_sampling_ratio/mean": 0.9995800256729126, "sampling/importance_sampling_ratio/min": 0.5629475712776184, "sampling/sampling_logp_difference/max": 0.5745687484741211, "sampling/sampling_logp_difference/mean": 0.01415572501718998, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 343.671875, "completions/mean_terminated_length": 343.671875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.20173272490501404, "epoch": 0.4, "frac_reward_zero_std": 0.75, "grad_norm": 0.7663750043647769, "kl": 0.051169030368328094, "learning_rate": 9.703723313279605e-07, "loss": 0.0166, "num_tokens": 5376800.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8269095420837402, "sampling/importance_sampling_ratio/mean": 1.0004074573516846, "sampling/importance_sampling_ratio/min": 0.629837691783905, "sampling/sampling_logp_difference/max": 0.6026257276535034, "sampling/sampling_logp_difference/mean": 0.009785030037164688, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1279.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 335.796875, "completions/mean_terminated_length": 335.796875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.30093175172805786, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.75, "grad_norm": 0.8987905422863307, "kl": 0.07756787538528442, "learning_rate": 9.698463103929541e-07, "loss": -0.0827, "num_tokens": 5409059.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5279512405395508, "sampling/importance_sampling_ratio/mean": 0.9998106956481934, "sampling/importance_sampling_ratio/min": 0.4871019124984741, "sampling/sampling_logp_difference/max": 0.7192819118499756, "sampling/sampling_logp_difference/mean": 0.014042770490050316, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 311.6875, "completions/mean_terminated_length": 311.6875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.23240000009536743, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.5, "grad_norm": 1.1068416742381213, "kl": 0.04791051894426346, "learning_rate": 9.693158059977877e-07, "loss": -0.0097, "num_tokens": 5440495.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999784231185913, "sampling/importance_sampling_ratio/min": 0.4697237014770508, "sampling/sampling_logp_difference/max": 0.9863030910491943, "sampling/sampling_logp_difference/mean": 0.013133438304066658, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 113.734375, "completions/mean_terminated_length": 113.734375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.13417387008666992, "epoch": 0.40530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.14969908483703784, "kl": 0.0619085431098938, "learning_rate": 9.68780823204745e-07, "loss": 0.0006, "num_tokens": 5457262.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004165172576904, "sampling/importance_sampling_ratio/min": 0.4159519672393799, "sampling/sampling_logp_difference/max": 0.8771854639053345, "sampling/sampling_logp_difference/mean": 0.013183990493416786, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 250.796875, "completions/mean_terminated_length": 250.796875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.17220492660999298, "epoch": 0.40707964601769914, "frac_reward_zero_std": 1.0, "grad_norm": 0.12444371298193976, "kl": 0.04996635019779205, "learning_rate": 9.682413671188444e-07, "loss": 0.0004, "num_tokens": 5483409.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011935234069824, "sampling/importance_sampling_ratio/min": 0.5429806113243103, "sampling/sampling_logp_difference/max": 0.8459408283233643, "sampling/sampling_logp_difference/mean": 0.013233702629804611, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 294.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.2373402714729309, "epoch": 0.4088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 0.8883970934093129, "kl": 0.05311669409275055, "learning_rate": 9.6769744288779e-07, "loss": 0.014, "num_tokens": 5514145.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00046706199646, "sampling/importance_sampling_ratio/min": 0.6077752709388733, "sampling/sampling_logp_difference/max": 0.7454228401184082, "sampling/sampling_logp_difference/mean": 0.014076707884669304, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 260.71875, "completions/mean_terminated_length": 260.71875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.24738536775112152, "epoch": 0.41061946902654867, "frac_reward_zero_std": 0.5, "grad_norm": 1.4077967857497193, "kl": 0.05067981034517288, "learning_rate": 9.671490557019233e-07, "loss": 0.0422, "num_tokens": 5542639.0, "reward": 0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5178754329681396, "sampling/importance_sampling_ratio/mean": 0.9997684359550476, "sampling/importance_sampling_ratio/min": 0.545313835144043, "sampling/sampling_logp_difference/max": 0.6063938140869141, "sampling/sampling_logp_difference/mean": 0.013422314077615738, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 280.015625, "completions/mean_terminated_length": 280.015625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.16088178753852844, "epoch": 0.41238938053097346, "frac_reward_zero_std": 0.75, "grad_norm": 0.8413670303303458, "kl": 0.04627101123332977, "learning_rate": 9.665962107941724e-07, "loss": 0.0367, "num_tokens": 5570272.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000643014907837, "sampling/importance_sampling_ratio/min": 0.4871179163455963, "sampling/sampling_logp_difference/max": 0.8116297721862793, "sampling/sampling_logp_difference/mean": 0.013861261308193207, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 345.453125, "completions/mean_terminated_length": 345.453125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2672743499279022, "epoch": 0.41415929203539825, "frac_reward_zero_std": 0.0, "grad_norm": 1.4047163703335754, "kl": 0.05912863463163376, "learning_rate": 9.660389134400033e-07, "loss": -0.0383, "num_tokens": 5604109.0, "reward": 0.3125, "reward_std": 0.75, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4753777980804443, "sampling/importance_sampling_ratio/mean": 0.9993388652801514, "sampling/importance_sampling_ratio/min": 0.5914850831031799, "sampling/sampling_logp_difference/max": 0.5251188278198242, "sampling/sampling_logp_difference/mean": 0.013273285701870918, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 250.578125, "completions/mean_terminated_length": 250.578125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.15660569071769714, "epoch": 0.415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.9830366558589027, "kl": 0.04855594038963318, "learning_rate": 9.654771689573684e-07, "loss": -0.0592, "num_tokens": 5630818.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007832050323486, "sampling/importance_sampling_ratio/min": 0.5260870456695557, "sampling/sampling_logp_difference/max": 0.7383193969726562, "sampling/sampling_logp_difference/mean": 0.010624085552990437, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 162.390625, "completions/mean_terminated_length": 162.390625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1809491664171219, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.2383811911197418, "kl": 0.03860608860850334, "learning_rate": 9.64910982706657e-07, "loss": -0.0473, "num_tokens": 5652587.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001081228256226, "sampling/importance_sampling_ratio/min": 0.6147373914718628, "sampling/sampling_logp_difference/max": 0.7265691757202148, "sampling/sampling_logp_difference/mean": 0.013006515800952911, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 211.890625, "completions/mean_terminated_length": 211.890625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.2051321417093277, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.75, "grad_norm": 1.0923335780491972, "kl": 0.05851077288389206, "learning_rate": 9.643403600906432e-07, "loss": -0.0667, "num_tokens": 5675124.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6192655563354492, "sampling/importance_sampling_ratio/mean": 0.9998751878738403, "sampling/importance_sampling_ratio/min": 0.45869773626327515, "sampling/sampling_logp_difference/max": 0.7793638706207275, "sampling/sampling_logp_difference/mean": 0.013663128018379211, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 120.765625, "completions/mean_terminated_length": 120.765625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2177194207906723, "epoch": 0.42123893805309737, "frac_reward_zero_std": 0.75, "grad_norm": 1.7405121374197727, "kl": 0.06642745435237885, "learning_rate": 9.637653065544349e-07, "loss": 0.0, "num_tokens": 5693925.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7996087074279785, "sampling/importance_sampling_ratio/mean": 1.000510573387146, "sampling/importance_sampling_ratio/min": 0.6139369606971741, "sampling/sampling_logp_difference/max": 0.5875692367553711, "sampling/sampling_logp_difference/mean": 0.015289681032299995, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 213.296875, "completions/mean_terminated_length": 213.296875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.24603918194770813, "epoch": 0.4230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 1.508194271077183, "kl": 0.05822637677192688, "learning_rate": 9.63185827585421e-07, "loss": -0.0142, "num_tokens": 5717800.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4753788709640503, "sampling/importance_sampling_ratio/mean": 0.9999099373817444, "sampling/importance_sampling_ratio/min": 0.4052807688713074, "sampling/sampling_logp_difference/max": 0.9031752347946167, "sampling/sampling_logp_difference/mean": 0.013505000621080399, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 297.171875, "completions/mean_terminated_length": 297.171875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.28569483757019043, "epoch": 0.4247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 0.7830212792879713, "kl": 0.07420830428600311, "learning_rate": 9.6260192871322e-07, "loss": -0.0077, "num_tokens": 5749059.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6134573221206665, "sampling/importance_sampling_ratio/mean": 0.9999909400939941, "sampling/importance_sampling_ratio/min": 0.4783531129360199, "sampling/sampling_logp_difference/max": 0.7374061346054077, "sampling/sampling_logp_difference/mean": 0.014619600959122181, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 343.125, "completions/mean_terminated_length": 343.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.34782588481903076, "epoch": 0.4265486725663717, "frac_reward_zero_std": 0.0, "grad_norm": 1.3506367658899368, "kl": 0.08126983046531677, "learning_rate": 9.620136155096275e-07, "loss": 0.0485, "num_tokens": 5781259.0, "reward": -0.4375, "reward_std": 0.8151718378067017, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4630022048950195, "sampling/importance_sampling_ratio/mean": 0.999686062335968, "sampling/importance_sampling_ratio/min": 0.6147370934486389, "sampling/sampling_logp_difference/max": 0.48656052350997925, "sampling/sampling_logp_difference/mean": 0.015721015632152557, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 360.046875, "completions/mean_terminated_length": 360.046875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2700510621070862, "epoch": 0.4283185840707965, "frac_reward_zero_std": 0.25, "grad_norm": 1.2474292186720537, "kl": 0.05477534607052803, "learning_rate": 9.614208935885614e-07, "loss": 0.0188, "num_tokens": 5818478.0, "reward": 0.46875, "reward_std": 0.7211624383926392, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8183878660202026, "sampling/importance_sampling_ratio/mean": 1.0004582405090332, "sampling/importance_sampling_ratio/min": 0.4781224727630615, "sampling/sampling_logp_difference/max": 0.7378883361816406, "sampling/sampling_logp_difference/mean": 0.013893750496208668, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 206.984375, "completions/mean_terminated_length": 206.984375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.16392946243286133, "epoch": 0.4300884955752212, "frac_reward_zero_std": 1.0, "grad_norm": 0.05095550985514265, "kl": 0.04552846401929855, "learning_rate": 9.608237686060097e-07, "loss": 0.0004, "num_tokens": 5843101.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.410611629486084, "sampling/importance_sampling_ratio/mean": 0.9992729425430298, "sampling/importance_sampling_ratio/min": 0.3368482291698456, "sampling/sampling_logp_difference/max": 1.088122844696045, "sampling/sampling_logp_difference/mean": 0.01308012567460537, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 321.390625, "completions/mean_terminated_length": 321.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2048739641904831, "epoch": 0.431858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 1.265048548893379, "kl": 0.054630789905786514, "learning_rate": 9.602222462599766e-07, "loss": -0.0388, "num_tokens": 5878534.0, "reward": 0.46875, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8989737033843994, "sampling/importance_sampling_ratio/mean": 1.0002886056900024, "sampling/importance_sampling_ratio/min": 0.5276268720626831, "sampling/sampling_logp_difference/max": 0.6413135528564453, "sampling/sampling_logp_difference/mean": 0.013193481601774693, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 263.921875, "completions/mean_terminated_length": 263.921875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.13228800892829895, "epoch": 0.4336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.0368353364352532, "kl": 0.04123939201235771, "learning_rate": 9.596163322904269e-07, "loss": -0.0313, "num_tokens": 5904513.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9897125959396362, "sampling/importance_sampling_ratio/mean": 1.0001319646835327, "sampling/importance_sampling_ratio/min": 0.5335866808891296, "sampling/sampling_logp_difference/max": 0.6879901885986328, "sampling/sampling_logp_difference/mean": 0.01125460583716631, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 360.9375, "completions/mean_terminated_length": 360.9375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.18678252398967743, "epoch": 0.4353982300884956, "frac_reward_zero_std": 0.5, "grad_norm": 1.1120973192018642, "kl": 0.033606816083192825, "learning_rate": 9.590060324792325e-07, "loss": -0.0408, "num_tokens": 5938237.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006415843963623, "sampling/importance_sampling_ratio/min": 0.5096349120140076, "sampling/sampling_logp_difference/max": 0.7048373222351074, "sampling/sampling_logp_difference/mean": 0.012991039082407951, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.21655277907848358, "epoch": 0.43716814159292033, "frac_reward_zero_std": 0.75, "grad_norm": 0.9711761737061217, "kl": 0.05315382033586502, "learning_rate": 9.58391352650117e-07, "loss": 0.0449, "num_tokens": 5964677.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.612815022468567, "sampling/importance_sampling_ratio/mean": 1.0003581047058105, "sampling/importance_sampling_ratio/min": 0.48110243678092957, "sampling/sampling_logp_difference/max": 0.7316751480102539, "sampling/sampling_logp_difference/mean": 0.013924110680818558, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 327.953125, "completions/mean_terminated_length": 327.953125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.14471164345741272, "epoch": 0.4389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9820988276448238, "kl": 0.03843631595373154, "learning_rate": 9.57772298668599e-07, "loss": 0.0273, "num_tokens": 5996450.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997909069061279, "sampling/importance_sampling_ratio/min": 0.48691385984420776, "sampling/sampling_logp_difference/max": 0.7305965423583984, "sampling/sampling_logp_difference/mean": 0.011359037831425667, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 342.953125, "completions/mean_terminated_length": 342.953125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.20321068167686462, "epoch": 0.4407079646017699, "frac_reward_zero_std": 0.5, "grad_norm": 0.9546390720425624, "kl": 0.04618404060602188, "learning_rate": 9.57148876441938e-07, "loss": -0.0248, "num_tokens": 6029647.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6598302125930786, "sampling/importance_sampling_ratio/mean": 1.0007416009902954, "sampling/importance_sampling_ratio/min": 0.6393142938613892, "sampling/sampling_logp_difference/max": 0.5067152976989746, "sampling/sampling_logp_difference/mean": 0.010426388122141361, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 374.84375, "completions/mean_terminated_length": 374.84375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.21018818020820618, "epoch": 0.4424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 0.9599644225366605, "kl": 0.047251902520656586, "learning_rate": 9.565210919190763e-07, "loss": -0.0297, "num_tokens": 6069845.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995439052581787, "sampling/importance_sampling_ratio/min": 0.4771886467933655, "sampling/sampling_logp_difference/max": 0.7398433685302734, "sampling/sampling_logp_difference/mean": 0.0118654053658247, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 168.046875, "completions/mean_terminated_length": 168.046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.17310121655464172, "epoch": 0.44424778761061945, "frac_reward_zero_std": 1.0, "grad_norm": 0.08759113539591282, "kl": 0.06398279964923859, "learning_rate": 9.558889510905835e-07, "loss": 0.0005, "num_tokens": 6093864.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007858276367188, "sampling/importance_sampling_ratio/min": 0.5323373675346375, "sampling/sampling_logp_difference/max": 0.7602612972259521, "sampling/sampling_logp_difference/mean": 0.014505499973893166, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 136.71875, "completions/mean_terminated_length": 136.71875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1281152069568634, "epoch": 0.44601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.11500475760111031, "kl": 0.05013818293809891, "learning_rate": 9.55252459988598e-07, "loss": 0.0005, "num_tokens": 6113782.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6724048852920532, "sampling/importance_sampling_ratio/mean": 1.0001322031021118, "sampling/importance_sampling_ratio/min": 0.6097288131713867, "sampling/sampling_logp_difference/max": 0.5142626762390137, "sampling/sampling_logp_difference/mean": 0.010475566610693932, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 197.421875, "completions/mean_terminated_length": 197.421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.2097020000219345, "epoch": 0.44778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 1.234614229350842, "kl": 0.07380601018667221, "learning_rate": 9.546116246867713e-07, "loss": -0.0168, "num_tokens": 6137409.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5071629285812378, "sampling/importance_sampling_ratio/mean": 0.9989656209945679, "sampling/importance_sampling_ratio/min": 0.5327930450439453, "sampling/sampling_logp_difference/max": 0.629622220993042, "sampling/sampling_logp_difference/mean": 0.014124183915555477, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 182.828125, "completions/mean_terminated_length": 182.828125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.19614319503307343, "epoch": 0.4495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.1947852331691755, "kl": 0.08012209832668304, "learning_rate": 9.539664513002084e-07, "loss": 0.0082, "num_tokens": 6159686.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.608888030052185, "sampling/importance_sampling_ratio/mean": 0.9999400973320007, "sampling/importance_sampling_ratio/min": 0.4770910441875458, "sampling/sampling_logp_difference/max": 0.7400479316711426, "sampling/sampling_logp_difference/mean": 0.014064153656363487, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.16768693923950195, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.75, "grad_norm": 1.4616210780052514, "kl": 0.06833650171756744, "learning_rate": 9.533169459854098e-07, "loss": -0.0119, "num_tokens": 6181382.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001071691513062, "sampling/importance_sampling_ratio/min": 0.37462469935417175, "sampling/sampling_logp_difference/max": 0.9818305969238281, "sampling/sampling_logp_difference/mean": 0.013666465878486633, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 286.53125, "completions/mean_terminated_length": 286.53125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.14109718799591064, "epoch": 0.45309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.0395205582430334, "kl": 0.05483875051140785, "learning_rate": 9.526631149402134e-07, "loss": 0.036, "num_tokens": 6209912.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004024505615234, "sampling/importance_sampling_ratio/min": 0.36931154131889343, "sampling/sampling_logp_difference/max": 0.9961147308349609, "sampling/sampling_logp_difference/mean": 0.012917298823595047, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 304.390625, "completions/mean_terminated_length": 304.390625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.17394593358039856, "epoch": 0.45486725663716815, "frac_reward_zero_std": 0.75, "grad_norm": 0.9765389403171586, "kl": 0.05817359313368797, "learning_rate": 9.520049644037347e-07, "loss": -0.0041, "num_tokens": 6239393.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5845942497253418, "sampling/importance_sampling_ratio/mean": 0.9998289942741394, "sampling/importance_sampling_ratio/min": 0.39854058623313904, "sampling/sampling_logp_difference/max": 0.9199459552764893, "sampling/sampling_logp_difference/mean": 0.010983521118760109, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 186.78125, "completions/mean_terminated_length": 186.78125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1929200291633606, "epoch": 0.45663716814159294, "frac_reward_zero_std": 0.75, "grad_norm": 1.4395203045706195, "kl": 0.062079012393951416, "learning_rate": 9.513425006563078e-07, "loss": 0.0133, "num_tokens": 6262291.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004851818084717, "sampling/importance_sampling_ratio/min": 0.3382951319217682, "sampling/sampling_logp_difference/max": 1.5751190185546875, "sampling/sampling_logp_difference/mean": 0.014538601040840149, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 199.234375, "completions/mean_terminated_length": 199.234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18977543711662292, "epoch": 0.4584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.0977685883649906, "kl": 0.07670407742261887, "learning_rate": 9.506757300194248e-07, "loss": -0.0042, "num_tokens": 6297890.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995224475860596, "sampling/importance_sampling_ratio/min": 0.0031889271922409534, "sampling/sampling_logp_difference/max": 5.74807071685791, "sampling/sampling_logp_difference/mean": 0.014349579811096191, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.22712579369544983, "epoch": 0.46017699115044247, "frac_reward_zero_std": 0.75, "grad_norm": 1.0233190610161338, "kl": 0.06024787575006485, "learning_rate": 9.500046588556761e-07, "loss": -0.0004, "num_tokens": 6328642.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5146191120147705, "sampling/importance_sampling_ratio/mean": 0.9995711445808411, "sampling/importance_sampling_ratio/min": 0.5362950563430786, "sampling/sampling_logp_difference/max": 0.6230708360671997, "sampling/sampling_logp_difference/mean": 0.012038059532642365, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 343.9375, "completions/mean_terminated_length": 343.9375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.19357725977897644, "epoch": 0.46194690265486726, "frac_reward_zero_std": 0.75, "grad_norm": 0.8874260019300931, "kl": 0.06048102676868439, "learning_rate": 9.493292935686894e-07, "loss": 0.0892, "num_tokens": 6361022.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6494643688201904, "sampling/importance_sampling_ratio/mean": 0.999594509601593, "sampling/importance_sampling_ratio/min": 0.48228415846824646, "sampling/sampling_logp_difference/max": 0.7292218208312988, "sampling/sampling_logp_difference/mean": 0.01328032836318016, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 388.3125, "completions/mean_terminated_length": 388.3125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.146612748503685, "epoch": 0.46371681415929206, "frac_reward_zero_std": 0.75, "grad_norm": 0.7960085682727691, "kl": 0.046908702701330185, "learning_rate": 9.486496406030685e-07, "loss": 0.0195, "num_tokens": 6397970.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4707763195037842, "sampling/importance_sampling_ratio/mean": 1.000133991241455, "sampling/importance_sampling_ratio/min": 0.6113860011100769, "sampling/sampling_logp_difference/max": 0.49202680587768555, "sampling/sampling_logp_difference/mean": 0.01001516543328762, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 423.4375, "completions/mean_terminated_length": 423.4375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.23376008868217468, "epoch": 0.4654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.6138776344519721, "kl": 0.06173320114612579, "learning_rate": 9.479657064443321e-07, "loss": -0.0043, "num_tokens": 6437470.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9560428857803345, "sampling/importance_sampling_ratio/mean": 0.9999052882194519, "sampling/importance_sampling_ratio/min": 0.6139626502990723, "sampling/sampling_logp_difference/max": 0.6709234714508057, "sampling/sampling_logp_difference/mean": 0.011935731396079063, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 402.4375, "completions/mean_terminated_length": 402.4375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.14655429124832153, "epoch": 0.4672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 0.6726044333883838, "kl": 0.06275837123394012, "learning_rate": 9.472774976188513e-07, "loss": 0.0388, "num_tokens": 6474842.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.9958192110061646, "sampling/importance_sampling_ratio/mean": 1.0000778436660767, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6910545825958252, "sampling/sampling_logp_difference/mean": 0.008798177354037762, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.14278912544250488, "epoch": 0.4690265486725664, "frac_reward_zero_std": 1.0, "grad_norm": 0.07025484313619215, "kl": 0.05805407091975212, "learning_rate": 9.465850206937887e-07, "loss": 0.0004, "num_tokens": 6502828.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000534176826477, "sampling/importance_sampling_ratio/min": 0.39875665307044983, "sampling/sampling_logp_difference/max": 0.9194040298461914, "sampling/sampling_logp_difference/mean": 0.01242618914693594, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 114.125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1065681055188179, "epoch": 0.47079646017699117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0886571919656371, "kl": 0.04725336283445358, "learning_rate": 9.45888282277034e-07, "loss": 0.0005, "num_tokens": 6519892.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8727670907974243, "sampling/importance_sampling_ratio/mean": 0.9997010231018066, "sampling/importance_sampling_ratio/min": 0.5493660569190979, "sampling/sampling_logp_difference/max": 0.6274170875549316, "sampling/sampling_logp_difference/mean": 0.01061109360307455, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 302.59375, "completions/mean_terminated_length": 302.59375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1819612830877304, "epoch": 0.4725663716814159, "frac_reward_zero_std": 0.5, "grad_norm": 1.0522698938088362, "kl": 0.07520169019699097, "learning_rate": 9.451872890171419e-07, "loss": 0.0079, "num_tokens": 6550122.0, "reward": 0.21875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6207484006881714, "sampling/importance_sampling_ratio/mean": 0.999603807926178, "sampling/importance_sampling_ratio/min": 0.4598866105079651, "sampling/sampling_logp_difference/max": 0.7767753601074219, "sampling/sampling_logp_difference/mean": 0.011108506470918655, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 376.921875, "completions/mean_terminated_length": 376.921875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.26270580291748047, "epoch": 0.4743362831858407, "frac_reward_zero_std": 0.25, "grad_norm": 1.1569258575811436, "kl": 0.07379023730754852, "learning_rate": 9.444820476032685e-07, "loss": -0.0949, "num_tokens": 6587589.0, "reward": -0.5625, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": -0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002766847610474, "sampling/importance_sampling_ratio/min": 0.491433322429657, "sampling/sampling_logp_difference/max": 0.713691234588623, "sampling/sampling_logp_difference/mean": 0.01390319224447012, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 197.28125, "completions/mean_terminated_length": 197.28125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.13100634515285492, "epoch": 0.4761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.08111063934023793, "kl": 0.04550465568900108, "learning_rate": 9.437725647651078e-07, "loss": 0.0004, "num_tokens": 6613271.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6369680166244507, "sampling/importance_sampling_ratio/mean": 0.9995728731155396, "sampling/importance_sampling_ratio/min": 0.4095646142959595, "sampling/sampling_logp_difference/max": 0.8926606178283691, "sampling/sampling_logp_difference/mean": 0.011553870514035225, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 366.4375, "completions/mean_terminated_length": 366.4375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.19636042416095734, "epoch": 0.4778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7406511943716303, "kl": 0.06438256800174713, "learning_rate": 9.430588472728269e-07, "loss": -0.0341, "num_tokens": 6646019.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5396623611450195, "sampling/importance_sampling_ratio/mean": 0.9997000694274902, "sampling/importance_sampling_ratio/min": 0.5127933621406555, "sampling/sampling_logp_difference/max": 0.6678823232650757, "sampling/sampling_logp_difference/mean": 0.010734849609434605, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 206.421875, "completions/mean_terminated_length": 206.421875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1419176608324051, "epoch": 0.479646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.224956413755507, "kl": 0.03848838061094284, "learning_rate": 9.423409019370014e-07, "loss": -0.0875, "num_tokens": 6668942.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010554790496826, "sampling/importance_sampling_ratio/min": 0.6395548582077026, "sampling/sampling_logp_difference/max": 0.8037681579589844, "sampling/sampling_logp_difference/mean": 0.011965746991336346, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 149.640625, "completions/mean_terminated_length": 149.640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.20000965893268585, "epoch": 0.4814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.8322775080033225, "kl": 0.05366097763180733, "learning_rate": 9.416187356085512e-07, "loss": -0.0008, "num_tokens": 6692919.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9693636894226074, "sampling/importance_sampling_ratio/mean": 1.0010923147201538, "sampling/importance_sampling_ratio/min": 0.3821716606616974, "sampling/sampling_logp_difference/max": 0.9618854522705078, "sampling/sampling_logp_difference/mean": 0.015052524395287037, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 122.828125, "completions/mean_terminated_length": 122.828125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11449531465768814, "epoch": 0.4831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.3395208130730445, "kl": 0.06311716139316559, "learning_rate": 9.408923551786742e-07, "loss": 0.0005, "num_tokens": 6712508.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005345344543457, "sampling/importance_sampling_ratio/min": 0.47678470611572266, "sampling/sampling_logp_difference/max": 0.7567152976989746, "sampling/sampling_logp_difference/mean": 0.010808173567056656, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 118.53125, "completions/mean_terminated_length": 118.53125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.14217951893806458, "epoch": 0.4849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.09922195583578276, "kl": 0.053493574261665344, "learning_rate": 9.40161767578781e-07, "loss": 0.0006, "num_tokens": 6731390.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996917843818665, "sampling/importance_sampling_ratio/min": 0.4417635500431061, "sampling/sampling_logp_difference/max": 1.0378906726837158, "sampling/sampling_logp_difference/mean": 0.013335926458239555, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 75.046875, "completions/mean_terminated_length": 75.046875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1413385570049286, "epoch": 0.48672566371681414, "frac_reward_zero_std": 1.0, "grad_norm": 0.1269160881576902, "kl": 0.04999929666519165, "learning_rate": 9.394269797804288e-07, "loss": 0.0005, "num_tokens": 6746433.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5788311958312988, "sampling/importance_sampling_ratio/mean": 1.0012959241867065, "sampling/importance_sampling_ratio/min": 0.48454591631889343, "sampling/sampling_logp_difference/max": 0.7245430946350098, "sampling/sampling_logp_difference/mean": 0.013861396349966526, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 201.53125, "completions/mean_terminated_length": 201.53125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.22855433821678162, "epoch": 0.48849557522123893, "frac_reward_zero_std": 0.5, "grad_norm": 1.9114798905570163, "kl": 0.05809088051319122, "learning_rate": 9.386879987952549e-07, "loss": -0.0055, "num_tokens": 6778451.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6088883876800537, "sampling/importance_sampling_ratio/mean": 1.00001859664917, "sampling/importance_sampling_ratio/min": 0.6106173992156982, "sampling/sampling_logp_difference/max": 0.4932847023010254, "sampling/sampling_logp_difference/mean": 0.013666185550391674, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 111.59375, "completions/mean_terminated_length": 111.59375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.12589851021766663, "epoch": 0.4902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.07564664143759202, "kl": 0.04839683324098587, "learning_rate": 9.37944831674909e-07, "loss": 0.0004, "num_tokens": 6794361.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6220738887786865, "sampling/importance_sampling_ratio/mean": 0.9996914863586426, "sampling/importance_sampling_ratio/min": 0.5080466866493225, "sampling/sampling_logp_difference/max": 0.6771819591522217, "sampling/sampling_logp_difference/mean": 0.011566423811018467, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.15775364637374878, "epoch": 0.4920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.08987146103166258, "kl": 0.05933793634176254, "learning_rate": 9.371974855109874e-07, "loss": 0.0006, "num_tokens": 6819329.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999914169311523, "sampling/importance_sampling_ratio/min": 0.3954392969608307, "sampling/sampling_logp_difference/max": 0.927757978439331, "sampling/sampling_logp_difference/mean": 0.01181037724018097, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 128.921875, "completions/mean_terminated_length": 128.921875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19706273078918457, "epoch": 0.49380530973451325, "frac_reward_zero_std": 1.0, "grad_norm": 0.08839176153091326, "kl": 0.06452324986457825, "learning_rate": 9.36445967434964e-07, "loss": 0.0008, "num_tokens": 6837484.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006117820739746, "sampling/importance_sampling_ratio/min": 0.40095263719558716, "sampling/sampling_logp_difference/max": 0.9139120578765869, "sampling/sampling_logp_difference/mean": 0.014659091830253601, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 80.484375, "completions/mean_terminated_length": 80.484375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.14291155338287354, "epoch": 0.49557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.09747972793638322, "kl": 0.04485659301280975, "learning_rate": 9.356902846181228e-07, "loss": 0.0005, "num_tokens": 6851691.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7363736629486084, "sampling/importance_sampling_ratio/mean": 0.99992835521698, "sampling/importance_sampling_ratio/min": 0.382144033908844, "sampling/sampling_logp_difference/max": 0.9619576930999756, "sampling/sampling_logp_difference/mean": 0.01382683776319027, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 176.953125, "completions/mean_terminated_length": 176.953125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16771335899829865, "epoch": 0.49734513274336284, "frac_reward_zero_std": 0.75, "grad_norm": 1.5030858483343574, "kl": 0.10888098180294037, "learning_rate": 9.349304442714895e-07, "loss": -0.0502, "num_tokens": 6873272.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5698778629302979, "sampling/importance_sampling_ratio/mean": 0.9985082745552063, "sampling/importance_sampling_ratio/min": 0.6192692518234253, "sampling/sampling_logp_difference/max": 0.479215145111084, "sampling/sampling_logp_difference/mean": 0.011981655843555927, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 307.296875, "completions/mean_terminated_length": 307.296875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.28487294912338257, "epoch": 0.49911504424778763, "frac_reward_zero_std": 0.5, "grad_norm": 1.0734901998193047, "kl": 0.06948798894882202, "learning_rate": 9.341664536457625e-07, "loss": 0.0345, "num_tokens": 6912923.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6094746589660645, "sampling/importance_sampling_ratio/mean": 0.9998421669006348, "sampling/importance_sampling_ratio/min": 0.3894946277141571, "sampling/sampling_logp_difference/max": 0.9429052472114563, "sampling/sampling_logp_difference/mean": 0.015117552131414413, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 292.546875, "completions/mean_terminated_length": 292.546875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.26889660954475403, "epoch": 0.5008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 1.2696468567406338, "kl": 0.05943174660205841, "learning_rate": 9.33398320031244e-07, "loss": -0.1523, "num_tokens": 6945326.0, "reward": -0.0625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6665871143341064, "sampling/importance_sampling_ratio/mean": 0.999099612236023, "sampling/importance_sampling_ratio/min": 0.4919884502887726, "sampling/sampling_logp_difference/max": 0.7093000411987305, "sampling/sampling_logp_difference/mean": 0.014892845414578915, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 284.328125, "completions/mean_terminated_length": 284.328125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2056923806667328, "epoch": 0.5026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 0.8405283395965987, "kl": 0.04938054829835892, "learning_rate": 9.3262605075777e-07, "loss": 0.0157, "num_tokens": 6975971.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6539686918258667, "sampling/importance_sampling_ratio/mean": 0.9999218583106995, "sampling/importance_sampling_ratio/min": 0.4432476758956909, "sampling/sampling_logp_difference/max": 0.8136265277862549, "sampling/sampling_logp_difference/mean": 0.013145933859050274, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 320.09375, "completions/mean_terminated_length": 320.09375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.29539841413497925, "epoch": 0.504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 0.9115690909272004, "kl": 0.04545670002698898, "learning_rate": 9.318496531946409e-07, "loss": 0.1403, "num_tokens": 7012137.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6211599111557007, "sampling/importance_sampling_ratio/mean": 0.9994798302650452, "sampling/importance_sampling_ratio/min": 0.29257097840309143, "sampling/sampling_logp_difference/max": 1.2290480136871338, "sampling/sampling_logp_difference/mean": 0.015050049871206284, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 302.484375, "completions/mean_terminated_length": 302.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.16481289267539978, "epoch": 0.5061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 0.7705307189572136, "kl": 0.05943280830979347, "learning_rate": 9.310691347505505e-07, "loss": 0.0031, "num_tokens": 7042760.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00006103515625, "sampling/importance_sampling_ratio/min": 0.4440855383872986, "sampling/sampling_logp_difference/max": 1.0570920705795288, "sampling/sampling_logp_difference/mean": 0.012434977106750011, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3207.0, "completions/max_terminated_length": 3207.0, "completions/mean_length": 702.875, "completions/mean_terminated_length": 702.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.16630315780639648, "epoch": 0.5079646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 0.5475214002956252, "kl": 0.04190077632665634, "learning_rate": 9.30284502873516e-07, "loss": 0.0909, "num_tokens": 7097232.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5464054346084595, "sampling/importance_sampling_ratio/mean": 0.9994837045669556, "sampling/importance_sampling_ratio/min": 0.4982548654079437, "sampling/sampling_logp_difference/max": 0.696643590927124, "sampling/sampling_logp_difference/mean": 0.009219174273312092, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 179.71875, "completions/mean_terminated_length": 179.71875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17250744998455048, "epoch": 0.5097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 1.3179955929378875, "kl": 0.060148030519485474, "learning_rate": 9.294957650508064e-07, "loss": -0.0381, "num_tokens": 7118382.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.574660301208496, "sampling/importance_sampling_ratio/mean": 0.999683141708374, "sampling/importance_sampling_ratio/min": 0.4064965546131134, "sampling/sampling_logp_difference/max": 0.9001798629760742, "sampling/sampling_logp_difference/mean": 0.011475750245153904, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.23858660459518433, "epoch": 0.511504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.08812454670748333, "kl": 0.06674356013536453, "learning_rate": 9.287029288088716e-07, "loss": 0.0007, "num_tokens": 7143078.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8533576726913452, "sampling/importance_sampling_ratio/mean": 0.9998701810836792, "sampling/importance_sampling_ratio/min": 0.49531203508377075, "sampling/sampling_logp_difference/max": 0.7025673389434814, "sampling/sampling_logp_difference/mean": 0.014833731576800346, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 280.796875, "completions/mean_terminated_length": 280.796875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.27508020401000977, "epoch": 0.5132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 1.3526854931632666, "kl": 0.07777999341487885, "learning_rate": 9.279060017132697e-07, "loss": -0.202, "num_tokens": 7178041.0, "reward": 0.375, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6465206146240234, "sampling/importance_sampling_ratio/mean": 1.000756859779358, "sampling/importance_sampling_ratio/min": 0.509071946144104, "sampling/sampling_logp_difference/max": 0.6751658916473389, "sampling/sampling_logp_difference/mean": 0.014674736186861992, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 267.34375, "completions/mean_terminated_length": 267.34375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.23666201531887054, "epoch": 0.5150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.2125553987782427, "kl": 0.06110885366797447, "learning_rate": 9.271049913685959e-07, "loss": -0.0851, "num_tokens": 7206559.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5332832336425781, "sampling/importance_sampling_ratio/mean": 0.9993974566459656, "sampling/importance_sampling_ratio/min": 0.6133177876472473, "sampling/sampling_logp_difference/max": 0.48887205123901367, "sampling/sampling_logp_difference/mean": 0.01301311980932951, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.33078938722610474, "epoch": 0.5168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 1.4885184900481847, "kl": 0.08810103684663773, "learning_rate": 9.262999054184091e-07, "loss": 0.0967, "num_tokens": 7241039.0, "reward": 0.09375, "reward_std": 0.769389271736145, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6142606735229492, "sampling/importance_sampling_ratio/mean": 1.0004647970199585, "sampling/importance_sampling_ratio/min": 0.6152219772338867, "sampling/sampling_logp_difference/max": 0.48577213287353516, "sampling/sampling_logp_difference/mean": 0.014158688485622406, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.20557469129562378, "epoch": 0.5185840707964602, "frac_reward_zero_std": 0.75, "grad_norm": 0.6728133110088864, "kl": 0.06331959366798401, "learning_rate": 9.254907515451591e-07, "loss": -0.0502, "num_tokens": 7271903.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9900118112564087, "sampling/importance_sampling_ratio/mean": 1.0009146928787231, "sampling/importance_sampling_ratio/min": 0.523772656917572, "sampling/sampling_logp_difference/max": 0.6881406307220459, "sampling/sampling_logp_difference/mean": 0.01164301298558712, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 410.921875, "completions/mean_terminated_length": 410.921875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.40782609581947327, "epoch": 0.5203539823008849, "frac_reward_zero_std": 0.0, "grad_norm": 1.2100212985465095, "kl": 0.0944828987121582, "learning_rate": 9.246775374701138e-07, "loss": 0.0232, "num_tokens": 7331514.0, "reward": 0.21875, "reward_std": 0.9615862369537354, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.7158334255218506, "sampling/importance_sampling_ratio/mean": 1.0004222393035889, "sampling/importance_sampling_ratio/min": 0.6648805141448975, "sampling/sampling_logp_difference/max": 0.5398988723754883, "sampling/sampling_logp_difference/mean": 0.016165856271982193, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.19218319654464722, "epoch": 0.5221238938053098, "frac_reward_zero_std": 1.0, "grad_norm": 0.13788621576140975, "kl": 0.0924367755651474, "learning_rate": 9.23860270953285e-07, "loss": 0.001, "num_tokens": 7347464.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4544978141784668, "sampling/importance_sampling_ratio/mean": 0.9993179440498352, "sampling/importance_sampling_ratio/min": 0.6418886780738831, "sampling/sampling_logp_difference/max": 0.4433404207229614, "sampling/sampling_logp_difference/mean": 0.014214429073035717, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 268.453125, "completions/mean_terminated_length": 268.453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1871560961008072, "epoch": 0.5238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 0.9329802374907915, "kl": 0.0661318451166153, "learning_rate": 9.230389597933543e-07, "loss": -0.0238, "num_tokens": 7375045.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8718618154525757, "sampling/importance_sampling_ratio/mean": 1.0005007982254028, "sampling/importance_sampling_ratio/min": 0.566978394985199, "sampling/sampling_logp_difference/max": 0.6269335746765137, "sampling/sampling_logp_difference/mean": 0.010767589323222637, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.2593000531196594, "epoch": 0.5256637168141592, "frac_reward_zero_std": 0.5, "grad_norm": 1.4836936870542898, "kl": 0.08535981178283691, "learning_rate": 9.222136118275995e-07, "loss": 0.0537, "num_tokens": 7399893.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5093467235565186, "sampling/importance_sampling_ratio/mean": 0.9991475343704224, "sampling/importance_sampling_ratio/min": 0.5910110473632812, "sampling/sampling_logp_difference/max": 0.525920569896698, "sampling/sampling_logp_difference/mean": 0.014101866632699966, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 109.859375, "completions/mean_terminated_length": 109.859375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1588691622018814, "epoch": 0.5274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.26793292863903306, "kl": 0.0825464129447937, "learning_rate": 9.213842349318184e-07, "loss": 0.0008, "num_tokens": 7415964.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6052603721618652, "sampling/importance_sampling_ratio/mean": 0.9994765520095825, "sampling/importance_sampling_ratio/min": 0.3743131756782532, "sampling/sampling_logp_difference/max": 0.9826624393463135, "sampling/sampling_logp_difference/mean": 0.01235160417854786, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 351.390625, "completions/mean_terminated_length": 351.390625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.32528895139694214, "epoch": 0.5292035398230088, "frac_reward_zero_std": 0.75, "grad_norm": 0.7001552934229136, "kl": 0.0776461809873581, "learning_rate": 9.205508370202551e-07, "loss": -0.0258, "num_tokens": 7451445.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4513570070266724, "sampling/importance_sampling_ratio/mean": 1.000380516052246, "sampling/importance_sampling_ratio/min": 0.4596993327140808, "sampling/sampling_logp_difference/max": 0.7771825790405273, "sampling/sampling_logp_difference/mean": 0.014301600866019726, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 293.828125, "completions/mean_terminated_length": 293.828125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.306728333234787, "epoch": 0.5309734513274337, "frac_reward_zero_std": 0.75, "grad_norm": 0.7749986185704725, "kl": 0.07144207507371902, "learning_rate": 9.197134260455233e-07, "loss": 0.0051, "num_tokens": 7484506.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.616125226020813, "sampling/importance_sampling_ratio/mean": 1.0003517866134644, "sampling/importance_sampling_ratio/min": 0.6433696746826172, "sampling/sampling_logp_difference/max": 0.48003149032592773, "sampling/sampling_logp_difference/mean": 0.013065634295344353, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 196.90625, "completions/mean_terminated_length": 196.90625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.277901828289032, "epoch": 0.5327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 1.759918051034487, "kl": 0.07234806567430496, "learning_rate": 9.188720099985315e-07, "loss": 0.0051, "num_tokens": 7509668.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5078667402267456, "sampling/importance_sampling_ratio/mean": 0.9998195171356201, "sampling/importance_sampling_ratio/min": 0.4843681752681732, "sampling/sampling_logp_difference/max": 0.7249100208282471, "sampling/sampling_logp_difference/mean": 0.014015724882483482, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3066433370113373, "epoch": 0.5345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.9579675823353828, "kl": 0.07620877027511597, "learning_rate": 9.180265969084056e-07, "loss": -0.0023, "num_tokens": 7539422.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5852468013763428, "sampling/importance_sampling_ratio/mean": 1.0001237392425537, "sampling/importance_sampling_ratio/min": 0.6171606183052063, "sampling/sampling_logp_difference/max": 0.48262596130371094, "sampling/sampling_logp_difference/mean": 0.014694667421281338, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.28560376167297363, "epoch": 0.536283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 1.3973525592233575, "kl": 0.09303057938814163, "learning_rate": 9.171771948424136e-07, "loss": 0.0255, "num_tokens": 7569478.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.591186761856079, "sampling/importance_sampling_ratio/mean": 0.9994941353797913, "sampling/importance_sampling_ratio/min": 0.5497964024543762, "sampling/sampling_logp_difference/max": 0.5982072353363037, "sampling/sampling_logp_difference/mean": 0.01473798044025898, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 166.46875, "completions/mean_terminated_length": 166.46875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.23117336630821228, "epoch": 0.5380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 1.3162876732127737, "kl": 0.08727886527776718, "learning_rate": 9.163238119058871e-07, "loss": -0.1241, "num_tokens": 7589956.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5148569345474243, "sampling/importance_sampling_ratio/mean": 0.9996645450592041, "sampling/importance_sampling_ratio/min": 0.41505351662635803, "sampling/sampling_logp_difference/max": 0.8793478012084961, "sampling/sampling_logp_difference/mean": 0.018173620104789734, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 190.671875, "completions/mean_terminated_length": 190.671875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.17851614952087402, "epoch": 0.5398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.09353015701113661, "kl": 0.06412681192159653, "learning_rate": 9.154664562421453e-07, "loss": 0.0006, "num_tokens": 7612383.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6308667659759521, "sampling/importance_sampling_ratio/mean": 0.9991492629051208, "sampling/importance_sampling_ratio/min": 0.2518034279346466, "sampling/sampling_logp_difference/max": 1.3791065216064453, "sampling/sampling_logp_difference/mean": 0.014014537446200848, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 181.234375, "completions/mean_terminated_length": 181.234375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.30106690526008606, "epoch": 0.5415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.5496192511960014, "kl": 0.08148938417434692, "learning_rate": 9.146051360324165e-07, "loss": -0.015, "num_tokens": 7635086.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.559645414352417, "sampling/importance_sampling_ratio/mean": 0.9988859295845032, "sampling/importance_sampling_ratio/min": 0.4827265739440918, "sampling/sampling_logp_difference/max": 0.7283048629760742, "sampling/sampling_logp_difference/mean": 0.016066260635852814, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 402.453125, "completions/mean_terminated_length": 402.453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.23821797966957092, "epoch": 0.5433628318584071, "frac_reward_zero_std": 0.5, "grad_norm": 0.9756708138381618, "kl": 0.0629359781742096, "learning_rate": 9.137398594957603e-07, "loss": -0.006, "num_tokens": 7671355.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.615748405456543, "sampling/importance_sampling_ratio/mean": 0.9993727803230286, "sampling/importance_sampling_ratio/min": 0.6049659848213196, "sampling/sampling_logp_difference/max": 0.5025830268859863, "sampling/sampling_logp_difference/mean": 0.012970870360732079, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 271.953125, "completions/mean_terminated_length": 271.953125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.22955457866191864, "epoch": 0.5451327433628319, "frac_reward_zero_std": 0.75, "grad_norm": 1.0434067588609035, "kl": 0.05926692858338356, "learning_rate": 9.128706348889894e-07, "loss": -0.0341, "num_tokens": 7699288.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999916136264801, "sampling/importance_sampling_ratio/min": 0.3981122672557831, "sampling/sampling_logp_difference/max": 0.9210212230682373, "sampling/sampling_logp_difference/mean": 0.013223261572420597, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 220.453125, "completions/mean_terminated_length": 220.453125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2675577700138092, "epoch": 0.5469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 1.371522274605918, "kl": 0.053882721811532974, "learning_rate": 9.1199747050659e-07, "loss": 0.0148, "num_tokens": 7725685.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6155786514282227, "sampling/importance_sampling_ratio/mean": 1.0007877349853516, "sampling/importance_sampling_ratio/min": 0.6399836540222168, "sampling/sampling_logp_difference/max": 0.4796931743621826, "sampling/sampling_logp_difference/mean": 0.014492020010948181, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.20168085396289825, "epoch": 0.5486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 1.0648624180257262, "kl": 0.06684594601392746, "learning_rate": 9.111203746806439e-07, "loss": 0.0024, "num_tokens": 7753465.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995627403259277, "sampling/importance_sampling_ratio/min": 0.5129303932189941, "sampling/sampling_logp_difference/max": 0.9398455619812012, "sampling/sampling_logp_difference/mean": 0.012928429991006851, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 293.546875, "completions/mean_terminated_length": 293.546875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.2573712468147278, "epoch": 0.5504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.3494248732280365, "kl": 0.06218942627310753, "learning_rate": 9.102393557807476e-07, "loss": 0.0511, "num_tokens": 7784732.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6207278966903687, "sampling/importance_sampling_ratio/mean": 1.0007548332214355, "sampling/importance_sampling_ratio/min": 0.3408876359462738, "sampling/sampling_logp_difference/max": 1.076202392578125, "sampling/sampling_logp_difference/mean": 0.014025572687387466, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 143.734375, "completions/mean_terminated_length": 143.734375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.272970974445343, "epoch": 0.552212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.5239090892074956, "kl": 0.09259387850761414, "learning_rate": 9.093544222139337e-07, "loss": -0.0266, "num_tokens": 7804267.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8767484426498413, "sampling/importance_sampling_ratio/mean": 0.9993801116943359, "sampling/importance_sampling_ratio/min": 0.5369614362716675, "sampling/sampling_logp_difference/max": 0.6295406818389893, "sampling/sampling_logp_difference/mean": 0.016071217134594917, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 87.203125, "completions/mean_terminated_length": 87.203125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1955489069223404, "epoch": 0.5539823008849557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0688370073157419, "kl": 0.056738391518592834, "learning_rate": 9.084655824245897e-07, "loss": 0.0006, "num_tokens": 7820280.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7803267240524292, "sampling/importance_sampling_ratio/mean": 0.9988535642623901, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.5767968893051147, "sampling/sampling_logp_difference/mean": 0.014690269716084003, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 190.46875, "completions/mean_terminated_length": 190.46875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.29476311802864075, "epoch": 0.5557522123893806, "frac_reward_zero_std": 0.5, "grad_norm": 1.8588501899542456, "kl": 0.08590688556432724, "learning_rate": 9.075728448943781e-07, "loss": -0.0277, "num_tokens": 7843942.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5644317865371704, "sampling/importance_sampling_ratio/mean": 0.9990231990814209, "sampling/importance_sampling_ratio/min": 0.4227965772151947, "sampling/sampling_logp_difference/max": 0.8608641624450684, "sampling/sampling_logp_difference/mean": 0.015657000243663788, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 431.375, "completions/mean_terminated_length": 431.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.38982319831848145, "epoch": 0.5575221238938053, "frac_reward_zero_std": 0.0, "grad_norm": 1.2180201919397797, "kl": 0.06818792223930359, "learning_rate": 9.066762181421552e-07, "loss": -0.0736, "num_tokens": 7891678.0, "reward": 0.3125, "reward_std": 0.8416029214859009, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.627927303314209, "sampling/importance_sampling_ratio/mean": 0.9998992681503296, "sampling/importance_sampling_ratio/min": 0.632481575012207, "sampling/sampling_logp_difference/max": 0.4873075485229492, "sampling/sampling_logp_difference/mean": 0.015121396631002426, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 167.90625, "completions/mean_terminated_length": 167.90625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.22775554656982422, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 1.4788762934742388, "kl": 0.051848404109478, "learning_rate": 9.057757107238894e-07, "loss": -0.0282, "num_tokens": 7912824.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5800504684448242, "sampling/importance_sampling_ratio/mean": 0.9999009370803833, "sampling/importance_sampling_ratio/min": 0.5910179615020752, "sampling/sampling_logp_difference/max": 0.5259089469909668, "sampling/sampling_logp_difference/mean": 0.014747922308743, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 86.03125, "completions/mean_terminated_length": 86.03125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1674557328224182, "epoch": 0.5610619469026549, "frac_reward_zero_std": 1.0, "grad_norm": 0.08761311265151135, "kl": 0.04727327451109886, "learning_rate": 9.048713312325804e-07, "loss": 0.0005, "num_tokens": 7927994.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6030267477035522, "sampling/importance_sampling_ratio/mean": 1.0001249313354492, "sampling/importance_sampling_ratio/min": 0.4166453182697296, "sampling/sampling_logp_difference/max": 0.8755199909210205, "sampling/sampling_logp_difference/mean": 0.014558599330484867, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 162.328125, "completions/mean_terminated_length": 162.328125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2823961079120636, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 1.411193019722694, "kl": 0.09012647718191147, "learning_rate": 9.039630882981768e-07, "loss": 0.0146, "num_tokens": 7949519.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000240802764893, "sampling/importance_sampling_ratio/min": 0.5999212861061096, "sampling/sampling_logp_difference/max": 1.346999168395996, "sampling/sampling_logp_difference/mean": 0.01743931509554386, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 288.859375, "completions/mean_terminated_length": 288.859375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2848256826400757, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.75, "grad_norm": 1.129495285915146, "kl": 0.06930936872959137, "learning_rate": 9.030509905874932e-07, "loss": 0.0024, "num_tokens": 7981094.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005121231079102, "sampling/importance_sampling_ratio/min": 0.6151832938194275, "sampling/sampling_logp_difference/max": 0.8308603763580322, "sampling/sampling_logp_difference/mean": 0.01379585824906826, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 295.28125, "completions/mean_terminated_length": 295.28125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2820858955383301, "epoch": 0.5663716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.053876974919179275, "kl": 0.05944392830133438, "learning_rate": 9.021350468041287e-07, "loss": 0.0006, "num_tokens": 8012088.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5042253732681274, "sampling/importance_sampling_ratio/mean": 1.0006166696548462, "sampling/importance_sampling_ratio/min": 0.39063671231269836, "sampling/sampling_logp_difference/max": 0.9399772882461548, "sampling/sampling_logp_difference/mean": 0.015095303766429424, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 255.703125, "completions/mean_terminated_length": 255.703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3029351830482483, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.75, "grad_norm": 0.9249473645018048, "kl": 0.07730159908533096, "learning_rate": 9.012152656883822e-07, "loss": 0.0058, "num_tokens": 8042949.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0001577138900757, "sampling/importance_sampling_ratio/min": 0.48635271191596985, "sampling/sampling_logp_difference/max": 0.7208211421966553, "sampling/sampling_logp_difference/mean": 0.01639869436621666, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 136.546875, "completions/mean_terminated_length": 136.546875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16663135588169098, "epoch": 0.5699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.07524105326785883, "kl": 0.057716891169548035, "learning_rate": 9.002916560171712e-07, "loss": 0.0006, "num_tokens": 8060936.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994326829910278, "sampling/importance_sampling_ratio/min": 0.5884997248649597, "sampling/sampling_logp_difference/max": 0.7073221206665039, "sampling/sampling_logp_difference/mean": 0.013074960559606552, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 459.71875, "completions/mean_terminated_length": 459.71875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.2107498049736023, "epoch": 0.5716814159292035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0371303132084706, "kl": 0.05287554860115051, "learning_rate": 8.993642266039456e-07, "loss": 0.0004, "num_tokens": 8103222.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5756126642227173, "sampling/importance_sampling_ratio/mean": 1.0006787776947021, "sampling/importance_sampling_ratio/min": 0.662255048751831, "sampling/sampling_logp_difference/max": 0.45464420318603516, "sampling/sampling_logp_difference/mean": 0.01088624820113182, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 309.9375, "completions/mean_terminated_length": 309.9375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3447498381137848, "epoch": 0.5734513274336284, "frac_reward_zero_std": 0.25, "grad_norm": 1.471981673379903, "kl": 0.06898567080497742, "learning_rate": 8.984329862986055e-07, "loss": 0.0647, "num_tokens": 8137122.0, "reward": 0.03125, "reward_std": 0.6424696445465088, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 0.9995123147964478, "sampling/importance_sampling_ratio/min": 0.47005900740623474, "sampling/sampling_logp_difference/max": 0.7548971176147461, "sampling/sampling_logp_difference/mean": 0.01564585044980049, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 300.59375, "completions/mean_terminated_length": 300.59375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.19981971383094788, "epoch": 0.5752212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.04307987538335349, "kl": 0.05091937258839607, "learning_rate": 8.97497943987416e-07, "loss": 0.0004, "num_tokens": 8166664.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9999165534973145, "sampling/importance_sampling_ratio/min": 0.4996969401836395, "sampling/sampling_logp_difference/max": 0.6937534809112549, "sampling/sampling_logp_difference/mean": 0.011926661245524883, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 67.03125, "completions/mean_terminated_length": 67.03125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.15848270058631897, "epoch": 0.5769911504424778, "frac_reward_zero_std": 1.0, "grad_norm": 0.11010405353679283, "kl": 0.057478707283735275, "learning_rate": 8.96559108592922e-07, "loss": 0.0006, "num_tokens": 8179818.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5761481523513794, "sampling/importance_sampling_ratio/mean": 1.0005512237548828, "sampling/importance_sampling_ratio/min": 0.6222555637359619, "sampling/sampling_logp_difference/max": 0.47440433502197266, "sampling/sampling_logp_difference/mean": 0.013758720830082893, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 291.90625, "completions/mean_terminated_length": 291.90625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.21484702825546265, "epoch": 0.5787610619469027, "frac_reward_zero_std": 0.75, "grad_norm": 0.7765090874023917, "kl": 0.05281621590256691, "learning_rate": 8.956164890738642e-07, "loss": 0.0134, "num_tokens": 8210052.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5708550214767456, "sampling/importance_sampling_ratio/mean": 1.000569224357605, "sampling/importance_sampling_ratio/min": 0.47638097405433655, "sampling/sampling_logp_difference/max": 0.74153733253479, "sampling/sampling_logp_difference/mean": 0.012912696227431297, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 169.34375, "completions/mean_terminated_length": 169.34375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1510034054517746, "epoch": 0.5805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.07765595642037981, "kl": 0.05827036872506142, "learning_rate": 8.946700944250924e-07, "loss": 0.0005, "num_tokens": 8230810.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00015127658844, "sampling/importance_sampling_ratio/min": 0.5813713073730469, "sampling/sampling_logp_difference/max": 1.3661885261535645, "sampling/sampling_logp_difference/mean": 0.010614488273859024, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 280.359375, "completions/mean_terminated_length": 280.359375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.25733235478401184, "epoch": 0.5823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 1.0561407709077029, "kl": 0.05320676416158676, "learning_rate": 8.937199336774804e-07, "loss": 0.027, "num_tokens": 8260833.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6407109498977661, "sampling/importance_sampling_ratio/mean": 0.9997061491012573, "sampling/importance_sampling_ratio/min": 0.5191895365715027, "sampling/sampling_logp_difference/max": 0.6554862260818481, "sampling/sampling_logp_difference/mean": 0.013459185138344765, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 363.21875, "completions/mean_terminated_length": 363.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2740519046783447, "epoch": 0.584070796460177, "frac_reward_zero_std": 0.25, "grad_norm": 1.37126677909582, "kl": 0.07458266615867615, "learning_rate": 8.927660158978392e-07, "loss": 0.041, "num_tokens": 8297231.0, "reward": -0.21875, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009007453918457, "sampling/importance_sampling_ratio/min": 0.5382511019706726, "sampling/sampling_logp_difference/max": 0.7049427032470703, "sampling/sampling_logp_difference/mean": 0.013254882767796516, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 172.890625, "completions/mean_terminated_length": 172.890625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.1928461492061615, "epoch": 0.5858407079646017, "frac_reward_zero_std": 1.0, "grad_norm": 0.04357545424285125, "kl": 0.05610031634569168, "learning_rate": 8.918083501888316e-07, "loss": 0.0005, "num_tokens": 8319784.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001771450042725, "sampling/importance_sampling_ratio/min": 0.5056702494621277, "sampling/sampling_logp_difference/max": 0.7203598022460938, "sampling/sampling_logp_difference/mean": 0.01341334730386734, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 231.265625, "completions/mean_terminated_length": 231.265625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2923291325569153, "epoch": 0.5876106194690266, "frac_reward_zero_std": 0.5, "grad_norm": 1.3233262743739813, "kl": 0.08021809905767441, "learning_rate": 8.908469456888843e-07, "loss": 0.0326, "num_tokens": 8348473.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5302538871765137, "sampling/importance_sampling_ratio/mean": 0.9991505146026611, "sampling/importance_sampling_ratio/min": 0.590603232383728, "sampling/sampling_logp_difference/max": 0.5266108512878418, "sampling/sampling_logp_difference/mean": 0.015011979267001152, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 274.359375, "completions/mean_terminated_length": 274.359375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.19611316919326782, "epoch": 0.5893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.02954319972454024, "kl": 0.0521879680454731, "learning_rate": 8.898818115721007e-07, "loss": 0.0004, "num_tokens": 8375952.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6398762464523315, "sampling/importance_sampling_ratio/mean": 1.0001055002212524, "sampling/importance_sampling_ratio/min": 0.4978253245353699, "sampling/sampling_logp_difference/max": 0.6975060701370239, "sampling/sampling_logp_difference/mean": 0.012420076876878738, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.19874125719070435, "epoch": 0.5911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.07069322033226982, "kl": 0.0700312927365303, "learning_rate": 8.889129570481741e-07, "loss": 0.0008, "num_tokens": 8396777.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.615945816040039, "sampling/importance_sampling_ratio/mean": 1.0007848739624023, "sampling/importance_sampling_ratio/min": 0.6252050995826721, "sampling/sampling_logp_difference/max": 0.4799203872680664, "sampling/sampling_logp_difference/mean": 0.011867159977555275, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 196.234375, "completions/mean_terminated_length": 196.234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17857015132904053, "epoch": 0.5929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.08997393356125598, "kl": 0.07307864725589752, "learning_rate": 8.879403913622996e-07, "loss": 0.0005, "num_tokens": 8419160.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5557953119277954, "sampling/importance_sampling_ratio/mean": 0.9993423223495483, "sampling/importance_sampling_ratio/min": 0.5362957715988159, "sampling/sampling_logp_difference/max": 0.6230694055557251, "sampling/sampling_logp_difference/mean": 0.013124162331223488, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 137.734375, "completions/mean_terminated_length": 137.734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.23060771822929382, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 1.5729627295539523, "kl": 0.07023642957210541, "learning_rate": 8.869641237950849e-07, "loss": 0.0096, "num_tokens": 8439159.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001330375671387, "sampling/importance_sampling_ratio/min": 0.37725985050201416, "sampling/sampling_logp_difference/max": 0.9748210906982422, "sampling/sampling_logp_difference/mean": 0.014551296830177307, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 152.578125, "completions/mean_terminated_length": 152.578125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2595953047275543, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.5, "grad_norm": 2.2799235127671804, "kl": 0.11811453104019165, "learning_rate": 8.859841636624631e-07, "loss": -0.0076, "num_tokens": 8459436.0, "reward": 0.375, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5285509824752808, "sampling/importance_sampling_ratio/mean": 0.9994604587554932, "sampling/importance_sampling_ratio/min": 0.6150755882263184, "sampling/sampling_logp_difference/max": 0.4860100746154785, "sampling/sampling_logp_difference/mean": 0.016149334609508514, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 81.765625, "completions/mean_terminated_length": 81.765625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16701452434062958, "epoch": 0.5982300884955752, "frac_reward_zero_std": 1.0, "grad_norm": 0.10530513778654406, "kl": 0.0621553435921669, "learning_rate": 8.850005203156034e-07, "loss": 0.0006, "num_tokens": 8475117.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001121759414673, "sampling/importance_sampling_ratio/min": 0.6090817451477051, "sampling/sampling_logp_difference/max": 0.9516317844390869, "sampling/sampling_logp_difference/mean": 0.013827698305249214, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 124.578125, "completions/mean_terminated_length": 124.578125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1291477382183075, "epoch": 0.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.05822591204347676, "kl": 0.05272146314382553, "learning_rate": 8.84013203140821e-07, "loss": 0.0005, "num_tokens": 8492530.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9989855289459229, "sampling/importance_sampling_ratio/min": 0.6146249771118164, "sampling/sampling_logp_difference/max": 0.4867429733276367, "sampling/sampling_logp_difference/mean": 0.010098088532686234, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 83.0625, "completions/mean_terminated_length": 83.0625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1791755110025406, "epoch": 0.6017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.5696757305028878, "kl": 0.07870863378047943, "learning_rate": 8.83022221559489e-07, "loss": 0.0008, "num_tokens": 8508358.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8014352321624756, "sampling/importance_sampling_ratio/mean": 1.0011571645736694, "sampling/importance_sampling_ratio/min": 0.49721482396125793, "sampling/sampling_logp_difference/max": 0.6987330913543701, "sampling/sampling_logp_difference/mean": 0.013631206005811691, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 184.96875, "completions/mean_terminated_length": 184.96875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.24097266793251038, "epoch": 0.6035398230088496, "frac_reward_zero_std": 0.75, "grad_norm": 1.2865019127325843, "kl": 0.06397417932748795, "learning_rate": 8.820275850279472e-07, "loss": 0.0096, "num_tokens": 8531380.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.97219979763031, "sampling/importance_sampling_ratio/mean": 0.9997183084487915, "sampling/importance_sampling_ratio/min": 0.6128032803535461, "sampling/sampling_logp_difference/max": 0.6791496276855469, "sampling/sampling_logp_difference/mean": 0.013794638216495514, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 187.3125, "completions/mean_terminated_length": 187.3125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2940760552883148, "epoch": 0.6053097345132743, "frac_reward_zero_std": 0.5, "grad_norm": 1.791038651762735, "kl": 0.07705433666706085, "learning_rate": 8.810293030374125e-07, "loss": 0.0179, "num_tokens": 8554856.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6143684387207031, "sampling/importance_sampling_ratio/mean": 0.9993451833724976, "sampling/importance_sampling_ratio/min": 0.6150063872337341, "sampling/sampling_logp_difference/max": 0.48612260818481445, "sampling/sampling_logp_difference/mean": 0.01551968976855278, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 104.125, "completions/mean_terminated_length": 104.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16406409442424774, "epoch": 0.6070796460176991, "frac_reward_zero_std": 1.0, "grad_norm": 0.10499089296345146, "kl": 0.06788899749517441, "learning_rate": 8.800273851138882e-07, "loss": 0.0008, "num_tokens": 8571680.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.951509952545166, "sampling/importance_sampling_ratio/mean": 1.000935673713684, "sampling/importance_sampling_ratio/min": 0.5362957715988159, "sampling/sampling_logp_difference/max": 0.6686034202575684, "sampling/sampling_logp_difference/mean": 0.012094831094145775, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 132.4375, "completions/mean_terminated_length": 132.4375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.22538694739341736, "epoch": 0.6088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 2.0000968985454306, "kl": 0.07399487495422363, "learning_rate": 8.790218408180734e-07, "loss": 0.0417, "num_tokens": 8592044.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5879703760147095, "sampling/importance_sampling_ratio/mean": 1.0010604858398438, "sampling/importance_sampling_ratio/min": 0.5051171183586121, "sampling/sampling_logp_difference/max": 0.6829650402069092, "sampling/sampling_logp_difference/mean": 0.013496413826942444, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 224.5625, "completions/mean_terminated_length": 224.5625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2487189918756485, "epoch": 0.6106194690265486, "frac_reward_zero_std": 0.75, "grad_norm": 1.1415354252268628, "kl": 0.06451243162155151, "learning_rate": 8.780126797452712e-07, "loss": 0.0029, "num_tokens": 8619136.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6598302125930786, "sampling/importance_sampling_ratio/mean": 1.000083565711975, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.5067152976989746, "sampling/sampling_logp_difference/mean": 0.013567602261900902, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.22149424254894257, "epoch": 0.6123893805309735, "frac_reward_zero_std": 0.5, "grad_norm": 1.5703518229608118, "kl": 0.06588408350944519, "learning_rate": 8.769999115252975e-07, "loss": 0.0943, "num_tokens": 8646600.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6341969966888428, "sampling/importance_sampling_ratio/mean": 1.0001894235610962, "sampling/importance_sampling_ratio/min": 0.6255112886428833, "sampling/sampling_logp_difference/max": 0.4911515712738037, "sampling/sampling_logp_difference/mean": 0.012377671897411346, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 117.8125, "completions/mean_terminated_length": 117.8125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.20189084112644196, "epoch": 0.6141592920353982, "frac_reward_zero_std": 0.75, "grad_norm": 1.8138896234674253, "kl": 0.05739222839474678, "learning_rate": 8.759835458223887e-07, "loss": 0.0008, "num_tokens": 8664700.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9370975494384766, "sampling/importance_sampling_ratio/mean": 0.9998288154602051, "sampling/importance_sampling_ratio/min": 0.6521785259246826, "sampling/sampling_logp_difference/max": 0.6611907482147217, "sampling/sampling_logp_difference/mean": 0.01360551081597805, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 160.078125, "completions/mean_terminated_length": 160.078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2440776526927948, "epoch": 0.6159292035398231, "frac_reward_zero_std": 0.75, "grad_norm": 1.7613896293278415, "kl": 0.071126788854599, "learning_rate": 8.749635923351106e-07, "loss": -0.0216, "num_tokens": 8686289.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6740548610687256, "sampling/importance_sampling_ratio/mean": 0.9998148083686829, "sampling/importance_sampling_ratio/min": 0.4597637951374054, "sampling/sampling_logp_difference/max": 0.7770423889160156, "sampling/sampling_logp_difference/mean": 0.016347207129001617, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 143.296875, "completions/mean_terminated_length": 143.296875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.1951998472213745, "epoch": 0.6176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.918197884818392, "kl": 0.07605881989002228, "learning_rate": 8.739400607962644e-07, "loss": -0.0193, "num_tokens": 8706740.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000845193862915, "sampling/importance_sampling_ratio/min": 0.6092931628227234, "sampling/sampling_logp_difference/max": 0.9780411720275879, "sampling/sampling_logp_difference/mean": 0.013042790815234184, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 126.609375, "completions/mean_terminated_length": 126.609375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1360510289669037, "epoch": 0.6194690265486725, "frac_reward_zero_std": 1.0, "grad_norm": 0.09226234964473169, "kl": 0.05438254773616791, "learning_rate": 8.729129609727946e-07, "loss": 0.0005, "num_tokens": 8723403.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6970200538635254, "sampling/importance_sampling_ratio/mean": 1.0000211000442505, "sampling/importance_sampling_ratio/min": 0.6099864840507507, "sampling/sampling_logp_difference/max": 0.5288738012313843, "sampling/sampling_logp_difference/mean": 0.010959561914205551, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 140.09375, "completions/mean_terminated_length": 140.09375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.24002408981323242, "epoch": 0.6212389380530974, "frac_reward_zero_std": 0.5, "grad_norm": 2.2546419650657046, "kl": 0.08410458266735077, "learning_rate": 8.718823026656958e-07, "loss": 0.0344, "num_tokens": 8744001.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008864402770996, "sampling/importance_sampling_ratio/min": 0.6227321624755859, "sampling/sampling_logp_difference/max": 0.736912727355957, "sampling/sampling_logp_difference/mean": 0.014057939872145653, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 150.765625, "completions/mean_terminated_length": 150.765625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.20358611643314362, "epoch": 0.6230088495575221, "frac_reward_zero_std": 0.75, "grad_norm": 1.4775025989860437, "kl": 0.08161047101020813, "learning_rate": 8.708480957099193e-07, "loss": -0.0306, "num_tokens": 8763026.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 0.9997013807296753, "sampling/importance_sampling_ratio/min": 0.5773112773895264, "sampling/sampling_logp_difference/max": 0.5493736267089844, "sampling/sampling_logp_difference/mean": 0.01194295845925808, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 144.859375, "completions/mean_terminated_length": 144.859375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.218020498752594, "epoch": 0.6247787610619469, "frac_reward_zero_std": 0.5, "grad_norm": 2.335798143126818, "kl": 0.08045586198568344, "learning_rate": 8.698103499742783e-07, "loss": -0.0078, "num_tokens": 8782281.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5969594717025757, "sampling/importance_sampling_ratio/mean": 0.9999384880065918, "sampling/importance_sampling_ratio/min": 0.5707905292510986, "sampling/sampling_logp_difference/max": 0.5607329607009888, "sampling/sampling_logp_difference/mean": 0.01421868521720171, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 107.015625, "completions/mean_terminated_length": 107.015625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.20371389389038086, "epoch": 0.6265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 2.4109434891789103, "kl": 0.09943768382072449, "learning_rate": 8.687690753613554e-07, "loss": -0.0154, "num_tokens": 8800890.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5770983695983887, "sampling/importance_sampling_ratio/mean": 1.0010172128677368, "sampling/importance_sampling_ratio/min": 0.6097264885902405, "sampling/sampling_logp_difference/max": 0.49474477767944336, "sampling/sampling_logp_difference/mean": 0.014848465099930763, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 80.671875, "completions/mean_terminated_length": 80.671875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14936864376068115, "epoch": 0.6283185840707964, "frac_reward_zero_std": 1.0, "grad_norm": 0.09142014151932654, "kl": 0.06150337681174278, "learning_rate": 8.677242818074062e-07, "loss": 0.0006, "num_tokens": 8815797.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.921689748764038, "sampling/importance_sampling_ratio/mean": 1.000291109085083, "sampling/importance_sampling_ratio/min": 0.6211443543434143, "sampling/sampling_logp_difference/max": 0.6532049179077148, "sampling/sampling_logp_difference/mean": 0.014118233695626259, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 182.453125, "completions/mean_terminated_length": 182.453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.16201169788837433, "epoch": 0.6300884955752213, "frac_reward_zero_std": 0.75, "grad_norm": 1.6713685332979473, "kl": 0.05448603630065918, "learning_rate": 8.666759792822661e-07, "loss": -0.0304, "num_tokens": 8850402.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9990293979644775, "sampling/importance_sampling_ratio/min": 0.4361476004123688, "sampling/sampling_logp_difference/max": 0.8386523723602295, "sampling/sampling_logp_difference/mean": 0.012812724336981773, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 171.3125, "completions/mean_terminated_length": 171.3125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2026711106300354, "epoch": 0.631858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 1.866750549489719, "kl": 0.08422470837831497, "learning_rate": 8.656241777892542e-07, "loss": -0.0553, "num_tokens": 8871782.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6574535369873047, "sampling/importance_sampling_ratio/mean": 0.999561071395874, "sampling/importance_sampling_ratio/min": 0.3985403776168823, "sampling/sampling_logp_difference/max": 0.9199464321136475, "sampling/sampling_logp_difference/mean": 0.012770526111125946, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 109.203125, "completions/mean_terminated_length": 109.203125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.13958784937858582, "epoch": 0.6336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.08645563618042214, "kl": 0.0448039248585701, "learning_rate": 8.645688873650784e-07, "loss": 0.0004, "num_tokens": 8888531.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6463156938552856, "sampling/importance_sampling_ratio/mean": 1.0001178979873657, "sampling/importance_sampling_ratio/min": 0.6457481384277344, "sampling/sampling_logp_difference/max": 0.49853992462158203, "sampling/sampling_logp_difference/mean": 0.010816150344908237, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 116.9375, "completions/mean_terminated_length": 116.9375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19813725352287292, "epoch": 0.6353982300884956, "frac_reward_zero_std": 0.75, "grad_norm": 1.8021628980923308, "kl": 0.10695920139551163, "learning_rate": 8.63510118079739e-07, "loss": -0.0043, "num_tokens": 8906207.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9630107879638672, "sampling/importance_sampling_ratio/mean": 0.9996039867401123, "sampling/importance_sampling_ratio/min": 0.6026878952980042, "sampling/sampling_logp_difference/max": 0.6744794845581055, "sampling/sampling_logp_difference/mean": 0.015852531418204308, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 136.59375, "completions/mean_terminated_length": 136.59375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17604196071624756, "epoch": 0.6371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 1.7990998385976877, "kl": 0.05781039595603943, "learning_rate": 8.624478800364331e-07, "loss": -0.0214, "num_tokens": 8928293.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 1.0000381469726562, "sampling/importance_sampling_ratio/min": 0.5042073130607605, "sampling/sampling_logp_difference/max": 0.6847677230834961, "sampling/sampling_logp_difference/mean": 0.012718873098492622, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 133.90625, "completions/mean_terminated_length": 133.90625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19571951031684875, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 1.824136599352241, "kl": 0.08511583507061005, "learning_rate": 8.613821833714583e-07, "loss": -0.029, "num_tokens": 8948383.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5733596086502075, "sampling/importance_sampling_ratio/mean": 0.9997984170913696, "sampling/importance_sampling_ratio/min": 0.41302263736724854, "sampling/sampling_logp_difference/max": 0.8842529058456421, "sampling/sampling_logp_difference/mean": 0.012984220869839191, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 250.03125, "completions/mean_terminated_length": 250.03125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2505485415458679, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.5, "grad_norm": 1.4488346443495308, "kl": 0.07486391067504883, "learning_rate": 8.603130382541155e-07, "loss": 0.0244, "num_tokens": 8978657.0, "reward": -0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5896488428115845, "sampling/importance_sampling_ratio/mean": 0.999321699142456, "sampling/importance_sampling_ratio/min": 0.6240894794464111, "sampling/sampling_logp_difference/max": 0.47146153450012207, "sampling/sampling_logp_difference/mean": 0.014886381104588509, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 204.28125, "completions/mean_terminated_length": 204.28125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.18027278780937195, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 1.6285089428667896, "kl": 0.0981227308511734, "learning_rate": 8.592404548866122e-07, "loss": -0.0298, "num_tokens": 9003651.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5082465410232544, "sampling/importance_sampling_ratio/mean": 0.9994364976882935, "sampling/importance_sampling_ratio/min": 0.533735454082489, "sampling/sampling_logp_difference/max": 0.6278549432754517, "sampling/sampling_logp_difference/mean": 0.012533634901046753, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 108.4375, "completions/mean_terminated_length": 108.4375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.18993332982063293, "epoch": 0.6442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 2.0095236298127017, "kl": 0.06518308818340302, "learning_rate": 8.58164443503965e-07, "loss": -0.0123, "num_tokens": 9022095.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5596710443496704, "sampling/importance_sampling_ratio/mean": 0.9990471601486206, "sampling/importance_sampling_ratio/min": 0.3990035951137543, "sampling/sampling_logp_difference/max": 0.9187848567962646, "sampling/sampling_logp_difference/mean": 0.015230879187583923, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 143.1875, "completions/mean_terminated_length": 143.1875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2442074567079544, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.5, "grad_norm": 2.9246010266843836, "kl": 0.10189088433980942, "learning_rate": 8.570850143739021e-07, "loss": -0.0608, "num_tokens": 9043627.0, "reward": -0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.614548683166504, "sampling/importance_sampling_ratio/mean": 1.000311255455017, "sampling/importance_sampling_ratio/min": 0.5488314032554626, "sampling/sampling_logp_difference/max": 0.599963903427124, "sampling/sampling_logp_difference/mean": 0.01644780859351158, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 133.703125, "completions/mean_terminated_length": 133.703125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1803325116634369, "epoch": 0.647787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.15263840655309271, "kl": 0.07733675092458725, "learning_rate": 8.560021777967648e-07, "loss": 0.0009, "num_tokens": 9062008.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000643730163574, "sampling/importance_sampling_ratio/min": 0.5964999198913574, "sampling/sampling_logp_difference/max": 0.8378231525421143, "sampling/sampling_logp_difference/mean": 0.013274619355797768, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 149.09375, "completions/mean_terminated_length": 149.09375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.22909826040267944, "epoch": 0.6495575221238938, "frac_reward_zero_std": 0.5, "grad_norm": 2.2541402015671834, "kl": 0.11110097169876099, "learning_rate": 8.549159441054104e-07, "loss": -0.0221, "num_tokens": 9084142.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000769019126892, "sampling/importance_sampling_ratio/min": 0.6309916973114014, "sampling/sampling_logp_difference/max": 0.8502066135406494, "sampling/sampling_logp_difference/mean": 0.01637434959411621, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 180.640625, "completions/mean_terminated_length": 180.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1764160394668579, "epoch": 0.6513274336283186, "frac_reward_zero_std": 0.5, "grad_norm": 2.1355237393914055, "kl": 0.09981416165828705, "learning_rate": 8.538263236651117e-07, "loss": -0.0183, "num_tokens": 9107047.0, "reward": 0.4375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000234842300415, "sampling/importance_sampling_ratio/min": 0.6179380416870117, "sampling/sampling_logp_difference/max": 0.8145461082458496, "sampling/sampling_logp_difference/mean": 0.011960906907916069, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 171.90625, "completions/mean_terminated_length": 171.90625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.22818566858768463, "epoch": 0.6530973451327433, "frac_reward_zero_std": 1.0, "grad_norm": 0.23519524463410166, "kl": 0.13261058926582336, "learning_rate": 8.527333268734606e-07, "loss": 0.0014, "num_tokens": 9129441.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.810007095336914, "sampling/importance_sampling_ratio/mean": 1.0005247592926025, "sampling/importance_sampling_ratio/min": 0.621735692024231, "sampling/sampling_logp_difference/max": 0.5933307409286499, "sampling/sampling_logp_difference/mean": 0.014731564559042454, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 235.3125, "completions/mean_terminated_length": 235.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2126263678073883, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.5, "grad_norm": 1.2264701210139088, "kl": 0.11038054525852203, "learning_rate": 8.516369641602661e-07, "loss": -0.0127, "num_tokens": 9160277.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6069321632385254, "sampling/importance_sampling_ratio/mean": 0.9996969699859619, "sampling/importance_sampling_ratio/min": 0.5677486658096313, "sampling/sampling_logp_difference/max": 0.5660765171051025, "sampling/sampling_logp_difference/mean": 0.014379335567355156, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 243.03125, "completions/mean_terminated_length": 243.03125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.1508035808801651, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.5, "grad_norm": 1.5703032449807368, "kl": 0.08967246860265732, "learning_rate": 8.505372459874571e-07, "loss": 0.0148, "num_tokens": 9187351.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.0003345012664795, "sampling/importance_sampling_ratio/min": 0.5754743814468384, "sampling/sampling_logp_difference/max": 0.5525606274604797, "sampling/sampling_logp_difference/mean": 0.010215790010988712, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 140.40625, "completions/mean_terminated_length": 140.40625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2033536732196808, "epoch": 0.6584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.8544324741911449, "kl": 0.09469205141067505, "learning_rate": 8.494341828489812e-07, "loss": 0.0184, "num_tokens": 9207025.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6909304857254028, "sampling/importance_sampling_ratio/mean": 1.0011603832244873, "sampling/importance_sampling_ratio/min": 0.5753291249275208, "sampling/sampling_logp_difference/max": 0.5528130531311035, "sampling/sampling_logp_difference/mean": 0.013603290542960167, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 245.203125, "completions/mean_terminated_length": 245.203125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.23742641508579254, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.5, "grad_norm": 1.2982514742545086, "kl": 0.10852575302124023, "learning_rate": 8.483277852707052e-07, "loss": 0.0041, "num_tokens": 9235758.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6552973985671997, "sampling/importance_sampling_ratio/mean": 0.9996548295021057, "sampling/importance_sampling_ratio/min": 0.5545271039009094, "sampling/sampling_logp_difference/max": 0.5896396636962891, "sampling/sampling_logp_difference/mean": 0.014441119506955147, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 181.453125, "completions/mean_terminated_length": 181.453125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.18451139330863953, "epoch": 0.6619469026548672, "frac_reward_zero_std": 1.0, "grad_norm": 0.13584844995995202, "kl": 0.08872827142477036, "learning_rate": 8.472180638103143e-07, "loss": 0.0009, "num_tokens": 9260315.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001802444458008, "sampling/importance_sampling_ratio/min": 0.47875773906707764, "sampling/sampling_logp_difference/max": 1.686039924621582, "sampling/sampling_logp_difference/mean": 0.014497868716716766, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.19816793501377106, "epoch": 0.6637168141592921, "frac_reward_zero_std": 0.75, "grad_norm": 1.3257615129954676, "kl": 0.11812318861484528, "learning_rate": 8.461050290572113e-07, "loss": -0.0632, "num_tokens": 9287875.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996795058250427, "sampling/importance_sampling_ratio/min": 0.48100516200065613, "sampling/sampling_logp_difference/max": 0.7942094802856445, "sampling/sampling_logp_difference/mean": 0.012954211793839931, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 168.234375, "completions/mean_terminated_length": 168.234375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.17321261763572693, "epoch": 0.6654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 1.667626246997175, "kl": 0.08773798495531082, "learning_rate": 8.449886916324166e-07, "loss": 0.0156, "num_tokens": 9309394.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9187790155410767, "sampling/importance_sampling_ratio/mean": 1.0006718635559082, "sampling/importance_sampling_ratio/min": 0.37553900480270386, "sampling/sampling_logp_difference/max": 0.9793930053710938, "sampling/sampling_logp_difference/mean": 0.013548526912927628, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 172.203125, "completions/mean_terminated_length": 172.203125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.11970452964305878, "epoch": 0.6672566371681415, "frac_reward_zero_std": 0.75, "grad_norm": 1.6790518566291464, "kl": 0.06799526512622833, "learning_rate": 8.438690621884649e-07, "loss": 0.0349, "num_tokens": 9329407.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6481878757476807, "sampling/importance_sampling_ratio/mean": 0.9998011589050293, "sampling/importance_sampling_ratio/min": 0.48823168873786926, "sampling/sampling_logp_difference/max": 0.7169651985168457, "sampling/sampling_logp_difference/mean": 0.010066435672342777, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.16142089664936066, "epoch": 0.6690265486725664, "frac_reward_zero_std": 0.75, "grad_norm": 1.3902731611782742, "kl": 0.08437947928905487, "learning_rate": 8.427461514093055e-07, "loss": 0.0203, "num_tokens": 9349159.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4028136730194092, "sampling/importance_sampling_ratio/mean": 0.9997028708457947, "sampling/importance_sampling_ratio/min": 0.4666607975959778, "sampling/sampling_logp_difference/max": 0.7621526122093201, "sampling/sampling_logp_difference/mean": 0.012503237463533878, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.21429522335529327, "epoch": 0.6707964601769911, "frac_reward_zero_std": 0.5, "grad_norm": 1.9669430683432334, "kl": 0.10327700525522232, "learning_rate": 8.41619970010199e-07, "loss": -0.0967, "num_tokens": 9373143.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000718832015991, "sampling/importance_sampling_ratio/min": 0.4802923798561096, "sampling/sampling_logp_difference/max": 0.9300475120544434, "sampling/sampling_logp_difference/mean": 0.015657780691981316, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 160.90625, "completions/mean_terminated_length": 160.90625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.16257032752037048, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 1.0429034433855109, "kl": 0.09608946740627289, "learning_rate": 8.404905287376157e-07, "loss": -0.024, "num_tokens": 9393889.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.668885350227356, "sampling/importance_sampling_ratio/mean": 0.9993311166763306, "sampling/importance_sampling_ratio/min": 0.5570504665374756, "sampling/sampling_logp_difference/max": 0.585099458694458, "sampling/sampling_logp_difference/mean": 0.013087260536849499, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 185.0625, "completions/mean_terminated_length": 185.0625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.14384813606739044, "epoch": 0.6743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 1.243586360450938, "kl": 0.08285379409790039, "learning_rate": 8.393578383691328e-07, "loss": -0.0439, "num_tokens": 9416517.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7818981409072876, "sampling/importance_sampling_ratio/mean": 1.0000312328338623, "sampling/importance_sampling_ratio/min": 0.6181708574295044, "sampling/sampling_logp_difference/max": 0.5776791572570801, "sampling/sampling_logp_difference/mean": 0.010202301666140556, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 169.921875, "completions/mean_terminated_length": 169.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20411460101604462, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.5, "grad_norm": 1.7860184898517784, "kl": 0.14919862151145935, "learning_rate": 8.382219097133323e-07, "loss": -0.0104, "num_tokens": 9438160.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.808617115020752, "sampling/importance_sampling_ratio/mean": 0.9995595216751099, "sampling/importance_sampling_ratio/min": 0.6202918291091919, "sampling/sampling_logp_difference/max": 0.5925625562667847, "sampling/sampling_logp_difference/mean": 0.014305366203188896, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 226.8125, "completions/mean_terminated_length": 226.8125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.16151058673858643, "epoch": 0.6778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.06741503320936171, "kl": 0.06487227231264114, "learning_rate": 8.370827536096964e-07, "loss": 0.0006, "num_tokens": 9463012.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6220383644104004, "sampling/importance_sampling_ratio/mean": 0.9992915391921997, "sampling/importance_sampling_ratio/min": 0.4944916367530823, "sampling/sampling_logp_difference/max": 0.7042250633239746, "sampling/sampling_logp_difference/mean": 0.010601913556456566, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 202.203125, "completions/mean_terminated_length": 202.203125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17492903769016266, "epoch": 0.679646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.09674620540581445, "kl": 0.09439212083816528, "learning_rate": 8.359403809285053e-07, "loss": 0.0007, "num_tokens": 9487009.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997942447662354, "sampling/importance_sampling_ratio/min": 0.43054482340812683, "sampling/sampling_logp_difference/max": 1.1349756717681885, "sampling/sampling_logp_difference/mean": 0.01437060534954071, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 169.359375, "completions/mean_terminated_length": 169.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1787954866886139, "epoch": 0.6814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.50379604266135, "kl": 0.0771775022149086, "learning_rate": 8.347948025707329e-07, "loss": -0.0031, "num_tokens": 9510648.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6106988191604614, "sampling/importance_sampling_ratio/mean": 0.9999945759773254, "sampling/importance_sampling_ratio/min": 0.5042071342468262, "sampling/sampling_logp_difference/max": 0.6847681999206543, "sampling/sampling_logp_difference/mean": 0.012294610030949116, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 261.671875, "completions/mean_terminated_length": 261.671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.20040848851203918, "epoch": 0.6831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.9028901692747431, "kl": 0.07755372673273087, "learning_rate": 8.336460294679431e-07, "loss": 0.0089, "num_tokens": 9538099.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8187596797943115, "sampling/importance_sampling_ratio/mean": 1.0000616312026978, "sampling/importance_sampling_ratio/min": 0.5378257632255554, "sampling/sampling_logp_difference/max": 0.6202206611633301, "sampling/sampling_logp_difference/mean": 0.013187211006879807, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 276.53125, "completions/mean_terminated_length": 276.53125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2212773710489273, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.908947494015278, "kl": 0.09375490248203278, "learning_rate": 8.324940725821852e-07, "loss": -0.0099, "num_tokens": 9567125.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001212358474731, "sampling/importance_sampling_ratio/min": 0.4161491394042969, "sampling/sampling_logp_difference/max": 0.8767116069793701, "sampling/sampling_logp_difference/mean": 0.012936407700181007, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 198.890625, "completions/mean_terminated_length": 198.890625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.2006136178970337, "epoch": 0.6867256637168142, "frac_reward_zero_std": 0.75, "grad_norm": 1.702631604731334, "kl": 0.08668678998947144, "learning_rate": 8.313389429058895e-07, "loss": 0.0056, "num_tokens": 9591598.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6598421335220337, "sampling/importance_sampling_ratio/mean": 0.9998523592948914, "sampling/importance_sampling_ratio/min": 0.4866368770599365, "sampling/sampling_logp_difference/max": 0.7202370166778564, "sampling/sampling_logp_difference/mean": 0.013114421628415585, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 214.640625, "completions/mean_terminated_length": 214.640625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2733258008956909, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 1.6959344130869862, "kl": 0.10790295153856277, "learning_rate": 8.30180651461762e-07, "loss": -0.0137, "num_tokens": 9619895.0, "reward": 0.03125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6107103824615479, "sampling/importance_sampling_ratio/mean": 1.0002367496490479, "sampling/importance_sampling_ratio/min": 0.6176396012306213, "sampling/sampling_logp_difference/max": 0.48185014724731445, "sampling/sampling_logp_difference/mean": 0.0153443468734622, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 278.671875, "completions/mean_terminated_length": 278.671875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.23062852025032043, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.5, "grad_norm": 1.2337269974401934, "kl": 0.08129149675369263, "learning_rate": 8.290192093026805e-07, "loss": 0.0091, "num_tokens": 9650290.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6367127895355225, "sampling/importance_sampling_ratio/mean": 1.0002433061599731, "sampling/importance_sampling_ratio/min": 0.5119295120239258, "sampling/sampling_logp_difference/max": 0.6695683002471924, "sampling/sampling_logp_difference/mean": 0.012509800493717194, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 225.828125, "completions/mean_terminated_length": 225.828125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.20886671543121338, "epoch": 0.6920353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 1.4763185153544922, "kl": 0.10144011676311493, "learning_rate": 8.278546275115869e-07, "loss": 0.0145, "num_tokens": 9675479.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6977052688598633, "sampling/importance_sampling_ratio/mean": 1.0000884532928467, "sampling/importance_sampling_ratio/min": 0.5530133843421936, "sampling/sampling_logp_difference/max": 0.5923731327056885, "sampling/sampling_logp_difference/mean": 0.014719297178089619, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 278.28125, "completions/mean_terminated_length": 278.28125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2516426146030426, "epoch": 0.6938053097345133, "frac_reward_zero_std": 0.25, "grad_norm": 1.6655705745774225, "kl": 0.10094635933637619, "learning_rate": 8.266869172013835e-07, "loss": -0.0141, "num_tokens": 9703705.0, "reward": 0.21875, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6007353067398071, "sampling/importance_sampling_ratio/mean": 0.9999457597732544, "sampling/importance_sampling_ratio/min": 0.6172205209732056, "sampling/sampling_logp_difference/max": 0.4825289249420166, "sampling/sampling_logp_difference/mean": 0.01326075941324234, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 157.453125, "completions/mean_terminated_length": 157.453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.24301910400390625, "epoch": 0.695575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.604373762499098, "kl": 0.1144510954618454, "learning_rate": 8.255160895148262e-07, "loss": 0.0218, "num_tokens": 9724134.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6008306741714478, "sampling/importance_sampling_ratio/mean": 0.9996973872184753, "sampling/importance_sampling_ratio/min": 0.6056217551231384, "sampling/sampling_logp_difference/max": 0.5014996528625488, "sampling/sampling_logp_difference/mean": 0.01373830996453762, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 203.703125, "completions/mean_terminated_length": 203.703125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.15705539286136627, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 1.6848563291065242, "kl": 0.1021459624171257, "learning_rate": 8.243421556244178e-07, "loss": 0.03, "num_tokens": 9748211.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.559633493423462, "sampling/importance_sampling_ratio/mean": 0.9999773502349854, "sampling/importance_sampling_ratio/min": 0.61739182472229, "sampling/sampling_logp_difference/max": 0.4822514057159424, "sampling/sampling_logp_difference/mean": 0.009732892736792564, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 183.71875, "completions/mean_terminated_length": 183.71875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.23040032386779785, "epoch": 0.6991150442477876, "frac_reward_zero_std": 0.75, "grad_norm": 1.4789478453920712, "kl": 0.12912510335445404, "learning_rate": 8.231651267323018e-07, "loss": -0.0036, "num_tokens": 9768801.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.931444525718689, "sampling/importance_sampling_ratio/mean": 0.9996687173843384, "sampling/importance_sampling_ratio/min": 0.5891477465629578, "sampling/sampling_logp_difference/max": 0.6582682132720947, "sampling/sampling_logp_difference/mean": 0.014220360666513443, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 100.140625, "completions/mean_terminated_length": 100.140625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.16772866249084473, "epoch": 0.7008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.1785221729704503, "kl": 0.0889422819018364, "learning_rate": 8.219850140701556e-07, "loss": 0.0009, "num_tokens": 9785114.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6216331720352173, "sampling/importance_sampling_ratio/mean": 0.9997761845588684, "sampling/importance_sampling_ratio/min": 0.44626373052597046, "sampling/sampling_logp_difference/max": 0.8068451881408691, "sampling/sampling_logp_difference/mean": 0.012667940929532051, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 125.234375, "completions/mean_terminated_length": 125.234375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.23405146598815918, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 1.9120795138393125, "kl": 0.09996088594198227, "learning_rate": 8.208018288990831e-07, "loss": -0.0252, "num_tokens": 9803961.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001115798950195, "sampling/importance_sampling_ratio/min": 0.34089627861976624, "sampling/sampling_logp_difference/max": 1.0761770009994507, "sampling/sampling_logp_difference/mean": 0.014903989620506763, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 204.703125, "completions/mean_terminated_length": 204.703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.22898629307746887, "epoch": 0.7044247787610619, "frac_reward_zero_std": 0.5, "grad_norm": 1.8078180995616104, "kl": 0.08712364733219147, "learning_rate": 8.196155825095072e-07, "loss": 0.0338, "num_tokens": 9827910.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8198039531707764, "sampling/importance_sampling_ratio/mean": 0.9999672174453735, "sampling/importance_sampling_ratio/min": 0.4887692928314209, "sampling/sampling_logp_difference/max": 0.7158646583557129, "sampling/sampling_logp_difference/mean": 0.013887250795960426, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 173.6875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.23619484901428223, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.75, "grad_norm": 1.4018792769858808, "kl": 0.11086740344762802, "learning_rate": 8.184262862210624e-07, "loss": -0.0062, "num_tokens": 9849938.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.687377691268921, "sampling/importance_sampling_ratio/mean": 0.9996067881584167, "sampling/importance_sampling_ratio/min": 0.6151829361915588, "sampling/sampling_logp_difference/max": 0.5231757164001465, "sampling/sampling_logp_difference/mean": 0.01551847904920578, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 135.96875, "completions/mean_terminated_length": 135.96875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.2168506681919098, "epoch": 0.7079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 2.3557663589973905, "kl": 0.11242357641458511, "learning_rate": 8.172339513824862e-07, "loss": -0.0196, "num_tokens": 9872880.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998126029968262, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.8242528438568115, "sampling/sampling_logp_difference/mean": 0.013441469520330429, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.24784806370735168, "epoch": 0.7097345132743362, "frac_reward_zero_std": 0.75, "grad_norm": 1.9096493768072735, "kl": 0.10909762978553772, "learning_rate": 8.160385893715112e-07, "loss": 0.0127, "num_tokens": 9895808.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001208782196045, "sampling/importance_sampling_ratio/min": 0.4805789887905121, "sampling/sampling_logp_difference/max": 0.7329154014587402, "sampling/sampling_logp_difference/mean": 0.015370754525065422, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 129.84375, "completions/mean_terminated_length": 129.84375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.18168896436691284, "epoch": 0.7115044247787611, "frac_reward_zero_std": 0.75, "grad_norm": 2.0031652431247506, "kl": 0.0930924341082573, "learning_rate": 8.14840211594757e-07, "loss": -0.0222, "num_tokens": 9913622.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5947625637054443, "sampling/importance_sampling_ratio/mean": 1.0002487897872925, "sampling/importance_sampling_ratio/min": 0.6324122548103333, "sampling/sampling_logp_difference/max": 0.46672487258911133, "sampling/sampling_logp_difference/mean": 0.011891288682818413, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 141.109375, "completions/mean_terminated_length": 141.109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2448035329580307, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 1.8644059674853986, "kl": 0.13529321551322937, "learning_rate": 8.136388294876202e-07, "loss": 0.042, "num_tokens": 9933277.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4746893644332886, "sampling/importance_sampling_ratio/mean": 0.9996466040611267, "sampling/importance_sampling_ratio/min": 0.4161492586135864, "sampling/sampling_logp_difference/max": 0.8767112493515015, "sampling/sampling_logp_difference/mean": 0.014279556460678577, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 130.71875, "completions/mean_terminated_length": 130.71875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.21659719944000244, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.75, "grad_norm": 2.085307839342841, "kl": 0.10195846855640411, "learning_rate": 8.124344545141661e-07, "loss": -0.0179, "num_tokens": 9957179.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005733966827393, "sampling/importance_sampling_ratio/min": 0.614747941493988, "sampling/sampling_logp_difference/max": 0.6985714435577393, "sampling/sampling_logp_difference/mean": 0.015068333595991135, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 223.6875, "completions/mean_terminated_length": 223.6875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.2928702235221863, "epoch": 0.7168141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 1.6486871249023423, "kl": 0.1310211569070816, "learning_rate": 8.112270981670195e-07, "loss": -0.0055, "num_tokens": 9986743.0, "reward": 0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6956371068954468, "sampling/importance_sampling_ratio/mean": 1.0002375841140747, "sampling/importance_sampling_ratio/min": 0.40638357400894165, "sampling/sampling_logp_difference/max": 0.9004578590393066, "sampling/sampling_logp_difference/mean": 0.016754474490880966, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 114.3125, "completions/mean_terminated_length": 114.3125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.23156185448169708, "epoch": 0.7185840707964601, "frac_reward_zero_std": 0.75, "grad_norm": 2.5570235751791848, "kl": 0.13598626852035522, "learning_rate": 8.10016771967254e-07, "loss": 0.0392, "num_tokens": 10004491.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.62672758102417, "sampling/importance_sampling_ratio/mean": 0.9990731477737427, "sampling/importance_sampling_ratio/min": 0.6397711634635925, "sampling/sampling_logp_difference/max": 0.4865703582763672, "sampling/sampling_logp_difference/mean": 0.01592668890953064, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.24460113048553467, "epoch": 0.720353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 1.6531855417896537, "kl": 0.11737678945064545, "learning_rate": 8.088034874642833e-07, "loss": 0.0245, "num_tokens": 10027843.0, "reward": -0.46875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7136509418487549, "sampling/importance_sampling_ratio/mean": 0.9996737837791443, "sampling/importance_sampling_ratio/min": 0.6198647022247314, "sampling/sampling_logp_difference/max": 0.5386261940002441, "sampling/sampling_logp_difference/mean": 0.014086121693253517, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 120.1875, "completions/mean_terminated_length": 120.1875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.22202925384044647, "epoch": 0.7221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 1.2899973363130564, "kl": 0.2256590723991394, "learning_rate": 8.0758725623575e-07, "loss": 0.0022, "num_tokens": 10046223.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5762683153152466, "sampling/importance_sampling_ratio/mean": 1.0004873275756836, "sampling/importance_sampling_ratio/min": 0.6486875414848328, "sampling/sampling_logp_difference/max": 0.4550602436065674, "sampling/sampling_logp_difference/mean": 0.014681857079267502, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 157.21875, "completions/mean_terminated_length": 157.21875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.2511879801750183, "epoch": 0.7238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 1.507320107757975, "kl": 0.16867324709892273, "learning_rate": 8.063680898874157e-07, "loss": 0.0108, "num_tokens": 10068525.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000004768371582, "sampling/importance_sampling_ratio/min": 0.4780873954296112, "sampling/sampling_logp_difference/max": 0.7379617691040039, "sampling/sampling_logp_difference/mean": 0.014443520456552505, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 93.65625, "completions/mean_terminated_length": 93.65625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20997223258018494, "epoch": 0.7256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.4632482681407139, "kl": 0.1533925086259842, "learning_rate": 8.051460000530501e-07, "loss": 0.0015, "num_tokens": 10083991.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.582827091217041, "sampling/importance_sampling_ratio/mean": 0.9996761083602905, "sampling/importance_sampling_ratio/min": 0.4839862883090973, "sampling/sampling_logp_difference/max": 0.725698709487915, "sampling/sampling_logp_difference/mean": 0.014583464711904526, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 140.890625, "completions/mean_terminated_length": 140.890625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.17527663707733154, "epoch": 0.727433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.06647882427560135, "kl": 0.1013965755701065, "learning_rate": 8.039209983943201e-07, "loss": 0.0007, "num_tokens": 10102544.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5972766876220703, "sampling/importance_sampling_ratio/mean": 0.999637246131897, "sampling/importance_sampling_ratio/min": 0.6377806663513184, "sampling/sampling_logp_difference/max": 0.46830010414123535, "sampling/sampling_logp_difference/mean": 0.011489623226225376, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 111.0625, "completions/mean_terminated_length": 111.0625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3289560079574585, "epoch": 0.7292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 1.8456902145242495, "kl": 0.18987998366355896, "learning_rate": 8.026930966006778e-07, "loss": -0.0138, "num_tokens": 10121924.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4742939472198486, "sampling/importance_sampling_ratio/mean": 0.9994705319404602, "sampling/importance_sampling_ratio/min": 0.26938679814338684, "sampling/sampling_logp_difference/max": 1.311607003211975, "sampling/sampling_logp_difference/mean": 0.01715000532567501, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 134.1875, "completions/mean_terminated_length": 134.1875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2241678237915039, "epoch": 0.7309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.9522069345418405, "kl": 0.1012740209698677, "learning_rate": 8.014623063892503e-07, "loss": 0.0346, "num_tokens": 10143296.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4779890775680542, "sampling/importance_sampling_ratio/mean": 0.9997544288635254, "sampling/importance_sampling_ratio/min": 0.4596959352493286, "sampling/sampling_logp_difference/max": 0.777190089225769, "sampling/sampling_logp_difference/mean": 0.012950373813509941, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.3127814531326294, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.25, "grad_norm": 2.1872087760063015, "kl": 0.1646125614643097, "learning_rate": 8.002286395047266e-07, "loss": -0.0025, "num_tokens": 10173456.0, "reward": 0.125, "reward_std": 0.6267197132110596, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998930096626282, "sampling/importance_sampling_ratio/min": 0.46577200293540955, "sampling/sampling_logp_difference/max": 0.7696456909179688, "sampling/sampling_logp_difference/mean": 0.0167483352124691, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 128.765625, "completions/mean_terminated_length": 128.765625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.3348920941352844, "epoch": 0.7345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 2.7805145515127383, "kl": 0.17094530165195465, "learning_rate": 7.989921077192463e-07, "loss": 0.0028, "num_tokens": 10198433.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5896917581558228, "sampling/importance_sampling_ratio/mean": 1.0003952980041504, "sampling/importance_sampling_ratio/min": 0.5363175868988037, "sampling/sampling_logp_difference/max": 0.6230287551879883, "sampling/sampling_logp_difference/mean": 0.01760284975171089, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 154.46875, "completions/mean_terminated_length": 154.46875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2479586899280548, "epoch": 0.736283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 2.139588080893128, "kl": 0.18218538165092468, "learning_rate": 7.97752722832287e-07, "loss": -0.0009, "num_tokens": 10220191.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6642953157424927, "sampling/importance_sampling_ratio/mean": 1.0002672672271729, "sampling/importance_sampling_ratio/min": 0.047217559069395065, "sampling/sampling_logp_difference/max": 3.0529894828796387, "sampling/sampling_logp_difference/mean": 0.014692382887005806, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 107.609375, "completions/mean_terminated_length": 107.609375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3130699396133423, "epoch": 0.7380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 2.1433943782066964, "kl": 0.19814994931221008, "learning_rate": 7.965104966705517e-07, "loss": 0.0249, "num_tokens": 10239190.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4284597635269165, "sampling/importance_sampling_ratio/mean": 0.9999039173126221, "sampling/importance_sampling_ratio/min": 0.4982660412788391, "sampling/sampling_logp_difference/max": 0.6966211795806885, "sampling/sampling_logp_difference/mean": 0.017606273293495178, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 128.328125, "completions/mean_terminated_length": 128.328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.33184927701950073, "epoch": 0.7398230088495575, "frac_reward_zero_std": 0.25, "grad_norm": 3.0588854266486702, "kl": 0.1875271052122116, "learning_rate": 7.952654410878558e-07, "loss": -0.0283, "num_tokens": 10259563.0, "reward": 0.21875, "reward_std": 0.6037135720252991, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6242440938949585, "sampling/importance_sampling_ratio/mean": 1.000694990158081, "sampling/importance_sampling_ratio/min": 0.6751402616500854, "sampling/sampling_logp_difference/max": 0.4850425720214844, "sampling/sampling_logp_difference/mean": 0.01691984012722969, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 103.953125, "completions/mean_terminated_length": 103.953125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.307433545589447, "epoch": 0.7415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.8629919304621134, "kl": 0.17759662866592407, "learning_rate": 7.940175679650145e-07, "loss": 0.0074, "num_tokens": 10279208.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5071558952331543, "sampling/importance_sampling_ratio/mean": 0.999001145362854, "sampling/importance_sampling_ratio/min": 0.6298438310623169, "sampling/sampling_logp_difference/max": 0.4622833728790283, "sampling/sampling_logp_difference/mean": 0.016604270786046982, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 127.21875, "completions/mean_terminated_length": 127.21875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.22612299025058746, "epoch": 0.7433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005344465449177, "kl": 0.0908593237400055, "learning_rate": 7.927668892097288e-07, "loss": 0.0008, "num_tokens": 10296870.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.648982286453247, "sampling/importance_sampling_ratio/mean": 1.000003695487976, "sampling/importance_sampling_ratio/min": 0.42307013273239136, "sampling/sampling_logp_difference/max": 0.8602173328399658, "sampling/sampling_logp_difference/mean": 0.01439329981803894, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 72.84375, "completions/mean_terminated_length": 72.84375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1961365044116974, "epoch": 0.7451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.1377099282807247, "kl": 0.13358832895755768, "learning_rate": 7.915134167564723e-07, "loss": 0.0013, "num_tokens": 10310972.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8029872179031372, "sampling/importance_sampling_ratio/mean": 1.001330852508545, "sampling/importance_sampling_ratio/min": 0.5407369136810303, "sampling/sampling_logp_difference/max": 0.6148223876953125, "sampling/sampling_logp_difference/mean": 0.015113083645701408, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 85.328125, "completions/mean_terminated_length": 85.328125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19887936115264893, "epoch": 0.7469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.20084408422117847, "kl": 0.09519742429256439, "learning_rate": 7.902571625663772e-07, "loss": 0.001, "num_tokens": 10326017.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.632686734199524, "sampling/importance_sampling_ratio/mean": 0.999961793422699, "sampling/importance_sampling_ratio/min": 0.4853159487247467, "sampling/sampling_logp_difference/max": 0.7229551076889038, "sampling/sampling_logp_difference/mean": 0.01400967501103878, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 103.8125, "completions/mean_terminated_length": 103.8125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.21976834535598755, "epoch": 0.7486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 2.1772878212363755, "kl": 0.1845146119594574, "learning_rate": 7.8899813862712e-07, "loss": -0.0016, "num_tokens": 10341669.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5366523265838623, "sampling/importance_sampling_ratio/mean": 0.999984860420227, "sampling/importance_sampling_ratio/min": 0.5098457336425781, "sampling/sampling_logp_difference/max": 0.6736471652984619, "sampling/sampling_logp_difference/mean": 0.015088671818375587, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 129.90625, "completions/mean_terminated_length": 129.90625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.25134825706481934, "epoch": 0.7504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.6034454103774332, "kl": 0.15433214604854584, "learning_rate": 7.877363569528075e-07, "loss": 0.0233, "num_tokens": 10359567.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5785263776779175, "sampling/importance_sampling_ratio/mean": 1.0009269714355469, "sampling/importance_sampling_ratio/min": 0.6328911781311035, "sampling/sampling_logp_difference/max": 0.4574568271636963, "sampling/sampling_logp_difference/mean": 0.015000468119978905, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 88.421875, "completions/mean_terminated_length": 88.421875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.22460322082042694, "epoch": 0.7522123893805309, "frac_reward_zero_std": 1.0, "grad_norm": 0.7017732872715597, "kl": 0.17928902804851532, "learning_rate": 7.864718295838614e-07, "loss": 0.0017, "num_tokens": 10375930.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5873216390609741, "sampling/importance_sampling_ratio/mean": 1.0008187294006348, "sampling/importance_sampling_ratio/min": 0.6217424869537354, "sampling/sampling_logp_difference/max": 0.47522926330566406, "sampling/sampling_logp_difference/mean": 0.015076222829520702, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.31429821252822876, "epoch": 0.7539823008849558, "frac_reward_zero_std": 0.25, "grad_norm": 1.7147426013805556, "kl": 0.08855272084474564, "learning_rate": 7.852045685869044e-07, "loss": 0.0368, "num_tokens": 10406418.0, "reward": 0.3125, "reward_std": 0.7663977146148682, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6044872999191284, "sampling/importance_sampling_ratio/mean": 1.000441074371338, "sampling/importance_sampling_ratio/min": 0.6176437139511108, "sampling/sampling_logp_difference/max": 0.4818434715270996, "sampling/sampling_logp_difference/mean": 0.014759142883121967, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 72.09375, "completions/mean_terminated_length": 72.09375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.20648300647735596, "epoch": 0.7557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.20464947729883612, "kl": 0.112910196185112, "learning_rate": 7.839345860546447e-07, "loss": 0.0011, "num_tokens": 10420232.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6626209020614624, "sampling/importance_sampling_ratio/mean": 1.000261902809143, "sampling/importance_sampling_ratio/min": 0.556676983833313, "sampling/sampling_logp_difference/max": 0.5857701301574707, "sampling/sampling_logp_difference/mean": 0.014546798542141914, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 250.28125, "completions/mean_terminated_length": 250.28125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27906370162963867, "epoch": 0.7575221238938054, "frac_reward_zero_std": 0.5, "grad_norm": 1.2977974352535242, "kl": 0.10531355440616608, "learning_rate": 7.826618941057597e-07, "loss": -0.0285, "num_tokens": 10447530.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6692922115325928, "sampling/importance_sampling_ratio/mean": 0.9990940690040588, "sampling/importance_sampling_ratio/min": 0.6255077719688416, "sampling/sampling_logp_difference/max": 0.5123996734619141, "sampling/sampling_logp_difference/mean": 0.013001114130020142, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 170.078125, "completions/mean_terminated_length": 170.078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.25722211599349976, "epoch": 0.7592920353982301, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201987406260391, "kl": 0.09622658044099808, "learning_rate": 7.813865048847818e-07, "loss": 0.001, "num_tokens": 10469503.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.685192584991455, "sampling/importance_sampling_ratio/mean": 1.0000381469726562, "sampling/importance_sampling_ratio/min": 0.5498476624488831, "sampling/sampling_logp_difference/max": 0.598114013671875, "sampling/sampling_logp_difference/mean": 0.01349552720785141, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 211.546875, "completions/mean_terminated_length": 211.546875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.36132562160491943, "epoch": 0.7610619469026548, "frac_reward_zero_std": 0.25, "grad_norm": 1.7768139653559343, "kl": 0.12593957781791687, "learning_rate": 7.801084305619818e-07, "loss": -0.0259, "num_tokens": 10495842.0, "reward": -0.09375, "reward_std": 0.606805682182312, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5279269218444824, "sampling/importance_sampling_ratio/mean": 1.0005345344543457, "sampling/importance_sampling_ratio/min": 0.6754833459854126, "sampling/sampling_logp_difference/max": 0.42391180992126465, "sampling/sampling_logp_difference/mean": 0.016943801194429398, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 144.921875, "completions/mean_terminated_length": 144.921875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.23178213834762573, "epoch": 0.7628318584070797, "frac_reward_zero_std": 1.0, "grad_norm": 0.07128915388679644, "kl": 0.10431316494941711, "learning_rate": 7.788276833332525e-07, "loss": 0.001, "num_tokens": 10514589.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.680548906326294, "sampling/importance_sampling_ratio/mean": 1.001007080078125, "sampling/importance_sampling_ratio/min": 0.6417016983032227, "sampling/sampling_logp_difference/max": 0.519120454788208, "sampling/sampling_logp_difference/mean": 0.013102984055876732, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 245.921875, "completions/mean_terminated_length": 245.921875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.35798782110214233, "epoch": 0.7646017699115044, "frac_reward_zero_std": 0.25, "grad_norm": 2.003088405112961, "kl": 0.12780684232711792, "learning_rate": 7.775442754199928e-07, "loss": 0.0168, "num_tokens": 10543144.0, "reward": 0.34375, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4408007860183716, "sampling/importance_sampling_ratio/mean": 1.0002108812332153, "sampling/importance_sampling_ratio/min": 0.61895352602005, "sampling/sampling_logp_difference/max": 0.4797251224517822, "sampling/sampling_logp_difference/mean": 0.016016483306884766, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 261.140625, "completions/mean_terminated_length": 261.140625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3103075623512268, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.25, "grad_norm": 1.5193055990815396, "kl": 0.10806705057621002, "learning_rate": 7.76258219068991e-07, "loss": 0.1239, "num_tokens": 10571393.0, "reward": 0.0625, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4929227828979492, "sampling/importance_sampling_ratio/mean": 0.9996569156646729, "sampling/importance_sampling_ratio/min": 0.6151418685913086, "sampling/sampling_logp_difference/max": 0.4859023094177246, "sampling/sampling_logp_difference/mean": 0.01445387490093708, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 152.140625, "completions/mean_terminated_length": 152.140625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.32054561376571655, "epoch": 0.768141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 1.9673173404959408, "kl": 0.1318821907043457, "learning_rate": 7.749695265523075e-07, "loss": -0.0285, "num_tokens": 10592330.0, "reward": 0.375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.55359947681427, "sampling/importance_sampling_ratio/mean": 1.0008392333984375, "sampling/importance_sampling_ratio/min": 0.5749117732048035, "sampling/sampling_logp_difference/max": 0.5535387992858887, "sampling/sampling_logp_difference/mean": 0.01602165400981903, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.22358392179012299, "epoch": 0.7699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 2.2185437302982907, "kl": 0.09492562711238861, "learning_rate": 7.736782101671586e-07, "loss": -0.0198, "num_tokens": 10609882.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9082411527633667, "sampling/importance_sampling_ratio/mean": 0.9999414682388306, "sampling/importance_sampling_ratio/min": 0.6937689781188965, "sampling/sampling_logp_difference/max": 0.6461819410324097, "sampling/sampling_logp_difference/mean": 0.013823290355503559, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 226.140625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.29974108934402466, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.75, "grad_norm": 1.0118156887737264, "kl": 0.09879736602306366, "learning_rate": 7.723842822357979e-07, "loss": 0.0122, "num_tokens": 10636387.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.552546739578247, "sampling/importance_sampling_ratio/mean": 1.0001718997955322, "sampling/importance_sampling_ratio/min": 0.6304361820220947, "sampling/sampling_logp_difference/max": 0.46134328842163086, "sampling/sampling_logp_difference/mean": 0.014574981294572353, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 95.734375, "completions/mean_terminated_length": 95.734375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20790165662765503, "epoch": 0.7734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.08683289950443539, "kl": 0.11964157223701477, "learning_rate": 7.710877551054003e-07, "loss": 0.0012, "num_tokens": 10653346.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744452476501465, "sampling/importance_sampling_ratio/mean": 1.000321626663208, "sampling/importance_sampling_ratio/min": 0.6113335490226746, "sampling/sampling_logp_difference/max": 0.4921126365661621, "sampling/sampling_logp_difference/mean": 0.015471925027668476, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.21783259510993958, "epoch": 0.7752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.6502398693245992, "kl": 0.09436312317848206, "learning_rate": 7.697886411479421e-07, "loss": 0.0391, "num_tokens": 10674122.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6221652030944824, "sampling/importance_sampling_ratio/mean": 1.0003793239593506, "sampling/importance_sampling_ratio/min": 0.4808139204978943, "sampling/sampling_logp_difference/max": 0.7322750091552734, "sampling/sampling_logp_difference/mean": 0.013289163820445538, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 118.921875, "completions/mean_terminated_length": 118.921875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.23659348487854004, "epoch": 0.7769911504424779, "frac_reward_zero_std": 1.0, "grad_norm": 0.06304461429024877, "kl": 0.10675106942653656, "learning_rate": 7.684869527600856e-07, "loss": 0.0011, "num_tokens": 10692229.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.628382921218872, "sampling/importance_sampling_ratio/mean": 0.9994261264801025, "sampling/importance_sampling_ratio/min": 0.5441125631332397, "sampling/sampling_logp_difference/max": 0.6085991859436035, "sampling/sampling_logp_difference/mean": 0.015332890674471855, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 158.65625, "completions/mean_terminated_length": 158.65625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.22790870070457458, "epoch": 0.7787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.05294814430683551, "kl": 0.09549225866794586, "learning_rate": 7.671827023630579e-07, "loss": 0.0008, "num_tokens": 10711455.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5601475238800049, "sampling/importance_sampling_ratio/mean": 1.0002624988555908, "sampling/importance_sampling_ratio/min": 0.6056380867958069, "sampling/sampling_logp_difference/max": 0.5014727115631104, "sampling/sampling_logp_difference/mean": 0.01402122713625431, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2548066973686218, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 1.049698476939084, "kl": 0.10269908607006073, "learning_rate": 7.658759024025347e-07, "loss": -0.0109, "num_tokens": 10736863.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5569202899932861, "sampling/importance_sampling_ratio/mean": 0.9996312856674194, "sampling/importance_sampling_ratio/min": 0.5580610036849976, "sampling/sampling_logp_difference/max": 0.5832870006561279, "sampling/sampling_logp_difference/mean": 0.01371750794351101, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 147.046875, "completions/mean_terminated_length": 147.046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.24499477446079254, "epoch": 0.7823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.06852030669842013, "kl": 0.11023630201816559, "learning_rate": 7.645665653485205e-07, "loss": 0.001, "num_tokens": 10756114.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999265730381012, "sampling/importance_sampling_ratio/min": 0.21256250143051147, "sampling/sampling_logp_difference/max": 1.5485191345214844, "sampling/sampling_logp_difference/mean": 0.01581001840531826, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 143.734375, "completions/mean_terminated_length": 143.734375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.31479573249816895, "epoch": 0.784070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.11273354908663606, "kl": 0.17768093943595886, "learning_rate": 7.632547036952295e-07, "loss": 0.0019, "num_tokens": 10775809.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.715756893157959, "sampling/importance_sampling_ratio/mean": 0.9997842311859131, "sampling/importance_sampling_ratio/min": 0.5076191425323486, "sampling/sampling_logp_difference/max": 0.6780238151550293, "sampling/sampling_logp_difference/mean": 0.017306584864854813, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 130.703125, "completions/mean_terminated_length": 130.703125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.21905525028705597, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.75, "grad_norm": 1.7762581228099865, "kl": 0.1313691884279251, "learning_rate": 7.619403299609667e-07, "loss": 0.0034, "num_tokens": 10794702.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6061917543411255, "sampling/importance_sampling_ratio/mean": 1.000018835067749, "sampling/importance_sampling_ratio/min": 0.6273802518844604, "sampling/sampling_logp_difference/max": 0.47386598587036133, "sampling/sampling_logp_difference/mean": 0.01330517791211605, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 173.71875, "completions/mean_terminated_length": 173.71875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.35762932896614075, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.75, "grad_norm": 1.2101697062903451, "kl": 0.20684951543807983, "learning_rate": 7.606234566880088e-07, "loss": 0.0128, "num_tokens": 10816332.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.630078911781311, "sampling/importance_sampling_ratio/mean": 0.999476432800293, "sampling/importance_sampling_ratio/min": 0.636064350605011, "sampling/sampling_logp_difference/max": 0.4886283874511719, "sampling/sampling_logp_difference/mean": 0.0165737122297287, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 91.765625, "completions/mean_terminated_length": 91.765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.24042564630508423, "epoch": 0.7893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0746164097458041, "kl": 0.11899790167808533, "learning_rate": 7.593040964424835e-07, "loss": 0.0012, "num_tokens": 10831517.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4752185344696045, "sampling/importance_sampling_ratio/mean": 0.9987927675247192, "sampling/importance_sampling_ratio/min": 0.6058770418167114, "sampling/sampling_logp_difference/max": 0.5010782480239868, "sampling/sampling_logp_difference/mean": 0.015875648707151413, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 173.9375, "completions/mean_terminated_length": 173.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3096427917480469, "epoch": 0.7911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 1.3760485970182488, "kl": 0.12306735664606094, "learning_rate": 7.579822618142503e-07, "loss": 0.0185, "num_tokens": 10853465.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6976984739303589, "sampling/importance_sampling_ratio/mean": 1.0006637573242188, "sampling/importance_sampling_ratio/min": 0.5578792691230774, "sampling/sampling_logp_difference/max": 0.5836126804351807, "sampling/sampling_logp_difference/mean": 0.01565830036997795, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 163.046875, "completions/mean_terminated_length": 163.046875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.3008842468261719, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.5, "grad_norm": 1.9120965499892564, "kl": 0.15516658127307892, "learning_rate": 7.56657965416781e-07, "loss": 0.0111, "num_tokens": 10874796.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5506500005722046, "sampling/importance_sampling_ratio/mean": 1.000104308128357, "sampling/importance_sampling_ratio/min": 0.2671625018119812, "sampling/sampling_logp_difference/max": 1.3198981285095215, "sampling/sampling_logp_difference/mean": 0.01531151495873928, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2631182074546814, "epoch": 0.7946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.06087210191524674, "kl": 0.10320760309696198, "learning_rate": 7.553312198870372e-07, "loss": 0.0012, "num_tokens": 10895972.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5962475538253784, "sampling/importance_sampling_ratio/mean": 0.9997023344039917, "sampling/importance_sampling_ratio/min": 0.6134573221206665, "sampling/sampling_logp_difference/max": 0.4886445999145508, "sampling/sampling_logp_difference/mean": 0.01567263714969158, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 195.84375, "completions/mean_terminated_length": 195.84375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.24691849946975708, "epoch": 0.7964601769911505, "frac_reward_zero_std": 1.0, "grad_norm": 0.04575406294381887, "kl": 0.07950303703546524, "learning_rate": 7.540020378853522e-07, "loss": 0.0006, "num_tokens": 10920074.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7520686388015747, "sampling/importance_sampling_ratio/mean": 0.9993558526039124, "sampling/importance_sampling_ratio/min": 0.5897966027259827, "sampling/sampling_logp_difference/max": 0.5607972145080566, "sampling/sampling_logp_difference/mean": 0.014229314401745796, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 161.703125, "completions/mean_terminated_length": 161.703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.31960299611091614, "epoch": 0.7982300884955752, "frac_reward_zero_std": 1.0, "grad_norm": 0.06009141323076683, "kl": 0.13369546830654144, "learning_rate": 7.52670432095309e-07, "loss": 0.0016, "num_tokens": 10941719.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4930192232131958, "sampling/importance_sampling_ratio/mean": 0.9998590350151062, "sampling/importance_sampling_ratio/min": 0.43092525005340576, "sampling/sampling_logp_difference/max": 0.8418207168579102, "sampling/sampling_logp_difference/mean": 0.016376489773392677, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 181.34375, "completions/mean_terminated_length": 181.34375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.3312147855758667, "epoch": 0.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.7681149566295409, "kl": 0.126938596367836, "learning_rate": 7.513364152236185e-07, "loss": -0.0015, "num_tokens": 10966285.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6323968172073364, "sampling/importance_sampling_ratio/mean": 0.9999746680259705, "sampling/importance_sampling_ratio/min": 0.624896764755249, "sampling/sampling_logp_difference/max": 0.4900493621826172, "sampling/sampling_logp_difference/mean": 0.017019513994455338, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.2554083466529846, "epoch": 0.8017699115044248, "frac_reward_zero_std": 0.75, "grad_norm": 1.3956829927984136, "kl": 0.10268468409776688, "learning_rate": 7.5e-07, "loss": 0.0085, "num_tokens": 10987765.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5277727842330933, "sampling/importance_sampling_ratio/mean": 1.000195026397705, "sampling/importance_sampling_ratio/min": 0.5001277327537537, "sampling/sampling_logp_difference/max": 0.6928918361663818, "sampling/sampling_logp_difference/mean": 0.014314472675323486, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 186.84375, "completions/mean_terminated_length": 186.84375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22197839617729187, "epoch": 0.8035398230088495, "frac_reward_zero_std": 1.0, "grad_norm": 0.04506479740825905, "kl": 0.06315718591213226, "learning_rate": 7.486611991770585e-07, "loss": 0.0006, "num_tokens": 11010811.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.635847568511963, "sampling/importance_sampling_ratio/mean": 0.9997208118438721, "sampling/importance_sampling_ratio/min": 0.6272012591362, "sampling/sampling_logp_difference/max": 0.4921610355377197, "sampling/sampling_logp_difference/mean": 0.011982977390289307, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 173.546875, "completions/mean_terminated_length": 173.546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.1851518452167511, "epoch": 0.8053097345132744, "frac_reward_zero_std": 1.0, "grad_norm": 0.053830861156893645, "kl": 0.06667741388082504, "learning_rate": 7.473200255301634e-07, "loss": 0.0006, "num_tokens": 11033358.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5749149322509766, "sampling/importance_sampling_ratio/mean": 1.00022554397583, "sampling/importance_sampling_ratio/min": 0.636394739151001, "sampling/sampling_logp_difference/max": 0.45420122146606445, "sampling/sampling_logp_difference/mean": 0.011585669592022896, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.2890259623527527, "epoch": 0.8070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 1.2481812059615454, "kl": 0.10705706477165222, "learning_rate": 7.459764918573264e-07, "loss": -0.005, "num_tokens": 11060190.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9441770315170288, "sampling/importance_sampling_ratio/mean": 0.9998108148574829, "sampling/importance_sampling_ratio/min": 0.5826321244239807, "sampling/sampling_logp_difference/max": 0.6648387908935547, "sampling/sampling_logp_difference/mean": 0.013531711883842945, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 245.53125, "completions/mean_terminated_length": 245.53125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1874953806400299, "epoch": 0.8088495575221238, "frac_reward_zero_std": 1.0, "grad_norm": 0.025801164334045307, "kl": 0.06931064277887344, "learning_rate": 7.446306109790797e-07, "loss": 0.0004, "num_tokens": 11085904.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6157500743865967, "sampling/importance_sampling_ratio/mean": 0.9993112683296204, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6008007526397705, "sampling/sampling_logp_difference/mean": 0.011181306093931198, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 164.046875, "completions/mean_terminated_length": 164.046875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1791715919971466, "epoch": 0.8106194690265487, "frac_reward_zero_std": 0.75, "grad_norm": 1.6663788965690867, "kl": 0.07812879979610443, "learning_rate": 7.432823957383531e-07, "loss": -0.0369, "num_tokens": 11106419.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4916131496429443, "sampling/importance_sampling_ratio/mean": 0.9999189972877502, "sampling/importance_sampling_ratio/min": 0.27278798818588257, "sampling/sampling_logp_difference/max": 1.2990604639053345, "sampling/sampling_logp_difference/mean": 0.010273879393935204, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 155.21875, "completions/mean_terminated_length": 155.21875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2759154736995697, "epoch": 0.8123893805309734, "frac_reward_zero_std": 1.0, "grad_norm": 0.05059189575672393, "kl": 0.10760311037302017, "learning_rate": 7.419318590003523e-07, "loss": 0.0011, "num_tokens": 11128865.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8663716316223145, "sampling/importance_sampling_ratio/mean": 1.0007175207138062, "sampling/importance_sampling_ratio/min": 0.29967522621154785, "sampling/sampling_logp_difference/max": 1.2050559520721436, "sampling/sampling_logp_difference/mean": 0.015000621788203716, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 181.234375, "completions/mean_terminated_length": 181.234375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.24837835133075714, "epoch": 0.8141592920353983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05305740223343962, "kl": 0.09566047042608261, "learning_rate": 7.405790136524352e-07, "loss": 0.001, "num_tokens": 11149680.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6574124097824097, "sampling/importance_sampling_ratio/mean": 1.0001816749572754, "sampling/importance_sampling_ratio/min": 0.5038161873817444, "sampling/sampling_logp_difference/max": 0.6855437755584717, "sampling/sampling_logp_difference/mean": 0.012731912545859814, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 216.40625, "completions/mean_terminated_length": 216.40625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.23632162809371948, "epoch": 0.815929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.1051701819191762, "kl": 0.06540177017450333, "learning_rate": 7.392238726039897e-07, "loss": 0.0064, "num_tokens": 11173082.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6310417652130127, "sampling/importance_sampling_ratio/mean": 1.0000622272491455, "sampling/importance_sampling_ratio/min": 0.5133769512176514, "sampling/sampling_logp_difference/max": 0.6667449474334717, "sampling/sampling_logp_difference/mean": 0.01352219469845295, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.34099167585372925, "epoch": 0.8176991150442477, "frac_reward_zero_std": 0.25, "grad_norm": 1.4015387839072433, "kl": 0.13486215472221375, "learning_rate": 7.378664487863102e-07, "loss": -0.0766, "num_tokens": 11200430.0, "reward": 0.34375, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4598815441131592, "sampling/importance_sampling_ratio/mean": 1.0004026889801025, "sampling/importance_sampling_ratio/min": 0.6321660876274109, "sampling/sampling_logp_difference/max": 0.4586031436920166, "sampling/sampling_logp_difference/mean": 0.01569107547402382, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 144.984375, "completions/mean_terminated_length": 144.984375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2723640203475952, "epoch": 0.8194690265486726, "frac_reward_zero_std": 0.75, "grad_norm": 1.972496437425704, "kl": 0.09231165796518326, "learning_rate": 7.365067551524739e-07, "loss": -0.0484, "num_tokens": 11221549.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6629855632781982, "sampling/importance_sampling_ratio/mean": 0.9987390041351318, "sampling/importance_sampling_ratio/min": 0.2755856513977051, "sampling/sampling_logp_difference/max": 1.2888567447662354, "sampling/sampling_logp_difference/mean": 0.01529267244040966, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 84.6875, "completions/mean_terminated_length": 84.6875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.2179868221282959, "epoch": 0.8212389380530973, "frac_reward_zero_std": 1.0, "grad_norm": 0.09001194951208835, "kl": 0.11149385571479797, "learning_rate": 7.351448046772177e-07, "loss": 0.0011, "num_tokens": 11236313.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997960329055786, "sampling/importance_sampling_ratio/min": 0.6555051207542419, "sampling/sampling_logp_difference/max": 1.1088428497314453, "sampling/sampling_logp_difference/mean": 0.014946072362363338, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 225.109375, "completions/mean_terminated_length": 225.109375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.27389317750930786, "epoch": 0.8230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 1.4697215718288328, "kl": 0.08184997737407684, "learning_rate": 7.33780610356814e-07, "loss": -0.0255, "num_tokens": 11263584.0, "reward": 0.3125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6574969291687012, "sampling/importance_sampling_ratio/mean": 0.9999033212661743, "sampling/importance_sampling_ratio/min": 0.5624995231628418, "sampling/sampling_logp_difference/max": 0.5753650665283203, "sampling/sampling_logp_difference/mean": 0.014748264104127884, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.27104291319847107, "epoch": 0.8247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 1.8306315428397812, "kl": 0.097135990858078, "learning_rate": 7.324141852089471e-07, "loss": -0.0081, "num_tokens": 11284856.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999662041664124, "sampling/importance_sampling_ratio/min": 0.4954352378845215, "sampling/sampling_logp_difference/max": 0.709916353225708, "sampling/sampling_logp_difference/mean": 0.015428267419338226, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.36813652515411377, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.5, "grad_norm": 1.9230341116698462, "kl": 0.1345624029636383, "learning_rate": 7.310455422725889e-07, "loss": -0.0226, "num_tokens": 11308508.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5597198009490967, "sampling/importance_sampling_ratio/mean": 1.000495433807373, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.01686239242553711, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 134.15625, "completions/mean_terminated_length": 134.15625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2625473737716675, "epoch": 0.8283185840707965, "frac_reward_zero_std": 0.75, "grad_norm": 1.4437605876933313, "kl": 0.09955163300037384, "learning_rate": 7.296746946078736e-07, "loss": -0.0282, "num_tokens": 11327862.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6053035259246826, "sampling/importance_sampling_ratio/mean": 0.9999257922172546, "sampling/importance_sampling_ratio/min": 0.566536545753479, "sampling/sampling_logp_difference/max": 0.568213701248169, "sampling/sampling_logp_difference/mean": 0.014665738679468632, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 264.515625, "completions/mean_terminated_length": 264.515625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.23119328916072845, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 0.72948690953906, "kl": 0.08526307344436646, "learning_rate": 7.283016552959744e-07, "loss": 0.0281, "num_tokens": 11354071.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6198385953903198, "sampling/importance_sampling_ratio/mean": 1.000049352645874, "sampling/importance_sampling_ratio/min": 0.6165264248847961, "sampling/sampling_logp_difference/max": 0.4836540222167969, "sampling/sampling_logp_difference/mean": 0.011496550403535366, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 176.59375, "completions/mean_terminated_length": 176.59375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.24806052446365356, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 1.155153827724109, "kl": 0.1228296160697937, "learning_rate": 7.26926437438978e-07, "loss": 0.0069, "num_tokens": 11375981.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.522972583770752, "sampling/importance_sampling_ratio/mean": 0.9999892711639404, "sampling/importance_sampling_ratio/min": 0.6096453070640564, "sampling/sampling_logp_difference/max": 0.4948779344558716, "sampling/sampling_logp_difference/mean": 0.01389935053884983, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.28486597537994385, "epoch": 0.8336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.742687772925984, "kl": 0.09621911495923996, "learning_rate": 7.255490541597594e-07, "loss": -0.0183, "num_tokens": 11402501.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00089430809021, "sampling/importance_sampling_ratio/min": 0.3700678050518036, "sampling/sampling_logp_difference/max": 0.9940690994262695, "sampling/sampling_logp_difference/mean": 0.01622522808611393, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 113.796875, "completions/mean_terminated_length": 113.796875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2088649421930313, "epoch": 0.8353982300884956, "frac_reward_zero_std": 1.0, "grad_norm": 0.06763961756041881, "kl": 0.09535472095012665, "learning_rate": 7.241695186018573e-07, "loss": 0.001, "num_tokens": 11418232.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7489606142044067, "sampling/importance_sampling_ratio/mean": 1.000305414199829, "sampling/importance_sampling_ratio/min": 0.6383603811264038, "sampling/sampling_logp_difference/max": 0.5590215921401978, "sampling/sampling_logp_difference/mean": 0.013497954234480858, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 235.375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.22254955768585205, "epoch": 0.8371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 1.0468466496433009, "kl": 0.08238639682531357, "learning_rate": 7.227878439293476e-07, "loss": 0.0492, "num_tokens": 11442992.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.606371283531189, "sampling/importance_sampling_ratio/mean": 0.9996415376663208, "sampling/importance_sampling_ratio/min": 0.6120424270629883, "sampling/sampling_logp_difference/max": 0.4909536838531494, "sampling/sampling_logp_difference/mean": 0.011071446351706982, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 202.296875, "completions/mean_terminated_length": 202.296875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.31057924032211304, "epoch": 0.8389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 1.1532758540702392, "kl": 0.095781110227108, "learning_rate": 7.214040433267198e-07, "loss": 0.0016, "num_tokens": 11469299.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.522496223449707, "sampling/importance_sampling_ratio/mean": 1.0001308917999268, "sampling/importance_sampling_ratio/min": 0.6235222220420837, "sampling/sampling_logp_difference/max": 0.47237086296081543, "sampling/sampling_logp_difference/mean": 0.014310511760413647, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 169.46875, "completions/mean_terminated_length": 169.46875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.30409765243530273, "epoch": 0.8407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 1.4091697546841495, "kl": 0.09266463667154312, "learning_rate": 7.200181299987482e-07, "loss": -0.0413, "num_tokens": 11491841.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.54977548122406, "sampling/importance_sampling_ratio/mean": 0.9994775056838989, "sampling/importance_sampling_ratio/min": 0.5074211955070496, "sampling/sampling_logp_difference/max": 0.6784138679504395, "sampling/sampling_logp_difference/mean": 0.015322301536798477, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.21665015816688538, "epoch": 0.8424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.039833750883976234, "kl": 0.06707678735256195, "learning_rate": 7.186301171703688e-07, "loss": 0.0006, "num_tokens": 11510697.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8187355995178223, "sampling/importance_sampling_ratio/mean": 1.0007455348968506, "sampling/importance_sampling_ratio/min": 0.48598286509513855, "sampling/sampling_logp_difference/max": 0.7215819358825684, "sampling/sampling_logp_difference/mean": 0.013453138992190361, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 157.015625, "completions/mean_terminated_length": 157.015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.31312859058380127, "epoch": 0.8442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 1.464745141834717, "kl": 0.09124094247817993, "learning_rate": 7.172400180865513e-07, "loss": 0.037, "num_tokens": 11533066.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000349998474121, "sampling/importance_sampling_ratio/min": 0.5336884260177612, "sampling/sampling_logp_difference/max": 0.8994505405426025, "sampling/sampling_logp_difference/mean": 0.0161188542842865, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 171.65625, "completions/mean_terminated_length": 171.65625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.29203808307647705, "epoch": 0.8460176991150442, "frac_reward_zero_std": 0.5, "grad_norm": 1.7266208028388466, "kl": 0.0928914025425911, "learning_rate": 7.158478460121734e-07, "loss": 0.0498, "num_tokens": 11555380.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.586168885231018, "sampling/importance_sampling_ratio/mean": 0.998978316783905, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.5662540197372437, "sampling/sampling_logp_difference/mean": 0.0157019030302763, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 303.546875, "completions/mean_terminated_length": 303.546875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.33147814869880676, "epoch": 0.8477876106194691, "frac_reward_zero_std": 0.25, "grad_norm": 1.2575102869752703, "kl": 0.08407057821750641, "learning_rate": 7.144536142318944e-07, "loss": -0.0112, "num_tokens": 11586343.0, "reward": -0.25, "reward_std": 0.7455305457115173, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6121418476104736, "sampling/importance_sampling_ratio/mean": 1.0000580549240112, "sampling/importance_sampling_ratio/min": 0.5362950563430786, "sampling/sampling_logp_difference/max": 0.6230708360671997, "sampling/sampling_logp_difference/mean": 0.015437502413988113, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 170.96875, "completions/mean_terminated_length": 170.96875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.3016532063484192, "epoch": 0.8495575221238938, "frac_reward_zero_std": 0.5, "grad_norm": 1.8550376344058404, "kl": 0.08589797466993332, "learning_rate": 7.130573360500276e-07, "loss": -0.0097, "num_tokens": 11609141.0, "reward": 0.53125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.625006914138794, "sampling/importance_sampling_ratio/mean": 0.9992378950119019, "sampling/importance_sampling_ratio/min": 0.5388995409011841, "sampling/sampling_logp_difference/max": 0.6182260513305664, "sampling/sampling_logp_difference/mean": 0.016829874366521835, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 156.984375, "completions/mean_terminated_length": 156.984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.3116839528083801, "epoch": 0.8513274336283185, "frac_reward_zero_std": 0.5, "grad_norm": 1.868819980436171, "kl": 0.11740688979625702, "learning_rate": 7.116590247904143e-07, "loss": 0.0133, "num_tokens": 11628692.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5602270364761353, "sampling/importance_sampling_ratio/mean": 1.0004372596740723, "sampling/importance_sampling_ratio/min": 0.6355285048484802, "sampling/sampling_logp_difference/max": 0.45329833030700684, "sampling/sampling_logp_difference/mean": 0.016720928251743317, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 190.921875, "completions/mean_terminated_length": 190.921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18741580843925476, "epoch": 0.8530973451327434, "frac_reward_zero_std": 0.75, "grad_norm": 1.3159831255386631, "kl": 0.07222100347280502, "learning_rate": 7.10258693796296e-07, "loss": 0.002, "num_tokens": 11651087.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6022182703018188, "sampling/importance_sampling_ratio/mean": 0.9991044402122498, "sampling/importance_sampling_ratio/min": 0.5127838253974915, "sampling/sampling_logp_difference/max": 0.6679009199142456, "sampling/sampling_logp_difference/mean": 0.012968306429684162, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 241.328125, "completions/mean_terminated_length": 241.328125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.23737293481826782, "epoch": 0.8548672566371681, "frac_reward_zero_std": 0.5, "grad_norm": 1.4376853412259836, "kl": 0.06894382834434509, "learning_rate": 7.088563564301873e-07, "loss": -0.069, "num_tokens": 11677236.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6117782592773438, "sampling/importance_sampling_ratio/mean": 1.0008764266967773, "sampling/importance_sampling_ratio/min": 0.5017890930175781, "sampling/sampling_logp_difference/max": 0.6895754337310791, "sampling/sampling_logp_difference/mean": 0.01322878710925579, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 134.40625, "completions/mean_terminated_length": 134.40625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3243998885154724, "epoch": 0.856637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 2.40858537952096, "kl": 0.10078921914100647, "learning_rate": 7.074520260737487e-07, "loss": -0.0355, "num_tokens": 11697710.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7326374053955078, "sampling/importance_sampling_ratio/mean": 0.9998638033866882, "sampling/importance_sampling_ratio/min": 0.6438302993774414, "sampling/sampling_logp_difference/max": 0.5496447086334229, "sampling/sampling_logp_difference/mean": 0.016245003789663315, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 74.6875, "completions/mean_terminated_length": 74.6875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17472270131111145, "epoch": 0.8584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.07628910778521153, "kl": 0.06152138113975525, "learning_rate": 7.06045716127658e-07, "loss": 0.0006, "num_tokens": 11712570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.634839653968811, "sampling/importance_sampling_ratio/mean": 0.9994436502456665, "sampling/importance_sampling_ratio/min": 0.49368974566459656, "sampling/sampling_logp_difference/max": 0.705847978591919, "sampling/sampling_logp_difference/mean": 0.012359494343400002, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2247626781463623, "epoch": 0.8601769911504424, "frac_reward_zero_std": 0.75, "grad_norm": 1.818919164957071, "kl": 0.09298969805240631, "learning_rate": 7.04637440011484e-07, "loss": 0.0064, "num_tokens": 11731882.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8609539270401, "sampling/importance_sampling_ratio/mean": 1.0003266334533691, "sampling/importance_sampling_ratio/min": 0.5292339324951172, "sampling/sampling_logp_difference/max": 0.6363247632980347, "sampling/sampling_logp_difference/mean": 0.015714038163423538, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 210.890625, "completions/mean_terminated_length": 210.890625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.23297055065631866, "epoch": 0.8619469026548673, "frac_reward_zero_std": 0.5, "grad_norm": 1.4456171359285384, "kl": 0.09041094779968262, "learning_rate": 7.032272111635565e-07, "loss": -0.062, "num_tokens": 11755251.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7978670597076416, "sampling/importance_sampling_ratio/mean": 1.0009212493896484, "sampling/importance_sampling_ratio/min": 0.605469286441803, "sampling/sampling_logp_difference/max": 0.5866010189056396, "sampling/sampling_logp_difference/mean": 0.013496624305844307, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 190.921875, "completions/mean_terminated_length": 190.921875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2863754630088806, "epoch": 0.863716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 1.75543792956684, "kl": 0.145380437374115, "learning_rate": 7.018150430408394e-07, "loss": -0.0002, "num_tokens": 11782030.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5525977611541748, "sampling/importance_sampling_ratio/mean": 1.0002708435058594, "sampling/importance_sampling_ratio/min": 0.6248590350151062, "sampling/sampling_logp_difference/max": 0.47022926807403564, "sampling/sampling_logp_difference/mean": 0.015285274013876915, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 190.890625, "completions/mean_terminated_length": 190.890625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2686413526535034, "epoch": 0.8654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 1.2321844483541606, "kl": 0.08146736025810242, "learning_rate": 7.004009491188022e-07, "loss": 0.0018, "num_tokens": 11806455.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6729422807693481, "sampling/importance_sampling_ratio/mean": 0.9995931386947632, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.514583945274353, "sampling/sampling_logp_difference/mean": 0.013509852811694145, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 103.140625, "completions/mean_terminated_length": 103.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17214247584342957, "epoch": 0.8672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513592945193148, "kl": 0.06472460925579071, "learning_rate": 6.989849428912907e-07, "loss": 0.0007, "num_tokens": 11822464.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971609354019165, "sampling/importance_sampling_ratio/mean": 0.999538242816925, "sampling/importance_sampling_ratio/min": 0.6080341339111328, "sampling/sampling_logp_difference/max": 0.4975242614746094, "sampling/sampling_logp_difference/mean": 0.012858957052230835, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 122.203125, "completions/mean_terminated_length": 122.203125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.20026946067810059, "epoch": 0.8690265486725663, "frac_reward_zero_std": 0.75, "grad_norm": 1.9711135697396813, "kl": 0.09769254177808762, "learning_rate": 6.975670378703992e-07, "loss": -0.048, "num_tokens": 11840125.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9312912225723267, "sampling/importance_sampling_ratio/mean": 0.9998112916946411, "sampling/importance_sampling_ratio/min": 0.2428303211927414, "sampling/sampling_logp_difference/max": 1.4153923988342285, "sampling/sampling_logp_difference/mean": 0.015533816069364548, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 332.359375, "completions/mean_terminated_length": 332.359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2868131995201111, "epoch": 0.8707964601769912, "frac_reward_zero_std": 0.25, "grad_norm": 1.3661063579088057, "kl": 0.06904970854520798, "learning_rate": 6.961472475863405e-07, "loss": 0.0352, "num_tokens": 11875460.0, "reward": 0.03125, "reward_std": 0.6413977742195129, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.574299931526184, "sampling/importance_sampling_ratio/mean": 0.9994086027145386, "sampling/importance_sampling_ratio/min": 0.5452684760093689, "sampling/sampling_logp_difference/max": 0.6064770221710205, "sampling/sampling_logp_difference/mean": 0.015135878697037697, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 285.546875, "completions/mean_terminated_length": 285.546875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.2845487594604492, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.25, "grad_norm": 1.75928963948708, "kl": 0.07286480814218521, "learning_rate": 6.947255855873176e-07, "loss": -0.0129, "num_tokens": 11903255.0, "reward": -0.125, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.662986397743225, "sampling/importance_sampling_ratio/mean": 0.9995602369308472, "sampling/importance_sampling_ratio/min": 0.5622440576553345, "sampling/sampling_logp_difference/max": 0.5758192539215088, "sampling/sampling_logp_difference/mean": 0.014554204419255257, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 219.890625, "completions/mean_terminated_length": 219.890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.26031407713890076, "epoch": 0.8743362831858407, "frac_reward_zero_std": 0.25, "grad_norm": 2.015213646172026, "kl": 0.07468391954898834, "learning_rate": 6.93302065439394e-07, "loss": 0.0775, "num_tokens": 11931280.0, "reward": 0.1875, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6230804920196533, "sampling/importance_sampling_ratio/mean": 1.000654697418213, "sampling/importance_sampling_ratio/min": 0.4528287947177887, "sampling/sampling_logp_difference/max": 0.7922412157058716, "sampling/sampling_logp_difference/mean": 0.015673939138650894, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 89.609375, "completions/mean_terminated_length": 89.609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16368423402309418, "epoch": 0.8761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.09560566708332499, "kl": 0.07655131816864014, "learning_rate": 6.918767007263645e-07, "loss": 0.0008, "num_tokens": 11945911.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6242557764053345, "sampling/importance_sampling_ratio/mean": 1.0002734661102295, "sampling/importance_sampling_ratio/min": 0.5777470469474792, "sampling/sampling_logp_difference/max": 0.5486191511154175, "sampling/sampling_logp_difference/mean": 0.010785510763525963, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 87.046875, "completions/mean_terminated_length": 87.046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16879679262638092, "epoch": 0.8778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.08379966035083888, "kl": 0.058332476764917374, "learning_rate": 6.904495050496258e-07, "loss": 0.0006, "num_tokens": 11962650.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9132671356201172, "sampling/importance_sampling_ratio/mean": 0.9997939467430115, "sampling/importance_sampling_ratio/min": 0.6189252138137817, "sampling/sampling_logp_difference/max": 0.6488122940063477, "sampling/sampling_logp_difference/mean": 0.013459901325404644, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 176.140625, "completions/mean_terminated_length": 176.140625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2343016415834427, "epoch": 0.879646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 1.7292158247454754, "kl": 0.06757572293281555, "learning_rate": 6.890204920280457e-07, "loss": 0.0217, "num_tokens": 11986323.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6573745012283325, "sampling/importance_sampling_ratio/mean": 0.9999605417251587, "sampling/importance_sampling_ratio/min": 0.48819631338119507, "sampling/sampling_logp_difference/max": 0.7170376777648926, "sampling/sampling_logp_difference/mean": 0.01260643545538187, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.22539138793945312, "epoch": 0.8814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.2953581635656217, "kl": 0.08673928678035736, "learning_rate": 6.875896752978344e-07, "loss": -0.0006, "num_tokens": 12010827.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4811066389083862, "sampling/importance_sampling_ratio/mean": 1.0002479553222656, "sampling/importance_sampling_ratio/min": 0.5910096168518066, "sampling/sampling_logp_difference/max": 0.525922954082489, "sampling/sampling_logp_difference/mean": 0.012732493691146374, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 147.140625, "completions/mean_terminated_length": 147.140625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.24289393424987793, "epoch": 0.8831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 1.9898035976419366, "kl": 0.08971019089221954, "learning_rate": 6.861570685124134e-07, "loss": -0.0227, "num_tokens": 12030564.0, "reward": 0.46875, "reward_std": 0.5143726468086243, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5297788381576538, "sampling/importance_sampling_ratio/mean": 1.0004501342773438, "sampling/importance_sampling_ratio/min": 0.6058481931686401, "sampling/sampling_logp_difference/max": 0.5011258125305176, "sampling/sampling_logp_difference/mean": 0.014872195199131966, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 241.8125, "completions/mean_terminated_length": 241.8125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.24499258399009705, "epoch": 0.8849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.9781015063454445, "kl": 0.060924604535102844, "learning_rate": 6.847226853422861e-07, "loss": 0.004, "num_tokens": 12060792.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5819672346115112, "sampling/importance_sampling_ratio/mean": 1.0001592636108398, "sampling/importance_sampling_ratio/min": 0.5843503475189209, "sampling/sampling_logp_difference/max": 0.5372545719146729, "sampling/sampling_logp_difference/mean": 0.014691051095724106, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.21421381831169128, "epoch": 0.8867256637168142, "frac_reward_zero_std": 0.75, "grad_norm": 1.922568391687469, "kl": 0.06271041929721832, "learning_rate": 6.832865394749065e-07, "loss": -0.0036, "num_tokens": 12080072.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998106956481934, "sampling/importance_sampling_ratio/min": 0.5642319917678833, "sampling/sampling_logp_difference/max": 0.9277102947235107, "sampling/sampling_logp_difference/mean": 0.013271542266011238, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.20577079057693481, "epoch": 0.8884955752212389, "frac_reward_zero_std": 1.0, "grad_norm": 0.10104494650241204, "kl": 0.07240723073482513, "learning_rate": 6.818486446145486e-07, "loss": 0.0008, "num_tokens": 12099328.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9531092643737793, "sampling/importance_sampling_ratio/mean": 1.0007264614105225, "sampling/importance_sampling_ratio/min": 0.5910104513168335, "sampling/sampling_logp_difference/max": 0.6694226264953613, "sampling/sampling_logp_difference/mean": 0.016134457662701607, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 83.546875, "completions/mean_terminated_length": 83.546875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.20276133716106415, "epoch": 0.8902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.09314570886713332, "kl": 0.0983431413769722, "learning_rate": 6.804090144821772e-07, "loss": 0.001, "num_tokens": 12115171.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6653670072555542, "sampling/importance_sampling_ratio/mean": 1.0003103017807007, "sampling/importance_sampling_ratio/min": 0.5074187517166138, "sampling/sampling_logp_difference/max": 0.6784186363220215, "sampling/sampling_logp_difference/mean": 0.013952775858342648, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 153.765625, "completions/mean_terminated_length": 153.765625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18803927302360535, "epoch": 0.8920353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 1.4571297446227123, "kl": 0.059182263910770416, "learning_rate": 6.789676628153143e-07, "loss": -0.0433, "num_tokens": 12136772.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8496507406234741, "sampling/importance_sampling_ratio/mean": 1.000973105430603, "sampling/importance_sampling_ratio/min": 0.5951108932495117, "sampling/sampling_logp_difference/max": 0.6149967908859253, "sampling/sampling_logp_difference/mean": 0.013116484507918358, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 133.828125, "completions/mean_terminated_length": 133.828125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.18603232502937317, "epoch": 0.8938053097345132, "frac_reward_zero_std": 1.0, "grad_norm": 0.09982287937590968, "kl": 0.062489982694387436, "learning_rate": 6.775246033679104e-07, "loss": 0.0006, "num_tokens": 12155881.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.613848090171814, "sampling/importance_sampling_ratio/mean": 1.000258207321167, "sampling/importance_sampling_ratio/min": 0.6171473860740662, "sampling/sampling_logp_difference/max": 0.4826474189758301, "sampling/sampling_logp_difference/mean": 0.012163517996668816, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 170.734375, "completions/mean_terminated_length": 170.734375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.2228136956691742, "epoch": 0.8955752212389381, "frac_reward_zero_std": 0.75, "grad_norm": 1.2420898090060715, "kl": 0.07515808939933777, "learning_rate": 6.76079849910212e-07, "loss": 0.0069, "num_tokens": 12177160.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995928406715393, "sampling/importance_sampling_ratio/min": 0.44219881296157837, "sampling/sampling_logp_difference/max": 0.8458077907562256, "sampling/sampling_logp_difference/mean": 0.013314129784703255, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 87.484375, "completions/mean_terminated_length": 87.484375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.1346748173236847, "epoch": 0.8973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.09771786685055059, "kl": 0.045838937163352966, "learning_rate": 6.746334162286307e-07, "loss": 0.0004, "num_tokens": 12192487.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6421228647232056, "sampling/importance_sampling_ratio/mean": 0.9993751049041748, "sampling/importance_sampling_ratio/min": 0.6059068441390991, "sampling/sampling_logp_difference/max": 0.5010290145874023, "sampling/sampling_logp_difference/mean": 0.009760710410773754, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 117.703125, "completions/mean_terminated_length": 117.703125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17134593427181244, "epoch": 0.8991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.17195312336994908, "kl": 0.07391640543937683, "learning_rate": 6.731853161256113e-07, "loss": 0.0009, "num_tokens": 12209284.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5747500658035278, "sampling/importance_sampling_ratio/mean": 1.0003368854522705, "sampling/importance_sampling_ratio/min": 0.5051394104957581, "sampling/sampling_logp_difference/max": 0.6829209327697754, "sampling/sampling_logp_difference/mean": 0.013952373526990414, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 117.578125, "completions/mean_terminated_length": 117.578125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.21811603009700775, "epoch": 0.9008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 1.749119134091422, "kl": 0.06408712267875671, "learning_rate": 6.717355634195004e-07, "loss": 0.0088, "num_tokens": 12227433.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.597145676612854, "sampling/importance_sampling_ratio/mean": 1.0004184246063232, "sampling/importance_sampling_ratio/min": 0.617236852645874, "sampling/sampling_logp_difference/max": 0.48250246047973633, "sampling/sampling_logp_difference/mean": 0.014479318633675575, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 162.453125, "completions/mean_terminated_length": 162.453125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.29848822951316833, "epoch": 0.9026548672566371, "frac_reward_zero_std": 0.5, "grad_norm": 1.864859633261317, "kl": 0.13393345475196838, "learning_rate": 6.70284171944414e-07, "loss": -0.0135, "num_tokens": 12248214.0, "reward": 0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6555203199386597, "sampling/importance_sampling_ratio/mean": 1.000079870223999, "sampling/importance_sampling_ratio/min": 0.5600612759590149, "sampling/sampling_logp_difference/max": 0.5797090530395508, "sampling/sampling_logp_difference/mean": 0.01591721549630165, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2154177576303482, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.718731467938021, "kl": 0.07451668381690979, "learning_rate": 6.688311555501063e-07, "loss": 0.0104, "num_tokens": 12267894.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6634249687194824, "sampling/importance_sampling_ratio/mean": 0.9999340176582336, "sampling/importance_sampling_ratio/min": 0.5935536623001099, "sampling/sampling_logp_difference/max": 0.52162766456604, "sampling/sampling_logp_difference/mean": 0.013705511577427387, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 177.0625, "completions/mean_terminated_length": 177.0625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.21645711362361908, "epoch": 0.9061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 1.1522462247593213, "kl": 0.07142680138349533, "learning_rate": 6.673765281018372e-07, "loss": 0.0136, "num_tokens": 12289034.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5331443548202515, "sampling/importance_sampling_ratio/mean": 0.9988040328025818, "sampling/importance_sampling_ratio/min": 0.624366283416748, "sampling/sampling_logp_difference/max": 0.47101807594299316, "sampling/sampling_logp_difference/mean": 0.013371540233492851, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 132.609375, "completions/mean_terminated_length": 132.609375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19599120318889618, "epoch": 0.9079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.676550814822893, "kl": 0.07155442982912064, "learning_rate": 6.659203034802396e-07, "loss": -0.0187, "num_tokens": 12308705.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6008371114730835, "sampling/importance_sampling_ratio/mean": 0.999868631362915, "sampling/importance_sampling_ratio/min": 0.6143175363540649, "sampling/sampling_logp_difference/max": 0.48724329471588135, "sampling/sampling_logp_difference/mean": 0.013603073544800282, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 140.203125, "completions/mean_terminated_length": 140.203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.24311748147010803, "epoch": 0.9097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 1.9487555628530473, "kl": 0.09489445388317108, "learning_rate": 6.644624955811873e-07, "loss": 0.0261, "num_tokens": 12335758.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0004059076309204, "sampling/importance_sampling_ratio/min": 0.5103806853294373, "sampling/sampling_logp_difference/max": 0.6725983619689941, "sampling/sampling_logp_difference/mean": 0.016555175185203552, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 112.359375, "completions/mean_terminated_length": 112.359375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16854803264141083, "epoch": 0.911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.12349897216725166, "kl": 0.0720117837190628, "learning_rate": 6.630031183156627e-07, "loss": 0.0008, "num_tokens": 12352677.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.913527250289917, "sampling/importance_sampling_ratio/mean": 0.9999138116836548, "sampling/importance_sampling_ratio/min": 0.6147370934486389, "sampling/sampling_logp_difference/max": 0.6489481925964355, "sampling/sampling_logp_difference/mean": 0.012867513112723827, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 121.46875, "completions/mean_terminated_length": 121.46875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2060689777135849, "epoch": 0.9132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 2.252512526032329, "kl": 0.08830845355987549, "learning_rate": 6.61542185609623e-07, "loss": -0.0294, "num_tokens": 12372579.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6644889116287231, "sampling/importance_sampling_ratio/mean": 1.000049114227295, "sampling/importance_sampling_ratio/min": 0.5521225333213806, "sampling/sampling_logp_difference/max": 0.5939853191375732, "sampling/sampling_logp_difference/mean": 0.014510020613670349, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 173.40625, "completions/mean_terminated_length": 173.40625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15603843331336975, "epoch": 0.9150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 1.4203289680899915, "kl": 0.06413950026035309, "learning_rate": 6.60079711403869e-07, "loss": -0.0193, "num_tokens": 12395485.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6811944246292114, "sampling/importance_sampling_ratio/mean": 1.0003342628479004, "sampling/importance_sampling_ratio/min": 0.6056227684020996, "sampling/sampling_logp_difference/max": 0.5195045471191406, "sampling/sampling_logp_difference/mean": 0.01101603452116251, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 138.640625, "completions/mean_terminated_length": 138.640625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.1671760380268097, "epoch": 0.9168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.6311075012686245, "kl": 0.06192456930875778, "learning_rate": 6.586157096539104e-07, "loss": -0.065, "num_tokens": 12413926.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009386539459229, "sampling/importance_sampling_ratio/min": 0.6074139475822449, "sampling/sampling_logp_difference/max": 0.9236290454864502, "sampling/sampling_logp_difference/mean": 0.012389777228236198, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 102.84375, "completions/mean_terminated_length": 102.84375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18423107266426086, "epoch": 0.9185840707964602, "frac_reward_zero_std": 1.0, "grad_norm": 0.10310690417971817, "kl": 0.07077330350875854, "learning_rate": 6.571501943298335e-07, "loss": 0.0007, "num_tokens": 12429900.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5213937759399414, "sampling/importance_sampling_ratio/mean": 1.0005460977554321, "sampling/importance_sampling_ratio/min": 0.443074107170105, "sampling/sampling_logp_difference/max": 0.8140182495117188, "sampling/sampling_logp_difference/mean": 0.0146607905626297, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 104.4375, "completions/mean_terminated_length": 104.4375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.2153693437576294, "epoch": 0.9203539823008849, "frac_reward_zero_std": 0.75, "grad_norm": 1.975652669636163, "kl": 0.09459412097930908, "learning_rate": 6.556831794161677e-07, "loss": 0.0047, "num_tokens": 12446472.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6207278966903687, "sampling/importance_sampling_ratio/mean": 1.0000276565551758, "sampling/importance_sampling_ratio/min": 0.6665579676628113, "sampling/sampling_logp_difference/max": 0.48287534713745117, "sampling/sampling_logp_difference/mean": 0.014695551246404648, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 181.796875, "completions/mean_terminated_length": 181.796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.15409141778945923, "epoch": 0.9221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.19493430761605432, "kl": 0.0543629415333271, "learning_rate": 6.542146789117523e-07, "loss": 0.0005, "num_tokens": 12467371.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000316858291626, "sampling/importance_sampling_ratio/min": 0.5910096168518066, "sampling/sampling_logp_difference/max": 1.1254243850708008, "sampling/sampling_logp_difference/mean": 0.011631837114691734, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 118.671875, "completions/mean_terminated_length": 118.671875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16768015921115875, "epoch": 0.9238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.08276491012573578, "kl": 0.05810067057609558, "learning_rate": 6.527447068296025e-07, "loss": 0.0005, "num_tokens": 12484438.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971453189849854, "sampling/importance_sampling_ratio/mean": 1.0008851289749146, "sampling/importance_sampling_ratio/min": 0.49541550874710083, "sampling/sampling_logp_difference/max": 0.7023584842681885, "sampling/sampling_logp_difference/mean": 0.01297941617667675, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 97.234375, "completions/mean_terminated_length": 97.234375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.21290622651576996, "epoch": 0.9256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 2.12129909673824, "kl": 0.08432917296886444, "learning_rate": 6.512732771967758e-07, "loss": -0.0028, "num_tokens": 12501733.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6276776790618896, "sampling/importance_sampling_ratio/mean": 0.9991388320922852, "sampling/importance_sampling_ratio/min": 0.23860850930213928, "sampling/sampling_logp_difference/max": 1.4329311847686768, "sampling/sampling_logp_difference/mean": 0.01383144035935402, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 216.484375, "completions/mean_terminated_length": 216.484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2048395723104477, "epoch": 0.9274336283185841, "frac_reward_zero_std": 0.5, "grad_norm": 1.4655691469855914, "kl": 0.07254438102245331, "learning_rate": 6.498004040542384e-07, "loss": 0.0329, "num_tokens": 12531684.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998718500137329, "sampling/importance_sampling_ratio/min": 0.23428992927074432, "sampling/sampling_logp_difference/max": 1.4511959552764893, "sampling/sampling_logp_difference/mean": 0.011805642396211624, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 124.453125, "completions/mean_terminated_length": 124.453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.19450099766254425, "epoch": 0.9292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 2.4674903481826713, "kl": 0.08769615739583969, "learning_rate": 6.483261014567311e-07, "loss": -0.0003, "num_tokens": 12551025.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.635646939277649, "sampling/importance_sampling_ratio/mean": 1.0004091262817383, "sampling/importance_sampling_ratio/min": 0.6147372126579285, "sampling/sampling_logp_difference/max": 0.4920384883880615, "sampling/sampling_logp_difference/mean": 0.013382935896515846, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 142.40625, "completions/mean_terminated_length": 142.40625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2440652847290039, "epoch": 0.9309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.862878944068647, "kl": 0.07464480400085449, "learning_rate": 6.468503834726349e-07, "loss": 0.0258, "num_tokens": 12572491.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.746742606163025, "sampling/importance_sampling_ratio/mean": 1.0003567934036255, "sampling/importance_sampling_ratio/min": 0.05533358082175255, "sampling/sampling_logp_difference/max": 2.8943753242492676, "sampling/sampling_logp_difference/mean": 0.016031067818403244, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 140.6875, "completions/mean_terminated_length": 140.6875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.2335037738084793, "epoch": 0.9327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 2.2369931044913347, "kl": 0.08174403011798859, "learning_rate": 6.453732641838371e-07, "loss": 0.0392, "num_tokens": 12593223.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5221202373504639, "sampling/importance_sampling_ratio/mean": 0.9992659687995911, "sampling/importance_sampling_ratio/min": 0.416178822517395, "sampling/sampling_logp_difference/max": 0.8766403198242188, "sampling/sampling_logp_difference/mean": 0.015071576461195946, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 187.171875, "completions/mean_terminated_length": 187.171875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.25308364629745483, "epoch": 0.9345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 1.257864270759182, "kl": 0.1140533834695816, "learning_rate": 6.438947576855966e-07, "loss": -0.004, "num_tokens": 12621970.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8251678943634033, "sampling/importance_sampling_ratio/mean": 1.0003626346588135, "sampling/importance_sampling_ratio/min": 0.6146631240844727, "sampling/sampling_logp_difference/max": 0.6016719341278076, "sampling/sampling_logp_difference/mean": 0.01468927413225174, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 189.46875, "completions/mean_terminated_length": 189.46875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.26918596029281616, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.5, "grad_norm": 1.625279148151284, "kl": 0.10820294916629791, "learning_rate": 6.424148780864103e-07, "loss": -0.0068, "num_tokens": 12646896.0, "reward": -0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4353986978530884, "sampling/importance_sampling_ratio/mean": 1.000308871269226, "sampling/importance_sampling_ratio/min": 0.5456030964851379, "sampling/sampling_logp_difference/max": 0.6058635711669922, "sampling/sampling_logp_difference/mean": 0.014277035370469093, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 130.453125, "completions/mean_terminated_length": 130.453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.24379435181617737, "epoch": 0.9380530973451328, "frac_reward_zero_std": 0.5, "grad_norm": 2.88275094543766, "kl": 0.09748217463493347, "learning_rate": 6.409336395078771e-07, "loss": 0.0693, "num_tokens": 12666973.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8198883533477783, "sampling/importance_sampling_ratio/mean": 1.0014350414276123, "sampling/importance_sampling_ratio/min": 0.4767603576183319, "sampling/sampling_logp_difference/max": 0.7407413125038147, "sampling/sampling_logp_difference/mean": 0.015441152267158031, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 138.4375, "completions/mean_terminated_length": 138.4375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2686900496482849, "epoch": 0.9398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 2.3983382864233374, "kl": 0.1058024913072586, "learning_rate": 6.394510560845636e-07, "loss": 0.0002, "num_tokens": 12692393.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5298492908477783, "sampling/importance_sampling_ratio/mean": 0.999117374420166, "sampling/importance_sampling_ratio/min": 0.5288708209991455, "sampling/sampling_logp_difference/max": 0.6370110511779785, "sampling/sampling_logp_difference/mean": 0.016936589032411575, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 186.15625, "completions/mean_terminated_length": 186.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.21540570259094238, "epoch": 0.9415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.3949435316523566, "kl": 0.06737659871578217, "learning_rate": 6.379671419638702e-07, "loss": 0.0137, "num_tokens": 12716131.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8724091053009033, "sampling/importance_sampling_ratio/mean": 1.00014328956604, "sampling/importance_sampling_ratio/min": 0.5052241683006287, "sampling/sampling_logp_difference/max": 0.6827530860900879, "sampling/sampling_logp_difference/mean": 0.0126869585365057, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 146.015625, "completions/mean_terminated_length": 146.015625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.22015175223350525, "epoch": 0.9433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 2.073589758148823, "kl": 0.08379888534545898, "learning_rate": 6.364819113058951e-07, "loss": 0.1069, "num_tokens": 12745556.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6248646974563599, "sampling/importance_sampling_ratio/mean": 0.999993085861206, "sampling/importance_sampling_ratio/min": 0.44305580854415894, "sampling/sampling_logp_difference/max": 0.8140594959259033, "sampling/sampling_logp_difference/mean": 0.015733974054455757, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19080805778503418, "epoch": 0.9451327433628318, "frac_reward_zero_std": 1.0, "grad_norm": 0.06086896526960898, "kl": 0.06359320878982544, "learning_rate": 6.349953782832991e-07, "loss": 0.0005, "num_tokens": 12765600.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.786500334739685, "sampling/importance_sampling_ratio/mean": 0.9996516704559326, "sampling/importance_sampling_ratio/min": 0.3746173679828644, "sampling/sampling_logp_difference/max": 0.9818501472473145, "sampling/sampling_logp_difference/mean": 0.01435780804604292, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.2834925055503845, "epoch": 0.9469026548672567, "frac_reward_zero_std": 0.75, "grad_norm": 1.2621666612450506, "kl": 0.10275100916624069, "learning_rate": 6.335075570811708e-07, "loss": -0.0017, "num_tokens": 12788092.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5226970911026, "sampling/importance_sampling_ratio/mean": 1.0000207424163818, "sampling/importance_sampling_ratio/min": 0.30695831775665283, "sampling/sampling_logp_difference/max": 1.1810433864593506, "sampling/sampling_logp_difference/mean": 0.015664484351873398, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16103926301002502, "epoch": 0.9486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.1262095518376495, "kl": 0.060474567115306854, "learning_rate": 6.320184618968914e-07, "loss": 0.0006, "num_tokens": 12802892.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994466304779053, "sampling/importance_sampling_ratio/min": 0.5478149056434631, "sampling/sampling_logp_difference/max": 0.7053267955780029, "sampling/sampling_logp_difference/mean": 0.013035567477345467, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 131.15625, "completions/mean_terminated_length": 131.15625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.24878093600273132, "epoch": 0.9504424778761061, "frac_reward_zero_std": 0.5, "grad_norm": 2.1406502448658014, "kl": 0.10607841610908508, "learning_rate": 6.305281069399988e-07, "loss": 0.0096, "num_tokens": 12822214.0, "reward": -0.09375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5709149837493896, "sampling/importance_sampling_ratio/mean": 1.0003750324249268, "sampling/importance_sampling_ratio/min": 0.4872643053531647, "sampling/sampling_logp_difference/max": 0.7189486026763916, "sampling/sampling_logp_difference/mean": 0.014434963464736938, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 196.609375, "completions/mean_terminated_length": 196.609375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.22695685923099518, "epoch": 0.952212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 1.7522860803347347, "kl": 0.05448362976312637, "learning_rate": 6.290365064320519e-07, "loss": -0.0581, "num_tokens": 12845613.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000390648841858, "sampling/importance_sampling_ratio/min": 0.5791562795639038, "sampling/sampling_logp_difference/max": 0.7019534111022949, "sampling/sampling_logp_difference/mean": 0.01398754958063364, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 129.484375, "completions/mean_terminated_length": 129.484375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.2852279543876648, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.75, "grad_norm": 1.8826791740289983, "kl": 0.10823218524456024, "learning_rate": 6.275436746064956e-07, "loss": -0.0057, "num_tokens": 12865036.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5986868143081665, "sampling/importance_sampling_ratio/mean": 1.0000401735305786, "sampling/importance_sampling_ratio/min": 0.6008803844451904, "sampling/sampling_logp_difference/max": 0.5093593597412109, "sampling/sampling_logp_difference/mean": 0.017704632133245468, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 142.421875, "completions/mean_terminated_length": 142.421875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.22250120341777802, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.75, "grad_norm": 1.4908343814373297, "kl": 0.07594837248325348, "learning_rate": 6.260496257085239e-07, "loss": 0.0137, "num_tokens": 12884295.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5072304010391235, "sampling/importance_sampling_ratio/mean": 0.9990766644477844, "sampling/importance_sampling_ratio/min": 0.4997435212135315, "sampling/sampling_logp_difference/max": 0.6936602592468262, "sampling/sampling_logp_difference/mean": 0.014292177744209766, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 229.109375, "completions/mean_terminated_length": 229.109375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.21303771436214447, "epoch": 0.9575221238938053, "frac_reward_zero_std": 1.0, "grad_norm": 0.04504169778511932, "kl": 0.05923967808485031, "learning_rate": 6.245543739949453e-07, "loss": 0.0005, "num_tokens": 12909934.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6535435914993286, "sampling/importance_sampling_ratio/mean": 1.00027596950531, "sampling/importance_sampling_ratio/min": 0.41909292340278625, "sampling/sampling_logp_difference/max": 0.8696626424789429, "sampling/sampling_logp_difference/mean": 0.013105418533086777, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 123.703125, "completions/mean_terminated_length": 123.703125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.22907549142837524, "epoch": 0.95929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.08198436482026467, "kl": 0.06653547286987305, "learning_rate": 6.230579337340456e-07, "loss": 0.0008, "num_tokens": 12928491.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6248584985733032, "sampling/importance_sampling_ratio/mean": 1.000333309173584, "sampling/importance_sampling_ratio/min": 0.5000132322311401, "sampling/sampling_logp_difference/max": 0.6931207180023193, "sampling/sampling_logp_difference/mean": 0.015795625746250153, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 85.734375, "completions/mean_terminated_length": 85.734375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1617472767829895, "epoch": 0.9610619469026549, "frac_reward_zero_std": 1.0, "grad_norm": 0.06992182374630738, "kl": 0.041833315044641495, "learning_rate": 6.215603192054521e-07, "loss": 0.0004, "num_tokens": 12943466.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6242852210998535, "sampling/importance_sampling_ratio/mean": 1.000643253326416, "sampling/importance_sampling_ratio/min": 0.5688135623931885, "sampling/sampling_logp_difference/max": 0.5642025470733643, "sampling/sampling_logp_difference/mean": 0.012807752937078476, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 82.1875, "completions/mean_terminated_length": 82.1875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17241644859313965, "epoch": 0.9628318584070796, "frac_reward_zero_std": 1.0, "grad_norm": 0.13029437575329306, "kl": 0.05050419270992279, "learning_rate": 6.200615446999981e-07, "loss": 0.0005, "num_tokens": 12958678.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6239190101623535, "sampling/importance_sampling_ratio/mean": 1.0004127025604248, "sampling/importance_sampling_ratio/min": 0.3698098957538605, "sampling/sampling_logp_difference/max": 0.9947662353515625, "sampling/sampling_logp_difference/mean": 0.013712817803025246, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 180.015625, "completions/mean_terminated_length": 180.015625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2560388445854187, "epoch": 0.9646017699115044, "frac_reward_zero_std": 0.5, "grad_norm": 1.741501452622281, "kl": 0.07778279483318329, "learning_rate": 6.185616245195848e-07, "loss": -0.017, "num_tokens": 12983303.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 1.0004401206970215, "sampling/importance_sampling_ratio/min": 0.607722818851471, "sampling/sampling_logp_difference/max": 0.49803638458251953, "sampling/sampling_logp_difference/mean": 0.014068507589399815, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 231.078125, "completions/mean_terminated_length": 231.078125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2516305446624756, "epoch": 0.9663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 1.9389088566968768, "kl": 0.06269054114818573, "learning_rate": 6.170605729770469e-07, "loss": 0.0754, "num_tokens": 13012204.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8878443241119385, "sampling/importance_sampling_ratio/mean": 0.9997134208679199, "sampling/importance_sampling_ratio/min": 0.2951732277870178, "sampling/sampling_logp_difference/max": 1.2201929092407227, "sampling/sampling_logp_difference/mean": 0.014843055047094822, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.2317388653755188, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.6430394205062622, "kl": 0.08215981721878052, "learning_rate": 6.155584043960143e-07, "loss": 0.0185, "num_tokens": 13029880.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5253117084503174, "sampling/importance_sampling_ratio/mean": 0.9990566968917847, "sampling/importance_sampling_ratio/min": 0.5325444340705872, "sampling/sampling_logp_difference/max": 0.6300889253616333, "sampling/sampling_logp_difference/mean": 0.016833296045660973, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 137.640625, "completions/mean_terminated_length": 137.640625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.23493367433547974, "epoch": 0.9699115044247788, "frac_reward_zero_std": 0.75, "grad_norm": 1.7545168027515754, "kl": 0.06440439075231552, "learning_rate": 6.140551331107766e-07, "loss": 0.0326, "num_tokens": 13049521.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006847381591797, "sampling/importance_sampling_ratio/min": 0.48806527256965637, "sampling/sampling_logp_difference/max": 0.7390429973602295, "sampling/sampling_logp_difference/mean": 0.01386171206831932, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 147.734375, "completions/mean_terminated_length": 147.734375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.23575876653194427, "epoch": 0.9716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 1.6107427452155576, "kl": 0.07220037281513214, "learning_rate": 6.125507734661458e-07, "loss": -0.0502, "num_tokens": 13071664.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8141837120056152, "sampling/importance_sampling_ratio/mean": 0.9996311664581299, "sampling/importance_sampling_ratio/min": 0.6071847677230835, "sampling/sampling_logp_difference/max": 0.5956356525421143, "sampling/sampling_logp_difference/mean": 0.015431705862283707, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 96.890625, "completions/mean_terminated_length": 96.890625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1880950927734375, "epoch": 0.9734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.08729096943677658, "kl": 0.05962299555540085, "learning_rate": 6.110453398173187e-07, "loss": 0.0006, "num_tokens": 13087065.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8367936611175537, "sampling/importance_sampling_ratio/mean": 0.9990761876106262, "sampling/importance_sampling_ratio/min": 0.5893262028694153, "sampling/sampling_logp_difference/max": 0.6080214977264404, "sampling/sampling_logp_difference/mean": 0.013810696080327034, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 129.109375, "completions/mean_terminated_length": 129.109375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.23359403014183044, "epoch": 0.9752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 2.0363647958160693, "kl": 0.07948273420333862, "learning_rate": 6.095388465297418e-07, "loss": 0.032, "num_tokens": 13105840.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6977187395095825, "sampling/importance_sampling_ratio/mean": 0.9998078942298889, "sampling/importance_sampling_ratio/min": 0.5673584938049316, "sampling/sampling_logp_difference/max": 0.5667638778686523, "sampling/sampling_logp_difference/mean": 0.014253129251301289, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.2905304431915283, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 1.357117304850838, "kl": 0.0875648558139801, "learning_rate": 6.080313079789723e-07, "loss": -0.0368, "num_tokens": 13128612.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.507219672203064, "sampling/importance_sampling_ratio/mean": 0.9992555379867554, "sampling/importance_sampling_ratio/min": 0.5465753674507141, "sampling/sampling_logp_difference/max": 0.6040830612182617, "sampling/sampling_logp_difference/mean": 0.01624278351664543, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2660430073738098, "epoch": 0.9787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 2.500235020709489, "kl": 0.09857979416847229, "learning_rate": 6.065227385505421e-07, "loss": -0.0222, "num_tokens": 13146296.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7718896865844727, "sampling/importance_sampling_ratio/mean": 0.9994828701019287, "sampling/importance_sampling_ratio/min": 0.1385183483362198, "sampling/sampling_logp_difference/max": 1.976752519607544, "sampling/sampling_logp_difference/mean": 0.01667897403240204, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 146.484375, "completions/mean_terminated_length": 146.484375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18119508028030396, "epoch": 0.9805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.053228655388293834, "kl": 0.0559057891368866, "learning_rate": 6.050131526398201e-07, "loss": 0.0005, "num_tokens": 13165623.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7368468046188354, "sampling/importance_sampling_ratio/mean": 0.9994401335716248, "sampling/importance_sampling_ratio/min": 0.6369073390960693, "sampling/sampling_logp_difference/max": 0.5520713329315186, "sampling/sampling_logp_difference/mean": 0.013399166986346245, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 150.203125, "completions/mean_terminated_length": 150.203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.26822900772094727, "epoch": 0.9823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 1.5406791323256503, "kl": 0.08075505495071411, "learning_rate": 6.035025646518746e-07, "loss": 0.0143, "num_tokens": 13188148.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9115368127822876, "sampling/importance_sampling_ratio/mean": 1.000479817390442, "sampling/importance_sampling_ratio/min": 0.6070020198822021, "sampling/sampling_logp_difference/max": 0.6479074954986572, "sampling/sampling_logp_difference/mean": 0.015663431957364082, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.2163000851869583, "epoch": 0.984070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.695017255006761, "kl": 0.07965029776096344, "learning_rate": 6.019909890013366e-07, "loss": 0.0587, "num_tokens": 13208464.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000735878944397, "sampling/importance_sampling_ratio/min": 0.7045882344245911, "sampling/sampling_logp_difference/max": 0.739043116569519, "sampling/sampling_logp_difference/mean": 0.013091850094497204, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 177.09375, "completions/mean_terminated_length": 177.09375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.3319791555404663, "epoch": 0.9858407079646018, "frac_reward_zero_std": 0.25, "grad_norm": 2.258295331040004, "kl": 0.10694355517625809, "learning_rate": 6.004784401122612e-07, "loss": -0.0192, "num_tokens": 13235350.0, "reward": 0.40625, "reward_std": 0.5431214570999146, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5913461446762085, "sampling/importance_sampling_ratio/mean": 0.9996432662010193, "sampling/importance_sampling_ratio/min": 0.6284948587417603, "sampling/sampling_logp_difference/max": 0.4645802974700928, "sampling/sampling_logp_difference/mean": 0.01758665218949318, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 136.03125, "completions/mean_terminated_length": 136.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19543331861495972, "epoch": 0.9876106194690265, "frac_reward_zero_std": 0.75, "grad_norm": 1.7850760277891125, "kl": 0.06088044121861458, "learning_rate": 5.98964932417991e-07, "loss": 0.0075, "num_tokens": 13254328.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4152618646621704, "sampling/importance_sampling_ratio/mean": 1.0002940893173218, "sampling/importance_sampling_ratio/min": 0.5911261439323425, "sampling/sampling_logp_difference/max": 0.5257258415222168, "sampling/sampling_logp_difference/mean": 0.013785259798169136, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 168.515625, "completions/mean_terminated_length": 168.515625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.27764254808425903, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.75, "grad_norm": 1.5025811578300445, "kl": 0.0846179947257042, "learning_rate": 5.974504803610178e-07, "loss": 0.0435, "num_tokens": 13277145.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5486700534820557, "sampling/importance_sampling_ratio/mean": 1.0003268718719482, "sampling/importance_sampling_ratio/min": 0.5045318007469177, "sampling/sampling_logp_difference/max": 0.6841244697570801, "sampling/sampling_logp_difference/mean": 0.014884866774082184, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 153.6875, "completions/mean_terminated_length": 153.6875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.2831330895423889, "epoch": 0.9911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 1.4412746482878112, "kl": 0.07578208297491074, "learning_rate": 5.959350983928445e-07, "loss": 0.0129, "num_tokens": 13299653.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 0.9997293949127197, "sampling/importance_sampling_ratio/min": 0.4701867699623108, "sampling/sampling_logp_difference/max": 0.7546253204345703, "sampling/sampling_logp_difference/mean": 0.016243822872638702, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.29424357414245605, "epoch": 0.9929203539823008, "frac_reward_zero_std": 0.75, "grad_norm": 1.3042316297926195, "kl": 0.09966054558753967, "learning_rate": 5.944188009738483e-07, "loss": -0.006, "num_tokens": 13320085.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.964274287223816, "sampling/importance_sampling_ratio/mean": 0.9996316432952881, "sampling/importance_sampling_ratio/min": 0.5054131150245667, "sampling/sampling_logp_difference/max": 0.6823791861534119, "sampling/sampling_logp_difference/mean": 0.017570752650499344, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 185.03125, "completions/mean_terminated_length": 185.03125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.1773672103881836, "epoch": 0.9946902654867257, "frac_reward_zero_std": 1.0, "grad_norm": 0.04526740882009005, "kl": 0.05905608832836151, "learning_rate": 5.929016025731413e-07, "loss": 0.0004, "num_tokens": 13342039.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003750324249268, "sampling/importance_sampling_ratio/min": 0.1563902497291565, "sampling/sampling_logp_difference/max": 1.855400800704956, "sampling/sampling_logp_difference/mean": 0.01313160639256239, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1479528546333313, "epoch": 0.9964601769911504, "frac_reward_zero_std": 1.0, "grad_norm": 0.06728779499893418, "kl": 0.06930910795927048, "learning_rate": 5.913835176684334e-07, "loss": 0.0007, "num_tokens": 13359223.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.898005485534668, "sampling/importance_sampling_ratio/mean": 0.9998059272766113, "sampling/importance_sampling_ratio/min": 0.5456662774085999, "sampling/sampling_logp_difference/max": 0.6408035755157471, "sampling/sampling_logp_difference/mean": 0.010531054809689522, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 253.40625, "completions/mean_terminated_length": 253.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.32677119970321655, "epoch": 0.9982300884955753, "frac_reward_zero_std": 0.0, "grad_norm": 1.8500650673480148, "kl": 0.09496290236711502, "learning_rate": 5.89864560745894e-07, "loss": 0.0382, "num_tokens": 13386225.0, "reward": -0.125, "reward_std": 0.92271888256073, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4582698345184326, "sampling/importance_sampling_ratio/mean": 0.9998013973236084, "sampling/importance_sampling_ratio/min": 0.5901001691818237, "sampling/sampling_logp_difference/max": 0.5274630188941956, "sampling/sampling_logp_difference/mean": 0.015088970772922039, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 168.171875, "completions/mean_terminated_length": 168.171875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.21553805470466614, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 1.5485652836539958, "kl": 0.08783610910177231, "learning_rate": 5.883447463000135e-07, "loss": 0.031, "num_tokens": 13406780.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6571310758590698, "sampling/importance_sampling_ratio/mean": 0.9994630813598633, "sampling/importance_sampling_ratio/min": 0.34505006670951843, "sampling/sampling_logp_difference/max": 1.06406569480896, "sampling/sampling_logp_difference/mean": 0.01440642774105072, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 92.734375, "completions/mean_terminated_length": 92.734375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.15878769755363464, "epoch": 1.0017699115044247, "frac_reward_zero_std": 1.0, "grad_norm": 0.16786989102997335, "kl": 0.07489940524101257, "learning_rate": 5.868240888334652e-07, "loss": 0.0008, "num_tokens": 13422203.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6159719228744507, "sampling/importance_sampling_ratio/mean": 1.0011003017425537, "sampling/importance_sampling_ratio/min": 0.5127840042114258, "sampling/sampling_logp_difference/max": 0.667900562286377, "sampling/sampling_logp_difference/mean": 0.012248801998794079, "step": 566 } ], "logging_steps": 1, "max_steps": 1130, "num_input_tokens_seen": 13422203, "num_train_epochs": 2, "save_steps": 283, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }