{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0017699115044247, "eval_steps": 500, "global_step": 566, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 119.640625, "completions/mean_terminated_length": 119.640625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5032907128334045, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 1.197581661458249, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0092, "num_tokens": 17321.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3671250343322754, "sampling/importance_sampling_ratio/mean": 1.0002808570861816, "sampling/importance_sampling_ratio/min": 0.7081161737442017, "sampling/sampling_logp_difference/max": 0.34514713287353516, "sampling/sampling_logp_difference/mean": 0.01661113277077675, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 204.09375, "completions/mean_terminated_length": 204.09375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4655645191669464, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 0.8178016231119396, "kl": 0.0, "learning_rate": 8.849557522123893e-09, "loss": 0.0201, "num_tokens": 40543.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5473402738571167, "sampling/importance_sampling_ratio/mean": 1.0002034902572632, "sampling/importance_sampling_ratio/min": 0.6968656182289124, "sampling/sampling_logp_difference/max": 0.436537504196167, "sampling/sampling_logp_difference/mean": 0.01577112451195717, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 267.5, "completions/mean_terminated_length": 267.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.5844632983207703, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 0.5918728285632905, "kl": 0.0003235383774153888, "learning_rate": 1.7699115044247786e-08, "loss": -0.0066, "num_tokens": 71343.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3379162549972534, "sampling/importance_sampling_ratio/mean": 1.0000946521759033, "sampling/importance_sampling_ratio/min": 0.6870805621147156, "sampling/sampling_logp_difference/max": 0.3753037452697754, "sampling/sampling_logp_difference/mean": 0.016847141087055206, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.6066123247146606, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 1.2223548604415564, "kl": 0.0003290092572569847, "learning_rate": 2.654867256637168e-08, "loss": -0.0011, "num_tokens": 96711.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.2738739252090454, "sampling/importance_sampling_ratio/mean": 0.9996072053909302, "sampling/importance_sampling_ratio/min": 0.6956945657730103, "sampling/sampling_logp_difference/max": 0.36284446716308594, "sampling/sampling_logp_difference/mean": 0.017108086496591568, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 144.96875, "completions/mean_terminated_length": 144.96875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4158593416213989, "epoch": 0.008849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.8614452210534571, "kl": 0.00036510598147287965, "learning_rate": 3.539823008849557e-08, "loss": -0.0019, "num_tokens": 117429.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.636415958404541, "sampling/importance_sampling_ratio/mean": 1.0001493692398071, "sampling/importance_sampling_ratio/min": 0.717685341835022, "sampling/sampling_logp_difference/max": 0.4925084114074707, "sampling/sampling_logp_difference/mean": 0.014107795432209969, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 133.21875, "completions/mean_terminated_length": 133.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.40092116594314575, "epoch": 0.010619469026548672, "frac_reward_zero_std": 0.75, "grad_norm": 1.2782024402260204, "kl": 0.0004086779954377562, "learning_rate": 4.424778761061947e-08, "loss": -0.0243, "num_tokens": 136851.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6541728973388672, "sampling/importance_sampling_ratio/mean": 1.0002083778381348, "sampling/importance_sampling_ratio/min": 0.6771749258041382, "sampling/sampling_logp_difference/max": 0.5033011436462402, "sampling/sampling_logp_difference/mean": 0.014566268771886826, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.5555839538574219, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.5, "grad_norm": 1.6135981781006026, "kl": 0.0003298893861938268, "learning_rate": 5.309734513274336e-08, "loss": 0.0613, "num_tokens": 160243.0, "reward": 0.1875, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5277698040008545, "sampling/importance_sampling_ratio/mean": 1.000162124633789, "sampling/importance_sampling_ratio/min": 0.7050753831863403, "sampling/sampling_logp_difference/max": 0.4238090515136719, "sampling/sampling_logp_difference/mean": 0.01694803312420845, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 164.453125, "completions/mean_terminated_length": 164.453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5913296937942505, "epoch": 0.01415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.7973323730261501, "kl": 0.0003854926908388734, "learning_rate": 6.194690265486725e-08, "loss": 0.0218, "num_tokens": 181744.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3442672491073608, "sampling/importance_sampling_ratio/mean": 0.9994102120399475, "sampling/importance_sampling_ratio/min": 0.6955088973045349, "sampling/sampling_logp_difference/max": 0.3631114959716797, "sampling/sampling_logp_difference/mean": 0.0170879028737545, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 106.0625, "completions/mean_terminated_length": 106.0625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3793051540851593, "epoch": 0.01592920353982301, "frac_reward_zero_std": 1.0, "grad_norm": 0.004018561548333935, "kl": 0.00034292670898139477, "learning_rate": 7.079646017699114e-08, "loss": 0.0, "num_tokens": 198340.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3619581460952759, "sampling/importance_sampling_ratio/mean": 0.999984860420227, "sampling/importance_sampling_ratio/min": 0.6996299624443054, "sampling/sampling_logp_difference/max": 0.35720372200012207, "sampling/sampling_logp_difference/mean": 0.014426184818148613, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 183.328125, "completions/mean_terminated_length": 183.328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.44784918427467346, "epoch": 0.017699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 1.563306441755095, "kl": 0.00031801534350961447, "learning_rate": 7.964601769911503e-08, "loss": -0.0447, "num_tokens": 221753.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4040929079055786, "sampling/importance_sampling_ratio/mean": 0.9993531703948975, "sampling/importance_sampling_ratio/min": 0.6400877237319946, "sampling/sampling_logp_difference/max": 0.4461500644683838, "sampling/sampling_logp_difference/mean": 0.01504572480916977, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 168.703125, "completions/mean_terminated_length": 168.703125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.45911580324172974, "epoch": 0.019469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 1.0220066337052842, "kl": 0.00025416217977181077, "learning_rate": 8.849557522123894e-08, "loss": -0.032, "num_tokens": 242534.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.404308795928955, "sampling/importance_sampling_ratio/mean": 1.000216007232666, "sampling/importance_sampling_ratio/min": 0.7108312249183655, "sampling/sampling_logp_difference/max": 0.341320276260376, "sampling/sampling_logp_difference/mean": 0.013914674520492554, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 112.0625, "completions/mean_terminated_length": 112.0625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.4932432770729065, "epoch": 0.021238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029255727279340546, "kl": 0.00035566199221648276, "learning_rate": 9.734513274336283e-08, "loss": 0.0, "num_tokens": 260010.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.543989658355713, "sampling/importance_sampling_ratio/mean": 1.0002937316894531, "sampling/importance_sampling_ratio/min": 0.7327228784561157, "sampling/sampling_logp_difference/max": 0.4343698024749756, "sampling/sampling_logp_difference/mean": 0.01683472841978073, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.4947296977043152, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 1.3178133854373153, "kl": 0.0003659637295641005, "learning_rate": 1.0619469026548672e-07, "loss": -0.037, "num_tokens": 281142.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3199342489242554, "sampling/importance_sampling_ratio/mean": 0.9999753832817078, "sampling/importance_sampling_ratio/min": 0.7306388020515442, "sampling/sampling_logp_difference/max": 0.31383609771728516, "sampling/sampling_logp_difference/mean": 0.016047393903136253, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 208.21875, "completions/mean_terminated_length": 208.21875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.6193951368331909, "epoch": 0.024778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.002129757602630544, "kl": 0.0003695795312523842, "learning_rate": 1.1504424778761061e-07, "loss": 0.0, "num_tokens": 307924.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4323190450668335, "sampling/importance_sampling_ratio/mean": 1.0006804466247559, "sampling/importance_sampling_ratio/min": 0.708014190196991, "sampling/sampling_logp_difference/max": 0.3592948913574219, "sampling/sampling_logp_difference/mean": 0.018177257850766182, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 231.21875, "completions/mean_terminated_length": 231.21875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.49823617935180664, "epoch": 0.02654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 0.9588983728620929, "kl": 0.00032964831916615367, "learning_rate": 1.238938053097345e-07, "loss": 0.0279, "num_tokens": 335922.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4731558561325073, "sampling/importance_sampling_ratio/mean": 1.0001003742218018, "sampling/importance_sampling_ratio/min": 0.48042166233062744, "sampling/sampling_logp_difference/max": 0.7330911159515381, "sampling/sampling_logp_difference/mean": 0.014913346618413925, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 234.9375, "completions/mean_terminated_length": 234.9375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.3467172682285309, "epoch": 0.02831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.9784864288270121, "kl": 0.0003464373294264078, "learning_rate": 1.327433628318584e-07, "loss": -0.1183, "num_tokens": 362174.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2827069759368896, "sampling/importance_sampling_ratio/mean": 0.9999648332595825, "sampling/importance_sampling_ratio/min": 0.6876590251922607, "sampling/sampling_logp_difference/max": 0.3744621276855469, "sampling/sampling_logp_difference/mean": 0.012679207138717175, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 213.21875, "completions/mean_terminated_length": 213.21875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5071783065795898, "epoch": 0.03008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 1.0013124681125116, "kl": 0.00035910963197238743, "learning_rate": 1.4159292035398229e-07, "loss": -0.0119, "num_tokens": 386364.0, "reward": 0.375, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.386346697807312, "sampling/importance_sampling_ratio/mean": 0.9996243119239807, "sampling/importance_sampling_ratio/min": 0.6823737025260925, "sampling/sampling_logp_difference/max": 0.38217782974243164, "sampling/sampling_logp_difference/mean": 0.015556364320218563, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 237.328125, "completions/mean_terminated_length": 237.328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3886912763118744, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.25, "grad_norm": 1.4652163391027393, "kl": 0.0002612382231745869, "learning_rate": 1.504424778761062e-07, "loss": 0.0634, "num_tokens": 411937.0, "reward": 0.125, "reward_std": 0.7537294626235962, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3966903686523438, "sampling/importance_sampling_ratio/mean": 0.9995191097259521, "sampling/importance_sampling_ratio/min": 0.6955556273460388, "sampling/sampling_logp_difference/max": 0.36304426193237305, "sampling/sampling_logp_difference/mean": 0.013643961399793625, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 166.515625, "completions/mean_terminated_length": 166.515625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.44238975644111633, "epoch": 0.033628318584070796, "frac_reward_zero_std": 0.5, "grad_norm": 1.4214419352169998, "kl": 0.0003654281026683748, "learning_rate": 1.5929203539823007e-07, "loss": -0.0062, "num_tokens": 434242.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5126184225082397, "sampling/importance_sampling_ratio/mean": 1.0000451803207397, "sampling/importance_sampling_ratio/min": 0.6579197645187378, "sampling/sampling_logp_difference/max": 0.4186723232269287, "sampling/sampling_logp_difference/mean": 0.01501513086259365, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 133.609375, "completions/mean_terminated_length": 133.609375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4399692118167877, "epoch": 0.035398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017812548897053737, "kl": 0.0003704609989654273, "learning_rate": 1.68141592920354e-07, "loss": 0.0, "num_tokens": 452729.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4165520668029785, "sampling/importance_sampling_ratio/mean": 0.9995684623718262, "sampling/importance_sampling_ratio/min": 0.7300204634666443, "sampling/sampling_logp_difference/max": 0.34822583198547363, "sampling/sampling_logp_difference/mean": 0.015125056728720665, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 192.203125, "completions/mean_terminated_length": 192.203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.4893031716346741, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 0.6250544919291408, "kl": 0.0002952019567601383, "learning_rate": 1.7699115044247788e-07, "loss": -0.004, "num_tokens": 475382.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.503237247467041, "sampling/importance_sampling_ratio/mean": 0.9992388486862183, "sampling/importance_sampling_ratio/min": 0.6985951662063599, "sampling/sampling_logp_difference/max": 0.407620906829834, "sampling/sampling_logp_difference/mean": 0.014747219160199165, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5142182111740112, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 0.8664434236098217, "kl": 0.0003179911873303354, "learning_rate": 1.8584070796460178e-07, "loss": -0.0106, "num_tokens": 501310.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.342726230621338, "sampling/importance_sampling_ratio/mean": 0.9997316598892212, "sampling/importance_sampling_ratio/min": 0.7164933085441589, "sampling/sampling_logp_difference/max": 0.3333864212036133, "sampling/sampling_logp_difference/mean": 0.016416512429714203, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 208.734375, "completions/mean_terminated_length": 208.734375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.5367847681045532, "epoch": 0.04070796460176991, "frac_reward_zero_std": 0.5, "grad_norm": 1.2154985290713616, "kl": 0.00028733719955198467, "learning_rate": 1.9469026548672566e-07, "loss": -0.0165, "num_tokens": 525037.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.443819284439087, "sampling/importance_sampling_ratio/mean": 1.0003135204315186, "sampling/importance_sampling_ratio/min": 0.7076250910758972, "sampling/sampling_logp_difference/max": 0.3672919273376465, "sampling/sampling_logp_difference/mean": 0.015358660370111465, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 177.59375, "completions/mean_terminated_length": 177.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.4362090229988098, "epoch": 0.04247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 0.8691423799745507, "kl": 0.00036039401311427355, "learning_rate": 2.0353982300884956e-07, "loss": -0.0424, "num_tokens": 547283.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6200610399246216, "sampling/importance_sampling_ratio/mean": 0.99936842918396, "sampling/importance_sampling_ratio/min": 0.6299377083778381, "sampling/sampling_logp_difference/max": 0.4824638366699219, "sampling/sampling_logp_difference/mean": 0.015077032148838043, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 197.109375, "completions/mean_terminated_length": 197.109375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5380075573921204, "epoch": 0.04424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 0.9177797773542464, "kl": 0.0003365460433997214, "learning_rate": 2.1238938053097344e-07, "loss": 0.0157, "num_tokens": 572330.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4116113185882568, "sampling/importance_sampling_ratio/mean": 0.9999536871910095, "sampling/importance_sampling_ratio/min": 0.6641045808792114, "sampling/sampling_logp_difference/max": 0.4093155860900879, "sampling/sampling_logp_difference/mean": 0.01646638847887516, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 244.765625, "completions/mean_terminated_length": 244.765625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4543125331401825, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.5, "grad_norm": 0.9349783033788508, "kl": 0.00024864188162609935, "learning_rate": 2.2123893805309735e-07, "loss": 0.0152, "num_tokens": 599211.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.291829228401184, "sampling/importance_sampling_ratio/mean": 1.0000139474868774, "sampling/importance_sampling_ratio/min": 0.5567372441291809, "sampling/sampling_logp_difference/max": 0.5856618881225586, "sampling/sampling_logp_difference/mean": 0.013491150923073292, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.41913625597953796, "epoch": 0.047787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024251111136364234, "kl": 0.00032045069383457303, "learning_rate": 2.3008849557522122e-07, "loss": 0.0, "num_tokens": 619563.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.272973656654358, "sampling/importance_sampling_ratio/mean": 0.9999419450759888, "sampling/importance_sampling_ratio/min": 0.7201418280601501, "sampling/sampling_logp_difference/max": 0.3283071517944336, "sampling/sampling_logp_difference/mean": 0.014692498371005058, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 224.328125, "completions/mean_terminated_length": 224.328125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.47675684094429016, "epoch": 0.049557522123893805, "frac_reward_zero_std": 0.5, "grad_norm": 1.1549458875644594, "kl": 0.0002931379422079772, "learning_rate": 2.3893805309734513e-07, "loss": -0.0028, "num_tokens": 645120.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5372246503829956, "sampling/importance_sampling_ratio/mean": 1.0002179145812988, "sampling/importance_sampling_ratio/min": 0.700880229473114, "sampling/sampling_logp_difference/max": 0.429978609085083, "sampling/sampling_logp_difference/mean": 0.014232289046049118, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 111.3125, "completions/mean_terminated_length": 111.3125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.49748218059539795, "epoch": 0.05132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 1.2068886456859147, "kl": 0.00046287733130156994, "learning_rate": 2.47787610619469e-07, "loss": -0.0244, "num_tokens": 662676.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.276251196861267, "sampling/importance_sampling_ratio/mean": 1.0002195835113525, "sampling/importance_sampling_ratio/min": 0.7073787450790405, "sampling/sampling_logp_difference/max": 0.346189022064209, "sampling/sampling_logp_difference/mean": 0.01758941262960434, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 173.6875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.6052020192146301, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.5, "grad_norm": 1.2441525063044685, "kl": 0.00044939230429008603, "learning_rate": 2.5663716814159294e-07, "loss": -0.0337, "num_tokens": 685872.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5561800003051758, "sampling/importance_sampling_ratio/mean": 0.9993517398834229, "sampling/importance_sampling_ratio/min": 0.7488274574279785, "sampling/sampling_logp_difference/max": 0.4422340393066406, "sampling/sampling_logp_difference/mean": 0.018925733864307404, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 303.796875, "completions/mean_terminated_length": 303.796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.35878241062164307, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.25, "grad_norm": 0.936650159805666, "kl": 0.00023944814165588468, "learning_rate": 2.654867256637168e-07, "loss": -0.1245, "num_tokens": 717315.0, "reward": 0.65625, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4352022409439087, "sampling/importance_sampling_ratio/mean": 1.0003821849822998, "sampling/importance_sampling_ratio/min": 0.6623601913452148, "sampling/sampling_logp_difference/max": 0.41194581985473633, "sampling/sampling_logp_difference/mean": 0.011384059675037861, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 271.046875, "completions/mean_terminated_length": 271.046875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.48334968090057373, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 0.890487583170878, "kl": 0.00024957250570878386, "learning_rate": 2.743362831858407e-07, "loss": -0.0766, "num_tokens": 746566.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3552336692810059, "sampling/importance_sampling_ratio/mean": 0.9996834993362427, "sampling/importance_sampling_ratio/min": 0.7147769927978516, "sampling/sampling_logp_difference/max": 0.3357846736907959, "sampling/sampling_logp_difference/mean": 0.013956459239125252, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 173.59375, "completions/mean_terminated_length": 173.59375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.4724288880825043, "epoch": 0.0584070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.0287426168184086, "kl": 0.00030616761068813503, "learning_rate": 2.8318584070796457e-07, "loss": -0.034, "num_tokens": 768828.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4375419616699219, "sampling/importance_sampling_ratio/mean": 1.0006976127624512, "sampling/importance_sampling_ratio/min": 0.6179822683334351, "sampling/sampling_logp_difference/max": 0.4812955856323242, "sampling/sampling_logp_difference/mean": 0.015223699621856213, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 159.53125, "completions/mean_terminated_length": 159.53125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5169718265533447, "epoch": 0.06017699115044248, "frac_reward_zero_std": 0.75, "grad_norm": 1.237394541415371, "kl": 0.00046695698983967304, "learning_rate": 2.920353982300885e-07, "loss": -0.0049, "num_tokens": 790510.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.387283205986023, "sampling/importance_sampling_ratio/mean": 1.0002853870391846, "sampling/importance_sampling_ratio/min": 0.7103862762451172, "sampling/sampling_logp_difference/max": 0.3419463634490967, "sampling/sampling_logp_difference/mean": 0.016630595549941063, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 165.859375, "completions/mean_terminated_length": 165.859375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.41826707124710083, "epoch": 0.061946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.002856300860067278, "kl": 0.00033567112404853106, "learning_rate": 3.008849557522124e-07, "loss": 0.0, "num_tokens": 814565.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.449743390083313, "sampling/importance_sampling_ratio/mean": 1.0003138780593872, "sampling/importance_sampling_ratio/min": 0.7154431343078613, "sampling/sampling_logp_difference/max": 0.3713865280151367, "sampling/sampling_logp_difference/mean": 0.014297478832304478, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 89.484375, "completions/mean_terminated_length": 89.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3725784420967102, "epoch": 0.06371681415929203, "frac_reward_zero_std": 1.0, "grad_norm": 0.004419671064042402, "kl": 0.00045324297389015555, "learning_rate": 3.0973451327433626e-07, "loss": 0.0, "num_tokens": 829668.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3111563920974731, "sampling/importance_sampling_ratio/mean": 1.0007599592208862, "sampling/importance_sampling_ratio/min": 0.7192742228507996, "sampling/sampling_logp_difference/max": 0.3295125961303711, "sampling/sampling_logp_difference/mean": 0.015164341777563095, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 186.171875, "completions/mean_terminated_length": 186.171875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.5222961902618408, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.75, "grad_norm": 1.1725115850214423, "kl": 0.0003714629274327308, "learning_rate": 3.1858407079646014e-07, "loss": 0.0357, "num_tokens": 856047.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.385926604270935, "sampling/importance_sampling_ratio/mean": 0.9998730421066284, "sampling/importance_sampling_ratio/min": 0.6878253221511841, "sampling/sampling_logp_difference/max": 0.3742203712463379, "sampling/sampling_logp_difference/mean": 0.01648404821753502, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 138.765625, "completions/mean_terminated_length": 138.765625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.4833816885948181, "epoch": 0.06725663716814159, "frac_reward_zero_std": 0.75, "grad_norm": 0.9931391806446179, "kl": 0.00040409673238173127, "learning_rate": 3.2743362831858407e-07, "loss": -0.0163, "num_tokens": 874736.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.2900668382644653, "sampling/importance_sampling_ratio/mean": 0.9992552399635315, "sampling/importance_sampling_ratio/min": 0.7554990649223328, "sampling/sampling_logp_difference/max": 0.2803767919540405, "sampling/sampling_logp_difference/mean": 0.016056213527917862, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 132.96875, "completions/mean_terminated_length": 132.96875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3930308222770691, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 1.1750550787306857, "kl": 0.0004630276234820485, "learning_rate": 3.36283185840708e-07, "loss": -0.02, "num_tokens": 892638.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.428227186203003, "sampling/importance_sampling_ratio/mean": 0.9996497631072998, "sampling/importance_sampling_ratio/min": 0.7563729882240295, "sampling/sampling_logp_difference/max": 0.3564338684082031, "sampling/sampling_logp_difference/mean": 0.014042085036635399, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 167.453125, "completions/mean_terminated_length": 167.453125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.4201562702655792, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.7417597684965663, "kl": 0.0004567916039377451, "learning_rate": 3.451327433628318e-07, "loss": -0.0117, "num_tokens": 914603.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.270693063735962, "sampling/importance_sampling_ratio/mean": 1.000083565711975, "sampling/importance_sampling_ratio/min": 0.6983985304832458, "sampling/sampling_logp_difference/max": 0.3589653968811035, "sampling/sampling_logp_difference/mean": 0.014485509134829044, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.593041181564331, "epoch": 0.07256637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 1.2353352881167206, "kl": 0.00041572609916329384, "learning_rate": 3.5398230088495575e-07, "loss": -0.0267, "num_tokens": 939907.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3544167280197144, "sampling/importance_sampling_ratio/mean": 0.9995003938674927, "sampling/importance_sampling_ratio/min": 0.7054951786994934, "sampling/sampling_logp_difference/max": 0.3488553762435913, "sampling/sampling_logp_difference/mean": 0.01641146093606949, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 219.03125, "completions/mean_terminated_length": 219.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5267437696456909, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.25, "grad_norm": 1.1390948453973633, "kl": 0.00043598079355433583, "learning_rate": 3.6283185840707963e-07, "loss": -0.0232, "num_tokens": 965525.0, "reward": 0.3125, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3127607107162476, "sampling/importance_sampling_ratio/mean": 1.0000357627868652, "sampling/importance_sampling_ratio/min": 0.6622505187988281, "sampling/sampling_logp_difference/max": 0.41211140155792236, "sampling/sampling_logp_difference/mean": 0.015290766023099422, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 193.53125, "completions/mean_terminated_length": 193.53125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.571338951587677, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.5, "grad_norm": 1.0626478262285108, "kl": 0.000434124784078449, "learning_rate": 3.7168141592920356e-07, "loss": -0.005, "num_tokens": 988663.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.434810757637024, "sampling/importance_sampling_ratio/mean": 1.0001163482666016, "sampling/importance_sampling_ratio/min": 0.755242109298706, "sampling/sampling_logp_difference/max": 0.36103296279907227, "sampling/sampling_logp_difference/mean": 0.015851661562919617, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 169.359375, "completions/mean_terminated_length": 169.359375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.44538310170173645, "epoch": 0.07787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 0.7914395304563472, "kl": 0.0004141220997553319, "learning_rate": 3.805309734513274e-07, "loss": -0.0307, "num_tokens": 1011470.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4191381931304932, "sampling/importance_sampling_ratio/mean": 1.0001968145370483, "sampling/importance_sampling_ratio/min": 0.7164943218231201, "sampling/sampling_logp_difference/max": 0.3500497341156006, "sampling/sampling_logp_difference/mean": 0.015289019793272018, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 124.578125, "completions/mean_terminated_length": 124.578125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.3828132152557373, "epoch": 0.07964601769911504, "frac_reward_zero_std": 0.75, "grad_norm": 0.9106055138144511, "kl": 0.0005045943544246256, "learning_rate": 3.893805309734513e-07, "loss": 0.0149, "num_tokens": 1028499.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5080609321594238, "sampling/importance_sampling_ratio/mean": 0.9998210072517395, "sampling/importance_sampling_ratio/min": 0.7134808301925659, "sampling/sampling_logp_difference/max": 0.41082465648651123, "sampling/sampling_logp_difference/mean": 0.014279220253229141, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 242.15625, "completions/mean_terminated_length": 242.15625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4709385335445404, "epoch": 0.08141592920353982, "frac_reward_zero_std": 0.5, "grad_norm": 0.9827505895621653, "kl": 0.0004555697087198496, "learning_rate": 3.982300884955752e-07, "loss": -0.0023, "num_tokens": 1054781.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.432064414024353, "sampling/importance_sampling_ratio/mean": 0.9992622137069702, "sampling/importance_sampling_ratio/min": 0.650286853313446, "sampling/sampling_logp_difference/max": 0.4303417205810547, "sampling/sampling_logp_difference/mean": 0.014137700200080872, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 253.859375, "completions/mean_terminated_length": 253.859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.4534558355808258, "epoch": 0.0831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 1.1457138015634614, "kl": 0.0006146510131657124, "learning_rate": 4.0707964601769913e-07, "loss": -0.0516, "num_tokens": 1081620.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.302929401397705, "sampling/importance_sampling_ratio/mean": 1.0001895427703857, "sampling/importance_sampling_ratio/min": 0.6870179772377014, "sampling/sampling_logp_difference/max": 0.3753948211669922, "sampling/sampling_logp_difference/mean": 0.014385255984961987, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.38244354724884033, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.5, "grad_norm": 0.7522575038937201, "kl": 0.0005814050673507154, "learning_rate": 4.1592920353982295e-07, "loss": -0.0673, "num_tokens": 1108992.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.2758551836013794, "sampling/importance_sampling_ratio/mean": 1.0001693964004517, "sampling/importance_sampling_ratio/min": 0.7555238008499146, "sampling/sampling_logp_difference/max": 0.28034400939941406, "sampling/sampling_logp_difference/mean": 0.011594526469707489, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 155.140625, "completions/mean_terminated_length": 155.140625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.4777386486530304, "epoch": 0.08672566371681416, "frac_reward_zero_std": 0.5, "grad_norm": 1.410659482901693, "kl": 0.0007034146692603827, "learning_rate": 4.247787610619469e-07, "loss": -0.0335, "num_tokens": 1128921.0, "reward": 0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4442780017852783, "sampling/importance_sampling_ratio/mean": 1.000157117843628, "sampling/importance_sampling_ratio/min": 0.7731385827064514, "sampling/sampling_logp_difference/max": 0.36760950088500977, "sampling/sampling_logp_difference/mean": 0.015548791736364365, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4977187216281891, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 1.120892046080377, "kl": 0.0009281488019041717, "learning_rate": 4.3362831858407076e-07, "loss": 0.0008, "num_tokens": 1151599.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3107186555862427, "sampling/importance_sampling_ratio/mean": 1.0000163316726685, "sampling/importance_sampling_ratio/min": 0.6289657950401306, "sampling/sampling_logp_difference/max": 0.46367835998535156, "sampling/sampling_logp_difference/mean": 0.015473298728466034, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 149.96875, "completions/mean_terminated_length": 149.96875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.40268978476524353, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 0.9599331457203282, "kl": 0.0007896269089542329, "learning_rate": 4.424778761061947e-07, "loss": 0.0334, "num_tokens": 1171549.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4640812873840332, "sampling/importance_sampling_ratio/mean": 1.0000137090682983, "sampling/importance_sampling_ratio/min": 0.7770414352416992, "sampling/sampling_logp_difference/max": 0.381227970123291, "sampling/sampling_logp_difference/mean": 0.013961746357381344, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 160.921875, "completions/mean_terminated_length": 160.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.4570772647857666, "epoch": 0.0920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.010874486051355568, "kl": 0.0007457176106981933, "learning_rate": 4.5132743362831857e-07, "loss": 0.0, "num_tokens": 1196168.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2951364517211914, "sampling/importance_sampling_ratio/mean": 1.0003330707550049, "sampling/importance_sampling_ratio/min": 0.7504076957702637, "sampling/sampling_logp_difference/max": 0.28713858127593994, "sampling/sampling_logp_difference/mean": 0.014795606024563313, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 237.5625, "completions/mean_terminated_length": 237.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.5694527626037598, "epoch": 0.09380530973451327, "frac_reward_zero_std": 0.5, "grad_norm": 1.0118068729354581, "kl": 0.0010479043703526258, "learning_rate": 4.6017699115044245e-07, "loss": -0.0249, "num_tokens": 1223708.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4213486909866333, "sampling/importance_sampling_ratio/mean": 1.000314712524414, "sampling/importance_sampling_ratio/min": 0.6865943670272827, "sampling/sampling_logp_difference/max": 0.37601161003112793, "sampling/sampling_logp_difference/mean": 0.016255198046565056, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 211.34375, "completions/mean_terminated_length": 211.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5166953206062317, "epoch": 0.09557522123893805, "frac_reward_zero_std": 0.5, "grad_norm": 0.9293770635008775, "kl": 0.0012105627683922648, "learning_rate": 4.690265486725664e-07, "loss": -0.0153, "num_tokens": 1248850.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2929353713989258, "sampling/importance_sampling_ratio/mean": 0.9998746514320374, "sampling/importance_sampling_ratio/min": 0.7815154790878296, "sampling/sampling_logp_difference/max": 0.2569150924682617, "sampling/sampling_logp_difference/mean": 0.015440109185874462, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 126.078125, "completions/mean_terminated_length": 126.078125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.45780402421951294, "epoch": 0.09734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.014536697482510377, "kl": 0.0013435760047286749, "learning_rate": 4.778761061946903e-07, "loss": 0.0, "num_tokens": 1267639.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.652394413948059, "sampling/importance_sampling_ratio/mean": 1.0007537603378296, "sampling/importance_sampling_ratio/min": 0.7781879305839539, "sampling/sampling_logp_difference/max": 0.502225399017334, "sampling/sampling_logp_difference/mean": 0.015873417258262634, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 238.109375, "completions/mean_terminated_length": 238.109375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4011164903640747, "epoch": 0.09911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.0663002019754262, "kl": 0.0010613617487251759, "learning_rate": 4.867256637168141e-07, "loss": -0.0741, "num_tokens": 1295694.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.790257215499878, "sampling/importance_sampling_ratio/mean": 0.9998331069946289, "sampling/importance_sampling_ratio/min": 0.7862241864204407, "sampling/sampling_logp_difference/max": 0.5823593139648438, "sampling/sampling_logp_difference/mean": 0.013555500656366348, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.43477654457092285, "epoch": 0.10088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 1.1514576732910113, "kl": 0.001417152350768447, "learning_rate": 4.95575221238938e-07, "loss": 0.0297, "num_tokens": 1314588.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.399377703666687, "sampling/importance_sampling_ratio/mean": 0.9996924996376038, "sampling/importance_sampling_ratio/min": 0.6979451179504395, "sampling/sampling_logp_difference/max": 0.35961484909057617, "sampling/sampling_logp_difference/mean": 0.014985587447881699, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 210.421875, "completions/mean_terminated_length": 210.421875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3723405599594116, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 0.7754310293030342, "kl": 0.001261947792954743, "learning_rate": 5.044247787610619e-07, "loss": -0.0314, "num_tokens": 1338999.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.342655897140503, "sampling/importance_sampling_ratio/mean": 1.0001294612884521, "sampling/importance_sampling_ratio/min": 0.6933748722076416, "sampling/sampling_logp_difference/max": 0.3661844730377197, "sampling/sampling_logp_difference/mean": 0.012805428355932236, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 278.140625, "completions/mean_terminated_length": 278.140625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.560793399810791, "epoch": 0.10442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 0.6163574311204023, "kl": 0.0012988373637199402, "learning_rate": 5.132743362831859e-07, "loss": -0.0005, "num_tokens": 1370768.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001578330993652, "sampling/importance_sampling_ratio/min": 0.7807647585868835, "sampling/sampling_logp_difference/max": 0.7096753120422363, "sampling/sampling_logp_difference/mean": 0.01540360227227211, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 301.0625, "completions/mean_terminated_length": 301.0625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.43717724084854126, "epoch": 0.10619469026548672, "frac_reward_zero_std": 0.25, "grad_norm": 0.9915849692203775, "kl": 0.001398495864123106, "learning_rate": 5.221238938053097e-07, "loss": -0.1144, "num_tokens": 1400772.0, "reward": 0.40625, "reward_std": 0.6205305457115173, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4271200895309448, "sampling/importance_sampling_ratio/mean": 0.9992467164993286, "sampling/importance_sampling_ratio/min": 0.6068848967552185, "sampling/sampling_logp_difference/max": 0.4994161128997803, "sampling/sampling_logp_difference/mean": 0.013886969536542892, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 319.46875, "completions/mean_terminated_length": 319.46875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.4440951347351074, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.25, "grad_norm": 0.9971627914784554, "kl": 0.001473217736929655, "learning_rate": 5.309734513274336e-07, "loss": 0.0638, "num_tokens": 1434994.0, "reward": -0.03125, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.338789939880371, "sampling/importance_sampling_ratio/mean": 0.9996881484985352, "sampling/importance_sampling_ratio/min": 0.6665560603141785, "sampling/sampling_logp_difference/max": 0.40563106536865234, "sampling/sampling_logp_difference/mean": 0.013911099173128605, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 100.3125, "completions/mean_terminated_length": 100.3125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.43417152762413025, "epoch": 0.10973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 1.5198689851466292, "kl": 0.0020349654369056225, "learning_rate": 5.398230088495575e-07, "loss": -0.017, "num_tokens": 1451382.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4214783906936646, "sampling/importance_sampling_ratio/mean": 1.0008153915405273, "sampling/importance_sampling_ratio/min": 0.6074426770210266, "sampling/sampling_logp_difference/max": 0.49849748611450195, "sampling/sampling_logp_difference/mean": 0.015794191509485245, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 225.21875, "completions/mean_terminated_length": 225.21875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3711397051811218, "epoch": 0.11150442477876106, "frac_reward_zero_std": 1.0, "grad_norm": 0.009775583269256989, "kl": 0.001354132080450654, "learning_rate": 5.486725663716814e-07, "loss": 0.0, "num_tokens": 1476852.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.42524254322052, "sampling/importance_sampling_ratio/mean": 1.0001494884490967, "sampling/importance_sampling_ratio/min": 0.6258686184883118, "sampling/sampling_logp_difference/max": 0.4686148166656494, "sampling/sampling_logp_difference/mean": 0.013388492166996002, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 265.359375, "completions/mean_terminated_length": 265.359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.48530012369155884, "epoch": 0.11327433628318584, "frac_reward_zero_std": 0.25, "grad_norm": 1.1618124827579868, "kl": 0.0018533555557951331, "learning_rate": 5.575221238938052e-07, "loss": -0.1203, "num_tokens": 1505883.0, "reward": 0.46875, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3516290187835693, "sampling/importance_sampling_ratio/mean": 1.0001146793365479, "sampling/importance_sampling_ratio/min": 0.7140982151031494, "sampling/sampling_logp_difference/max": 0.3367347717285156, "sampling/sampling_logp_difference/mean": 0.014926677569746971, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 205.703125, "completions/mean_terminated_length": 205.703125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.47270679473876953, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.3142645495827467, "kl": 0.002251716796308756, "learning_rate": 5.663716814159291e-07, "loss": 0.0439, "num_tokens": 1529128.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3624502420425415, "sampling/importance_sampling_ratio/mean": 1.0001649856567383, "sampling/importance_sampling_ratio/min": 0.6979560256004333, "sampling/sampling_logp_difference/max": 0.35959911346435547, "sampling/sampling_logp_difference/mean": 0.015638180077075958, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 84.453125, "completions/mean_terminated_length": 84.453125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.3656551241874695, "epoch": 0.1168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.02129541274594342, "kl": 0.0021704663522541523, "learning_rate": 5.752212389380531e-07, "loss": 0.0, "num_tokens": 1544213.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5290725231170654, "sampling/importance_sampling_ratio/mean": 0.9997422099113464, "sampling/importance_sampling_ratio/min": 0.7285875082015991, "sampling/sampling_logp_difference/max": 0.42466139793395996, "sampling/sampling_logp_difference/mean": 0.014993082731962204, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 212.890625, "completions/mean_terminated_length": 212.890625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.40637120604515076, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.25, "grad_norm": 1.1540254582532898, "kl": 0.0028697806410491467, "learning_rate": 5.84070796460177e-07, "loss": 0.0469, "num_tokens": 1567566.0, "reward": -0.0625, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4922453165054321, "sampling/importance_sampling_ratio/mean": 1.0003211498260498, "sampling/importance_sampling_ratio/min": 0.6008909940719604, "sampling/sampling_logp_difference/max": 0.5093417167663574, "sampling/sampling_logp_difference/mean": 0.013592811301350594, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 262.796875, "completions/mean_terminated_length": 262.796875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.44624337553977966, "epoch": 0.12035398230088495, "frac_reward_zero_std": 0.5, "grad_norm": 1.0644518303560102, "kl": 0.002109923167154193, "learning_rate": 5.929203539823009e-07, "loss": 0.0437, "num_tokens": 1594081.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.2843217849731445, "sampling/importance_sampling_ratio/mean": 1.0008705854415894, "sampling/importance_sampling_ratio/min": 0.7311109900474548, "sampling/sampling_logp_difference/max": 0.3131899833679199, "sampling/sampling_logp_difference/mean": 0.014282532036304474, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 220.4375, "completions/mean_terminated_length": 220.4375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.4575630724430084, "epoch": 0.12212389380530973, "frac_reward_zero_std": 0.5, "grad_norm": 1.1459217306500058, "kl": 0.0031225881539285183, "learning_rate": 6.017699115044248e-07, "loss": 0.0079, "num_tokens": 1618781.0, "reward": 0.625, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5783405303955078, "sampling/importance_sampling_ratio/mean": 0.9996960759162903, "sampling/importance_sampling_ratio/min": 0.6276222467422485, "sampling/sampling_logp_difference/max": 0.46581679582595825, "sampling/sampling_logp_difference/mean": 0.015305288136005402, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 206.65625, "completions/mean_terminated_length": 206.65625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.36566808819770813, "epoch": 0.12389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9604760172861007, "kl": 0.0024020487908273935, "learning_rate": 6.106194690265486e-07, "loss": -0.082, "num_tokens": 1643863.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5116113424301147, "sampling/importance_sampling_ratio/mean": 0.9999454021453857, "sampling/importance_sampling_ratio/min": 0.6207948327064514, "sampling/sampling_logp_difference/max": 0.47675466537475586, "sampling/sampling_logp_difference/mean": 0.014049714431166649, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 162.265625, "completions/mean_terminated_length": 162.265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.510326623916626, "epoch": 0.1256637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 1.2446873826177451, "kl": 0.0035546794533729553, "learning_rate": 6.194690265486725e-07, "loss": -0.0068, "num_tokens": 1664472.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3208906650543213, "sampling/importance_sampling_ratio/mean": 1.0001500844955444, "sampling/importance_sampling_ratio/min": 0.6638264656066895, "sampling/sampling_logp_difference/max": 0.40973448753356934, "sampling/sampling_logp_difference/mean": 0.016301102936267853, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 195.390625, "completions/mean_terminated_length": 195.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4624881148338318, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 2.6749672167904532, "kl": 0.003504736814647913, "learning_rate": 6.283185840707964e-07, "loss": 0.0171, "num_tokens": 1687361.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4651128053665161, "sampling/importance_sampling_ratio/mean": 0.9998956918716431, "sampling/importance_sampling_ratio/min": 0.6601060628890991, "sampling/sampling_logp_difference/max": 0.41535472869873047, "sampling/sampling_logp_difference/mean": 0.015287000685930252, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.33651983737945557, "epoch": 0.12920353982300886, "frac_reward_zero_std": 1.0, "grad_norm": 0.022766632280311436, "kl": 0.0031630685552954674, "learning_rate": 6.371681415929203e-07, "loss": 0.0, "num_tokens": 1704569.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2675925493240356, "sampling/importance_sampling_ratio/mean": 0.9999977946281433, "sampling/importance_sampling_ratio/min": 0.7149960398674011, "sampling/sampling_logp_difference/max": 0.3354783058166504, "sampling/sampling_logp_difference/mean": 0.0127112977206707, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 247.859375, "completions/mean_terminated_length": 247.859375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.4296538829803467, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591767345753897, "kl": 0.003236710326746106, "learning_rate": 6.460176991150442e-07, "loss": -0.0254, "num_tokens": 1730416.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3105483055114746, "sampling/importance_sampling_ratio/mean": 1.0005284547805786, "sampling/importance_sampling_ratio/min": 0.6824595928192139, "sampling/sampling_logp_difference/max": 0.382051944732666, "sampling/sampling_logp_difference/mean": 0.013747080229222775, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 286.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.48249033093452454, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.5, "grad_norm": 0.8923628002486057, "kl": 0.003454292193055153, "learning_rate": 6.548672566371681e-07, "loss": -0.03, "num_tokens": 1759248.0, "reward": -0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": -0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3643404245376587, "sampling/importance_sampling_ratio/mean": 0.9996294975280762, "sampling/importance_sampling_ratio/min": 0.6760227084159851, "sampling/sampling_logp_difference/max": 0.3915286064147949, "sampling/sampling_logp_difference/mean": 0.014836130663752556, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 147.015625, "completions/mean_terminated_length": 147.015625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.44801831245422363, "epoch": 0.13451327433628318, "frac_reward_zero_std": 0.75, "grad_norm": 1.346146284903926, "kl": 0.0037919783499091864, "learning_rate": 6.637168141592921e-07, "loss": -0.0554, "num_tokens": 1779745.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.484464168548584, "sampling/importance_sampling_ratio/mean": 1.0008909702301025, "sampling/importance_sampling_ratio/min": 0.6165294051170349, "sampling/sampling_logp_difference/max": 0.48364925384521484, "sampling/sampling_logp_difference/mean": 0.016804197803139687, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 189.59375, "completions/mean_terminated_length": 189.59375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4261323809623718, "epoch": 0.13628318584070798, "frac_reward_zero_std": 0.5, "grad_norm": 1.3254439359031005, "kl": 0.003990200348198414, "learning_rate": 6.72566371681416e-07, "loss": -0.0305, "num_tokens": 1801191.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.355075716972351, "sampling/importance_sampling_ratio/mean": 1.0004005432128906, "sampling/importance_sampling_ratio/min": 0.6889482736587524, "sampling/sampling_logp_difference/max": 0.372589111328125, "sampling/sampling_logp_difference/mean": 0.014611724764108658, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 138.5625, "completions/mean_terminated_length": 138.5625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.42789387702941895, "epoch": 0.13805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.9557459436042481, "kl": 0.004104773513972759, "learning_rate": 6.814159292035397e-07, "loss": -0.0157, "num_tokens": 1824475.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4372409582138062, "sampling/importance_sampling_ratio/mean": 0.99988853931427, "sampling/importance_sampling_ratio/min": 0.7518267631530762, "sampling/sampling_logp_difference/max": 0.36272525787353516, "sampling/sampling_logp_difference/mean": 0.014697415754199028, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 154.546875, "completions/mean_terminated_length": 154.546875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4291580319404602, "epoch": 0.13982300884955753, "frac_reward_zero_std": 0.75, "grad_norm": 1.0872952165920264, "kl": 0.0051791369915008545, "learning_rate": 6.902654867256636e-07, "loss": -0.0352, "num_tokens": 1844494.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6494313478469849, "sampling/importance_sampling_ratio/mean": 1.0000989437103271, "sampling/importance_sampling_ratio/min": 0.633714497089386, "sampling/sampling_logp_difference/max": 0.5004305839538574, "sampling/sampling_logp_difference/mean": 0.01674748957157135, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 228.03125, "completions/mean_terminated_length": 228.03125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.5064385533332825, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.25, "grad_norm": 1.2508944308207364, "kl": 0.006823018193244934, "learning_rate": 6.991150442477876e-07, "loss": -0.0478, "num_tokens": 1871600.0, "reward": 0.4375, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997493624687195, "sampling/importance_sampling_ratio/min": 0.7637916803359985, "sampling/sampling_logp_difference/max": 0.7011445760726929, "sampling/sampling_logp_difference/mean": 0.014860167168080807, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 269.65625, "completions/mean_terminated_length": 269.65625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.43211737275123596, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.5, "grad_norm": 1.1852625865276305, "kl": 0.0051311105489730835, "learning_rate": 7.079646017699115e-07, "loss": 0.0558, "num_tokens": 1900138.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6506516933441162, "sampling/importance_sampling_ratio/mean": 1.0004315376281738, "sampling/importance_sampling_ratio/min": 0.7342322468757629, "sampling/sampling_logp_difference/max": 0.5011701583862305, "sampling/sampling_logp_difference/mean": 0.013296298682689667, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 201.671875, "completions/mean_terminated_length": 201.671875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3959803283214569, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 1.045826184722506, "kl": 0.005446660332381725, "learning_rate": 7.168141592920353e-07, "loss": 0.0411, "num_tokens": 1925477.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3914076089859009, "sampling/importance_sampling_ratio/mean": 0.9998247623443604, "sampling/importance_sampling_ratio/min": 0.7136387825012207, "sampling/sampling_logp_difference/max": 0.3373783826828003, "sampling/sampling_logp_difference/mean": 0.014389789663255215, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 228.84375, "completions/mean_terminated_length": 228.84375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.4334363639354706, "epoch": 0.14690265486725665, "frac_reward_zero_std": 0.75, "grad_norm": 0.740925116424798, "kl": 0.005063987337052822, "learning_rate": 7.256637168141593e-07, "loss": 0.0079, "num_tokens": 1952299.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997670650482178, "sampling/importance_sampling_ratio/min": 0.6632401943206787, "sampling/sampling_logp_difference/max": 0.7203359603881836, "sampling/sampling_logp_difference/mean": 0.015409368090331554, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 179.265625, "completions/mean_terminated_length": 179.265625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.47856420278549194, "epoch": 0.1486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 1.2377920204210848, "kl": 0.008888129144906998, "learning_rate": 7.345132743362832e-07, "loss": -0.0012, "num_tokens": 1974908.0, "reward": 0.46875, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5281422138214111, "sampling/importance_sampling_ratio/mean": 1.0001909732818604, "sampling/importance_sampling_ratio/min": 0.7076852917671204, "sampling/sampling_logp_difference/max": 0.42405271530151367, "sampling/sampling_logp_difference/mean": 0.017659775912761688, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 293.765625, "completions/mean_terminated_length": 293.765625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4348795413970947, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.25, "grad_norm": 0.9360824287337893, "kl": 0.007378062233328819, "learning_rate": 7.433628318584071e-07, "loss": -0.0508, "num_tokens": 2004333.0, "reward": -0.21875, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.500417709350586, "sampling/importance_sampling_ratio/mean": 1.0002942085266113, "sampling/importance_sampling_ratio/min": 0.6656407117843628, "sampling/sampling_logp_difference/max": 0.4070051908493042, "sampling/sampling_logp_difference/mean": 0.013928791508078575, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 160.453125, "completions/mean_terminated_length": 160.453125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.33171865344047546, "epoch": 0.15221238938053097, "frac_reward_zero_std": 0.75, "grad_norm": 1.0675652804981777, "kl": 0.006107178516685963, "learning_rate": 7.522123893805308e-07, "loss": -0.0106, "num_tokens": 2024730.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.714139699935913, "sampling/importance_sampling_ratio/mean": 0.9998903274536133, "sampling/importance_sampling_ratio/min": 0.7208889126777649, "sampling/sampling_logp_difference/max": 0.5389113426208496, "sampling/sampling_logp_difference/mean": 0.013012481853365898, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 166.578125, "completions/mean_terminated_length": 166.578125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3667978048324585, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.75, "grad_norm": 0.9423689414951838, "kl": 0.009107573889195919, "learning_rate": 7.610619469026548e-07, "loss": -0.0191, "num_tokens": 2045215.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.2989658117294312, "sampling/importance_sampling_ratio/mean": 1.0002738237380981, "sampling/importance_sampling_ratio/min": 0.6355611085891724, "sampling/sampling_logp_difference/max": 0.4532470703125, "sampling/sampling_logp_difference/mean": 0.013977423310279846, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 229.0625, "completions/mean_terminated_length": 229.0625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4350617527961731, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.5, "grad_norm": 1.084586219076594, "kl": 0.007374628912657499, "learning_rate": 7.699115044247787e-07, "loss": 0.0034, "num_tokens": 2070771.0, "reward": 0.34375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4263793230056763, "sampling/importance_sampling_ratio/mean": 1.000484585762024, "sampling/importance_sampling_ratio/min": 0.7787384986877441, "sampling/sampling_logp_difference/max": 0.35513925552368164, "sampling/sampling_logp_difference/mean": 0.014628564938902855, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 168.171875, "completions/mean_terminated_length": 168.171875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.36919862031936646, "epoch": 0.15752212389380532, "frac_reward_zero_std": 0.75, "grad_norm": 1.1026909015292126, "kl": 0.008185441605746746, "learning_rate": 7.787610619469026e-07, "loss": -0.1027, "num_tokens": 2091662.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.402538776397705, "sampling/importance_sampling_ratio/mean": 1.0000860691070557, "sampling/importance_sampling_ratio/min": 0.6938183307647705, "sampling/sampling_logp_difference/max": 0.36554503440856934, "sampling/sampling_logp_difference/mean": 0.016404587775468826, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 225.109375, "completions/mean_terminated_length": 225.109375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.38574910163879395, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.5, "grad_norm": 1.0001282950089636, "kl": 0.005031602922827005, "learning_rate": 7.876106194690266e-07, "loss": 0.0034, "num_tokens": 2119765.0, "reward": -0.1875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.62883722782135, "sampling/importance_sampling_ratio/mean": 1.0000710487365723, "sampling/importance_sampling_ratio/min": 0.6765734553337097, "sampling/sampling_logp_difference/max": 0.4878664016723633, "sampling/sampling_logp_difference/mean": 0.013780158944427967, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 272.859375, "completions/mean_terminated_length": 272.859375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.5558906197547913, "epoch": 0.16106194690265488, "frac_reward_zero_std": 0.5, "grad_norm": 0.8597256448767986, "kl": 0.008389434777200222, "learning_rate": 7.964601769911504e-07, "loss": 0.0427, "num_tokens": 2153660.0, "reward": 0.03125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6160951852798462, "sampling/importance_sampling_ratio/mean": 0.9994475841522217, "sampling/importance_sampling_ratio/min": 0.6952207088470459, "sampling/sampling_logp_difference/max": 0.4800128936767578, "sampling/sampling_logp_difference/mean": 0.016376271843910217, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 249.796875, "completions/mean_terminated_length": 249.796875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4147866368293762, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.75, "grad_norm": 0.7704781050366752, "kl": 0.00920557975769043, "learning_rate": 8.053097345132743e-07, "loss": 0.0226, "num_tokens": 2183007.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5229722261428833, "sampling/importance_sampling_ratio/mean": 1.0000641345977783, "sampling/importance_sampling_ratio/min": 0.6773865222930908, "sampling/sampling_logp_difference/max": 0.42066383361816406, "sampling/sampling_logp_difference/mean": 0.01389819011092186, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 181.734375, "completions/mean_terminated_length": 181.734375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.33271199464797974, "epoch": 0.16460176991150444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02514947466783152, "kl": 0.006056039594113827, "learning_rate": 8.141592920353983e-07, "loss": 0.0001, "num_tokens": 2205694.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3995747566223145, "sampling/importance_sampling_ratio/mean": 0.9998966455459595, "sampling/importance_sampling_ratio/min": 0.7760488986968994, "sampling/sampling_logp_difference/max": 0.33616840839385986, "sampling/sampling_logp_difference/mean": 0.012386754155158997, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.35711997747421265, "epoch": 0.1663716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.041645209780768716, "kl": 0.009089304134249687, "learning_rate": 8.230088495575221e-07, "loss": 0.0001, "num_tokens": 2228134.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4590530395507812, "sampling/importance_sampling_ratio/mean": 1.0001190900802612, "sampling/importance_sampling_ratio/min": 0.696026086807251, "sampling/sampling_logp_difference/max": 0.37778759002685547, "sampling/sampling_logp_difference/mean": 0.01325652003288269, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 204.09375, "completions/mean_terminated_length": 204.09375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.49231916666030884, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 1.1067044425908823, "kl": 0.010852073319256306, "learning_rate": 8.318584070796459e-07, "loss": -0.0029, "num_tokens": 2253372.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5164728164672852, "sampling/importance_sampling_ratio/mean": 0.9996018409729004, "sampling/importance_sampling_ratio/min": 0.6475425958633423, "sampling/sampling_logp_difference/max": 0.4345707893371582, "sampling/sampling_logp_difference/mean": 0.015629831701517105, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 176.703125, "completions/mean_terminated_length": 176.703125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3378068208694458, "epoch": 0.16991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.031338906168060635, "kl": 0.007221938110888004, "learning_rate": 8.407079646017698e-07, "loss": 0.0001, "num_tokens": 2274121.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 1.0003361701965332, "sampling/importance_sampling_ratio/min": 0.6199536919593811, "sampling/sampling_logp_difference/max": 0.4985384941101074, "sampling/sampling_logp_difference/mean": 0.012752997688949108, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 176.046875, "completions/mean_terminated_length": 176.046875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.3316665589809418, "epoch": 0.17168141592920355, "frac_reward_zero_std": 1.0, "grad_norm": 0.021395781942905007, "kl": 0.005239901132881641, "learning_rate": 8.495575221238938e-07, "loss": 0.0, "num_tokens": 2294380.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5631126165390015, "sampling/importance_sampling_ratio/mean": 0.9998713731765747, "sampling/importance_sampling_ratio/min": 0.7126661539077759, "sampling/sampling_logp_difference/max": 0.44667911529541016, "sampling/sampling_logp_difference/mean": 0.012051871046423912, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 306.5625, "completions/mean_terminated_length": 306.5625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.5548740029335022, "epoch": 0.17345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 0.7887933927607998, "kl": 0.010057269595563412, "learning_rate": 8.584070796460177e-07, "loss": -0.1587, "num_tokens": 2326896.0, "reward": -0.09375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4116458892822266, "sampling/importance_sampling_ratio/mean": 0.9997860789299011, "sampling/importance_sampling_ratio/min": 0.662236213684082, "sampling/sampling_logp_difference/max": 0.41213297843933105, "sampling/sampling_logp_difference/mean": 0.015368364751338959, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 165.078125, "completions/mean_terminated_length": 165.078125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.43203842639923096, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.8690942151867066, "kl": 0.00882387813180685, "learning_rate": 8.672566371681415e-07, "loss": -0.008, "num_tokens": 2349221.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3688689470291138, "sampling/importance_sampling_ratio/mean": 0.9998939037322998, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.5662540197372437, "sampling/sampling_logp_difference/mean": 0.015083818696439266, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 196.21875, "completions/mean_terminated_length": 196.21875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4429578483104706, "epoch": 0.17699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 1.3057643966898234, "kl": 0.007735630497336388, "learning_rate": 8.761061946902655e-07, "loss": 0.0206, "num_tokens": 2377171.0, "reward": 0.40625, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5814965963363647, "sampling/importance_sampling_ratio/mean": 0.9997316598892212, "sampling/importance_sampling_ratio/min": 0.6831091046333313, "sampling/sampling_logp_difference/max": 0.458371639251709, "sampling/sampling_logp_difference/mean": 0.01568496972322464, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 281.765625, "completions/mean_terminated_length": 281.765625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.5018290281295776, "epoch": 0.17876106194690267, "frac_reward_zero_std": 0.5, "grad_norm": 0.9087324145624863, "kl": 0.00942955818027258, "learning_rate": 8.849557522123894e-07, "loss": -0.076, "num_tokens": 2406564.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.547824740409851, "sampling/importance_sampling_ratio/mean": 1.0005896091461182, "sampling/importance_sampling_ratio/min": 0.6978139877319336, "sampling/sampling_logp_difference/max": 0.43685054779052734, "sampling/sampling_logp_difference/mean": 0.015203689225018024, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 235.421875, "completions/mean_terminated_length": 235.421875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.36309516429901123, "epoch": 0.18053097345132743, "frac_reward_zero_std": 0.75, "grad_norm": 0.5618288068825584, "kl": 0.008715257979929447, "learning_rate": 8.938053097345132e-07, "loss": 0.0305, "num_tokens": 2431119.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4360764026641846, "sampling/importance_sampling_ratio/mean": 0.9999781847000122, "sampling/importance_sampling_ratio/min": 0.6960499882698059, "sampling/sampling_logp_difference/max": 0.3623337745666504, "sampling/sampling_logp_difference/mean": 0.013073887676000595, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 142.859375, "completions/mean_terminated_length": 142.859375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.40601882338523865, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.75, "grad_norm": 1.1824662207691532, "kl": 0.006852267310023308, "learning_rate": 9.026548672566371e-07, "loss": -0.0366, "num_tokens": 2450934.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 0.9999712109565735, "sampling/importance_sampling_ratio/min": 0.571608304977417, "sampling/sampling_logp_difference/max": 0.5593012571334839, "sampling/sampling_logp_difference/mean": 0.015403908677399158, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.4623086452484131, "epoch": 0.184070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.028542014006020484, "kl": 0.007069814950227737, "learning_rate": 9.11504424778761e-07, "loss": 0.0001, "num_tokens": 2470694.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.455679178237915, "sampling/importance_sampling_ratio/mean": 0.9993364810943604, "sampling/importance_sampling_ratio/min": 0.6755570769309998, "sampling/sampling_logp_difference/max": 0.39221763610839844, "sampling/sampling_logp_difference/mean": 0.016378730535507202, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 232.734375, "completions/mean_terminated_length": 232.734375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.5042564868927002, "epoch": 0.18584070796460178, "frac_reward_zero_std": 0.25, "grad_norm": 1.1968629825437478, "kl": 0.014089603908360004, "learning_rate": 9.203539823008849e-07, "loss": -0.001, "num_tokens": 2500597.0, "reward": -0.1875, "reward_std": 0.7455305457115173, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3822245597839355, "sampling/importance_sampling_ratio/mean": 1.000136375427246, "sampling/importance_sampling_ratio/min": 0.6896970272064209, "sampling/sampling_logp_difference/max": 0.3715028762817383, "sampling/sampling_logp_difference/mean": 0.015157243236899376, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 229.796875, "completions/mean_terminated_length": 229.796875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.5425690412521362, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.25, "grad_norm": 1.2664620499862738, "kl": 0.010843205265700817, "learning_rate": 9.292035398230088e-07, "loss": 0.0148, "num_tokens": 2528440.0, "reward": 0.0625, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3200629949569702, "sampling/importance_sampling_ratio/mean": 0.9994847774505615, "sampling/importance_sampling_ratio/min": 0.6900873184204102, "sampling/sampling_logp_difference/max": 0.3709371089935303, "sampling/sampling_logp_difference/mean": 0.0164191797375679, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 236.4375, "completions/mean_terminated_length": 236.4375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.39633840322494507, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.5, "grad_norm": 0.9862656508133559, "kl": 0.008762618526816368, "learning_rate": 9.380530973451328e-07, "loss": 0.0821, "num_tokens": 2555316.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.292724609375, "sampling/importance_sampling_ratio/mean": 0.9993183016777039, "sampling/importance_sampling_ratio/min": 0.6173948049545288, "sampling/sampling_logp_difference/max": 0.48224663734436035, "sampling/sampling_logp_difference/mean": 0.01380380243062973, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 261.984375, "completions/mean_terminated_length": 261.984375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.3895493447780609, "epoch": 0.1911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.122580674206397, "kl": 0.011455290950834751, "learning_rate": 9.469026548672566e-07, "loss": 0.0154, "num_tokens": 2582979.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4416309595108032, "sampling/importance_sampling_ratio/mean": 0.9999445676803589, "sampling/importance_sampling_ratio/min": 0.7374420762062073, "sampling/sampling_logp_difference/max": 0.36577510833740234, "sampling/sampling_logp_difference/mean": 0.01352797169238329, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 200.4375, "completions/mean_terminated_length": 200.4375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.43929511308670044, "epoch": 0.1929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 0.8639664150894706, "kl": 0.009811014868319035, "learning_rate": 9.557522123893805e-07, "loss": 0.0094, "num_tokens": 2606335.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3132628202438354, "sampling/importance_sampling_ratio/mean": 0.9999899864196777, "sampling/importance_sampling_ratio/min": 0.614693284034729, "sampling/sampling_logp_difference/max": 0.4866318702697754, "sampling/sampling_logp_difference/mean": 0.01378970593214035, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 145.953125, "completions/mean_terminated_length": 145.953125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.4220312833786011, "epoch": 0.19469026548672566, "frac_reward_zero_std": 0.5, "grad_norm": 1.4483535535558805, "kl": 0.015068529173731804, "learning_rate": 9.646017699115042e-07, "loss": 0.0175, "num_tokens": 2625804.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5871634483337402, "sampling/importance_sampling_ratio/mean": 0.999325692653656, "sampling/importance_sampling_ratio/min": 0.6505211591720581, "sampling/sampling_logp_difference/max": 0.4619483947753906, "sampling/sampling_logp_difference/mean": 0.016057994216680527, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 348.296875, "completions/mean_terminated_length": 348.296875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4034639894962311, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.5, "grad_norm": 0.7394903049705748, "kl": 0.010000361129641533, "learning_rate": 9.734513274336282e-07, "loss": 0.0288, "num_tokens": 2662191.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002636909484863, "sampling/importance_sampling_ratio/min": 0.5217915773391724, "sampling/sampling_logp_difference/max": 0.7696138620376587, "sampling/sampling_logp_difference/mean": 0.013374516740441322, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 212.859375, "completions/mean_terminated_length": 212.859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.48881521821022034, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.2941778523161354, "kl": 0.014841076917946339, "learning_rate": 9.82300884955752e-07, "loss": 0.0304, "num_tokens": 2694390.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4649940729141235, "sampling/importance_sampling_ratio/mean": 1.0000686645507812, "sampling/importance_sampling_ratio/min": 0.5692293643951416, "sampling/sampling_logp_difference/max": 0.563471794128418, "sampling/sampling_logp_difference/mean": 0.0167857613414526, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 221.703125, "completions/mean_terminated_length": 221.703125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.4552157521247864, "epoch": 0.2, "frac_reward_zero_std": 0.75, "grad_norm": 0.8389105605260265, "kl": 0.009184878319501877, "learning_rate": 9.91150442477876e-07, "loss": -0.0755, "num_tokens": 2722931.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4325261116027832, "sampling/importance_sampling_ratio/mean": 0.999966025352478, "sampling/importance_sampling_ratio/min": 0.7087082862854004, "sampling/sampling_logp_difference/max": 0.3594393730163574, "sampling/sampling_logp_difference/mean": 0.014725006185472012, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.47986888885498047, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.15603323523098, "kl": 0.018889345228672028, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 2745983.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2896757125854492, "sampling/importance_sampling_ratio/mean": 1.0002847909927368, "sampling/importance_sampling_ratio/min": 0.6975700855255127, "sampling/sampling_logp_difference/max": 0.3601522445678711, "sampling/sampling_logp_difference/mean": 0.016149815171957016, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 240.703125, "completions/mean_terminated_length": 240.703125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3917234539985657, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.75, "grad_norm": 1.0417974183252043, "kl": 0.00827406719326973, "learning_rate": 9.99997614400677e-07, "loss": -0.0484, "num_tokens": 2772492.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3156572580337524, "sampling/importance_sampling_ratio/mean": 1.000190258026123, "sampling/importance_sampling_ratio/min": 0.6722845435142517, "sampling/sampling_logp_difference/max": 0.3970736265182495, "sampling/sampling_logp_difference/mean": 0.013651804067194462, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 261.84375, "completions/mean_terminated_length": 261.84375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.4710029661655426, "epoch": 0.20530973451327433, "frac_reward_zero_std": 0.5, "grad_norm": 0.9692197385673672, "kl": 0.014450005255639553, "learning_rate": 9.999904576254724e-07, "loss": 0.0484, "num_tokens": 2803666.0, "reward": -0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5001816749572754, "sampling/importance_sampling_ratio/mean": 1.0000730752944946, "sampling/importance_sampling_ratio/min": 0.6882078051567078, "sampling/sampling_logp_difference/max": 0.40558624267578125, "sampling/sampling_logp_difference/mean": 0.014995187520980835, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 212.671875, "completions/mean_terminated_length": 212.671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3832337260246277, "epoch": 0.20707964601769913, "frac_reward_zero_std": 0.75, "grad_norm": 0.7818012845672909, "kl": 0.019853873178362846, "learning_rate": 9.999785297426788e-07, "loss": 0.0568, "num_tokens": 2828973.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.342420220375061, "sampling/importance_sampling_ratio/mean": 0.9999743103981018, "sampling/importance_sampling_ratio/min": 0.6965714693069458, "sampling/sampling_logp_difference/max": 0.3615849018096924, "sampling/sampling_logp_difference/mean": 0.013185858726501465, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 170.609375, "completions/mean_terminated_length": 170.609375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.47524765133857727, "epoch": 0.2088495575221239, "frac_reward_zero_std": 0.5, "grad_norm": 1.4404281010763185, "kl": 0.023978184908628464, "learning_rate": 9.999618308661168e-07, "loss": 0.024, "num_tokens": 2850820.0, "reward": 0.28125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4395263195037842, "sampling/importance_sampling_ratio/mean": 0.9994199872016907, "sampling/importance_sampling_ratio/min": 0.636264979839325, "sampling/sampling_logp_difference/max": 0.45214009284973145, "sampling/sampling_logp_difference/mean": 0.016553249210119247, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 153.4375, "completions/mean_terminated_length": 153.4375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3817932605743408, "epoch": 0.21061946902654868, "frac_reward_zero_std": 1.0, "grad_norm": 0.03819899630623374, "kl": 0.01441490463912487, "learning_rate": 9.99940361155134e-07, "loss": 0.0002, "num_tokens": 2869856.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5989081859588623, "sampling/importance_sampling_ratio/mean": 0.9999228715896606, "sampling/importance_sampling_ratio/min": 0.6624689698219299, "sampling/sampling_logp_difference/max": 0.46932101249694824, "sampling/sampling_logp_difference/mean": 0.014728734269738197, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 243.515625, "completions/mean_terminated_length": 243.515625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5095322132110596, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.25, "grad_norm": 1.324616011010765, "kl": 0.02429928630590439, "learning_rate": 9.999141208146027e-07, "loss": 0.027, "num_tokens": 2897297.0, "reward": -0.0625, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2842556238174438, "sampling/importance_sampling_ratio/mean": 1.0001277923583984, "sampling/importance_sampling_ratio/min": 0.749481737613678, "sampling/sampling_logp_difference/max": 0.28837335109710693, "sampling/sampling_logp_difference/mean": 0.015744555741548538, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 168.59375, "completions/mean_terminated_length": 168.59375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.4617387652397156, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.75, "grad_norm": 1.0006013589197467, "kl": 0.019833650439977646, "learning_rate": 9.998831100949186e-07, "loss": -0.0058, "num_tokens": 2922199.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3597606420516968, "sampling/importance_sampling_ratio/mean": 0.9997391104698181, "sampling/importance_sampling_ratio/min": 0.6564739942550659, "sampling/sampling_logp_difference/max": 0.42087221145629883, "sampling/sampling_logp_difference/mean": 0.015299756079912186, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3810102641582489, "epoch": 0.215929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.03955497873612043, "kl": 0.015549886040389538, "learning_rate": 9.998473292919985e-07, "loss": 0.0001, "num_tokens": 2945383.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4048078060150146, "sampling/importance_sampling_ratio/mean": 0.9995763301849365, "sampling/importance_sampling_ratio/min": 0.6610390543937683, "sampling/sampling_logp_difference/max": 0.4139423370361328, "sampling/sampling_logp_difference/mean": 0.013984480872750282, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 91.515625, "completions/mean_terminated_length": 91.515625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.36809319257736206, "epoch": 0.2176991150442478, "frac_reward_zero_std": 1.0, "grad_norm": 0.061575951625457745, "kl": 0.01606135070323944, "learning_rate": 9.99806778747277e-07, "loss": 0.0002, "num_tokens": 2960872.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6074992418289185, "sampling/importance_sampling_ratio/mean": 1.0007941722869873, "sampling/importance_sampling_ratio/min": 0.6592212319374084, "sampling/sampling_logp_difference/max": 0.47467970848083496, "sampling/sampling_logp_difference/mean": 0.015205798670649529, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 81.515625, "completions/mean_terminated_length": 81.515625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3826492428779602, "epoch": 0.21946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.05223410378274957, "kl": 0.015772055834531784, "learning_rate": 9.997614588477033e-07, "loss": 0.0002, "num_tokens": 2976201.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5464738607406616, "sampling/importance_sampling_ratio/mean": 1.0006520748138428, "sampling/importance_sampling_ratio/min": 0.7868272662162781, "sampling/sampling_logp_difference/max": 0.4359774589538574, "sampling/sampling_logp_difference/mean": 0.015888335183262825, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 187.28125, "completions/mean_terminated_length": 187.28125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3183233141899109, "epoch": 0.22123893805309736, "frac_reward_zero_std": 0.75, "grad_norm": 0.7641022906189201, "kl": 0.015849297866225243, "learning_rate": 9.99711370025738e-07, "loss": -0.0129, "num_tokens": 2998059.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4020609855651855, "sampling/importance_sampling_ratio/mean": 0.9999303817749023, "sampling/importance_sampling_ratio/min": 0.6294710636138916, "sampling/sampling_logp_difference/max": 0.4628753662109375, "sampling/sampling_logp_difference/mean": 0.0124830212444067, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 188.015625, "completions/mean_terminated_length": 188.015625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.35414624214172363, "epoch": 0.22300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 0.9296845410643008, "kl": 0.012998737394809723, "learning_rate": 9.996565127593489e-07, "loss": -0.1039, "num_tokens": 3019772.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6042280197143555, "sampling/importance_sampling_ratio/mean": 0.9998700022697449, "sampling/importance_sampling_ratio/min": 0.5382936000823975, "sampling/sampling_logp_difference/max": 0.6193511486053467, "sampling/sampling_logp_difference/mean": 0.013802911154925823, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 101.375, "completions/mean_terminated_length": 101.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.4166167378425598, "epoch": 0.2247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 1.5507583876569904, "kl": 0.01925094798207283, "learning_rate": 9.995968875720051e-07, "loss": 0.0314, "num_tokens": 3041172.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.521734595298767, "sampling/importance_sampling_ratio/mean": 0.9996951222419739, "sampling/importance_sampling_ratio/min": 0.6481021046638489, "sampling/sampling_logp_difference/max": 0.43370699882507324, "sampling/sampling_logp_difference/mean": 0.015997136011719704, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 190.34375, "completions/mean_terminated_length": 190.34375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3922315835952759, "epoch": 0.22654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.8552558154681225, "kl": 0.016614802181720734, "learning_rate": 9.995324950326745e-07, "loss": 0.0021, "num_tokens": 3064698.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2727667093276978, "sampling/importance_sampling_ratio/mean": 1.0001944303512573, "sampling/importance_sampling_ratio/min": 0.6405651569366455, "sampling/sampling_logp_difference/max": 0.4454045295715332, "sampling/sampling_logp_difference/mean": 0.014252089895308018, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 148.640625, "completions/mean_terminated_length": 148.640625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4221882224082947, "epoch": 0.22831858407079647, "frac_reward_zero_std": 1.0, "grad_norm": 0.044652079291280566, "kl": 0.023451007902622223, "learning_rate": 9.994633357558158e-07, "loss": 0.0002, "num_tokens": 3085363.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5375036001205444, "sampling/importance_sampling_ratio/mean": 1.000227928161621, "sampling/importance_sampling_ratio/min": 0.67856764793396, "sampling/sampling_logp_difference/max": 0.4301600456237793, "sampling/sampling_logp_difference/mean": 0.015490110032260418, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 153.265625, "completions/mean_terminated_length": 153.265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.44381868839263916, "epoch": 0.23008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 0.9364833836961928, "kl": 0.023741457611322403, "learning_rate": 9.993894104013746e-07, "loss": -0.0037, "num_tokens": 3105540.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.378252625465393, "sampling/importance_sampling_ratio/mean": 1.0005567073822021, "sampling/importance_sampling_ratio/min": 0.6993489265441895, "sampling/sampling_logp_difference/max": 0.3576054573059082, "sampling/sampling_logp_difference/mean": 0.015576737001538277, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.41213229298591614, "epoch": 0.23185840707964603, "frac_reward_zero_std": 0.75, "grad_norm": 0.9931258259593163, "kl": 0.02783460170030594, "learning_rate": 9.993107196747758e-07, "loss": 0.0141, "num_tokens": 3123476.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6307547092437744, "sampling/importance_sampling_ratio/mean": 0.9996258020401001, "sampling/importance_sampling_ratio/min": 0.6208378076553345, "sampling/sampling_logp_difference/max": 0.4890429973602295, "sampling/sampling_logp_difference/mean": 0.014958133921027184, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.4969888925552368, "epoch": 0.2336283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 1.265506741130298, "kl": 0.022039327770471573, "learning_rate": 9.99227264326918e-07, "loss": -0.006, "num_tokens": 3145940.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.289526104927063, "sampling/importance_sampling_ratio/mean": 0.9995390772819519, "sampling/importance_sampling_ratio/min": 0.7225502133369446, "sampling/sampling_logp_difference/max": 0.3249683380126953, "sampling/sampling_logp_difference/mean": 0.015670448541641235, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 136.453125, "completions/mean_terminated_length": 136.453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5471580028533936, "epoch": 0.23539823008849559, "frac_reward_zero_std": 0.5, "grad_norm": 1.887121719660996, "kl": 0.01736373081803322, "learning_rate": 9.991390451541648e-07, "loss": -0.0472, "num_tokens": 3168401.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.2742141485214233, "sampling/importance_sampling_ratio/mean": 0.9999238848686218, "sampling/importance_sampling_ratio/min": 0.6470947861671448, "sampling/sampling_logp_difference/max": 0.4352625608444214, "sampling/sampling_logp_difference/mean": 0.016257960349321365, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 146.53125, "completions/mean_terminated_length": 146.53125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.49213963747024536, "epoch": 0.23716814159292035, "frac_reward_zero_std": 0.5, "grad_norm": 1.470629395741809, "kl": 0.02608679048717022, "learning_rate": 9.990460629983388e-07, "loss": 0.0294, "num_tokens": 3189587.0, "reward": 0.3125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.49726140499115, "sampling/importance_sampling_ratio/mean": 0.9998851418495178, "sampling/importance_sampling_ratio/min": 0.6407742500305176, "sampling/sampling_logp_difference/max": 0.44507813453674316, "sampling/sampling_logp_difference/mean": 0.01672007329761982, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 128.640625, "completions/mean_terminated_length": 128.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.39031732082366943, "epoch": 0.23893805309734514, "frac_reward_zero_std": 0.75, "grad_norm": 1.6354175270657778, "kl": 0.018013792112469673, "learning_rate": 9.989483187467125e-07, "loss": 0.2014, "num_tokens": 3209100.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6007124185562134, "sampling/importance_sampling_ratio/mean": 1.0007861852645874, "sampling/importance_sampling_ratio/min": 0.6581144332885742, "sampling/sampling_logp_difference/max": 0.47044873237609863, "sampling/sampling_logp_difference/mean": 0.015498748049139977, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 98.90625, "completions/mean_terminated_length": 98.90625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3771934509277344, "epoch": 0.2407079646017699, "frac_reward_zero_std": 1.0, "grad_norm": 0.047841604322458015, "kl": 0.015407159924507141, "learning_rate": 9.988458133320008e-07, "loss": 0.0001, "num_tokens": 3225254.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4663512706756592, "sampling/importance_sampling_ratio/mean": 0.9998021125793457, "sampling/importance_sampling_ratio/min": 0.6771776676177979, "sampling/sampling_logp_difference/max": 0.38982152938842773, "sampling/sampling_logp_difference/mean": 0.015056025236845016, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 186.921875, "completions/mean_terminated_length": 186.921875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.47957292199134827, "epoch": 0.2424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 1.0036336604211555, "kl": 0.023874368518590927, "learning_rate": 9.987385477323506e-07, "loss": -0.0824, "num_tokens": 3247249.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6573729515075684, "sampling/importance_sampling_ratio/mean": 0.9997392892837524, "sampling/importance_sampling_ratio/min": 0.6971372961997986, "sampling/sampling_logp_difference/max": 0.5052337646484375, "sampling/sampling_logp_difference/mean": 0.01558865886181593, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 142.609375, "completions/mean_terminated_length": 142.609375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.4005346894264221, "epoch": 0.24424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 1.2876964818205348, "kl": 0.012754756957292557, "learning_rate": 9.98626522971333e-07, "loss": 0.005, "num_tokens": 3268200.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.501747965812683, "sampling/importance_sampling_ratio/mean": 1.0003690719604492, "sampling/importance_sampling_ratio/min": 0.6998880505561829, "sampling/sampling_logp_difference/max": 0.40662968158721924, "sampling/sampling_logp_difference/mean": 0.014330487698316574, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 108.40625, "completions/mean_terminated_length": 108.40625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.42474764585494995, "epoch": 0.24601769911504426, "frac_reward_zero_std": 0.75, "grad_norm": 1.4381328149448838, "kl": 0.020078163594007492, "learning_rate": 9.985097401179333e-07, "loss": -0.0194, "num_tokens": 3285186.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4312281608581543, "sampling/importance_sampling_ratio/mean": 1.0001778602600098, "sampling/importance_sampling_ratio/min": 0.6344427466392517, "sampling/sampling_logp_difference/max": 0.45500826835632324, "sampling/sampling_logp_difference/mean": 0.016353420913219452, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 200.734375, "completions/mean_terminated_length": 200.734375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.34385940432548523, "epoch": 0.24778761061946902, "frac_reward_zero_std": 0.75, "grad_norm": 0.888011426917495, "kl": 0.011675514280796051, "learning_rate": 9.98388200286539e-07, "loss": 0.2142, "num_tokens": 3307793.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4830996990203857, "sampling/importance_sampling_ratio/mean": 1.0002667903900146, "sampling/importance_sampling_ratio/min": 0.5398048162460327, "sampling/sampling_logp_difference/max": 0.6165475845336914, "sampling/sampling_logp_difference/mean": 0.013476641848683357, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 198.109375, "completions/mean_terminated_length": 198.109375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5087154507637024, "epoch": 0.24955752212389382, "frac_reward_zero_std": 0.5, "grad_norm": 1.1296578490201252, "kl": 0.020054833963513374, "learning_rate": 9.98261904636932e-07, "loss": -0.0278, "num_tokens": 3330584.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8624892234802246, "sampling/importance_sampling_ratio/mean": 1.0003323554992676, "sampling/importance_sampling_ratio/min": 0.6993569135665894, "sampling/sampling_logp_difference/max": 0.6219139099121094, "sampling/sampling_logp_difference/mean": 0.015626512467861176, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 228.296875, "completions/mean_terminated_length": 228.296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.5250065922737122, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.25, "grad_norm": 1.3021890539361338, "kl": 0.014063199050724506, "learning_rate": 9.981308543742756e-07, "loss": -0.034, "num_tokens": 3357611.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.378688097000122, "sampling/importance_sampling_ratio/mean": 0.9997292757034302, "sampling/importance_sampling_ratio/min": 0.701869547367096, "sampling/sampling_logp_difference/max": 0.3540077209472656, "sampling/sampling_logp_difference/mean": 0.016027584671974182, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 113.0625, "completions/mean_terminated_length": 113.0625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.43075042963027954, "epoch": 0.25309734513274335, "frac_reward_zero_std": 0.75, "grad_norm": 1.4745443518795067, "kl": 0.02290950156748295, "learning_rate": 9.979950507491033e-07, "loss": -0.0717, "num_tokens": 3374879.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3402374982833862, "sampling/importance_sampling_ratio/mean": 0.9989745020866394, "sampling/importance_sampling_ratio/min": 0.7012100219726562, "sampling/sampling_logp_difference/max": 0.3549478054046631, "sampling/sampling_logp_difference/mean": 0.015583084896206856, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 145.453125, "completions/mean_terminated_length": 145.453125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.41358840465545654, "epoch": 0.25486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 0.98435655094442, "kl": 0.018958721309900284, "learning_rate": 9.978544950573073e-07, "loss": 0.0147, "num_tokens": 3392668.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3110979795455933, "sampling/importance_sampling_ratio/mean": 0.9999505281448364, "sampling/importance_sampling_ratio/min": 0.6678863167762756, "sampling/sampling_logp_difference/max": 0.4036372900009155, "sampling/sampling_logp_difference/mean": 0.014273607172071934, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 165.03125, "completions/mean_terminated_length": 165.03125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.3803103566169739, "epoch": 0.25663716814159293, "frac_reward_zero_std": 1.0, "grad_norm": 0.01806955419217176, "kl": 0.011224801652133465, "learning_rate": 9.97709188640126e-07, "loss": 0.0001, "num_tokens": 3413646.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2968004941940308, "sampling/importance_sampling_ratio/mean": 1.000303864479065, "sampling/importance_sampling_ratio/min": 0.633061408996582, "sampling/sampling_logp_difference/max": 0.4571878910064697, "sampling/sampling_logp_difference/mean": 0.012215817347168922, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 200.234375, "completions/mean_terminated_length": 200.234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5213860273361206, "epoch": 0.2584070796460177, "frac_reward_zero_std": 0.5, "grad_norm": 1.2467819845085475, "kl": 0.022496270015835762, "learning_rate": 9.975591328841304e-07, "loss": -0.0484, "num_tokens": 3438973.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3065545558929443, "sampling/importance_sampling_ratio/mean": 0.9998379349708557, "sampling/importance_sampling_ratio/min": 0.720697820186615, "sampling/sampling_logp_difference/max": 0.32753539085388184, "sampling/sampling_logp_difference/mean": 0.016474196687340736, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 175.421875, "completions/mean_terminated_length": 175.421875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.46795982122421265, "epoch": 0.26017699115044246, "frac_reward_zero_std": 0.75, "grad_norm": 0.8151432248481513, "kl": 0.017628170549869537, "learning_rate": 9.974043292212127e-07, "loss": -0.0022, "num_tokens": 3467288.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.2679344415664673, "sampling/importance_sampling_ratio/mean": 0.9998873472213745, "sampling/importance_sampling_ratio/min": 0.5485771894454956, "sampling/sampling_logp_difference/max": 0.6004272699356079, "sampling/sampling_logp_difference/mean": 0.01507764682173729, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 292.546875, "completions/mean_terminated_length": 292.546875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.509474515914917, "epoch": 0.26194690265486725, "frac_reward_zero_std": 0.5, "grad_norm": 0.8631721395253591, "kl": 0.020277369767427444, "learning_rate": 9.97244779128571e-07, "loss": -0.0536, "num_tokens": 3498891.0, "reward": 0.46875, "reward_std": 0.5143726468086243, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3664368391036987, "sampling/importance_sampling_ratio/mean": 0.9999426007270813, "sampling/importance_sampling_ratio/min": 0.7439930438995361, "sampling/sampling_logp_difference/max": 0.312206506729126, "sampling/sampling_logp_difference/mean": 0.01405276358127594, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4960056245326996, "epoch": 0.26371681415929205, "frac_reward_zero_std": 0.75, "grad_norm": 1.0250910794536259, "kl": 0.023861011490225792, "learning_rate": 9.970804841286953e-07, "loss": 0.0024, "num_tokens": 3520811.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7054411172866821, "sampling/importance_sampling_ratio/mean": 1.0007133483886719, "sampling/importance_sampling_ratio/min": 0.7129157185554504, "sampling/sampling_logp_difference/max": 0.5338238477706909, "sampling/sampling_logp_difference/mean": 0.014998722821474075, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 244.171875, "completions/mean_terminated_length": 244.171875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4869275391101837, "epoch": 0.26548672566371684, "frac_reward_zero_std": 0.5, "grad_norm": 0.9005072207203216, "kl": 0.022945452481508255, "learning_rate": 9.969114457893539e-07, "loss": -0.0041, "num_tokens": 3549014.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3425207138061523, "sampling/importance_sampling_ratio/mean": 1.0000534057617188, "sampling/importance_sampling_ratio/min": 0.7301079630851746, "sampling/sampling_logp_difference/max": 0.3145628571510315, "sampling/sampling_logp_difference/mean": 0.014449442736804485, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.5407697558403015, "epoch": 0.2672566371681416, "frac_reward_zero_std": 0.5, "grad_norm": 1.5559753607777564, "kl": 0.03468339145183563, "learning_rate": 9.967376657235778e-07, "loss": 0.0111, "num_tokens": 3570518.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.595868706703186, "sampling/importance_sampling_ratio/mean": 0.9999444484710693, "sampling/importance_sampling_ratio/min": 0.7469043135643005, "sampling/sampling_logp_difference/max": 0.46741819381713867, "sampling/sampling_logp_difference/mean": 0.016509853303432465, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 195.46875, "completions/mean_terminated_length": 195.46875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5421572923660278, "epoch": 0.26902654867256637, "frac_reward_zero_std": 0.5, "grad_norm": 1.1597632360872765, "kl": 0.02573992870748043, "learning_rate": 9.965591455896455e-07, "loss": -0.0302, "num_tokens": 3594740.0, "reward": 0.5625, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4663231372833252, "sampling/importance_sampling_ratio/mean": 0.999872088432312, "sampling/importance_sampling_ratio/min": 0.7054705023765564, "sampling/sampling_logp_difference/max": 0.3827580213546753, "sampling/sampling_logp_difference/mean": 0.016308188438415527, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 371.28125, "completions/mean_terminated_length": 371.28125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4306092858314514, "epoch": 0.27079646017699116, "frac_reward_zero_std": 0.75, "grad_norm": 0.5105699187192412, "kl": 0.017022263258695602, "learning_rate": 9.96375887091067e-07, "loss": 0.0033, "num_tokens": 3631206.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3559333086013794, "sampling/importance_sampling_ratio/mean": 1.0000176429748535, "sampling/importance_sampling_ratio/min": 0.6082406044006348, "sampling/sampling_logp_difference/max": 0.49718475341796875, "sampling/sampling_logp_difference/mean": 0.012135665863752365, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 368.4375, "completions/mean_terminated_length": 368.4375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4076581597328186, "epoch": 0.27256637168141595, "frac_reward_zero_std": 0.75, "grad_norm": 0.5190375636677884, "kl": 0.014848101884126663, "learning_rate": 9.961878919765677e-07, "loss": -0.0061, "num_tokens": 3667954.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6111408472061157, "sampling/importance_sampling_ratio/mean": 1.0001106262207031, "sampling/importance_sampling_ratio/min": 0.6368882060050964, "sampling/sampling_logp_difference/max": 0.4769425392150879, "sampling/sampling_logp_difference/mean": 0.011590846814215183, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 165.65625, "completions/mean_terminated_length": 165.65625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.5160574316978455, "epoch": 0.2743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 1.0230494470975504, "kl": 0.022557180374860764, "learning_rate": 9.959951620400718e-07, "loss": -0.045, "num_tokens": 3688044.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4046136140823364, "sampling/importance_sampling_ratio/mean": 0.9999648928642273, "sampling/importance_sampling_ratio/min": 0.689996063709259, "sampling/sampling_logp_difference/max": 0.37106943130493164, "sampling/sampling_logp_difference/mean": 0.014881974086165428, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 286.53125, "completions/mean_terminated_length": 286.53125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5217193365097046, "epoch": 0.2761061946902655, "frac_reward_zero_std": 0.5, "grad_norm": 0.8646579572621913, "kl": 0.020118754357099533, "learning_rate": 9.957976991206845e-07, "loss": -0.02, "num_tokens": 3717230.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8173089027404785, "sampling/importance_sampling_ratio/mean": 0.9997584819793701, "sampling/importance_sampling_ratio/min": 0.6919742822647095, "sampling/sampling_logp_difference/max": 0.5973567962646484, "sampling/sampling_logp_difference/mean": 0.014802999794483185, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 228.015625, "completions/mean_terminated_length": 228.015625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.5421289205551147, "epoch": 0.2778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8227514375680314, "kl": 0.02495657280087471, "learning_rate": 9.955955051026758e-07, "loss": 0.0134, "num_tokens": 3743999.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3393127918243408, "sampling/importance_sampling_ratio/mean": 1.0007410049438477, "sampling/importance_sampling_ratio/min": 0.6609655022621155, "sampling/sampling_logp_difference/max": 0.41405367851257324, "sampling/sampling_logp_difference/mean": 0.01478620432317257, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 275.734375, "completions/mean_terminated_length": 275.734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3780868947505951, "epoch": 0.27964601769911507, "frac_reward_zero_std": 1.0, "grad_norm": 0.025718948782958297, "kl": 0.014576991088688374, "learning_rate": 9.953885819154614e-07, "loss": 0.0002, "num_tokens": 3772350.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3519871234893799, "sampling/importance_sampling_ratio/mean": 1.0002187490463257, "sampling/importance_sampling_ratio/min": 0.7160540223121643, "sampling/sampling_logp_difference/max": 0.3339996337890625, "sampling/sampling_logp_difference/mean": 0.010907080955803394, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 267.65625, "completions/mean_terminated_length": 267.65625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.6244798302650452, "epoch": 0.2814159292035398, "frac_reward_zero_std": 0.25, "grad_norm": 1.2032057086410128, "kl": 0.03455369919538498, "learning_rate": 9.951769315335843e-07, "loss": 0.0848, "num_tokens": 3800536.0, "reward": 0.46875, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4502971172332764, "sampling/importance_sampling_ratio/mean": 1.000093698501587, "sampling/importance_sampling_ratio/min": 0.7344853281974792, "sampling/sampling_logp_difference/max": 0.3717684745788574, "sampling/sampling_logp_difference/mean": 0.01600492373108864, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.5654330253601074, "epoch": 0.2831858407079646, "frac_reward_zero_std": 0.25, "grad_norm": 1.2360407043217891, "kl": 0.035345666110515594, "learning_rate": 9.949605559766967e-07, "loss": 0.006, "num_tokens": 3825832.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5361943244934082, "sampling/importance_sampling_ratio/mean": 1.0009934902191162, "sampling/importance_sampling_ratio/min": 0.6373308300971985, "sampling/sampling_logp_difference/max": 0.4504663944244385, "sampling/sampling_logp_difference/mean": 0.015263221226632595, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.5510069727897644, "epoch": 0.2849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.5780747194120263, "kl": 0.028772085905075073, "learning_rate": 9.947394573095402e-07, "loss": -0.0074, "num_tokens": 3853488.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5562496185302734, "sampling/importance_sampling_ratio/mean": 1.0002202987670898, "sampling/importance_sampling_ratio/min": 0.46540531516075134, "sampling/sampling_logp_difference/max": 0.7648465633392334, "sampling/sampling_logp_difference/mean": 0.015787167474627495, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4922630488872528, "epoch": 0.2867256637168142, "frac_reward_zero_std": 0.75, "grad_norm": 1.0161460323501361, "kl": 0.031515732407569885, "learning_rate": 9.945136376419258e-07, "loss": -0.0183, "num_tokens": 3876012.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6088860034942627, "sampling/importance_sampling_ratio/mean": 0.9999725222587585, "sampling/importance_sampling_ratio/min": 0.6851174831390381, "sampling/sampling_logp_difference/max": 0.4755420684814453, "sampling/sampling_logp_difference/mean": 0.014924164861440659, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 251.265625, "completions/mean_terminated_length": 251.265625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.47889378666877747, "epoch": 0.2884955752212389, "frac_reward_zero_std": 0.75, "grad_norm": 0.6741350613342171, "kl": 0.02914135903120041, "learning_rate": 9.942830991287149e-07, "loss": -0.0076, "num_tokens": 3905037.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3499717712402344, "sampling/importance_sampling_ratio/mean": 1.0002903938293457, "sampling/importance_sampling_ratio/min": 0.7686730027198792, "sampling/sampling_logp_difference/max": 0.30008363723754883, "sampling/sampling_logp_difference/mean": 0.013379250653088093, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 195.765625, "completions/mean_terminated_length": 195.765625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.4450652003288269, "epoch": 0.2902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 0.7888195518137604, "kl": 0.025343546643853188, "learning_rate": 9.940478439697972e-07, "loss": 0.0262, "num_tokens": 3926798.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3017381429672241, "sampling/importance_sampling_ratio/mean": 0.9994922876358032, "sampling/importance_sampling_ratio/min": 0.7733324766159058, "sampling/sampling_logp_difference/max": 0.2637004852294922, "sampling/sampling_logp_difference/mean": 0.013473778031766415, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 247.359375, "completions/mean_terminated_length": 247.359375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5671118497848511, "epoch": 0.2920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 0.984674532139333, "kl": 0.028708674013614655, "learning_rate": 9.93807874410071e-07, "loss": -0.005, "num_tokens": 3955157.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.2685520648956299, "sampling/importance_sampling_ratio/mean": 0.9995976090431213, "sampling/importance_sampling_ratio/min": 0.6891096234321594, "sampling/sampling_logp_difference/max": 0.3723548650741577, "sampling/sampling_logp_difference/mean": 0.01571439579129219, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 218.296875, "completions/mean_terminated_length": 218.296875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.5851125717163086, "epoch": 0.2938053097345133, "frac_reward_zero_std": 0.5, "grad_norm": 1.1481309512015554, "kl": 0.03979609161615372, "learning_rate": 9.935631927394214e-07, "loss": 0.0468, "num_tokens": 3980760.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4061118364334106, "sampling/importance_sampling_ratio/mean": 1.0005295276641846, "sampling/importance_sampling_ratio/min": 0.7801968455314636, "sampling/sampling_logp_difference/max": 0.3408282995223999, "sampling/sampling_logp_difference/mean": 0.015908237546682358, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 188.140625, "completions/mean_terminated_length": 188.140625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.474558562040329, "epoch": 0.29557522123893804, "frac_reward_zero_std": 0.5, "grad_norm": 1.1684220042766895, "kl": 0.028991926461458206, "learning_rate": 9.93313801292698e-07, "loss": 0.0473, "num_tokens": 4002417.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4720953702926636, "sampling/importance_sampling_ratio/mean": 0.9996216297149658, "sampling/importance_sampling_ratio/min": 0.7697216272354126, "sampling/sampling_logp_difference/max": 0.3866868019104004, "sampling/sampling_logp_difference/mean": 0.013629883527755737, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 221.09375, "completions/mean_terminated_length": 221.09375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.380819171667099, "epoch": 0.2973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.02683304148745167, "kl": 0.023956146091222763, "learning_rate": 9.93059702449693e-07, "loss": 0.0002, "num_tokens": 4026199.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.415535569190979, "sampling/importance_sampling_ratio/mean": 0.9998903274536133, "sampling/importance_sampling_ratio/min": 0.687654435634613, "sampling/sampling_logp_difference/max": 0.3744688034057617, "sampling/sampling_logp_difference/mean": 0.012791529297828674, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 143.71875, "completions/mean_terminated_length": 143.71875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5017321109771729, "epoch": 0.2991150442477876, "frac_reward_zero_std": 0.75, "grad_norm": 1.0344474256919747, "kl": 0.032271116971969604, "learning_rate": 9.928008986351186e-07, "loss": -0.0003, "num_tokens": 4045925.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2727348804473877, "sampling/importance_sampling_ratio/mean": 0.9998068809509277, "sampling/importance_sampling_ratio/min": 0.6962348818778992, "sampling/sampling_logp_difference/max": 0.36206817626953125, "sampling/sampling_logp_difference/mean": 0.014889528974890709, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.6202338337898254, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 0.6041874746898551, "kl": 0.02981755882501602, "learning_rate": 9.925373923185834e-07, "loss": 0.0032, "num_tokens": 4074261.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3299710750579834, "sampling/importance_sampling_ratio/mean": 0.9999173283576965, "sampling/importance_sampling_ratio/min": 0.4337630867958069, "sampling/sampling_logp_difference/max": 0.835256814956665, "sampling/sampling_logp_difference/mean": 0.0163117703050375, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 250.140625, "completions/mean_terminated_length": 250.140625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5587708353996277, "epoch": 0.30265486725663715, "frac_reward_zero_std": 0.75, "grad_norm": 0.646145895353327, "kl": 0.02789548970758915, "learning_rate": 9.922691860145696e-07, "loss": 0.002, "num_tokens": 4105950.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.291640043258667, "sampling/importance_sampling_ratio/mean": 1.0001420974731445, "sampling/importance_sampling_ratio/min": 0.682328462600708, "sampling/sampling_logp_difference/max": 0.3822441101074219, "sampling/sampling_logp_difference/mean": 0.0150994174182415, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 221.984375, "completions/mean_terminated_length": 221.984375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5274782180786133, "epoch": 0.30442477876106194, "frac_reward_zero_std": 0.5, "grad_norm": 1.0743323567130463, "kl": 0.02884136140346527, "learning_rate": 9.919962822824083e-07, "loss": -0.0217, "num_tokens": 4135949.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2752799987792969, "sampling/importance_sampling_ratio/mean": 1.0002425909042358, "sampling/importance_sampling_ratio/min": 0.6911680102348328, "sampling/sampling_logp_difference/max": 0.3693723678588867, "sampling/sampling_logp_difference/mean": 0.014896942302584648, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 122.078125, "completions/mean_terminated_length": 122.078125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.4607529640197754, "epoch": 0.30619469026548674, "frac_reward_zero_std": 1.0, "grad_norm": 0.03868649107370889, "kl": 0.021036827936768532, "learning_rate": 9.91718683726255e-07, "loss": 0.0002, "num_tokens": 4152866.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.429377555847168, "sampling/importance_sampling_ratio/mean": 0.9999097585678101, "sampling/importance_sampling_ratio/min": 0.7905539870262146, "sampling/sampling_logp_difference/max": 0.3572390079498291, "sampling/sampling_logp_difference/mean": 0.014209815301001072, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 227.6875, "completions/mean_terminated_length": 227.6875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5349213480949402, "epoch": 0.30796460176991153, "frac_reward_zero_std": 0.5, "grad_norm": 1.0362177377375232, "kl": 0.03002750128507614, "learning_rate": 9.914363929950657e-07, "loss": 0.0468, "num_tokens": 4177630.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3576866388320923, "sampling/importance_sampling_ratio/mean": 0.9997402429580688, "sampling/importance_sampling_ratio/min": 0.6993037462234497, "sampling/sampling_logp_difference/max": 0.3576700687408447, "sampling/sampling_logp_difference/mean": 0.015053363516926765, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 126.75, "completions/mean_terminated_length": 126.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.4266471564769745, "epoch": 0.30973451327433627, "frac_reward_zero_std": 1.0, "grad_norm": 0.03869249307823831, "kl": 0.026396356523036957, "learning_rate": 9.91149412782571e-07, "loss": 0.0003, "num_tokens": 4193678.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3121099472045898, "sampling/importance_sampling_ratio/mean": 0.9998595714569092, "sampling/importance_sampling_ratio/min": 0.7910821437835693, "sampling/sampling_logp_difference/max": 0.2716364860534668, "sampling/sampling_logp_difference/mean": 0.013439470902085304, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 234.984375, "completions/mean_terminated_length": 234.984375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.6032348275184631, "epoch": 0.31150442477876106, "frac_reward_zero_std": 0.25, "grad_norm": 1.2981196874193386, "kl": 0.04099959880113602, "learning_rate": 9.908577458272495e-07, "loss": -0.0244, "num_tokens": 4220333.0, "reward": 0.5, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6209867000579834, "sampling/importance_sampling_ratio/mean": 0.9997628927230835, "sampling/importance_sampling_ratio/min": 0.717178463935852, "sampling/sampling_logp_difference/max": 0.4830350875854492, "sampling/sampling_logp_difference/mean": 0.0164419487118721, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4720958471298218, "epoch": 0.31327433628318585, "frac_reward_zero_std": 0.75, "grad_norm": 1.0342331419515522, "kl": 0.03015996515750885, "learning_rate": 9.905613949123034e-07, "loss": -0.0006, "num_tokens": 4243869.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4058351516723633, "sampling/importance_sampling_ratio/mean": 1.0004899501800537, "sampling/importance_sampling_ratio/min": 0.7888265252113342, "sampling/sampling_logp_difference/max": 0.3406316041946411, "sampling/sampling_logp_difference/mean": 0.014683553017675877, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 231.828125, "completions/mean_terminated_length": 231.828125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5523578524589539, "epoch": 0.31504424778761064, "frac_reward_zero_std": 0.5, "grad_norm": 1.0266760280763607, "kl": 0.030189139768481255, "learning_rate": 9.902603628656311e-07, "loss": 0.0339, "num_tokens": 4269058.0, "reward": -0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3657116889953613, "sampling/importance_sampling_ratio/mean": 1.000697374343872, "sampling/importance_sampling_ratio/min": 0.710117757320404, "sampling/sampling_logp_difference/max": 0.34232449531555176, "sampling/sampling_logp_difference/mean": 0.015112483873963356, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.5422703623771667, "epoch": 0.3168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.04009644383204252, "kl": 0.03183231130242348, "learning_rate": 9.899546525597997e-07, "loss": 0.0003, "num_tokens": 4289370.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4035450220108032, "sampling/importance_sampling_ratio/mean": 1.0002192258834839, "sampling/importance_sampling_ratio/min": 0.6910572648048401, "sampling/sampling_logp_difference/max": 0.36953258514404297, "sampling/sampling_logp_difference/mean": 0.01554935798048973, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 187.984375, "completions/mean_terminated_length": 187.984375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.6675031185150146, "epoch": 0.3185840707964602, "frac_reward_zero_std": 0.25, "grad_norm": 1.4103564808406392, "kl": 0.03398448973894119, "learning_rate": 9.896442669120187e-07, "loss": 0.0268, "num_tokens": 4313513.0, "reward": 0.40625, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.2897205352783203, "sampling/importance_sampling_ratio/mean": 1.0006816387176514, "sampling/importance_sampling_ratio/min": 0.6369392275810242, "sampling/sampling_logp_difference/max": 0.4510810375213623, "sampling/sampling_logp_difference/mean": 0.01694805547595024, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 189.421875, "completions/mean_terminated_length": 189.421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5120855569839478, "epoch": 0.32035398230088497, "frac_reward_zero_std": 0.75, "grad_norm": 0.8746660577647231, "kl": 0.02533281408250332, "learning_rate": 9.893292088841108e-07, "loss": 0.004, "num_tokens": 4334836.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4584860801696777, "sampling/importance_sampling_ratio/mean": 0.9998229146003723, "sampling/importance_sampling_ratio/min": 0.6704840064048767, "sampling/sampling_logp_difference/max": 0.39975547790527344, "sampling/sampling_logp_difference/mean": 0.015046782791614532, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 282.5625, "completions/mean_terminated_length": 282.5625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.6147529482841492, "epoch": 0.32212389380530976, "frac_reward_zero_std": 0.25, "grad_norm": 0.9421922973249914, "kl": 0.028734855353832245, "learning_rate": 9.890094814824852e-07, "loss": -0.0011, "num_tokens": 4365640.0, "reward": -0.1875, "reward_std": 0.5, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.409217119216919, "sampling/importance_sampling_ratio/mean": 1.000175952911377, "sampling/importance_sampling_ratio/min": 0.7664762139320374, "sampling/sampling_logp_difference/max": 0.3430342674255371, "sampling/sampling_logp_difference/mean": 0.015689615160226822, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 144.390625, "completions/mean_terminated_length": 144.390625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5152815580368042, "epoch": 0.3238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.042639245269893405, "kl": 0.03522444888949394, "learning_rate": 9.886850877581078e-07, "loss": 0.0003, "num_tokens": 4385681.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.292569875717163, "sampling/importance_sampling_ratio/mean": 0.9988747239112854, "sampling/importance_sampling_ratio/min": 0.7787993550300598, "sampling/sampling_logp_difference/max": 0.25663232803344727, "sampling/sampling_logp_difference/mean": 0.015145987272262573, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 142.515625, "completions/mean_terminated_length": 142.515625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.5333941578865051, "epoch": 0.3256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.0708163840441351, "kl": 0.029462572187185287, "learning_rate": 9.883560308064722e-07, "loss": -0.0246, "num_tokens": 4405026.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.2831519842147827, "sampling/importance_sampling_ratio/mean": 1.0000826120376587, "sampling/importance_sampling_ratio/min": 0.592808723449707, "sampling/sampling_logp_difference/max": 0.5228835344314575, "sampling/sampling_logp_difference/mean": 0.015138004906475544, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 172.328125, "completions/mean_terminated_length": 172.328125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5486119389533997, "epoch": 0.3274336283185841, "frac_reward_zero_std": 0.5, "grad_norm": 1.300090454368962, "kl": 0.0219305157661438, "learning_rate": 9.880223137675707e-07, "loss": 0.0773, "num_tokens": 4426071.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.275173544883728, "sampling/importance_sampling_ratio/mean": 0.9996733069419861, "sampling/importance_sampling_ratio/min": 0.7533791661262512, "sampling/sampling_logp_difference/max": 0.283186674118042, "sampling/sampling_logp_difference/mean": 0.014903266914188862, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 139.171875, "completions/mean_terminated_length": 139.171875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.4225764572620392, "epoch": 0.3292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.031880093672485184, "kl": 0.025315623730421066, "learning_rate": 9.876839398258639e-07, "loss": 0.0002, "num_tokens": 4446130.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2970459461212158, "sampling/importance_sampling_ratio/mean": 0.9998003244400024, "sampling/importance_sampling_ratio/min": 0.7320749163627625, "sampling/sampling_logp_difference/max": 0.3118724822998047, "sampling/sampling_logp_difference/mean": 0.01426306925714016, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 138.203125, "completions/mean_terminated_length": 138.203125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5223335027694702, "epoch": 0.3309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.1814954228708288, "kl": 0.026554003357887268, "learning_rate": 9.873409122102503e-07, "loss": 0.0082, "num_tokens": 4465391.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.470061182975769, "sampling/importance_sampling_ratio/mean": 0.9996907711029053, "sampling/importance_sampling_ratio/min": 0.7554728388786316, "sampling/sampling_logp_difference/max": 0.38530397415161133, "sampling/sampling_logp_difference/mean": 0.015401721000671387, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 236.8125, "completions/mean_terminated_length": 236.8125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5235884189605713, "epoch": 0.3327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 1.0931493891819872, "kl": 0.02814125269651413, "learning_rate": 9.869932341940358e-07, "loss": 0.0156, "num_tokens": 4491539.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5462726354599, "sampling/importance_sampling_ratio/mean": 0.9997540712356567, "sampling/importance_sampling_ratio/min": 0.7248845100402832, "sampling/sampling_logp_difference/max": 0.43584728240966797, "sampling/sampling_logp_difference/mean": 0.013916519470512867, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 159.703125, "completions/mean_terminated_length": 159.703125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.34615474939346313, "epoch": 0.3345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.022235320012852892, "kl": 0.01645616814494133, "learning_rate": 9.86640909094902e-07, "loss": 0.0001, "num_tokens": 4512032.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5268900394439697, "sampling/importance_sampling_ratio/mean": 0.9999336004257202, "sampling/importance_sampling_ratio/min": 0.6493529677391052, "sampling/sampling_logp_difference/max": 0.4317789077758789, "sampling/sampling_logp_difference/mean": 0.011423708871006966, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 263.4375, "completions/mean_terminated_length": 263.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.525526762008667, "epoch": 0.336283185840708, "frac_reward_zero_std": 0.25, "grad_norm": 1.022987653796978, "kl": 0.024146491661667824, "learning_rate": 9.862839402748753e-07, "loss": -0.023, "num_tokens": 4540108.0, "reward": 0.40625, "reward_std": 0.5986068248748779, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.300493597984314, "sampling/importance_sampling_ratio/mean": 1.0000572204589844, "sampling/importance_sampling_ratio/min": 0.7142893075942993, "sampling/sampling_logp_difference/max": 0.33646726608276367, "sampling/sampling_logp_difference/mean": 0.013864274136722088, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 169.0625, "completions/mean_terminated_length": 169.0625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.3839399814605713, "epoch": 0.3380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.029181580902275055, "kl": 0.02368132770061493, "learning_rate": 9.859223311402936e-07, "loss": 0.0002, "num_tokens": 4561120.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3923920392990112, "sampling/importance_sampling_ratio/mean": 1.000184178352356, "sampling/importance_sampling_ratio/min": 0.6896332502365112, "sampling/sampling_logp_difference/max": 0.3715953826904297, "sampling/sampling_logp_difference/mean": 0.012256739661097527, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.5441832542419434, "epoch": 0.3398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.0718682372955246, "kl": 0.028561726212501526, "learning_rate": 9.85556085141775e-07, "loss": -0.0148, "num_tokens": 4584352.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.557019829750061, "sampling/importance_sampling_ratio/mean": 0.9994099140167236, "sampling/importance_sampling_ratio/min": 0.775416910648346, "sampling/sampling_logp_difference/max": 0.44277358055114746, "sampling/sampling_logp_difference/mean": 0.015070498920977116, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.43203431367874146, "epoch": 0.3415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0266817878628731, "kl": 0.02091498300433159, "learning_rate": 9.851852057741844e-07, "loss": 0.0002, "num_tokens": 4607592.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.503190040588379, "sampling/importance_sampling_ratio/mean": 0.9997876882553101, "sampling/importance_sampling_ratio/min": 0.6923187971115112, "sampling/sampling_logp_difference/max": 0.40758955478668213, "sampling/sampling_logp_difference/mean": 0.012802131474018097, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 160.765625, "completions/mean_terminated_length": 160.765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.49228742718696594, "epoch": 0.3433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 1.0169072357529663, "kl": 0.03031754121184349, "learning_rate": 9.848096965766002e-07, "loss": -0.0036, "num_tokens": 4628649.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.635255217552185, "sampling/importance_sampling_ratio/mean": 1.000304102897644, "sampling/importance_sampling_ratio/min": 0.5502952337265015, "sampling/sampling_logp_difference/max": 0.5973002910614014, "sampling/sampling_logp_difference/mean": 0.015048659406602383, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4443419575691223, "epoch": 0.34513274336283184, "frac_reward_zero_std": 1.0, "grad_norm": 0.030205750394484598, "kl": 0.020870406180620193, "learning_rate": 9.844295611322803e-07, "loss": 0.0002, "num_tokens": 4647929.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4348970651626587, "sampling/importance_sampling_ratio/mean": 0.9994109869003296, "sampling/importance_sampling_ratio/min": 0.6878430247306824, "sampling/sampling_logp_difference/max": 0.3741946220397949, "sampling/sampling_logp_difference/mean": 0.014229003340005875, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 161.21875, "completions/mean_terminated_length": 161.21875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5322006940841675, "epoch": 0.34690265486725663, "frac_reward_zero_std": 0.5, "grad_norm": 1.2546880964081744, "kl": 0.03158285841345787, "learning_rate": 9.84044803068628e-07, "loss": 0.0523, "num_tokens": 4668887.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.314862608909607, "sampling/importance_sampling_ratio/mean": 1.0002789497375488, "sampling/importance_sampling_ratio/min": 0.6997438669204712, "sampling/sampling_logp_difference/max": 0.3570408821105957, "sampling/sampling_logp_difference/mean": 0.01580563187599182, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 125.796875, "completions/mean_terminated_length": 125.796875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.41670292615890503, "epoch": 0.3486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 1.1552479246463945, "kl": 0.02814123034477234, "learning_rate": 9.836554260571577e-07, "loss": 0.0012, "num_tokens": 4687978.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3579728603363037, "sampling/importance_sampling_ratio/mean": 0.9997336864471436, "sampling/importance_sampling_ratio/min": 0.758485734462738, "sampling/sampling_logp_difference/max": 0.30599308013916016, "sampling/sampling_logp_difference/mean": 0.013592096045613289, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 111.953125, "completions/mean_terminated_length": 111.953125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.4496341347694397, "epoch": 0.3504424778761062, "frac_reward_zero_std": 1.0, "grad_norm": 0.02802608199440724, "kl": 0.017266346141695976, "learning_rate": 9.832614338134595e-07, "loss": 0.0002, "num_tokens": 4705207.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2806168794631958, "sampling/importance_sampling_ratio/mean": 1.0000157356262207, "sampling/importance_sampling_ratio/min": 0.7136625647544861, "sampling/sampling_logp_difference/max": 0.3373451232910156, "sampling/sampling_logp_difference/mean": 0.014268642291426659, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 210.609375, "completions/mean_terminated_length": 210.609375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5102720856666565, "epoch": 0.35221238938053095, "frac_reward_zero_std": 1.0, "grad_norm": 0.022503477491189177, "kl": 0.018594201654195786, "learning_rate": 9.828628300971638e-07, "loss": 0.0002, "num_tokens": 4729614.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2971690893173218, "sampling/importance_sampling_ratio/mean": 1.0003905296325684, "sampling/importance_sampling_ratio/min": 0.6395329833030701, "sampling/sampling_logp_difference/max": 0.4470170736312866, "sampling/sampling_logp_difference/mean": 0.015570325776934624, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 216.84375, "completions/mean_terminated_length": 216.84375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.6130630970001221, "epoch": 0.35398230088495575, "frac_reward_zero_std": 0.25, "grad_norm": 1.2579906527377016, "kl": 0.035723909735679626, "learning_rate": 9.82459618711906e-07, "loss": -0.0017, "num_tokens": 4761172.0, "reward": 0.09375, "reward_std": 0.5431214570999146, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4927719831466675, "sampling/importance_sampling_ratio/mean": 1.0007643699645996, "sampling/importance_sampling_ratio/min": 0.4492037892341614, "sampling/sampling_logp_difference/max": 0.8002786636352539, "sampling/sampling_logp_difference/mean": 0.01679423823952675, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 292.1875, "completions/mean_terminated_length": 292.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.48770490288734436, "epoch": 0.35575221238938054, "frac_reward_zero_std": 0.25, "grad_norm": 0.8905894974968666, "kl": 0.029037956148386, "learning_rate": 9.820518035052889e-07, "loss": -0.0269, "num_tokens": 4789808.0, "reward": 0.59375, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.470080018043518, "sampling/importance_sampling_ratio/mean": 0.9998273849487305, "sampling/importance_sampling_ratio/min": 0.7773846387863159, "sampling/sampling_logp_difference/max": 0.3853168487548828, "sampling/sampling_logp_difference/mean": 0.013518981635570526, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5353986620903015, "epoch": 0.35752212389380533, "frac_reward_zero_std": 0.25, "grad_norm": 1.080223988966263, "kl": 0.03176872059702873, "learning_rate": 9.816393883688475e-07, "loss": -0.0516, "num_tokens": 4820148.0, "reward": 0.25, "reward_std": 0.6613117456436157, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4061598777770996, "sampling/importance_sampling_ratio/mean": 1.0003252029418945, "sampling/importance_sampling_ratio/min": 0.7328179478645325, "sampling/sampling_logp_difference/max": 0.340862512588501, "sampling/sampling_logp_difference/mean": 0.014487815089523792, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.4726807177066803, "epoch": 0.35929203539823007, "frac_reward_zero_std": 0.25, "grad_norm": 1.2417660300394053, "kl": 0.03013826720416546, "learning_rate": 9.812223772380105e-07, "loss": -0.1356, "num_tokens": 4845676.0, "reward": 0.78125, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2829562425613403, "sampling/importance_sampling_ratio/mean": 0.9998630285263062, "sampling/importance_sampling_ratio/min": 0.6370500922203064, "sampling/sampling_logp_difference/max": 0.45090699195861816, "sampling/sampling_logp_difference/mean": 0.014757448807358742, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.43779507279396057, "epoch": 0.36106194690265486, "frac_reward_zero_std": 0.75, "grad_norm": 0.7450664054258594, "kl": 0.020679209381341934, "learning_rate": 9.808007740920645e-07, "loss": 0.0192, "num_tokens": 4876580.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.426794171333313, "sampling/importance_sampling_ratio/mean": 1.000209093093872, "sampling/importance_sampling_ratio/min": 0.7456256151199341, "sampling/sampling_logp_difference/max": 0.35543012619018555, "sampling/sampling_logp_difference/mean": 0.012484179809689522, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 230.640625, "completions/mean_terminated_length": 230.640625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.39596420526504517, "epoch": 0.36283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.028649512453676612, "kl": 0.02035467140376568, "learning_rate": 9.803745829541137e-07, "loss": 0.0002, "num_tokens": 4904285.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.276512861251831, "sampling/importance_sampling_ratio/mean": 0.999709963798523, "sampling/importance_sampling_ratio/min": 0.6210353970527649, "sampling/sampling_logp_difference/max": 0.4763672351837158, "sampling/sampling_logp_difference/mean": 0.012431886978447437, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 212.0625, "completions/mean_terminated_length": 212.0625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.39081645011901855, "epoch": 0.36460176991150445, "frac_reward_zero_std": 0.75, "grad_norm": 0.8304942758969864, "kl": 0.020595062524080276, "learning_rate": 9.799438078910432e-07, "loss": 0.0071, "num_tokens": 4928257.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.439272403717041, "sampling/importance_sampling_ratio/mean": 1.0004137754440308, "sampling/importance_sampling_ratio/min": 0.5685546398162842, "sampling/sampling_logp_difference/max": 0.5646578073501587, "sampling/sampling_logp_difference/mean": 0.012400672771036625, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 174.296875, "completions/mean_terminated_length": 174.296875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.45757555961608887, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 0.9303005271602158, "kl": 0.02256106585264206, "learning_rate": 9.7950845301348e-07, "loss": 0.0102, "num_tokens": 4949508.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.2926169633865356, "sampling/importance_sampling_ratio/mean": 0.9993118643760681, "sampling/importance_sampling_ratio/min": 0.679608166217804, "sampling/sampling_logp_difference/max": 0.38623881340026855, "sampling/sampling_logp_difference/mean": 0.015102554112672806, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 169.71875, "completions/mean_terminated_length": 169.71875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.4889061748981476, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.0921690403364595, "kl": 0.024689186364412308, "learning_rate": 9.790685224757532e-07, "loss": -0.0119, "num_tokens": 4970706.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3175941705703735, "sampling/importance_sampling_ratio/mean": 1.0001461505889893, "sampling/importance_sampling_ratio/min": 0.7527468204498291, "sampling/sampling_logp_difference/max": 0.2840263843536377, "sampling/sampling_logp_difference/mean": 0.015073701739311218, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 100.859375, "completions/mean_terminated_length": 100.859375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.42682167887687683, "epoch": 0.36991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.022490545695822525, "kl": 0.015489723533391953, "learning_rate": 9.786240204758552e-07, "loss": 0.0001, "num_tokens": 4986233.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.409118890762329, "sampling/importance_sampling_ratio/mean": 0.9991854429244995, "sampling/importance_sampling_ratio/min": 0.7636756300926208, "sampling/sampling_logp_difference/max": 0.34296464920043945, "sampling/sampling_logp_difference/mean": 0.015632610768079758, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 228.65625, "completions/mean_terminated_length": 228.65625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.48719555139541626, "epoch": 0.37168141592920356, "frac_reward_zero_std": 0.5, "grad_norm": 1.0320827191899484, "kl": 0.03096625581383705, "learning_rate": 9.781749512553998e-07, "loss": -0.0384, "num_tokens": 5013219.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.2963361740112305, "sampling/importance_sampling_ratio/mean": 1.0002009868621826, "sampling/importance_sampling_ratio/min": 0.6687098145484924, "sampling/sampling_logp_difference/max": 0.4024050235748291, "sampling/sampling_logp_difference/mean": 0.014101540669798851, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 224.453125, "completions/mean_terminated_length": 224.453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.4804442822933197, "epoch": 0.3734513274336283, "frac_reward_zero_std": 0.5, "grad_norm": 1.1395964975187098, "kl": 0.027629774063825607, "learning_rate": 9.777213190995847e-07, "loss": -0.0554, "num_tokens": 5039568.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5796170234680176, "sampling/importance_sampling_ratio/mean": 0.9995678067207336, "sampling/importance_sampling_ratio/min": 0.6933406591415405, "sampling/sampling_logp_difference/max": 0.4571824073791504, "sampling/sampling_logp_difference/mean": 0.014583523385226727, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 181.921875, "completions/mean_terminated_length": 181.921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.4216836094856262, "epoch": 0.3752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.8571734719829718, "kl": 0.021015014499425888, "learning_rate": 9.77263128337148e-07, "loss": -0.0087, "num_tokens": 5063211.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.2888585329055786, "sampling/importance_sampling_ratio/mean": 0.9995905756950378, "sampling/importance_sampling_ratio/min": 0.7019922137260437, "sampling/sampling_logp_difference/max": 0.3538329601287842, "sampling/sampling_logp_difference/mean": 0.013555197976529598, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 219.84375, "completions/mean_terminated_length": 219.84375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.47979164123535156, "epoch": 0.3769911504424779, "frac_reward_zero_std": 0.5, "grad_norm": 1.0180305395478475, "kl": 0.023723438382148743, "learning_rate": 9.768003833403276e-07, "loss": 0.0023, "num_tokens": 5089937.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.267691969871521, "sampling/importance_sampling_ratio/mean": 0.9998621940612793, "sampling/importance_sampling_ratio/min": 0.7605219483375549, "sampling/sampling_logp_difference/max": 0.27375030517578125, "sampling/sampling_logp_difference/mean": 0.014234641566872597, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 95.71875, "completions/mean_terminated_length": 95.71875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.34744882583618164, "epoch": 0.3787610619469027, "frac_reward_zero_std": 1.0, "grad_norm": 0.029149947071528782, "kl": 0.014398623257875443, "learning_rate": 9.763330885248204e-07, "loss": 0.0001, "num_tokens": 5105919.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.576385259628296, "sampling/importance_sampling_ratio/mean": 1.0000059604644775, "sampling/importance_sampling_ratio/min": 0.7575017213821411, "sampling/sampling_logp_difference/max": 0.45513439178466797, "sampling/sampling_logp_difference/mean": 0.014102145098149776, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.3961879014968872, "epoch": 0.3805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 1.4138819799413318, "kl": 0.019451893866062164, "learning_rate": 9.758612483497394e-07, "loss": 0.0066, "num_tokens": 5124383.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5818085670471191, "sampling/importance_sampling_ratio/mean": 1.0003561973571777, "sampling/importance_sampling_ratio/min": 0.7789625525474548, "sampling/sampling_logp_difference/max": 0.458568811416626, "sampling/sampling_logp_difference/mean": 0.014672085642814636, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4073773920536041, "epoch": 0.3823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 0.8016637039465784, "kl": 0.026877960190176964, "learning_rate": 9.753848673175707e-07, "loss": -0.0335, "num_tokens": 5147971.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3988468647003174, "sampling/importance_sampling_ratio/mean": 1.0001204013824463, "sampling/importance_sampling_ratio/min": 0.7779297232627869, "sampling/sampling_logp_difference/max": 0.3356482982635498, "sampling/sampling_logp_difference/mean": 0.014463700354099274, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 162.171875, "completions/mean_terminated_length": 162.171875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.34233927726745605, "epoch": 0.384070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0232026754106461, "kl": 0.013723460026085377, "learning_rate": 9.74903949974131e-07, "loss": 0.0001, "num_tokens": 5168734.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2804162502288818, "sampling/importance_sampling_ratio/mean": 0.9998216032981873, "sampling/importance_sampling_ratio/min": 0.6112180948257446, "sampling/sampling_logp_difference/max": 0.49230146408081055, "sampling/sampling_logp_difference/mean": 0.013114867731928825, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 190.96875, "completions/mean_terminated_length": 190.96875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.4857107698917389, "epoch": 0.3858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.2399086943742759, "kl": 0.02303343266248703, "learning_rate": 9.744185009085256e-07, "loss": -0.0179, "num_tokens": 5191228.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.293093204498291, "sampling/importance_sampling_ratio/mean": 0.9996750950813293, "sampling/importance_sampling_ratio/min": 0.7136985063552856, "sampling/sampling_logp_difference/max": 0.33729469776153564, "sampling/sampling_logp_difference/mean": 0.01510285772383213, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 174.890625, "completions/mean_terminated_length": 174.890625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.42115509510040283, "epoch": 0.38761061946902653, "frac_reward_zero_std": 0.5, "grad_norm": 1.3727345047939208, "kl": 0.019435705617070198, "learning_rate": 9.739285247531017e-07, "loss": 0.0024, "num_tokens": 5213365.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.518711805343628, "sampling/importance_sampling_ratio/mean": 0.9994184970855713, "sampling/importance_sampling_ratio/min": 0.6116957068443298, "sampling/sampling_logp_difference/max": 0.49152040481567383, "sampling/sampling_logp_difference/mean": 0.016335811465978622, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 200.921875, "completions/mean_terminated_length": 200.921875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5480678677558899, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.5, "grad_norm": 1.0694191207278179, "kl": 0.032260291278362274, "learning_rate": 9.734340261834066e-07, "loss": -0.0016, "num_tokens": 5237184.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3937193155288696, "sampling/importance_sampling_ratio/mean": 1.0005559921264648, "sampling/importance_sampling_ratio/min": 0.712913990020752, "sampling/sampling_logp_difference/max": 0.3383945822715759, "sampling/sampling_logp_difference/mean": 0.0160797368735075, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 268.6875, "completions/mean_terminated_length": 268.6875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4439542889595032, "epoch": 0.3911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 0.7466460883396188, "kl": 0.021524842828512192, "learning_rate": 9.729350099181419e-07, "loss": -0.0264, "num_tokens": 5266108.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3670485019683838, "sampling/importance_sampling_ratio/mean": 0.9998194575309753, "sampling/importance_sampling_ratio/min": 0.7128616571426392, "sampling/sampling_logp_difference/max": 0.33846795558929443, "sampling/sampling_logp_difference/mean": 0.013549437746405602, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 162.8125, "completions/mean_terminated_length": 162.8125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3944702446460724, "epoch": 0.3929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.028899239112063258, "kl": 0.019183315336704254, "learning_rate": 9.724314807191196e-07, "loss": 0.0002, "num_tokens": 5287632.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4702560901641846, "sampling/importance_sampling_ratio/mean": 0.9997889399528503, "sampling/importance_sampling_ratio/min": 0.41246071457862854, "sampling/sampling_logp_difference/max": 0.8856143951416016, "sampling/sampling_logp_difference/mean": 0.014504620805382729, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.5393285751342773, "epoch": 0.39469026548672564, "frac_reward_zero_std": 0.25, "grad_norm": 1.4274717965548285, "kl": 0.026311691850423813, "learning_rate": 9.719234433912146e-07, "loss": 0.0446, "num_tokens": 5313392.0, "reward": 0.21875, "reward_std": 0.5722135901451111, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.26570725440979, "sampling/importance_sampling_ratio/mean": 0.9996041059494019, "sampling/importance_sampling_ratio/min": 0.6907061338424683, "sampling/sampling_logp_difference/max": 0.3700408935546875, "sampling/sampling_logp_difference/mean": 0.015244169160723686, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 150.3125, "completions/mean_terminated_length": 150.3125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.4326164424419403, "epoch": 0.39646017699115044, "frac_reward_zero_std": 0.5, "grad_norm": 1.5999864335337923, "kl": 0.020327169448137283, "learning_rate": 9.714109027823216e-07, "loss": 0.0684, "num_tokens": 5334580.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4404765367507935, "sampling/importance_sampling_ratio/mean": 1.0008692741394043, "sampling/importance_sampling_ratio/min": 0.7005481123924255, "sampling/sampling_logp_difference/max": 0.3649740219116211, "sampling/sampling_logp_difference/mean": 0.016018476337194443, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 135.765625, "completions/mean_terminated_length": 135.765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.40380361676216125, "epoch": 0.39823008849557523, "frac_reward_zero_std": 0.75, "grad_norm": 0.8818988106354686, "kl": 0.013917742297053337, "learning_rate": 9.708938637833064e-07, "loss": -0.0576, "num_tokens": 5353413.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4011801481246948, "sampling/importance_sampling_ratio/mean": 0.9996488094329834, "sampling/importance_sampling_ratio/min": 0.6444272994995117, "sampling/sampling_logp_difference/max": 0.4393932819366455, "sampling/sampling_logp_difference/mean": 0.01501011848449707, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 261.71875, "completions/mean_terminated_length": 261.71875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.37733006477355957, "epoch": 0.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.9024941397959875, "kl": 0.01724552921950817, "learning_rate": 9.703723313279605e-07, "loss": 0.0067, "num_tokens": 5380131.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3694186210632324, "sampling/importance_sampling_ratio/mean": 0.9993304014205933, "sampling/importance_sampling_ratio/min": 0.7134221792221069, "sampling/sampling_logp_difference/max": 0.33768200874328613, "sampling/sampling_logp_difference/mean": 0.012753515504300594, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5193660259246826, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.5, "grad_norm": 1.008127423692569, "kl": 0.02917841635644436, "learning_rate": 9.698463103929541e-07, "loss": -0.0033, "num_tokens": 5404803.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4189236164093018, "sampling/importance_sampling_ratio/mean": 1.0000311136245728, "sampling/importance_sampling_ratio/min": 0.764443576335907, "sampling/sampling_logp_difference/max": 0.3498985767364502, "sampling/sampling_logp_difference/mean": 0.016841476783156395, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 282.546875, "completions/mean_terminated_length": 282.546875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.4152635335922241, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.5, "grad_norm": 0.8083735248652889, "kl": 0.01835733652114868, "learning_rate": 9.693158059977877e-07, "loss": -0.0103, "num_tokens": 5434374.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.2921773195266724, "sampling/importance_sampling_ratio/mean": 0.999847412109375, "sampling/importance_sampling_ratio/min": 0.6954969167709351, "sampling/sampling_logp_difference/max": 0.363128662109375, "sampling/sampling_logp_difference/mean": 0.013232965022325516, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 87.109375, "completions/mean_terminated_length": 87.109375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.35415130853652954, "epoch": 0.40530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0414154158849392, "kl": 0.013880953192710876, "learning_rate": 9.68780823204745e-07, "loss": 0.0001, "num_tokens": 5449437.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.431895136833191, "sampling/importance_sampling_ratio/mean": 0.9999564290046692, "sampling/importance_sampling_ratio/min": 0.7358685731887817, "sampling/sampling_logp_difference/max": 0.35899877548217773, "sampling/sampling_logp_difference/mean": 0.01602562889456749, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 156.296875, "completions/mean_terminated_length": 156.296875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.35336047410964966, "epoch": 0.40707964601769914, "frac_reward_zero_std": 1.0, "grad_norm": 0.031836140342479294, "kl": 0.018275540322065353, "learning_rate": 9.682413671188444e-07, "loss": 0.0002, "num_tokens": 5469536.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3247321844100952, "sampling/importance_sampling_ratio/mean": 1.0001217126846313, "sampling/importance_sampling_ratio/min": 0.6933028101921082, "sampling/sampling_logp_difference/max": 0.366288423538208, "sampling/sampling_logp_difference/mean": 0.013501457870006561, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 238.65625, "completions/mean_terminated_length": 238.65625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4286758303642273, "epoch": 0.4088495575221239, "frac_reward_zero_std": 0.5, "grad_norm": 1.121053928669367, "kl": 0.020949389785528183, "learning_rate": 9.6769744288779e-07, "loss": 0.0439, "num_tokens": 5496730.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4388401508331299, "sampling/importance_sampling_ratio/mean": 0.9996062517166138, "sampling/importance_sampling_ratio/min": 0.6555602550506592, "sampling/sampling_logp_difference/max": 0.42226505279541016, "sampling/sampling_logp_difference/mean": 0.01407901756465435, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.47462913393974304, "epoch": 0.41061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 1.1924424185476858, "kl": 0.02640056610107422, "learning_rate": 9.671490557019233e-07, "loss": -0.1199, "num_tokens": 5519822.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.287208914756775, "sampling/importance_sampling_ratio/mean": 0.9999936819076538, "sampling/importance_sampling_ratio/min": 0.7403419017791748, "sampling/sampling_logp_difference/max": 0.3006432056427002, "sampling/sampling_logp_difference/mean": 0.014690637588500977, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 211.71875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3383290767669678, "epoch": 0.41238938053097346, "frac_reward_zero_std": 0.75, "grad_norm": 0.7896147238763637, "kl": 0.015110672451555729, "learning_rate": 9.665962107941724e-07, "loss": -0.0163, "num_tokens": 5543084.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3566224575042725, "sampling/importance_sampling_ratio/mean": 0.9994478225708008, "sampling/importance_sampling_ratio/min": 0.5494151711463928, "sampling/sampling_logp_difference/max": 0.5989007949829102, "sampling/sampling_logp_difference/mean": 0.012813359498977661, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 274.765625, "completions/mean_terminated_length": 274.765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5099519491195679, "epoch": 0.41415929203539825, "frac_reward_zero_std": 0.25, "grad_norm": 0.9556882995385025, "kl": 0.021475229412317276, "learning_rate": 9.660389134400033e-07, "loss": 0.0307, "num_tokens": 5572397.0, "reward": 0.15625, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.8419055938720703, "sampling/importance_sampling_ratio/mean": 1.0001903772354126, "sampling/importance_sampling_ratio/min": 0.646623432636261, "sampling/sampling_logp_difference/max": 0.6108007431030273, "sampling/sampling_logp_difference/mean": 0.01525837741792202, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.3300395607948303, "epoch": 0.415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.031661613973588035, "kl": 0.017582304775714874, "learning_rate": 9.654771689573684e-07, "loss": 0.0002, "num_tokens": 5595833.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.403545618057251, "sampling/importance_sampling_ratio/mean": 0.9993851780891418, "sampling/importance_sampling_ratio/min": 0.6508767604827881, "sampling/sampling_logp_difference/max": 0.42943501472473145, "sampling/sampling_logp_difference/mean": 0.012309819459915161, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 138.90625, "completions/mean_terminated_length": 138.90625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.39855900406837463, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.0830721451281482, "kl": 0.019422704353928566, "learning_rate": 9.64910982706657e-07, "loss": 0.0265, "num_tokens": 5616099.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5871074199676514, "sampling/importance_sampling_ratio/mean": 1.000655174255371, "sampling/importance_sampling_ratio/min": 0.7812917828559875, "sampling/sampling_logp_difference/max": 0.4619131088256836, "sampling/sampling_logp_difference/mean": 0.014437375590205193, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 171.328125, "completions/mean_terminated_length": 171.328125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.45381003618240356, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.5, "grad_norm": 1.4641597741862753, "kl": 0.020227786153554916, "learning_rate": 9.643403600906432e-07, "loss": -0.0551, "num_tokens": 5636040.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.390151023864746, "sampling/importance_sampling_ratio/mean": 1.0003771781921387, "sampling/importance_sampling_ratio/min": 0.7794750928878784, "sampling/sampling_logp_difference/max": 0.32941246032714844, "sampling/sampling_logp_difference/mean": 0.014798464253544807, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.42035090923309326, "epoch": 0.42123893805309737, "frac_reward_zero_std": 0.75, "grad_norm": 1.3697386534432274, "kl": 0.024695709347724915, "learning_rate": 9.637653065544349e-07, "loss": 0.0685, "num_tokens": 5654624.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.508662223815918, "sampling/importance_sampling_ratio/mean": 0.9996183514595032, "sampling/importance_sampling_ratio/min": 0.7458860278129578, "sampling/sampling_logp_difference/max": 0.41122329235076904, "sampling/sampling_logp_difference/mean": 0.016772709786891937, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 183.53125, "completions/mean_terminated_length": 183.53125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4483111500740051, "epoch": 0.4230088495575221, "frac_reward_zero_std": 0.75, "grad_norm": 0.8433104098541186, "kl": 0.02792176976799965, "learning_rate": 9.63185827585421e-07, "loss": -0.0305, "num_tokens": 5676594.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4983381032943726, "sampling/importance_sampling_ratio/mean": 0.999271035194397, "sampling/importance_sampling_ratio/min": 0.5446476340293884, "sampling/sampling_logp_difference/max": 0.6076161861419678, "sampling/sampling_logp_difference/mean": 0.01616353914141655, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 247.96875, "completions/mean_terminated_length": 247.96875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5001981258392334, "epoch": 0.4247787610619469, "frac_reward_zero_std": 0.5, "grad_norm": 0.8655860415568635, "kl": 0.02763250842690468, "learning_rate": 9.6260192871322e-07, "loss": -0.027, "num_tokens": 5704704.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4057986736297607, "sampling/importance_sampling_ratio/mean": 1.0000677108764648, "sampling/importance_sampling_ratio/min": 0.7122602462768555, "sampling/sampling_logp_difference/max": 0.34060561656951904, "sampling/sampling_logp_difference/mean": 0.014366839081048965, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 258.453125, "completions/mean_terminated_length": 258.453125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.5985598564147949, "epoch": 0.4265486725663717, "frac_reward_zero_std": 0.25, "grad_norm": 1.144584903729403, "kl": 0.031906358897686005, "learning_rate": 9.620136155096275e-07, "loss": -0.0896, "num_tokens": 5731485.0, "reward": -0.34375, "reward_std": 0.718070387840271, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.37568998336792, "sampling/importance_sampling_ratio/mean": 1.0000308752059937, "sampling/importance_sampling_ratio/min": 0.781316339969635, "sampling/sampling_logp_difference/max": 0.3189554214477539, "sampling/sampling_logp_difference/mean": 0.01639627106487751, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 231.84375, "completions/mean_terminated_length": 231.84375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.48729994893074036, "epoch": 0.4283185840707965, "frac_reward_zero_std": 0.25, "grad_norm": 1.138170495066922, "kl": 0.031542450189590454, "learning_rate": 9.614208935885614e-07, "loss": -0.0426, "num_tokens": 5760499.0, "reward": 0.09375, "reward_std": 0.7379794120788574, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4415761232376099, "sampling/importance_sampling_ratio/mean": 0.9998794794082642, "sampling/importance_sampling_ratio/min": 0.6376157999038696, "sampling/sampling_logp_difference/max": 0.45001935958862305, "sampling/sampling_logp_difference/mean": 0.014306294731795788, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.4347379505634308, "epoch": 0.4300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 1.0293509731219213, "kl": 0.023223329335451126, "learning_rate": 9.608237686060097e-07, "loss": -0.0021, "num_tokens": 5780995.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.621147632598877, "sampling/importance_sampling_ratio/mean": 1.0011894702911377, "sampling/importance_sampling_ratio/min": 0.7294524312019348, "sampling/sampling_logp_difference/max": 0.48313426971435547, "sampling/sampling_logp_difference/mean": 0.015486613847315311, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 247.03125, "completions/mean_terminated_length": 247.03125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.40740007162094116, "epoch": 0.431858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.8552086230548112, "kl": 0.02226535975933075, "learning_rate": 9.602222462599766e-07, "loss": -0.0077, "num_tokens": 5811669.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3945648670196533, "sampling/importance_sampling_ratio/mean": 0.9997775554656982, "sampling/importance_sampling_ratio/min": 0.7197755575180054, "sampling/sampling_logp_difference/max": 0.3325824737548828, "sampling/sampling_logp_difference/mean": 0.012866266071796417, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 180.390625, "completions/mean_terminated_length": 180.390625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3240535855293274, "epoch": 0.4336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.03284049407850343, "kl": 0.019132770597934723, "learning_rate": 9.596163322904269e-07, "loss": 0.0002, "num_tokens": 5832302.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4418944120407104, "sampling/importance_sampling_ratio/mean": 1.000842809677124, "sampling/importance_sampling_ratio/min": 0.7643632888793945, "sampling/sampling_logp_difference/max": 0.36595773696899414, "sampling/sampling_logp_difference/mean": 0.0135429035872221, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 214.40625, "completions/mean_terminated_length": 214.40625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4747132360935211, "epoch": 0.4353982300884956, "frac_reward_zero_std": 0.75, "grad_norm": 0.9230109152248913, "kl": 0.0271947979927063, "learning_rate": 9.590060324792325e-07, "loss": -0.0952, "num_tokens": 5856648.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6359583139419556, "sampling/importance_sampling_ratio/mean": 1.000832200050354, "sampling/importance_sampling_ratio/min": 0.700729250907898, "sampling/sampling_logp_difference/max": 0.49222874641418457, "sampling/sampling_logp_difference/mean": 0.015770025551319122, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 188.671875, "completions/mean_terminated_length": 188.671875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.4974265396595001, "epoch": 0.43716814159292033, "frac_reward_zero_std": 0.75, "grad_norm": 0.8865768731939675, "kl": 0.025145823135972023, "learning_rate": 9.58391352650117e-07, "loss": -0.0104, "num_tokens": 5878787.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.429240107536316, "sampling/importance_sampling_ratio/mean": 1.0002708435058594, "sampling/importance_sampling_ratio/min": 0.6547340154647827, "sampling/sampling_logp_difference/max": 0.4235262870788574, "sampling/sampling_logp_difference/mean": 0.014619768597185612, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 209.890625, "completions/mean_terminated_length": 209.890625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.420995831489563, "epoch": 0.4389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9326272513076128, "kl": 0.020633120089769363, "learning_rate": 9.57772298668599e-07, "loss": 0.0271, "num_tokens": 5903004.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.944155216217041, "sampling/importance_sampling_ratio/mean": 1.0003857612609863, "sampling/importance_sampling_ratio/min": 0.6512681245803833, "sampling/sampling_logp_difference/max": 0.6648275852203369, "sampling/sampling_logp_difference/mean": 0.014983316883444786, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 287.171875, "completions/mean_terminated_length": 287.171875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.4206381142139435, "epoch": 0.4407079646017699, "frac_reward_zero_std": 0.5, "grad_norm": 0.8410352904355655, "kl": 0.027448756620287895, "learning_rate": 9.57148876441938e-07, "loss": -0.0694, "num_tokens": 5932631.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2763044834136963, "sampling/importance_sampling_ratio/mean": 0.999886155128479, "sampling/importance_sampling_ratio/min": 0.7463252544403076, "sampling/sampling_logp_difference/max": 0.29259374737739563, "sampling/sampling_logp_difference/mean": 0.013162143528461456, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.48601922392845154, "epoch": 0.4424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 0.9748642348040831, "kl": 0.023372408002614975, "learning_rate": 9.565210919190763e-07, "loss": -0.0337, "num_tokens": 5966335.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7265528440475464, "sampling/importance_sampling_ratio/mean": 1.0004456043243408, "sampling/importance_sampling_ratio/min": 0.6496454477310181, "sampling/sampling_logp_difference/max": 0.5461268424987793, "sampling/sampling_logp_difference/mean": 0.014494102448225021, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 120.21875, "completions/mean_terminated_length": 120.21875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.46461203694343567, "epoch": 0.44424778761061945, "frac_reward_zero_std": 0.75, "grad_norm": 1.1964023424049564, "kl": 0.022524647414684296, "learning_rate": 9.558889510905835e-07, "loss": -0.0008, "num_tokens": 5987293.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4555951356887817, "sampling/importance_sampling_ratio/mean": 1.000432014465332, "sampling/importance_sampling_ratio/min": 0.7904923558235168, "sampling/sampling_logp_difference/max": 0.3754148483276367, "sampling/sampling_logp_difference/mean": 0.014719847589731216, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.36043643951416016, "epoch": 0.44601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.023732427347126793, "kl": 0.015227346681058407, "learning_rate": 9.55252459988598e-07, "loss": 0.0001, "num_tokens": 6006393.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2994132041931152, "sampling/importance_sampling_ratio/mean": 1.000097632408142, "sampling/importance_sampling_ratio/min": 0.7789879441261292, "sampling/sampling_logp_difference/max": 0.26191282272338867, "sampling/sampling_logp_difference/mean": 0.013076022267341614, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 190.609375, "completions/mean_terminated_length": 190.609375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.40344202518463135, "epoch": 0.44778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.029953875184706, "kl": 0.02808651328086853, "learning_rate": 9.546116246867713e-07, "loss": 0.0003, "num_tokens": 6029584.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6463197469711304, "sampling/importance_sampling_ratio/mean": 0.9998626708984375, "sampling/importance_sampling_ratio/min": 0.726875901222229, "sampling/sampling_logp_difference/max": 0.49854230880737305, "sampling/sampling_logp_difference/mean": 0.012515061534941196, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 179.703125, "completions/mean_terminated_length": 179.703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.4220643937587738, "epoch": 0.4495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.8259788650853315, "kl": 0.022247038781642914, "learning_rate": 9.539664513002084e-07, "loss": -0.0147, "num_tokens": 6051661.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.288933515548706, "sampling/importance_sampling_ratio/mean": 0.9997393488883972, "sampling/importance_sampling_ratio/min": 0.6351196765899658, "sampling/sampling_logp_difference/max": 0.45394182205200195, "sampling/sampling_logp_difference/mean": 0.014015771448612213, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 168.28125, "completions/mean_terminated_length": 168.28125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.44515854120254517, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.75, "grad_norm": 0.9384112300033148, "kl": 0.025413163006305695, "learning_rate": 9.533169459854098e-07, "loss": -0.0179, "num_tokens": 6072191.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4136977195739746, "sampling/importance_sampling_ratio/mean": 0.9999372363090515, "sampling/importance_sampling_ratio/min": 0.7728919386863708, "sampling/sampling_logp_difference/max": 0.3462088108062744, "sampling/sampling_logp_difference/mean": 0.014568738639354706, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 193.265625, "completions/mean_terminated_length": 193.265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.4067316949367523, "epoch": 0.45309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 0.9389057554377178, "kl": 0.01858017034828663, "learning_rate": 9.526631149402134e-07, "loss": -0.0157, "num_tokens": 6094752.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3617420196533203, "sampling/importance_sampling_ratio/mean": 0.9996851086616516, "sampling/importance_sampling_ratio/min": 0.7345516085624695, "sampling/sampling_logp_difference/max": 0.30876481533050537, "sampling/sampling_logp_difference/mean": 0.013466689735651016, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 217.171875, "completions/mean_terminated_length": 217.171875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3828349709510803, "epoch": 0.45486725663716815, "frac_reward_zero_std": 0.75, "grad_norm": 0.8531294053997839, "kl": 0.02091463841497898, "learning_rate": 9.520049644037347e-07, "loss": 0.0324, "num_tokens": 6118651.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3620110750198364, "sampling/importance_sampling_ratio/mean": 1.0002086162567139, "sampling/importance_sampling_ratio/min": 0.7171450853347778, "sampling/sampling_logp_difference/max": 0.3324770927429199, "sampling/sampling_logp_difference/mean": 0.011766104958951473, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 155.828125, "completions/mean_terminated_length": 155.828125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4815964698791504, "epoch": 0.45663716814159294, "frac_reward_zero_std": 0.75, "grad_norm": 1.062406338547466, "kl": 0.02147318422794342, "learning_rate": 9.513425006563078e-07, "loss": -0.0069, "num_tokens": 6139568.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3486475944519043, "sampling/importance_sampling_ratio/mean": 1.0001500844955444, "sampling/importance_sampling_ratio/min": 0.7188703417778015, "sampling/sampling_logp_difference/max": 0.3300743103027344, "sampling/sampling_logp_difference/mean": 0.015135202556848526, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 157.328125, "completions/mean_terminated_length": 157.328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.44287437200546265, "epoch": 0.4584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.9025426799805967, "kl": 0.02093086764216423, "learning_rate": 9.506757300194248e-07, "loss": 0.0123, "num_tokens": 6172485.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7544947862625122, "sampling/importance_sampling_ratio/mean": 1.0004286766052246, "sampling/importance_sampling_ratio/min": 0.6623055338859558, "sampling/sampling_logp_difference/max": 0.5621809959411621, "sampling/sampling_logp_difference/mean": 0.014587843790650368, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 207.546875, "completions/mean_terminated_length": 207.546875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.46362990140914917, "epoch": 0.46017699115044247, "frac_reward_zero_std": 0.5, "grad_norm": 1.0666632874497055, "kl": 0.027465712279081345, "learning_rate": 9.500046588556761e-07, "loss": 0.0485, "num_tokens": 6196920.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4749035835266113, "sampling/importance_sampling_ratio/mean": 0.9999303817749023, "sampling/importance_sampling_ratio/min": 0.7344458103179932, "sampling/sampling_logp_difference/max": 0.38859260082244873, "sampling/sampling_logp_difference/mean": 0.01438632421195507, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 162.96875, "completions/mean_terminated_length": 162.96875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.5139217376708984, "epoch": 0.46194690265486726, "frac_reward_zero_std": 0.5, "grad_norm": 1.3084236815398984, "kl": 0.02867557853460312, "learning_rate": 9.493292935686894e-07, "loss": 0.0121, "num_tokens": 6217718.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2784643173217773, "sampling/importance_sampling_ratio/mean": 1.0003505945205688, "sampling/importance_sampling_ratio/min": 0.6865198612213135, "sampling/sampling_logp_difference/max": 0.37612009048461914, "sampling/sampling_logp_difference/mean": 0.015432951971888542, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 272.109375, "completions/mean_terminated_length": 272.109375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3513563275337219, "epoch": 0.46371681415929206, "frac_reward_zero_std": 0.75, "grad_norm": 0.5901515522469556, "kl": 0.016452450305223465, "learning_rate": 9.486496406030685e-07, "loss": -0.0288, "num_tokens": 6247229.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5751897096633911, "sampling/importance_sampling_ratio/mean": 0.999725341796875, "sampling/importance_sampling_ratio/min": 0.6298417448997498, "sampling/sampling_logp_difference/max": 0.46228671073913574, "sampling/sampling_logp_difference/mean": 0.011618832126259804, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 220.78125, "completions/mean_terminated_length": 220.78125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.48534053564071655, "epoch": 0.4654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 0.9545346848980775, "kl": 0.03555607050657272, "learning_rate": 9.479657064443321e-07, "loss": 0.0075, "num_tokens": 6273759.0, "reward": 0.125, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.2853167057037354, "sampling/importance_sampling_ratio/mean": 1.0002505779266357, "sampling/importance_sampling_ratio/min": 0.7286049127578735, "sampling/sampling_logp_difference/max": 0.3166236877441406, "sampling/sampling_logp_difference/mean": 0.014509860426187515, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 298.90625, "completions/mean_terminated_length": 298.90625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3376370072364807, "epoch": 0.4672566371681416, "frac_reward_zero_std": 0.5, "grad_norm": 0.6308260639746693, "kl": 0.031115680932998657, "learning_rate": 9.472774976188513e-07, "loss": 0.0172, "num_tokens": 6304505.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.463265299797058, "sampling/importance_sampling_ratio/mean": 0.9994950294494629, "sampling/importance_sampling_ratio/min": 0.6298382878303528, "sampling/sampling_logp_difference/max": 0.4622921943664551, "sampling/sampling_logp_difference/mean": 0.010739332996308804, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 182.171875, "completions/mean_terminated_length": 182.171875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3612799048423767, "epoch": 0.4690265486725664, "frac_reward_zero_std": 0.75, "grad_norm": 0.995751963566048, "kl": 0.016666874289512634, "learning_rate": 9.465850206937887e-07, "loss": -0.0276, "num_tokens": 6326708.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4556899070739746, "sampling/importance_sampling_ratio/mean": 0.9997296333312988, "sampling/importance_sampling_ratio/min": 0.6958641409873962, "sampling/sampling_logp_difference/max": 0.37547993659973145, "sampling/sampling_logp_difference/mean": 0.013304678723216057, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 111.96875, "completions/mean_terminated_length": 111.96875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.33584925532341003, "epoch": 0.47079646017699117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04486005940704978, "kl": 0.016245992854237556, "learning_rate": 9.45888282277034e-07, "loss": 0.0002, "num_tokens": 6343634.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.411961317062378, "sampling/importance_sampling_ratio/mean": 0.9996743202209473, "sampling/importance_sampling_ratio/min": 0.7331298589706421, "sampling/sampling_logp_difference/max": 0.34497976303100586, "sampling/sampling_logp_difference/mean": 0.0139246117323637, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.3893989622592926, "epoch": 0.4725663716814159, "frac_reward_zero_std": 1.0, "grad_norm": 0.031575168981618086, "kl": 0.027552293613553047, "learning_rate": 9.451872890171419e-07, "loss": 0.0003, "num_tokens": 6366856.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3442569971084595, "sampling/importance_sampling_ratio/mean": 0.9997502565383911, "sampling/importance_sampling_ratio/min": 0.7825663089752197, "sampling/sampling_logp_difference/max": 0.2958414554595947, "sampling/sampling_logp_difference/mean": 0.012174347415566444, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 242.515625, "completions/mean_terminated_length": 242.515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.5150254368782043, "epoch": 0.4743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 1.1494259418584443, "kl": 0.04173735901713371, "learning_rate": 9.444820476032685e-07, "loss": -0.1109, "num_tokens": 6395721.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4457945823669434, "sampling/importance_sampling_ratio/mean": 0.9996910095214844, "sampling/importance_sampling_ratio/min": 0.49494314193725586, "sampling/sampling_logp_difference/max": 0.7033124566078186, "sampling/sampling_logp_difference/mean": 0.014535872265696526, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.3699858486652374, "epoch": 0.4761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.02506190100969895, "kl": 0.019144756719470024, "learning_rate": 9.437725647651078e-07, "loss": 0.0002, "num_tokens": 6420333.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3118797540664673, "sampling/importance_sampling_ratio/mean": 1.0001341104507446, "sampling/importance_sampling_ratio/min": 0.7496989369392395, "sampling/sampling_logp_difference/max": 0.288083553314209, "sampling/sampling_logp_difference/mean": 0.012265527620911598, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 216.65625, "completions/mean_terminated_length": 216.65625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.41407477855682373, "epoch": 0.4778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 0.754032133125725, "kl": 0.028063761070370674, "learning_rate": 9.430588472728269e-07, "loss": -0.0024, "num_tokens": 6443495.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.2595200538635254, "sampling/importance_sampling_ratio/mean": 0.9994900226593018, "sampling/importance_sampling_ratio/min": 0.732181966304779, "sampling/sampling_logp_difference/max": 0.3117262125015259, "sampling/sampling_logp_difference/mean": 0.012702877633273602, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 175.296875, "completions/mean_terminated_length": 175.296875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.347293496131897, "epoch": 0.479646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.020463314727654638, "kl": 0.016730479896068573, "learning_rate": 9.423409019370014e-07, "loss": 0.0002, "num_tokens": 6464426.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3111567497253418, "sampling/importance_sampling_ratio/mean": 1.000911831855774, "sampling/importance_sampling_ratio/min": 0.7393708229064941, "sampling/sampling_logp_difference/max": 0.3019556999206543, "sampling/sampling_logp_difference/mean": 0.013450185768306255, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5141062140464783, "epoch": 0.4814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 2.4344588412566908, "kl": 0.022841716185212135, "learning_rate": 9.416187356085512e-07, "loss": -0.0212, "num_tokens": 6487398.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3609944581985474, "sampling/importance_sampling_ratio/mean": 0.9997943639755249, "sampling/importance_sampling_ratio/min": 0.6609594821929932, "sampling/sampling_logp_difference/max": 0.4140627384185791, "sampling/sampling_logp_difference/mean": 0.017085053026676178, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 127.890625, "completions/mean_terminated_length": 127.890625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.3545767068862915, "epoch": 0.4831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.02907967606263098, "kl": 0.01671958714723587, "learning_rate": 9.408923551786742e-07, "loss": 0.0002, "num_tokens": 6507311.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3642396926879883, "sampling/importance_sampling_ratio/mean": 0.999678373336792, "sampling/importance_sampling_ratio/min": 0.7404797673225403, "sampling/sampling_logp_difference/max": 0.31059730052948, "sampling/sampling_logp_difference/mean": 0.013313070870935917, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 113.53125, "completions/mean_terminated_length": 113.53125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.38214343786239624, "epoch": 0.4849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.03758927376184869, "kl": 0.02708502858877182, "learning_rate": 9.40161767578781e-07, "loss": 0.0003, "num_tokens": 6525873.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5448487997055054, "sampling/importance_sampling_ratio/mean": 1.0005196332931519, "sampling/importance_sampling_ratio/min": 0.7083582878112793, "sampling/sampling_logp_difference/max": 0.43492603302001953, "sampling/sampling_logp_difference/mean": 0.014143557287752628, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 81.765625, "completions/mean_terminated_length": 81.765625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3427674472332001, "epoch": 0.48672566371681414, "frac_reward_zero_std": 1.0, "grad_norm": 0.05314824991827773, "kl": 0.02233726531267166, "learning_rate": 9.394269797804288e-07, "loss": 0.0002, "num_tokens": 6541346.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4305939674377441, "sampling/importance_sampling_ratio/mean": 0.9996143579483032, "sampling/importance_sampling_ratio/min": 0.7682305574417114, "sampling/sampling_logp_difference/max": 0.3580896854400635, "sampling/sampling_logp_difference/mean": 0.014109227806329727, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 176.171875, "completions/mean_terminated_length": 176.171875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.5313659906387329, "epoch": 0.48849557522123893, "frac_reward_zero_std": 0.5, "grad_norm": 1.3572990405303726, "kl": 0.03350096940994263, "learning_rate": 9.386879987952549e-07, "loss": -0.0546, "num_tokens": 6571741.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3507276773452759, "sampling/importance_sampling_ratio/mean": 0.9998536109924316, "sampling/importance_sampling_ratio/min": 0.7178463339805603, "sampling/sampling_logp_difference/max": 0.3314998149871826, "sampling/sampling_logp_difference/mean": 0.01627851277589798, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 116.765625, "completions/mean_terminated_length": 116.765625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.2943428158760071, "epoch": 0.4902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.030513593384564313, "kl": 0.01798875629901886, "learning_rate": 9.37944831674909e-07, "loss": 0.0002, "num_tokens": 6587982.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2874562740325928, "sampling/importance_sampling_ratio/mean": 1.0003066062927246, "sampling/importance_sampling_ratio/min": 0.771958589553833, "sampling/sampling_logp_difference/max": 0.25882434844970703, "sampling/sampling_logp_difference/mean": 0.0119395786896348, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.37061136960983276, "epoch": 0.4920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.029594369847415546, "kl": 0.022415220737457275, "learning_rate": 9.371974855109874e-07, "loss": 0.0003, "num_tokens": 6610534.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5768307447433472, "sampling/importance_sampling_ratio/mean": 1.0004709959030151, "sampling/importance_sampling_ratio/min": 0.7646911144256592, "sampling/sampling_logp_difference/max": 0.4554169178009033, "sampling/sampling_logp_difference/mean": 0.012888370081782341, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.43819940090179443, "epoch": 0.49380530973451325, "frac_reward_zero_std": 0.75, "grad_norm": 1.3835528659574545, "kl": 0.026608847081661224, "learning_rate": 9.36445967434964e-07, "loss": -0.0045, "num_tokens": 6629078.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4258359670639038, "sampling/importance_sampling_ratio/mean": 1.000093936920166, "sampling/importance_sampling_ratio/min": 0.7807312607765198, "sampling/sampling_logp_difference/max": 0.35475826263427734, "sampling/sampling_logp_difference/mean": 0.015082063153386116, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 87.5625, "completions/mean_terminated_length": 87.5625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.38979020714759827, "epoch": 0.49557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.04454170190976207, "kl": 0.02234538458287716, "learning_rate": 9.356902846181228e-07, "loss": 0.0002, "num_tokens": 6643738.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3670485019683838, "sampling/importance_sampling_ratio/mean": 1.000714659690857, "sampling/importance_sampling_ratio/min": 0.683316171169281, "sampling/sampling_logp_difference/max": 0.3807976245880127, "sampling/sampling_logp_difference/mean": 0.016383085399866104, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 139.859375, "completions/mean_terminated_length": 139.859375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3762098550796509, "epoch": 0.49734513274336284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0335147567998549, "kl": 0.02258431538939476, "learning_rate": 9.349304442714895e-07, "loss": 0.0002, "num_tokens": 6662945.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5072108507156372, "sampling/importance_sampling_ratio/mean": 1.000349998474121, "sampling/importance_sampling_ratio/min": 0.6849249005317688, "sampling/sampling_logp_difference/max": 0.41026079654693604, "sampling/sampling_logp_difference/mean": 0.013234042562544346, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 219.0625, "completions/mean_terminated_length": 219.0625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.5574349164962769, "epoch": 0.49911504424778763, "frac_reward_zero_std": 0.75, "grad_norm": 0.7158336559619782, "kl": 0.02687021903693676, "learning_rate": 9.341664536457625e-07, "loss": -0.0068, "num_tokens": 6696949.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5546824932098389, "sampling/importance_sampling_ratio/mean": 1.0002248287200928, "sampling/importance_sampling_ratio/min": 0.7115757465362549, "sampling/sampling_logp_difference/max": 0.4412713050842285, "sampling/sampling_logp_difference/mean": 0.01624004729092121, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.5636672973632812, "epoch": 0.5008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 0.9148229273005299, "kl": 0.030749784782528877, "learning_rate": 9.33398320031244e-07, "loss": -0.021, "num_tokens": 6723189.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.2864587306976318, "sampling/importance_sampling_ratio/mean": 0.9990864992141724, "sampling/importance_sampling_ratio/min": 0.7221028208732605, "sampling/sampling_logp_difference/max": 0.32558774948120117, "sampling/sampling_logp_difference/mean": 0.01665741764008999, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 192.515625, "completions/mean_terminated_length": 192.515625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.40647560358047485, "epoch": 0.5026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 1.0426483698427593, "kl": 0.024281399324536324, "learning_rate": 9.3262605075777e-07, "loss": -0.0033, "num_tokens": 6747958.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4269031286239624, "sampling/importance_sampling_ratio/mean": 0.9995687007904053, "sampling/importance_sampling_ratio/min": 0.7290580868721008, "sampling/sampling_logp_difference/max": 0.35550642013549805, "sampling/sampling_logp_difference/mean": 0.013786457479000092, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 155.015625, "completions/mean_terminated_length": 155.015625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5595388412475586, "epoch": 0.504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.0880408883161694, "kl": 0.0223354771733284, "learning_rate": 9.318496531946409e-07, "loss": -0.002, "num_tokens": 6773559.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3138937950134277, "sampling/importance_sampling_ratio/mean": 1.0004900693893433, "sampling/importance_sampling_ratio/min": 0.6217941045761108, "sampling/sampling_logp_difference/max": 0.4751462936401367, "sampling/sampling_logp_difference/mean": 0.017683856189250946, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 162.078125, "completions/mean_terminated_length": 162.078125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.4235331416130066, "epoch": 0.5061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 1.01426347718253, "kl": 0.02303442731499672, "learning_rate": 9.310691347505505e-07, "loss": -0.0334, "num_tokens": 6795196.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000581741333008, "sampling/importance_sampling_ratio/min": 0.6324044466018677, "sampling/sampling_logp_difference/max": 1.2133370637893677, "sampling/sampling_logp_difference/mean": 0.014442702755331993, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 214.796875, "completions/mean_terminated_length": 214.796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3913472890853882, "epoch": 0.5079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.6340974690895596, "kl": 0.02389129437506199, "learning_rate": 9.30284502873516e-07, "loss": 0.0608, "num_tokens": 6818431.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4247369766235352, "sampling/importance_sampling_ratio/mean": 1.0000317096710205, "sampling/importance_sampling_ratio/min": 0.7060527205467224, "sampling/sampling_logp_difference/max": 0.3539872169494629, "sampling/sampling_logp_difference/mean": 0.012021524831652641, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 180.59375, "completions/mean_terminated_length": 180.59375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33694714307785034, "epoch": 0.5097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 1.067956647867076, "kl": 0.016941959038376808, "learning_rate": 9.294957650508064e-07, "loss": 0.0388, "num_tokens": 6839637.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4524016380310059, "sampling/importance_sampling_ratio/mean": 1.0009417533874512, "sampling/importance_sampling_ratio/min": 0.778668224811554, "sampling/sampling_logp_difference/max": 0.3732185363769531, "sampling/sampling_logp_difference/mean": 0.013980678282678127, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 173.484375, "completions/mean_terminated_length": 173.484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.4274466037750244, "epoch": 0.511504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 0.8307748556586433, "kl": 0.023602060973644257, "learning_rate": 9.287029288088716e-07, "loss": 0.0628, "num_tokens": 6863236.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.403639316558838, "sampling/importance_sampling_ratio/mean": 0.9999457597732544, "sampling/importance_sampling_ratio/min": 0.6135632395744324, "sampling/sampling_logp_difference/max": 0.48847198486328125, "sampling/sampling_logp_difference/mean": 0.013387692160904408, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5357771515846252, "epoch": 0.5132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 1.4439232532658601, "kl": 0.02527947910130024, "learning_rate": 9.279060017132697e-07, "loss": 0.0317, "num_tokens": 6890132.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3950165510177612, "sampling/importance_sampling_ratio/mean": 1.0003771781921387, "sampling/importance_sampling_ratio/min": 0.7121195197105408, "sampling/sampling_logp_difference/max": 0.33950960636138916, "sampling/sampling_logp_difference/mean": 0.01664656028151512, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 194.640625, "completions/mean_terminated_length": 194.640625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.4595835208892822, "epoch": 0.5150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.0524872759313664, "kl": 0.032371461391448975, "learning_rate": 9.271049913685959e-07, "loss": 0.0474, "num_tokens": 6913997.0, "reward": -0.1875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.2739500999450684, "sampling/importance_sampling_ratio/mean": 0.9998501539230347, "sampling/importance_sampling_ratio/min": 0.7022725939750671, "sampling/sampling_logp_difference/max": 0.35343360900878906, "sampling/sampling_logp_difference/mean": 0.014543505385518074, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.561547577381134, "epoch": 0.5168141592920354, "frac_reward_zero_std": 0.0, "grad_norm": 1.4389190025077898, "kl": 0.029952242970466614, "learning_rate": 9.262999054184091e-07, "loss": -0.0037, "num_tokens": 6938917.0, "reward": 0.03125, "reward_std": 0.8409290909767151, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4523767232894897, "sampling/importance_sampling_ratio/mean": 1.0001108646392822, "sampling/importance_sampling_ratio/min": 0.7210462093353271, "sampling/sampling_logp_difference/max": 0.3732013702392578, "sampling/sampling_logp_difference/mean": 0.015093608759343624, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 164.015625, "completions/mean_terminated_length": 164.015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.47052571177482605, "epoch": 0.5185840707964602, "frac_reward_zero_std": 0.75, "grad_norm": 0.8424302095536896, "kl": 0.024961790069937706, "learning_rate": 9.254907515451591e-07, "loss": 0.0171, "num_tokens": 6960054.0, "reward": -0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.322510004043579, "sampling/importance_sampling_ratio/mean": 0.9995924830436707, "sampling/importance_sampling_ratio/min": 0.6906332969665527, "sampling/sampling_logp_difference/max": 0.3701462745666504, "sampling/sampling_logp_difference/mean": 0.013942908495664597, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 206.6875, "completions/mean_terminated_length": 206.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.7058727741241455, "epoch": 0.5203539823008849, "frac_reward_zero_std": 0.0, "grad_norm": 1.62044663550059, "kl": 0.03815227746963501, "learning_rate": 9.246775374701138e-07, "loss": -0.0493, "num_tokens": 7006594.0, "reward": 0.6875, "reward_std": 0.6707825064659119, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2923325300216675, "sampling/importance_sampling_ratio/mean": 0.9999611973762512, "sampling/importance_sampling_ratio/min": 0.7103539705276489, "sampling/sampling_logp_difference/max": 0.3419919013977051, "sampling/sampling_logp_difference/mean": 0.019572153687477112, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 97.765625, "completions/mean_terminated_length": 97.765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.38359206914901733, "epoch": 0.5221238938053098, "frac_reward_zero_std": 1.0, "grad_norm": 0.02490290600044646, "kl": 0.01738068275153637, "learning_rate": 9.23860270953285e-07, "loss": 0.0002, "num_tokens": 7022163.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.414842128753662, "sampling/importance_sampling_ratio/mean": 1.000035047531128, "sampling/importance_sampling_ratio/min": 0.32179585099220276, "sampling/sampling_logp_difference/max": 1.1338379383087158, "sampling/sampling_logp_difference/mean": 0.012069846503436565, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 153.703125, "completions/mean_terminated_length": 153.703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.40974512696266174, "epoch": 0.5238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 0.9243552264610663, "kl": 0.02297171577811241, "learning_rate": 9.230389597933543e-07, "loss": 0.0106, "num_tokens": 7042400.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2644307613372803, "sampling/importance_sampling_ratio/mean": 0.9998427629470825, "sampling/importance_sampling_ratio/min": 0.6128631830215454, "sampling/sampling_logp_difference/max": 0.48961353302001953, "sampling/sampling_logp_difference/mean": 0.013672401197254658, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 121.984375, "completions/mean_terminated_length": 121.984375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.5432904958724976, "epoch": 0.5256637168141592, "frac_reward_zero_std": 0.5, "grad_norm": 1.6418258495983953, "kl": 0.028919009491801262, "learning_rate": 9.222136118275995e-07, "loss": 0.021, "num_tokens": 7061647.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3228206634521484, "sampling/importance_sampling_ratio/mean": 1.0008418560028076, "sampling/importance_sampling_ratio/min": 0.6871564984321594, "sampling/sampling_logp_difference/max": 0.3751932382583618, "sampling/sampling_logp_difference/mean": 0.015978515148162842, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 98.71875, "completions/mean_terminated_length": 98.71875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3812759518623352, "epoch": 0.5274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.026281657354558577, "kl": 0.014870252460241318, "learning_rate": 9.213842349318184e-07, "loss": 0.0001, "num_tokens": 7077005.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.266287922859192, "sampling/importance_sampling_ratio/mean": 0.999727189540863, "sampling/importance_sampling_ratio/min": 0.7114967703819275, "sampling/sampling_logp_difference/max": 0.34038448333740234, "sampling/sampling_logp_difference/mean": 0.012317991815507412, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 240.671875, "completions/mean_terminated_length": 240.671875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.4979361891746521, "epoch": 0.5292035398230088, "frac_reward_zero_std": 0.5, "grad_norm": 0.8069973738942912, "kl": 0.020270682871341705, "learning_rate": 9.205508370202551e-07, "loss": -0.0164, "num_tokens": 7105400.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3977611064910889, "sampling/importance_sampling_ratio/mean": 1.0000991821289062, "sampling/importance_sampling_ratio/min": 0.625463604927063, "sampling/sampling_logp_difference/max": 0.46926212310791016, "sampling/sampling_logp_difference/mean": 0.01386222429573536, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5659000873565674, "epoch": 0.5309734513274337, "frac_reward_zero_std": 0.25, "grad_norm": 1.458864330187627, "kl": 0.027280788868665695, "learning_rate": 9.197134260455233e-07, "loss": 0.0619, "num_tokens": 7131328.0, "reward": 0.15625, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.2639267444610596, "sampling/importance_sampling_ratio/mean": 0.9993382692337036, "sampling/importance_sampling_ratio/min": 0.681110143661499, "sampling/sampling_logp_difference/max": 0.3840312957763672, "sampling/sampling_logp_difference/mean": 0.015114151872694492, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 141.109375, "completions/mean_terminated_length": 141.109375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5378804206848145, "epoch": 0.5327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 1.3592462061583312, "kl": 0.02363814041018486, "learning_rate": 9.188720099985315e-07, "loss": 0.0231, "num_tokens": 7152919.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.38033926486969, "sampling/importance_sampling_ratio/mean": 1.0007405281066895, "sampling/importance_sampling_ratio/min": 0.7704980969429016, "sampling/sampling_logp_difference/max": 0.3223292827606201, "sampling/sampling_logp_difference/mean": 0.01572255790233612, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.5824719667434692, "epoch": 0.5345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 1.3574428418747804, "kl": 0.028891537338495255, "learning_rate": 9.180265969084056e-07, "loss": 0.0287, "num_tokens": 7175263.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.404373049736023, "sampling/importance_sampling_ratio/mean": 1.0004374980926514, "sampling/importance_sampling_ratio/min": 0.7618606090545654, "sampling/sampling_logp_difference/max": 0.33959102630615234, "sampling/sampling_logp_difference/mean": 0.016006726771593094, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 151.40625, "completions/mean_terminated_length": 151.40625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5031188726425171, "epoch": 0.536283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.1136881456238523, "kl": 0.02510761097073555, "learning_rate": 9.171771948424136e-07, "loss": -0.0263, "num_tokens": 7197417.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2919437885284424, "sampling/importance_sampling_ratio/mean": 0.9999505281448364, "sampling/importance_sampling_ratio/min": 0.7065415382385254, "sampling/sampling_logp_difference/max": 0.34737324714660645, "sampling/sampling_logp_difference/mean": 0.014848611317574978, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 112.15625, "completions/mean_terminated_length": 112.15625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.44976624846458435, "epoch": 0.5380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 1.1007404401836094, "kl": 0.024789299815893173, "learning_rate": 9.163238119058871e-07, "loss": 0.0096, "num_tokens": 7214419.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.2969236373901367, "sampling/importance_sampling_ratio/mean": 1.0000333786010742, "sampling/importance_sampling_ratio/min": 0.7929874658584595, "sampling/sampling_logp_difference/max": 0.2599949836730957, "sampling/sampling_logp_difference/mean": 0.013627534732222557, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.4234926104545593, "epoch": 0.5398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.024452359261084817, "kl": 0.017473291605710983, "learning_rate": 9.154664562421453e-07, "loss": 0.0002, "num_tokens": 7233589.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.397467851638794, "sampling/importance_sampling_ratio/mean": 0.9993629455566406, "sampling/importance_sampling_ratio/min": 0.7854596972465515, "sampling/sampling_logp_difference/max": 0.33466196060180664, "sampling/sampling_logp_difference/mean": 0.013136005029082298, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 142.890625, "completions/mean_terminated_length": 142.890625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5456439852714539, "epoch": 0.5415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.3200248064246203, "kl": 0.021501999348402023, "learning_rate": 9.146051360324165e-07, "loss": 0.0571, "num_tokens": 7253838.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3971199989318848, "sampling/importance_sampling_ratio/mean": 1.000162959098816, "sampling/importance_sampling_ratio/min": 0.7829035520553589, "sampling/sampling_logp_difference/max": 0.3344130516052246, "sampling/sampling_logp_difference/mean": 0.015179058536887169, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.48661133646965027, "epoch": 0.5433628318584071, "frac_reward_zero_std": 0.5, "grad_norm": 1.1743043463083715, "kl": 0.022761614993214607, "learning_rate": 9.137398594957603e-07, "loss": 0.0482, "num_tokens": 7274886.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.2921069860458374, "sampling/importance_sampling_ratio/mean": 0.9999053478240967, "sampling/importance_sampling_ratio/min": 0.7394416332244873, "sampling/sampling_logp_difference/max": 0.30185985565185547, "sampling/sampling_logp_difference/mean": 0.014484938234090805, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 202.484375, "completions/mean_terminated_length": 202.484375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.4843139052391052, "epoch": 0.5451327433628319, "frac_reward_zero_std": 0.5, "grad_norm": 1.226443483937684, "kl": 0.01852584257721901, "learning_rate": 9.128706348889894e-07, "loss": -0.0612, "num_tokens": 7298373.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3292255401611328, "sampling/importance_sampling_ratio/mean": 0.999993085861206, "sampling/importance_sampling_ratio/min": 0.70584636926651, "sampling/sampling_logp_difference/max": 0.3483576774597168, "sampling/sampling_logp_difference/mean": 0.013197941705584526, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 189.96875, "completions/mean_terminated_length": 189.96875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.46538448333740234, "epoch": 0.5469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 0.8332203307381577, "kl": 0.01989561878144741, "learning_rate": 9.1199747050659e-07, "loss": 0.013, "num_tokens": 7322819.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000416040420532, "sampling/importance_sampling_ratio/min": 0.6935426592826843, "sampling/sampling_logp_difference/max": 0.6975505352020264, "sampling/sampling_logp_difference/mean": 0.0133647620677948, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 213.15625, "completions/mean_terminated_length": 213.15625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4567338824272156, "epoch": 0.5486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 0.9703156652302585, "kl": 0.018455907702445984, "learning_rate": 9.111203746806439e-07, "loss": -0.0896, "num_tokens": 7346813.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4165599346160889, "sampling/importance_sampling_ratio/mean": 1.0005401372909546, "sampling/importance_sampling_ratio/min": 0.7809019684791565, "sampling/sampling_logp_difference/max": 0.34823131561279297, "sampling/sampling_logp_difference/mean": 0.013046861626207829, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 207.6875, "completions/mean_terminated_length": 207.6875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5189756155014038, "epoch": 0.5504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 0.7942999346115056, "kl": 0.01731109619140625, "learning_rate": 9.102393557807476e-07, "loss": -0.0121, "num_tokens": 7372585.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.2809677124023438, "sampling/importance_sampling_ratio/mean": 1.0002483129501343, "sampling/importance_sampling_ratio/min": 0.6814830899238586, "sampling/sampling_logp_difference/max": 0.38348388671875, "sampling/sampling_logp_difference/mean": 0.014638138003647327, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 149.46875, "completions/mean_terminated_length": 149.46875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.4825288951396942, "epoch": 0.552212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.02176378670283425, "kl": 0.020055875182151794, "learning_rate": 9.093544222139337e-07, "loss": 0.0002, "num_tokens": 7392487.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3002797365188599, "sampling/importance_sampling_ratio/mean": 1.000125765800476, "sampling/importance_sampling_ratio/min": 0.7737025618553162, "sampling/sampling_logp_difference/max": 0.26257944107055664, "sampling/sampling_logp_difference/mean": 0.013001724146306515, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 102.609375, "completions/mean_terminated_length": 102.609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.4079360365867615, "epoch": 0.5539823008849557, "frac_reward_zero_std": 1.0, "grad_norm": 0.03275681829976402, "kl": 0.020958352833986282, "learning_rate": 9.084655824245897e-07, "loss": 0.0002, "num_tokens": 7409486.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2769980430603027, "sampling/importance_sampling_ratio/mean": 1.0000150203704834, "sampling/importance_sampling_ratio/min": 0.7881141304969788, "sampling/sampling_logp_difference/max": 0.24451208114624023, "sampling/sampling_logp_difference/mean": 0.011911412701010704, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 138.1875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5443404912948608, "epoch": 0.5557522123893806, "frac_reward_zero_std": 0.5, "grad_norm": 1.4474139154472354, "kl": 0.030087875202298164, "learning_rate": 9.075728448943781e-07, "loss": 0.0294, "num_tokens": 7429802.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2724729776382446, "sampling/importance_sampling_ratio/mean": 0.99989914894104, "sampling/importance_sampling_ratio/min": 0.7774305939674377, "sampling/sampling_logp_difference/max": 0.25176095962524414, "sampling/sampling_logp_difference/mean": 0.015302419662475586, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 216.109375, "completions/mean_terminated_length": 216.109375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.6859138011932373, "epoch": 0.5575221238938053, "frac_reward_zero_std": 0.25, "grad_norm": 1.39794973613683, "kl": 0.036491651087999344, "learning_rate": 9.066762181421552e-07, "loss": -0.0715, "num_tokens": 7463761.0, "reward": 0.0, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.295169711112976, "sampling/importance_sampling_ratio/mean": 1.0003595352172852, "sampling/importance_sampling_ratio/min": 0.7082785964012146, "sampling/sampling_logp_difference/max": 0.34491777420043945, "sampling/sampling_logp_difference/mean": 0.017996162176132202, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 154.140625, "completions/mean_terminated_length": 154.140625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.47537490725517273, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 0.991926512667197, "kl": 0.017289787530899048, "learning_rate": 9.057757107238894e-07, "loss": -0.0574, "num_tokens": 7484026.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.2754137516021729, "sampling/importance_sampling_ratio/mean": 0.9996475577354431, "sampling/importance_sampling_ratio/min": 0.7784848213195801, "sampling/sampling_logp_difference/max": 0.25040578842163086, "sampling/sampling_logp_difference/mean": 0.013836858794093132, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 114.546875, "completions/mean_terminated_length": 114.546875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4277423024177551, "epoch": 0.5610619469026549, "frac_reward_zero_std": 0.75, "grad_norm": 1.3896407070115557, "kl": 0.01902003400027752, "learning_rate": 9.048713312325804e-07, "loss": 0.1314, "num_tokens": 7501021.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2346246242523193, "sampling/importance_sampling_ratio/mean": 1.0006167888641357, "sampling/importance_sampling_ratio/min": 0.7484177350997925, "sampling/sampling_logp_difference/max": 0.2897939682006836, "sampling/sampling_logp_difference/mean": 0.01313665322959423, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 151.765625, "completions/mean_terminated_length": 151.765625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.48448890447616577, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 0.9881770048454785, "kl": 0.02129141055047512, "learning_rate": 9.039630882981768e-07, "loss": -0.0296, "num_tokens": 7521870.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4085352420806885, "sampling/importance_sampling_ratio/mean": 0.9999593496322632, "sampling/importance_sampling_ratio/min": 0.704167902469635, "sampling/sampling_logp_difference/max": 0.350738525390625, "sampling/sampling_logp_difference/mean": 0.01419967319816351, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 226.8125, "completions/mean_terminated_length": 226.8125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.46544888615608215, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.5, "grad_norm": 0.8774899547663088, "kl": 0.019584359601140022, "learning_rate": 9.030509905874932e-07, "loss": -0.0685, "num_tokens": 7549474.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.276070475578308, "sampling/importance_sampling_ratio/mean": 1.0003020763397217, "sampling/importance_sampling_ratio/min": 0.7528629302978516, "sampling/sampling_logp_difference/max": 0.283872127532959, "sampling/sampling_logp_difference/mean": 0.013035607524216175, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 233.609375, "completions/mean_terminated_length": 233.609375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.46485772728919983, "epoch": 0.5663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 0.755352491644238, "kl": 0.019727567210793495, "learning_rate": 9.021350468041287e-07, "loss": -0.0114, "num_tokens": 7576521.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2966866493225098, "sampling/importance_sampling_ratio/mean": 0.9996893405914307, "sampling/importance_sampling_ratio/min": 0.753505289554596, "sampling/sampling_logp_difference/max": 0.2830193042755127, "sampling/sampling_logp_difference/mean": 0.013445643708109856, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 210.65625, "completions/mean_terminated_length": 210.65625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.5262316465377808, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.75, "grad_norm": 0.7933488273607078, "kl": 0.026277896016836166, "learning_rate": 9.012152656883822e-07, "loss": 0.0031, "num_tokens": 7604499.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2737895250320435, "sampling/importance_sampling_ratio/mean": 0.9998112320899963, "sampling/importance_sampling_ratio/min": 0.6325240135192871, "sampling/sampling_logp_difference/max": 0.4580371379852295, "sampling/sampling_logp_difference/mean": 0.015091046690940857, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 148.203125, "completions/mean_terminated_length": 148.203125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.37113308906555176, "epoch": 0.5699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.019660894801485192, "kl": 0.017514094710350037, "learning_rate": 9.002916560171712e-07, "loss": 0.0002, "num_tokens": 7623232.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3000627756118774, "sampling/importance_sampling_ratio/mean": 0.9994478225708008, "sampling/importance_sampling_ratio/min": 0.7794197201728821, "sampling/sampling_logp_difference/max": 0.26241254806518555, "sampling/sampling_logp_difference/mean": 0.012616605497896671, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 415.953125, "completions/mean_terminated_length": 415.953125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.39512285590171814, "epoch": 0.5716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 0.4974206210022354, "kl": 0.01768594980239868, "learning_rate": 8.993642266039456e-07, "loss": 0.0905, "num_tokens": 7662717.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3046001195907593, "sampling/importance_sampling_ratio/mean": 1.0000929832458496, "sampling/importance_sampling_ratio/min": 0.7399448752403259, "sampling/sampling_logp_difference/max": 0.3011796474456787, "sampling/sampling_logp_difference/mean": 0.010675627738237381, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 204.265625, "completions/mean_terminated_length": 204.265625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5697576403617859, "epoch": 0.5734513274336284, "frac_reward_zero_std": 0.25, "grad_norm": 1.4140480069648567, "kl": 0.03068055957555771, "learning_rate": 8.984329862986055e-07, "loss": 0.0636, "num_tokens": 7689854.0, "reward": 0.09375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.407089114189148, "sampling/importance_sampling_ratio/mean": 1.0000618696212769, "sampling/importance_sampling_ratio/min": 0.7086215615272522, "sampling/sampling_logp_difference/max": 0.34443366527557373, "sampling/sampling_logp_difference/mean": 0.01600939780473709, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 295.859375, "completions/mean_terminated_length": 295.859375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.38906726241111755, "epoch": 0.5752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.7529332961020591, "kl": 0.020837359130382538, "learning_rate": 8.97497943987416e-07, "loss": -0.0008, "num_tokens": 7719093.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4352035522460938, "sampling/importance_sampling_ratio/mean": 0.9998739361763, "sampling/importance_sampling_ratio/min": 0.14447951316833496, "sampling/sampling_logp_difference/max": 1.9346176385879517, "sampling/sampling_logp_difference/mean": 0.010943189263343811, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 107.984375, "completions/mean_terminated_length": 107.984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3338187038898468, "epoch": 0.5769911504424778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03538540017405453, "kl": 0.020005034282803535, "learning_rate": 8.96559108592922e-07, "loss": 0.0002, "num_tokens": 7734868.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2792854309082031, "sampling/importance_sampling_ratio/mean": 0.9996330738067627, "sampling/importance_sampling_ratio/min": 0.7042019963264465, "sampling/sampling_logp_difference/max": 0.3506901264190674, "sampling/sampling_logp_difference/mean": 0.011899251490831375, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 270.109375, "completions/mean_terminated_length": 270.109375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.420855313539505, "epoch": 0.5787610619469027, "frac_reward_zero_std": 0.75, "grad_norm": 0.570124350346202, "kl": 0.022403981536626816, "learning_rate": 8.956164890738642e-07, "loss": -0.0103, "num_tokens": 7763707.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.2796051502227783, "sampling/importance_sampling_ratio/mean": 0.9998582005500793, "sampling/importance_sampling_ratio/min": 0.7320122718811035, "sampling/sampling_logp_difference/max": 0.3119579553604126, "sampling/sampling_logp_difference/mean": 0.012368805706501007, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 180.03125, "completions/mean_terminated_length": 180.03125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3523303270339966, "epoch": 0.5805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.021947114091054803, "kl": 0.02078210934996605, "learning_rate": 8.946700944250924e-07, "loss": 0.0002, "num_tokens": 7785149.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.384071946144104, "sampling/importance_sampling_ratio/mean": 1.0001161098480225, "sampling/importance_sampling_ratio/min": 0.7214380502700806, "sampling/sampling_logp_difference/max": 0.3265087604522705, "sampling/sampling_logp_difference/mean": 0.011749118566513062, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 330.96875, "completions/mean_terminated_length": 330.96875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.45975786447525024, "epoch": 0.5823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 0.6860314136953702, "kl": 0.015220011584460735, "learning_rate": 8.937199336774804e-07, "loss": 0.0011, "num_tokens": 7818411.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3818371295928955, "sampling/importance_sampling_ratio/mean": 1.0002927780151367, "sampling/importance_sampling_ratio/min": 0.7682878375053406, "sampling/sampling_logp_difference/max": 0.3234138488769531, "sampling/sampling_logp_difference/mean": 0.012542881071567535, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5063283443450928, "epoch": 0.584070796460177, "frac_reward_zero_std": 0.25, "grad_norm": 0.9220189577399611, "kl": 0.02713492326438427, "learning_rate": 8.927660158978392e-07, "loss": 0.0034, "num_tokens": 7854243.0, "reward": 0.34375, "reward_std": 0.6205305457115173, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3607432842254639, "sampling/importance_sampling_ratio/mean": 0.9999664425849915, "sampling/importance_sampling_ratio/min": 0.6482208967208862, "sampling/sampling_logp_difference/max": 0.4335237741470337, "sampling/sampling_logp_difference/mean": 0.013684609904885292, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 233.90625, "completions/mean_terminated_length": 233.90625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3622668981552124, "epoch": 0.5858407079646017, "frac_reward_zero_std": 1.0, "grad_norm": 0.020275111802893722, "kl": 0.014940309338271618, "learning_rate": 8.918083501888316e-07, "loss": 0.0001, "num_tokens": 7880701.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2778414487838745, "sampling/importance_sampling_ratio/mean": 1.0001190900802612, "sampling/importance_sampling_ratio/min": 0.7774587273597717, "sampling/sampling_logp_difference/max": 0.2517247200012207, "sampling/sampling_logp_difference/mean": 0.010680027306079865, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 255.34375, "completions/mean_terminated_length": 255.34375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4821820557117462, "epoch": 0.5876106194690266, "frac_reward_zero_std": 0.5, "grad_norm": 0.892518587155143, "kl": 0.030800973996520042, "learning_rate": 8.908469456888843e-07, "loss": 0.0069, "num_tokens": 7910931.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.292553186416626, "sampling/importance_sampling_ratio/mean": 1.0001028776168823, "sampling/importance_sampling_ratio/min": 0.7805123925209045, "sampling/sampling_logp_difference/max": 0.2566194534301758, "sampling/sampling_logp_difference/mean": 0.013792471960186958, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 347.28125, "completions/mean_terminated_length": 347.28125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3561914563179016, "epoch": 0.5893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.018550518587126856, "kl": 0.020201288163661957, "learning_rate": 8.898818115721007e-07, "loss": 0.0002, "num_tokens": 7943077.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4352679252624512, "sampling/importance_sampling_ratio/mean": 0.999655544757843, "sampling/importance_sampling_ratio/min": 0.7496740221977234, "sampling/sampling_logp_difference/max": 0.36135149002075195, "sampling/sampling_logp_difference/mean": 0.010385926812887192, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 234.921875, "completions/mean_terminated_length": 234.921875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.3919602036476135, "epoch": 0.5911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.03056473211857754, "kl": 0.025840554386377335, "learning_rate": 8.889129570481741e-07, "loss": 0.0003, "num_tokens": 7969264.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3053627014160156, "sampling/importance_sampling_ratio/mean": 0.9997424483299255, "sampling/importance_sampling_ratio/min": 0.7295703291893005, "sampling/sampling_logp_difference/max": 0.31529951095581055, "sampling/sampling_logp_difference/mean": 0.011989651247859001, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 157.984375, "completions/mean_terminated_length": 157.984375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.38670140504837036, "epoch": 0.5929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.03241074858302365, "kl": 0.023697353899478912, "learning_rate": 8.879403913622996e-07, "loss": 0.0002, "num_tokens": 7989199.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2581428289413452, "sampling/importance_sampling_ratio/mean": 1.000050663948059, "sampling/importance_sampling_ratio/min": 0.765271782875061, "sampling/sampling_logp_difference/max": 0.26752424240112305, "sampling/sampling_logp_difference/mean": 0.012265531346201897, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 166.8125, "completions/mean_terminated_length": 166.8125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4962707757949829, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.5, "grad_norm": 1.3474338419464484, "kl": 0.02984827384352684, "learning_rate": 8.869641237950849e-07, "loss": 0.025, "num_tokens": 8011059.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4169665575027466, "sampling/importance_sampling_ratio/mean": 1.00020432472229, "sampling/importance_sampling_ratio/min": 0.7693047523498535, "sampling/sampling_logp_difference/max": 0.34851837158203125, "sampling/sampling_logp_difference/mean": 0.014645911753177643, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 247.234375, "completions/mean_terminated_length": 247.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.4767092764377594, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.5, "grad_norm": 1.029613210381509, "kl": 0.025278568267822266, "learning_rate": 8.859841636624631e-07, "loss": -0.1091, "num_tokens": 8037394.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5299617052078247, "sampling/importance_sampling_ratio/mean": 1.0001429319381714, "sampling/importance_sampling_ratio/min": 0.6745862364768982, "sampling/sampling_logp_difference/max": 0.42524266242980957, "sampling/sampling_logp_difference/mean": 0.014204558916389942, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 112.171875, "completions/mean_terminated_length": 112.171875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.40845632553100586, "epoch": 0.5982300884955752, "frac_reward_zero_std": 1.0, "grad_norm": 0.03141419672657545, "kl": 0.02248462289571762, "learning_rate": 8.850005203156034e-07, "loss": 0.0002, "num_tokens": 8055021.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4219577312469482, "sampling/importance_sampling_ratio/mean": 1.00075364112854, "sampling/importance_sampling_ratio/min": 0.6355901956558228, "sampling/sampling_logp_difference/max": 0.4532012939453125, "sampling/sampling_logp_difference/mean": 0.013014793395996094, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.30393433570861816, "epoch": 0.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.042140843731403005, "kl": 0.020060870796442032, "learning_rate": 8.84013203140821e-07, "loss": 0.0002, "num_tokens": 8074373.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2943553924560547, "sampling/importance_sampling_ratio/mean": 0.9997761249542236, "sampling/importance_sampling_ratio/min": 0.7235432267189026, "sampling/sampling_logp_difference/max": 0.3235950469970703, "sampling/sampling_logp_difference/mean": 0.009732896462082863, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 124.328125, "completions/mean_terminated_length": 124.328125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3979174494743347, "epoch": 0.6017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.03171287730256308, "kl": 0.021462097764015198, "learning_rate": 8.83022221559489e-07, "loss": 0.0002, "num_tokens": 8092842.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.315820336341858, "sampling/importance_sampling_ratio/mean": 1.0000431537628174, "sampling/importance_sampling_ratio/min": 0.7462828755378723, "sampling/sampling_logp_difference/max": 0.29265058040618896, "sampling/sampling_logp_difference/mean": 0.011864936910569668, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 209.921875, "completions/mean_terminated_length": 209.921875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.4495287537574768, "epoch": 0.6035398230088496, "frac_reward_zero_std": 0.75, "grad_norm": 0.7690487299485748, "kl": 0.02333666943013668, "learning_rate": 8.820275850279472e-07, "loss": 0.0007, "num_tokens": 8117461.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3653202056884766, "sampling/importance_sampling_ratio/mean": 0.9995042085647583, "sampling/importance_sampling_ratio/min": 0.7126814126968384, "sampling/sampling_logp_difference/max": 0.33872079849243164, "sampling/sampling_logp_difference/mean": 0.013177530840039253, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 236.046875, "completions/mean_terminated_length": 236.046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5031207799911499, "epoch": 0.6053097345132743, "frac_reward_zero_std": 0.5, "grad_norm": 1.1006038821948354, "kl": 0.027163395658135414, "learning_rate": 8.810293030374125e-07, "loss": 0.1049, "num_tokens": 8144056.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.2608977556228638, "sampling/importance_sampling_ratio/mean": 0.9999832510948181, "sampling/importance_sampling_ratio/min": 0.715685248374939, "sampling/sampling_logp_difference/max": 0.334514856338501, "sampling/sampling_logp_difference/mean": 0.013897942379117012, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 173.640625, "completions/mean_terminated_length": 173.640625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.3988833725452423, "epoch": 0.6070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 0.8254985364154142, "kl": 0.023361889645457268, "learning_rate": 8.800273851138882e-07, "loss": 0.0247, "num_tokens": 8165329.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2623324394226074, "sampling/importance_sampling_ratio/mean": 1.0005016326904297, "sampling/importance_sampling_ratio/min": 0.788011908531189, "sampling/sampling_logp_difference/max": 0.23824214935302734, "sampling/sampling_logp_difference/mean": 0.011087987571954727, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.48774707317352295, "epoch": 0.6088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 0.8690617359332877, "kl": 0.025597820058465004, "learning_rate": 8.790218408180734e-07, "loss": 0.0323, "num_tokens": 8188849.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2785621881484985, "sampling/importance_sampling_ratio/mean": 0.9999054670333862, "sampling/importance_sampling_ratio/min": 0.6738258004188538, "sampling/sampling_logp_difference/max": 0.39478373527526855, "sampling/sampling_logp_difference/mean": 0.01327909342944622, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 251.671875, "completions/mean_terminated_length": 251.671875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.45492517948150635, "epoch": 0.6106194690265486, "frac_reward_zero_std": 0.75, "grad_norm": 0.6125973487740176, "kl": 0.02505698800086975, "learning_rate": 8.780126797452712e-07, "loss": -0.0226, "num_tokens": 8217676.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.269783616065979, "sampling/importance_sampling_ratio/mean": 0.9998481273651123, "sampling/importance_sampling_ratio/min": 0.7574979066848755, "sampling/sampling_logp_difference/max": 0.27773451805114746, "sampling/sampling_logp_difference/mean": 0.012534982524812222, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4061852991580963, "epoch": 0.6123893805309735, "frac_reward_zero_std": 0.75, "grad_norm": 0.5143229574855908, "kl": 0.026813451200723648, "learning_rate": 8.769999115252975e-07, "loss": -0.0131, "num_tokens": 8250492.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.507156252861023, "sampling/importance_sampling_ratio/mean": 0.9998631477355957, "sampling/importance_sampling_ratio/min": 0.613524317741394, "sampling/sampling_logp_difference/max": 0.48853540420532227, "sampling/sampling_logp_difference/mean": 0.011711902916431427, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 131.953125, "completions/mean_terminated_length": 131.953125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.47795355319976807, "epoch": 0.6141592920353982, "frac_reward_zero_std": 0.75, "grad_norm": 1.1440702137686438, "kl": 0.030107995495200157, "learning_rate": 8.759835458223887e-07, "loss": -0.0211, "num_tokens": 8269497.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2723186016082764, "sampling/importance_sampling_ratio/mean": 0.9996389150619507, "sampling/importance_sampling_ratio/min": 0.6929596066474915, "sampling/sampling_logp_difference/max": 0.36678361892700195, "sampling/sampling_logp_difference/mean": 0.013581225648522377, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 198.015625, "completions/mean_terminated_length": 198.015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.5331894159317017, "epoch": 0.6159292035398231, "frac_reward_zero_std": 0.5, "grad_norm": 1.27486141644626, "kl": 0.034227535128593445, "learning_rate": 8.749635923351106e-07, "loss": -0.0127, "num_tokens": 8293514.0, "reward": 0.625, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000253677368164, "sampling/importance_sampling_ratio/min": 0.7716612219810486, "sampling/sampling_logp_difference/max": 0.7810070514678955, "sampling/sampling_logp_difference/mean": 0.014890450984239578, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4829539954662323, "epoch": 0.6176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 0.9220496741001091, "kl": 0.026452170684933662, "learning_rate": 8.739400607962644e-07, "loss": 0.0372, "num_tokens": 8317226.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2848265171051025, "sampling/importance_sampling_ratio/mean": 1.0000731945037842, "sampling/importance_sampling_ratio/min": 0.7159280776977539, "sampling/sampling_logp_difference/max": 0.33417558670043945, "sampling/sampling_logp_difference/mean": 0.013107567094266415, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 178.15625, "completions/mean_terminated_length": 178.15625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.36726176738739014, "epoch": 0.6194690265486725, "frac_reward_zero_std": 1.0, "grad_norm": 0.02728767041319218, "kl": 0.022122027352452278, "learning_rate": 8.729129609727946e-07, "loss": 0.0002, "num_tokens": 8337188.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004839897155762, "sampling/importance_sampling_ratio/min": 0.7795758247375488, "sampling/sampling_logp_difference/max": 1.0377068519592285, "sampling/sampling_logp_difference/mean": 0.010822944343090057, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 222.28125, "completions/mean_terminated_length": 222.28125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.4941057562828064, "epoch": 0.6212389380530974, "frac_reward_zero_std": 0.75, "grad_norm": 0.9632157373634614, "kl": 0.029219551011919975, "learning_rate": 8.718823026656958e-07, "loss": 0.2543, "num_tokens": 8363046.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2820266485214233, "sampling/importance_sampling_ratio/mean": 0.999423623085022, "sampling/importance_sampling_ratio/min": 0.6483745574951172, "sampling/sampling_logp_difference/max": 0.4332866668701172, "sampling/sampling_logp_difference/mean": 0.013565474189817905, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 214.96875, "completions/mean_terminated_length": 214.96875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4676707983016968, "epoch": 0.6230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 1.1854348464836129, "kl": 0.031019899994134903, "learning_rate": 8.708480957099193e-07, "loss": -0.0407, "num_tokens": 8386180.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.2320605516433716, "sampling/importance_sampling_ratio/mean": 0.9995864033699036, "sampling/importance_sampling_ratio/min": 0.7639457583427429, "sampling/sampling_logp_difference/max": 0.2692584991455078, "sampling/sampling_logp_difference/mean": 0.011983351781964302, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 219.359375, "completions/mean_terminated_length": 219.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5043070316314697, "epoch": 0.6247787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.022541164699895557, "kl": 0.027150984853506088, "learning_rate": 8.698103499742783e-07, "loss": 0.0003, "num_tokens": 8410203.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2862666845321655, "sampling/importance_sampling_ratio/mean": 0.9999834299087524, "sampling/importance_sampling_ratio/min": 0.770699679851532, "sampling/sampling_logp_difference/max": 0.26045656204223633, "sampling/sampling_logp_difference/mean": 0.013735330663621426, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 183.578125, "completions/mean_terminated_length": 183.578125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4920017123222351, "epoch": 0.6265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 1.0858546442917074, "kl": 0.02852623723447323, "learning_rate": 8.687690753613554e-07, "loss": -0.034, "num_tokens": 8433712.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2511587142944336, "sampling/importance_sampling_ratio/mean": 0.9998182058334351, "sampling/importance_sampling_ratio/min": 0.7757064700126648, "sampling/sampling_logp_difference/max": 0.2539811134338379, "sampling/sampling_logp_difference/mean": 0.013604482635855675, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.420221745967865, "epoch": 0.6283185840707964, "frac_reward_zero_std": 1.0, "grad_norm": 0.028487046687618384, "kl": 0.022680988535284996, "learning_rate": 8.677242818074062e-07, "loss": 0.0002, "num_tokens": 8451264.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3862122297286987, "sampling/importance_sampling_ratio/mean": 1.0000605583190918, "sampling/importance_sampling_ratio/min": 0.6953662633895874, "sampling/sampling_logp_difference/max": 0.36331653594970703, "sampling/sampling_logp_difference/mean": 0.012932376936078072, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.40682464838027954, "epoch": 0.6300884955752213, "frac_reward_zero_std": 0.75, "grad_norm": 0.7597417921682754, "kl": 0.018179986625909805, "learning_rate": 8.666759792822661e-07, "loss": 0.0442, "num_tokens": 8491568.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3536229133605957, "sampling/importance_sampling_ratio/mean": 1.0000758171081543, "sampling/importance_sampling_ratio/min": 0.5983306169509888, "sampling/sampling_logp_difference/max": 0.5136117935180664, "sampling/sampling_logp_difference/mean": 0.01157149113714695, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 270.828125, "completions/mean_terminated_length": 270.828125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.44195038080215454, "epoch": 0.631858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.5876769931497832, "kl": 0.02279229275882244, "learning_rate": 8.656241777892542e-07, "loss": -0.0846, "num_tokens": 8519317.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2860088348388672, "sampling/importance_sampling_ratio/mean": 1.0000419616699219, "sampling/importance_sampling_ratio/min": 0.7190498113632202, "sampling/sampling_logp_difference/max": 0.32982468605041504, "sampling/sampling_logp_difference/mean": 0.011525126174092293, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 170.3125, "completions/mean_terminated_length": 170.3125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.4412679672241211, "epoch": 0.6336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 0.8777828390360136, "kl": 0.018211886286735535, "learning_rate": 8.645688873650784e-07, "loss": 0.0067, "num_tokens": 8539977.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3385186195373535, "sampling/importance_sampling_ratio/mean": 1.0000141859054565, "sampling/importance_sampling_ratio/min": 0.7725958228111267, "sampling/sampling_logp_difference/max": 0.2915635108947754, "sampling/sampling_logp_difference/mean": 0.012404140084981918, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 189.984375, "completions/mean_terminated_length": 189.984375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5027115345001221, "epoch": 0.6353982300884956, "frac_reward_zero_std": 0.75, "grad_norm": 0.7521287851566814, "kl": 0.02928530052304268, "learning_rate": 8.63510118079739e-07, "loss": -0.003, "num_tokens": 8562328.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.2942529916763306, "sampling/importance_sampling_ratio/mean": 0.9999889731407166, "sampling/importance_sampling_ratio/min": 0.7680959105491638, "sampling/sampling_logp_difference/max": 0.2638406753540039, "sampling/sampling_logp_difference/mean": 0.013064699247479439, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 179.78125, "completions/mean_terminated_length": 179.78125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.46562573313713074, "epoch": 0.6371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 1.0266261557224343, "kl": 0.023430999368429184, "learning_rate": 8.624478800364331e-07, "loss": 0.0226, "num_tokens": 8587178.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.2917325496673584, "sampling/importance_sampling_ratio/mean": 0.9993578791618347, "sampling/importance_sampling_ratio/min": 0.7845205664634705, "sampling/sampling_logp_difference/max": 0.2559843063354492, "sampling/sampling_logp_difference/mean": 0.012804657220840454, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 246.703125, "completions/mean_terminated_length": 246.703125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4675860106945038, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5742333191385458, "kl": 0.021360373124480247, "learning_rate": 8.613821833714583e-07, "loss": -0.0298, "num_tokens": 8614487.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.2880377769470215, "sampling/importance_sampling_ratio/mean": 0.9999352693557739, "sampling/importance_sampling_ratio/min": 0.7406852841377258, "sampling/sampling_logp_difference/max": 0.30017948150634766, "sampling/sampling_logp_difference/mean": 0.012621838599443436, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 361.546875, "completions/mean_terminated_length": 361.546875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.5509105920791626, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.5, "grad_norm": 0.7273417812754059, "kl": 0.020181788131594658, "learning_rate": 8.603130382541155e-07, "loss": -0.0775, "num_tokens": 8651898.0, "reward": -0.15625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.2890141010284424, "sampling/importance_sampling_ratio/mean": 1.0000405311584473, "sampling/importance_sampling_ratio/min": 0.6976709365844727, "sampling/sampling_logp_difference/max": 0.36000776290893555, "sampling/sampling_logp_difference/mean": 0.013328511267900467, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.4670611619949341, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 0.5990283199768982, "kl": 0.024382244795560837, "learning_rate": 8.592404548866122e-07, "loss": 0.0452, "num_tokens": 8682986.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.435893177986145, "sampling/importance_sampling_ratio/mean": 0.9993845224380493, "sampling/importance_sampling_ratio/min": 0.743273138999939, "sampling/sampling_logp_difference/max": 0.3617870807647705, "sampling/sampling_logp_difference/mean": 0.012076450511813164, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 148.78125, "completions/mean_terminated_length": 148.78125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5084958672523499, "epoch": 0.6442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 0.8807914316866654, "kl": 0.026441290974617004, "learning_rate": 8.58164443503965e-07, "loss": 0.0031, "num_tokens": 8704012.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.267411231994629, "sampling/importance_sampling_ratio/mean": 1.0002222061157227, "sampling/importance_sampling_ratio/min": 0.7764332890510559, "sampling/sampling_logp_difference/max": 0.25304460525512695, "sampling/sampling_logp_difference/mean": 0.013500794768333435, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 184.015625, "completions/mean_terminated_length": 184.015625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.67201828956604, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.5, "grad_norm": 1.2002168456687612, "kl": 0.036267057061195374, "learning_rate": 8.570850143739021e-07, "loss": 0.0212, "num_tokens": 8728157.0, "reward": -0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4485478401184082, "sampling/importance_sampling_ratio/mean": 1.000503659248352, "sampling/importance_sampling_ratio/min": 0.7837531566619873, "sampling/sampling_logp_difference/max": 0.3705615997314453, "sampling/sampling_logp_difference/mean": 0.016331903636455536, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 133.953125, "completions/mean_terminated_length": 133.953125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5630679726600647, "epoch": 0.647787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 1.2463041970502162, "kl": 0.026198048144578934, "learning_rate": 8.560021777967648e-07, "loss": -0.0111, "num_tokens": 8746554.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.2711914777755737, "sampling/importance_sampling_ratio/mean": 0.9995185136795044, "sampling/importance_sampling_ratio/min": 0.7593411803245544, "sampling/sampling_logp_difference/max": 0.27530407905578613, "sampling/sampling_logp_difference/mean": 0.015583851374685764, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 212.171875, "completions/mean_terminated_length": 212.171875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5904482007026672, "epoch": 0.6495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.838862630555474, "kl": 0.030584806576371193, "learning_rate": 8.549159441054104e-07, "loss": -0.0464, "num_tokens": 8772725.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3894929885864258, "sampling/importance_sampling_ratio/mean": 1.000403881072998, "sampling/importance_sampling_ratio/min": 0.6045659780502319, "sampling/sampling_logp_difference/max": 0.5032444596290588, "sampling/sampling_logp_difference/mean": 0.01460997387766838, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 244.46875, "completions/mean_terminated_length": 244.46875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5420571565628052, "epoch": 0.6513274336283186, "frac_reward_zero_std": 0.5, "grad_norm": 1.0746882462897032, "kl": 0.03214477747678757, "learning_rate": 8.538263236651117e-07, "loss": 0.1079, "num_tokens": 8799715.0, "reward": 0.21875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.2749974727630615, "sampling/importance_sampling_ratio/mean": 1.0002174377441406, "sampling/importance_sampling_ratio/min": 0.7694841623306274, "sampling/sampling_logp_difference/max": 0.26203489303588867, "sampling/sampling_logp_difference/mean": 0.013695253990590572, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 218.46875, "completions/mean_terminated_length": 218.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.6565658450126648, "epoch": 0.6530973451327433, "frac_reward_zero_std": 0.5, "grad_norm": 1.1216774953973006, "kl": 0.0351128950715065, "learning_rate": 8.527333268734606e-07, "loss": 0.0059, "num_tokens": 8825089.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.283918857574463, "sampling/importance_sampling_ratio/mean": 0.9998213648796082, "sampling/importance_sampling_ratio/min": 0.7703509330749512, "sampling/sampling_logp_difference/max": 0.2609090805053711, "sampling/sampling_logp_difference/mean": 0.015703830868005753, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 399.609375, "completions/mean_terminated_length": 399.609375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.5904992818832397, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.25, "grad_norm": 0.7965618169707703, "kl": 0.029476068913936615, "learning_rate": 8.516369641602661e-07, "loss": -0.0618, "num_tokens": 8866440.0, "reward": 0.65625, "reward_std": 0.5986068248748779, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.36335027217865, "sampling/importance_sampling_ratio/mean": 1.0002217292785645, "sampling/importance_sampling_ratio/min": 0.7621548771858215, "sampling/sampling_logp_difference/max": 0.30994510650634766, "sampling/sampling_logp_difference/mean": 0.01400639209896326, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 388.28125, "completions/mean_terminated_length": 388.28125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4588692784309387, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.5, "grad_norm": 0.6411460717233641, "kl": 0.021127849817276, "learning_rate": 8.505372459874571e-07, "loss": 0.0862, "num_tokens": 8902810.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3492194414138794, "sampling/importance_sampling_ratio/mean": 0.999609112739563, "sampling/importance_sampling_ratio/min": 0.20565012097358704, "sampling/sampling_logp_difference/max": 1.5815789699554443, "sampling/sampling_logp_difference/mean": 0.011133144609630108, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 179.953125, "completions/mean_terminated_length": 179.953125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.6086217761039734, "epoch": 0.6584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02371478519455344, "kl": 0.026376575231552124, "learning_rate": 8.494341828489812e-07, "loss": 0.0003, "num_tokens": 8925015.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2634663581848145, "sampling/importance_sampling_ratio/mean": 0.9996379017829895, "sampling/importance_sampling_ratio/min": 0.7802466750144958, "sampling/sampling_logp_difference/max": 0.24814510345458984, "sampling/sampling_logp_difference/mean": 0.014736920595169067, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 311.484375, "completions/mean_terminated_length": 311.484375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.708957314491272, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.25, "grad_norm": 0.788735143367135, "kl": 0.03220956400036812, "learning_rate": 8.483277852707052e-07, "loss": -0.0458, "num_tokens": 8957990.0, "reward": -0.0625, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2701094150543213, "sampling/importance_sampling_ratio/mean": 0.9999181032180786, "sampling/importance_sampling_ratio/min": 0.779296338558197, "sampling/sampling_logp_difference/max": 0.24936389923095703, "sampling/sampling_logp_difference/mean": 0.015684492886066437, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 198.390625, "completions/mean_terminated_length": 198.390625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.6557304859161377, "epoch": 0.6619469026548672, "frac_reward_zero_std": 1.0, "grad_norm": 0.027799589939456318, "kl": 0.03298699110746384, "learning_rate": 8.472180638103143e-07, "loss": 0.0003, "num_tokens": 8983631.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2837529182434082, "sampling/importance_sampling_ratio/mean": 1.000145673751831, "sampling/importance_sampling_ratio/min": 0.7200532555580139, "sampling/sampling_logp_difference/max": 0.32843017578125, "sampling/sampling_logp_difference/mean": 0.015664996579289436, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 263.078125, "completions/mean_terminated_length": 263.078125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6758696436882019, "epoch": 0.6637168141592921, "frac_reward_zero_std": 0.25, "grad_norm": 0.9696689842236048, "kl": 0.03473559021949768, "learning_rate": 8.461050290572113e-07, "loss": -0.0887, "num_tokens": 9012644.0, "reward": 0.65625, "reward_std": 0.6337460875511169, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3068387508392334, "sampling/importance_sampling_ratio/mean": 0.9997862577438354, "sampling/importance_sampling_ratio/min": 0.7731117606163025, "sampling/sampling_logp_difference/max": 0.267611026763916, "sampling/sampling_logp_difference/mean": 0.01557997427880764, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 276.640625, "completions/mean_terminated_length": 276.640625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.47760453820228577, "epoch": 0.6654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 0.7454731096303974, "kl": 0.022528115659952164, "learning_rate": 8.449886916324166e-07, "loss": -0.0718, "num_tokens": 9041101.0, "reward": 0.0, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4225870370864868, "sampling/importance_sampling_ratio/mean": 1.0000154972076416, "sampling/importance_sampling_ratio/min": 0.6955118775367737, "sampling/sampling_logp_difference/max": 0.36310720443725586, "sampling/sampling_logp_difference/mean": 0.012021130882203579, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 150.34375, "completions/mean_terminated_length": 150.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4635184705257416, "epoch": 0.6672566371681415, "frac_reward_zero_std": 0.75, "grad_norm": 0.9022270779814036, "kl": 0.033895909786224365, "learning_rate": 8.438690621884649e-07, "loss": -0.0177, "num_tokens": 9059715.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.2729942798614502, "sampling/importance_sampling_ratio/mean": 0.9998793005943298, "sampling/importance_sampling_ratio/min": 0.7799273729324341, "sampling/sampling_logp_difference/max": 0.24855446815490723, "sampling/sampling_logp_difference/mean": 0.01244885753840208, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 163.84375, "completions/mean_terminated_length": 163.84375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5203865766525269, "epoch": 0.6690265486725664, "frac_reward_zero_std": 0.75, "grad_norm": 0.9894126640993817, "kl": 0.02886105328798294, "learning_rate": 8.427461514093055e-07, "loss": 0.0074, "num_tokens": 9080361.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.26200270652771, "sampling/importance_sampling_ratio/mean": 1.0001089572906494, "sampling/importance_sampling_ratio/min": 0.8044192790985107, "sampling/sampling_logp_difference/max": 0.23269987106323242, "sampling/sampling_logp_difference/mean": 0.013348866254091263, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 234.421875, "completions/mean_terminated_length": 234.421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5824223160743713, "epoch": 0.6707964601769911, "frac_reward_zero_std": 0.5, "grad_norm": 1.0975653910610275, "kl": 0.029856372624635696, "learning_rate": 8.41619970010199e-07, "loss": -0.1137, "num_tokens": 9106212.0, "reward": 0.46875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5540844202041626, "sampling/importance_sampling_ratio/mean": 1.0003693103790283, "sampling/importance_sampling_ratio/min": 0.5592557787895203, "sampling/sampling_logp_difference/max": 0.5811483860015869, "sampling/sampling_logp_difference/mean": 0.01457655057311058, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 197.59375, "completions/mean_terminated_length": 197.59375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5618985891342163, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.5, "grad_norm": 1.1005969055843197, "kl": 0.03320286422967911, "learning_rate": 8.404905287376157e-07, "loss": -0.0618, "num_tokens": 9129306.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.2609201669692993, "sampling/importance_sampling_ratio/mean": 1.0006989240646362, "sampling/importance_sampling_ratio/min": 0.7712981104850769, "sampling/sampling_logp_difference/max": 0.25968027114868164, "sampling/sampling_logp_difference/mean": 0.014056424610316753, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 190.296875, "completions/mean_terminated_length": 190.296875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.517371416091919, "epoch": 0.6743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 1.013892324138114, "kl": 0.032203856855630875, "learning_rate": 8.393578383691328e-07, "loss": 0.0101, "num_tokens": 9152269.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.31023108959198, "sampling/importance_sampling_ratio/mean": 0.9994510412216187, "sampling/importance_sampling_ratio/min": 0.7832241058349609, "sampling/sampling_logp_difference/max": 0.2702035903930664, "sampling/sampling_logp_difference/mean": 0.013653119094669819, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.6100054979324341, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.75, "grad_norm": 0.7255557913879723, "kl": 0.03324592858552933, "learning_rate": 8.382219097133323e-07, "loss": -0.0246, "num_tokens": 9172937.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2637981176376343, "sampling/importance_sampling_ratio/mean": 1.000061273574829, "sampling/importance_sampling_ratio/min": 0.6904789209365845, "sampling/sampling_logp_difference/max": 0.37036991119384766, "sampling/sampling_logp_difference/mean": 0.014985913410782814, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 240.453125, "completions/mean_terminated_length": 240.453125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5403121709823608, "epoch": 0.6778761061946903, "frac_reward_zero_std": 0.5, "grad_norm": 0.8898620310491819, "kl": 0.029807697981595993, "learning_rate": 8.370827536096964e-07, "loss": -0.0293, "num_tokens": 9198662.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3563177585601807, "sampling/importance_sampling_ratio/mean": 1.0002014636993408, "sampling/importance_sampling_ratio/min": 0.776527464389801, "sampling/sampling_logp_difference/max": 0.30477356910705566, "sampling/sampling_logp_difference/mean": 0.013213641941547394, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 173.953125, "completions/mean_terminated_length": 173.953125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.540034830570221, "epoch": 0.679646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.9326328666682789, "kl": 0.035741355270147324, "learning_rate": 8.359403809285053e-07, "loss": -0.0348, "num_tokens": 9220851.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4128259420394897, "sampling/importance_sampling_ratio/mean": 0.9999628067016602, "sampling/importance_sampling_ratio/min": 0.79998779296875, "sampling/sampling_logp_difference/max": 0.3455919027328491, "sampling/sampling_logp_difference/mean": 0.013962743803858757, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 153.609375, "completions/mean_terminated_length": 153.609375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.6082345247268677, "epoch": 0.6814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.1510022343068778, "kl": 0.03825441002845764, "learning_rate": 8.347948025707329e-07, "loss": -0.0367, "num_tokens": 9243482.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2928928136825562, "sampling/importance_sampling_ratio/mean": 1.000234842300415, "sampling/importance_sampling_ratio/min": 0.7800227403640747, "sampling/sampling_logp_difference/max": 0.2568821907043457, "sampling/sampling_logp_difference/mean": 0.014559213072061539, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 305.65625, "completions/mean_terminated_length": 305.65625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.591589093208313, "epoch": 0.6831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.7144892832253507, "kl": 0.03293855860829353, "learning_rate": 8.336460294679431e-07, "loss": -0.0605, "num_tokens": 9273748.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.2764477729797363, "sampling/importance_sampling_ratio/mean": 1.0000072717666626, "sampling/importance_sampling_ratio/min": 0.6083496809005737, "sampling/sampling_logp_difference/max": 0.4970054626464844, "sampling/sampling_logp_difference/mean": 0.013944409787654877, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 324.921875, "completions/mean_terminated_length": 324.921875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.6242102384567261, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 0.7729211100426958, "kl": 0.034412190318107605, "learning_rate": 8.324940725821852e-07, "loss": -0.1, "num_tokens": 9305871.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3554214239120483, "sampling/importance_sampling_ratio/mean": 0.999740481376648, "sampling/importance_sampling_ratio/min": 0.772234320640564, "sampling/sampling_logp_difference/max": 0.30411243438720703, "sampling/sampling_logp_difference/mean": 0.014125213027000427, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 208.359375, "completions/mean_terminated_length": 208.359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.6388272643089294, "epoch": 0.6867256637168142, "frac_reward_zero_std": 0.75, "grad_norm": 1.020506954002645, "kl": 0.04015382379293442, "learning_rate": 8.313389429058895e-07, "loss": 0.0315, "num_tokens": 9330950.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2605828046798706, "sampling/importance_sampling_ratio/mean": 1.0002447366714478, "sampling/importance_sampling_ratio/min": 0.6901382207870483, "sampling/sampling_logp_difference/max": 0.3708634376525879, "sampling/sampling_logp_difference/mean": 0.01514605525881052, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 205.453125, "completions/mean_terminated_length": 205.453125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.8231542110443115, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.75, "grad_norm": 0.8880420694596971, "kl": 0.050952017307281494, "learning_rate": 8.30180651461762e-07, "loss": -0.0451, "num_tokens": 9358659.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3616708517074585, "sampling/importance_sampling_ratio/mean": 0.9999266862869263, "sampling/importance_sampling_ratio/min": 0.785467267036438, "sampling/sampling_logp_difference/max": 0.3087124824523926, "sampling/sampling_logp_difference/mean": 0.01801479235291481, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.5955511331558228, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 0.6604594581145838, "kl": 0.0431831032037735, "learning_rate": 8.290192093026805e-07, "loss": -0.019, "num_tokens": 9388517.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2740169763565063, "sampling/importance_sampling_ratio/mean": 0.9995293617248535, "sampling/importance_sampling_ratio/min": 0.5855978727340698, "sampling/sampling_logp_difference/max": 0.5351219177246094, "sampling/sampling_logp_difference/mean": 0.014664819464087486, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 233.96875, "completions/mean_terminated_length": 233.96875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.583286464214325, "epoch": 0.6920353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 0.5834252619354268, "kl": 0.03397216647863388, "learning_rate": 8.278546275115869e-07, "loss": 0.0017, "num_tokens": 9414227.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3985861539840698, "sampling/importance_sampling_ratio/mean": 1.000089406967163, "sampling/importance_sampling_ratio/min": 0.7360794544219971, "sampling/sampling_logp_difference/max": 0.3354618549346924, "sampling/sampling_logp_difference/mean": 0.014018921181559563, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 378.65625, "completions/mean_terminated_length": 378.65625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.6142008304595947, "epoch": 0.6938053097345133, "frac_reward_zero_std": 0.25, "grad_norm": 0.8068628231814695, "kl": 0.03943706676363945, "learning_rate": 8.266869172013835e-07, "loss": -0.0061, "num_tokens": 9448877.0, "reward": 0.4375, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.465820074081421, "sampling/importance_sampling_ratio/mean": 1.0003738403320312, "sampling/importance_sampling_ratio/min": 0.686782956123352, "sampling/sampling_logp_difference/max": 0.3824148178100586, "sampling/sampling_logp_difference/mean": 0.01385589875280857, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.6114860773086548, "epoch": 0.695575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.8849583592456787, "kl": 0.04806371033191681, "learning_rate": 8.255160895148262e-07, "loss": 0.0183, "num_tokens": 9471869.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3279483318328857, "sampling/importance_sampling_ratio/mean": 0.9999027252197266, "sampling/importance_sampling_ratio/min": 0.7286269664764404, "sampling/sampling_logp_difference/max": 0.3165934085845947, "sampling/sampling_logp_difference/mean": 0.014248994179069996, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 192.640625, "completions/mean_terminated_length": 192.640625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.6132149696350098, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 0.9392227465360554, "kl": 0.04451679438352585, "learning_rate": 8.243421556244178e-07, "loss": 0.0474, "num_tokens": 9495238.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.2758854627609253, "sampling/importance_sampling_ratio/mean": 1.0001885890960693, "sampling/importance_sampling_ratio/min": 0.7333983778953552, "sampling/sampling_logp_difference/max": 0.31006622314453125, "sampling/sampling_logp_difference/mean": 0.014767677523195744, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 278.640625, "completions/mean_terminated_length": 278.640625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.5861450433731079, "epoch": 0.6991150442477876, "frac_reward_zero_std": 0.75, "grad_norm": 0.5800785812902889, "kl": 0.0425945445895195, "learning_rate": 8.231651267323018e-07, "loss": 0.0067, "num_tokens": 9521903.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3726626634597778, "sampling/importance_sampling_ratio/mean": 1.0004127025604248, "sampling/importance_sampling_ratio/min": 0.8057330250740051, "sampling/sampling_logp_difference/max": 0.31675243377685547, "sampling/sampling_logp_difference/mean": 0.014240694232285023, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 125.328125, "completions/mean_terminated_length": 125.328125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.5321370363235474, "epoch": 0.7008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.03970409261782805, "kl": 0.05122319981455803, "learning_rate": 8.219850140701556e-07, "loss": 0.0005, "num_tokens": 9539828.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3042287826538086, "sampling/importance_sampling_ratio/mean": 1.0002028942108154, "sampling/importance_sampling_ratio/min": 0.7259151935577393, "sampling/sampling_logp_difference/max": 0.32032203674316406, "sampling/sampling_logp_difference/mean": 0.01377511490136385, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 163.265625, "completions/mean_terminated_length": 163.265625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.6501399278640747, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 0.9723786578984597, "kl": 0.055191926658153534, "learning_rate": 8.208018288990831e-07, "loss": 0.0264, "num_tokens": 9561109.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.290521502494812, "sampling/importance_sampling_ratio/mean": 0.9994111657142639, "sampling/importance_sampling_ratio/min": 0.7783161401748657, "sampling/sampling_logp_difference/max": 0.25504636764526367, "sampling/sampling_logp_difference/mean": 0.015406796708703041, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.6198195815086365, "epoch": 0.7044247787610619, "frac_reward_zero_std": 0.5, "grad_norm": 0.970055457461607, "kl": 0.04273217171430588, "learning_rate": 8.196155825095072e-07, "loss": 0.0243, "num_tokens": 9590549.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2971134185791016, "sampling/importance_sampling_ratio/mean": 0.9996868371963501, "sampling/importance_sampling_ratio/min": 0.68349289894104, "sampling/sampling_logp_difference/max": 0.38053905963897705, "sampling/sampling_logp_difference/mean": 0.014377663843333721, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 242.78125, "completions/mean_terminated_length": 242.78125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5678697824478149, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.5, "grad_norm": 1.0366074757470989, "kl": 0.04768776148557663, "learning_rate": 8.184262862210624e-07, "loss": 0.0207, "num_tokens": 9616999.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2689909934997559, "sampling/importance_sampling_ratio/mean": 1.0004022121429443, "sampling/importance_sampling_ratio/min": 0.7870878577232361, "sampling/sampling_logp_difference/max": 0.23941540718078613, "sampling/sampling_logp_difference/mean": 0.013945416547358036, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 175.53125, "completions/mean_terminated_length": 175.53125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5880441069602966, "epoch": 0.7079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.0238992392430195, "kl": 0.05072199925780296, "learning_rate": 8.172339513824862e-07, "loss": -0.0251, "num_tokens": 9642473.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2697378396987915, "sampling/importance_sampling_ratio/mean": 1.0001071691513062, "sampling/importance_sampling_ratio/min": 0.7889683842658997, "sampling/sampling_logp_difference/max": 0.23881053924560547, "sampling/sampling_logp_difference/mean": 0.014471493661403656, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 294.671875, "completions/mean_terminated_length": 294.671875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.6381601691246033, "epoch": 0.7097345132743362, "frac_reward_zero_std": 0.5, "grad_norm": 0.9550777697986479, "kl": 0.04391219839453697, "learning_rate": 8.160385893715112e-07, "loss": -0.0109, "num_tokens": 9672180.0, "reward": 0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4058539867401123, "sampling/importance_sampling_ratio/mean": 0.9997583627700806, "sampling/importance_sampling_ratio/min": 0.7331592440605164, "sampling/sampling_logp_difference/max": 0.3406449556350708, "sampling/sampling_logp_difference/mean": 0.015479639172554016, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 207.703125, "completions/mean_terminated_length": 207.703125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.4981333613395691, "epoch": 0.7115044247787611, "frac_reward_zero_std": 0.75, "grad_norm": 0.8388483096281728, "kl": 0.048921674489974976, "learning_rate": 8.14840211594757e-07, "loss": -0.051, "num_tokens": 9694977.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.2688875198364258, "sampling/importance_sampling_ratio/mean": 1.0003901720046997, "sampling/importance_sampling_ratio/min": 0.7529307007789612, "sampling/sampling_logp_difference/max": 0.28378212451934814, "sampling/sampling_logp_difference/mean": 0.012409433722496033, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 205.359375, "completions/mean_terminated_length": 205.359375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.574770450592041, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 0.9634246559161275, "kl": 0.04920308291912079, "learning_rate": 8.136388294876202e-07, "loss": 0.0716, "num_tokens": 9718744.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.2724429368972778, "sampling/importance_sampling_ratio/mean": 0.99964439868927, "sampling/importance_sampling_ratio/min": 0.7868424654006958, "sampling/sampling_logp_difference/max": 0.24093866348266602, "sampling/sampling_logp_difference/mean": 0.013840064406394958, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.5439393520355225, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.75, "grad_norm": 1.445690027975047, "kl": 0.04532237350940704, "learning_rate": 8.124344545141661e-07, "loss": 0.1694, "num_tokens": 9744768.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3377478122711182, "sampling/importance_sampling_ratio/mean": 1.0002708435058594, "sampling/importance_sampling_ratio/min": 0.769629180431366, "sampling/sampling_logp_difference/max": 0.290987491607666, "sampling/sampling_logp_difference/mean": 0.014286450110375881, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 260.40625, "completions/mean_terminated_length": 260.40625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.7575639486312866, "epoch": 0.7168141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 0.8910973606629476, "kl": 0.04973479360342026, "learning_rate": 8.112270981670195e-07, "loss": -0.0297, "num_tokens": 9776682.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.2737940549850464, "sampling/importance_sampling_ratio/mean": 0.9998492002487183, "sampling/importance_sampling_ratio/min": 0.7660143971443176, "sampling/sampling_logp_difference/max": 0.2665543556213379, "sampling/sampling_logp_difference/mean": 0.016974743455648422, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 171.671875, "completions/mean_terminated_length": 171.671875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5304486155509949, "epoch": 0.7185840707964601, "frac_reward_zero_std": 0.75, "grad_norm": 0.9323335748932801, "kl": 0.04561503604054451, "learning_rate": 8.10016771967254e-07, "loss": -0.0003, "num_tokens": 9798101.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2717859745025635, "sampling/importance_sampling_ratio/mean": 1.000009536743164, "sampling/importance_sampling_ratio/min": 0.7241708040237427, "sampling/sampling_logp_difference/max": 0.3227280378341675, "sampling/sampling_logp_difference/mean": 0.014041692018508911, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 428.34375, "completions/mean_terminated_length": 428.34375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5311000943183899, "epoch": 0.720353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 0.7071722110680441, "kl": 0.02927211858332157, "learning_rate": 8.088034874642833e-07, "loss": 0.0514, "num_tokens": 9835707.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3246009349822998, "sampling/importance_sampling_ratio/mean": 0.9999911189079285, "sampling/importance_sampling_ratio/min": 0.7496660351753235, "sampling/sampling_logp_difference/max": 0.28812742233276367, "sampling/sampling_logp_difference/mean": 0.013390760868787766, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 170.71875, "completions/mean_terminated_length": 170.71875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.5504417419433594, "epoch": 0.7221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.03605265228400824, "kl": 0.0444914773106575, "learning_rate": 8.0758725623575e-07, "loss": 0.0005, "num_tokens": 9857321.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2636146545410156, "sampling/importance_sampling_ratio/mean": 0.9997602701187134, "sampling/importance_sampling_ratio/min": 0.7789768576622009, "sampling/sampling_logp_difference/max": 0.24977397918701172, "sampling/sampling_logp_difference/mean": 0.013640528544783592, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 255.21875, "completions/mean_terminated_length": 255.21875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.6179224848747253, "epoch": 0.7238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 0.7576350054132374, "kl": 0.03780404478311539, "learning_rate": 8.063680898874157e-07, "loss": -0.0069, "num_tokens": 9885895.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.2713985443115234, "sampling/importance_sampling_ratio/mean": 0.9993462562561035, "sampling/importance_sampling_ratio/min": 0.749877393245697, "sampling/sampling_logp_difference/max": 0.2878456115722656, "sampling/sampling_logp_difference/mean": 0.014377973042428493, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 230.0625, "completions/mean_terminated_length": 230.0625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.446044385433197, "epoch": 0.7256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.017778756520411314, "kl": 0.028128717094659805, "learning_rate": 8.051460000530501e-07, "loss": 0.0002, "num_tokens": 9910091.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4440149068832397, "sampling/importance_sampling_ratio/mean": 1.0000320672988892, "sampling/importance_sampling_ratio/min": 0.7879032492637634, "sampling/sampling_logp_difference/max": 0.36742734909057617, "sampling/sampling_logp_difference/mean": 0.011730384081602097, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 201.234375, "completions/mean_terminated_length": 201.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.4440212845802307, "epoch": 0.727433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.019746072540553092, "kl": 0.02954564243555069, "learning_rate": 8.039209983943201e-07, "loss": 0.0002, "num_tokens": 9932506.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2563470602035522, "sampling/importance_sampling_ratio/mean": 1.0003342628479004, "sampling/importance_sampling_ratio/min": 0.7805354595184326, "sampling/sampling_logp_difference/max": 0.24777507781982422, "sampling/sampling_logp_difference/mean": 0.012176052667200565, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 223.828125, "completions/mean_terminated_length": 223.828125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.6150417923927307, "epoch": 0.7292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 0.7213547115359932, "kl": 0.04231223464012146, "learning_rate": 8.026930966006778e-07, "loss": 0.051, "num_tokens": 9959103.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2890146970748901, "sampling/importance_sampling_ratio/mean": 1.0000054836273193, "sampling/importance_sampling_ratio/min": 0.6600985527038574, "sampling/sampling_logp_difference/max": 0.41536617279052734, "sampling/sampling_logp_difference/mean": 0.015552632510662079, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.5611389875411987, "epoch": 0.7309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 0.9469589059865152, "kl": 0.03591710329055786, "learning_rate": 8.014623063892503e-07, "loss": -0.0032, "num_tokens": 9985167.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.2715003490447998, "sampling/importance_sampling_ratio/mean": 0.9996594786643982, "sampling/importance_sampling_ratio/min": 0.7996978759765625, "sampling/sampling_logp_difference/max": 0.24019765853881836, "sampling/sampling_logp_difference/mean": 0.013684605248272419, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 323.21875, "completions/mean_terminated_length": 323.21875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.7013252973556519, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 0.8217200799374641, "kl": 0.040895603597164154, "learning_rate": 8.002286395047266e-07, "loss": -0.0529, "num_tokens": 10023677.0, "reward": 0.1875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3881274461746216, "sampling/importance_sampling_ratio/mean": 1.0003700256347656, "sampling/importance_sampling_ratio/min": 0.7370283007621765, "sampling/sampling_logp_difference/max": 0.3279557228088379, "sampling/sampling_logp_difference/mean": 0.01638074964284897, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 248.734375, "completions/mean_terminated_length": 248.734375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.6986381411552429, "epoch": 0.7345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 0.9681014536921748, "kl": 0.04458405077457428, "learning_rate": 7.989921077192463e-07, "loss": -0.0539, "num_tokens": 10056332.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2845783233642578, "sampling/importance_sampling_ratio/mean": 1.0004305839538574, "sampling/importance_sampling_ratio/min": 0.8019576072692871, "sampling/sampling_logp_difference/max": 0.2504305839538574, "sampling/sampling_logp_difference/mean": 0.01640075445175171, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 396.546875, "completions/mean_terminated_length": 396.546875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.43965262174606323, "epoch": 0.736283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.014459620076378304, "kl": 0.026451773941516876, "learning_rate": 7.97752722832287e-07, "loss": 0.0002, "num_tokens": 10093583.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4509881734848022, "sampling/importance_sampling_ratio/mean": 1.0003583431243896, "sampling/importance_sampling_ratio/min": 0.6954984068870544, "sampling/sampling_logp_difference/max": 0.37224483489990234, "sampling/sampling_logp_difference/mean": 0.011681952513754368, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 244.609375, "completions/mean_terminated_length": 244.609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.5174590349197388, "epoch": 0.7380530973451327, "frac_reward_zero_std": 0.5, "grad_norm": 0.9879331767140251, "kl": 0.03312615305185318, "learning_rate": 7.965104966705517e-07, "loss": 0.0467, "num_tokens": 10121350.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2825586795806885, "sampling/importance_sampling_ratio/mean": 0.9997804164886475, "sampling/importance_sampling_ratio/min": 0.7728856801986694, "sampling/sampling_logp_difference/max": 0.25762414932250977, "sampling/sampling_logp_difference/mean": 0.013050254434347153, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.6145303249359131, "epoch": 0.7398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 1.0340484641256698, "kl": 0.04335036128759384, "learning_rate": 7.952654410878558e-07, "loss": -0.0294, "num_tokens": 10147870.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.2907583713531494, "sampling/importance_sampling_ratio/mean": 1.0003231763839722, "sampling/importance_sampling_ratio/min": 0.787506639957428, "sampling/sampling_logp_difference/max": 0.2552299499511719, "sampling/sampling_logp_difference/mean": 0.014685952104628086, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 257.921875, "completions/mean_terminated_length": 257.921875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5901252031326294, "epoch": 0.7415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.6155586374644301, "kl": 0.036357589066028595, "learning_rate": 7.940175679650145e-07, "loss": 0.0125, "num_tokens": 10177369.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.2488468885421753, "sampling/importance_sampling_ratio/mean": 0.9997331500053406, "sampling/importance_sampling_ratio/min": 0.6989989876747131, "sampling/sampling_logp_difference/max": 0.3581058979034424, "sampling/sampling_logp_difference/mean": 0.015248929150402546, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 165.53125, "completions/mean_terminated_length": 165.53125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.5086607336997986, "epoch": 0.7433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.025702954650145108, "kl": 0.03315176069736481, "learning_rate": 7.927668892097288e-07, "loss": 0.0003, "num_tokens": 10197483.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2916887998580933, "sampling/importance_sampling_ratio/mean": 1.000204086303711, "sampling/importance_sampling_ratio/min": 0.6840068101882935, "sampling/sampling_logp_difference/max": 0.3797874450683594, "sampling/sampling_logp_difference/mean": 0.01362298522144556, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 102.578125, "completions/mean_terminated_length": 102.578125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.43110209703445435, "epoch": 0.7451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.029119439536607748, "kl": 0.024199388921260834, "learning_rate": 7.915134167564723e-07, "loss": 0.0002, "num_tokens": 10213488.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2614885568618774, "sampling/importance_sampling_ratio/mean": 0.9998210072517395, "sampling/importance_sampling_ratio/min": 0.7834794521331787, "sampling/sampling_logp_difference/max": 0.24401044845581055, "sampling/sampling_logp_difference/mean": 0.01294726226478815, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 117.03125, "completions/mean_terminated_length": 117.03125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.46812424063682556, "epoch": 0.7469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.02408020408102983, "kl": 0.02914397604763508, "learning_rate": 7.902571625663772e-07, "loss": 0.0003, "num_tokens": 10230562.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6391023397445679, "sampling/importance_sampling_ratio/mean": 1.0002003908157349, "sampling/importance_sampling_ratio/min": 0.7844666838645935, "sampling/sampling_logp_difference/max": 0.49414873123168945, "sampling/sampling_logp_difference/mean": 0.013646814972162247, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 207.828125, "completions/mean_terminated_length": 207.828125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3817795515060425, "epoch": 0.7486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.025599478909410948, "kl": 0.02518792450428009, "learning_rate": 7.8899813862712e-07, "loss": 0.0003, "num_tokens": 10252871.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2714892625808716, "sampling/importance_sampling_ratio/mean": 0.9998159408569336, "sampling/importance_sampling_ratio/min": 0.7706360220909119, "sampling/sampling_logp_difference/max": 0.26053905487060547, "sampling/sampling_logp_difference/mean": 0.01119263470172882, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 210.328125, "completions/mean_terminated_length": 210.328125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4865052103996277, "epoch": 0.7504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 0.6183458650555943, "kl": 0.0423419289290905, "learning_rate": 7.877363569528075e-07, "loss": 0.0164, "num_tokens": 10275916.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5006414651870728, "sampling/importance_sampling_ratio/mean": 1.0000896453857422, "sampling/importance_sampling_ratio/min": 0.7677870988845825, "sampling/sampling_logp_difference/max": 0.40589261054992676, "sampling/sampling_logp_difference/mean": 0.012981399893760681, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 142.734375, "completions/mean_terminated_length": 142.734375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.4313989281654358, "epoch": 0.7522123893805309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0206528651357792, "kl": 0.028065882623195648, "learning_rate": 7.864718295838614e-07, "loss": 0.0003, "num_tokens": 10295755.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2645851373672485, "sampling/importance_sampling_ratio/mean": 1.000291109085083, "sampling/importance_sampling_ratio/min": 0.6827008128166199, "sampling/sampling_logp_difference/max": 0.3816986083984375, "sampling/sampling_logp_difference/mean": 0.01229530107229948, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 364.484375, "completions/mean_terminated_length": 364.484375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.6108871102333069, "epoch": 0.7539823008849558, "frac_reward_zero_std": 0.5, "grad_norm": 0.7203730706351245, "kl": 0.038677990436553955, "learning_rate": 7.852045685869044e-07, "loss": -0.1326, "num_tokens": 10333322.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.2985036373138428, "sampling/importance_sampling_ratio/mean": 0.9998139142990112, "sampling/importance_sampling_ratio/min": 0.6713259220123291, "sampling/sampling_logp_difference/max": 0.39850056171417236, "sampling/sampling_logp_difference/mean": 0.014769405126571655, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 88.5, "completions/mean_terminated_length": 88.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4560648202896118, "epoch": 0.7557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.034205023257193365, "kl": 0.03481544554233551, "learning_rate": 7.839345860546447e-07, "loss": 0.0003, "num_tokens": 10348186.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2973552942276, "sampling/importance_sampling_ratio/mean": 1.0008292198181152, "sampling/importance_sampling_ratio/min": 0.7703726887702942, "sampling/sampling_logp_difference/max": 0.2608809471130371, "sampling/sampling_logp_difference/mean": 0.014451464638113976, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 423.171875, "completions/mean_terminated_length": 423.171875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.444010853767395, "epoch": 0.7575221238938054, "frac_reward_zero_std": 0.5, "grad_norm": 0.5507981474438232, "kl": 0.024493619799613953, "learning_rate": 7.826618941057597e-07, "loss": -0.016, "num_tokens": 10386549.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3991864919662476, "sampling/importance_sampling_ratio/mean": 0.9999668002128601, "sampling/importance_sampling_ratio/min": 0.7739522457122803, "sampling/sampling_logp_difference/max": 0.3358910083770752, "sampling/sampling_logp_difference/mean": 0.010899325832724571, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 267.453125, "completions/mean_terminated_length": 267.453125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.44651085138320923, "epoch": 0.7592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 0.585716512410736, "kl": 0.02946385368704796, "learning_rate": 7.813865048847818e-07, "loss": -0.0568, "num_tokens": 10414754.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3954877853393555, "sampling/importance_sampling_ratio/mean": 1.0002694129943848, "sampling/importance_sampling_ratio/min": 0.6886769533157349, "sampling/sampling_logp_difference/max": 0.3729829788208008, "sampling/sampling_logp_difference/mean": 0.012613236904144287, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 301.03125, "completions/mean_terminated_length": 301.03125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.6009423136711121, "epoch": 0.7610619469026548, "frac_reward_zero_std": 0.25, "grad_norm": 0.9933302182600604, "kl": 0.04393962770700455, "learning_rate": 7.801084305619818e-07, "loss": 0.0782, "num_tokens": 10446820.0, "reward": 0.65625, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3730158805847168, "sampling/importance_sampling_ratio/mean": 0.9999932050704956, "sampling/importance_sampling_ratio/min": 0.7901090979576111, "sampling/sampling_logp_difference/max": 0.31700968742370605, "sampling/sampling_logp_difference/mean": 0.014922568574547768, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 163.90625, "completions/mean_terminated_length": 163.90625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.4138805866241455, "epoch": 0.7628318584070797, "frac_reward_zero_std": 1.0, "grad_norm": 0.018571573352140252, "kl": 0.02377704344689846, "learning_rate": 7.788276833332525e-07, "loss": 0.0002, "num_tokens": 10466782.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4309606552124023, "sampling/importance_sampling_ratio/mean": 1.000049352645874, "sampling/importance_sampling_ratio/min": 0.7120794653892517, "sampling/sampling_logp_difference/max": 0.35834598541259766, "sampling/sampling_logp_difference/mean": 0.012025558389723301, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 233.4375, "completions/mean_terminated_length": 233.4375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.5966275334358215, "epoch": 0.7646017699115044, "frac_reward_zero_std": 0.5, "grad_norm": 1.0074666950129674, "kl": 0.039923615753650665, "learning_rate": 7.775442754199928e-07, "loss": 0.0061, "num_tokens": 10494538.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3143128156661987, "sampling/importance_sampling_ratio/mean": 1.0001587867736816, "sampling/importance_sampling_ratio/min": 0.7404649257659912, "sampling/sampling_logp_difference/max": 0.3004770278930664, "sampling/sampling_logp_difference/mean": 0.01564229466021061, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5332291126251221, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 0.7027433639270724, "kl": 0.026494858786463737, "learning_rate": 7.76258219068991e-07, "loss": -0.027, "num_tokens": 10527714.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.2875243425369263, "sampling/importance_sampling_ratio/mean": 1.0004990100860596, "sampling/importance_sampling_ratio/min": 0.7390238046646118, "sampling/sampling_logp_difference/max": 0.3024251461029053, "sampling/sampling_logp_difference/mean": 0.013710242696106434, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 204.40625, "completions/mean_terminated_length": 204.40625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.5407809019088745, "epoch": 0.768141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.8293554177747698, "kl": 0.03756345435976982, "learning_rate": 7.749695265523075e-07, "loss": -0.1366, "num_tokens": 10551996.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.2573373317718506, "sampling/importance_sampling_ratio/mean": 0.9997073411941528, "sampling/importance_sampling_ratio/min": 0.5997409820556641, "sampling/sampling_logp_difference/max": 0.5112574100494385, "sampling/sampling_logp_difference/mean": 0.015162959694862366, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.42888888716697693, "epoch": 0.7699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.03443332695496176, "kl": 0.029706230387091637, "learning_rate": 7.736782101671586e-07, "loss": 0.0003, "num_tokens": 10570524.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2733570337295532, "sampling/importance_sampling_ratio/mean": 1.0002081394195557, "sampling/importance_sampling_ratio/min": 0.6091095805168152, "sampling/sampling_logp_difference/max": 0.4957571029663086, "sampling/sampling_logp_difference/mean": 0.013336366973817348, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 272.453125, "completions/mean_terminated_length": 272.453125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5150054693222046, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.75, "grad_norm": 0.5493669467908714, "kl": 0.03158093988895416, "learning_rate": 7.723842822357979e-07, "loss": -0.0143, "num_tokens": 10599993.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3444499969482422, "sampling/importance_sampling_ratio/mean": 1.000211477279663, "sampling/importance_sampling_ratio/min": 0.76054447889328, "sampling/sampling_logp_difference/max": 0.29598498344421387, "sampling/sampling_logp_difference/mean": 0.013418814167380333, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 97.125, "completions/mean_terminated_length": 97.125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.42227739095687866, "epoch": 0.7734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.030688375633997164, "kl": 0.02855643630027771, "learning_rate": 7.710877551054003e-07, "loss": 0.0003, "num_tokens": 10617041.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.574539065361023, "sampling/importance_sampling_ratio/mean": 0.9995243549346924, "sampling/importance_sampling_ratio/min": 0.7018066048622131, "sampling/sampling_logp_difference/max": 0.4539625644683838, "sampling/sampling_logp_difference/mean": 0.012881052680313587, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.5003805160522461, "epoch": 0.7752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.2145108016288721, "kl": 0.03165384382009506, "learning_rate": 7.697886411479421e-07, "loss": -0.0002, "num_tokens": 10638401.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5261650085449219, "sampling/importance_sampling_ratio/mean": 1.0007299184799194, "sampling/importance_sampling_ratio/min": 0.7740103602409363, "sampling/sampling_logp_difference/max": 0.4227581024169922, "sampling/sampling_logp_difference/mean": 0.01449810154736042, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 164.59375, "completions/mean_terminated_length": 164.59375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.4594087600708008, "epoch": 0.7769911504424779, "frac_reward_zero_std": 1.0, "grad_norm": 0.023352909978015583, "kl": 0.027794960886240005, "learning_rate": 7.684869527600856e-07, "loss": 0.0003, "num_tokens": 10659431.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2940813302993774, "sampling/importance_sampling_ratio/mean": 0.9991923570632935, "sampling/importance_sampling_ratio/min": 0.6983209252357483, "sampling/sampling_logp_difference/max": 0.35907649993896484, "sampling/sampling_logp_difference/mean": 0.014465093612670898, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 221.578125, "completions/mean_terminated_length": 221.578125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.38419753313064575, "epoch": 0.7787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.02519789563762391, "kl": 0.024195484817028046, "learning_rate": 7.671827023630579e-07, "loss": 0.0003, "num_tokens": 10682684.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2787325382232666, "sampling/importance_sampling_ratio/mean": 0.999823808670044, "sampling/importance_sampling_ratio/min": 0.7821112275123596, "sampling/sampling_logp_difference/max": 0.24586939811706543, "sampling/sampling_logp_difference/mean": 0.011161397211253643, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 275.921875, "completions/mean_terminated_length": 275.921875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.45152318477630615, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7035346721076269, "kl": 0.023259248584508896, "learning_rate": 7.658759024025347e-07, "loss": -0.0488, "num_tokens": 10711783.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3871408700942993, "sampling/importance_sampling_ratio/mean": 1.0006746053695679, "sampling/importance_sampling_ratio/min": 0.7725328207015991, "sampling/sampling_logp_difference/max": 0.32724475860595703, "sampling/sampling_logp_difference/mean": 0.013077957555651665, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 196.703125, "completions/mean_terminated_length": 196.703125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4339255094528198, "epoch": 0.7823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.021244812514413552, "kl": 0.022286761552095413, "learning_rate": 7.645665653485205e-07, "loss": 0.0002, "num_tokens": 10734212.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467441082000732, "sampling/importance_sampling_ratio/mean": 1.000171184539795, "sampling/importance_sampling_ratio/min": 0.763978898525238, "sampling/sampling_logp_difference/max": 0.43615221977233887, "sampling/sampling_logp_difference/mean": 0.013061773031949997, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5018302798271179, "epoch": 0.784070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.8913059065977091, "kl": 0.02923012711107731, "learning_rate": 7.632547036952295e-07, "loss": 0.0497, "num_tokens": 10757484.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3488526344299316, "sampling/importance_sampling_ratio/mean": 0.999633252620697, "sampling/importance_sampling_ratio/min": 0.5483744740486145, "sampling/sampling_logp_difference/max": 0.6007968187332153, "sampling/sampling_logp_difference/mean": 0.013715255074203014, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 150.328125, "completions/mean_terminated_length": 150.328125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.46669435501098633, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.75, "grad_norm": 1.1354952478893947, "kl": 0.04630407691001892, "learning_rate": 7.619403299609667e-07, "loss": -0.0105, "num_tokens": 10777633.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3220911026000977, "sampling/importance_sampling_ratio/mean": 0.9996940493583679, "sampling/importance_sampling_ratio/min": 0.7745318412780762, "sampling/sampling_logp_difference/max": 0.27921462059020996, "sampling/sampling_logp_difference/mean": 0.0143440468236804, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 273.421875, "completions/mean_terminated_length": 273.421875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5878114700317383, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.5, "grad_norm": 0.8522564558252177, "kl": 0.03717903792858124, "learning_rate": 7.606234566880088e-07, "loss": 0.0153, "num_tokens": 10805644.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3009824752807617, "sampling/importance_sampling_ratio/mean": 1.0000145435333252, "sampling/importance_sampling_ratio/min": 0.7246646285057068, "sampling/sampling_logp_difference/max": 0.32204627990722656, "sampling/sampling_logp_difference/mean": 0.015486251562833786, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 114.8125, "completions/mean_terminated_length": 114.8125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4024341106414795, "epoch": 0.7893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.027879583841671444, "kl": 0.026340488344430923, "learning_rate": 7.593040964424835e-07, "loss": 0.0003, "num_tokens": 10822304.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4007272720336914, "sampling/importance_sampling_ratio/mean": 0.9998321533203125, "sampling/importance_sampling_ratio/min": 0.7129731774330139, "sampling/sampling_logp_difference/max": 0.3383115530014038, "sampling/sampling_logp_difference/mean": 0.013402743265032768, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.5344680547714233, "epoch": 0.7911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.1420813630956677, "kl": 0.03319220244884491, "learning_rate": 7.579822618142503e-07, "loss": 0.0076, "num_tokens": 10846384.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.282419204711914, "sampling/importance_sampling_ratio/mean": 1.0004487037658691, "sampling/importance_sampling_ratio/min": 0.7643356323242188, "sampling/sampling_logp_difference/max": 0.26874828338623047, "sampling/sampling_logp_difference/mean": 0.014978764578700066, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 262.390625, "completions/mean_terminated_length": 262.390625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4648711681365967, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 0.5885241924762683, "kl": 0.02966545894742012, "learning_rate": 7.56657965416781e-07, "loss": -0.0728, "num_tokens": 10874073.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3482927083969116, "sampling/importance_sampling_ratio/mean": 0.9995811581611633, "sampling/importance_sampling_ratio/min": 0.6962099671363831, "sampling/sampling_logp_difference/max": 0.3621039390563965, "sampling/sampling_logp_difference/mean": 0.013349469751119614, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 134.640625, "completions/mean_terminated_length": 134.640625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.48496830463409424, "epoch": 0.7946902654867256, "frac_reward_zero_std": 0.75, "grad_norm": 1.2329295493227554, "kl": 0.03269118070602417, "learning_rate": 7.553312198870372e-07, "loss": -0.0062, "num_tokens": 10895298.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.435898780822754, "sampling/importance_sampling_ratio/mean": 1.0002576112747192, "sampling/importance_sampling_ratio/min": 0.779975175857544, "sampling/sampling_logp_difference/max": 0.36179089546203613, "sampling/sampling_logp_difference/mean": 0.015333665534853935, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 216.21875, "completions/mean_terminated_length": 216.21875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4284505844116211, "epoch": 0.7964601769911505, "frac_reward_zero_std": 1.0, "grad_norm": 0.03433615102773921, "kl": 0.03012528270483017, "learning_rate": 7.540020378853522e-07, "loss": 0.0003, "num_tokens": 10920704.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3699814081192017, "sampling/importance_sampling_ratio/mean": 0.9998600482940674, "sampling/importance_sampling_ratio/min": 0.7276777029037476, "sampling/sampling_logp_difference/max": 0.31789708137512207, "sampling/sampling_logp_difference/mean": 0.012783464044332504, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 199.65625, "completions/mean_terminated_length": 199.65625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5476254820823669, "epoch": 0.7982300884955752, "frac_reward_zero_std": 0.5, "grad_norm": 1.195894263535234, "kl": 0.037123337388038635, "learning_rate": 7.52670432095309e-07, "loss": -0.0595, "num_tokens": 10944778.0, "reward": 0.40625, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3971526622772217, "sampling/importance_sampling_ratio/mean": 0.9999358654022217, "sampling/importance_sampling_ratio/min": 0.6622995138168335, "sampling/sampling_logp_difference/max": 0.41203737258911133, "sampling/sampling_logp_difference/mean": 0.01616559736430645, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5837963819503784, "epoch": 0.8, "frac_reward_zero_std": 0.5, "grad_norm": 1.2950197099208176, "kl": 0.03727755323052406, "learning_rate": 7.513364152236185e-07, "loss": 0.0361, "num_tokens": 10968674.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3993202447891235, "sampling/importance_sampling_ratio/mean": 0.9999784231185913, "sampling/importance_sampling_ratio/min": 0.6482345461845398, "sampling/sampling_logp_difference/max": 0.4335026741027832, "sampling/sampling_logp_difference/mean": 0.01656384952366352, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 174.609375, "completions/mean_terminated_length": 174.609375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.48638856410980225, "epoch": 0.8017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.034520101340169126, "kl": 0.03867881000041962, "learning_rate": 7.5e-07, "loss": 0.0004, "num_tokens": 10990521.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2748844623565674, "sampling/importance_sampling_ratio/mean": 1.0004186630249023, "sampling/importance_sampling_ratio/min": 0.6954996585845947, "sampling/sampling_logp_difference/max": 0.3631247282028198, "sampling/sampling_logp_difference/mean": 0.014318529516458511, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 232.328125, "completions/mean_terminated_length": 232.328125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.39822274446487427, "epoch": 0.8035398230088495, "frac_reward_zero_std": 1.0, "grad_norm": 0.02470107717998635, "kl": 0.026745492592453957, "learning_rate": 7.486611991770585e-07, "loss": 0.0003, "num_tokens": 11016478.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4480769634246826, "sampling/importance_sampling_ratio/mean": 1.0001085996627808, "sampling/importance_sampling_ratio/min": 0.7452806830406189, "sampling/sampling_logp_difference/max": 0.3702363967895508, "sampling/sampling_logp_difference/mean": 0.011767730116844177, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3846904933452606, "epoch": 0.8053097345132744, "frac_reward_zero_std": 1.0, "grad_norm": 0.024929384929832016, "kl": 0.02466784231364727, "learning_rate": 7.473200255301634e-07, "loss": 0.0002, "num_tokens": 11038234.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.292103886604309, "sampling/importance_sampling_ratio/mean": 0.9994224905967712, "sampling/importance_sampling_ratio/min": 0.42090708017349243, "sampling/sampling_logp_difference/max": 0.8653432130813599, "sampling/sampling_logp_difference/mean": 0.013551078736782074, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4845409095287323, "epoch": 0.8070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 0.5941695953160814, "kl": 0.03295625373721123, "learning_rate": 7.459764918573264e-07, "loss": 0.0593, "num_tokens": 11069890.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.976773738861084, "sampling/importance_sampling_ratio/mean": 1.0003403425216675, "sampling/importance_sampling_ratio/min": 0.6821828484535217, "sampling/sampling_logp_difference/max": 0.6814661026000977, "sampling/sampling_logp_difference/mean": 0.014344906434416771, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.35666027665138245, "epoch": 0.8088495575221238, "frac_reward_zero_std": 1.0, "grad_norm": 0.025818551583167294, "kl": 0.022908702492713928, "learning_rate": 7.446306109790797e-07, "loss": 0.0002, "num_tokens": 11092674.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.351462721824646, "sampling/importance_sampling_ratio/mean": 0.9997496008872986, "sampling/importance_sampling_ratio/min": 0.6530708074569702, "sampling/sampling_logp_difference/max": 0.4260697364807129, "sampling/sampling_logp_difference/mean": 0.012220639735460281, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 185.953125, "completions/mean_terminated_length": 185.953125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.30786052346229553, "epoch": 0.8106194690265487, "frac_reward_zero_std": 1.0, "grad_norm": 0.021505220582146843, "kl": 0.021251916885375977, "learning_rate": 7.432823957383531e-07, "loss": 0.0002, "num_tokens": 11114591.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4754825830459595, "sampling/importance_sampling_ratio/mean": 1.0004115104675293, "sampling/importance_sampling_ratio/min": 0.7135739326477051, "sampling/sampling_logp_difference/max": 0.3889850378036499, "sampling/sampling_logp_difference/mean": 0.011822560802102089, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 150.609375, "completions/mean_terminated_length": 150.609375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.522521436214447, "epoch": 0.8123893805309734, "frac_reward_zero_std": 0.75, "grad_norm": 1.1575108264129086, "kl": 0.03246885538101196, "learning_rate": 7.419318590003523e-07, "loss": 0.007, "num_tokens": 11136742.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.498809576034546, "sampling/importance_sampling_ratio/mean": 1.0004464387893677, "sampling/importance_sampling_ratio/min": 0.7658902406692505, "sampling/sampling_logp_difference/max": 0.40467125177383423, "sampling/sampling_logp_difference/mean": 0.015536777675151825, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.42402294278144836, "epoch": 0.8141592920353983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05101026144566764, "kl": 0.0434221550822258, "learning_rate": 7.405790136524352e-07, "loss": 0.0004, "num_tokens": 11158478.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6273397207260132, "sampling/importance_sampling_ratio/mean": 1.0000479221343994, "sampling/importance_sampling_ratio/min": 0.715876042842865, "sampling/sampling_logp_difference/max": 0.48694658279418945, "sampling/sampling_logp_difference/mean": 0.013064793311059475, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3769833445549011, "epoch": 0.815929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.8130484287920594, "kl": 0.023719415068626404, "learning_rate": 7.392238726039897e-07, "loss": -0.049, "num_tokens": 11181558.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5071641206741333, "sampling/importance_sampling_ratio/mean": 0.9999847412109375, "sampling/importance_sampling_ratio/min": 0.5999216437339783, "sampling/sampling_logp_difference/max": 0.5109561681747437, "sampling/sampling_logp_difference/mean": 0.013288834132254124, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 266.671875, "completions/mean_terminated_length": 266.671875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5343055725097656, "epoch": 0.8176991150442477, "frac_reward_zero_std": 0.25, "grad_norm": 0.997344620645345, "kl": 0.04499747231602669, "learning_rate": 7.378664487863102e-07, "loss": -0.0117, "num_tokens": 11208529.0, "reward": 0.375, "reward_std": 0.6116957664489746, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.363373041152954, "sampling/importance_sampling_ratio/mean": 0.9997880458831787, "sampling/importance_sampling_ratio/min": 0.7186910510063171, "sampling/sampling_logp_difference/max": 0.3303236961364746, "sampling/sampling_logp_difference/mean": 0.014024769887328148, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 124.671875, "completions/mean_terminated_length": 124.671875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.4910151958465576, "epoch": 0.8194690265486726, "frac_reward_zero_std": 1.0, "grad_norm": 0.04986874805092984, "kl": 0.0405578538775444, "learning_rate": 7.365067551524739e-07, "loss": 0.0004, "num_tokens": 11228348.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3114522695541382, "sampling/importance_sampling_ratio/mean": 1.0005993843078613, "sampling/importance_sampling_ratio/min": 0.7302378416061401, "sampling/sampling_logp_difference/max": 0.31438496708869934, "sampling/sampling_logp_difference/mean": 0.015644388273358345, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 91.578125, "completions/mean_terminated_length": 91.578125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.37779563665390015, "epoch": 0.8212389380530973, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052217573764209, "kl": 0.031010787934064865, "learning_rate": 7.351448046772177e-07, "loss": 0.0003, "num_tokens": 11243553.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2999294996261597, "sampling/importance_sampling_ratio/mean": 0.9995562434196472, "sampling/importance_sampling_ratio/min": 0.7820419073104858, "sampling/sampling_logp_difference/max": 0.2623100280761719, "sampling/sampling_logp_difference/mean": 0.013820555061101913, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 268.609375, "completions/mean_terminated_length": 268.609375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.48436078429222107, "epoch": 0.8230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 1.0868209307297103, "kl": 0.02616463601589203, "learning_rate": 7.33780610356814e-07, "loss": 0.0155, "num_tokens": 11273608.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.441197395324707, "sampling/importance_sampling_ratio/mean": 1.0004792213439941, "sampling/importance_sampling_ratio/min": 0.7312192916870117, "sampling/sampling_logp_difference/max": 0.36547422409057617, "sampling/sampling_logp_difference/mean": 0.014788644388318062, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 196.53125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.4497166872024536, "epoch": 0.8247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 0.9465872850641279, "kl": 0.02660083770751953, "learning_rate": 7.324141852089471e-07, "loss": -0.0431, "num_tokens": 11297722.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4556270837783813, "sampling/importance_sampling_ratio/mean": 0.999812126159668, "sampling/importance_sampling_ratio/min": 0.6622627377510071, "sampling/sampling_logp_difference/max": 0.412092924118042, "sampling/sampling_logp_difference/mean": 0.014465127140283585, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 199.578125, "completions/mean_terminated_length": 199.578125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.5702181458473206, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 0.7541623494996718, "kl": 0.03747157007455826, "learning_rate": 7.310455422725889e-07, "loss": -0.0138, "num_tokens": 11322991.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.346382975578308, "sampling/importance_sampling_ratio/mean": 1.0009618997573853, "sampling/importance_sampling_ratio/min": 0.7495355010032654, "sampling/sampling_logp_difference/max": 0.2974216938018799, "sampling/sampling_logp_difference/mean": 0.015652555972337723, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 131.671875, "completions/mean_terminated_length": 131.671875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4590853154659271, "epoch": 0.8283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.026950083424843858, "kl": 0.030661489814519882, "learning_rate": 7.296746946078736e-07, "loss": 0.0003, "num_tokens": 11342186.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.457200527191162, "sampling/importance_sampling_ratio/mean": 1.000671148300171, "sampling/importance_sampling_ratio/min": 0.7460749745368958, "sampling/sampling_logp_difference/max": 0.3765171766281128, "sampling/sampling_logp_difference/mean": 0.014778186567127705, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 274.890625, "completions/mean_terminated_length": 274.890625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.3902917504310608, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 0.8083030396004173, "kl": 0.03197154775261879, "learning_rate": 7.283016552959744e-07, "loss": -0.012, "num_tokens": 11369059.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4135171175003052, "sampling/importance_sampling_ratio/mean": 0.9999354481697083, "sampling/importance_sampling_ratio/min": 0.7754601240158081, "sampling/sampling_logp_difference/max": 0.346081018447876, "sampling/sampling_logp_difference/mean": 0.011018112301826477, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 218.046875, "completions/mean_terminated_length": 218.046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.4256887137889862, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.7532580614324942, "kl": 0.027467817068099976, "learning_rate": 7.26926437438978e-07, "loss": 0.0593, "num_tokens": 11393622.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4222370386123657, "sampling/importance_sampling_ratio/mean": 1.0004581212997437, "sampling/importance_sampling_ratio/min": 0.5834801197052002, "sampling/sampling_logp_difference/max": 0.5387449264526367, "sampling/sampling_logp_difference/mean": 0.012748299166560173, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 140.765625, "completions/mean_terminated_length": 140.765625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.487289696931839, "epoch": 0.8336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.3313805129963245, "kl": 0.025811290368437767, "learning_rate": 7.255490541597594e-07, "loss": 0.0247, "num_tokens": 11419495.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5287584066390991, "sampling/importance_sampling_ratio/mean": 1.0003440380096436, "sampling/importance_sampling_ratio/min": 0.70777827501297, "sampling/sampling_logp_difference/max": 0.4244558811187744, "sampling/sampling_logp_difference/mean": 0.017625022679567337, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 126.96875, "completions/mean_terminated_length": 126.96875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.356789767742157, "epoch": 0.8353982300884956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0244096963281102, "kl": 0.0201118066906929, "learning_rate": 7.241695186018573e-07, "loss": 0.0002, "num_tokens": 11436069.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.393985390663147, "sampling/importance_sampling_ratio/mean": 1.0006990432739258, "sampling/importance_sampling_ratio/min": 0.6962440013885498, "sampling/sampling_logp_difference/max": 0.36205506324768066, "sampling/sampling_logp_difference/mean": 0.01351387333124876, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 301.40625, "completions/mean_terminated_length": 301.40625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3471708297729492, "epoch": 0.8371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 0.41616857483899017, "kl": 0.026676032692193985, "learning_rate": 7.227878439293476e-07, "loss": -0.0286, "num_tokens": 11465055.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.346140742301941, "sampling/importance_sampling_ratio/mean": 1.0000357627868652, "sampling/importance_sampling_ratio/min": 0.6855998039245605, "sampling/sampling_logp_difference/max": 0.37746119499206543, "sampling/sampling_logp_difference/mean": 0.010506587103009224, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.545015811920166, "epoch": 0.8389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 1.0613731474750803, "kl": 0.034096769988536835, "learning_rate": 7.214040433267198e-07, "loss": 0.0514, "num_tokens": 11489583.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2997113466262817, "sampling/importance_sampling_ratio/mean": 1.0002450942993164, "sampling/importance_sampling_ratio/min": 0.7379943132400513, "sampling/sampling_logp_difference/max": 0.3038191795349121, "sampling/sampling_logp_difference/mean": 0.016056951135396957, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 178.78125, "completions/mean_terminated_length": 178.78125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.526360273361206, "epoch": 0.8407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 0.9978996042146862, "kl": 0.040484048426151276, "learning_rate": 7.200181299987482e-07, "loss": -0.0543, "num_tokens": 11512721.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4478073120117188, "sampling/importance_sampling_ratio/mean": 0.9992498159408569, "sampling/importance_sampling_ratio/min": 0.5596717596054077, "sampling/sampling_logp_difference/max": 0.5804047584533691, "sampling/sampling_logp_difference/mean": 0.016286306083202362, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 133.359375, "completions/mean_terminated_length": 133.359375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.4107704758644104, "epoch": 0.8424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.19391179468298175, "kl": 0.04821649193763733, "learning_rate": 7.186301171703688e-07, "loss": 0.0007, "num_tokens": 11531208.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.27157461643219, "sampling/importance_sampling_ratio/mean": 1.0001773834228516, "sampling/importance_sampling_ratio/min": 0.7676100730895996, "sampling/sampling_logp_difference/max": 0.26447343826293945, "sampling/sampling_logp_difference/mean": 0.014536736533045769, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 208.78125, "completions/mean_terminated_length": 208.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.5146007537841797, "epoch": 0.8442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 0.7245865187700515, "kl": 0.03338593617081642, "learning_rate": 7.172400180865513e-07, "loss": -0.0338, "num_tokens": 11556890.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3360331058502197, "sampling/importance_sampling_ratio/mean": 1.000069260597229, "sampling/importance_sampling_ratio/min": 0.7131832242012024, "sampling/sampling_logp_difference/max": 0.33801698684692383, "sampling/sampling_logp_difference/mean": 0.015137892216444016, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 210.140625, "completions/mean_terminated_length": 210.140625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.48041802644729614, "epoch": 0.8460176991150442, "frac_reward_zero_std": 0.75, "grad_norm": 0.8317884102824823, "kl": 0.030050743371248245, "learning_rate": 7.158478460121734e-07, "loss": -0.0393, "num_tokens": 11581667.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3866528272628784, "sampling/importance_sampling_ratio/mean": 1.0001895427703857, "sampling/importance_sampling_ratio/min": 0.7775483131408691, "sampling/sampling_logp_difference/max": 0.3268928527832031, "sampling/sampling_logp_difference/mean": 0.014050308614969254, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 355.828125, "completions/mean_terminated_length": 355.828125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5411931872367859, "epoch": 0.8477876106194691, "frac_reward_zero_std": 0.0, "grad_norm": 0.996920304121891, "kl": 0.02936280146241188, "learning_rate": 7.144536142318944e-07, "loss": -0.0493, "num_tokens": 11615976.0, "reward": 0.0, "reward_std": 0.907694935798645, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3146055936813354, "sampling/importance_sampling_ratio/mean": 0.9998599290847778, "sampling/importance_sampling_ratio/min": 0.7086941599845886, "sampling/sampling_logp_difference/max": 0.3443312644958496, "sampling/sampling_logp_difference/mean": 0.014649911783635616, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 153.390625, "completions/mean_terminated_length": 153.390625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.5173763036727905, "epoch": 0.8495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.9861125342172509, "kl": 0.04371890425682068, "learning_rate": 7.130573360500276e-07, "loss": -0.0498, "num_tokens": 11637649.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3012876510620117, "sampling/importance_sampling_ratio/mean": 0.9999483823776245, "sampling/importance_sampling_ratio/min": 0.6451591849327087, "sampling/sampling_logp_difference/max": 0.43825817108154297, "sampling/sampling_logp_difference/mean": 0.016263380646705627, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 201.078125, "completions/mean_terminated_length": 201.078125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.520511269569397, "epoch": 0.8513274336283185, "frac_reward_zero_std": 0.5, "grad_norm": 1.5591129363129872, "kl": 0.02956336922943592, "learning_rate": 7.116590247904143e-07, "loss": -0.0332, "num_tokens": 11660022.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6561579704284668, "sampling/importance_sampling_ratio/mean": 0.9996403455734253, "sampling/importance_sampling_ratio/min": 0.7768332362174988, "sampling/sampling_logp_difference/max": 0.5045003890991211, "sampling/sampling_logp_difference/mean": 0.014829604886472225, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 201.328125, "completions/mean_terminated_length": 201.328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.428012877702713, "epoch": 0.8530973451327434, "frac_reward_zero_std": 0.75, "grad_norm": 0.749202794783412, "kl": 0.029783062636852264, "learning_rate": 7.10258693796296e-07, "loss": -0.031, "num_tokens": 11683083.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4433956146240234, "sampling/importance_sampling_ratio/mean": 0.999994158744812, "sampling/importance_sampling_ratio/min": 0.7057110667228699, "sampling/sampling_logp_difference/max": 0.36699843406677246, "sampling/sampling_logp_difference/mean": 0.013292660936713219, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 251.734375, "completions/mean_terminated_length": 251.734375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.46995267271995544, "epoch": 0.8548672566371681, "frac_reward_zero_std": 0.5, "grad_norm": 0.9155829197691279, "kl": 0.029299696907401085, "learning_rate": 7.088563564301873e-07, "loss": -0.1206, "num_tokens": 11709898.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4354580640792847, "sampling/importance_sampling_ratio/mean": 1.000469446182251, "sampling/importance_sampling_ratio/min": 0.6222667098045349, "sampling/sampling_logp_difference/max": 0.47438645362854004, "sampling/sampling_logp_difference/mean": 0.014075471088290215, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 146.828125, "completions/mean_terminated_length": 146.828125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.5843732357025146, "epoch": 0.856637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.0401387492558383, "kl": 0.03785454481840134, "learning_rate": 7.074520260737487e-07, "loss": -0.0003, "num_tokens": 11731167.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2977970838546753, "sampling/importance_sampling_ratio/mean": 1.000131368637085, "sampling/importance_sampling_ratio/min": 0.7850620746612549, "sampling/sampling_logp_difference/max": 0.2606682777404785, "sampling/sampling_logp_difference/mean": 0.0165737085044384, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 86.40625, "completions/mean_terminated_length": 86.40625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.40064767003059387, "epoch": 0.8584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.03165138422522889, "kl": 0.022871503606438637, "learning_rate": 7.06045716127658e-07, "loss": 0.0002, "num_tokens": 11746777.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2716447114944458, "sampling/importance_sampling_ratio/mean": 1.0001604557037354, "sampling/importance_sampling_ratio/min": 0.7843747138977051, "sampling/sampling_logp_difference/max": 0.24286842346191406, "sampling/sampling_logp_difference/mean": 0.01485820859670639, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 187.0625, "completions/mean_terminated_length": 187.0625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4952508211135864, "epoch": 0.8601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.040254008704668, "kl": 0.03231019154191017, "learning_rate": 7.04637440011484e-07, "loss": 0.0004, "num_tokens": 11769709.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2749754190444946, "sampling/importance_sampling_ratio/mean": 0.9997749924659729, "sampling/importance_sampling_ratio/min": 0.7213035821914673, "sampling/sampling_logp_difference/max": 0.32669520378112793, "sampling/sampling_logp_difference/mean": 0.014968110248446465, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 310.078125, "completions/mean_terminated_length": 310.078125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.44478166103363037, "epoch": 0.8619469026548673, "frac_reward_zero_std": 0.75, "grad_norm": 0.5448113024227174, "kl": 0.029941804707050323, "learning_rate": 7.032272111635565e-07, "loss": -0.0247, "num_tokens": 11799426.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.322062373161316, "sampling/importance_sampling_ratio/mean": 1.0000691413879395, "sampling/importance_sampling_ratio/min": 0.691804051399231, "sampling/sampling_logp_difference/max": 0.3684525489807129, "sampling/sampling_logp_difference/mean": 0.012722712010145187, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 249.90625, "completions/mean_terminated_length": 249.90625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.500858724117279, "epoch": 0.863716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 0.8117935943001742, "kl": 0.035196833312511444, "learning_rate": 7.018150430408394e-07, "loss": -0.014, "num_tokens": 11829980.0, "reward": -0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.2399535179138184, "sampling/importance_sampling_ratio/mean": 0.999932587146759, "sampling/importance_sampling_ratio/min": 0.6622503399848938, "sampling/sampling_logp_difference/max": 0.41211163997650146, "sampling/sampling_logp_difference/mean": 0.01349402591586113, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 294.484375, "completions/mean_terminated_length": 294.484375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.48047471046447754, "epoch": 0.8654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 0.9710981844145332, "kl": 0.023888757452368736, "learning_rate": 7.004009491188022e-07, "loss": 0.0547, "num_tokens": 11861035.0, "reward": -0.125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5514986515045166, "sampling/importance_sampling_ratio/mean": 0.9999948740005493, "sampling/importance_sampling_ratio/min": 0.7841082811355591, "sampling/sampling_logp_difference/max": 0.4392213225364685, "sampling/sampling_logp_difference/mean": 0.013466393575072289, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 110.984375, "completions/mean_terminated_length": 110.984375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3945050537586212, "epoch": 0.8672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.026062915037987123, "kl": 0.024000931531190872, "learning_rate": 6.989849428912907e-07, "loss": 0.0002, "num_tokens": 11877546.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.421489953994751, "sampling/importance_sampling_ratio/mean": 0.9999309182167053, "sampling/importance_sampling_ratio/min": 0.722906231880188, "sampling/sampling_logp_difference/max": 0.35170555114746094, "sampling/sampling_logp_difference/mean": 0.01340898685157299, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 173.40625, "completions/mean_terminated_length": 173.40625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4261055886745453, "epoch": 0.8690265486725663, "frac_reward_zero_std": 1.0, "grad_norm": 0.02130291550619073, "kl": 0.0226762518286705, "learning_rate": 6.975670378703992e-07, "loss": 0.0002, "num_tokens": 11898484.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3713045120239258, "sampling/importance_sampling_ratio/mean": 1.0003994703292847, "sampling/importance_sampling_ratio/min": 0.7284289598464966, "sampling/sampling_logp_difference/max": 0.3168652057647705, "sampling/sampling_logp_difference/mean": 0.013701027259230614, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 379.046875, "completions/mean_terminated_length": 379.046875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5800562500953674, "epoch": 0.8707964601769912, "frac_reward_zero_std": 0.25, "grad_norm": 1.0104309367821631, "kl": 0.029074903577566147, "learning_rate": 6.961472475863405e-07, "loss": -0.0101, "num_tokens": 11936807.0, "reward": 0.03125, "reward_std": 0.676956295967102, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4636763334274292, "sampling/importance_sampling_ratio/mean": 0.9996291399002075, "sampling/importance_sampling_ratio/min": 0.6017234921455383, "sampling/sampling_logp_difference/max": 0.5079572200775146, "sampling/sampling_logp_difference/mean": 0.015200433321297169, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 404.375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.49595940113067627, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.25, "grad_norm": 0.7814473339955769, "kl": 0.0224757082760334, "learning_rate": 6.947255855873176e-07, "loss": 0.0091, "num_tokens": 11972207.0, "reward": 0.28125, "reward_std": 0.7517043352127075, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999433755874634, "sampling/importance_sampling_ratio/min": 0.719646155834198, "sampling/sampling_logp_difference/max": 1.1721675395965576, "sampling/sampling_logp_difference/mean": 0.013315827585756779, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 280.796875, "completions/mean_terminated_length": 280.796875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.5208157300949097, "epoch": 0.8743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 0.811775830327024, "kl": 0.026327433064579964, "learning_rate": 6.93302065439394e-07, "loss": -0.01, "num_tokens": 12004130.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3280677795410156, "sampling/importance_sampling_ratio/mean": 1.0003632307052612, "sampling/importance_sampling_ratio/min": 0.7550833225250244, "sampling/sampling_logp_difference/max": 0.28372514247894287, "sampling/sampling_logp_difference/mean": 0.014217689633369446, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 92.28125, "completions/mean_terminated_length": 92.28125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3917282223701477, "epoch": 0.8761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.027722761754140412, "kl": 0.02344999648630619, "learning_rate": 6.918767007263645e-07, "loss": 0.0002, "num_tokens": 12018932.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.27275550365448, "sampling/importance_sampling_ratio/mean": 1.000253677368164, "sampling/importance_sampling_ratio/min": 0.7841185331344604, "sampling/sampling_logp_difference/max": 0.2431950569152832, "sampling/sampling_logp_difference/mean": 0.013372492045164108, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 93.578125, "completions/mean_terminated_length": 93.578125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4542263150215149, "epoch": 0.8778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.026428982809176813, "kl": 0.02201511710882187, "learning_rate": 6.904495050496258e-07, "loss": 0.0002, "num_tokens": 12036089.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.279893159866333, "sampling/importance_sampling_ratio/mean": 1.0000813007354736, "sampling/importance_sampling_ratio/min": 0.712908148765564, "sampling/sampling_logp_difference/max": 0.33840274810791016, "sampling/sampling_logp_difference/mean": 0.01597658358514309, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 217.359375, "completions/mean_terminated_length": 217.359375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.5032815337181091, "epoch": 0.879646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 1.2087693688185344, "kl": 0.02380669116973877, "learning_rate": 6.890204920280457e-07, "loss": -0.077, "num_tokens": 12062400.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3951588869094849, "sampling/importance_sampling_ratio/mean": 1.0001604557037354, "sampling/importance_sampling_ratio/min": 0.7064918875694275, "sampling/sampling_logp_difference/max": 0.3474435806274414, "sampling/sampling_logp_difference/mean": 0.015207412652671337, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 221.078125, "completions/mean_terminated_length": 221.078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5150728225708008, "epoch": 0.8814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 0.7922492472140978, "kl": 0.03381916880607605, "learning_rate": 6.875896752978344e-07, "loss": -0.0017, "num_tokens": 12089125.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.457470417022705, "sampling/importance_sampling_ratio/mean": 1.0004377365112305, "sampling/importance_sampling_ratio/min": 0.7087637782096863, "sampling/sampling_logp_difference/max": 0.37670230865478516, "sampling/sampling_logp_difference/mean": 0.01392663735896349, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 248.078125, "completions/mean_terminated_length": 248.078125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.48234066367149353, "epoch": 0.8831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8565721435977334, "kl": 0.02796265110373497, "learning_rate": 6.861570685124134e-07, "loss": -0.0596, "num_tokens": 12115322.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.2721396684646606, "sampling/importance_sampling_ratio/mean": 0.999937117099762, "sampling/importance_sampling_ratio/min": 0.7091477513313293, "sampling/sampling_logp_difference/max": 0.343691349029541, "sampling/sampling_logp_difference/mean": 0.013156416825950146, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 250.78125, "completions/mean_terminated_length": 250.78125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.51570725440979, "epoch": 0.8849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 0.587458378766746, "kl": 0.029079455882310867, "learning_rate": 6.847226853422861e-07, "loss": 0.0065, "num_tokens": 12146124.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.478989839553833, "sampling/importance_sampling_ratio/mean": 1.0006601810455322, "sampling/importance_sampling_ratio/min": 0.7337961792945862, "sampling/sampling_logp_difference/max": 0.3913593292236328, "sampling/sampling_logp_difference/mean": 0.015173892490565777, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 111.515625, "completions/mean_terminated_length": 111.515625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.49788400530815125, "epoch": 0.8867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.03832759372292146, "kl": 0.03281484544277191, "learning_rate": 6.832865394749065e-07, "loss": 0.0004, "num_tokens": 12165261.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4023759365081787, "sampling/importance_sampling_ratio/mean": 1.0008957386016846, "sampling/importance_sampling_ratio/min": 0.6976507306098938, "sampling/sampling_logp_difference/max": 0.36003661155700684, "sampling/sampling_logp_difference/mean": 0.014865086413919926, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 174.953125, "completions/mean_terminated_length": 174.953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.4915030896663666, "epoch": 0.8884955752212389, "frac_reward_zero_std": 1.0, "grad_norm": 0.021233917709623164, "kl": 0.021638117730617523, "learning_rate": 6.818486446145486e-07, "loss": 0.0002, "num_tokens": 12187802.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4598971605300903, "sampling/importance_sampling_ratio/mean": 1.000180959701538, "sampling/importance_sampling_ratio/min": 0.7449281811714172, "sampling/sampling_logp_difference/max": 0.37836599349975586, "sampling/sampling_logp_difference/mean": 0.014047574251890182, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 126.140625, "completions/mean_terminated_length": 126.140625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.5061375498771667, "epoch": 0.8902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.030954333305085516, "kl": 0.028523443266749382, "learning_rate": 6.804090144821772e-07, "loss": 0.0003, "num_tokens": 12206371.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3339695930480957, "sampling/importance_sampling_ratio/mean": 1.0004111528396606, "sampling/importance_sampling_ratio/min": 0.7442020177841187, "sampling/sampling_logp_difference/max": 0.2954428195953369, "sampling/sampling_logp_difference/mean": 0.015156304463744164, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 248.1875, "completions/mean_terminated_length": 248.1875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.437954306602478, "epoch": 0.8920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 0.9608567984463886, "kl": 0.024916578084230423, "learning_rate": 6.789676628153143e-07, "loss": -0.2257, "num_tokens": 12234015.0, "reward": 0.40625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.2881041765213013, "sampling/importance_sampling_ratio/mean": 1.000047206878662, "sampling/importance_sampling_ratio/min": 0.7708719968795776, "sampling/sampling_logp_difference/max": 0.26023292541503906, "sampling/sampling_logp_difference/mean": 0.01334239263087511, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 181.0625, "completions/mean_terminated_length": 181.0625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.43932265043258667, "epoch": 0.8938053097345132, "frac_reward_zero_std": 1.0, "grad_norm": 0.020468985707559374, "kl": 0.02096303179860115, "learning_rate": 6.775246033679104e-07, "loss": 0.0002, "num_tokens": 12256147.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3998541831970215, "sampling/importance_sampling_ratio/mean": 0.9996835589408875, "sampling/importance_sampling_ratio/min": 0.6922141909599304, "sampling/sampling_logp_difference/max": 0.3678598403930664, "sampling/sampling_logp_difference/mean": 0.013049916364252567, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 230.359375, "completions/mean_terminated_length": 230.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.5136737823486328, "epoch": 0.8955752212389381, "frac_reward_zero_std": 0.5, "grad_norm": 1.0557140774475746, "kl": 0.027632474899291992, "learning_rate": 6.76079849910212e-07, "loss": 0.0168, "num_tokens": 12281242.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.287634253501892, "sampling/importance_sampling_ratio/mean": 1.0004892349243164, "sampling/importance_sampling_ratio/min": 0.7725920081138611, "sampling/sampling_logp_difference/max": 0.25800418853759766, "sampling/sampling_logp_difference/mean": 0.014434251934289932, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 95.234375, "completions/mean_terminated_length": 95.234375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.3628949522972107, "epoch": 0.8973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.02127218034284862, "kl": 0.01586075872182846, "learning_rate": 6.746334162286307e-07, "loss": 0.0002, "num_tokens": 12297065.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3714330196380615, "sampling/importance_sampling_ratio/mean": 1.00048828125, "sampling/importance_sampling_ratio/min": 0.7775425314903259, "sampling/sampling_logp_difference/max": 0.3158562183380127, "sampling/sampling_logp_difference/mean": 0.012748388573527336, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 145.03125, "completions/mean_terminated_length": 145.03125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.43298447132110596, "epoch": 0.8991150442477877, "frac_reward_zero_std": 0.75, "grad_norm": 1.083759226493711, "kl": 0.023886151611804962, "learning_rate": 6.731853161256113e-07, "loss": 0.0186, "num_tokens": 12315611.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3225853443145752, "sampling/importance_sampling_ratio/mean": 1.0002093315124512, "sampling/importance_sampling_ratio/min": 0.7806451320648193, "sampling/sampling_logp_difference/max": 0.27958840131759644, "sampling/sampling_logp_difference/mean": 0.013250073418021202, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 151.265625, "completions/mean_terminated_length": 151.265625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.4795664846897125, "epoch": 0.9008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.02265181429566962, "kl": 0.023300642147660255, "learning_rate": 6.717355634195004e-07, "loss": 0.0003, "num_tokens": 12335916.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4300289154052734, "sampling/importance_sampling_ratio/mean": 0.9999045133590698, "sampling/importance_sampling_ratio/min": 0.7733750939369202, "sampling/sampling_logp_difference/max": 0.3576946258544922, "sampling/sampling_logp_difference/mean": 0.014130566269159317, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 257.515625, "completions/mean_terminated_length": 257.515625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.5581847429275513, "epoch": 0.9026548672566371, "frac_reward_zero_std": 0.25, "grad_norm": 1.0692824810420285, "kl": 0.03280746191740036, "learning_rate": 6.70284171944414e-07, "loss": 0.0547, "num_tokens": 12362781.0, "reward": 0.28125, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4066280126571655, "sampling/importance_sampling_ratio/mean": 1.0004000663757324, "sampling/importance_sampling_ratio/min": 0.7918227314949036, "sampling/sampling_logp_difference/max": 0.34119534492492676, "sampling/sampling_logp_difference/mean": 0.015058835968375206, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5470117330551147, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.2830717427958098, "kl": 0.03015040047466755, "learning_rate": 6.688311555501063e-07, "loss": -0.0028, "num_tokens": 12383213.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.358778715133667, "sampling/importance_sampling_ratio/mean": 0.9998684525489807, "sampling/importance_sampling_ratio/min": 0.699345588684082, "sampling/sampling_logp_difference/max": 0.35761022567749023, "sampling/sampling_logp_difference/mean": 0.015695251524448395, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 225.59375, "completions/mean_terminated_length": 225.59375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.48056620359420776, "epoch": 0.9061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 0.8462025301089073, "kl": 0.02353088930249214, "learning_rate": 6.673765281018372e-07, "loss": -0.0118, "num_tokens": 12407459.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3631311655044556, "sampling/importance_sampling_ratio/mean": 0.9997181296348572, "sampling/importance_sampling_ratio/min": 0.7787919044494629, "sampling/sampling_logp_difference/max": 0.3097844123840332, "sampling/sampling_logp_difference/mean": 0.013789271004498005, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4893641471862793, "epoch": 0.9079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.2104936670520567, "kl": 0.021346217021346092, "learning_rate": 6.659203034802396e-07, "loss": -0.0918, "num_tokens": 12428583.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.444678783416748, "sampling/importance_sampling_ratio/mean": 1.0006427764892578, "sampling/importance_sampling_ratio/min": 0.6547064185142517, "sampling/sampling_logp_difference/max": 0.42356836795806885, "sampling/sampling_logp_difference/mean": 0.014761978760361671, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 216.578125, "completions/mean_terminated_length": 216.578125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.501872718334198, "epoch": 0.9097345132743363, "frac_reward_zero_std": 0.5, "grad_norm": 1.0593924110509019, "kl": 0.02280062437057495, "learning_rate": 6.644624955811873e-07, "loss": -0.0701, "num_tokens": 12460524.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3488348722457886, "sampling/importance_sampling_ratio/mean": 1.0000243186950684, "sampling/importance_sampling_ratio/min": 0.5989245772361755, "sampling/sampling_logp_difference/max": 0.5126196146011353, "sampling/sampling_logp_difference/mean": 0.014903506264090538, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 165.78125, "completions/mean_terminated_length": 165.78125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4183484613895416, "epoch": 0.911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.02083613207126292, "kl": 0.01958419755101204, "learning_rate": 6.630031183156627e-07, "loss": 0.0002, "num_tokens": 12480862.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4310715198516846, "sampling/importance_sampling_ratio/mean": 0.999096155166626, "sampling/importance_sampling_ratio/min": 0.7688083052635193, "sampling/sampling_logp_difference/max": 0.35842347145080566, "sampling/sampling_logp_difference/mean": 0.01297256164252758, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 186.484375, "completions/mean_terminated_length": 186.484375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.5136644244194031, "epoch": 0.9132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 1.2098354686328954, "kl": 0.024713782593607903, "learning_rate": 6.61542185609623e-07, "loss": -0.0535, "num_tokens": 12504925.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3446333408355713, "sampling/importance_sampling_ratio/mean": 1.0000817775726318, "sampling/importance_sampling_ratio/min": 0.7615574598312378, "sampling/sampling_logp_difference/max": 0.29612135887145996, "sampling/sampling_logp_difference/mean": 0.014643507078289986, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 272.140625, "completions/mean_terminated_length": 272.140625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.36052703857421875, "epoch": 0.9150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 0.6169708718976682, "kl": 0.02099330723285675, "learning_rate": 6.60079711403869e-07, "loss": -0.0054, "num_tokens": 12534150.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5027910470962524, "sampling/importance_sampling_ratio/mean": 0.9998451471328735, "sampling/importance_sampling_ratio/min": 0.7137043476104736, "sampling/sampling_logp_difference/max": 0.40732407569885254, "sampling/sampling_logp_difference/mean": 0.01117715984582901, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 281.890625, "completions/mean_terminated_length": 281.890625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3235178589820862, "epoch": 0.9168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.6358294125912906, "kl": 0.01651156321167946, "learning_rate": 6.586157096539104e-07, "loss": 0.0054, "num_tokens": 12561759.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4152244329452515, "sampling/importance_sampling_ratio/mean": 0.9997048377990723, "sampling/importance_sampling_ratio/min": 0.7470855712890625, "sampling/sampling_logp_difference/max": 0.3472881317138672, "sampling/sampling_logp_difference/mean": 0.011585024185478687, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 159.3125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.4212814271450043, "epoch": 0.9185840707964602, "frac_reward_zero_std": 0.75, "grad_norm": 1.112715778004349, "kl": 0.018092583864927292, "learning_rate": 6.571501943298335e-07, "loss": 0.0761, "num_tokens": 12581347.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4491983652114868, "sampling/importance_sampling_ratio/mean": 0.9996023178100586, "sampling/importance_sampling_ratio/min": 0.7069697380065918, "sampling/sampling_logp_difference/max": 0.37101054191589355, "sampling/sampling_logp_difference/mean": 0.013572558760643005, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 226.109375, "completions/mean_terminated_length": 226.109375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.4114120602607727, "epoch": 0.9203539823008849, "frac_reward_zero_std": 1.0, "grad_norm": 0.02629405312251763, "kl": 0.024458853527903557, "learning_rate": 6.556831794161677e-07, "loss": 0.0002, "num_tokens": 12605706.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2728180885314941, "sampling/importance_sampling_ratio/mean": 0.999756932258606, "sampling/importance_sampling_ratio/min": 0.7169108986854553, "sampling/sampling_logp_difference/max": 0.33280372619628906, "sampling/sampling_logp_difference/mean": 0.01283460296690464, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 301.4375, "completions/mean_terminated_length": 301.4375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.31713157892227173, "epoch": 0.9221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.01771430757528254, "kl": 0.017292920500040054, "learning_rate": 6.542146789117523e-07, "loss": 0.0002, "num_tokens": 12634262.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6303718090057373, "sampling/importance_sampling_ratio/mean": 1.0002166032791138, "sampling/importance_sampling_ratio/min": 0.6520615816116333, "sampling/sampling_logp_difference/max": 0.48880815505981445, "sampling/sampling_logp_difference/mean": 0.010217871516942978, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 149.390625, "completions/mean_terminated_length": 149.390625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4099932610988617, "epoch": 0.9238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.026490032648893623, "kl": 0.02425050549209118, "learning_rate": 6.527447068296025e-07, "loss": 0.0002, "num_tokens": 12653295.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6009069681167603, "sampling/importance_sampling_ratio/mean": 0.9999533295631409, "sampling/importance_sampling_ratio/min": 0.7815190553665161, "sampling/sampling_logp_difference/max": 0.47057032585144043, "sampling/sampling_logp_difference/mean": 0.013794412836432457, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.44003963470458984, "epoch": 0.9256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.027804375945801128, "kl": 0.024549465626478195, "learning_rate": 6.512732771967758e-07, "loss": 0.0003, "num_tokens": 12674947.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3827100992202759, "sampling/importance_sampling_ratio/mean": 0.9999372959136963, "sampling/importance_sampling_ratio/min": 0.6235314011573792, "sampling/sampling_logp_difference/max": 0.47235608100891113, "sampling/sampling_logp_difference/mean": 0.013607329688966274, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 347.875, "completions/mean_terminated_length": 347.875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.4221234619617462, "epoch": 0.9274336283185841, "frac_reward_zero_std": 0.75, "grad_norm": 0.5967749438246166, "kl": 0.021170098334550858, "learning_rate": 6.498004040542384e-07, "loss": -0.0175, "num_tokens": 12713307.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4646762609481812, "sampling/importance_sampling_ratio/mean": 0.9992688894271851, "sampling/importance_sampling_ratio/min": 0.5447493195533752, "sampling/sampling_logp_difference/max": 0.6074295043945312, "sampling/sampling_logp_difference/mean": 0.012210089713335037, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 187.40625, "completions/mean_terminated_length": 187.40625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3938935101032257, "epoch": 0.9292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.024158492161167302, "kl": 0.021889664232730865, "learning_rate": 6.483261014567311e-07, "loss": 0.0002, "num_tokens": 12736677.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2892292737960815, "sampling/importance_sampling_ratio/mean": 1.0005744695663452, "sampling/importance_sampling_ratio/min": 0.7308642268180847, "sampling/sampling_logp_difference/max": 0.31352758407592773, "sampling/sampling_logp_difference/mean": 0.013046994805335999, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 281.65625, "completions/mean_terminated_length": 281.65625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.45261669158935547, "epoch": 0.9309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 0.6327460765290148, "kl": 0.02436748333275318, "learning_rate": 6.468503834726349e-07, "loss": -0.0326, "num_tokens": 12767055.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998573064804077, "sampling/importance_sampling_ratio/min": 0.734407365322113, "sampling/sampling_logp_difference/max": 0.7483378648757935, "sampling/sampling_logp_difference/mean": 0.01314487773925066, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 267.390625, "completions/mean_terminated_length": 267.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.45160195231437683, "epoch": 0.9327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 0.7183711817559669, "kl": 0.023968249559402466, "learning_rate": 6.453732641838371e-07, "loss": -0.0213, "num_tokens": 12795896.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3704925775527954, "sampling/importance_sampling_ratio/mean": 1.0001479387283325, "sampling/importance_sampling_ratio/min": 0.7264717221260071, "sampling/sampling_logp_difference/max": 0.31955575942993164, "sampling/sampling_logp_difference/mean": 0.013902518898248672, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 380.828125, "completions/mean_terminated_length": 380.828125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5013418197631836, "epoch": 0.9345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.6441107034909824, "kl": 0.026820922270417213, "learning_rate": 6.438947576855966e-07, "loss": -0.0011, "num_tokens": 12837037.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4478718042373657, "sampling/importance_sampling_ratio/mean": 0.9999604225158691, "sampling/importance_sampling_ratio/min": 0.7244784235954285, "sampling/sampling_logp_difference/max": 0.37009477615356445, "sampling/sampling_logp_difference/mean": 0.014299528673291206, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 441.65625, "completions/mean_terminated_length": 441.65625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.42085015773773193, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.5, "grad_norm": 0.6555197672825456, "kl": 0.02555309794843197, "learning_rate": 6.424148780864103e-07, "loss": -0.0493, "num_tokens": 12878103.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3809754848480225, "sampling/importance_sampling_ratio/mean": 0.9996974468231201, "sampling/importance_sampling_ratio/min": 0.5096358060836792, "sampling/sampling_logp_difference/max": 0.6740589141845703, "sampling/sampling_logp_difference/mean": 0.011325279250741005, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.37050145864486694, "epoch": 0.9380530973451328, "frac_reward_zero_std": 0.75, "grad_norm": 0.579379794790801, "kl": 0.02218376100063324, "learning_rate": 6.409336395078771e-07, "loss": 0.0102, "num_tokens": 12912167.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4542145729064941, "sampling/importance_sampling_ratio/mean": 1.0002388954162598, "sampling/importance_sampling_ratio/min": 0.695831298828125, "sampling/sampling_logp_difference/max": 0.3744659423828125, "sampling/sampling_logp_difference/mean": 0.012211505323648453, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 210.640625, "completions/mean_terminated_length": 210.640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.5094162225723267, "epoch": 0.9398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 1.1624307716023343, "kl": 0.033237241208553314, "learning_rate": 6.394510560845636e-07, "loss": -0.0485, "num_tokens": 12942208.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.36483633518219, "sampling/importance_sampling_ratio/mean": 0.999850332736969, "sampling/importance_sampling_ratio/min": 0.7761027216911316, "sampling/sampling_logp_difference/max": 0.3110344409942627, "sampling/sampling_logp_difference/mean": 0.015838120132684708, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.39384061098098755, "epoch": 0.9415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.021540675016173363, "kl": 0.024982232600450516, "learning_rate": 6.379671419638702e-07, "loss": 0.0003, "num_tokens": 12975152.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.501024603843689, "sampling/importance_sampling_ratio/mean": 0.999975860118866, "sampling/importance_sampling_ratio/min": 0.7136148810386658, "sampling/sampling_logp_difference/max": 0.40614795684814453, "sampling/sampling_logp_difference/mean": 0.011872950941324234, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 264.578125, "completions/mean_terminated_length": 264.578125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.40402644872665405, "epoch": 0.9433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 0.8274407738527938, "kl": 0.022723723202943802, "learning_rate": 6.364819113058951e-07, "loss": 0.1119, "num_tokens": 13012165.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997878670692444, "sampling/importance_sampling_ratio/min": 0.5187135338783264, "sampling/sampling_logp_difference/max": 1.018458366394043, "sampling/sampling_logp_difference/mean": 0.012855648063123226, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 337.578125, "completions/mean_terminated_length": 337.578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4513665437698364, "epoch": 0.9451327433628318, "frac_reward_zero_std": 0.75, "grad_norm": 0.8208460167458104, "kl": 0.02092919498682022, "learning_rate": 6.349953782832991e-07, "loss": 0.0135, "num_tokens": 13044890.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5612350702285767, "sampling/importance_sampling_ratio/mean": 0.9995930790901184, "sampling/importance_sampling_ratio/min": 0.6993576288223267, "sampling/sampling_logp_difference/max": 0.4454772472381592, "sampling/sampling_logp_difference/mean": 0.01442214846611023, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 355.828125, "completions/mean_terminated_length": 355.828125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5425180792808533, "epoch": 0.9469026548672567, "frac_reward_zero_std": 0.25, "grad_norm": 0.9671309953856718, "kl": 0.03844757378101349, "learning_rate": 6.335075570811708e-07, "loss": 0.003, "num_tokens": 13079679.0, "reward": 0.6875, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4113689661026, "sampling/importance_sampling_ratio/mean": 0.9998151063919067, "sampling/importance_sampling_ratio/min": 0.6899908185005188, "sampling/sampling_logp_difference/max": 0.3710770606994629, "sampling/sampling_logp_difference/mean": 0.01512528769671917, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 102.671875, "completions/mean_terminated_length": 102.671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.36823874711990356, "epoch": 0.9486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.032894680360928595, "kl": 0.022162236273288727, "learning_rate": 6.320184618968914e-07, "loss": 0.0002, "num_tokens": 13095930.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4023869037628174, "sampling/importance_sampling_ratio/mean": 1.000161051750183, "sampling/importance_sampling_ratio/min": 0.7880635261535645, "sampling/sampling_logp_difference/max": 0.33817577362060547, "sampling/sampling_logp_difference/mean": 0.013399723917245865, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 293.4375, "completions/mean_terminated_length": 293.4375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5144325494766235, "epoch": 0.9504424778761061, "frac_reward_zero_std": 0.5, "grad_norm": 0.9027615557339758, "kl": 0.036306578665971756, "learning_rate": 6.305281069399988e-07, "loss": -0.1264, "num_tokens": 13125638.0, "reward": 0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.403609275817871, "sampling/importance_sampling_ratio/mean": 1.0003875494003296, "sampling/importance_sampling_ratio/min": 0.7172664999961853, "sampling/sampling_logp_difference/max": 0.3390469551086426, "sampling/sampling_logp_difference/mean": 0.015700161457061768, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 388.140625, "completions/mean_terminated_length": 388.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.41762110590934753, "epoch": 0.952212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 0.6238030608655667, "kl": 0.02695799432694912, "learning_rate": 6.290365064320519e-07, "loss": -0.013, "num_tokens": 13161295.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5369281768798828, "sampling/importance_sampling_ratio/mean": 1.0002576112747192, "sampling/importance_sampling_ratio/min": 0.6712830662727356, "sampling/sampling_logp_difference/max": 0.42978572845458984, "sampling/sampling_logp_difference/mean": 0.012539315968751907, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 262.671875, "completions/mean_terminated_length": 262.671875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5008562207221985, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.75, "grad_norm": 0.5755909915540746, "kl": 0.037118975073099136, "learning_rate": 6.275436746064956e-07, "loss": -0.0008, "num_tokens": 13189242.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5072718858718872, "sampling/importance_sampling_ratio/mean": 0.9997512102127075, "sampling/importance_sampling_ratio/min": 0.7031087875366211, "sampling/sampling_logp_difference/max": 0.4103013277053833, "sampling/sampling_logp_difference/mean": 0.01381722278892994, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 250.6875, "completions/mean_terminated_length": 250.6875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.44269657135009766, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.75, "grad_norm": 0.725654799349624, "kl": 0.031984515488147736, "learning_rate": 6.260496257085239e-07, "loss": -0.0296, "num_tokens": 13215430.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.2694391012191772, "sampling/importance_sampling_ratio/mean": 1.0000256299972534, "sampling/importance_sampling_ratio/min": 0.7199440598487854, "sampling/sampling_logp_difference/max": 0.3285818099975586, "sampling/sampling_logp_difference/mean": 0.013542082160711288, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 302.609375, "completions/mean_terminated_length": 302.609375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.42221054434776306, "epoch": 0.9575221238938053, "frac_reward_zero_std": 0.75, "grad_norm": 0.5039082725367354, "kl": 0.026738828048110008, "learning_rate": 6.245543739949453e-07, "loss": 0.0078, "num_tokens": 13245773.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4075931310653687, "sampling/importance_sampling_ratio/mean": 0.9997868537902832, "sampling/importance_sampling_ratio/min": 0.6801050305366516, "sampling/sampling_logp_difference/max": 0.38550806045532227, "sampling/sampling_logp_difference/mean": 0.012500166893005371, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 215.78125, "completions/mean_terminated_length": 215.78125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.40436482429504395, "epoch": 0.95929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.025883781081769624, "kl": 0.026466555893421173, "learning_rate": 6.230579337340456e-07, "loss": 0.0003, "num_tokens": 13270223.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3615260124206543, "sampling/importance_sampling_ratio/mean": 1.0003719329833984, "sampling/importance_sampling_ratio/min": 0.698758602142334, "sampling/sampling_logp_difference/max": 0.35844993591308594, "sampling/sampling_logp_difference/mean": 0.012754770927131176, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 143.234375, "completions/mean_terminated_length": 143.234375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.39504268765449524, "epoch": 0.9610619469026549, "frac_reward_zero_std": 1.0, "grad_norm": 0.030834978602436714, "kl": 0.021094808354973793, "learning_rate": 6.215603192054521e-07, "loss": 0.0002, "num_tokens": 13288878.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2980706691741943, "sampling/importance_sampling_ratio/mean": 0.9994897842407227, "sampling/importance_sampling_ratio/min": 0.748090922832489, "sampling/sampling_logp_difference/max": 0.29023075103759766, "sampling/sampling_logp_difference/mean": 0.01344493217766285, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 104.734375, "completions/mean_terminated_length": 104.734375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3789525032043457, "epoch": 0.9628318584070796, "frac_reward_zero_std": 1.0, "grad_norm": 0.03569685045556069, "kl": 0.025061320513486862, "learning_rate": 6.200615446999981e-07, "loss": 0.0003, "num_tokens": 13305533.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5690453052520752, "sampling/importance_sampling_ratio/mean": 0.9997316598892212, "sampling/importance_sampling_ratio/min": 0.6368664503097534, "sampling/sampling_logp_difference/max": 0.45119524002075195, "sampling/sampling_logp_difference/mean": 0.013636662624776363, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 242.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4651487469673157, "epoch": 0.9646017699115044, "frac_reward_zero_std": 0.75, "grad_norm": 0.7687754562927372, "kl": 0.03860689327120781, "learning_rate": 6.185616245195848e-07, "loss": 0.0376, "num_tokens": 13334165.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.2807093858718872, "sampling/importance_sampling_ratio/mean": 0.9996968507766724, "sampling/importance_sampling_ratio/min": 0.7003874778747559, "sampling/sampling_logp_difference/max": 0.3561215400695801, "sampling/sampling_logp_difference/mean": 0.0130031518638134, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 488.390625, "completions/mean_terminated_length": 488.390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.4143977761268616, "epoch": 0.9663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 0.6028894278535177, "kl": 0.026027612388134003, "learning_rate": 6.170605729770469e-07, "loss": -0.0458, "num_tokens": 13379534.0, "reward": -0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3819230794906616, "sampling/importance_sampling_ratio/mean": 0.9997913837432861, "sampling/importance_sampling_ratio/min": 0.6546958684921265, "sampling/sampling_logp_difference/max": 0.4235844612121582, "sampling/sampling_logp_difference/mean": 0.011837435886263847, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 236.140625, "completions/mean_terminated_length": 236.140625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.396547794342041, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.6599271162266507, "kl": 0.026850953698158264, "learning_rate": 6.155584043960143e-07, "loss": 0.036, "num_tokens": 13404199.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.277846336364746, "sampling/importance_sampling_ratio/mean": 0.9997128844261169, "sampling/importance_sampling_ratio/min": 0.7155342102050781, "sampling/sampling_logp_difference/max": 0.33472585678100586, "sampling/sampling_logp_difference/mean": 0.012825481593608856, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 207.890625, "completions/mean_terminated_length": 207.890625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.45791032910346985, "epoch": 0.9699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.03250768221644379, "kl": 0.03187892213463783, "learning_rate": 6.140551331107766e-07, "loss": 0.0003, "num_tokens": 13428336.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.272735834121704, "sampling/importance_sampling_ratio/mean": 1.000754714012146, "sampling/importance_sampling_ratio/min": 0.7218002080917358, "sampling/sampling_logp_difference/max": 0.3260068893432617, "sampling/sampling_logp_difference/mean": 0.012514041736721992, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 335.703125, "completions/mean_terminated_length": 335.703125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.34679436683654785, "epoch": 0.9716814159292035, "frac_reward_zero_std": 1.0, "grad_norm": 0.01784709334922375, "kl": 0.020453231409192085, "learning_rate": 6.125507734661458e-07, "loss": 0.0002, "num_tokens": 13462509.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.564109206199646, "sampling/importance_sampling_ratio/mean": 0.9998632669448853, "sampling/importance_sampling_ratio/min": 0.6782023906707764, "sampling/sampling_logp_difference/max": 0.4473165273666382, "sampling/sampling_logp_difference/mean": 0.010892463847994804, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 136.15625, "completions/mean_terminated_length": 136.15625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.382659912109375, "epoch": 0.9734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.02325718683461479, "kl": 0.019123533740639687, "learning_rate": 6.110453398173187e-07, "loss": 0.0002, "num_tokens": 13480423.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3233411312103271, "sampling/importance_sampling_ratio/mean": 0.999683678150177, "sampling/importance_sampling_ratio/min": 0.7012141942977905, "sampling/sampling_logp_difference/max": 0.35494184494018555, "sampling/sampling_logp_difference/mean": 0.012873586267232895, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 217.234375, "completions/mean_terminated_length": 217.234375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.45072633028030396, "epoch": 0.9752212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 0.9998902172921925, "kl": 0.03590782359242439, "learning_rate": 6.095388465297418e-07, "loss": -0.0048, "num_tokens": 13504838.0, "reward": 0.5625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2763832807540894, "sampling/importance_sampling_ratio/mean": 0.9996399879455566, "sampling/importance_sampling_ratio/min": 0.6401306986808777, "sampling/sampling_logp_difference/max": 0.4460829496383667, "sampling/sampling_logp_difference/mean": 0.013903114013373852, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 322.03125, "completions/mean_terminated_length": 322.03125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5136559009552002, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 0.46612861743723794, "kl": 0.02835317887365818, "learning_rate": 6.080313079789723e-07, "loss": -0.0167, "num_tokens": 13537928.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4303278923034668, "sampling/importance_sampling_ratio/mean": 0.9996492862701416, "sampling/importance_sampling_ratio/min": 0.6369600296020508, "sampling/sampling_logp_difference/max": 0.4510483741760254, "sampling/sampling_logp_difference/mean": 0.014163967221975327, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 245.46875, "completions/mean_terminated_length": 245.46875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.42518749833106995, "epoch": 0.9787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 0.6598414490159867, "kl": 0.028329703956842422, "learning_rate": 6.065227385505421e-07, "loss": -0.0016, "num_tokens": 13563446.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4395016431808472, "sampling/importance_sampling_ratio/mean": 0.9997446537017822, "sampling/importance_sampling_ratio/min": 0.7862369418144226, "sampling/sampling_logp_difference/max": 0.36429691314697266, "sampling/sampling_logp_difference/mean": 0.012089397758245468, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 326.578125, "completions/mean_terminated_length": 326.578125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.32329261302948, "epoch": 0.9805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.017604491353214135, "kl": 0.018136534839868546, "learning_rate": 6.050131526398201e-07, "loss": 0.0001, "num_tokens": 13594299.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.491687536239624, "sampling/importance_sampling_ratio/mean": 0.9996800422668457, "sampling/importance_sampling_ratio/min": 0.6273070573806763, "sampling/sampling_logp_difference/max": 0.46631908416748047, "sampling/sampling_logp_difference/mean": 0.010580826550722122, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5251365303993225, "epoch": 0.9823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.2161930268489385, "kl": 0.025617901235818863, "learning_rate": 6.035025646518746e-07, "loss": -0.0045, "num_tokens": 13621899.0, "reward": 0.03125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.295454502105713, "sampling/importance_sampling_ratio/mean": 0.9993857145309448, "sampling/importance_sampling_ratio/min": 0.7769662141799927, "sampling/sampling_logp_difference/max": 0.2588615417480469, "sampling/sampling_logp_difference/mean": 0.014331163838505745, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.38137808442115784, "epoch": 0.984070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02944732431574343, "kl": 0.02904684469103813, "learning_rate": 6.019909890013366e-07, "loss": 0.0003, "num_tokens": 13649091.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.44140625, "sampling/importance_sampling_ratio/mean": 0.9995729923248291, "sampling/importance_sampling_ratio/min": 0.785086452960968, "sampling/sampling_logp_difference/max": 0.3656191825866699, "sampling/sampling_logp_difference/mean": 0.011895684525370598, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5476418733596802, "epoch": 0.9858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 0.7819956140072826, "kl": 0.03881065174937248, "learning_rate": 6.004784401122612e-07, "loss": -0.039, "num_tokens": 13684595.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6007577180862427, "sampling/importance_sampling_ratio/mean": 1.0001033544540405, "sampling/importance_sampling_ratio/min": 0.6461929678916931, "sampling/sampling_logp_difference/max": 0.4704771041870117, "sampling/sampling_logp_difference/mean": 0.014676381833851337, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3928982615470886, "epoch": 0.9876106194690265, "frac_reward_zero_std": 0.75, "grad_norm": 0.9756861179930788, "kl": 0.02087804675102234, "learning_rate": 5.98964932417991e-07, "loss": -0.0212, "num_tokens": 13708267.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3034518957138062, "sampling/importance_sampling_ratio/mean": 1.0004992485046387, "sampling/importance_sampling_ratio/min": 0.7129567861557007, "sampling/sampling_logp_difference/max": 0.33833450078964233, "sampling/sampling_logp_difference/mean": 0.012889813631772995, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 205.28125, "completions/mean_terminated_length": 205.28125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.5221399068832397, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.5, "grad_norm": 1.2467977646081803, "kl": 0.0359196662902832, "learning_rate": 5.974504803610178e-07, "loss": 0.0074, "num_tokens": 13733437.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4236117601394653, "sampling/importance_sampling_ratio/mean": 0.9997265934944153, "sampling/importance_sampling_ratio/min": 0.7364019155502319, "sampling/sampling_logp_difference/max": 0.3531970977783203, "sampling/sampling_logp_difference/mean": 0.014655915088951588, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 296.15625, "completions/mean_terminated_length": 296.15625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.4537449777126312, "epoch": 0.9911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.03183405274393796, "kl": 0.028165467083454132, "learning_rate": 5.959350983928445e-07, "loss": 0.0003, "num_tokens": 13765063.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5579522848129272, "sampling/importance_sampling_ratio/mean": 1.0000855922698975, "sampling/importance_sampling_ratio/min": 0.7263023257255554, "sampling/sampling_logp_difference/max": 0.44337230920791626, "sampling/sampling_logp_difference/mean": 0.013872995972633362, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.46611136198043823, "epoch": 0.9929203539823008, "frac_reward_zero_std": 1.0, "grad_norm": 0.04223792102892499, "kl": 0.030913997441530228, "learning_rate": 5.944188009738483e-07, "loss": 0.0003, "num_tokens": 13795523.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4326947927474976, "sampling/importance_sampling_ratio/mean": 1.000631332397461, "sampling/importance_sampling_ratio/min": 0.7577334642410278, "sampling/sampling_logp_difference/max": 0.3595571517944336, "sampling/sampling_logp_difference/mean": 0.013823620975017548, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3631245791912079, "epoch": 0.9946902654867257, "frac_reward_zero_std": 1.0, "grad_norm": 0.02802293741419622, "kl": 0.020053338259458542, "learning_rate": 5.929016025731413e-07, "loss": 0.0002, "num_tokens": 13821195.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.314363718032837, "sampling/importance_sampling_ratio/mean": 1.0004332065582275, "sampling/importance_sampling_ratio/min": 0.7174208164215088, "sampling/sampling_logp_difference/max": 0.3320927619934082, "sampling/sampling_logp_difference/mean": 0.011732209473848343, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3391154706478119, "epoch": 0.9964601769911504, "frac_reward_zero_std": 1.0, "grad_norm": 0.019168099167691532, "kl": 0.015857823193073273, "learning_rate": 5.913835176684334e-07, "loss": 0.0002, "num_tokens": 13841523.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2994259595870972, "sampling/importance_sampling_ratio/mean": 1.0002155303955078, "sampling/importance_sampling_ratio/min": 0.7582544088363647, "sampling/sampling_logp_difference/max": 0.2767362594604492, "sampling/sampling_logp_difference/mean": 0.012182366102933884, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 461.734375, "completions/mean_terminated_length": 461.734375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5233696103096008, "epoch": 0.9982300884955753, "frac_reward_zero_std": 0.25, "grad_norm": 0.7636498734062086, "kl": 0.04132012277841568, "learning_rate": 5.89864560745894e-07, "loss": 0.0354, "num_tokens": 13881858.0, "reward": -0.0625, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4060444831848145, "sampling/importance_sampling_ratio/mean": 1.000087022781372, "sampling/importance_sampling_ratio/min": 0.6333706974983215, "sampling/sampling_logp_difference/max": 0.4566994607448578, "sampling/sampling_logp_difference/mean": 0.013759579509496689, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 295.078125, "completions/mean_terminated_length": 295.078125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4168611168861389, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.6565084881388615, "kl": 0.028434766456484795, "learning_rate": 5.883447463000135e-07, "loss": 0.0009, "num_tokens": 13910535.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4067094326019287, "sampling/importance_sampling_ratio/mean": 0.9999887347221375, "sampling/importance_sampling_ratio/min": 0.7865856885910034, "sampling/sampling_logp_difference/max": 0.34125328063964844, "sampling/sampling_logp_difference/mean": 0.012332264333963394, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 90.78125, "completions/mean_terminated_length": 90.78125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.36798930168151855, "epoch": 1.0017699115044247, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322500627253227, "kl": 0.020650256425142288, "learning_rate": 5.868240888334652e-07, "loss": 0.0002, "num_tokens": 13925833.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2987287044525146, "sampling/importance_sampling_ratio/mean": 1.0002477169036865, "sampling/importance_sampling_ratio/min": 0.7757216095924377, "sampling/sampling_logp_difference/max": 0.2613859176635742, "sampling/sampling_logp_difference/mean": 0.01316915638744831, "step": 566 } ], "logging_steps": 1, "max_steps": 1130, "num_input_tokens_seen": 13925833, "num_train_epochs": 2, "save_steps": 283, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }