{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 24, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3262875378131866, "epoch": 0.19047619047619047, "grad_norm": 0.42746037244796753, "learning_rate": 0.0, "loss": 2.5209, "mean_token_accuracy": 0.5272354185581207, "num_tokens": 3907.0, "step": 1 }, { "entropy": 1.4145832359790802, "epoch": 0.38095238095238093, "grad_norm": 0.38449376821517944, "learning_rate": 2e-05, "loss": 2.5619, "mean_token_accuracy": 0.5107117593288422, "num_tokens": 7798.0, "step": 2 }, { "entropy": 1.3320209383964539, "epoch": 0.5714285714285714, "grad_norm": 0.43162596225738525, "learning_rate": 1.9916173514326978e-05, "loss": 2.5132, "mean_token_accuracy": 0.5169796347618103, "num_tokens": 11669.0, "step": 3 }, { "entropy": 1.460325837135315, "epoch": 0.7619047619047619, "grad_norm": 0.39215895533561707, "learning_rate": 1.9666255586130196e-05, "loss": 2.6913, "mean_token_accuracy": 0.49600744992494583, "num_tokens": 15400.0, "step": 4 }, { "entropy": 1.3482075929641724, "epoch": 0.9523809523809523, "grad_norm": 0.41734573245048523, "learning_rate": 1.925490171354908e-05, "loss": 2.5605, "mean_token_accuracy": 0.5217743217945099, "num_tokens": 19278.0, "step": 5 }, { "entropy": 1.2881449460983276, "epoch": 1.0, "grad_norm": 0.5783767104148865, "learning_rate": 1.86897746409184e-05, "loss": 2.4256, "mean_token_accuracy": 0.5332428812980652, "num_tokens": 20018.0, "step": 6 }, { "entropy": 1.3765594959259033, "epoch": 1.1904761904761905, "grad_norm": 0.4032095670700073, "learning_rate": 1.798140161633978e-05, "loss": 2.5216, "mean_token_accuracy": 0.5237941592931747, "num_tokens": 23850.0, "step": 7 }, { "entropy": 1.4430685341358185, "epoch": 1.380952380952381, "grad_norm": 0.42382702231407166, "learning_rate": 1.714297828896789e-05, "loss": 2.6032, "mean_token_accuracy": 0.4955971837043762, "num_tokens": 27559.0, "step": 8 }, { "entropy": 1.3608110547065735, "epoch": 1.5714285714285714, "grad_norm": 0.45954015851020813, "learning_rate": 1.6190122899033807e-05, "loss": 2.4666, "mean_token_accuracy": 0.5162522122263908, "num_tokens": 31555.0, "step": 9 }, { "entropy": 1.3784911930561066, "epoch": 1.7619047619047619, "grad_norm": 0.44564327597618103, "learning_rate": 1.5140585339580372e-05, "loss": 2.4044, "mean_token_accuracy": 0.5227449685335159, "num_tokens": 35542.0, "step": 10 }, { "entropy": 1.4194905161857605, "epoch": 1.9523809523809523, "grad_norm": 0.43261900544166565, "learning_rate": 1.4013916509538877e-05, "loss": 2.4835, "mean_token_accuracy": 0.5238108187913895, "num_tokens": 39306.0, "step": 11 }, { "entropy": 1.3612134456634521, "epoch": 2.0, "grad_norm": 0.49288633465766907, "learning_rate": 1.2831104117473708e-05, "loss": 2.3331, "mean_token_accuracy": 0.5405777096748352, "num_tokens": 40036.0, "step": 12 }, { "entropy": 1.4140946567058563, "epoch": 2.1904761904761907, "grad_norm": 0.4206177294254303, "learning_rate": 1.161418172028204e-05, "loss": 2.4259, "mean_token_accuracy": 0.5216626822948456, "num_tokens": 43934.0, "step": 13 }, { "entropy": 1.4334158599376678, "epoch": 2.380952380952381, "grad_norm": 0.4195733666419983, "learning_rate": 1.0385818279717963e-05, "loss": 2.4049, "mean_token_accuracy": 0.5157886892557144, "num_tokens": 47807.0, "step": 14 }, { "entropy": 1.4315949380397797, "epoch": 2.571428571428571, "grad_norm": 0.4438965916633606, "learning_rate": 9.168895882526299e-06, "loss": 2.385, "mean_token_accuracy": 0.5175213366746902, "num_tokens": 51668.0, "step": 15 }, { "entropy": 1.4568908214569092, "epoch": 2.761904761904762, "grad_norm": 0.4084729552268982, "learning_rate": 7.986083490461124e-06, "loss": 2.3757, "mean_token_accuracy": 0.5272674188017845, "num_tokens": 55552.0, "step": 16 }, { "entropy": 1.4177967011928558, "epoch": 2.9523809523809526, "grad_norm": 0.4245680272579193, "learning_rate": 6.859414660419632e-06, "loss": 2.3215, "mean_token_accuracy": 0.523059070110321, "num_tokens": 59348.0, "step": 17 }, { "entropy": 1.3359137773513794, "epoch": 3.0, "grad_norm": 0.4944182336330414, "learning_rate": 5.809877100966197e-06, "loss": 2.2219, "mean_token_accuracy": 0.5405405163764954, "num_tokens": 60054.0, "step": 18 }, { "entropy": 1.4150276184082031, "epoch": 3.1904761904761907, "grad_norm": 0.41034770011901855, "learning_rate": 4.857021711032115e-06, "loss": 2.2712, "mean_token_accuracy": 0.5344524532556534, "num_tokens": 63922.0, "step": 19 }, { "entropy": 1.3929513990879059, "epoch": 3.380952380952381, "grad_norm": 0.4209787845611572, "learning_rate": 4.018598383660221e-06, "loss": 2.2417, "mean_token_accuracy": 0.5421377569437027, "num_tokens": 67884.0, "step": 20 }, { "entropy": 1.4352200031280518, "epoch": 3.571428571428571, "grad_norm": 0.4058735966682434, "learning_rate": 3.3102253590816034e-06, "loss": 2.3249, "mean_token_accuracy": 0.5226768404245377, "num_tokens": 71798.0, "step": 21 }, { "entropy": 1.5018433034420013, "epoch": 3.761904761904762, "grad_norm": 0.43237438797950745, "learning_rate": 2.7450982864509253e-06, "loss": 2.4183, "mean_token_accuracy": 0.5107452720403671, "num_tokens": 75561.0, "step": 22 }, { "entropy": 1.492310345172882, "epoch": 3.9523809523809526, "grad_norm": 0.40404191613197327, "learning_rate": 2.3337444138698082e-06, "loss": 2.3382, "mean_token_accuracy": 0.5147013962268829, "num_tokens": 79338.0, "step": 23 }, { "entropy": 1.415541648864746, "epoch": 4.0, "grad_norm": 0.4709555208683014, "learning_rate": 2.0838264856730233e-06, "loss": 2.1213, "mean_token_accuracy": 0.5567715167999268, "num_tokens": 80072.0, "step": 24 } ], "logging_steps": 1, "max_steps": 24, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.750705152191283e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }