{ "best_global_step": 540, "best_metric": 0.43301182985305786, "best_model_checkpoint": "Mistral-7B-v0.1/r4/checkpoint-540", "epoch": 3.0, "eval_steps": 60, "global_step": 720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.3212984272175365, "epoch": 0.25069637883008355, "grad_norm": 4.5548248291015625, "learning_rate": 7.638888888888889e-05, "loss": 2.8492, "mean_token_accuracy": 0.6425296268943284, "num_tokens": 1474560.0, "step": 60 }, { "epoch": 0.25069637883008355, "eval_entropy": 0.6633218830669081, "eval_loss": 0.5672379732131958, "eval_mean_token_accuracy": 0.8630788239178719, "eval_num_tokens": 1474560.0, "eval_runtime": 1043.9462, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.295, "step": 60 }, { "entropy": 0.5157420211368137, "epoch": 0.5013927576601671, "grad_norm": 1.0181090831756592, "learning_rate": 9.891743688752738e-05, "loss": 0.5017, "mean_token_accuracy": 0.8742851712637477, "num_tokens": 2949120.0, "step": 120 }, { "epoch": 0.5013927576601671, "eval_entropy": 0.4764551486965124, "eval_loss": 0.46269991993904114, "eval_mean_token_accuracy": 0.882140948400869, "eval_num_tokens": 2949120.0, "eval_runtime": 1038.9576, "eval_samples_per_second": 2.369, "eval_steps_per_second": 0.296, "step": 120 }, { "entropy": 0.46944635676013097, "epoch": 0.7520891364902507, "grad_norm": 1.4101026058197021, "learning_rate": 9.389450641873323e-05, "loss": 0.4597, "mean_token_accuracy": 0.8829208799534374, "num_tokens": 4423680.0, "step": 180 }, { "epoch": 0.7520891364902507, "eval_entropy": 0.4749697573579751, "eval_loss": 0.45166516304016113, "eval_mean_token_accuracy": 0.8840483388343414, "eval_num_tokens": 4423680.0, "eval_runtime": 1040.6612, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.296, "step": 180 }, { "entropy": 0.4655254686648926, "epoch": 1.0, "grad_norm": 1.558998465538025, "learning_rate": 8.518351670729529e-05, "loss": 0.4543, "mean_token_accuracy": 0.8834420842735955, "num_tokens": 5879808.0, "step": 240 }, { "epoch": 1.0, "eval_entropy": 0.4612268569407525, "eval_loss": 0.4448107182979584, "eval_mean_token_accuracy": 0.8855506770022503, "eval_num_tokens": 5879808.0, "eval_runtime": 1040.4289, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.296, "step": 240 }, { "entropy": 0.45157561596069073, "epoch": 1.2506963788300836, "grad_norm": 0.6592407822608948, "learning_rate": 7.351637360519813e-05, "loss": 0.4387, "mean_token_accuracy": 0.8871405469046698, "num_tokens": 7354368.0, "step": 300 }, { "epoch": 1.2506963788300836, "eval_entropy": 0.4438396056557631, "eval_loss": 0.44106850028038025, "eval_mean_token_accuracy": 0.8862724611898521, "eval_num_tokens": 7354368.0, "eval_runtime": 1037.3961, "eval_samples_per_second": 2.372, "eval_steps_per_second": 0.297, "step": 300 }, { "entropy": 0.4534259519229333, "epoch": 1.501392757660167, "grad_norm": 0.6076720356941223, "learning_rate": 5.9873361855649876e-05, "loss": 0.4402, "mean_token_accuracy": 0.8862164333462715, "num_tokens": 8828928.0, "step": 360 }, { "epoch": 1.501392757660167, "eval_entropy": 0.44579531716835963, "eval_loss": 0.43730634450912476, "eval_mean_token_accuracy": 0.8868549373242762, "eval_num_tokens": 8828928.0, "eval_runtime": 1044.4842, "eval_samples_per_second": 2.356, "eval_steps_per_second": 0.295, "step": 360 }, { "entropy": 0.44368693138621434, "epoch": 1.7520891364902507, "grad_norm": 0.6398904919624329, "learning_rate": 4.5400780612818626e-05, "loss": 0.4223, "mean_token_accuracy": 0.888861709115218, "num_tokens": 11753472.0, "step": 420 }, { "epoch": 1.7520891364902507, "eval_entropy": 0.4433383915524978, "eval_loss": 0.4355394244194031, "eval_mean_token_accuracy": 0.8874243900373384, "eval_num_tokens": 11753472.0, "eval_runtime": 1037.2958, "eval_samples_per_second": 2.373, "eval_steps_per_second": 0.297, "step": 420 }, { "entropy": 0.4386935969919301, "epoch": 2.0, "grad_norm": 1.2995598316192627, "learning_rate": 3.131463026883449e-05, "loss": 0.4264, "mean_token_accuracy": 0.8898014740997486, "num_tokens": 13209600.0, "step": 480 }, { "epoch": 2.0, "eval_entropy": 0.4436397281076227, "eval_loss": 0.4339684844017029, "eval_mean_token_accuracy": 0.8877705109196824, "eval_num_tokens": 13209600.0, "eval_runtime": 1041.4468, "eval_samples_per_second": 2.363, "eval_steps_per_second": 0.296, "step": 480 }, { "entropy": 0.43632681195934614, "epoch": 2.2506963788300833, "grad_norm": 0.5876320004463196, "learning_rate": 1.8798442914793663e-05, "loss": 0.4221, "mean_token_accuracy": 0.8905482163031896, "num_tokens": 14684160.0, "step": 540 }, { "epoch": 2.2506963788300833, "eval_entropy": 0.4378794138985021, "eval_loss": 0.43301182985305786, "eval_mean_token_accuracy": 0.8880488022968367, "eval_num_tokens": 14684160.0, "eval_runtime": 1046.5384, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.294, "step": 540 }, { "entropy": 0.4391553503357702, "epoch": 2.501392757660167, "grad_norm": 0.630813479423523, "learning_rate": 8.903840820084096e-06, "loss": 0.4249, "mean_token_accuracy": 0.8895656656887796, "num_tokens": 16158720.0, "step": 600 }, { "epoch": 2.501392757660167, "eval_entropy": 0.4278067753880055, "eval_loss": 0.4333657920360565, "eval_mean_token_accuracy": 0.8881023328799706, "eval_num_tokens": 16158720.0, "eval_runtime": 1045.4866, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 600 }, { "entropy": 0.42997388628621896, "epoch": 2.752089136490251, "grad_norm": 0.7219184637069702, "learning_rate": 2.462178103591678e-06, "loss": 0.4144, "mean_token_accuracy": 0.8923869328366385, "num_tokens": 17633280.0, "step": 660 }, { "epoch": 2.752089136490251, "eval_entropy": 0.42755064682720545, "eval_loss": 0.4343859851360321, "eval_mean_token_accuracy": 0.8881968432045603, "eval_num_tokens": 17633280.0, "eval_runtime": 1047.511, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 660 }, { "entropy": 0.43753848961564934, "epoch": 3.0, "grad_norm": 1.3657286167144775, "learning_rate": 1.4689549754337562e-08, "loss": 0.4228, "mean_token_accuracy": 0.8900845710481151, "num_tokens": 19089408.0, "step": 720 }, { "epoch": 3.0, "eval_entropy": 0.42660339708839146, "eval_loss": 0.4343104064464569, "eval_mean_token_accuracy": 0.8882718165586521, "eval_num_tokens": 19089408.0, "eval_runtime": 1048.422, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.294, "step": 720 }, { "epoch": 3.0, "step": 720, "total_flos": 8.146999757147996e+17, "train_loss": 0.035229322645399304, "train_runtime": 4770.3564, "train_samples_per_second": 7.222, "train_steps_per_second": 0.151 } ], "logging_steps": 60, "max_steps": 720, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 60, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 13 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.146999757147996e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }