{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5,
      "grad_norm": 14.416919708251953,
      "learning_rate": 9.75e-05,
      "loss": 0.3808,
      "step": 10
    },
    {
      "epoch": 1.0,
      "grad_norm": 14.942909240722656,
      "learning_rate": 9.5e-05,
      "loss": 0.1636,
      "step": 20
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.1972920000553131,
      "eval_mse": 0.1972920149564743,
      "eval_runtime": 1.8069,
      "eval_samples_per_second": 21.584,
      "eval_steps_per_second": 2.767,
      "step": 20
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.4087517261505127,
      "learning_rate": 9.250000000000001e-05,
      "loss": 0.2302,
      "step": 30
    },
    {
      "epoch": 2.0,
      "grad_norm": 21.778547286987305,
      "learning_rate": 9e-05,
      "loss": 0.1331,
      "step": 40
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.046439673751592636,
      "eval_mse": 0.046439677476882935,
      "eval_runtime": 1.622,
      "eval_samples_per_second": 24.044,
      "eval_steps_per_second": 3.083,
      "step": 40
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.1269134283065796,
      "learning_rate": 8.75e-05,
      "loss": 0.05,
      "step": 50
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.305675983428955,
      "learning_rate": 8.5e-05,
      "loss": 0.0289,
      "step": 60
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.03575298935174942,
      "eval_mse": 0.03575298190116882,
      "eval_runtime": 1.6208,
      "eval_samples_per_second": 24.063,
      "eval_steps_per_second": 3.085,
      "step": 60
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.7014061212539673,
      "learning_rate": 8.25e-05,
      "loss": 0.0246,
      "step": 70
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.39174097776412964,
      "learning_rate": 8e-05,
      "loss": 0.0221,
      "step": 80
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.03326353803277016,
      "eval_mse": 0.03326353803277016,
      "eval_runtime": 1.6114,
      "eval_samples_per_second": 24.203,
      "eval_steps_per_second": 3.103,
      "step": 80
    },
    {
      "epoch": 4.5,
      "grad_norm": 2.6145267486572266,
      "learning_rate": 7.75e-05,
      "loss": 0.021,
      "step": 90
    },
    {
      "epoch": 5.0,
      "grad_norm": 13.847771644592285,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.0223,
      "step": 100
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.03398064896464348,
      "eval_mse": 0.033980656415224075,
      "eval_runtime": 1.6515,
      "eval_samples_per_second": 23.615,
      "eval_steps_per_second": 3.028,
      "step": 100
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.43660324811935425,
      "learning_rate": 7.25e-05,
      "loss": 0.0131,
      "step": 110
    },
    {
      "epoch": 6.0,
      "grad_norm": 2.1774399280548096,
      "learning_rate": 7e-05,
      "loss": 0.0117,
      "step": 120
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.04457540437579155,
      "eval_mse": 0.04457540065050125,
      "eval_runtime": 1.6081,
      "eval_samples_per_second": 24.252,
      "eval_steps_per_second": 3.109,
      "step": 120
    },
    {
      "epoch": 6.5,
      "grad_norm": 2.61739182472229,
      "learning_rate": 6.750000000000001e-05,
      "loss": 0.0168,
      "step": 130
    },
    {
      "epoch": 7.0,
      "grad_norm": 2.7107529640197754,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.0107,
      "step": 140
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.03702976927161217,
      "eval_mse": 0.037029776722192764,
      "eval_runtime": 1.6027,
      "eval_samples_per_second": 24.334,
      "eval_steps_per_second": 3.12,
      "step": 140
    },
    {
      "epoch": 7.5,
      "grad_norm": 4.807140350341797,
      "learning_rate": 6.25e-05,
      "loss": 0.0112,
      "step": 150
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.6699814796447754,
      "learning_rate": 6e-05,
      "loss": 0.0096,
      "step": 160
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.03073795698583126,
      "eval_mse": 0.030737943947315216,
      "eval_runtime": 1.678,
      "eval_samples_per_second": 23.242,
      "eval_steps_per_second": 2.98,
      "step": 160
    },
    {
      "epoch": 8.5,
      "grad_norm": 5.444133281707764,
      "learning_rate": 5.7499999999999995e-05,
      "loss": 0.0099,
      "step": 170
    },
    {
      "epoch": 9.0,
      "grad_norm": 1.5312561988830566,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.0142,
      "step": 180
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.03504549711942673,
      "eval_mse": 0.03504551202058792,
      "eval_runtime": 1.6103,
      "eval_samples_per_second": 24.218,
      "eval_steps_per_second": 3.105,
      "step": 180
    },
    {
      "epoch": 9.5,
      "grad_norm": 1.527550220489502,
      "learning_rate": 5.25e-05,
      "loss": 0.0051,
      "step": 190
    },
    {
      "epoch": 10.0,
      "grad_norm": 1.0232219696044922,
      "learning_rate": 5e-05,
      "loss": 0.0069,
      "step": 200
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.032399099320173264,
      "eval_mse": 0.03239908814430237,
      "eval_runtime": 1.61,
      "eval_samples_per_second": 24.224,
      "eval_steps_per_second": 3.106,
      "step": 200
    },
    {
      "epoch": 10.5,
      "grad_norm": 1.1013288497924805,
      "learning_rate": 4.75e-05,
      "loss": 0.0034,
      "step": 210
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.35051777958869934,
      "learning_rate": 4.5e-05,
      "loss": 0.0028,
      "step": 220
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.02933628484606743,
      "eval_mse": 0.029336294159293175,
      "eval_runtime": 1.7012,
      "eval_samples_per_second": 22.925,
      "eval_steps_per_second": 2.939,
      "step": 220
    },
    {
      "epoch": 11.5,
      "grad_norm": 1.1170843839645386,
      "learning_rate": 4.25e-05,
      "loss": 0.0019,
      "step": 230
    },
    {
      "epoch": 12.0,
      "grad_norm": 1.3299288749694824,
      "learning_rate": 4e-05,
      "loss": 0.0044,
      "step": 240
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.028278259560465813,
      "eval_mse": 0.028278270736336708,
      "eval_runtime": 1.5914,
      "eval_samples_per_second": 24.506,
      "eval_steps_per_second": 3.142,
      "step": 240
    },
    {
      "epoch": 12.5,
      "grad_norm": 1.6604584455490112,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.002,
      "step": 250
    },
    {
      "epoch": 13.0,
      "grad_norm": 1.2441127300262451,
      "learning_rate": 3.5e-05,
      "loss": 0.0011,
      "step": 260
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.029920559376478195,
      "eval_mse": 0.029920564964413643,
      "eval_runtime": 1.6282,
      "eval_samples_per_second": 23.953,
      "eval_steps_per_second": 3.071,
      "step": 260
    },
    {
      "epoch": 13.5,
      "grad_norm": 0.7714802026748657,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.0008,
      "step": 270
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.5498138070106506,
      "learning_rate": 3e-05,
      "loss": 0.0005,
      "step": 280
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.027942122891545296,
      "eval_mse": 0.027942117303609848,
      "eval_runtime": 1.5994,
      "eval_samples_per_second": 24.384,
      "eval_steps_per_second": 3.126,
      "step": 280
    },
    {
      "epoch": 14.5,
      "grad_norm": 0.5462870001792908,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 0.0006,
      "step": 290
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.32672354578971863,
      "learning_rate": 2.5e-05,
      "loss": 0.0005,
      "step": 300
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.029117202386260033,
      "eval_mse": 0.02911720797419548,
      "eval_runtime": 1.595,
      "eval_samples_per_second": 24.451,
      "eval_steps_per_second": 3.135,
      "step": 300
    },
    {
      "epoch": 15.5,
      "grad_norm": 0.7088171243667603,
      "learning_rate": 2.25e-05,
      "loss": 0.0012,
      "step": 310
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.3224898874759674,
      "learning_rate": 2e-05,
      "loss": 0.0011,
      "step": 320
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.028802577406167984,
      "eval_mse": 0.028802569955587387,
      "eval_runtime": 1.6242,
      "eval_samples_per_second": 24.012,
      "eval_steps_per_second": 3.078,
      "step": 320
    },
    {
      "epoch": 16.5,
      "grad_norm": 0.2536928951740265,
      "learning_rate": 1.75e-05,
      "loss": 0.0002,
      "step": 330
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.2693057060241699,
      "learning_rate": 1.5e-05,
      "loss": 0.0003,
      "step": 340
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.028974896296858788,
      "eval_mse": 0.02897489443421364,
      "eval_runtime": 1.5937,
      "eval_samples_per_second": 24.472,
      "eval_steps_per_second": 3.137,
      "step": 340
    },
    {
      "epoch": 17.5,
      "grad_norm": 0.22231905162334442,
      "learning_rate": 1.25e-05,
      "loss": 0.0001,
      "step": 350
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.44173210859298706,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "step": 360
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.029911378398537636,
      "eval_mse": 0.029911383986473083,
      "eval_runtime": 1.5913,
      "eval_samples_per_second": 24.509,
      "eval_steps_per_second": 3.142,
      "step": 360
    },
    {
      "epoch": 18.5,
      "grad_norm": 0.2958744764328003,
      "learning_rate": 7.5e-06,
      "loss": 0.0001,
      "step": 370
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.41316938400268555,
      "learning_rate": 5e-06,
      "loss": 0.0001,
      "step": 380
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.029724078252911568,
      "eval_mse": 0.029724083840847015,
      "eval_runtime": 1.5907,
      "eval_samples_per_second": 24.518,
      "eval_steps_per_second": 3.143,
      "step": 380
    },
    {
      "epoch": 19.5,
      "grad_norm": 0.0391409695148468,
      "learning_rate": 2.5e-06,
      "loss": 0.0,
      "step": 390
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.021498844027519226,
      "learning_rate": 0.0,
      "loss": 0.0,
      "step": 400
    }
  ],
  "logging_steps": 10,
  "max_steps": 400,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}