{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4287245444801715, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007145409074669525, "grad_norm": 4.4086809158325195, "learning_rate": 2.3809523809523808e-06, "loss": 1.0969, "step": 10 }, { "epoch": 0.01429081814933905, "grad_norm": 5.687011241912842, "learning_rate": 4.7619047619047615e-06, "loss": 1.0795, "step": 20 }, { "epoch": 0.021436227224008574, "grad_norm": 1.976590633392334, "learning_rate": 7.142857142857143e-06, "loss": 0.7536, "step": 30 }, { "epoch": 0.0285816362986781, "grad_norm": 3.1355409622192383, "learning_rate": 9.523809523809523e-06, "loss": 0.5564, "step": 40 }, { "epoch": 0.03572704537334762, "grad_norm": 2.6710309982299805, "learning_rate": 1.1904761904761905e-05, "loss": 0.623, "step": 50 }, { "epoch": 0.04287245444801715, "grad_norm": 2.8567938804626465, "learning_rate": 1.4285714285714285e-05, "loss": 0.5322, "step": 60 }, { "epoch": 0.050017863522686674, "grad_norm": 3.4388861656188965, "learning_rate": 1.6666666666666667e-05, "loss": 0.5102, "step": 70 }, { "epoch": 0.0571632725973562, "grad_norm": 3.093275308609009, "learning_rate": 1.9047619047619046e-05, "loss": 0.568, "step": 80 }, { "epoch": 0.06430868167202572, "grad_norm": 2.3798677921295166, "learning_rate": 2.1428571428571428e-05, "loss": 0.4883, "step": 90 }, { "epoch": 0.07145409074669525, "grad_norm": 2.846259117126465, "learning_rate": 2.380952380952381e-05, "loss": 0.417, "step": 100 }, { "epoch": 0.07145409074669525, "eval_news_finetune_val_loss": 0.48679304122924805, "eval_news_finetune_val_runtime": 1001.9158, "eval_news_finetune_val_samples_per_second": 1.397, "eval_news_finetune_val_steps_per_second": 1.397, "step": 100 }, { "epoch": 0.07859949982136477, "grad_norm": 1.9387887716293335, "learning_rate": 2.6190476190476192e-05, "loss": 0.4595, "step": 110 }, { "epoch": 0.0857449088960343, "grad_norm": 2.3232853412628174, "learning_rate": 2.857142857142857e-05, "loss": 0.4658, "step": 120 }, { "epoch": 0.09289031797070382, "grad_norm": 2.813093423843384, "learning_rate": 3.095238095238095e-05, "loss": 0.4122, "step": 130 }, { "epoch": 0.10003572704537335, "grad_norm": 1.9588465690612793, "learning_rate": 3.3333333333333335e-05, "loss": 0.4878, "step": 140 }, { "epoch": 0.10718113612004287, "grad_norm": 1.4838117361068726, "learning_rate": 3.571428571428572e-05, "loss": 0.4168, "step": 150 }, { "epoch": 0.1143265451947124, "grad_norm": 3.020738124847412, "learning_rate": 3.809523809523809e-05, "loss": 0.4298, "step": 160 }, { "epoch": 0.12147195426938193, "grad_norm": 2.097656011581421, "learning_rate": 4.047619047619048e-05, "loss": 0.4413, "step": 170 }, { "epoch": 0.12861736334405144, "grad_norm": 1.6332950592041016, "learning_rate": 4.2857142857142856e-05, "loss": 0.3734, "step": 180 }, { "epoch": 0.13576277241872098, "grad_norm": 2.1570417881011963, "learning_rate": 4.523809523809524e-05, "loss": 0.4015, "step": 190 }, { "epoch": 0.1429081814933905, "grad_norm": 1.6941479444503784, "learning_rate": 4.761904761904762e-05, "loss": 0.4411, "step": 200 }, { "epoch": 0.1429081814933905, "eval_news_finetune_val_loss": 0.4338369369506836, "eval_news_finetune_val_runtime": 1002.1695, "eval_news_finetune_val_samples_per_second": 1.397, "eval_news_finetune_val_steps_per_second": 1.397, "step": 200 }, { "epoch": 0.15005359056806003, "grad_norm": 2.3582301139831543, "learning_rate": 5e-05, "loss": 0.3697, "step": 210 }, { "epoch": 0.15719899964272954, "grad_norm": 2.0517632961273193, "learning_rate": 5.2380952380952384e-05, "loss": 0.4076, "step": 220 }, { "epoch": 0.16434440871739908, "grad_norm": 1.3338748216629028, "learning_rate": 5.4761904761904766e-05, "loss": 0.3307, "step": 230 }, { "epoch": 0.1714898177920686, "grad_norm": 3.0515363216400146, "learning_rate": 5.714285714285714e-05, "loss": 0.4227, "step": 240 }, { "epoch": 0.17863522686673813, "grad_norm": 2.4899113178253174, "learning_rate": 5.9523809523809524e-05, "loss": 0.4689, "step": 250 }, { "epoch": 0.18578063594140765, "grad_norm": 1.6197255849838257, "learning_rate": 6.19047619047619e-05, "loss": 0.3618, "step": 260 }, { "epoch": 0.19292604501607716, "grad_norm": 1.654628872871399, "learning_rate": 6.428571428571429e-05, "loss": 0.4668, "step": 270 }, { "epoch": 0.2000714540907467, "grad_norm": 1.6470831632614136, "learning_rate": 6.666666666666667e-05, "loss": 0.3525, "step": 280 }, { "epoch": 0.2072168631654162, "grad_norm": 2.640536308288574, "learning_rate": 6.904761904761905e-05, "loss": 0.3707, "step": 290 }, { "epoch": 0.21436227224008575, "grad_norm": 2.3426971435546875, "learning_rate": 7.142857142857143e-05, "loss": 0.4461, "step": 300 }, { "epoch": 0.21436227224008575, "eval_news_finetune_val_loss": 0.40391305088996887, "eval_news_finetune_val_runtime": 1002.5797, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 300 }, { "epoch": 0.22150768131475526, "grad_norm": 1.0351321697235107, "learning_rate": 7.380952380952382e-05, "loss": 0.3439, "step": 310 }, { "epoch": 0.2286530903894248, "grad_norm": 3.062483549118042, "learning_rate": 7.619047619047618e-05, "loss": 0.4492, "step": 320 }, { "epoch": 0.2357984994640943, "grad_norm": 2.095825672149658, "learning_rate": 7.857142857142858e-05, "loss": 0.3399, "step": 330 }, { "epoch": 0.24294390853876385, "grad_norm": 1.700642704963684, "learning_rate": 8.095238095238096e-05, "loss": 0.4336, "step": 340 }, { "epoch": 0.2500893176134334, "grad_norm": 1.6802127361297607, "learning_rate": 8.333333333333334e-05, "loss": 0.3628, "step": 350 }, { "epoch": 0.2572347266881029, "grad_norm": 1.1725817918777466, "learning_rate": 8.571428571428571e-05, "loss": 0.4113, "step": 360 }, { "epoch": 0.2643801357627724, "grad_norm": 1.0182325839996338, "learning_rate": 8.80952380952381e-05, "loss": 0.4009, "step": 370 }, { "epoch": 0.27152554483744196, "grad_norm": 2.5762252807617188, "learning_rate": 9.047619047619048e-05, "loss": 0.3399, "step": 380 }, { "epoch": 0.27867095391211144, "grad_norm": 1.5393809080123901, "learning_rate": 9.285714285714286e-05, "loss": 0.326, "step": 390 }, { "epoch": 0.285816362986781, "grad_norm": 2.3259921073913574, "learning_rate": 9.523809523809524e-05, "loss": 0.4228, "step": 400 }, { "epoch": 0.285816362986781, "eval_news_finetune_val_loss": 0.39322975277900696, "eval_news_finetune_val_runtime": 1002.8865, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 400 }, { "epoch": 0.2929617720614505, "grad_norm": 0.9278184771537781, "learning_rate": 9.761904761904762e-05, "loss": 0.3184, "step": 410 }, { "epoch": 0.30010718113612006, "grad_norm": 1.4571782350540161, "learning_rate": 0.0001, "loss": 0.473, "step": 420 }, { "epoch": 0.30725259021078954, "grad_norm": 1.6199829578399658, "learning_rate": 9.99982704095424e-05, "loss": 0.392, "step": 430 }, { "epoch": 0.3143979992854591, "grad_norm": 1.302309513092041, "learning_rate": 9.999308175782893e-05, "loss": 0.3824, "step": 440 }, { "epoch": 0.3215434083601286, "grad_norm": 1.438289761543274, "learning_rate": 9.998443440382927e-05, "loss": 0.4001, "step": 450 }, { "epoch": 0.32868881743479816, "grad_norm": 1.7557189464569092, "learning_rate": 9.997232894579868e-05, "loss": 0.4144, "step": 460 }, { "epoch": 0.33583422650946765, "grad_norm": 0.9362027645111084, "learning_rate": 9.995676622123655e-05, "loss": 0.3094, "step": 470 }, { "epoch": 0.3429796355841372, "grad_norm": 1.7850221395492554, "learning_rate": 9.993774730682845e-05, "loss": 0.2966, "step": 480 }, { "epoch": 0.35012504465880673, "grad_norm": 1.705842137336731, "learning_rate": 9.991527351837174e-05, "loss": 0.3274, "step": 490 }, { "epoch": 0.35727045373347627, "grad_norm": 1.0722746849060059, "learning_rate": 9.988934641068436e-05, "loss": 0.4301, "step": 500 }, { "epoch": 0.35727045373347627, "eval_news_finetune_val_loss": 0.3787713646888733, "eval_news_finetune_val_runtime": 1002.8588, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 500 }, { "epoch": 0.36441586280814575, "grad_norm": 1.282714605331421, "learning_rate": 9.985996777749747e-05, "loss": 0.3636, "step": 510 }, { "epoch": 0.3715612718828153, "grad_norm": 2.0360989570617676, "learning_rate": 9.982713965133122e-05, "loss": 0.4467, "step": 520 }, { "epoch": 0.37870668095748483, "grad_norm": 1.7432626485824585, "learning_rate": 9.979086430335417e-05, "loss": 0.3875, "step": 530 }, { "epoch": 0.3858520900321543, "grad_norm": 1.6053438186645508, "learning_rate": 9.975114424322609e-05, "loss": 0.3646, "step": 540 }, { "epoch": 0.39299749910682386, "grad_norm": 1.2323070764541626, "learning_rate": 9.970798221892452e-05, "loss": 0.353, "step": 550 }, { "epoch": 0.4001429081814934, "grad_norm": 1.16932213306427, "learning_rate": 9.966138121655445e-05, "loss": 0.331, "step": 560 }, { "epoch": 0.40728831725616294, "grad_norm": 1.8134998083114624, "learning_rate": 9.961134446014184e-05, "loss": 0.3132, "step": 570 }, { "epoch": 0.4144337263308324, "grad_norm": 1.4292124509811401, "learning_rate": 9.955787541141055e-05, "loss": 0.3017, "step": 580 }, { "epoch": 0.42157913540550196, "grad_norm": 1.4605034589767456, "learning_rate": 9.950097776954284e-05, "loss": 0.3596, "step": 590 }, { "epoch": 0.4287245444801715, "grad_norm": 1.2365972995758057, "learning_rate": 9.944065547092345e-05, "loss": 0.3399, "step": 600 }, { "epoch": 0.4287245444801715, "eval_news_finetune_val_loss": 0.36549311876296997, "eval_news_finetune_val_runtime": 1002.8044, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 600 }, { "epoch": 0.43586995355484104, "grad_norm": 1.0590678453445435, "learning_rate": 9.937691268886725e-05, "loss": 0.3747, "step": 610 }, { "epoch": 0.4430153626295105, "grad_norm": 0.9111473560333252, "learning_rate": 9.930975383333056e-05, "loss": 0.2868, "step": 620 }, { "epoch": 0.45016077170418006, "grad_norm": 2.0456018447875977, "learning_rate": 9.923918355060599e-05, "loss": 0.3289, "step": 630 }, { "epoch": 0.4573061807788496, "grad_norm": 1.5998501777648926, "learning_rate": 9.916520672300107e-05, "loss": 0.3664, "step": 640 }, { "epoch": 0.4644515898535191, "grad_norm": 1.0773181915283203, "learning_rate": 9.908782846850037e-05, "loss": 0.3432, "step": 650 }, { "epoch": 0.4715969989281886, "grad_norm": 1.244042158126831, "learning_rate": 9.900705414041154e-05, "loss": 0.3242, "step": 660 }, { "epoch": 0.47874240800285817, "grad_norm": 1.8120310306549072, "learning_rate": 9.892288932699484e-05, "loss": 0.317, "step": 670 }, { "epoch": 0.4858878170775277, "grad_norm": 0.7863224148750305, "learning_rate": 9.883533985107663e-05, "loss": 0.322, "step": 680 }, { "epoch": 0.4930332261521972, "grad_norm": 1.223832130432129, "learning_rate": 9.874441176964642e-05, "loss": 0.343, "step": 690 }, { "epoch": 0.5001786352268668, "grad_norm": 0.9870743155479431, "learning_rate": 9.865011137343787e-05, "loss": 0.3278, "step": 700 }, { "epoch": 0.5001786352268668, "eval_news_finetune_val_loss": 0.35386842489242554, "eval_news_finetune_val_runtime": 1003.4109, "eval_news_finetune_val_samples_per_second": 1.395, "eval_news_finetune_val_steps_per_second": 1.395, "step": 700 }, { "epoch": 0.5073240443015362, "grad_norm": 1.3699963092803955, "learning_rate": 9.85524451864936e-05, "loss": 0.3902, "step": 710 }, { "epoch": 0.5144694533762058, "grad_norm": 1.7188071012496948, "learning_rate": 9.845141996571384e-05, "loss": 0.369, "step": 720 }, { "epoch": 0.5216148624508753, "grad_norm": 0.4889034628868103, "learning_rate": 9.834704270038888e-05, "loss": 0.3174, "step": 730 }, { "epoch": 0.5287602715255448, "grad_norm": 0.8782143592834473, "learning_rate": 9.823932061171561e-05, "loss": 0.3501, "step": 740 }, { "epoch": 0.5359056806002144, "grad_norm": 2.4089126586914062, "learning_rate": 9.812826115229789e-05, "loss": 0.3292, "step": 750 }, { "epoch": 0.5430510896748839, "grad_norm": 1.6382787227630615, "learning_rate": 9.801387200563096e-05, "loss": 0.459, "step": 760 }, { "epoch": 0.5501964987495535, "grad_norm": 1.443916916847229, "learning_rate": 9.789616108556992e-05, "loss": 0.3409, "step": 770 }, { "epoch": 0.5573419078242229, "grad_norm": 1.632278323173523, "learning_rate": 9.77751365357821e-05, "loss": 0.281, "step": 780 }, { "epoch": 0.5644873168988924, "grad_norm": 2.1452109813690186, "learning_rate": 9.765080672918374e-05, "loss": 0.3511, "step": 790 }, { "epoch": 0.571632725973562, "grad_norm": 1.2721842527389526, "learning_rate": 9.752318026736078e-05, "loss": 0.2298, "step": 800 }, { "epoch": 0.571632725973562, "eval_news_finetune_val_loss": 0.34554028511047363, "eval_news_finetune_val_runtime": 1003.3342, "eval_news_finetune_val_samples_per_second": 1.395, "eval_news_finetune_val_steps_per_second": 1.395, "step": 800 }, { "epoch": 0.5787781350482315, "grad_norm": 2.5264174938201904, "learning_rate": 9.739226597997359e-05, "loss": 0.3214, "step": 810 }, { "epoch": 0.585923544122901, "grad_norm": 1.4553183317184448, "learning_rate": 9.725807292414629e-05, "loss": 0.2697, "step": 820 }, { "epoch": 0.5930689531975706, "grad_norm": 2.2111873626708984, "learning_rate": 9.712061038384002e-05, "loss": 0.3315, "step": 830 }, { "epoch": 0.6002143622722401, "grad_norm": 1.4308302402496338, "learning_rate": 9.697988786921071e-05, "loss": 0.4036, "step": 840 }, { "epoch": 0.6073597713469097, "grad_norm": 1.8136054277420044, "learning_rate": 9.683591511595107e-05, "loss": 0.2946, "step": 850 }, { "epoch": 0.6145051804215791, "grad_norm": 1.8586084842681885, "learning_rate": 9.668870208461713e-05, "loss": 0.2259, "step": 860 }, { "epoch": 0.6216505894962486, "grad_norm": 1.1640444993972778, "learning_rate": 9.653825895993908e-05, "loss": 0.4, "step": 870 }, { "epoch": 0.6287959985709182, "grad_norm": 1.386013388633728, "learning_rate": 9.63845961501166e-05, "loss": 0.2804, "step": 880 }, { "epoch": 0.6359414076455877, "grad_norm": 2.1413650512695312, "learning_rate": 9.622772428609887e-05, "loss": 0.3593, "step": 890 }, { "epoch": 0.6430868167202572, "grad_norm": 1.5462217330932617, "learning_rate": 9.606765422084908e-05, "loss": 0.3058, "step": 900 }, { "epoch": 0.6430868167202572, "eval_news_finetune_val_loss": 0.3292103707790375, "eval_news_finetune_val_runtime": 1003.4558, "eval_news_finetune_val_samples_per_second": 1.395, "eval_news_finetune_val_steps_per_second": 1.395, "step": 900 }, { "epoch": 0.6502322257949268, "grad_norm": 1.0373942852020264, "learning_rate": 9.590439702859351e-05, "loss": 0.3318, "step": 910 }, { "epoch": 0.6573776348695963, "grad_norm": 1.2724213600158691, "learning_rate": 9.573796400405544e-05, "loss": 0.3328, "step": 920 }, { "epoch": 0.6645230439442658, "grad_norm": 0.8528966903686523, "learning_rate": 9.55683666616737e-05, "loss": 0.2673, "step": 930 }, { "epoch": 0.6716684530189353, "grad_norm": 1.65499746799469, "learning_rate": 9.539561673480612e-05, "loss": 0.3538, "step": 940 }, { "epoch": 0.6788138620936048, "grad_norm": 2.341379404067993, "learning_rate": 9.521972617491767e-05, "loss": 0.3228, "step": 950 }, { "epoch": 0.6859592711682744, "grad_norm": 1.4938244819641113, "learning_rate": 9.504070715075372e-05, "loss": 0.3974, "step": 960 }, { "epoch": 0.6931046802429439, "grad_norm": 1.0390361547470093, "learning_rate": 9.485857204749811e-05, "loss": 0.3236, "step": 970 }, { "epoch": 0.7002500893176135, "grad_norm": 3.8845393657684326, "learning_rate": 9.467333346591632e-05, "loss": 0.3027, "step": 980 }, { "epoch": 0.707395498392283, "grad_norm": 1.3295674324035645, "learning_rate": 9.448500422148364e-05, "loss": 0.3005, "step": 990 }, { "epoch": 0.7145409074669525, "grad_norm": 1.0146369934082031, "learning_rate": 9.429359734349863e-05, "loss": 0.294, "step": 1000 }, { "epoch": 0.7145409074669525, "eval_news_finetune_val_loss": 0.3208242654800415, "eval_news_finetune_val_runtime": 1003.2491, "eval_news_finetune_val_samples_per_second": 1.395, "eval_news_finetune_val_steps_per_second": 1.395, "step": 1000 }, { "epoch": 0.721686316541622, "grad_norm": 1.5076738595962524, "learning_rate": 9.409912607418172e-05, "loss": 0.268, "step": 1010 }, { "epoch": 0.7288317256162915, "grad_norm": 3.3230276107788086, "learning_rate": 9.390160386775895e-05, "loss": 0.3038, "step": 1020 }, { "epoch": 0.735977134690961, "grad_norm": 1.699854850769043, "learning_rate": 9.370104438953125e-05, "loss": 0.2869, "step": 1030 }, { "epoch": 0.7431225437656306, "grad_norm": 0.904507577419281, "learning_rate": 9.349746151492902e-05, "loss": 0.289, "step": 1040 }, { "epoch": 0.7502679528403001, "grad_norm": 0.9463105201721191, "learning_rate": 9.329086932855215e-05, "loss": 0.3729, "step": 1050 }, { "epoch": 0.7574133619149697, "grad_norm": 1.4746607542037964, "learning_rate": 9.30812821231956e-05, "loss": 0.2282, "step": 1060 }, { "epoch": 0.7645587709896392, "grad_norm": 1.0270076990127563, "learning_rate": 9.286871439886058e-05, "loss": 0.3029, "step": 1070 }, { "epoch": 0.7717041800643086, "grad_norm": 2.0656538009643555, "learning_rate": 9.265318086175143e-05, "loss": 0.3268, "step": 1080 }, { "epoch": 0.7788495891389782, "grad_norm": 0.9798826575279236, "learning_rate": 9.243469642325805e-05, "loss": 0.2942, "step": 1090 }, { "epoch": 0.7859949982136477, "grad_norm": 1.1419672966003418, "learning_rate": 9.221327619892452e-05, "loss": 0.3266, "step": 1100 }, { "epoch": 0.7859949982136477, "eval_news_finetune_val_loss": 0.307956337928772, "eval_news_finetune_val_runtime": 1003.1873, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1100 }, { "epoch": 0.7931404072883173, "grad_norm": 0.6810228228569031, "learning_rate": 9.198893550740306e-05, "loss": 0.3596, "step": 1110 }, { "epoch": 0.8002858163629868, "grad_norm": 1.6553049087524414, "learning_rate": 9.176168986939446e-05, "loss": 0.3106, "step": 1120 }, { "epoch": 0.8074312254376563, "grad_norm": 0.7749443650245667, "learning_rate": 9.153155500657422e-05, "loss": 0.3298, "step": 1130 }, { "epoch": 0.8145766345123259, "grad_norm": 0.8693751096725464, "learning_rate": 9.129854684050481e-05, "loss": 0.279, "step": 1140 }, { "epoch": 0.8217220435869954, "grad_norm": 1.1013332605361938, "learning_rate": 9.10626814915343e-05, "loss": 0.3195, "step": 1150 }, { "epoch": 0.8288674526616648, "grad_norm": 1.2278695106506348, "learning_rate": 9.082397527768092e-05, "loss": 0.3027, "step": 1160 }, { "epoch": 0.8360128617363344, "grad_norm": 2.173530101776123, "learning_rate": 9.058244471350428e-05, "loss": 0.2238, "step": 1170 }, { "epoch": 0.8431582708110039, "grad_norm": 1.125986933708191, "learning_rate": 9.033810650896274e-05, "loss": 0.2399, "step": 1180 }, { "epoch": 0.8503036798856735, "grad_norm": 0.6611151099205017, "learning_rate": 9.009097756825737e-05, "loss": 0.2736, "step": 1190 }, { "epoch": 0.857449088960343, "grad_norm": 1.9068485498428345, "learning_rate": 8.98410749886625e-05, "loss": 0.2949, "step": 1200 }, { "epoch": 0.857449088960343, "eval_news_finetune_val_loss": 0.31006094813346863, "eval_news_finetune_val_runtime": 1002.7866, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1200 }, { "epoch": 0.8645944980350125, "grad_norm": 1.192031979560852, "learning_rate": 8.958841605934278e-05, "loss": 0.3657, "step": 1210 }, { "epoch": 0.8717399071096821, "grad_norm": 1.2596725225448608, "learning_rate": 8.933301826015715e-05, "loss": 0.3068, "step": 1220 }, { "epoch": 0.8788853161843515, "grad_norm": 1.4713683128356934, "learning_rate": 8.907489926044945e-05, "loss": 0.3122, "step": 1230 }, { "epoch": 0.886030725259021, "grad_norm": 1.3583886623382568, "learning_rate": 8.881407691782608e-05, "loss": 0.2989, "step": 1240 }, { "epoch": 0.8931761343336906, "grad_norm": 0.9863426089286804, "learning_rate": 8.855056927692037e-05, "loss": 0.2549, "step": 1250 }, { "epoch": 0.9003215434083601, "grad_norm": 1.0579396486282349, "learning_rate": 8.828439456814442e-05, "loss": 0.2809, "step": 1260 }, { "epoch": 0.9074669524830297, "grad_norm": 2.847482681274414, "learning_rate": 8.801557120642766e-05, "loss": 0.2933, "step": 1270 }, { "epoch": 0.9146123615576992, "grad_norm": 0.8942415118217468, "learning_rate": 8.774411778994295e-05, "loss": 0.2866, "step": 1280 }, { "epoch": 0.9217577706323687, "grad_norm": 1.297845721244812, "learning_rate": 8.747005309881984e-05, "loss": 0.2939, "step": 1290 }, { "epoch": 0.9289031797070382, "grad_norm": 1.2745181322097778, "learning_rate": 8.719339609384531e-05, "loss": 0.3018, "step": 1300 }, { "epoch": 0.9289031797070382, "eval_news_finetune_val_loss": 0.29822030663490295, "eval_news_finetune_val_runtime": 1002.5672, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1300 }, { "epoch": 0.9360485887817077, "grad_norm": 1.3898978233337402, "learning_rate": 8.691416591515198e-05, "loss": 0.295, "step": 1310 }, { "epoch": 0.9431939978563773, "grad_norm": 1.1516591310501099, "learning_rate": 8.663238188089398e-05, "loss": 0.209, "step": 1320 }, { "epoch": 0.9503394069310468, "grad_norm": 0.9356768131256104, "learning_rate": 8.634806348591036e-05, "loss": 0.2904, "step": 1330 }, { "epoch": 0.9574848160057163, "grad_norm": 1.884950876235962, "learning_rate": 8.606123040037643e-05, "loss": 0.2607, "step": 1340 }, { "epoch": 0.9646302250803859, "grad_norm": 1.2719082832336426, "learning_rate": 8.577190246844291e-05, "loss": 0.3279, "step": 1350 }, { "epoch": 0.9717756341550554, "grad_norm": 0.935297429561615, "learning_rate": 8.548009970686302e-05, "loss": 0.3011, "step": 1360 }, { "epoch": 0.978921043229725, "grad_norm": 1.6732884645462036, "learning_rate": 8.51858423036076e-05, "loss": 0.2379, "step": 1370 }, { "epoch": 0.9860664523043944, "grad_norm": 0.6651692390441895, "learning_rate": 8.488915061646856e-05, "loss": 0.2599, "step": 1380 }, { "epoch": 0.9932118613790639, "grad_norm": 1.121752381324768, "learning_rate": 8.459004517165032e-05, "loss": 0.2265, "step": 1390 }, { "epoch": 1.0, "grad_norm": 0.5099928379058838, "learning_rate": 8.428854666234978e-05, "loss": 0.3301, "step": 1400 }, { "epoch": 1.0, "eval_news_finetune_val_loss": 0.28762951493263245, "eval_news_finetune_val_runtime": 1002.7793, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1400 }, { "epoch": 1.0071454090746694, "grad_norm": 0.9986103177070618, "learning_rate": 8.398467594732478e-05, "loss": 0.2021, "step": 1410 }, { "epoch": 1.014290818149339, "grad_norm": 1.2675282955169678, "learning_rate": 8.367845404945084e-05, "loss": 0.2228, "step": 1420 }, { "epoch": 1.0214362272240085, "grad_norm": 0.8156709671020508, "learning_rate": 8.336990215426688e-05, "loss": 0.1947, "step": 1430 }, { "epoch": 1.0285816362986782, "grad_norm": 0.5374387502670288, "learning_rate": 8.305904160850941e-05, "loss": 0.2344, "step": 1440 }, { "epoch": 1.0357270453733476, "grad_norm": 0.6672261357307434, "learning_rate": 8.274589391863583e-05, "loss": 0.1919, "step": 1450 }, { "epoch": 1.0428724544480172, "grad_norm": 0.9803467988967896, "learning_rate": 8.243048074933634e-05, "loss": 0.2218, "step": 1460 }, { "epoch": 1.0500178635226867, "grad_norm": 1.482840657234192, "learning_rate": 8.21128239220353e-05, "loss": 0.2556, "step": 1470 }, { "epoch": 1.057163272597356, "grad_norm": 1.0589625835418701, "learning_rate": 8.179294541338135e-05, "loss": 0.2052, "step": 1480 }, { "epoch": 1.0643086816720257, "grad_norm": 0.8332052230834961, "learning_rate": 8.147086735372716e-05, "loss": 0.2386, "step": 1490 }, { "epoch": 1.0714540907466952, "grad_norm": 0.6018723845481873, "learning_rate": 8.114661202559828e-05, "loss": 0.1426, "step": 1500 }, { "epoch": 1.0714540907466952, "eval_news_finetune_val_loss": 0.30121028423309326, "eval_news_finetune_val_runtime": 1002.7457, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1500 }, { "epoch": 1.0785994998213648, "grad_norm": 1.7663507461547852, "learning_rate": 8.082020186215156e-05, "loss": 0.2407, "step": 1510 }, { "epoch": 1.0857449088960343, "grad_norm": 1.2081632614135742, "learning_rate": 8.049165944562316e-05, "loss": 0.2483, "step": 1520 }, { "epoch": 1.092890317970704, "grad_norm": 0.5045826435089111, "learning_rate": 8.016100750576621e-05, "loss": 0.2013, "step": 1530 }, { "epoch": 1.1000357270453733, "grad_norm": 1.4456278085708618, "learning_rate": 7.98282689182783e-05, "loss": 0.2034, "step": 1540 }, { "epoch": 1.107181136120043, "grad_norm": 1.1558668613433838, "learning_rate": 7.949346670321891e-05, "loss": 0.2386, "step": 1550 }, { "epoch": 1.1143265451947124, "grad_norm": 1.4196126461029053, "learning_rate": 7.915662402341664e-05, "loss": 0.2299, "step": 1560 }, { "epoch": 1.1214719542693818, "grad_norm": 0.9341222047805786, "learning_rate": 7.88177641828669e-05, "loss": 0.2105, "step": 1570 }, { "epoch": 1.1286173633440515, "grad_norm": 1.066001296043396, "learning_rate": 7.847691062511957e-05, "loss": 0.1925, "step": 1580 }, { "epoch": 1.135762772418721, "grad_norm": 0.7840182781219482, "learning_rate": 7.813408693165704e-05, "loss": 0.2425, "step": 1590 }, { "epoch": 1.1429081814933906, "grad_norm": 0.983668327331543, "learning_rate": 7.778931682026293e-05, "loss": 0.2014, "step": 1600 }, { "epoch": 1.1429081814933906, "eval_news_finetune_val_loss": 0.29564452171325684, "eval_news_finetune_val_runtime": 1003.001, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1600 }, { "epoch": 1.15005359056806, "grad_norm": 1.63984215259552, "learning_rate": 7.744262414338099e-05, "loss": 0.2863, "step": 1610 }, { "epoch": 1.1571989996427297, "grad_norm": 0.9211621284484863, "learning_rate": 7.709403288646507e-05, "loss": 0.2175, "step": 1620 }, { "epoch": 1.164344408717399, "grad_norm": 1.3369996547698975, "learning_rate": 7.67435671663196e-05, "loss": 0.1893, "step": 1630 }, { "epoch": 1.1714898177920685, "grad_norm": 0.7532891631126404, "learning_rate": 7.63912512294312e-05, "loss": 0.2483, "step": 1640 }, { "epoch": 1.1786352268667382, "grad_norm": 1.0959442853927612, "learning_rate": 7.603710945029119e-05, "loss": 0.1888, "step": 1650 }, { "epoch": 1.1857806359414076, "grad_norm": 0.9019472599029541, "learning_rate": 7.568116632970922e-05, "loss": 0.2144, "step": 1660 }, { "epoch": 1.1929260450160772, "grad_norm": 1.1219818592071533, "learning_rate": 7.532344649311829e-05, "loss": 0.191, "step": 1670 }, { "epoch": 1.2000714540907467, "grad_norm": 1.0829100608825684, "learning_rate": 7.496397468887106e-05, "loss": 0.2762, "step": 1680 }, { "epoch": 1.2072168631654163, "grad_norm": 0.7855832576751709, "learning_rate": 7.460277578652759e-05, "loss": 0.157, "step": 1690 }, { "epoch": 1.2143622722400857, "grad_norm": 2.407999038696289, "learning_rate": 7.423987477513488e-05, "loss": 0.2627, "step": 1700 }, { "epoch": 1.2143622722400857, "eval_news_finetune_val_loss": 0.28248873353004456, "eval_news_finetune_val_runtime": 1003.1081, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1700 }, { "epoch": 1.2215076813147552, "grad_norm": 1.5500895977020264, "learning_rate": 7.387529676149799e-05, "loss": 0.1477, "step": 1710 }, { "epoch": 1.2286530903894248, "grad_norm": 1.5599130392074585, "learning_rate": 7.350906696844307e-05, "loss": 0.1942, "step": 1720 }, { "epoch": 1.2357984994640943, "grad_norm": 1.6327091455459595, "learning_rate": 7.314121073307229e-05, "loss": 0.2, "step": 1730 }, { "epoch": 1.242943908538764, "grad_norm": 0.6044666767120361, "learning_rate": 7.277175350501111e-05, "loss": 0.185, "step": 1740 }, { "epoch": 1.2500893176134333, "grad_norm": 1.317089319229126, "learning_rate": 7.240072084464729e-05, "loss": 0.196, "step": 1750 }, { "epoch": 1.257234726688103, "grad_norm": 1.089105486869812, "learning_rate": 7.202813842136283e-05, "loss": 0.1322, "step": 1760 }, { "epoch": 1.2643801357627724, "grad_norm": 1.4972888231277466, "learning_rate": 7.165403201175787e-05, "loss": 0.2176, "step": 1770 }, { "epoch": 1.2715255448374418, "grad_norm": 1.4998830556869507, "learning_rate": 7.127842749786747e-05, "loss": 0.218, "step": 1780 }, { "epoch": 1.2786709539121115, "grad_norm": 0.9759517908096313, "learning_rate": 7.090135086537095e-05, "loss": 0.1653, "step": 1790 }, { "epoch": 1.285816362986781, "grad_norm": 0.9713583588600159, "learning_rate": 7.052282820179412e-05, "loss": 0.175, "step": 1800 }, { "epoch": 1.285816362986781, "eval_news_finetune_val_loss": 0.2936909794807434, "eval_news_finetune_val_runtime": 1003.12, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1800 }, { "epoch": 1.2929617720614506, "grad_norm": 0.6328814625740051, "learning_rate": 7.014288569470446e-05, "loss": 0.1727, "step": 1810 }, { "epoch": 1.30010718113612, "grad_norm": 1.622104525566101, "learning_rate": 6.976154962989934e-05, "loss": 0.2363, "step": 1820 }, { "epoch": 1.3072525902107897, "grad_norm": 1.8254674673080444, "learning_rate": 6.937884638958757e-05, "loss": 0.1897, "step": 1830 }, { "epoch": 1.314397999285459, "grad_norm": 0.8813793063163757, "learning_rate": 6.899480245056396e-05, "loss": 0.2029, "step": 1840 }, { "epoch": 1.3215434083601285, "grad_norm": 0.7675999999046326, "learning_rate": 6.860944438237788e-05, "loss": 0.2025, "step": 1850 }, { "epoch": 1.3286888174347982, "grad_norm": 1.1973013877868652, "learning_rate": 6.82227988454948e-05, "loss": 0.2317, "step": 1860 }, { "epoch": 1.3358342265094676, "grad_norm": 0.7864009737968445, "learning_rate": 6.783489258945195e-05, "loss": 0.2318, "step": 1870 }, { "epoch": 1.3429796355841372, "grad_norm": 1.0866330862045288, "learning_rate": 6.74457524510077e-05, "loss": 0.1871, "step": 1880 }, { "epoch": 1.3501250446588067, "grad_norm": 0.8745126724243164, "learning_rate": 6.705540535228485e-05, "loss": 0.211, "step": 1890 }, { "epoch": 1.3572704537334763, "grad_norm": 1.3401581048965454, "learning_rate": 6.66638782989081e-05, "loss": 0.2307, "step": 1900 }, { "epoch": 1.3572704537334763, "eval_news_finetune_val_loss": 0.2787444591522217, "eval_news_finetune_val_runtime": 1002.9344, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 1900 }, { "epoch": 1.3644158628081458, "grad_norm": 0.6149284839630127, "learning_rate": 6.627119837813564e-05, "loss": 0.2128, "step": 1910 }, { "epoch": 1.3715612718828152, "grad_norm": 1.7847625017166138, "learning_rate": 6.587739275698525e-05, "loss": 0.1551, "step": 1920 }, { "epoch": 1.3787066809574848, "grad_norm": 1.1973716020584106, "learning_rate": 6.54824886803547e-05, "loss": 0.2335, "step": 1930 }, { "epoch": 1.3858520900321543, "grad_norm": 1.5757859945297241, "learning_rate": 6.508651346913687e-05, "loss": 0.1504, "step": 1940 }, { "epoch": 1.392997499106824, "grad_norm": 1.7269341945648193, "learning_rate": 6.468949451832968e-05, "loss": 0.2679, "step": 1950 }, { "epoch": 1.4001429081814933, "grad_norm": 1.6860129833221436, "learning_rate": 6.429145929514063e-05, "loss": 0.1942, "step": 1960 }, { "epoch": 1.407288317256163, "grad_norm": 1.1732631921768188, "learning_rate": 6.389243533708671e-05, "loss": 0.2025, "step": 1970 }, { "epoch": 1.4144337263308324, "grad_norm": 0.9073033332824707, "learning_rate": 6.349245025008912e-05, "loss": 0.1836, "step": 1980 }, { "epoch": 1.4215791354055018, "grad_norm": 1.133843183517456, "learning_rate": 6.309153170656342e-05, "loss": 0.1526, "step": 1990 }, { "epoch": 1.4287245444801715, "grad_norm": 2.656296968460083, "learning_rate": 6.268970744350515e-05, "loss": 0.1939, "step": 2000 }, { "epoch": 1.4287245444801715, "eval_news_finetune_val_loss": 0.27414408326148987, "eval_news_finetune_val_runtime": 1003.0949, "eval_news_finetune_val_samples_per_second": 1.396, "eval_news_finetune_val_steps_per_second": 1.396, "step": 2000 } ], "logging_steps": 10, "max_steps": 4197, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.538125336973312e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }