OsamaMo's picture
Training in progress, step 2000, checkpoint
c994d38 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4287245444801715,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007145409074669525,
"grad_norm": 4.4086809158325195,
"learning_rate": 2.3809523809523808e-06,
"loss": 1.0969,
"step": 10
},
{
"epoch": 0.01429081814933905,
"grad_norm": 5.687011241912842,
"learning_rate": 4.7619047619047615e-06,
"loss": 1.0795,
"step": 20
},
{
"epoch": 0.021436227224008574,
"grad_norm": 1.976590633392334,
"learning_rate": 7.142857142857143e-06,
"loss": 0.7536,
"step": 30
},
{
"epoch": 0.0285816362986781,
"grad_norm": 3.1355409622192383,
"learning_rate": 9.523809523809523e-06,
"loss": 0.5564,
"step": 40
},
{
"epoch": 0.03572704537334762,
"grad_norm": 2.6710309982299805,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.623,
"step": 50
},
{
"epoch": 0.04287245444801715,
"grad_norm": 2.8567938804626465,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.5322,
"step": 60
},
{
"epoch": 0.050017863522686674,
"grad_norm": 3.4388861656188965,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.5102,
"step": 70
},
{
"epoch": 0.0571632725973562,
"grad_norm": 3.093275308609009,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.568,
"step": 80
},
{
"epoch": 0.06430868167202572,
"grad_norm": 2.3798677921295166,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.4883,
"step": 90
},
{
"epoch": 0.07145409074669525,
"grad_norm": 2.846259117126465,
"learning_rate": 2.380952380952381e-05,
"loss": 0.417,
"step": 100
},
{
"epoch": 0.07145409074669525,
"eval_news_finetune_val_loss": 0.48679304122924805,
"eval_news_finetune_val_runtime": 1001.9158,
"eval_news_finetune_val_samples_per_second": 1.397,
"eval_news_finetune_val_steps_per_second": 1.397,
"step": 100
},
{
"epoch": 0.07859949982136477,
"grad_norm": 1.9387887716293335,
"learning_rate": 2.6190476190476192e-05,
"loss": 0.4595,
"step": 110
},
{
"epoch": 0.0857449088960343,
"grad_norm": 2.3232853412628174,
"learning_rate": 2.857142857142857e-05,
"loss": 0.4658,
"step": 120
},
{
"epoch": 0.09289031797070382,
"grad_norm": 2.813093423843384,
"learning_rate": 3.095238095238095e-05,
"loss": 0.4122,
"step": 130
},
{
"epoch": 0.10003572704537335,
"grad_norm": 1.9588465690612793,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4878,
"step": 140
},
{
"epoch": 0.10718113612004287,
"grad_norm": 1.4838117361068726,
"learning_rate": 3.571428571428572e-05,
"loss": 0.4168,
"step": 150
},
{
"epoch": 0.1143265451947124,
"grad_norm": 3.020738124847412,
"learning_rate": 3.809523809523809e-05,
"loss": 0.4298,
"step": 160
},
{
"epoch": 0.12147195426938193,
"grad_norm": 2.097656011581421,
"learning_rate": 4.047619047619048e-05,
"loss": 0.4413,
"step": 170
},
{
"epoch": 0.12861736334405144,
"grad_norm": 1.6332950592041016,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.3734,
"step": 180
},
{
"epoch": 0.13576277241872098,
"grad_norm": 2.1570417881011963,
"learning_rate": 4.523809523809524e-05,
"loss": 0.4015,
"step": 190
},
{
"epoch": 0.1429081814933905,
"grad_norm": 1.6941479444503784,
"learning_rate": 4.761904761904762e-05,
"loss": 0.4411,
"step": 200
},
{
"epoch": 0.1429081814933905,
"eval_news_finetune_val_loss": 0.4338369369506836,
"eval_news_finetune_val_runtime": 1002.1695,
"eval_news_finetune_val_samples_per_second": 1.397,
"eval_news_finetune_val_steps_per_second": 1.397,
"step": 200
},
{
"epoch": 0.15005359056806003,
"grad_norm": 2.3582301139831543,
"learning_rate": 5e-05,
"loss": 0.3697,
"step": 210
},
{
"epoch": 0.15719899964272954,
"grad_norm": 2.0517632961273193,
"learning_rate": 5.2380952380952384e-05,
"loss": 0.4076,
"step": 220
},
{
"epoch": 0.16434440871739908,
"grad_norm": 1.3338748216629028,
"learning_rate": 5.4761904761904766e-05,
"loss": 0.3307,
"step": 230
},
{
"epoch": 0.1714898177920686,
"grad_norm": 3.0515363216400146,
"learning_rate": 5.714285714285714e-05,
"loss": 0.4227,
"step": 240
},
{
"epoch": 0.17863522686673813,
"grad_norm": 2.4899113178253174,
"learning_rate": 5.9523809523809524e-05,
"loss": 0.4689,
"step": 250
},
{
"epoch": 0.18578063594140765,
"grad_norm": 1.6197255849838257,
"learning_rate": 6.19047619047619e-05,
"loss": 0.3618,
"step": 260
},
{
"epoch": 0.19292604501607716,
"grad_norm": 1.654628872871399,
"learning_rate": 6.428571428571429e-05,
"loss": 0.4668,
"step": 270
},
{
"epoch": 0.2000714540907467,
"grad_norm": 1.6470831632614136,
"learning_rate": 6.666666666666667e-05,
"loss": 0.3525,
"step": 280
},
{
"epoch": 0.2072168631654162,
"grad_norm": 2.640536308288574,
"learning_rate": 6.904761904761905e-05,
"loss": 0.3707,
"step": 290
},
{
"epoch": 0.21436227224008575,
"grad_norm": 2.3426971435546875,
"learning_rate": 7.142857142857143e-05,
"loss": 0.4461,
"step": 300
},
{
"epoch": 0.21436227224008575,
"eval_news_finetune_val_loss": 0.40391305088996887,
"eval_news_finetune_val_runtime": 1002.5797,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 300
},
{
"epoch": 0.22150768131475526,
"grad_norm": 1.0351321697235107,
"learning_rate": 7.380952380952382e-05,
"loss": 0.3439,
"step": 310
},
{
"epoch": 0.2286530903894248,
"grad_norm": 3.062483549118042,
"learning_rate": 7.619047619047618e-05,
"loss": 0.4492,
"step": 320
},
{
"epoch": 0.2357984994640943,
"grad_norm": 2.095825672149658,
"learning_rate": 7.857142857142858e-05,
"loss": 0.3399,
"step": 330
},
{
"epoch": 0.24294390853876385,
"grad_norm": 1.700642704963684,
"learning_rate": 8.095238095238096e-05,
"loss": 0.4336,
"step": 340
},
{
"epoch": 0.2500893176134334,
"grad_norm": 1.6802127361297607,
"learning_rate": 8.333333333333334e-05,
"loss": 0.3628,
"step": 350
},
{
"epoch": 0.2572347266881029,
"grad_norm": 1.1725817918777466,
"learning_rate": 8.571428571428571e-05,
"loss": 0.4113,
"step": 360
},
{
"epoch": 0.2643801357627724,
"grad_norm": 1.0182325839996338,
"learning_rate": 8.80952380952381e-05,
"loss": 0.4009,
"step": 370
},
{
"epoch": 0.27152554483744196,
"grad_norm": 2.5762252807617188,
"learning_rate": 9.047619047619048e-05,
"loss": 0.3399,
"step": 380
},
{
"epoch": 0.27867095391211144,
"grad_norm": 1.5393809080123901,
"learning_rate": 9.285714285714286e-05,
"loss": 0.326,
"step": 390
},
{
"epoch": 0.285816362986781,
"grad_norm": 2.3259921073913574,
"learning_rate": 9.523809523809524e-05,
"loss": 0.4228,
"step": 400
},
{
"epoch": 0.285816362986781,
"eval_news_finetune_val_loss": 0.39322975277900696,
"eval_news_finetune_val_runtime": 1002.8865,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 400
},
{
"epoch": 0.2929617720614505,
"grad_norm": 0.9278184771537781,
"learning_rate": 9.761904761904762e-05,
"loss": 0.3184,
"step": 410
},
{
"epoch": 0.30010718113612006,
"grad_norm": 1.4571782350540161,
"learning_rate": 0.0001,
"loss": 0.473,
"step": 420
},
{
"epoch": 0.30725259021078954,
"grad_norm": 1.6199829578399658,
"learning_rate": 9.99982704095424e-05,
"loss": 0.392,
"step": 430
},
{
"epoch": 0.3143979992854591,
"grad_norm": 1.302309513092041,
"learning_rate": 9.999308175782893e-05,
"loss": 0.3824,
"step": 440
},
{
"epoch": 0.3215434083601286,
"grad_norm": 1.438289761543274,
"learning_rate": 9.998443440382927e-05,
"loss": 0.4001,
"step": 450
},
{
"epoch": 0.32868881743479816,
"grad_norm": 1.7557189464569092,
"learning_rate": 9.997232894579868e-05,
"loss": 0.4144,
"step": 460
},
{
"epoch": 0.33583422650946765,
"grad_norm": 0.9362027645111084,
"learning_rate": 9.995676622123655e-05,
"loss": 0.3094,
"step": 470
},
{
"epoch": 0.3429796355841372,
"grad_norm": 1.7850221395492554,
"learning_rate": 9.993774730682845e-05,
"loss": 0.2966,
"step": 480
},
{
"epoch": 0.35012504465880673,
"grad_norm": 1.705842137336731,
"learning_rate": 9.991527351837174e-05,
"loss": 0.3274,
"step": 490
},
{
"epoch": 0.35727045373347627,
"grad_norm": 1.0722746849060059,
"learning_rate": 9.988934641068436e-05,
"loss": 0.4301,
"step": 500
},
{
"epoch": 0.35727045373347627,
"eval_news_finetune_val_loss": 0.3787713646888733,
"eval_news_finetune_val_runtime": 1002.8588,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 500
},
{
"epoch": 0.36441586280814575,
"grad_norm": 1.282714605331421,
"learning_rate": 9.985996777749747e-05,
"loss": 0.3636,
"step": 510
},
{
"epoch": 0.3715612718828153,
"grad_norm": 2.0360989570617676,
"learning_rate": 9.982713965133122e-05,
"loss": 0.4467,
"step": 520
},
{
"epoch": 0.37870668095748483,
"grad_norm": 1.7432626485824585,
"learning_rate": 9.979086430335417e-05,
"loss": 0.3875,
"step": 530
},
{
"epoch": 0.3858520900321543,
"grad_norm": 1.6053438186645508,
"learning_rate": 9.975114424322609e-05,
"loss": 0.3646,
"step": 540
},
{
"epoch": 0.39299749910682386,
"grad_norm": 1.2323070764541626,
"learning_rate": 9.970798221892452e-05,
"loss": 0.353,
"step": 550
},
{
"epoch": 0.4001429081814934,
"grad_norm": 1.16932213306427,
"learning_rate": 9.966138121655445e-05,
"loss": 0.331,
"step": 560
},
{
"epoch": 0.40728831725616294,
"grad_norm": 1.8134998083114624,
"learning_rate": 9.961134446014184e-05,
"loss": 0.3132,
"step": 570
},
{
"epoch": 0.4144337263308324,
"grad_norm": 1.4292124509811401,
"learning_rate": 9.955787541141055e-05,
"loss": 0.3017,
"step": 580
},
{
"epoch": 0.42157913540550196,
"grad_norm": 1.4605034589767456,
"learning_rate": 9.950097776954284e-05,
"loss": 0.3596,
"step": 590
},
{
"epoch": 0.4287245444801715,
"grad_norm": 1.2365972995758057,
"learning_rate": 9.944065547092345e-05,
"loss": 0.3399,
"step": 600
},
{
"epoch": 0.4287245444801715,
"eval_news_finetune_val_loss": 0.36549311876296997,
"eval_news_finetune_val_runtime": 1002.8044,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 600
},
{
"epoch": 0.43586995355484104,
"grad_norm": 1.0590678453445435,
"learning_rate": 9.937691268886725e-05,
"loss": 0.3747,
"step": 610
},
{
"epoch": 0.4430153626295105,
"grad_norm": 0.9111473560333252,
"learning_rate": 9.930975383333056e-05,
"loss": 0.2868,
"step": 620
},
{
"epoch": 0.45016077170418006,
"grad_norm": 2.0456018447875977,
"learning_rate": 9.923918355060599e-05,
"loss": 0.3289,
"step": 630
},
{
"epoch": 0.4573061807788496,
"grad_norm": 1.5998501777648926,
"learning_rate": 9.916520672300107e-05,
"loss": 0.3664,
"step": 640
},
{
"epoch": 0.4644515898535191,
"grad_norm": 1.0773181915283203,
"learning_rate": 9.908782846850037e-05,
"loss": 0.3432,
"step": 650
},
{
"epoch": 0.4715969989281886,
"grad_norm": 1.244042158126831,
"learning_rate": 9.900705414041154e-05,
"loss": 0.3242,
"step": 660
},
{
"epoch": 0.47874240800285817,
"grad_norm": 1.8120310306549072,
"learning_rate": 9.892288932699484e-05,
"loss": 0.317,
"step": 670
},
{
"epoch": 0.4858878170775277,
"grad_norm": 0.7863224148750305,
"learning_rate": 9.883533985107663e-05,
"loss": 0.322,
"step": 680
},
{
"epoch": 0.4930332261521972,
"grad_norm": 1.223832130432129,
"learning_rate": 9.874441176964642e-05,
"loss": 0.343,
"step": 690
},
{
"epoch": 0.5001786352268668,
"grad_norm": 0.9870743155479431,
"learning_rate": 9.865011137343787e-05,
"loss": 0.3278,
"step": 700
},
{
"epoch": 0.5001786352268668,
"eval_news_finetune_val_loss": 0.35386842489242554,
"eval_news_finetune_val_runtime": 1003.4109,
"eval_news_finetune_val_samples_per_second": 1.395,
"eval_news_finetune_val_steps_per_second": 1.395,
"step": 700
},
{
"epoch": 0.5073240443015362,
"grad_norm": 1.3699963092803955,
"learning_rate": 9.85524451864936e-05,
"loss": 0.3902,
"step": 710
},
{
"epoch": 0.5144694533762058,
"grad_norm": 1.7188071012496948,
"learning_rate": 9.845141996571384e-05,
"loss": 0.369,
"step": 720
},
{
"epoch": 0.5216148624508753,
"grad_norm": 0.4889034628868103,
"learning_rate": 9.834704270038888e-05,
"loss": 0.3174,
"step": 730
},
{
"epoch": 0.5287602715255448,
"grad_norm": 0.8782143592834473,
"learning_rate": 9.823932061171561e-05,
"loss": 0.3501,
"step": 740
},
{
"epoch": 0.5359056806002144,
"grad_norm": 2.4089126586914062,
"learning_rate": 9.812826115229789e-05,
"loss": 0.3292,
"step": 750
},
{
"epoch": 0.5430510896748839,
"grad_norm": 1.6382787227630615,
"learning_rate": 9.801387200563096e-05,
"loss": 0.459,
"step": 760
},
{
"epoch": 0.5501964987495535,
"grad_norm": 1.443916916847229,
"learning_rate": 9.789616108556992e-05,
"loss": 0.3409,
"step": 770
},
{
"epoch": 0.5573419078242229,
"grad_norm": 1.632278323173523,
"learning_rate": 9.77751365357821e-05,
"loss": 0.281,
"step": 780
},
{
"epoch": 0.5644873168988924,
"grad_norm": 2.1452109813690186,
"learning_rate": 9.765080672918374e-05,
"loss": 0.3511,
"step": 790
},
{
"epoch": 0.571632725973562,
"grad_norm": 1.2721842527389526,
"learning_rate": 9.752318026736078e-05,
"loss": 0.2298,
"step": 800
},
{
"epoch": 0.571632725973562,
"eval_news_finetune_val_loss": 0.34554028511047363,
"eval_news_finetune_val_runtime": 1003.3342,
"eval_news_finetune_val_samples_per_second": 1.395,
"eval_news_finetune_val_steps_per_second": 1.395,
"step": 800
},
{
"epoch": 0.5787781350482315,
"grad_norm": 2.5264174938201904,
"learning_rate": 9.739226597997359e-05,
"loss": 0.3214,
"step": 810
},
{
"epoch": 0.585923544122901,
"grad_norm": 1.4553183317184448,
"learning_rate": 9.725807292414629e-05,
"loss": 0.2697,
"step": 820
},
{
"epoch": 0.5930689531975706,
"grad_norm": 2.2111873626708984,
"learning_rate": 9.712061038384002e-05,
"loss": 0.3315,
"step": 830
},
{
"epoch": 0.6002143622722401,
"grad_norm": 1.4308302402496338,
"learning_rate": 9.697988786921071e-05,
"loss": 0.4036,
"step": 840
},
{
"epoch": 0.6073597713469097,
"grad_norm": 1.8136054277420044,
"learning_rate": 9.683591511595107e-05,
"loss": 0.2946,
"step": 850
},
{
"epoch": 0.6145051804215791,
"grad_norm": 1.8586084842681885,
"learning_rate": 9.668870208461713e-05,
"loss": 0.2259,
"step": 860
},
{
"epoch": 0.6216505894962486,
"grad_norm": 1.1640444993972778,
"learning_rate": 9.653825895993908e-05,
"loss": 0.4,
"step": 870
},
{
"epoch": 0.6287959985709182,
"grad_norm": 1.386013388633728,
"learning_rate": 9.63845961501166e-05,
"loss": 0.2804,
"step": 880
},
{
"epoch": 0.6359414076455877,
"grad_norm": 2.1413650512695312,
"learning_rate": 9.622772428609887e-05,
"loss": 0.3593,
"step": 890
},
{
"epoch": 0.6430868167202572,
"grad_norm": 1.5462217330932617,
"learning_rate": 9.606765422084908e-05,
"loss": 0.3058,
"step": 900
},
{
"epoch": 0.6430868167202572,
"eval_news_finetune_val_loss": 0.3292103707790375,
"eval_news_finetune_val_runtime": 1003.4558,
"eval_news_finetune_val_samples_per_second": 1.395,
"eval_news_finetune_val_steps_per_second": 1.395,
"step": 900
},
{
"epoch": 0.6502322257949268,
"grad_norm": 1.0373942852020264,
"learning_rate": 9.590439702859351e-05,
"loss": 0.3318,
"step": 910
},
{
"epoch": 0.6573776348695963,
"grad_norm": 1.2724213600158691,
"learning_rate": 9.573796400405544e-05,
"loss": 0.3328,
"step": 920
},
{
"epoch": 0.6645230439442658,
"grad_norm": 0.8528966903686523,
"learning_rate": 9.55683666616737e-05,
"loss": 0.2673,
"step": 930
},
{
"epoch": 0.6716684530189353,
"grad_norm": 1.65499746799469,
"learning_rate": 9.539561673480612e-05,
"loss": 0.3538,
"step": 940
},
{
"epoch": 0.6788138620936048,
"grad_norm": 2.341379404067993,
"learning_rate": 9.521972617491767e-05,
"loss": 0.3228,
"step": 950
},
{
"epoch": 0.6859592711682744,
"grad_norm": 1.4938244819641113,
"learning_rate": 9.504070715075372e-05,
"loss": 0.3974,
"step": 960
},
{
"epoch": 0.6931046802429439,
"grad_norm": 1.0390361547470093,
"learning_rate": 9.485857204749811e-05,
"loss": 0.3236,
"step": 970
},
{
"epoch": 0.7002500893176135,
"grad_norm": 3.8845393657684326,
"learning_rate": 9.467333346591632e-05,
"loss": 0.3027,
"step": 980
},
{
"epoch": 0.707395498392283,
"grad_norm": 1.3295674324035645,
"learning_rate": 9.448500422148364e-05,
"loss": 0.3005,
"step": 990
},
{
"epoch": 0.7145409074669525,
"grad_norm": 1.0146369934082031,
"learning_rate": 9.429359734349863e-05,
"loss": 0.294,
"step": 1000
},
{
"epoch": 0.7145409074669525,
"eval_news_finetune_val_loss": 0.3208242654800415,
"eval_news_finetune_val_runtime": 1003.2491,
"eval_news_finetune_val_samples_per_second": 1.395,
"eval_news_finetune_val_steps_per_second": 1.395,
"step": 1000
},
{
"epoch": 0.721686316541622,
"grad_norm": 1.5076738595962524,
"learning_rate": 9.409912607418172e-05,
"loss": 0.268,
"step": 1010
},
{
"epoch": 0.7288317256162915,
"grad_norm": 3.3230276107788086,
"learning_rate": 9.390160386775895e-05,
"loss": 0.3038,
"step": 1020
},
{
"epoch": 0.735977134690961,
"grad_norm": 1.699854850769043,
"learning_rate": 9.370104438953125e-05,
"loss": 0.2869,
"step": 1030
},
{
"epoch": 0.7431225437656306,
"grad_norm": 0.904507577419281,
"learning_rate": 9.349746151492902e-05,
"loss": 0.289,
"step": 1040
},
{
"epoch": 0.7502679528403001,
"grad_norm": 0.9463105201721191,
"learning_rate": 9.329086932855215e-05,
"loss": 0.3729,
"step": 1050
},
{
"epoch": 0.7574133619149697,
"grad_norm": 1.4746607542037964,
"learning_rate": 9.30812821231956e-05,
"loss": 0.2282,
"step": 1060
},
{
"epoch": 0.7645587709896392,
"grad_norm": 1.0270076990127563,
"learning_rate": 9.286871439886058e-05,
"loss": 0.3029,
"step": 1070
},
{
"epoch": 0.7717041800643086,
"grad_norm": 2.0656538009643555,
"learning_rate": 9.265318086175143e-05,
"loss": 0.3268,
"step": 1080
},
{
"epoch": 0.7788495891389782,
"grad_norm": 0.9798826575279236,
"learning_rate": 9.243469642325805e-05,
"loss": 0.2942,
"step": 1090
},
{
"epoch": 0.7859949982136477,
"grad_norm": 1.1419672966003418,
"learning_rate": 9.221327619892452e-05,
"loss": 0.3266,
"step": 1100
},
{
"epoch": 0.7859949982136477,
"eval_news_finetune_val_loss": 0.307956337928772,
"eval_news_finetune_val_runtime": 1003.1873,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1100
},
{
"epoch": 0.7931404072883173,
"grad_norm": 0.6810228228569031,
"learning_rate": 9.198893550740306e-05,
"loss": 0.3596,
"step": 1110
},
{
"epoch": 0.8002858163629868,
"grad_norm": 1.6553049087524414,
"learning_rate": 9.176168986939446e-05,
"loss": 0.3106,
"step": 1120
},
{
"epoch": 0.8074312254376563,
"grad_norm": 0.7749443650245667,
"learning_rate": 9.153155500657422e-05,
"loss": 0.3298,
"step": 1130
},
{
"epoch": 0.8145766345123259,
"grad_norm": 0.8693751096725464,
"learning_rate": 9.129854684050481e-05,
"loss": 0.279,
"step": 1140
},
{
"epoch": 0.8217220435869954,
"grad_norm": 1.1013332605361938,
"learning_rate": 9.10626814915343e-05,
"loss": 0.3195,
"step": 1150
},
{
"epoch": 0.8288674526616648,
"grad_norm": 1.2278695106506348,
"learning_rate": 9.082397527768092e-05,
"loss": 0.3027,
"step": 1160
},
{
"epoch": 0.8360128617363344,
"grad_norm": 2.173530101776123,
"learning_rate": 9.058244471350428e-05,
"loss": 0.2238,
"step": 1170
},
{
"epoch": 0.8431582708110039,
"grad_norm": 1.125986933708191,
"learning_rate": 9.033810650896274e-05,
"loss": 0.2399,
"step": 1180
},
{
"epoch": 0.8503036798856735,
"grad_norm": 0.6611151099205017,
"learning_rate": 9.009097756825737e-05,
"loss": 0.2736,
"step": 1190
},
{
"epoch": 0.857449088960343,
"grad_norm": 1.9068485498428345,
"learning_rate": 8.98410749886625e-05,
"loss": 0.2949,
"step": 1200
},
{
"epoch": 0.857449088960343,
"eval_news_finetune_val_loss": 0.31006094813346863,
"eval_news_finetune_val_runtime": 1002.7866,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1200
},
{
"epoch": 0.8645944980350125,
"grad_norm": 1.192031979560852,
"learning_rate": 8.958841605934278e-05,
"loss": 0.3657,
"step": 1210
},
{
"epoch": 0.8717399071096821,
"grad_norm": 1.2596725225448608,
"learning_rate": 8.933301826015715e-05,
"loss": 0.3068,
"step": 1220
},
{
"epoch": 0.8788853161843515,
"grad_norm": 1.4713683128356934,
"learning_rate": 8.907489926044945e-05,
"loss": 0.3122,
"step": 1230
},
{
"epoch": 0.886030725259021,
"grad_norm": 1.3583886623382568,
"learning_rate": 8.881407691782608e-05,
"loss": 0.2989,
"step": 1240
},
{
"epoch": 0.8931761343336906,
"grad_norm": 0.9863426089286804,
"learning_rate": 8.855056927692037e-05,
"loss": 0.2549,
"step": 1250
},
{
"epoch": 0.9003215434083601,
"grad_norm": 1.0579396486282349,
"learning_rate": 8.828439456814442e-05,
"loss": 0.2809,
"step": 1260
},
{
"epoch": 0.9074669524830297,
"grad_norm": 2.847482681274414,
"learning_rate": 8.801557120642766e-05,
"loss": 0.2933,
"step": 1270
},
{
"epoch": 0.9146123615576992,
"grad_norm": 0.8942415118217468,
"learning_rate": 8.774411778994295e-05,
"loss": 0.2866,
"step": 1280
},
{
"epoch": 0.9217577706323687,
"grad_norm": 1.297845721244812,
"learning_rate": 8.747005309881984e-05,
"loss": 0.2939,
"step": 1290
},
{
"epoch": 0.9289031797070382,
"grad_norm": 1.2745181322097778,
"learning_rate": 8.719339609384531e-05,
"loss": 0.3018,
"step": 1300
},
{
"epoch": 0.9289031797070382,
"eval_news_finetune_val_loss": 0.29822030663490295,
"eval_news_finetune_val_runtime": 1002.5672,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1300
},
{
"epoch": 0.9360485887817077,
"grad_norm": 1.3898978233337402,
"learning_rate": 8.691416591515198e-05,
"loss": 0.295,
"step": 1310
},
{
"epoch": 0.9431939978563773,
"grad_norm": 1.1516591310501099,
"learning_rate": 8.663238188089398e-05,
"loss": 0.209,
"step": 1320
},
{
"epoch": 0.9503394069310468,
"grad_norm": 0.9356768131256104,
"learning_rate": 8.634806348591036e-05,
"loss": 0.2904,
"step": 1330
},
{
"epoch": 0.9574848160057163,
"grad_norm": 1.884950876235962,
"learning_rate": 8.606123040037643e-05,
"loss": 0.2607,
"step": 1340
},
{
"epoch": 0.9646302250803859,
"grad_norm": 1.2719082832336426,
"learning_rate": 8.577190246844291e-05,
"loss": 0.3279,
"step": 1350
},
{
"epoch": 0.9717756341550554,
"grad_norm": 0.935297429561615,
"learning_rate": 8.548009970686302e-05,
"loss": 0.3011,
"step": 1360
},
{
"epoch": 0.978921043229725,
"grad_norm": 1.6732884645462036,
"learning_rate": 8.51858423036076e-05,
"loss": 0.2379,
"step": 1370
},
{
"epoch": 0.9860664523043944,
"grad_norm": 0.6651692390441895,
"learning_rate": 8.488915061646856e-05,
"loss": 0.2599,
"step": 1380
},
{
"epoch": 0.9932118613790639,
"grad_norm": 1.121752381324768,
"learning_rate": 8.459004517165032e-05,
"loss": 0.2265,
"step": 1390
},
{
"epoch": 1.0,
"grad_norm": 0.5099928379058838,
"learning_rate": 8.428854666234978e-05,
"loss": 0.3301,
"step": 1400
},
{
"epoch": 1.0,
"eval_news_finetune_val_loss": 0.28762951493263245,
"eval_news_finetune_val_runtime": 1002.7793,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1400
},
{
"epoch": 1.0071454090746694,
"grad_norm": 0.9986103177070618,
"learning_rate": 8.398467594732478e-05,
"loss": 0.2021,
"step": 1410
},
{
"epoch": 1.014290818149339,
"grad_norm": 1.2675282955169678,
"learning_rate": 8.367845404945084e-05,
"loss": 0.2228,
"step": 1420
},
{
"epoch": 1.0214362272240085,
"grad_norm": 0.8156709671020508,
"learning_rate": 8.336990215426688e-05,
"loss": 0.1947,
"step": 1430
},
{
"epoch": 1.0285816362986782,
"grad_norm": 0.5374387502670288,
"learning_rate": 8.305904160850941e-05,
"loss": 0.2344,
"step": 1440
},
{
"epoch": 1.0357270453733476,
"grad_norm": 0.6672261357307434,
"learning_rate": 8.274589391863583e-05,
"loss": 0.1919,
"step": 1450
},
{
"epoch": 1.0428724544480172,
"grad_norm": 0.9803467988967896,
"learning_rate": 8.243048074933634e-05,
"loss": 0.2218,
"step": 1460
},
{
"epoch": 1.0500178635226867,
"grad_norm": 1.482840657234192,
"learning_rate": 8.21128239220353e-05,
"loss": 0.2556,
"step": 1470
},
{
"epoch": 1.057163272597356,
"grad_norm": 1.0589625835418701,
"learning_rate": 8.179294541338135e-05,
"loss": 0.2052,
"step": 1480
},
{
"epoch": 1.0643086816720257,
"grad_norm": 0.8332052230834961,
"learning_rate": 8.147086735372716e-05,
"loss": 0.2386,
"step": 1490
},
{
"epoch": 1.0714540907466952,
"grad_norm": 0.6018723845481873,
"learning_rate": 8.114661202559828e-05,
"loss": 0.1426,
"step": 1500
},
{
"epoch": 1.0714540907466952,
"eval_news_finetune_val_loss": 0.30121028423309326,
"eval_news_finetune_val_runtime": 1002.7457,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1500
},
{
"epoch": 1.0785994998213648,
"grad_norm": 1.7663507461547852,
"learning_rate": 8.082020186215156e-05,
"loss": 0.2407,
"step": 1510
},
{
"epoch": 1.0857449088960343,
"grad_norm": 1.2081632614135742,
"learning_rate": 8.049165944562316e-05,
"loss": 0.2483,
"step": 1520
},
{
"epoch": 1.092890317970704,
"grad_norm": 0.5045826435089111,
"learning_rate": 8.016100750576621e-05,
"loss": 0.2013,
"step": 1530
},
{
"epoch": 1.1000357270453733,
"grad_norm": 1.4456278085708618,
"learning_rate": 7.98282689182783e-05,
"loss": 0.2034,
"step": 1540
},
{
"epoch": 1.107181136120043,
"grad_norm": 1.1558668613433838,
"learning_rate": 7.949346670321891e-05,
"loss": 0.2386,
"step": 1550
},
{
"epoch": 1.1143265451947124,
"grad_norm": 1.4196126461029053,
"learning_rate": 7.915662402341664e-05,
"loss": 0.2299,
"step": 1560
},
{
"epoch": 1.1214719542693818,
"grad_norm": 0.9341222047805786,
"learning_rate": 7.88177641828669e-05,
"loss": 0.2105,
"step": 1570
},
{
"epoch": 1.1286173633440515,
"grad_norm": 1.066001296043396,
"learning_rate": 7.847691062511957e-05,
"loss": 0.1925,
"step": 1580
},
{
"epoch": 1.135762772418721,
"grad_norm": 0.7840182781219482,
"learning_rate": 7.813408693165704e-05,
"loss": 0.2425,
"step": 1590
},
{
"epoch": 1.1429081814933906,
"grad_norm": 0.983668327331543,
"learning_rate": 7.778931682026293e-05,
"loss": 0.2014,
"step": 1600
},
{
"epoch": 1.1429081814933906,
"eval_news_finetune_val_loss": 0.29564452171325684,
"eval_news_finetune_val_runtime": 1003.001,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1600
},
{
"epoch": 1.15005359056806,
"grad_norm": 1.63984215259552,
"learning_rate": 7.744262414338099e-05,
"loss": 0.2863,
"step": 1610
},
{
"epoch": 1.1571989996427297,
"grad_norm": 0.9211621284484863,
"learning_rate": 7.709403288646507e-05,
"loss": 0.2175,
"step": 1620
},
{
"epoch": 1.164344408717399,
"grad_norm": 1.3369996547698975,
"learning_rate": 7.67435671663196e-05,
"loss": 0.1893,
"step": 1630
},
{
"epoch": 1.1714898177920685,
"grad_norm": 0.7532891631126404,
"learning_rate": 7.63912512294312e-05,
"loss": 0.2483,
"step": 1640
},
{
"epoch": 1.1786352268667382,
"grad_norm": 1.0959442853927612,
"learning_rate": 7.603710945029119e-05,
"loss": 0.1888,
"step": 1650
},
{
"epoch": 1.1857806359414076,
"grad_norm": 0.9019472599029541,
"learning_rate": 7.568116632970922e-05,
"loss": 0.2144,
"step": 1660
},
{
"epoch": 1.1929260450160772,
"grad_norm": 1.1219818592071533,
"learning_rate": 7.532344649311829e-05,
"loss": 0.191,
"step": 1670
},
{
"epoch": 1.2000714540907467,
"grad_norm": 1.0829100608825684,
"learning_rate": 7.496397468887106e-05,
"loss": 0.2762,
"step": 1680
},
{
"epoch": 1.2072168631654163,
"grad_norm": 0.7855832576751709,
"learning_rate": 7.460277578652759e-05,
"loss": 0.157,
"step": 1690
},
{
"epoch": 1.2143622722400857,
"grad_norm": 2.407999038696289,
"learning_rate": 7.423987477513488e-05,
"loss": 0.2627,
"step": 1700
},
{
"epoch": 1.2143622722400857,
"eval_news_finetune_val_loss": 0.28248873353004456,
"eval_news_finetune_val_runtime": 1003.1081,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1700
},
{
"epoch": 1.2215076813147552,
"grad_norm": 1.5500895977020264,
"learning_rate": 7.387529676149799e-05,
"loss": 0.1477,
"step": 1710
},
{
"epoch": 1.2286530903894248,
"grad_norm": 1.5599130392074585,
"learning_rate": 7.350906696844307e-05,
"loss": 0.1942,
"step": 1720
},
{
"epoch": 1.2357984994640943,
"grad_norm": 1.6327091455459595,
"learning_rate": 7.314121073307229e-05,
"loss": 0.2,
"step": 1730
},
{
"epoch": 1.242943908538764,
"grad_norm": 0.6044666767120361,
"learning_rate": 7.277175350501111e-05,
"loss": 0.185,
"step": 1740
},
{
"epoch": 1.2500893176134333,
"grad_norm": 1.317089319229126,
"learning_rate": 7.240072084464729e-05,
"loss": 0.196,
"step": 1750
},
{
"epoch": 1.257234726688103,
"grad_norm": 1.089105486869812,
"learning_rate": 7.202813842136283e-05,
"loss": 0.1322,
"step": 1760
},
{
"epoch": 1.2643801357627724,
"grad_norm": 1.4972888231277466,
"learning_rate": 7.165403201175787e-05,
"loss": 0.2176,
"step": 1770
},
{
"epoch": 1.2715255448374418,
"grad_norm": 1.4998830556869507,
"learning_rate": 7.127842749786747e-05,
"loss": 0.218,
"step": 1780
},
{
"epoch": 1.2786709539121115,
"grad_norm": 0.9759517908096313,
"learning_rate": 7.090135086537095e-05,
"loss": 0.1653,
"step": 1790
},
{
"epoch": 1.285816362986781,
"grad_norm": 0.9713583588600159,
"learning_rate": 7.052282820179412e-05,
"loss": 0.175,
"step": 1800
},
{
"epoch": 1.285816362986781,
"eval_news_finetune_val_loss": 0.2936909794807434,
"eval_news_finetune_val_runtime": 1003.12,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1800
},
{
"epoch": 1.2929617720614506,
"grad_norm": 0.6328814625740051,
"learning_rate": 7.014288569470446e-05,
"loss": 0.1727,
"step": 1810
},
{
"epoch": 1.30010718113612,
"grad_norm": 1.622104525566101,
"learning_rate": 6.976154962989934e-05,
"loss": 0.2363,
"step": 1820
},
{
"epoch": 1.3072525902107897,
"grad_norm": 1.8254674673080444,
"learning_rate": 6.937884638958757e-05,
"loss": 0.1897,
"step": 1830
},
{
"epoch": 1.314397999285459,
"grad_norm": 0.8813793063163757,
"learning_rate": 6.899480245056396e-05,
"loss": 0.2029,
"step": 1840
},
{
"epoch": 1.3215434083601285,
"grad_norm": 0.7675999999046326,
"learning_rate": 6.860944438237788e-05,
"loss": 0.2025,
"step": 1850
},
{
"epoch": 1.3286888174347982,
"grad_norm": 1.1973013877868652,
"learning_rate": 6.82227988454948e-05,
"loss": 0.2317,
"step": 1860
},
{
"epoch": 1.3358342265094676,
"grad_norm": 0.7864009737968445,
"learning_rate": 6.783489258945195e-05,
"loss": 0.2318,
"step": 1870
},
{
"epoch": 1.3429796355841372,
"grad_norm": 1.0866330862045288,
"learning_rate": 6.74457524510077e-05,
"loss": 0.1871,
"step": 1880
},
{
"epoch": 1.3501250446588067,
"grad_norm": 0.8745126724243164,
"learning_rate": 6.705540535228485e-05,
"loss": 0.211,
"step": 1890
},
{
"epoch": 1.3572704537334763,
"grad_norm": 1.3401581048965454,
"learning_rate": 6.66638782989081e-05,
"loss": 0.2307,
"step": 1900
},
{
"epoch": 1.3572704537334763,
"eval_news_finetune_val_loss": 0.2787444591522217,
"eval_news_finetune_val_runtime": 1002.9344,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 1900
},
{
"epoch": 1.3644158628081458,
"grad_norm": 0.6149284839630127,
"learning_rate": 6.627119837813564e-05,
"loss": 0.2128,
"step": 1910
},
{
"epoch": 1.3715612718828152,
"grad_norm": 1.7847625017166138,
"learning_rate": 6.587739275698525e-05,
"loss": 0.1551,
"step": 1920
},
{
"epoch": 1.3787066809574848,
"grad_norm": 1.1973716020584106,
"learning_rate": 6.54824886803547e-05,
"loss": 0.2335,
"step": 1930
},
{
"epoch": 1.3858520900321543,
"grad_norm": 1.5757859945297241,
"learning_rate": 6.508651346913687e-05,
"loss": 0.1504,
"step": 1940
},
{
"epoch": 1.392997499106824,
"grad_norm": 1.7269341945648193,
"learning_rate": 6.468949451832968e-05,
"loss": 0.2679,
"step": 1950
},
{
"epoch": 1.4001429081814933,
"grad_norm": 1.6860129833221436,
"learning_rate": 6.429145929514063e-05,
"loss": 0.1942,
"step": 1960
},
{
"epoch": 1.407288317256163,
"grad_norm": 1.1732631921768188,
"learning_rate": 6.389243533708671e-05,
"loss": 0.2025,
"step": 1970
},
{
"epoch": 1.4144337263308324,
"grad_norm": 0.9073033332824707,
"learning_rate": 6.349245025008912e-05,
"loss": 0.1836,
"step": 1980
},
{
"epoch": 1.4215791354055018,
"grad_norm": 1.133843183517456,
"learning_rate": 6.309153170656342e-05,
"loss": 0.1526,
"step": 1990
},
{
"epoch": 1.4287245444801715,
"grad_norm": 2.656296968460083,
"learning_rate": 6.268970744350515e-05,
"loss": 0.1939,
"step": 2000
},
{
"epoch": 1.4287245444801715,
"eval_news_finetune_val_loss": 0.27414408326148987,
"eval_news_finetune_val_runtime": 1003.0949,
"eval_news_finetune_val_samples_per_second": 1.396,
"eval_news_finetune_val_steps_per_second": 1.396,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 4197,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.538125336973312e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}