jdannem6's picture
Uploaded checkpoint-30000
80edba6 verified
raw
history blame contribute delete
No virus
164 kB
{
"best_metric": 0.7402730584144592,
"best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-10000",
"epoch": 0.25,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 3.086414337158203,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.7892,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 8.478134155273438,
"learning_rate": 8.000000000000001e-07,
"loss": 0.7746,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 5.574502468109131,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.8222,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 2.6497371196746826,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.7423,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 3.116753339767456,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7622,
"step": 50
},
{
"epoch": 0.0,
"grad_norm": 3.179832696914673,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.8183,
"step": 60
},
{
"epoch": 0.0,
"grad_norm": 3.9869463443756104,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.822,
"step": 70
},
{
"epoch": 0.0,
"grad_norm": 5.093494415283203,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.7966,
"step": 80
},
{
"epoch": 0.0,
"grad_norm": 5.230633735656738,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.8113,
"step": 90
},
{
"epoch": 0.0,
"grad_norm": 9.374403953552246,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7582,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 6.465492248535156,
"learning_rate": 4.4e-06,
"loss": 0.7662,
"step": 110
},
{
"epoch": 0.0,
"grad_norm": 6.279934883117676,
"learning_rate": 4.800000000000001e-06,
"loss": 0.8376,
"step": 120
},
{
"epoch": 0.0,
"grad_norm": 5.799221992492676,
"learning_rate": 5.2e-06,
"loss": 0.7965,
"step": 130
},
{
"epoch": 0.0,
"grad_norm": 3.222240686416626,
"learning_rate": 5.600000000000001e-06,
"loss": 0.8855,
"step": 140
},
{
"epoch": 0.0,
"grad_norm": 9.009174346923828,
"learning_rate": 6e-06,
"loss": 0.8394,
"step": 150
},
{
"epoch": 0.0,
"grad_norm": 8.040350914001465,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.8426,
"step": 160
},
{
"epoch": 0.0,
"grad_norm": 4.131030559539795,
"learning_rate": 6.800000000000001e-06,
"loss": 0.7747,
"step": 170
},
{
"epoch": 0.0,
"grad_norm": 3.31986927986145,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7125,
"step": 180
},
{
"epoch": 0.0,
"grad_norm": 5.7623395919799805,
"learning_rate": 7.600000000000001e-06,
"loss": 0.7854,
"step": 190
},
{
"epoch": 0.01,
"grad_norm": 10.848206520080566,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7756,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 13.455166816711426,
"learning_rate": 8.400000000000001e-06,
"loss": 0.7894,
"step": 210
},
{
"epoch": 0.01,
"grad_norm": 12.759767532348633,
"learning_rate": 8.8e-06,
"loss": 0.7454,
"step": 220
},
{
"epoch": 0.01,
"grad_norm": 4.262899875640869,
"learning_rate": 9.200000000000002e-06,
"loss": 0.8555,
"step": 230
},
{
"epoch": 0.01,
"grad_norm": 4.28985071182251,
"learning_rate": 9.600000000000001e-06,
"loss": 0.6845,
"step": 240
},
{
"epoch": 0.01,
"grad_norm": 4.174241542816162,
"learning_rate": 1e-05,
"loss": 0.7983,
"step": 250
},
{
"epoch": 0.01,
"grad_norm": 12.931599617004395,
"learning_rate": 1.04e-05,
"loss": 0.9041,
"step": 260
},
{
"epoch": 0.01,
"grad_norm": 7.004627227783203,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.817,
"step": 270
},
{
"epoch": 0.01,
"grad_norm": 3.6102757453918457,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.7292,
"step": 280
},
{
"epoch": 0.01,
"grad_norm": 2.764902353286743,
"learning_rate": 1.16e-05,
"loss": 0.9042,
"step": 290
},
{
"epoch": 0.01,
"grad_norm": 3.958317995071411,
"learning_rate": 1.2e-05,
"loss": 0.7539,
"step": 300
},
{
"epoch": 0.01,
"grad_norm": 7.098923683166504,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.7955,
"step": 310
},
{
"epoch": 0.01,
"grad_norm": 12.129098892211914,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.849,
"step": 320
},
{
"epoch": 0.01,
"grad_norm": 2.054119825363159,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.8645,
"step": 330
},
{
"epoch": 0.01,
"grad_norm": 5.205028057098389,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.8175,
"step": 340
},
{
"epoch": 0.01,
"grad_norm": 2.614790439605713,
"learning_rate": 1.4e-05,
"loss": 0.8998,
"step": 350
},
{
"epoch": 0.01,
"grad_norm": 2.9891204833984375,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.8108,
"step": 360
},
{
"epoch": 0.01,
"grad_norm": 4.152099609375,
"learning_rate": 1.48e-05,
"loss": 0.7855,
"step": 370
},
{
"epoch": 0.01,
"grad_norm": 9.833850860595703,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.7736,
"step": 380
},
{
"epoch": 0.01,
"grad_norm": 3.849621295928955,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.7668,
"step": 390
},
{
"epoch": 0.01,
"grad_norm": 5.4542975425720215,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7781,
"step": 400
},
{
"epoch": 0.01,
"grad_norm": 6.197661876678467,
"learning_rate": 1.64e-05,
"loss": 0.8654,
"step": 410
},
{
"epoch": 0.01,
"grad_norm": 3.2606770992279053,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.7565,
"step": 420
},
{
"epoch": 0.01,
"grad_norm": 3.9680209159851074,
"learning_rate": 1.72e-05,
"loss": 0.7886,
"step": 430
},
{
"epoch": 0.01,
"grad_norm": 18.749984741210938,
"learning_rate": 1.76e-05,
"loss": 0.7305,
"step": 440
},
{
"epoch": 0.01,
"grad_norm": 5.822000503540039,
"learning_rate": 1.8e-05,
"loss": 0.7833,
"step": 450
},
{
"epoch": 0.01,
"grad_norm": 12.999715805053711,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.8483,
"step": 460
},
{
"epoch": 0.01,
"grad_norm": 7.193736553192139,
"learning_rate": 1.88e-05,
"loss": 0.84,
"step": 470
},
{
"epoch": 0.01,
"grad_norm": 12.573124885559082,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.8437,
"step": 480
},
{
"epoch": 0.01,
"grad_norm": 4.4221601486206055,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.6836,
"step": 490
},
{
"epoch": 0.01,
"grad_norm": 3.0399410724639893,
"learning_rate": 2e-05,
"loss": 0.8264,
"step": 500
},
{
"epoch": 0.01,
"eval_loss": 0.8175864219665527,
"eval_runtime": 67.7802,
"eval_samples_per_second": 14.754,
"eval_steps_per_second": 14.754,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 3.971303701400757,
"learning_rate": 1.9978947368421054e-05,
"loss": 0.7385,
"step": 510
},
{
"epoch": 0.01,
"grad_norm": 3.8043839931488037,
"learning_rate": 1.9957894736842107e-05,
"loss": 0.7826,
"step": 520
},
{
"epoch": 0.01,
"grad_norm": 11.702253341674805,
"learning_rate": 1.993684210526316e-05,
"loss": 0.7971,
"step": 530
},
{
"epoch": 0.01,
"grad_norm": 5.176826000213623,
"learning_rate": 1.9915789473684212e-05,
"loss": 0.748,
"step": 540
},
{
"epoch": 0.01,
"grad_norm": 7.120133876800537,
"learning_rate": 1.9894736842105265e-05,
"loss": 0.8461,
"step": 550
},
{
"epoch": 0.01,
"grad_norm": 12.286151885986328,
"learning_rate": 1.9873684210526318e-05,
"loss": 0.8335,
"step": 560
},
{
"epoch": 0.01,
"grad_norm": 7.857172966003418,
"learning_rate": 1.985263157894737e-05,
"loss": 0.7231,
"step": 570
},
{
"epoch": 0.01,
"grad_norm": 5.327859401702881,
"learning_rate": 1.9831578947368423e-05,
"loss": 0.877,
"step": 580
},
{
"epoch": 0.01,
"grad_norm": 6.9340362548828125,
"learning_rate": 1.9810526315789476e-05,
"loss": 0.8984,
"step": 590
},
{
"epoch": 0.01,
"grad_norm": 2.1034326553344727,
"learning_rate": 1.9789473684210528e-05,
"loss": 0.7045,
"step": 600
},
{
"epoch": 0.02,
"grad_norm": 3.853721857070923,
"learning_rate": 1.976842105263158e-05,
"loss": 0.761,
"step": 610
},
{
"epoch": 0.02,
"grad_norm": 7.6926398277282715,
"learning_rate": 1.9747368421052633e-05,
"loss": 0.9493,
"step": 620
},
{
"epoch": 0.02,
"grad_norm": 6.261799335479736,
"learning_rate": 1.9726315789473686e-05,
"loss": 0.7719,
"step": 630
},
{
"epoch": 0.02,
"grad_norm": 3.864114284515381,
"learning_rate": 1.970526315789474e-05,
"loss": 0.9406,
"step": 640
},
{
"epoch": 0.02,
"grad_norm": 7.093533515930176,
"learning_rate": 1.968421052631579e-05,
"loss": 0.7951,
"step": 650
},
{
"epoch": 0.02,
"grad_norm": 2.3724496364593506,
"learning_rate": 1.9663157894736844e-05,
"loss": 0.8648,
"step": 660
},
{
"epoch": 0.02,
"grad_norm": 10.12341022491455,
"learning_rate": 1.9642105263157897e-05,
"loss": 0.7823,
"step": 670
},
{
"epoch": 0.02,
"grad_norm": 2.80940842628479,
"learning_rate": 1.962105263157895e-05,
"loss": 0.706,
"step": 680
},
{
"epoch": 0.02,
"grad_norm": 8.243487358093262,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.8244,
"step": 690
},
{
"epoch": 0.02,
"grad_norm": 11.420123100280762,
"learning_rate": 1.9578947368421055e-05,
"loss": 0.6753,
"step": 700
},
{
"epoch": 0.02,
"grad_norm": 63.8618278503418,
"learning_rate": 1.9557894736842107e-05,
"loss": 0.8309,
"step": 710
},
{
"epoch": 0.02,
"grad_norm": 4.521258354187012,
"learning_rate": 1.953684210526316e-05,
"loss": 0.8101,
"step": 720
},
{
"epoch": 0.02,
"grad_norm": 2.9532318115234375,
"learning_rate": 1.9515789473684213e-05,
"loss": 0.8533,
"step": 730
},
{
"epoch": 0.02,
"grad_norm": 3.792180061340332,
"learning_rate": 1.9494736842105265e-05,
"loss": 0.7573,
"step": 740
},
{
"epoch": 0.02,
"grad_norm": 5.155513286590576,
"learning_rate": 1.9473684210526318e-05,
"loss": 0.8961,
"step": 750
},
{
"epoch": 0.02,
"grad_norm": 9.195950508117676,
"learning_rate": 1.945263157894737e-05,
"loss": 0.8398,
"step": 760
},
{
"epoch": 0.02,
"grad_norm": 6.699478626251221,
"learning_rate": 1.9431578947368423e-05,
"loss": 0.8018,
"step": 770
},
{
"epoch": 0.02,
"grad_norm": 5.254507541656494,
"learning_rate": 1.9410526315789476e-05,
"loss": 0.8408,
"step": 780
},
{
"epoch": 0.02,
"grad_norm": 4.351966857910156,
"learning_rate": 1.9389473684210525e-05,
"loss": 0.7323,
"step": 790
},
{
"epoch": 0.02,
"grad_norm": 2.361276626586914,
"learning_rate": 1.936842105263158e-05,
"loss": 0.8401,
"step": 800
},
{
"epoch": 0.02,
"grad_norm": 5.449990272521973,
"learning_rate": 1.9347368421052634e-05,
"loss": 0.726,
"step": 810
},
{
"epoch": 0.02,
"grad_norm": 5.375738143920898,
"learning_rate": 1.9326315789473687e-05,
"loss": 0.8305,
"step": 820
},
{
"epoch": 0.02,
"grad_norm": 2.601025342941284,
"learning_rate": 1.930526315789474e-05,
"loss": 0.9152,
"step": 830
},
{
"epoch": 0.02,
"grad_norm": 12.153268814086914,
"learning_rate": 1.9284210526315792e-05,
"loss": 0.8423,
"step": 840
},
{
"epoch": 0.02,
"grad_norm": 3.785663604736328,
"learning_rate": 1.9263157894736845e-05,
"loss": 0.7733,
"step": 850
},
{
"epoch": 0.02,
"grad_norm": 10.162787437438965,
"learning_rate": 1.9242105263157894e-05,
"loss": 0.893,
"step": 860
},
{
"epoch": 0.02,
"grad_norm": 3.871621608734131,
"learning_rate": 1.922105263157895e-05,
"loss": 0.798,
"step": 870
},
{
"epoch": 0.02,
"grad_norm": 2.9919800758361816,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.8484,
"step": 880
},
{
"epoch": 0.02,
"grad_norm": 5.40109920501709,
"learning_rate": 1.9178947368421055e-05,
"loss": 0.9129,
"step": 890
},
{
"epoch": 0.02,
"grad_norm": 6.794926643371582,
"learning_rate": 1.9157894736842108e-05,
"loss": 0.8687,
"step": 900
},
{
"epoch": 0.02,
"grad_norm": 5.942440986633301,
"learning_rate": 1.913684210526316e-05,
"loss": 0.8564,
"step": 910
},
{
"epoch": 0.02,
"grad_norm": 5.968307018280029,
"learning_rate": 1.9115789473684213e-05,
"loss": 0.8495,
"step": 920
},
{
"epoch": 0.02,
"grad_norm": 8.425616264343262,
"learning_rate": 1.9094736842105262e-05,
"loss": 0.7242,
"step": 930
},
{
"epoch": 0.02,
"grad_norm": 2.819301128387451,
"learning_rate": 1.907368421052632e-05,
"loss": 0.8381,
"step": 940
},
{
"epoch": 0.02,
"grad_norm": 6.81688117980957,
"learning_rate": 1.9052631578947368e-05,
"loss": 0.8817,
"step": 950
},
{
"epoch": 0.02,
"grad_norm": 5.102423191070557,
"learning_rate": 1.9031578947368424e-05,
"loss": 0.8274,
"step": 960
},
{
"epoch": 0.02,
"grad_norm": 4.12994909286499,
"learning_rate": 1.9010526315789476e-05,
"loss": 0.7052,
"step": 970
},
{
"epoch": 0.02,
"grad_norm": 5.15468692779541,
"learning_rate": 1.898947368421053e-05,
"loss": 0.772,
"step": 980
},
{
"epoch": 0.02,
"grad_norm": 1.62323796749115,
"learning_rate": 1.8968421052631582e-05,
"loss": 0.7764,
"step": 990
},
{
"epoch": 0.03,
"grad_norm": 2.546677589416504,
"learning_rate": 1.894736842105263e-05,
"loss": 0.8365,
"step": 1000
},
{
"epoch": 0.03,
"eval_loss": 0.7952949404716492,
"eval_runtime": 67.7544,
"eval_samples_per_second": 14.759,
"eval_steps_per_second": 14.759,
"step": 1000
},
{
"epoch": 0.03,
"grad_norm": 9.28386402130127,
"learning_rate": 1.8926315789473687e-05,
"loss": 0.8765,
"step": 1010
},
{
"epoch": 0.03,
"grad_norm": 7.3430304527282715,
"learning_rate": 1.8905263157894736e-05,
"loss": 0.8763,
"step": 1020
},
{
"epoch": 0.03,
"grad_norm": 4.0531206130981445,
"learning_rate": 1.8884210526315792e-05,
"loss": 0.7943,
"step": 1030
},
{
"epoch": 0.03,
"grad_norm": 3.028320074081421,
"learning_rate": 1.886315789473684e-05,
"loss": 0.836,
"step": 1040
},
{
"epoch": 0.03,
"grad_norm": 3.3861188888549805,
"learning_rate": 1.8842105263157898e-05,
"loss": 0.7336,
"step": 1050
},
{
"epoch": 0.03,
"grad_norm": 3.7832908630371094,
"learning_rate": 1.882105263157895e-05,
"loss": 0.9283,
"step": 1060
},
{
"epoch": 0.03,
"grad_norm": 3.8170342445373535,
"learning_rate": 1.88e-05,
"loss": 0.7655,
"step": 1070
},
{
"epoch": 0.03,
"grad_norm": 6.15322732925415,
"learning_rate": 1.8778947368421056e-05,
"loss": 0.9341,
"step": 1080
},
{
"epoch": 0.03,
"grad_norm": 7.066686153411865,
"learning_rate": 1.8757894736842105e-05,
"loss": 0.85,
"step": 1090
},
{
"epoch": 0.03,
"grad_norm": 2.986961603164673,
"learning_rate": 1.873684210526316e-05,
"loss": 0.8943,
"step": 1100
},
{
"epoch": 0.03,
"grad_norm": 2.8456902503967285,
"learning_rate": 1.871578947368421e-05,
"loss": 0.8279,
"step": 1110
},
{
"epoch": 0.03,
"grad_norm": 3.6177377700805664,
"learning_rate": 1.8694736842105266e-05,
"loss": 0.8192,
"step": 1120
},
{
"epoch": 0.03,
"grad_norm": 14.768010139465332,
"learning_rate": 1.8673684210526316e-05,
"loss": 0.8005,
"step": 1130
},
{
"epoch": 0.03,
"grad_norm": 11.347342491149902,
"learning_rate": 1.8652631578947368e-05,
"loss": 0.8081,
"step": 1140
},
{
"epoch": 0.03,
"grad_norm": 4.0560150146484375,
"learning_rate": 1.8631578947368424e-05,
"loss": 0.9389,
"step": 1150
},
{
"epoch": 0.03,
"grad_norm": 3.3164710998535156,
"learning_rate": 1.8610526315789473e-05,
"loss": 0.8501,
"step": 1160
},
{
"epoch": 0.03,
"grad_norm": 11.112225532531738,
"learning_rate": 1.858947368421053e-05,
"loss": 0.7162,
"step": 1170
},
{
"epoch": 0.03,
"grad_norm": 6.200588703155518,
"learning_rate": 1.856842105263158e-05,
"loss": 0.7448,
"step": 1180
},
{
"epoch": 0.03,
"grad_norm": 6.573482513427734,
"learning_rate": 1.8547368421052635e-05,
"loss": 0.8071,
"step": 1190
},
{
"epoch": 0.03,
"grad_norm": 5.153548717498779,
"learning_rate": 1.8526315789473684e-05,
"loss": 0.7957,
"step": 1200
},
{
"epoch": 0.03,
"grad_norm": 5.3308305740356445,
"learning_rate": 1.8505263157894737e-05,
"loss": 0.7301,
"step": 1210
},
{
"epoch": 0.03,
"grad_norm": 5.269808769226074,
"learning_rate": 1.8484210526315793e-05,
"loss": 0.8072,
"step": 1220
},
{
"epoch": 0.03,
"grad_norm": 5.588324546813965,
"learning_rate": 1.8463157894736842e-05,
"loss": 0.8587,
"step": 1230
},
{
"epoch": 0.03,
"grad_norm": 4.593557357788086,
"learning_rate": 1.8442105263157898e-05,
"loss": 0.856,
"step": 1240
},
{
"epoch": 0.03,
"grad_norm": 5.2591094970703125,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.7717,
"step": 1250
},
{
"epoch": 0.03,
"grad_norm": 4.052567958831787,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.7823,
"step": 1260
},
{
"epoch": 0.03,
"grad_norm": 4.447838306427002,
"learning_rate": 1.8378947368421053e-05,
"loss": 0.83,
"step": 1270
},
{
"epoch": 0.03,
"grad_norm": 4.029257774353027,
"learning_rate": 1.8357894736842105e-05,
"loss": 0.7504,
"step": 1280
},
{
"epoch": 0.03,
"grad_norm": 9.053960800170898,
"learning_rate": 1.8336842105263158e-05,
"loss": 0.9074,
"step": 1290
},
{
"epoch": 0.03,
"grad_norm": 2.2877705097198486,
"learning_rate": 1.831578947368421e-05,
"loss": 0.772,
"step": 1300
},
{
"epoch": 0.03,
"grad_norm": 3.4482290744781494,
"learning_rate": 1.8294736842105267e-05,
"loss": 0.8658,
"step": 1310
},
{
"epoch": 0.03,
"grad_norm": 6.684794902801514,
"learning_rate": 1.8273684210526316e-05,
"loss": 0.7848,
"step": 1320
},
{
"epoch": 0.03,
"grad_norm": 3.553828716278076,
"learning_rate": 1.8252631578947372e-05,
"loss": 0.8219,
"step": 1330
},
{
"epoch": 0.03,
"grad_norm": 2.5203397274017334,
"learning_rate": 1.823157894736842e-05,
"loss": 0.9071,
"step": 1340
},
{
"epoch": 0.03,
"grad_norm": 4.961795806884766,
"learning_rate": 1.8210526315789477e-05,
"loss": 0.6542,
"step": 1350
},
{
"epoch": 0.03,
"grad_norm": 3.663081645965576,
"learning_rate": 1.8189473684210527e-05,
"loss": 0.7402,
"step": 1360
},
{
"epoch": 0.03,
"grad_norm": 8.785040855407715,
"learning_rate": 1.816842105263158e-05,
"loss": 0.7462,
"step": 1370
},
{
"epoch": 0.03,
"grad_norm": 4.659074783325195,
"learning_rate": 1.8147368421052632e-05,
"loss": 0.6951,
"step": 1380
},
{
"epoch": 0.03,
"grad_norm": 3.5885703563690186,
"learning_rate": 1.8126315789473685e-05,
"loss": 0.7008,
"step": 1390
},
{
"epoch": 0.04,
"grad_norm": 3.1295347213745117,
"learning_rate": 1.810526315789474e-05,
"loss": 0.9103,
"step": 1400
},
{
"epoch": 0.04,
"grad_norm": 2.4699888229370117,
"learning_rate": 1.808421052631579e-05,
"loss": 0.841,
"step": 1410
},
{
"epoch": 0.04,
"grad_norm": 5.3273444175720215,
"learning_rate": 1.8063157894736846e-05,
"loss": 0.9041,
"step": 1420
},
{
"epoch": 0.04,
"grad_norm": 5.149638652801514,
"learning_rate": 1.8042105263157895e-05,
"loss": 0.7784,
"step": 1430
},
{
"epoch": 0.04,
"grad_norm": 3.4124910831451416,
"learning_rate": 1.8021052631578948e-05,
"loss": 0.8208,
"step": 1440
},
{
"epoch": 0.04,
"grad_norm": 2.9231085777282715,
"learning_rate": 1.8e-05,
"loss": 0.7173,
"step": 1450
},
{
"epoch": 0.04,
"grad_norm": 4.008113384246826,
"learning_rate": 1.7978947368421053e-05,
"loss": 0.7383,
"step": 1460
},
{
"epoch": 0.04,
"grad_norm": 5.1748046875,
"learning_rate": 1.795789473684211e-05,
"loss": 0.8399,
"step": 1470
},
{
"epoch": 0.04,
"grad_norm": 3.4990293979644775,
"learning_rate": 1.793684210526316e-05,
"loss": 0.6721,
"step": 1480
},
{
"epoch": 0.04,
"grad_norm": 3.1186299324035645,
"learning_rate": 1.7915789473684214e-05,
"loss": 0.782,
"step": 1490
},
{
"epoch": 0.04,
"grad_norm": 5.12732458114624,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.7211,
"step": 1500
},
{
"epoch": 0.04,
"eval_loss": 0.811568021774292,
"eval_runtime": 67.7961,
"eval_samples_per_second": 14.75,
"eval_steps_per_second": 14.75,
"step": 1500
},
{
"epoch": 0.04,
"grad_norm": 3.631096124649048,
"learning_rate": 1.7873684210526316e-05,
"loss": 0.7557,
"step": 1510
},
{
"epoch": 0.04,
"grad_norm": 8.850045204162598,
"learning_rate": 1.785263157894737e-05,
"loss": 0.8757,
"step": 1520
},
{
"epoch": 0.04,
"grad_norm": 3.1114978790283203,
"learning_rate": 1.7831578947368422e-05,
"loss": 0.7613,
"step": 1530
},
{
"epoch": 0.04,
"grad_norm": 4.5038743019104,
"learning_rate": 1.7810526315789474e-05,
"loss": 0.8049,
"step": 1540
},
{
"epoch": 0.04,
"grad_norm": 4.2331156730651855,
"learning_rate": 1.7789473684210527e-05,
"loss": 0.8277,
"step": 1550
},
{
"epoch": 0.04,
"grad_norm": 5.05696964263916,
"learning_rate": 1.7768421052631583e-05,
"loss": 0.7973,
"step": 1560
},
{
"epoch": 0.04,
"grad_norm": 2.1331920623779297,
"learning_rate": 1.7747368421052632e-05,
"loss": 0.7688,
"step": 1570
},
{
"epoch": 0.04,
"grad_norm": 4.984541416168213,
"learning_rate": 1.7726315789473685e-05,
"loss": 0.7865,
"step": 1580
},
{
"epoch": 0.04,
"grad_norm": 7.149406433105469,
"learning_rate": 1.7705263157894738e-05,
"loss": 0.7728,
"step": 1590
},
{
"epoch": 0.04,
"grad_norm": 8.092243194580078,
"learning_rate": 1.768421052631579e-05,
"loss": 0.935,
"step": 1600
},
{
"epoch": 0.04,
"grad_norm": 13.16551399230957,
"learning_rate": 1.7663157894736843e-05,
"loss": 0.8286,
"step": 1610
},
{
"epoch": 0.04,
"grad_norm": 2.131350517272949,
"learning_rate": 1.7642105263157896e-05,
"loss": 0.7864,
"step": 1620
},
{
"epoch": 0.04,
"grad_norm": 7.870023727416992,
"learning_rate": 1.7621052631578948e-05,
"loss": 0.8645,
"step": 1630
},
{
"epoch": 0.04,
"grad_norm": 10.631692886352539,
"learning_rate": 1.76e-05,
"loss": 0.8473,
"step": 1640
},
{
"epoch": 0.04,
"grad_norm": 6.421032905578613,
"learning_rate": 1.7578947368421054e-05,
"loss": 0.7868,
"step": 1650
},
{
"epoch": 0.04,
"grad_norm": 4.57529878616333,
"learning_rate": 1.7557894736842106e-05,
"loss": 0.7882,
"step": 1660
},
{
"epoch": 0.04,
"grad_norm": 3.8785624504089355,
"learning_rate": 1.753684210526316e-05,
"loss": 0.7543,
"step": 1670
},
{
"epoch": 0.04,
"grad_norm": 5.722006320953369,
"learning_rate": 1.751578947368421e-05,
"loss": 0.9626,
"step": 1680
},
{
"epoch": 0.04,
"grad_norm": 2.466771364212036,
"learning_rate": 1.7494736842105264e-05,
"loss": 0.783,
"step": 1690
},
{
"epoch": 0.04,
"grad_norm": 3.072049856185913,
"learning_rate": 1.7473684210526317e-05,
"loss": 0.7503,
"step": 1700
},
{
"epoch": 0.04,
"grad_norm": 5.768575668334961,
"learning_rate": 1.745263157894737e-05,
"loss": 0.8193,
"step": 1710
},
{
"epoch": 0.04,
"grad_norm": 2.585022211074829,
"learning_rate": 1.7431578947368422e-05,
"loss": 0.8808,
"step": 1720
},
{
"epoch": 0.04,
"grad_norm": 3.0711567401885986,
"learning_rate": 1.7410526315789475e-05,
"loss": 0.8098,
"step": 1730
},
{
"epoch": 0.04,
"grad_norm": 3.3020272254943848,
"learning_rate": 1.7389473684210527e-05,
"loss": 0.7196,
"step": 1740
},
{
"epoch": 0.04,
"grad_norm": 3.645238161087036,
"learning_rate": 1.736842105263158e-05,
"loss": 0.8904,
"step": 1750
},
{
"epoch": 0.04,
"grad_norm": 6.018638610839844,
"learning_rate": 1.7347368421052633e-05,
"loss": 0.7937,
"step": 1760
},
{
"epoch": 0.04,
"grad_norm": 3.629096746444702,
"learning_rate": 1.7326315789473685e-05,
"loss": 0.9171,
"step": 1770
},
{
"epoch": 0.04,
"grad_norm": 2.5619189739227295,
"learning_rate": 1.7305263157894738e-05,
"loss": 0.9488,
"step": 1780
},
{
"epoch": 0.04,
"grad_norm": 9.464752197265625,
"learning_rate": 1.728421052631579e-05,
"loss": 0.8459,
"step": 1790
},
{
"epoch": 0.04,
"grad_norm": 3.9856364727020264,
"learning_rate": 1.7263157894736843e-05,
"loss": 0.8378,
"step": 1800
},
{
"epoch": 0.05,
"grad_norm": 3.753553867340088,
"learning_rate": 1.7242105263157896e-05,
"loss": 0.8093,
"step": 1810
},
{
"epoch": 0.05,
"grad_norm": 3.4593358039855957,
"learning_rate": 1.722105263157895e-05,
"loss": 0.7896,
"step": 1820
},
{
"epoch": 0.05,
"grad_norm": 2.7163546085357666,
"learning_rate": 1.72e-05,
"loss": 0.7188,
"step": 1830
},
{
"epoch": 0.05,
"grad_norm": 3.105628728866577,
"learning_rate": 1.7178947368421054e-05,
"loss": 0.7643,
"step": 1840
},
{
"epoch": 0.05,
"grad_norm": 2.387368679046631,
"learning_rate": 1.7157894736842107e-05,
"loss": 0.8465,
"step": 1850
},
{
"epoch": 0.05,
"grad_norm": 6.020385265350342,
"learning_rate": 1.713684210526316e-05,
"loss": 0.7798,
"step": 1860
},
{
"epoch": 0.05,
"grad_norm": 4.560520172119141,
"learning_rate": 1.7115789473684212e-05,
"loss": 0.7704,
"step": 1870
},
{
"epoch": 0.05,
"grad_norm": 15.739727973937988,
"learning_rate": 1.7094736842105265e-05,
"loss": 0.7148,
"step": 1880
},
{
"epoch": 0.05,
"grad_norm": 5.79690408706665,
"learning_rate": 1.7073684210526317e-05,
"loss": 0.798,
"step": 1890
},
{
"epoch": 0.05,
"grad_norm": 2.6939146518707275,
"learning_rate": 1.705263157894737e-05,
"loss": 0.7641,
"step": 1900
},
{
"epoch": 0.05,
"grad_norm": 5.193384170532227,
"learning_rate": 1.7031578947368423e-05,
"loss": 0.7866,
"step": 1910
},
{
"epoch": 0.05,
"grad_norm": 4.940731525421143,
"learning_rate": 1.7010526315789475e-05,
"loss": 0.8261,
"step": 1920
},
{
"epoch": 0.05,
"grad_norm": 2.1812446117401123,
"learning_rate": 1.6989473684210528e-05,
"loss": 0.7973,
"step": 1930
},
{
"epoch": 0.05,
"grad_norm": 3.7413289546966553,
"learning_rate": 1.696842105263158e-05,
"loss": 0.7818,
"step": 1940
},
{
"epoch": 0.05,
"grad_norm": 4.024014472961426,
"learning_rate": 1.6947368421052633e-05,
"loss": 0.7237,
"step": 1950
},
{
"epoch": 0.05,
"grad_norm": 3.0871291160583496,
"learning_rate": 1.6926315789473686e-05,
"loss": 0.772,
"step": 1960
},
{
"epoch": 0.05,
"grad_norm": 3.28814435005188,
"learning_rate": 1.690526315789474e-05,
"loss": 0.7067,
"step": 1970
},
{
"epoch": 0.05,
"grad_norm": 2.8241286277770996,
"learning_rate": 1.688421052631579e-05,
"loss": 0.8175,
"step": 1980
},
{
"epoch": 0.05,
"grad_norm": 2.5942068099975586,
"learning_rate": 1.6863157894736844e-05,
"loss": 0.9265,
"step": 1990
},
{
"epoch": 0.05,
"grad_norm": 6.6822662353515625,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.8593,
"step": 2000
},
{
"epoch": 0.05,
"eval_loss": 0.8064771890640259,
"eval_runtime": 67.7887,
"eval_samples_per_second": 14.752,
"eval_steps_per_second": 14.752,
"step": 2000
},
{
"epoch": 0.05,
"grad_norm": 7.032164573669434,
"learning_rate": 1.682105263157895e-05,
"loss": 0.8819,
"step": 2010
},
{
"epoch": 0.05,
"grad_norm": 4.874982833862305,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.8021,
"step": 2020
},
{
"epoch": 0.05,
"grad_norm": 2.6172547340393066,
"learning_rate": 1.6778947368421054e-05,
"loss": 0.8017,
"step": 2030
},
{
"epoch": 0.05,
"grad_norm": 10.659741401672363,
"learning_rate": 1.6757894736842107e-05,
"loss": 0.8896,
"step": 2040
},
{
"epoch": 0.05,
"grad_norm": 6.189141750335693,
"learning_rate": 1.673684210526316e-05,
"loss": 0.7997,
"step": 2050
},
{
"epoch": 0.05,
"grad_norm": 4.523468971252441,
"learning_rate": 1.6715789473684212e-05,
"loss": 0.8498,
"step": 2060
},
{
"epoch": 0.05,
"grad_norm": 8.533658981323242,
"learning_rate": 1.6694736842105265e-05,
"loss": 0.8857,
"step": 2070
},
{
"epoch": 0.05,
"grad_norm": 3.0041606426239014,
"learning_rate": 1.6673684210526318e-05,
"loss": 0.8112,
"step": 2080
},
{
"epoch": 0.05,
"grad_norm": 5.055651664733887,
"learning_rate": 1.665263157894737e-05,
"loss": 0.7872,
"step": 2090
},
{
"epoch": 0.05,
"grad_norm": 5.761922836303711,
"learning_rate": 1.6631578947368423e-05,
"loss": 0.7727,
"step": 2100
},
{
"epoch": 0.05,
"grad_norm": 2.518223524093628,
"learning_rate": 1.6610526315789476e-05,
"loss": 0.7997,
"step": 2110
},
{
"epoch": 0.05,
"grad_norm": 4.975761890411377,
"learning_rate": 1.658947368421053e-05,
"loss": 0.7457,
"step": 2120
},
{
"epoch": 0.05,
"grad_norm": 3.2227561473846436,
"learning_rate": 1.656842105263158e-05,
"loss": 0.816,
"step": 2130
},
{
"epoch": 0.05,
"grad_norm": 4.705923080444336,
"learning_rate": 1.6547368421052634e-05,
"loss": 0.8113,
"step": 2140
},
{
"epoch": 0.05,
"grad_norm": 2.655057430267334,
"learning_rate": 1.6526315789473686e-05,
"loss": 0.7912,
"step": 2150
},
{
"epoch": 0.05,
"grad_norm": 3.0186755657196045,
"learning_rate": 1.650526315789474e-05,
"loss": 0.8608,
"step": 2160
},
{
"epoch": 0.05,
"grad_norm": 1.232386827468872,
"learning_rate": 1.648421052631579e-05,
"loss": 0.8549,
"step": 2170
},
{
"epoch": 0.05,
"grad_norm": 11.968620300292969,
"learning_rate": 1.6463157894736844e-05,
"loss": 0.868,
"step": 2180
},
{
"epoch": 0.05,
"grad_norm": 3.5853216648101807,
"learning_rate": 1.6442105263157897e-05,
"loss": 0.8388,
"step": 2190
},
{
"epoch": 0.06,
"grad_norm": 2.375610589981079,
"learning_rate": 1.642105263157895e-05,
"loss": 0.9111,
"step": 2200
},
{
"epoch": 0.06,
"grad_norm": 1.9734487533569336,
"learning_rate": 1.64e-05,
"loss": 0.7288,
"step": 2210
},
{
"epoch": 0.06,
"grad_norm": 10.517192840576172,
"learning_rate": 1.6378947368421055e-05,
"loss": 0.698,
"step": 2220
},
{
"epoch": 0.06,
"grad_norm": 4.183718204498291,
"learning_rate": 1.6357894736842108e-05,
"loss": 0.7759,
"step": 2230
},
{
"epoch": 0.06,
"grad_norm": 3.9075675010681152,
"learning_rate": 1.633684210526316e-05,
"loss": 0.7829,
"step": 2240
},
{
"epoch": 0.06,
"grad_norm": 5.287744998931885,
"learning_rate": 1.6315789473684213e-05,
"loss": 0.7057,
"step": 2250
},
{
"epoch": 0.06,
"grad_norm": 4.977657318115234,
"learning_rate": 1.6294736842105265e-05,
"loss": 0.8346,
"step": 2260
},
{
"epoch": 0.06,
"grad_norm": 7.196689128875732,
"learning_rate": 1.6273684210526318e-05,
"loss": 0.8508,
"step": 2270
},
{
"epoch": 0.06,
"grad_norm": 2.467477798461914,
"learning_rate": 1.6252631578947367e-05,
"loss": 0.7179,
"step": 2280
},
{
"epoch": 0.06,
"grad_norm": 7.059762954711914,
"learning_rate": 1.6231578947368423e-05,
"loss": 0.7549,
"step": 2290
},
{
"epoch": 0.06,
"grad_norm": 3.980865955352783,
"learning_rate": 1.6210526315789473e-05,
"loss": 0.814,
"step": 2300
},
{
"epoch": 0.06,
"grad_norm": 7.675939559936523,
"learning_rate": 1.618947368421053e-05,
"loss": 0.8227,
"step": 2310
},
{
"epoch": 0.06,
"grad_norm": 3.530073642730713,
"learning_rate": 1.616842105263158e-05,
"loss": 0.8517,
"step": 2320
},
{
"epoch": 0.06,
"grad_norm": 3.6851344108581543,
"learning_rate": 1.6147368421052634e-05,
"loss": 0.7684,
"step": 2330
},
{
"epoch": 0.06,
"grad_norm": 5.206923961639404,
"learning_rate": 1.6126315789473687e-05,
"loss": 0.8199,
"step": 2340
},
{
"epoch": 0.06,
"grad_norm": 5.220828056335449,
"learning_rate": 1.6105263157894736e-05,
"loss": 0.8871,
"step": 2350
},
{
"epoch": 0.06,
"grad_norm": 3.5062482357025146,
"learning_rate": 1.6084210526315792e-05,
"loss": 0.8281,
"step": 2360
},
{
"epoch": 0.06,
"grad_norm": 1.9830796718597412,
"learning_rate": 1.606315789473684e-05,
"loss": 0.8678,
"step": 2370
},
{
"epoch": 0.06,
"grad_norm": 3.3255491256713867,
"learning_rate": 1.6042105263157897e-05,
"loss": 0.8337,
"step": 2380
},
{
"epoch": 0.06,
"grad_norm": 5.259572505950928,
"learning_rate": 1.6021052631578947e-05,
"loss": 0.7954,
"step": 2390
},
{
"epoch": 0.06,
"grad_norm": 3.6201376914978027,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.818,
"step": 2400
},
{
"epoch": 0.06,
"grad_norm": 3.3598544597625732,
"learning_rate": 1.5978947368421055e-05,
"loss": 0.7697,
"step": 2410
},
{
"epoch": 0.06,
"grad_norm": 6.34808349609375,
"learning_rate": 1.5957894736842105e-05,
"loss": 0.6347,
"step": 2420
},
{
"epoch": 0.06,
"grad_norm": 3.967682361602783,
"learning_rate": 1.593684210526316e-05,
"loss": 0.7178,
"step": 2430
},
{
"epoch": 0.06,
"grad_norm": 10.222978591918945,
"learning_rate": 1.591578947368421e-05,
"loss": 0.7642,
"step": 2440
},
{
"epoch": 0.06,
"grad_norm": 3.9339826107025146,
"learning_rate": 1.5894736842105266e-05,
"loss": 0.8197,
"step": 2450
},
{
"epoch": 0.06,
"grad_norm": 2.3337771892547607,
"learning_rate": 1.5873684210526315e-05,
"loss": 0.9375,
"step": 2460
},
{
"epoch": 0.06,
"grad_norm": 2.8479838371276855,
"learning_rate": 1.585263157894737e-05,
"loss": 0.9196,
"step": 2470
},
{
"epoch": 0.06,
"grad_norm": 9.294541358947754,
"learning_rate": 1.5831578947368424e-05,
"loss": 0.7144,
"step": 2480
},
{
"epoch": 0.06,
"grad_norm": 5.325323104858398,
"learning_rate": 1.5810526315789473e-05,
"loss": 0.7897,
"step": 2490
},
{
"epoch": 0.06,
"grad_norm": 4.377369403839111,
"learning_rate": 1.578947368421053e-05,
"loss": 0.9008,
"step": 2500
},
{
"epoch": 0.06,
"eval_loss": 0.8163847923278809,
"eval_runtime": 67.7994,
"eval_samples_per_second": 14.749,
"eval_steps_per_second": 14.749,
"step": 2500
},
{
"epoch": 0.06,
"grad_norm": 5.1105055809021,
"learning_rate": 1.576842105263158e-05,
"loss": 0.7897,
"step": 2510
},
{
"epoch": 0.06,
"grad_norm": 3.321247100830078,
"learning_rate": 1.5747368421052635e-05,
"loss": 0.7394,
"step": 2520
},
{
"epoch": 0.06,
"grad_norm": 2.1728689670562744,
"learning_rate": 1.5726315789473684e-05,
"loss": 0.7395,
"step": 2530
},
{
"epoch": 0.06,
"grad_norm": 4.246960163116455,
"learning_rate": 1.570526315789474e-05,
"loss": 0.7825,
"step": 2540
},
{
"epoch": 0.06,
"grad_norm": 4.518326282501221,
"learning_rate": 1.568421052631579e-05,
"loss": 0.8168,
"step": 2550
},
{
"epoch": 0.06,
"grad_norm": 4.336541652679443,
"learning_rate": 1.5663157894736842e-05,
"loss": 0.8887,
"step": 2560
},
{
"epoch": 0.06,
"grad_norm": 3.3204426765441895,
"learning_rate": 1.5642105263157898e-05,
"loss": 0.8257,
"step": 2570
},
{
"epoch": 0.06,
"grad_norm": 9.327149391174316,
"learning_rate": 1.5621052631578947e-05,
"loss": 0.7896,
"step": 2580
},
{
"epoch": 0.06,
"grad_norm": 1.575266718864441,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.9021,
"step": 2590
},
{
"epoch": 0.07,
"grad_norm": 2.955282688140869,
"learning_rate": 1.5578947368421052e-05,
"loss": 0.8115,
"step": 2600
},
{
"epoch": 0.07,
"grad_norm": 6.098946571350098,
"learning_rate": 1.555789473684211e-05,
"loss": 0.647,
"step": 2610
},
{
"epoch": 0.07,
"grad_norm": 3.354290723800659,
"learning_rate": 1.5536842105263158e-05,
"loss": 0.8033,
"step": 2620
},
{
"epoch": 0.07,
"grad_norm": 4.7387518882751465,
"learning_rate": 1.551578947368421e-05,
"loss": 0.6904,
"step": 2630
},
{
"epoch": 0.07,
"grad_norm": 7.594583034515381,
"learning_rate": 1.5494736842105263e-05,
"loss": 0.7914,
"step": 2640
},
{
"epoch": 0.07,
"grad_norm": 2.994126081466675,
"learning_rate": 1.5473684210526316e-05,
"loss": 0.8019,
"step": 2650
},
{
"epoch": 0.07,
"grad_norm": 5.478656768798828,
"learning_rate": 1.545263157894737e-05,
"loss": 0.7575,
"step": 2660
},
{
"epoch": 0.07,
"grad_norm": 3.4734623432159424,
"learning_rate": 1.543157894736842e-05,
"loss": 0.7662,
"step": 2670
},
{
"epoch": 0.07,
"grad_norm": 3.2532217502593994,
"learning_rate": 1.5410526315789477e-05,
"loss": 0.6782,
"step": 2680
},
{
"epoch": 0.07,
"grad_norm": 4.5201520919799805,
"learning_rate": 1.5389473684210526e-05,
"loss": 0.7102,
"step": 2690
},
{
"epoch": 0.07,
"grad_norm": 3.8668696880340576,
"learning_rate": 1.536842105263158e-05,
"loss": 0.8358,
"step": 2700
},
{
"epoch": 0.07,
"grad_norm": 5.816726207733154,
"learning_rate": 1.534736842105263e-05,
"loss": 0.8439,
"step": 2710
},
{
"epoch": 0.07,
"grad_norm": 3.014636516571045,
"learning_rate": 1.5326315789473684e-05,
"loss": 0.8699,
"step": 2720
},
{
"epoch": 0.07,
"grad_norm": 3.465543270111084,
"learning_rate": 1.530526315789474e-05,
"loss": 0.7515,
"step": 2730
},
{
"epoch": 0.07,
"grad_norm": 6.904135227203369,
"learning_rate": 1.528421052631579e-05,
"loss": 0.7578,
"step": 2740
},
{
"epoch": 0.07,
"grad_norm": 3.1614532470703125,
"learning_rate": 1.5263157894736846e-05,
"loss": 0.6915,
"step": 2750
},
{
"epoch": 0.07,
"grad_norm": 2.4877758026123047,
"learning_rate": 1.5242105263157897e-05,
"loss": 0.7607,
"step": 2760
},
{
"epoch": 0.07,
"grad_norm": 6.346368312835693,
"learning_rate": 1.5221052631578948e-05,
"loss": 0.8558,
"step": 2770
},
{
"epoch": 0.07,
"grad_norm": 4.319607734680176,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.8349,
"step": 2780
},
{
"epoch": 0.07,
"grad_norm": 3.130995750427246,
"learning_rate": 1.5178947368421053e-05,
"loss": 0.7747,
"step": 2790
},
{
"epoch": 0.07,
"grad_norm": 1.8080275058746338,
"learning_rate": 1.5157894736842107e-05,
"loss": 0.696,
"step": 2800
},
{
"epoch": 0.07,
"grad_norm": 3.184603214263916,
"learning_rate": 1.5136842105263158e-05,
"loss": 0.7874,
"step": 2810
},
{
"epoch": 0.07,
"grad_norm": 2.7330257892608643,
"learning_rate": 1.5115789473684212e-05,
"loss": 0.6844,
"step": 2820
},
{
"epoch": 0.07,
"grad_norm": 3.2653279304504395,
"learning_rate": 1.5094736842105263e-05,
"loss": 0.6429,
"step": 2830
},
{
"epoch": 0.07,
"grad_norm": 3.826791524887085,
"learning_rate": 1.5073684210526316e-05,
"loss": 0.7868,
"step": 2840
},
{
"epoch": 0.07,
"grad_norm": 5.656713008880615,
"learning_rate": 1.505263157894737e-05,
"loss": 0.81,
"step": 2850
},
{
"epoch": 0.07,
"grad_norm": 2.6106789112091064,
"learning_rate": 1.5031578947368421e-05,
"loss": 0.7693,
"step": 2860
},
{
"epoch": 0.07,
"grad_norm": 2.2952094078063965,
"learning_rate": 1.5010526315789476e-05,
"loss": 0.7727,
"step": 2870
},
{
"epoch": 0.07,
"grad_norm": 2.8655829429626465,
"learning_rate": 1.4989473684210527e-05,
"loss": 0.7702,
"step": 2880
},
{
"epoch": 0.07,
"grad_norm": 3.170299768447876,
"learning_rate": 1.4968421052631581e-05,
"loss": 0.884,
"step": 2890
},
{
"epoch": 0.07,
"grad_norm": 10.48736572265625,
"learning_rate": 1.4947368421052632e-05,
"loss": 0.8281,
"step": 2900
},
{
"epoch": 0.07,
"grad_norm": 6.244320392608643,
"learning_rate": 1.4926315789473686e-05,
"loss": 0.8549,
"step": 2910
},
{
"epoch": 0.07,
"grad_norm": 9.334859848022461,
"learning_rate": 1.4905263157894739e-05,
"loss": 0.7232,
"step": 2920
},
{
"epoch": 0.07,
"grad_norm": 6.034826755523682,
"learning_rate": 1.488421052631579e-05,
"loss": 0.7666,
"step": 2930
},
{
"epoch": 0.07,
"grad_norm": 5.024431228637695,
"learning_rate": 1.4863157894736844e-05,
"loss": 0.7803,
"step": 2940
},
{
"epoch": 0.07,
"grad_norm": 4.2071685791015625,
"learning_rate": 1.4842105263157895e-05,
"loss": 0.8844,
"step": 2950
},
{
"epoch": 0.07,
"grad_norm": 2.319312810897827,
"learning_rate": 1.482105263157895e-05,
"loss": 0.7345,
"step": 2960
},
{
"epoch": 0.07,
"grad_norm": 4.384433746337891,
"learning_rate": 1.48e-05,
"loss": 0.6718,
"step": 2970
},
{
"epoch": 0.07,
"grad_norm": 3.6182382106781006,
"learning_rate": 1.4778947368421055e-05,
"loss": 0.7898,
"step": 2980
},
{
"epoch": 0.07,
"grad_norm": 8.145679473876953,
"learning_rate": 1.4757894736842106e-05,
"loss": 0.7754,
"step": 2990
},
{
"epoch": 0.07,
"grad_norm": 4.761884689331055,
"learning_rate": 1.4736842105263159e-05,
"loss": 0.7607,
"step": 3000
},
{
"epoch": 0.07,
"eval_loss": 0.8005050420761108,
"eval_runtime": 67.8219,
"eval_samples_per_second": 14.744,
"eval_steps_per_second": 14.744,
"step": 3000
},
{
"epoch": 0.08,
"grad_norm": 9.505555152893066,
"learning_rate": 1.4715789473684213e-05,
"loss": 0.6883,
"step": 3010
},
{
"epoch": 0.08,
"grad_norm": 4.4507155418396,
"learning_rate": 1.4694736842105264e-05,
"loss": 0.7203,
"step": 3020
},
{
"epoch": 0.08,
"grad_norm": 7.255837440490723,
"learning_rate": 1.4673684210526318e-05,
"loss": 0.8193,
"step": 3030
},
{
"epoch": 0.08,
"grad_norm": 6.857261657714844,
"learning_rate": 1.465263157894737e-05,
"loss": 0.6581,
"step": 3040
},
{
"epoch": 0.08,
"grad_norm": 8.000073432922363,
"learning_rate": 1.4631578947368424e-05,
"loss": 0.7318,
"step": 3050
},
{
"epoch": 0.08,
"grad_norm": 2.8200011253356934,
"learning_rate": 1.4610526315789474e-05,
"loss": 0.8176,
"step": 3060
},
{
"epoch": 0.08,
"grad_norm": 4.028472423553467,
"learning_rate": 1.4589473684210527e-05,
"loss": 0.7712,
"step": 3070
},
{
"epoch": 0.08,
"grad_norm": 3.2594337463378906,
"learning_rate": 1.456842105263158e-05,
"loss": 0.8032,
"step": 3080
},
{
"epoch": 0.08,
"grad_norm": 3.751202344894409,
"learning_rate": 1.4547368421052632e-05,
"loss": 0.7905,
"step": 3090
},
{
"epoch": 0.08,
"grad_norm": 4.235973834991455,
"learning_rate": 1.4526315789473687e-05,
"loss": 0.8402,
"step": 3100
},
{
"epoch": 0.08,
"grad_norm": 2.327855348587036,
"learning_rate": 1.4505263157894738e-05,
"loss": 0.9027,
"step": 3110
},
{
"epoch": 0.08,
"grad_norm": 5.911487102508545,
"learning_rate": 1.4484210526315792e-05,
"loss": 0.8241,
"step": 3120
},
{
"epoch": 0.08,
"grad_norm": 2.3612990379333496,
"learning_rate": 1.4463157894736843e-05,
"loss": 0.7884,
"step": 3130
},
{
"epoch": 0.08,
"grad_norm": 2.106682300567627,
"learning_rate": 1.4442105263157896e-05,
"loss": 0.7414,
"step": 3140
},
{
"epoch": 0.08,
"grad_norm": 4.354177951812744,
"learning_rate": 1.4421052631578948e-05,
"loss": 0.7626,
"step": 3150
},
{
"epoch": 0.08,
"grad_norm": 3.0019009113311768,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.6853,
"step": 3160
},
{
"epoch": 0.08,
"grad_norm": 3.787949562072754,
"learning_rate": 1.4378947368421054e-05,
"loss": 0.8105,
"step": 3170
},
{
"epoch": 0.08,
"grad_norm": 5.0848469734191895,
"learning_rate": 1.4357894736842106e-05,
"loss": 0.7787,
"step": 3180
},
{
"epoch": 0.08,
"grad_norm": 6.892744541168213,
"learning_rate": 1.433684210526316e-05,
"loss": 0.8883,
"step": 3190
},
{
"epoch": 0.08,
"grad_norm": 3.717949390411377,
"learning_rate": 1.4315789473684212e-05,
"loss": 0.7458,
"step": 3200
},
{
"epoch": 0.08,
"grad_norm": 3.4417831897735596,
"learning_rate": 1.4294736842105263e-05,
"loss": 0.7868,
"step": 3210
},
{
"epoch": 0.08,
"grad_norm": 9.308151245117188,
"learning_rate": 1.4273684210526317e-05,
"loss": 0.8725,
"step": 3220
},
{
"epoch": 0.08,
"grad_norm": 2.7036919593811035,
"learning_rate": 1.425263157894737e-05,
"loss": 0.8706,
"step": 3230
},
{
"epoch": 0.08,
"grad_norm": 7.338090419769287,
"learning_rate": 1.4231578947368422e-05,
"loss": 0.7695,
"step": 3240
},
{
"epoch": 0.08,
"grad_norm": 2.9678733348846436,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.7259,
"step": 3250
},
{
"epoch": 0.08,
"grad_norm": 3.3436050415039062,
"learning_rate": 1.418947368421053e-05,
"loss": 0.7193,
"step": 3260
},
{
"epoch": 0.08,
"grad_norm": 2.23856520652771,
"learning_rate": 1.416842105263158e-05,
"loss": 0.6766,
"step": 3270
},
{
"epoch": 0.08,
"grad_norm": 4.021206855773926,
"learning_rate": 1.4147368421052631e-05,
"loss": 0.888,
"step": 3280
},
{
"epoch": 0.08,
"grad_norm": 7.063048839569092,
"learning_rate": 1.4126315789473686e-05,
"loss": 0.7543,
"step": 3290
},
{
"epoch": 0.08,
"grad_norm": 4.777950763702393,
"learning_rate": 1.4105263157894738e-05,
"loss": 0.6239,
"step": 3300
},
{
"epoch": 0.08,
"grad_norm": 3.2984225749969482,
"learning_rate": 1.4084210526315791e-05,
"loss": 0.8349,
"step": 3310
},
{
"epoch": 0.08,
"grad_norm": 6.48808479309082,
"learning_rate": 1.4063157894736844e-05,
"loss": 0.8765,
"step": 3320
},
{
"epoch": 0.08,
"grad_norm": 4.557926177978516,
"learning_rate": 1.4042105263157896e-05,
"loss": 0.8508,
"step": 3330
},
{
"epoch": 0.08,
"grad_norm": 2.661513090133667,
"learning_rate": 1.4021052631578949e-05,
"loss": 0.7547,
"step": 3340
},
{
"epoch": 0.08,
"grad_norm": 5.827274799346924,
"learning_rate": 1.4e-05,
"loss": 0.8556,
"step": 3350
},
{
"epoch": 0.08,
"grad_norm": 3.0803449153900146,
"learning_rate": 1.3978947368421054e-05,
"loss": 0.8663,
"step": 3360
},
{
"epoch": 0.08,
"grad_norm": 9.996018409729004,
"learning_rate": 1.3957894736842105e-05,
"loss": 0.68,
"step": 3370
},
{
"epoch": 0.08,
"grad_norm": 5.443753242492676,
"learning_rate": 1.393684210526316e-05,
"loss": 0.7227,
"step": 3380
},
{
"epoch": 0.08,
"grad_norm": 9.685049057006836,
"learning_rate": 1.3915789473684212e-05,
"loss": 0.7336,
"step": 3390
},
{
"epoch": 0.09,
"grad_norm": 5.89941930770874,
"learning_rate": 1.3894736842105265e-05,
"loss": 0.7016,
"step": 3400
},
{
"epoch": 0.09,
"grad_norm": 9.616964340209961,
"learning_rate": 1.3873684210526317e-05,
"loss": 0.7678,
"step": 3410
},
{
"epoch": 0.09,
"grad_norm": 5.262804985046387,
"learning_rate": 1.3852631578947368e-05,
"loss": 0.8227,
"step": 3420
},
{
"epoch": 0.09,
"grad_norm": 5.251176357269287,
"learning_rate": 1.3831578947368423e-05,
"loss": 0.71,
"step": 3430
},
{
"epoch": 0.09,
"grad_norm": 5.626483917236328,
"learning_rate": 1.3810526315789474e-05,
"loss": 0.8204,
"step": 3440
},
{
"epoch": 0.09,
"grad_norm": 6.990488052368164,
"learning_rate": 1.3789473684210528e-05,
"loss": 0.7745,
"step": 3450
},
{
"epoch": 0.09,
"grad_norm": 7.510478496551514,
"learning_rate": 1.3768421052631579e-05,
"loss": 0.6286,
"step": 3460
},
{
"epoch": 0.09,
"grad_norm": 5.006512641906738,
"learning_rate": 1.3747368421052633e-05,
"loss": 0.8146,
"step": 3470
},
{
"epoch": 0.09,
"grad_norm": 6.388507843017578,
"learning_rate": 1.3726315789473686e-05,
"loss": 0.7914,
"step": 3480
},
{
"epoch": 0.09,
"grad_norm": 5.217645168304443,
"learning_rate": 1.3705263157894737e-05,
"loss": 0.7219,
"step": 3490
},
{
"epoch": 0.09,
"grad_norm": 6.257259368896484,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.8404,
"step": 3500
},
{
"epoch": 0.09,
"eval_loss": 0.8086790442466736,
"eval_runtime": 67.9356,
"eval_samples_per_second": 14.72,
"eval_steps_per_second": 14.72,
"step": 3500
},
{
"epoch": 0.09,
"grad_norm": 2.6733217239379883,
"learning_rate": 1.3663157894736842e-05,
"loss": 0.7936,
"step": 3510
},
{
"epoch": 0.09,
"grad_norm": 2.0083932876586914,
"learning_rate": 1.3642105263157897e-05,
"loss": 0.7122,
"step": 3520
},
{
"epoch": 0.09,
"grad_norm": 17.048171997070312,
"learning_rate": 1.3621052631578948e-05,
"loss": 0.7568,
"step": 3530
},
{
"epoch": 0.09,
"grad_norm": 3.1675314903259277,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.639,
"step": 3540
},
{
"epoch": 0.09,
"grad_norm": 2.6769821643829346,
"learning_rate": 1.3578947368421055e-05,
"loss": 0.8691,
"step": 3550
},
{
"epoch": 0.09,
"grad_norm": 3.516094207763672,
"learning_rate": 1.3557894736842106e-05,
"loss": 0.7681,
"step": 3560
},
{
"epoch": 0.09,
"grad_norm": 2.8059396743774414,
"learning_rate": 1.353684210526316e-05,
"loss": 0.7387,
"step": 3570
},
{
"epoch": 0.09,
"grad_norm": 4.531425952911377,
"learning_rate": 1.3515789473684211e-05,
"loss": 0.7559,
"step": 3580
},
{
"epoch": 0.09,
"grad_norm": 9.721296310424805,
"learning_rate": 1.3494736842105265e-05,
"loss": 0.7816,
"step": 3590
},
{
"epoch": 0.09,
"grad_norm": 6.60942268371582,
"learning_rate": 1.3473684210526316e-05,
"loss": 0.8203,
"step": 3600
},
{
"epoch": 0.09,
"grad_norm": 2.4415578842163086,
"learning_rate": 1.345263157894737e-05,
"loss": 0.6731,
"step": 3610
},
{
"epoch": 0.09,
"grad_norm": 3.322394847869873,
"learning_rate": 1.3431578947368421e-05,
"loss": 0.8578,
"step": 3620
},
{
"epoch": 0.09,
"grad_norm": 2.429548978805542,
"learning_rate": 1.3410526315789474e-05,
"loss": 0.7576,
"step": 3630
},
{
"epoch": 0.09,
"grad_norm": 3.9194421768188477,
"learning_rate": 1.3389473684210528e-05,
"loss": 0.8131,
"step": 3640
},
{
"epoch": 0.09,
"grad_norm": 3.6766152381896973,
"learning_rate": 1.336842105263158e-05,
"loss": 0.9391,
"step": 3650
},
{
"epoch": 0.09,
"grad_norm": 3.4084839820861816,
"learning_rate": 1.3347368421052634e-05,
"loss": 0.8242,
"step": 3660
},
{
"epoch": 0.09,
"grad_norm": 3.2697949409484863,
"learning_rate": 1.3326315789473685e-05,
"loss": 0.7534,
"step": 3670
},
{
"epoch": 0.09,
"grad_norm": 3.7821884155273438,
"learning_rate": 1.3305263157894739e-05,
"loss": 0.7237,
"step": 3680
},
{
"epoch": 0.09,
"grad_norm": 4.925840854644775,
"learning_rate": 1.328421052631579e-05,
"loss": 0.8194,
"step": 3690
},
{
"epoch": 0.09,
"grad_norm": 4.694246768951416,
"learning_rate": 1.3263157894736843e-05,
"loss": 0.7628,
"step": 3700
},
{
"epoch": 0.09,
"grad_norm": 7.358584403991699,
"learning_rate": 1.3242105263157895e-05,
"loss": 0.9161,
"step": 3710
},
{
"epoch": 0.09,
"grad_norm": 2.007431983947754,
"learning_rate": 1.3221052631578948e-05,
"loss": 0.6624,
"step": 3720
},
{
"epoch": 0.09,
"grad_norm": 2.7626278400421143,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.7662,
"step": 3730
},
{
"epoch": 0.09,
"grad_norm": 2.4226157665252686,
"learning_rate": 1.3178947368421053e-05,
"loss": 0.809,
"step": 3740
},
{
"epoch": 0.09,
"grad_norm": 3.5735135078430176,
"learning_rate": 1.3157894736842108e-05,
"loss": 0.6769,
"step": 3750
},
{
"epoch": 0.09,
"grad_norm": 2.4251084327697754,
"learning_rate": 1.3136842105263159e-05,
"loss": 0.7785,
"step": 3760
},
{
"epoch": 0.09,
"grad_norm": 2.311429977416992,
"learning_rate": 1.3115789473684211e-05,
"loss": 0.7536,
"step": 3770
},
{
"epoch": 0.09,
"grad_norm": 3.2348074913024902,
"learning_rate": 1.3094736842105264e-05,
"loss": 0.8138,
"step": 3780
},
{
"epoch": 0.09,
"grad_norm": 7.259544372558594,
"learning_rate": 1.3073684210526317e-05,
"loss": 0.8,
"step": 3790
},
{
"epoch": 0.1,
"grad_norm": 3.99937105178833,
"learning_rate": 1.305263157894737e-05,
"loss": 0.7894,
"step": 3800
},
{
"epoch": 0.1,
"grad_norm": 10.336478233337402,
"learning_rate": 1.3031578947368422e-05,
"loss": 0.7928,
"step": 3810
},
{
"epoch": 0.1,
"grad_norm": 4.500198841094971,
"learning_rate": 1.3010526315789476e-05,
"loss": 0.7916,
"step": 3820
},
{
"epoch": 0.1,
"grad_norm": 3.115521192550659,
"learning_rate": 1.2989473684210527e-05,
"loss": 0.7585,
"step": 3830
},
{
"epoch": 0.1,
"grad_norm": 8.954665184020996,
"learning_rate": 1.2968421052631578e-05,
"loss": 0.7727,
"step": 3840
},
{
"epoch": 0.1,
"grad_norm": 4.874253273010254,
"learning_rate": 1.2947368421052633e-05,
"loss": 0.7903,
"step": 3850
},
{
"epoch": 0.1,
"grad_norm": 3.151484966278076,
"learning_rate": 1.2926315789473685e-05,
"loss": 0.7199,
"step": 3860
},
{
"epoch": 0.1,
"grad_norm": 10.117889404296875,
"learning_rate": 1.2905263157894738e-05,
"loss": 0.7562,
"step": 3870
},
{
"epoch": 0.1,
"grad_norm": 2.595205307006836,
"learning_rate": 1.288421052631579e-05,
"loss": 0.8167,
"step": 3880
},
{
"epoch": 0.1,
"grad_norm": 3.4744372367858887,
"learning_rate": 1.2863157894736845e-05,
"loss": 0.7343,
"step": 3890
},
{
"epoch": 0.1,
"grad_norm": 3.1740803718566895,
"learning_rate": 1.2842105263157896e-05,
"loss": 0.8754,
"step": 3900
},
{
"epoch": 0.1,
"grad_norm": 7.299022197723389,
"learning_rate": 1.2821052631578947e-05,
"loss": 0.7379,
"step": 3910
},
{
"epoch": 0.1,
"grad_norm": 3.2339208126068115,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.771,
"step": 3920
},
{
"epoch": 0.1,
"grad_norm": 2.5612077713012695,
"learning_rate": 1.2778947368421054e-05,
"loss": 0.7959,
"step": 3930
},
{
"epoch": 0.1,
"grad_norm": 4.87350606918335,
"learning_rate": 1.2757894736842106e-05,
"loss": 0.7871,
"step": 3940
},
{
"epoch": 0.1,
"grad_norm": 3.8318493366241455,
"learning_rate": 1.2736842105263159e-05,
"loss": 0.7502,
"step": 3950
},
{
"epoch": 0.1,
"grad_norm": 4.797230243682861,
"learning_rate": 1.2715789473684212e-05,
"loss": 0.7241,
"step": 3960
},
{
"epoch": 0.1,
"grad_norm": 4.037790775299072,
"learning_rate": 1.2694736842105264e-05,
"loss": 0.8642,
"step": 3970
},
{
"epoch": 0.1,
"grad_norm": 4.736443042755127,
"learning_rate": 1.2673684210526315e-05,
"loss": 0.7672,
"step": 3980
},
{
"epoch": 0.1,
"grad_norm": 3.449172258377075,
"learning_rate": 1.265263157894737e-05,
"loss": 0.7685,
"step": 3990
},
{
"epoch": 0.1,
"grad_norm": 2.1893362998962402,
"learning_rate": 1.263157894736842e-05,
"loss": 0.6876,
"step": 4000
},
{
"epoch": 0.1,
"eval_loss": 0.8031703233718872,
"eval_runtime": 67.9677,
"eval_samples_per_second": 14.713,
"eval_steps_per_second": 14.713,
"step": 4000
},
{
"epoch": 0.1,
"grad_norm": 3.5685079097747803,
"learning_rate": 1.2610526315789475e-05,
"loss": 0.7317,
"step": 4010
},
{
"epoch": 0.1,
"grad_norm": 6.440120220184326,
"learning_rate": 1.2589473684210528e-05,
"loss": 0.7919,
"step": 4020
},
{
"epoch": 0.1,
"grad_norm": 5.1870341300964355,
"learning_rate": 1.256842105263158e-05,
"loss": 0.6921,
"step": 4030
},
{
"epoch": 0.1,
"grad_norm": 4.161406517028809,
"learning_rate": 1.2547368421052633e-05,
"loss": 0.7822,
"step": 4040
},
{
"epoch": 0.1,
"grad_norm": 6.242280006408691,
"learning_rate": 1.2526315789473684e-05,
"loss": 0.8151,
"step": 4050
},
{
"epoch": 0.1,
"grad_norm": 3.825861692428589,
"learning_rate": 1.2505263157894738e-05,
"loss": 0.7709,
"step": 4060
},
{
"epoch": 0.1,
"grad_norm": 22.97239112854004,
"learning_rate": 1.248421052631579e-05,
"loss": 0.8456,
"step": 4070
},
{
"epoch": 0.1,
"grad_norm": 3.718015193939209,
"learning_rate": 1.2463157894736844e-05,
"loss": 0.8354,
"step": 4080
},
{
"epoch": 0.1,
"grad_norm": 3.266710042953491,
"learning_rate": 1.2442105263157895e-05,
"loss": 0.6444,
"step": 4090
},
{
"epoch": 0.1,
"grad_norm": 4.711140155792236,
"learning_rate": 1.2421052631578949e-05,
"loss": 0.8418,
"step": 4100
},
{
"epoch": 0.1,
"grad_norm": 2.8773484230041504,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.6673,
"step": 4110
},
{
"epoch": 0.1,
"grad_norm": 4.492387771606445,
"learning_rate": 1.2378947368421053e-05,
"loss": 0.7801,
"step": 4120
},
{
"epoch": 0.1,
"grad_norm": 4.106402397155762,
"learning_rate": 1.2357894736842107e-05,
"loss": 0.718,
"step": 4130
},
{
"epoch": 0.1,
"grad_norm": 4.770216941833496,
"learning_rate": 1.2336842105263158e-05,
"loss": 0.7546,
"step": 4140
},
{
"epoch": 0.1,
"grad_norm": 3.7071616649627686,
"learning_rate": 1.2315789473684212e-05,
"loss": 0.8232,
"step": 4150
},
{
"epoch": 0.1,
"grad_norm": 11.786856651306152,
"learning_rate": 1.2294736842105263e-05,
"loss": 0.845,
"step": 4160
},
{
"epoch": 0.1,
"grad_norm": 2.190443515777588,
"learning_rate": 1.2273684210526317e-05,
"loss": 0.7656,
"step": 4170
},
{
"epoch": 0.1,
"grad_norm": 6.3326239585876465,
"learning_rate": 1.225263157894737e-05,
"loss": 0.8145,
"step": 4180
},
{
"epoch": 0.1,
"grad_norm": 2.735156297683716,
"learning_rate": 1.2231578947368421e-05,
"loss": 0.7637,
"step": 4190
},
{
"epoch": 0.1,
"grad_norm": 44.92083740234375,
"learning_rate": 1.2210526315789475e-05,
"loss": 0.8358,
"step": 4200
},
{
"epoch": 0.11,
"grad_norm": 5.335235595703125,
"learning_rate": 1.2189473684210526e-05,
"loss": 0.8564,
"step": 4210
},
{
"epoch": 0.11,
"grad_norm": 6.2740349769592285,
"learning_rate": 1.216842105263158e-05,
"loss": 0.8443,
"step": 4220
},
{
"epoch": 0.11,
"grad_norm": 5.290927410125732,
"learning_rate": 1.2147368421052632e-05,
"loss": 0.8041,
"step": 4230
},
{
"epoch": 0.11,
"grad_norm": 9.460419654846191,
"learning_rate": 1.2126315789473686e-05,
"loss": 0.8054,
"step": 4240
},
{
"epoch": 0.11,
"grad_norm": 3.963223934173584,
"learning_rate": 1.2105263157894737e-05,
"loss": 0.8104,
"step": 4250
},
{
"epoch": 0.11,
"grad_norm": 5.091956615447998,
"learning_rate": 1.208421052631579e-05,
"loss": 0.7156,
"step": 4260
},
{
"epoch": 0.11,
"grad_norm": 3.055617570877075,
"learning_rate": 1.2063157894736844e-05,
"loss": 0.835,
"step": 4270
},
{
"epoch": 0.11,
"grad_norm": 3.4951014518737793,
"learning_rate": 1.2042105263157895e-05,
"loss": 0.8004,
"step": 4280
},
{
"epoch": 0.11,
"grad_norm": 2.876716136932373,
"learning_rate": 1.202105263157895e-05,
"loss": 0.7324,
"step": 4290
},
{
"epoch": 0.11,
"grad_norm": 2.0460751056671143,
"learning_rate": 1.2e-05,
"loss": 0.7856,
"step": 4300
},
{
"epoch": 0.11,
"grad_norm": 2.417022943496704,
"learning_rate": 1.1978947368421055e-05,
"loss": 0.7649,
"step": 4310
},
{
"epoch": 0.11,
"grad_norm": 5.435426235198975,
"learning_rate": 1.1957894736842106e-05,
"loss": 0.8354,
"step": 4320
},
{
"epoch": 0.11,
"grad_norm": 3.5707461833953857,
"learning_rate": 1.1936842105263158e-05,
"loss": 0.7264,
"step": 4330
},
{
"epoch": 0.11,
"grad_norm": 4.077741622924805,
"learning_rate": 1.1915789473684211e-05,
"loss": 0.6934,
"step": 4340
},
{
"epoch": 0.11,
"grad_norm": 5.498834133148193,
"learning_rate": 1.1894736842105264e-05,
"loss": 0.7687,
"step": 4350
},
{
"epoch": 0.11,
"grad_norm": 3.684948205947876,
"learning_rate": 1.1873684210526318e-05,
"loss": 0.9371,
"step": 4360
},
{
"epoch": 0.11,
"grad_norm": 2.4451920986175537,
"learning_rate": 1.1852631578947369e-05,
"loss": 0.7904,
"step": 4370
},
{
"epoch": 0.11,
"grad_norm": 3.1316657066345215,
"learning_rate": 1.1831578947368423e-05,
"loss": 0.6835,
"step": 4380
},
{
"epoch": 0.11,
"grad_norm": 3.741140127182007,
"learning_rate": 1.1810526315789474e-05,
"loss": 0.8932,
"step": 4390
},
{
"epoch": 0.11,
"grad_norm": 4.40395975112915,
"learning_rate": 1.1789473684210527e-05,
"loss": 0.9044,
"step": 4400
},
{
"epoch": 0.11,
"grad_norm": 3.406174898147583,
"learning_rate": 1.176842105263158e-05,
"loss": 0.7924,
"step": 4410
},
{
"epoch": 0.11,
"grad_norm": 6.892871379852295,
"learning_rate": 1.1747368421052632e-05,
"loss": 0.7356,
"step": 4420
},
{
"epoch": 0.11,
"grad_norm": 5.855538368225098,
"learning_rate": 1.1726315789473685e-05,
"loss": 0.7543,
"step": 4430
},
{
"epoch": 0.11,
"grad_norm": 4.951474666595459,
"learning_rate": 1.1705263157894737e-05,
"loss": 0.7416,
"step": 4440
},
{
"epoch": 0.11,
"grad_norm": 3.314338207244873,
"learning_rate": 1.1684210526315792e-05,
"loss": 0.7764,
"step": 4450
},
{
"epoch": 0.11,
"grad_norm": 5.176599502563477,
"learning_rate": 1.1663157894736843e-05,
"loss": 0.7658,
"step": 4460
},
{
"epoch": 0.11,
"grad_norm": 5.275913238525391,
"learning_rate": 1.1642105263157897e-05,
"loss": 0.6621,
"step": 4470
},
{
"epoch": 0.11,
"grad_norm": 3.1267452239990234,
"learning_rate": 1.1621052631578948e-05,
"loss": 0.7699,
"step": 4480
},
{
"epoch": 0.11,
"grad_norm": 8.313840866088867,
"learning_rate": 1.16e-05,
"loss": 0.7262,
"step": 4490
},
{
"epoch": 0.11,
"grad_norm": 8.310973167419434,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.8871,
"step": 4500
},
{
"epoch": 0.11,
"eval_loss": 0.7730265855789185,
"eval_runtime": 67.9868,
"eval_samples_per_second": 14.709,
"eval_steps_per_second": 14.709,
"step": 4500
},
{
"epoch": 0.11,
"grad_norm": 3.4506046772003174,
"learning_rate": 1.1557894736842106e-05,
"loss": 0.7579,
"step": 4510
},
{
"epoch": 0.11,
"grad_norm": 3.864931106567383,
"learning_rate": 1.153684210526316e-05,
"loss": 0.899,
"step": 4520
},
{
"epoch": 0.11,
"grad_norm": 5.998289108276367,
"learning_rate": 1.1515789473684211e-05,
"loss": 0.7924,
"step": 4530
},
{
"epoch": 0.11,
"grad_norm": 4.859367370605469,
"learning_rate": 1.1494736842105266e-05,
"loss": 0.773,
"step": 4540
},
{
"epoch": 0.11,
"grad_norm": 3.9851796627044678,
"learning_rate": 1.1473684210526317e-05,
"loss": 0.8665,
"step": 4550
},
{
"epoch": 0.11,
"grad_norm": 5.357670783996582,
"learning_rate": 1.145263157894737e-05,
"loss": 0.7522,
"step": 4560
},
{
"epoch": 0.11,
"grad_norm": 3.778637409210205,
"learning_rate": 1.1431578947368422e-05,
"loss": 0.6188,
"step": 4570
},
{
"epoch": 0.11,
"grad_norm": 8.546213150024414,
"learning_rate": 1.1410526315789475e-05,
"loss": 0.7296,
"step": 4580
},
{
"epoch": 0.11,
"grad_norm": 2.5466620922088623,
"learning_rate": 1.1389473684210527e-05,
"loss": 0.7774,
"step": 4590
},
{
"epoch": 0.12,
"grad_norm": 4.85372257232666,
"learning_rate": 1.136842105263158e-05,
"loss": 0.7884,
"step": 4600
},
{
"epoch": 0.12,
"grad_norm": 3.1757266521453857,
"learning_rate": 1.1347368421052634e-05,
"loss": 0.8008,
"step": 4610
},
{
"epoch": 0.12,
"grad_norm": 2.7544124126434326,
"learning_rate": 1.1326315789473685e-05,
"loss": 0.7322,
"step": 4620
},
{
"epoch": 0.12,
"grad_norm": 7.456575870513916,
"learning_rate": 1.1305263157894736e-05,
"loss": 0.7978,
"step": 4630
},
{
"epoch": 0.12,
"grad_norm": 2.9789164066314697,
"learning_rate": 1.128421052631579e-05,
"loss": 0.617,
"step": 4640
},
{
"epoch": 0.12,
"grad_norm": 4.361474514007568,
"learning_rate": 1.1263157894736843e-05,
"loss": 0.8214,
"step": 4650
},
{
"epoch": 0.12,
"grad_norm": 14.45222282409668,
"learning_rate": 1.1242105263157896e-05,
"loss": 0.7183,
"step": 4660
},
{
"epoch": 0.12,
"grad_norm": 4.7697906494140625,
"learning_rate": 1.1221052631578949e-05,
"loss": 0.729,
"step": 4670
},
{
"epoch": 0.12,
"grad_norm": 4.225655555725098,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.7627,
"step": 4680
},
{
"epoch": 0.12,
"grad_norm": 4.067778587341309,
"learning_rate": 1.1178947368421054e-05,
"loss": 0.78,
"step": 4690
},
{
"epoch": 0.12,
"grad_norm": 4.5654473304748535,
"learning_rate": 1.1157894736842105e-05,
"loss": 0.7178,
"step": 4700
},
{
"epoch": 0.12,
"grad_norm": 1.7385423183441162,
"learning_rate": 1.1136842105263159e-05,
"loss": 0.9387,
"step": 4710
},
{
"epoch": 0.12,
"grad_norm": 4.847338676452637,
"learning_rate": 1.111578947368421e-05,
"loss": 0.7951,
"step": 4720
},
{
"epoch": 0.12,
"grad_norm": 2.739323377609253,
"learning_rate": 1.1094736842105264e-05,
"loss": 0.8198,
"step": 4730
},
{
"epoch": 0.12,
"grad_norm": 5.23370885848999,
"learning_rate": 1.1073684210526317e-05,
"loss": 0.7462,
"step": 4740
},
{
"epoch": 0.12,
"grad_norm": 4.970132350921631,
"learning_rate": 1.105263157894737e-05,
"loss": 0.6983,
"step": 4750
},
{
"epoch": 0.12,
"grad_norm": 3.8072540760040283,
"learning_rate": 1.1031578947368422e-05,
"loss": 0.852,
"step": 4760
},
{
"epoch": 0.12,
"grad_norm": 2.734208345413208,
"learning_rate": 1.1010526315789473e-05,
"loss": 0.8621,
"step": 4770
},
{
"epoch": 0.12,
"grad_norm": 3.022127151489258,
"learning_rate": 1.0989473684210528e-05,
"loss": 0.7652,
"step": 4780
},
{
"epoch": 0.12,
"grad_norm": 7.284844875335693,
"learning_rate": 1.0968421052631579e-05,
"loss": 0.7901,
"step": 4790
},
{
"epoch": 0.12,
"grad_norm": 6.52205753326416,
"learning_rate": 1.0947368421052633e-05,
"loss": 0.8347,
"step": 4800
},
{
"epoch": 0.12,
"grad_norm": 3.1662251949310303,
"learning_rate": 1.0926315789473686e-05,
"loss": 0.6105,
"step": 4810
},
{
"epoch": 0.12,
"grad_norm": 6.027661323547363,
"learning_rate": 1.0905263157894738e-05,
"loss": 0.7447,
"step": 4820
},
{
"epoch": 0.12,
"grad_norm": 9.989821434020996,
"learning_rate": 1.0884210526315791e-05,
"loss": 0.8144,
"step": 4830
},
{
"epoch": 0.12,
"grad_norm": 3.886387825012207,
"learning_rate": 1.0863157894736842e-05,
"loss": 0.7702,
"step": 4840
},
{
"epoch": 0.12,
"grad_norm": 8.8762845993042,
"learning_rate": 1.0842105263157896e-05,
"loss": 0.7305,
"step": 4850
},
{
"epoch": 0.12,
"grad_norm": 5.934712886810303,
"learning_rate": 1.0821052631578947e-05,
"loss": 0.7849,
"step": 4860
},
{
"epoch": 0.12,
"grad_norm": 2.3684771060943604,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.8292,
"step": 4870
},
{
"epoch": 0.12,
"grad_norm": 10.528717041015625,
"learning_rate": 1.0778947368421053e-05,
"loss": 0.8637,
"step": 4880
},
{
"epoch": 0.12,
"grad_norm": 10.721526145935059,
"learning_rate": 1.0757894736842107e-05,
"loss": 0.7756,
"step": 4890
},
{
"epoch": 0.12,
"grad_norm": 4.99760103225708,
"learning_rate": 1.073684210526316e-05,
"loss": 0.7928,
"step": 4900
},
{
"epoch": 0.12,
"grad_norm": 8.126914978027344,
"learning_rate": 1.071578947368421e-05,
"loss": 0.7833,
"step": 4910
},
{
"epoch": 0.12,
"grad_norm": 2.2220332622528076,
"learning_rate": 1.0694736842105265e-05,
"loss": 0.7819,
"step": 4920
},
{
"epoch": 0.12,
"grad_norm": 7.0100321769714355,
"learning_rate": 1.0673684210526316e-05,
"loss": 0.8533,
"step": 4930
},
{
"epoch": 0.12,
"grad_norm": 3.5343334674835205,
"learning_rate": 1.065263157894737e-05,
"loss": 0.7053,
"step": 4940
},
{
"epoch": 0.12,
"grad_norm": 4.862158298492432,
"learning_rate": 1.0631578947368421e-05,
"loss": 0.7556,
"step": 4950
},
{
"epoch": 0.12,
"grad_norm": 10.008291244506836,
"learning_rate": 1.0610526315789476e-05,
"loss": 0.8381,
"step": 4960
},
{
"epoch": 0.12,
"grad_norm": 2.455188035964966,
"learning_rate": 1.0589473684210526e-05,
"loss": 0.749,
"step": 4970
},
{
"epoch": 0.12,
"grad_norm": 5.882299423217773,
"learning_rate": 1.0568421052631579e-05,
"loss": 0.7797,
"step": 4980
},
{
"epoch": 0.12,
"grad_norm": 5.7382001876831055,
"learning_rate": 1.0547368421052633e-05,
"loss": 0.8191,
"step": 4990
},
{
"epoch": 0.12,
"grad_norm": 3.9528167247772217,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.6382,
"step": 5000
},
{
"epoch": 0.12,
"eval_loss": 0.7771185040473938,
"eval_runtime": 67.9924,
"eval_samples_per_second": 14.708,
"eval_steps_per_second": 14.708,
"step": 5000
},
{
"epoch": 0.13,
"grad_norm": 3.4830055236816406,
"learning_rate": 1.0505263157894739e-05,
"loss": 0.9099,
"step": 5010
},
{
"epoch": 0.13,
"grad_norm": 5.7392096519470215,
"learning_rate": 1.048421052631579e-05,
"loss": 0.6423,
"step": 5020
},
{
"epoch": 0.13,
"grad_norm": 2.720612049102783,
"learning_rate": 1.0463157894736844e-05,
"loss": 0.7826,
"step": 5030
},
{
"epoch": 0.13,
"grad_norm": 3.0437145233154297,
"learning_rate": 1.0442105263157895e-05,
"loss": 0.7865,
"step": 5040
},
{
"epoch": 0.13,
"grad_norm": 8.835311889648438,
"learning_rate": 1.0421052631578948e-05,
"loss": 0.7778,
"step": 5050
},
{
"epoch": 0.13,
"grad_norm": 7.596973419189453,
"learning_rate": 1.04e-05,
"loss": 0.7381,
"step": 5060
},
{
"epoch": 0.13,
"grad_norm": 4.108314037322998,
"learning_rate": 1.0378947368421053e-05,
"loss": 0.7689,
"step": 5070
},
{
"epoch": 0.13,
"grad_norm": 3.865196704864502,
"learning_rate": 1.0357894736842107e-05,
"loss": 0.7785,
"step": 5080
},
{
"epoch": 0.13,
"grad_norm": 3.4403493404388428,
"learning_rate": 1.0336842105263158e-05,
"loss": 0.8322,
"step": 5090
},
{
"epoch": 0.13,
"grad_norm": 3.243029832839966,
"learning_rate": 1.0315789473684213e-05,
"loss": 0.6658,
"step": 5100
},
{
"epoch": 0.13,
"grad_norm": 3.806818962097168,
"learning_rate": 1.0294736842105264e-05,
"loss": 0.781,
"step": 5110
},
{
"epoch": 0.13,
"grad_norm": 3.820622205734253,
"learning_rate": 1.0273684210526316e-05,
"loss": 0.7499,
"step": 5120
},
{
"epoch": 0.13,
"grad_norm": 4.203964710235596,
"learning_rate": 1.0252631578947369e-05,
"loss": 0.7702,
"step": 5130
},
{
"epoch": 0.13,
"grad_norm": 2.803215503692627,
"learning_rate": 1.0231578947368422e-05,
"loss": 0.6291,
"step": 5140
},
{
"epoch": 0.13,
"grad_norm": 5.486114978790283,
"learning_rate": 1.0210526315789476e-05,
"loss": 0.8124,
"step": 5150
},
{
"epoch": 0.13,
"grad_norm": 7.74938440322876,
"learning_rate": 1.0189473684210527e-05,
"loss": 0.7735,
"step": 5160
},
{
"epoch": 0.13,
"grad_norm": 4.10128116607666,
"learning_rate": 1.0168421052631581e-05,
"loss": 0.6809,
"step": 5170
},
{
"epoch": 0.13,
"grad_norm": 6.844088554382324,
"learning_rate": 1.0147368421052632e-05,
"loss": 0.8294,
"step": 5180
},
{
"epoch": 0.13,
"grad_norm": 4.329681873321533,
"learning_rate": 1.0126315789473685e-05,
"loss": 0.861,
"step": 5190
},
{
"epoch": 0.13,
"grad_norm": 12.482446670532227,
"learning_rate": 1.0105263157894738e-05,
"loss": 0.7346,
"step": 5200
},
{
"epoch": 0.13,
"grad_norm": 1.8471055030822754,
"learning_rate": 1.008421052631579e-05,
"loss": 0.7714,
"step": 5210
},
{
"epoch": 0.13,
"grad_norm": 3.1509273052215576,
"learning_rate": 1.0063157894736843e-05,
"loss": 0.697,
"step": 5220
},
{
"epoch": 0.13,
"grad_norm": 4.524876117706299,
"learning_rate": 1.0042105263157896e-05,
"loss": 0.8373,
"step": 5230
},
{
"epoch": 0.13,
"grad_norm": 2.7305006980895996,
"learning_rate": 1.002105263157895e-05,
"loss": 0.7182,
"step": 5240
},
{
"epoch": 0.13,
"grad_norm": 2.5194203853607178,
"learning_rate": 1e-05,
"loss": 0.794,
"step": 5250
},
{
"epoch": 0.13,
"grad_norm": 14.967845916748047,
"learning_rate": 9.978947368421053e-06,
"loss": 0.7564,
"step": 5260
},
{
"epoch": 0.13,
"grad_norm": 1.8730751276016235,
"learning_rate": 9.957894736842106e-06,
"loss": 0.726,
"step": 5270
},
{
"epoch": 0.13,
"grad_norm": 2.1793789863586426,
"learning_rate": 9.936842105263159e-06,
"loss": 0.7019,
"step": 5280
},
{
"epoch": 0.13,
"grad_norm": 5.0785651206970215,
"learning_rate": 9.915789473684211e-06,
"loss": 0.7771,
"step": 5290
},
{
"epoch": 0.13,
"grad_norm": 9.810837745666504,
"learning_rate": 9.894736842105264e-06,
"loss": 0.7542,
"step": 5300
},
{
"epoch": 0.13,
"grad_norm": 24.654855728149414,
"learning_rate": 9.873684210526317e-06,
"loss": 0.7928,
"step": 5310
},
{
"epoch": 0.13,
"grad_norm": 3.083669424057007,
"learning_rate": 9.85263157894737e-06,
"loss": 0.8091,
"step": 5320
},
{
"epoch": 0.13,
"grad_norm": 3.9507665634155273,
"learning_rate": 9.831578947368422e-06,
"loss": 0.7548,
"step": 5330
},
{
"epoch": 0.13,
"grad_norm": 2.55362606048584,
"learning_rate": 9.810526315789475e-06,
"loss": 0.7804,
"step": 5340
},
{
"epoch": 0.13,
"grad_norm": 3.572410821914673,
"learning_rate": 9.789473684210527e-06,
"loss": 0.748,
"step": 5350
},
{
"epoch": 0.13,
"grad_norm": 3.70060658454895,
"learning_rate": 9.76842105263158e-06,
"loss": 0.7303,
"step": 5360
},
{
"epoch": 0.13,
"grad_norm": 3.397512674331665,
"learning_rate": 9.747368421052633e-06,
"loss": 0.7209,
"step": 5370
},
{
"epoch": 0.13,
"grad_norm": 2.797943592071533,
"learning_rate": 9.726315789473685e-06,
"loss": 0.9082,
"step": 5380
},
{
"epoch": 0.13,
"grad_norm": 9.164168357849121,
"learning_rate": 9.705263157894738e-06,
"loss": 0.7995,
"step": 5390
},
{
"epoch": 0.14,
"grad_norm": 6.297326564788818,
"learning_rate": 9.68421052631579e-06,
"loss": 0.7484,
"step": 5400
},
{
"epoch": 0.14,
"grad_norm": 12.500905990600586,
"learning_rate": 9.663157894736843e-06,
"loss": 0.7291,
"step": 5410
},
{
"epoch": 0.14,
"grad_norm": 3.1083016395568848,
"learning_rate": 9.642105263157896e-06,
"loss": 0.8064,
"step": 5420
},
{
"epoch": 0.14,
"grad_norm": 4.058903694152832,
"learning_rate": 9.621052631578947e-06,
"loss": 0.7087,
"step": 5430
},
{
"epoch": 0.14,
"grad_norm": 5.303778648376465,
"learning_rate": 9.600000000000001e-06,
"loss": 0.6257,
"step": 5440
},
{
"epoch": 0.14,
"grad_norm": 2.8508620262145996,
"learning_rate": 9.578947368421054e-06,
"loss": 0.7423,
"step": 5450
},
{
"epoch": 0.14,
"grad_norm": 5.9560956954956055,
"learning_rate": 9.557894736842107e-06,
"loss": 0.7304,
"step": 5460
},
{
"epoch": 0.14,
"grad_norm": 2.8841540813446045,
"learning_rate": 9.53684210526316e-06,
"loss": 0.7768,
"step": 5470
},
{
"epoch": 0.14,
"grad_norm": 2.6742358207702637,
"learning_rate": 9.515789473684212e-06,
"loss": 0.7618,
"step": 5480
},
{
"epoch": 0.14,
"grad_norm": 4.105114936828613,
"learning_rate": 9.494736842105265e-06,
"loss": 0.7086,
"step": 5490
},
{
"epoch": 0.14,
"grad_norm": 4.728137493133545,
"learning_rate": 9.473684210526315e-06,
"loss": 0.8313,
"step": 5500
},
{
"epoch": 0.14,
"eval_loss": 0.7711445689201355,
"eval_runtime": 67.9047,
"eval_samples_per_second": 14.727,
"eval_steps_per_second": 14.727,
"step": 5500
},
{
"epoch": 0.14,
"grad_norm": 4.539173603057861,
"learning_rate": 9.452631578947368e-06,
"loss": 0.7231,
"step": 5510
},
{
"epoch": 0.14,
"grad_norm": 4.742118835449219,
"learning_rate": 9.43157894736842e-06,
"loss": 0.8199,
"step": 5520
},
{
"epoch": 0.14,
"grad_norm": 5.9068603515625,
"learning_rate": 9.410526315789475e-06,
"loss": 0.7615,
"step": 5530
},
{
"epoch": 0.14,
"grad_norm": 7.106772422790527,
"learning_rate": 9.389473684210528e-06,
"loss": 0.7139,
"step": 5540
},
{
"epoch": 0.14,
"grad_norm": 2.272012710571289,
"learning_rate": 9.36842105263158e-06,
"loss": 0.6264,
"step": 5550
},
{
"epoch": 0.14,
"grad_norm": 14.025699615478516,
"learning_rate": 9.347368421052633e-06,
"loss": 0.7416,
"step": 5560
},
{
"epoch": 0.14,
"grad_norm": 12.747345924377441,
"learning_rate": 9.326315789473684e-06,
"loss": 0.781,
"step": 5570
},
{
"epoch": 0.14,
"grad_norm": 7.966195106506348,
"learning_rate": 9.305263157894737e-06,
"loss": 0.7503,
"step": 5580
},
{
"epoch": 0.14,
"grad_norm": 3.3705811500549316,
"learning_rate": 9.28421052631579e-06,
"loss": 0.7704,
"step": 5590
},
{
"epoch": 0.14,
"grad_norm": 5.239542007446289,
"learning_rate": 9.263157894736842e-06,
"loss": 0.6806,
"step": 5600
},
{
"epoch": 0.14,
"grad_norm": 6.395047187805176,
"learning_rate": 9.242105263157896e-06,
"loss": 0.6961,
"step": 5610
},
{
"epoch": 0.14,
"grad_norm": 3.807992458343506,
"learning_rate": 9.221052631578949e-06,
"loss": 0.769,
"step": 5620
},
{
"epoch": 0.14,
"grad_norm": 3.8179049491882324,
"learning_rate": 9.200000000000002e-06,
"loss": 0.7515,
"step": 5630
},
{
"epoch": 0.14,
"grad_norm": 4.826687812805176,
"learning_rate": 9.178947368421053e-06,
"loss": 0.7337,
"step": 5640
},
{
"epoch": 0.14,
"grad_norm": 4.776168346405029,
"learning_rate": 9.157894736842105e-06,
"loss": 0.7173,
"step": 5650
},
{
"epoch": 0.14,
"grad_norm": 4.10529088973999,
"learning_rate": 9.136842105263158e-06,
"loss": 0.7255,
"step": 5660
},
{
"epoch": 0.14,
"grad_norm": 5.4715189933776855,
"learning_rate": 9.11578947368421e-06,
"loss": 0.8092,
"step": 5670
},
{
"epoch": 0.14,
"grad_norm": 3.8921728134155273,
"learning_rate": 9.094736842105263e-06,
"loss": 0.6684,
"step": 5680
},
{
"epoch": 0.14,
"grad_norm": 5.904684066772461,
"learning_rate": 9.073684210526316e-06,
"loss": 0.7804,
"step": 5690
},
{
"epoch": 0.14,
"grad_norm": 9.521209716796875,
"learning_rate": 9.05263157894737e-06,
"loss": 0.793,
"step": 5700
},
{
"epoch": 0.14,
"grad_norm": 11.125286102294922,
"learning_rate": 9.031578947368423e-06,
"loss": 0.8254,
"step": 5710
},
{
"epoch": 0.14,
"grad_norm": 8.136049270629883,
"learning_rate": 9.010526315789474e-06,
"loss": 0.7475,
"step": 5720
},
{
"epoch": 0.14,
"grad_norm": 2.4722092151641846,
"learning_rate": 8.989473684210527e-06,
"loss": 0.7268,
"step": 5730
},
{
"epoch": 0.14,
"grad_norm": 3.330580711364746,
"learning_rate": 8.96842105263158e-06,
"loss": 0.7995,
"step": 5740
},
{
"epoch": 0.14,
"grad_norm": 25.711868286132812,
"learning_rate": 8.947368421052632e-06,
"loss": 0.801,
"step": 5750
},
{
"epoch": 0.14,
"grad_norm": 2.3957395553588867,
"learning_rate": 8.926315789473685e-06,
"loss": 0.6988,
"step": 5760
},
{
"epoch": 0.14,
"grad_norm": 3.033153533935547,
"learning_rate": 8.905263157894737e-06,
"loss": 0.7378,
"step": 5770
},
{
"epoch": 0.14,
"grad_norm": 4.359398365020752,
"learning_rate": 8.884210526315792e-06,
"loss": 0.7214,
"step": 5780
},
{
"epoch": 0.14,
"grad_norm": 3.08485746383667,
"learning_rate": 8.863157894736842e-06,
"loss": 0.7034,
"step": 5790
},
{
"epoch": 0.14,
"grad_norm": 4.156674385070801,
"learning_rate": 8.842105263157895e-06,
"loss": 0.7833,
"step": 5800
},
{
"epoch": 0.15,
"grad_norm": 4.031563758850098,
"learning_rate": 8.821052631578948e-06,
"loss": 0.7385,
"step": 5810
},
{
"epoch": 0.15,
"grad_norm": 9.957317352294922,
"learning_rate": 8.8e-06,
"loss": 0.8572,
"step": 5820
},
{
"epoch": 0.15,
"grad_norm": 3.951910972595215,
"learning_rate": 8.778947368421053e-06,
"loss": 0.7374,
"step": 5830
},
{
"epoch": 0.15,
"grad_norm": 5.296828746795654,
"learning_rate": 8.757894736842106e-06,
"loss": 0.7619,
"step": 5840
},
{
"epoch": 0.15,
"grad_norm": 7.079039096832275,
"learning_rate": 8.736842105263158e-06,
"loss": 0.7842,
"step": 5850
},
{
"epoch": 0.15,
"grad_norm": 4.972481727600098,
"learning_rate": 8.715789473684211e-06,
"loss": 0.7039,
"step": 5860
},
{
"epoch": 0.15,
"grad_norm": 11.936322212219238,
"learning_rate": 8.694736842105264e-06,
"loss": 0.6701,
"step": 5870
},
{
"epoch": 0.15,
"grad_norm": 4.164266586303711,
"learning_rate": 8.673684210526316e-06,
"loss": 0.7481,
"step": 5880
},
{
"epoch": 0.15,
"grad_norm": 4.0412397384643555,
"learning_rate": 8.652631578947369e-06,
"loss": 0.8783,
"step": 5890
},
{
"epoch": 0.15,
"grad_norm": 13.239718437194824,
"learning_rate": 8.631578947368422e-06,
"loss": 0.8639,
"step": 5900
},
{
"epoch": 0.15,
"grad_norm": 5.553131103515625,
"learning_rate": 8.610526315789474e-06,
"loss": 0.7861,
"step": 5910
},
{
"epoch": 0.15,
"grad_norm": 4.507501602172852,
"learning_rate": 8.589473684210527e-06,
"loss": 0.7526,
"step": 5920
},
{
"epoch": 0.15,
"grad_norm": 3.70124888420105,
"learning_rate": 8.56842105263158e-06,
"loss": 0.8391,
"step": 5930
},
{
"epoch": 0.15,
"grad_norm": 4.307315349578857,
"learning_rate": 8.547368421052632e-06,
"loss": 0.7253,
"step": 5940
},
{
"epoch": 0.15,
"grad_norm": 12.232582092285156,
"learning_rate": 8.526315789473685e-06,
"loss": 0.8559,
"step": 5950
},
{
"epoch": 0.15,
"grad_norm": 3.0924105644226074,
"learning_rate": 8.505263157894738e-06,
"loss": 0.6245,
"step": 5960
},
{
"epoch": 0.15,
"grad_norm": 2.90191912651062,
"learning_rate": 8.48421052631579e-06,
"loss": 0.6643,
"step": 5970
},
{
"epoch": 0.15,
"grad_norm": 3.4637041091918945,
"learning_rate": 8.463157894736843e-06,
"loss": 0.72,
"step": 5980
},
{
"epoch": 0.15,
"grad_norm": 2.8273704051971436,
"learning_rate": 8.442105263157896e-06,
"loss": 0.7202,
"step": 5990
},
{
"epoch": 0.15,
"grad_norm": 7.119280815124512,
"learning_rate": 8.421052631578948e-06,
"loss": 0.7047,
"step": 6000
},
{
"epoch": 0.15,
"eval_loss": 0.7685219645500183,
"eval_runtime": 67.892,
"eval_samples_per_second": 14.729,
"eval_steps_per_second": 14.729,
"step": 6000
},
{
"epoch": 0.15,
"grad_norm": 4.9551520347595215,
"learning_rate": 8.400000000000001e-06,
"loss": 0.6911,
"step": 6010
},
{
"epoch": 0.15,
"grad_norm": 2.9231200218200684,
"learning_rate": 8.378947368421054e-06,
"loss": 0.7942,
"step": 6020
},
{
"epoch": 0.15,
"grad_norm": 7.254823684692383,
"learning_rate": 8.357894736842106e-06,
"loss": 0.7811,
"step": 6030
},
{
"epoch": 0.15,
"grad_norm": 3.8563404083251953,
"learning_rate": 8.336842105263159e-06,
"loss": 0.7523,
"step": 6040
},
{
"epoch": 0.15,
"grad_norm": 3.5061299800872803,
"learning_rate": 8.315789473684212e-06,
"loss": 0.6222,
"step": 6050
},
{
"epoch": 0.15,
"grad_norm": 3.3213858604431152,
"learning_rate": 8.294736842105264e-06,
"loss": 0.7617,
"step": 6060
},
{
"epoch": 0.15,
"grad_norm": 5.054555416107178,
"learning_rate": 8.273684210526317e-06,
"loss": 0.7333,
"step": 6070
},
{
"epoch": 0.15,
"grad_norm": 3.5189318656921387,
"learning_rate": 8.25263157894737e-06,
"loss": 0.8676,
"step": 6080
},
{
"epoch": 0.15,
"grad_norm": 4.989790439605713,
"learning_rate": 8.231578947368422e-06,
"loss": 0.6678,
"step": 6090
},
{
"epoch": 0.15,
"grad_norm": 7.941010475158691,
"learning_rate": 8.210526315789475e-06,
"loss": 0.7317,
"step": 6100
},
{
"epoch": 0.15,
"grad_norm": 6.6499247550964355,
"learning_rate": 8.189473684210527e-06,
"loss": 0.7484,
"step": 6110
},
{
"epoch": 0.15,
"grad_norm": 3.512948513031006,
"learning_rate": 8.16842105263158e-06,
"loss": 0.8508,
"step": 6120
},
{
"epoch": 0.15,
"grad_norm": 3.844045400619507,
"learning_rate": 8.147368421052633e-06,
"loss": 0.7468,
"step": 6130
},
{
"epoch": 0.15,
"grad_norm": 2.620250701904297,
"learning_rate": 8.126315789473684e-06,
"loss": 0.6449,
"step": 6140
},
{
"epoch": 0.15,
"grad_norm": 3.5233919620513916,
"learning_rate": 8.105263157894736e-06,
"loss": 0.7928,
"step": 6150
},
{
"epoch": 0.15,
"grad_norm": 4.866186618804932,
"learning_rate": 8.08421052631579e-06,
"loss": 0.787,
"step": 6160
},
{
"epoch": 0.15,
"grad_norm": 4.392407417297363,
"learning_rate": 8.063157894736843e-06,
"loss": 0.7746,
"step": 6170
},
{
"epoch": 0.15,
"grad_norm": 6.6285176277160645,
"learning_rate": 8.042105263157896e-06,
"loss": 0.7304,
"step": 6180
},
{
"epoch": 0.15,
"grad_norm": 2.571240186691284,
"learning_rate": 8.021052631578949e-06,
"loss": 0.7008,
"step": 6190
},
{
"epoch": 0.15,
"grad_norm": 2.8306283950805664,
"learning_rate": 8.000000000000001e-06,
"loss": 0.834,
"step": 6200
},
{
"epoch": 0.16,
"grad_norm": 2.5514955520629883,
"learning_rate": 7.978947368421052e-06,
"loss": 0.8136,
"step": 6210
},
{
"epoch": 0.16,
"grad_norm": 8.471675872802734,
"learning_rate": 7.957894736842105e-06,
"loss": 0.8439,
"step": 6220
},
{
"epoch": 0.16,
"grad_norm": 8.785553932189941,
"learning_rate": 7.936842105263158e-06,
"loss": 0.7763,
"step": 6230
},
{
"epoch": 0.16,
"grad_norm": 5.334304332733154,
"learning_rate": 7.915789473684212e-06,
"loss": 0.7832,
"step": 6240
},
{
"epoch": 0.16,
"grad_norm": 14.861701011657715,
"learning_rate": 7.894736842105265e-06,
"loss": 0.6889,
"step": 6250
},
{
"epoch": 0.16,
"grad_norm": 2.040034770965576,
"learning_rate": 7.873684210526317e-06,
"loss": 0.7422,
"step": 6260
},
{
"epoch": 0.16,
"grad_norm": 9.74354076385498,
"learning_rate": 7.85263157894737e-06,
"loss": 0.7765,
"step": 6270
},
{
"epoch": 0.16,
"grad_norm": 3.4280757904052734,
"learning_rate": 7.831578947368421e-06,
"loss": 0.7465,
"step": 6280
},
{
"epoch": 0.16,
"grad_norm": 6.530819416046143,
"learning_rate": 7.810526315789474e-06,
"loss": 0.8216,
"step": 6290
},
{
"epoch": 0.16,
"grad_norm": 6.786412239074707,
"learning_rate": 7.789473684210526e-06,
"loss": 0.7694,
"step": 6300
},
{
"epoch": 0.16,
"grad_norm": 4.896278381347656,
"learning_rate": 7.768421052631579e-06,
"loss": 0.8282,
"step": 6310
},
{
"epoch": 0.16,
"grad_norm": 4.5938825607299805,
"learning_rate": 7.747368421052631e-06,
"loss": 0.6628,
"step": 6320
},
{
"epoch": 0.16,
"grad_norm": 2.134136915206909,
"learning_rate": 7.726315789473686e-06,
"loss": 0.8061,
"step": 6330
},
{
"epoch": 0.16,
"grad_norm": 7.497835159301758,
"learning_rate": 7.705263157894738e-06,
"loss": 0.8946,
"step": 6340
},
{
"epoch": 0.16,
"grad_norm": 3.5185306072235107,
"learning_rate": 7.68421052631579e-06,
"loss": 0.6689,
"step": 6350
},
{
"epoch": 0.16,
"grad_norm": 2.464015245437622,
"learning_rate": 7.663157894736842e-06,
"loss": 0.7758,
"step": 6360
},
{
"epoch": 0.16,
"grad_norm": 2.803342580795288,
"learning_rate": 7.642105263157895e-06,
"loss": 0.7478,
"step": 6370
},
{
"epoch": 0.16,
"grad_norm": 6.2652130126953125,
"learning_rate": 7.621052631578948e-06,
"loss": 0.7293,
"step": 6380
},
{
"epoch": 0.16,
"grad_norm": 9.655146598815918,
"learning_rate": 7.600000000000001e-06,
"loss": 0.7454,
"step": 6390
},
{
"epoch": 0.16,
"grad_norm": 5.041891574859619,
"learning_rate": 7.578947368421054e-06,
"loss": 0.8579,
"step": 6400
},
{
"epoch": 0.16,
"grad_norm": 3.133237838745117,
"learning_rate": 7.557894736842106e-06,
"loss": 0.6662,
"step": 6410
},
{
"epoch": 0.16,
"grad_norm": 7.207560062408447,
"learning_rate": 7.536842105263158e-06,
"loss": 0.8135,
"step": 6420
},
{
"epoch": 0.16,
"grad_norm": 3.374864101409912,
"learning_rate": 7.515789473684211e-06,
"loss": 0.7514,
"step": 6430
},
{
"epoch": 0.16,
"grad_norm": 4.067178249359131,
"learning_rate": 7.494736842105263e-06,
"loss": 0.7446,
"step": 6440
},
{
"epoch": 0.16,
"grad_norm": 4.283421516418457,
"learning_rate": 7.473684210526316e-06,
"loss": 0.7955,
"step": 6450
},
{
"epoch": 0.16,
"grad_norm": 3.092348098754883,
"learning_rate": 7.4526315789473695e-06,
"loss": 0.5471,
"step": 6460
},
{
"epoch": 0.16,
"grad_norm": 9.400391578674316,
"learning_rate": 7.431578947368422e-06,
"loss": 0.7098,
"step": 6470
},
{
"epoch": 0.16,
"grad_norm": 5.843224048614502,
"learning_rate": 7.410526315789475e-06,
"loss": 0.7943,
"step": 6480
},
{
"epoch": 0.16,
"grad_norm": 3.5985705852508545,
"learning_rate": 7.3894736842105275e-06,
"loss": 0.8059,
"step": 6490
},
{
"epoch": 0.16,
"grad_norm": 5.502979278564453,
"learning_rate": 7.368421052631579e-06,
"loss": 0.6236,
"step": 6500
},
{
"epoch": 0.16,
"eval_loss": 0.7682243585586548,
"eval_runtime": 67.9039,
"eval_samples_per_second": 14.727,
"eval_steps_per_second": 14.727,
"step": 6500
},
{
"epoch": 0.16,
"grad_norm": 11.025419235229492,
"learning_rate": 7.347368421052632e-06,
"loss": 0.8343,
"step": 6510
},
{
"epoch": 0.16,
"grad_norm": 3.4290804862976074,
"learning_rate": 7.326315789473685e-06,
"loss": 0.7572,
"step": 6520
},
{
"epoch": 0.16,
"grad_norm": 3.0629210472106934,
"learning_rate": 7.305263157894737e-06,
"loss": 0.8245,
"step": 6530
},
{
"epoch": 0.16,
"grad_norm": 5.065977573394775,
"learning_rate": 7.28421052631579e-06,
"loss": 0.6447,
"step": 6540
},
{
"epoch": 0.16,
"grad_norm": 3.971541166305542,
"learning_rate": 7.263157894736843e-06,
"loss": 0.8688,
"step": 6550
},
{
"epoch": 0.16,
"grad_norm": 3.4434573650360107,
"learning_rate": 7.242105263157896e-06,
"loss": 0.6749,
"step": 6560
},
{
"epoch": 0.16,
"grad_norm": 4.323293685913086,
"learning_rate": 7.221052631578948e-06,
"loss": 0.7982,
"step": 6570
},
{
"epoch": 0.16,
"grad_norm": 16.821266174316406,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7898,
"step": 6580
},
{
"epoch": 0.16,
"grad_norm": 3.008687734603882,
"learning_rate": 7.178947368421053e-06,
"loss": 0.7375,
"step": 6590
},
{
"epoch": 0.17,
"grad_norm": 3.629837989807129,
"learning_rate": 7.157894736842106e-06,
"loss": 0.7909,
"step": 6600
},
{
"epoch": 0.17,
"grad_norm": 5.807744026184082,
"learning_rate": 7.1368421052631585e-06,
"loss": 0.621,
"step": 6610
},
{
"epoch": 0.17,
"grad_norm": 3.9960129261016846,
"learning_rate": 7.115789473684211e-06,
"loss": 0.851,
"step": 6620
},
{
"epoch": 0.17,
"grad_norm": 2.7165372371673584,
"learning_rate": 7.094736842105265e-06,
"loss": 0.7872,
"step": 6630
},
{
"epoch": 0.17,
"grad_norm": 5.922586917877197,
"learning_rate": 7.073684210526316e-06,
"loss": 0.8822,
"step": 6640
},
{
"epoch": 0.17,
"grad_norm": 9.046282768249512,
"learning_rate": 7.052631578947369e-06,
"loss": 0.7454,
"step": 6650
},
{
"epoch": 0.17,
"grad_norm": 4.76317024230957,
"learning_rate": 7.031578947368422e-06,
"loss": 0.7116,
"step": 6660
},
{
"epoch": 0.17,
"grad_norm": 4.31531286239624,
"learning_rate": 7.010526315789474e-06,
"loss": 0.7892,
"step": 6670
},
{
"epoch": 0.17,
"grad_norm": 3.0895297527313232,
"learning_rate": 6.989473684210527e-06,
"loss": 0.7095,
"step": 6680
},
{
"epoch": 0.17,
"grad_norm": 4.174783706665039,
"learning_rate": 6.96842105263158e-06,
"loss": 0.8007,
"step": 6690
},
{
"epoch": 0.17,
"grad_norm": 4.1555280685424805,
"learning_rate": 6.947368421052632e-06,
"loss": 0.8274,
"step": 6700
},
{
"epoch": 0.17,
"grad_norm": 3.173882246017456,
"learning_rate": 6.926315789473684e-06,
"loss": 0.6447,
"step": 6710
},
{
"epoch": 0.17,
"grad_norm": 2.1489410400390625,
"learning_rate": 6.905263157894737e-06,
"loss": 0.7428,
"step": 6720
},
{
"epoch": 0.17,
"grad_norm": 2.523904323577881,
"learning_rate": 6.8842105263157895e-06,
"loss": 0.8159,
"step": 6730
},
{
"epoch": 0.17,
"grad_norm": 1.7494622468948364,
"learning_rate": 6.863157894736843e-06,
"loss": 0.863,
"step": 6740
},
{
"epoch": 0.17,
"grad_norm": 2.552121639251709,
"learning_rate": 6.842105263157896e-06,
"loss": 0.7448,
"step": 6750
},
{
"epoch": 0.17,
"grad_norm": 4.1907453536987305,
"learning_rate": 6.821052631578948e-06,
"loss": 0.6813,
"step": 6760
},
{
"epoch": 0.17,
"grad_norm": 4.284384727478027,
"learning_rate": 6.800000000000001e-06,
"loss": 0.699,
"step": 6770
},
{
"epoch": 0.17,
"grad_norm": 5.010688781738281,
"learning_rate": 6.778947368421053e-06,
"loss": 0.7803,
"step": 6780
},
{
"epoch": 0.17,
"grad_norm": 2.5098397731781006,
"learning_rate": 6.7578947368421054e-06,
"loss": 0.767,
"step": 6790
},
{
"epoch": 0.17,
"grad_norm": 2.8980441093444824,
"learning_rate": 6.736842105263158e-06,
"loss": 0.8084,
"step": 6800
},
{
"epoch": 0.17,
"grad_norm": 3.8058199882507324,
"learning_rate": 6.715789473684211e-06,
"loss": 0.7214,
"step": 6810
},
{
"epoch": 0.17,
"grad_norm": 2.3668529987335205,
"learning_rate": 6.694736842105264e-06,
"loss": 0.6759,
"step": 6820
},
{
"epoch": 0.17,
"grad_norm": 5.715735912322998,
"learning_rate": 6.673684210526317e-06,
"loss": 0.7747,
"step": 6830
},
{
"epoch": 0.17,
"grad_norm": 8.902985572814941,
"learning_rate": 6.6526315789473695e-06,
"loss": 0.8256,
"step": 6840
},
{
"epoch": 0.17,
"grad_norm": 5.802920818328857,
"learning_rate": 6.631578947368421e-06,
"loss": 0.7682,
"step": 6850
},
{
"epoch": 0.17,
"grad_norm": 9.218498229980469,
"learning_rate": 6.610526315789474e-06,
"loss": 0.7855,
"step": 6860
},
{
"epoch": 0.17,
"grad_norm": 4.406294822692871,
"learning_rate": 6.589473684210527e-06,
"loss": 0.736,
"step": 6870
},
{
"epoch": 0.17,
"grad_norm": 5.765889644622803,
"learning_rate": 6.568421052631579e-06,
"loss": 0.7073,
"step": 6880
},
{
"epoch": 0.17,
"grad_norm": 2.910264015197754,
"learning_rate": 6.547368421052632e-06,
"loss": 0.7328,
"step": 6890
},
{
"epoch": 0.17,
"grad_norm": 9.011739730834961,
"learning_rate": 6.526315789473685e-06,
"loss": 0.6798,
"step": 6900
},
{
"epoch": 0.17,
"grad_norm": 8.296028137207031,
"learning_rate": 6.505263157894738e-06,
"loss": 0.7469,
"step": 6910
},
{
"epoch": 0.17,
"grad_norm": 5.347682952880859,
"learning_rate": 6.484210526315789e-06,
"loss": 0.7143,
"step": 6920
},
{
"epoch": 0.17,
"grad_norm": 5.903685092926025,
"learning_rate": 6.463157894736843e-06,
"loss": 0.7413,
"step": 6930
},
{
"epoch": 0.17,
"grad_norm": 4.017665386199951,
"learning_rate": 6.442105263157895e-06,
"loss": 0.7569,
"step": 6940
},
{
"epoch": 0.17,
"grad_norm": 2.3947088718414307,
"learning_rate": 6.421052631578948e-06,
"loss": 0.75,
"step": 6950
},
{
"epoch": 0.17,
"grad_norm": 4.019251823425293,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.7364,
"step": 6960
},
{
"epoch": 0.17,
"grad_norm": 2.439628839492798,
"learning_rate": 6.378947368421053e-06,
"loss": 0.68,
"step": 6970
},
{
"epoch": 0.17,
"grad_norm": 2.413942575454712,
"learning_rate": 6.357894736842106e-06,
"loss": 0.79,
"step": 6980
},
{
"epoch": 0.17,
"grad_norm": 8.72237491607666,
"learning_rate": 6.336842105263158e-06,
"loss": 0.6678,
"step": 6990
},
{
"epoch": 0.17,
"grad_norm": 3.9021055698394775,
"learning_rate": 6.31578947368421e-06,
"loss": 0.7169,
"step": 7000
},
{
"epoch": 0.17,
"eval_loss": 0.7889605164527893,
"eval_runtime": 67.8704,
"eval_samples_per_second": 14.734,
"eval_steps_per_second": 14.734,
"step": 7000
},
{
"epoch": 0.18,
"grad_norm": 8.238909721374512,
"learning_rate": 6.294736842105264e-06,
"loss": 0.658,
"step": 7010
},
{
"epoch": 0.18,
"grad_norm": 3.403461456298828,
"learning_rate": 6.2736842105263165e-06,
"loss": 0.8165,
"step": 7020
},
{
"epoch": 0.18,
"grad_norm": 5.648688316345215,
"learning_rate": 6.252631578947369e-06,
"loss": 0.7506,
"step": 7030
},
{
"epoch": 0.18,
"grad_norm": 2.380591630935669,
"learning_rate": 6.231578947368422e-06,
"loss": 0.8892,
"step": 7040
},
{
"epoch": 0.18,
"grad_norm": 4.201750755310059,
"learning_rate": 6.2105263157894745e-06,
"loss": 0.7069,
"step": 7050
},
{
"epoch": 0.18,
"grad_norm": 2.9994821548461914,
"learning_rate": 6.189473684210526e-06,
"loss": 0.6896,
"step": 7060
},
{
"epoch": 0.18,
"grad_norm": 5.100094318389893,
"learning_rate": 6.168421052631579e-06,
"loss": 0.6241,
"step": 7070
},
{
"epoch": 0.18,
"grad_norm": 3.88962721824646,
"learning_rate": 6.1473684210526316e-06,
"loss": 0.741,
"step": 7080
},
{
"epoch": 0.18,
"grad_norm": 3.669283151626587,
"learning_rate": 6.126315789473685e-06,
"loss": 0.5153,
"step": 7090
},
{
"epoch": 0.18,
"grad_norm": 6.010345458984375,
"learning_rate": 6.105263157894738e-06,
"loss": 0.7394,
"step": 7100
},
{
"epoch": 0.18,
"grad_norm": 5.333982467651367,
"learning_rate": 6.08421052631579e-06,
"loss": 0.6423,
"step": 7110
},
{
"epoch": 0.18,
"grad_norm": 2.0060064792633057,
"learning_rate": 6.063157894736843e-06,
"loss": 0.7073,
"step": 7120
},
{
"epoch": 0.18,
"grad_norm": 3.618821144104004,
"learning_rate": 6.042105263157895e-06,
"loss": 0.7221,
"step": 7130
},
{
"epoch": 0.18,
"grad_norm": 2.6231422424316406,
"learning_rate": 6.0210526315789475e-06,
"loss": 0.6748,
"step": 7140
},
{
"epoch": 0.18,
"grad_norm": 7.207015514373779,
"learning_rate": 6e-06,
"loss": 0.7403,
"step": 7150
},
{
"epoch": 0.18,
"grad_norm": 5.1877031326293945,
"learning_rate": 5.978947368421053e-06,
"loss": 0.6143,
"step": 7160
},
{
"epoch": 0.18,
"grad_norm": 3.433973550796509,
"learning_rate": 5.9578947368421055e-06,
"loss": 0.6593,
"step": 7170
},
{
"epoch": 0.18,
"grad_norm": 4.261890888214111,
"learning_rate": 5.936842105263159e-06,
"loss": 0.7119,
"step": 7180
},
{
"epoch": 0.18,
"grad_norm": 2.4731180667877197,
"learning_rate": 5.915789473684212e-06,
"loss": 0.7764,
"step": 7190
},
{
"epoch": 0.18,
"grad_norm": 3.540252923965454,
"learning_rate": 5.8947368421052634e-06,
"loss": 0.788,
"step": 7200
},
{
"epoch": 0.18,
"grad_norm": 16.481884002685547,
"learning_rate": 5.873684210526316e-06,
"loss": 0.7411,
"step": 7210
},
{
"epoch": 0.18,
"grad_norm": 5.3406548500061035,
"learning_rate": 5.852631578947369e-06,
"loss": 0.7703,
"step": 7220
},
{
"epoch": 0.18,
"grad_norm": 5.786658763885498,
"learning_rate": 5.831578947368421e-06,
"loss": 0.7068,
"step": 7230
},
{
"epoch": 0.18,
"grad_norm": 6.659720420837402,
"learning_rate": 5.810526315789474e-06,
"loss": 0.7287,
"step": 7240
},
{
"epoch": 0.18,
"grad_norm": 2.9273788928985596,
"learning_rate": 5.789473684210527e-06,
"loss": 0.7059,
"step": 7250
},
{
"epoch": 0.18,
"grad_norm": 5.475671768188477,
"learning_rate": 5.76842105263158e-06,
"loss": 0.7284,
"step": 7260
},
{
"epoch": 0.18,
"grad_norm": 5.699868202209473,
"learning_rate": 5.747368421052633e-06,
"loss": 0.8036,
"step": 7270
},
{
"epoch": 0.18,
"grad_norm": 3.518573045730591,
"learning_rate": 5.726315789473685e-06,
"loss": 0.7209,
"step": 7280
},
{
"epoch": 0.18,
"grad_norm": 8.151999473571777,
"learning_rate": 5.705263157894737e-06,
"loss": 0.6903,
"step": 7290
},
{
"epoch": 0.18,
"grad_norm": 4.088874340057373,
"learning_rate": 5.68421052631579e-06,
"loss": 0.7685,
"step": 7300
},
{
"epoch": 0.18,
"grad_norm": 9.118200302124023,
"learning_rate": 5.663157894736843e-06,
"loss": 0.7256,
"step": 7310
},
{
"epoch": 0.18,
"grad_norm": 6.765544414520264,
"learning_rate": 5.642105263157895e-06,
"loss": 0.8016,
"step": 7320
},
{
"epoch": 0.18,
"grad_norm": 11.424837112426758,
"learning_rate": 5.621052631578948e-06,
"loss": 0.7721,
"step": 7330
},
{
"epoch": 0.18,
"grad_norm": 5.862210750579834,
"learning_rate": 5.600000000000001e-06,
"loss": 0.6898,
"step": 7340
},
{
"epoch": 0.18,
"grad_norm": 4.197153568267822,
"learning_rate": 5.578947368421052e-06,
"loss": 0.6907,
"step": 7350
},
{
"epoch": 0.18,
"grad_norm": 6.712553977966309,
"learning_rate": 5.557894736842105e-06,
"loss": 0.773,
"step": 7360
},
{
"epoch": 0.18,
"grad_norm": 4.968278408050537,
"learning_rate": 5.5368421052631586e-06,
"loss": 0.7892,
"step": 7370
},
{
"epoch": 0.18,
"grad_norm": 3.8882153034210205,
"learning_rate": 5.515789473684211e-06,
"loss": 0.8365,
"step": 7380
},
{
"epoch": 0.18,
"grad_norm": 6.6297197341918945,
"learning_rate": 5.494736842105264e-06,
"loss": 0.7374,
"step": 7390
},
{
"epoch": 0.18,
"grad_norm": 2.2362327575683594,
"learning_rate": 5.4736842105263165e-06,
"loss": 0.8293,
"step": 7400
},
{
"epoch": 0.19,
"grad_norm": 4.1008100509643555,
"learning_rate": 5.452631578947369e-06,
"loss": 0.7048,
"step": 7410
},
{
"epoch": 0.19,
"grad_norm": 4.488921642303467,
"learning_rate": 5.431578947368421e-06,
"loss": 0.7902,
"step": 7420
},
{
"epoch": 0.19,
"grad_norm": 3.7497622966766357,
"learning_rate": 5.410526315789474e-06,
"loss": 0.8359,
"step": 7430
},
{
"epoch": 0.19,
"grad_norm": 3.192277193069458,
"learning_rate": 5.389473684210526e-06,
"loss": 0.7253,
"step": 7440
},
{
"epoch": 0.19,
"grad_norm": 4.586243629455566,
"learning_rate": 5.36842105263158e-06,
"loss": 0.7588,
"step": 7450
},
{
"epoch": 0.19,
"grad_norm": 3.424870729446411,
"learning_rate": 5.3473684210526325e-06,
"loss": 0.7268,
"step": 7460
},
{
"epoch": 0.19,
"grad_norm": 28.807186126708984,
"learning_rate": 5.326315789473685e-06,
"loss": 0.7979,
"step": 7470
},
{
"epoch": 0.19,
"grad_norm": 4.297823905944824,
"learning_rate": 5.305263157894738e-06,
"loss": 0.768,
"step": 7480
},
{
"epoch": 0.19,
"grad_norm": 4.891976833343506,
"learning_rate": 5.2842105263157896e-06,
"loss": 0.7063,
"step": 7490
},
{
"epoch": 0.19,
"grad_norm": 4.083632469177246,
"learning_rate": 5.263157894736842e-06,
"loss": 0.8102,
"step": 7500
},
{
"epoch": 0.19,
"eval_loss": 0.7507393956184387,
"eval_runtime": 67.8717,
"eval_samples_per_second": 14.734,
"eval_steps_per_second": 14.734,
"step": 7500
},
{
"epoch": 0.19,
"grad_norm": 10.315424919128418,
"learning_rate": 5.242105263157895e-06,
"loss": 0.736,
"step": 7510
},
{
"epoch": 0.19,
"grad_norm": 3.440877676010132,
"learning_rate": 5.2210526315789475e-06,
"loss": 0.799,
"step": 7520
},
{
"epoch": 0.19,
"grad_norm": 2.361064910888672,
"learning_rate": 5.2e-06,
"loss": 0.832,
"step": 7530
},
{
"epoch": 0.19,
"grad_norm": 2.1224961280822754,
"learning_rate": 5.178947368421054e-06,
"loss": 0.7118,
"step": 7540
},
{
"epoch": 0.19,
"grad_norm": 4.9322614669799805,
"learning_rate": 5.157894736842106e-06,
"loss": 0.6614,
"step": 7550
},
{
"epoch": 0.19,
"grad_norm": 4.812900066375732,
"learning_rate": 5.136842105263158e-06,
"loss": 0.8002,
"step": 7560
},
{
"epoch": 0.19,
"grad_norm": 6.411820411682129,
"learning_rate": 5.115789473684211e-06,
"loss": 0.835,
"step": 7570
},
{
"epoch": 0.19,
"grad_norm": 5.406981468200684,
"learning_rate": 5.0947368421052635e-06,
"loss": 0.8384,
"step": 7580
},
{
"epoch": 0.19,
"grad_norm": 4.32007360458374,
"learning_rate": 5.073684210526316e-06,
"loss": 0.5798,
"step": 7590
},
{
"epoch": 0.19,
"grad_norm": 4.640589714050293,
"learning_rate": 5.052631578947369e-06,
"loss": 0.7896,
"step": 7600
},
{
"epoch": 0.19,
"grad_norm": 5.4717936515808105,
"learning_rate": 5.0315789473684214e-06,
"loss": 0.7829,
"step": 7610
},
{
"epoch": 0.19,
"grad_norm": 2.995558261871338,
"learning_rate": 5.010526315789475e-06,
"loss": 0.7322,
"step": 7620
},
{
"epoch": 0.19,
"grad_norm": 2.5911152362823486,
"learning_rate": 4.989473684210527e-06,
"loss": 0.7727,
"step": 7630
},
{
"epoch": 0.19,
"grad_norm": 3.3829457759857178,
"learning_rate": 4.968421052631579e-06,
"loss": 0.7178,
"step": 7640
},
{
"epoch": 0.19,
"grad_norm": 5.157157897949219,
"learning_rate": 4.947368421052632e-06,
"loss": 0.7241,
"step": 7650
},
{
"epoch": 0.19,
"grad_norm": 6.205902099609375,
"learning_rate": 4.926315789473685e-06,
"loss": 0.7831,
"step": 7660
},
{
"epoch": 0.19,
"grad_norm": 3.92594051361084,
"learning_rate": 4.905263157894737e-06,
"loss": 0.8057,
"step": 7670
},
{
"epoch": 0.19,
"grad_norm": 4.578032493591309,
"learning_rate": 4.88421052631579e-06,
"loss": 0.8011,
"step": 7680
},
{
"epoch": 0.19,
"grad_norm": 6.8539605140686035,
"learning_rate": 4.863157894736843e-06,
"loss": 0.7792,
"step": 7690
},
{
"epoch": 0.19,
"grad_norm": 7.954685211181641,
"learning_rate": 4.842105263157895e-06,
"loss": 0.6691,
"step": 7700
},
{
"epoch": 0.19,
"grad_norm": 2.0253312587738037,
"learning_rate": 4.821052631578948e-06,
"loss": 0.6483,
"step": 7710
},
{
"epoch": 0.19,
"grad_norm": 8.230294227600098,
"learning_rate": 4.800000000000001e-06,
"loss": 0.8076,
"step": 7720
},
{
"epoch": 0.19,
"grad_norm": 2.5444509983062744,
"learning_rate": 4.778947368421053e-06,
"loss": 0.7902,
"step": 7730
},
{
"epoch": 0.19,
"grad_norm": 1.8759273290634155,
"learning_rate": 4.757894736842106e-06,
"loss": 0.7308,
"step": 7740
},
{
"epoch": 0.19,
"grad_norm": 5.69119930267334,
"learning_rate": 4.736842105263158e-06,
"loss": 0.6605,
"step": 7750
},
{
"epoch": 0.19,
"grad_norm": 7.020988941192627,
"learning_rate": 4.71578947368421e-06,
"loss": 0.7678,
"step": 7760
},
{
"epoch": 0.19,
"grad_norm": 4.7685866355896,
"learning_rate": 4.694736842105264e-06,
"loss": 0.8022,
"step": 7770
},
{
"epoch": 0.19,
"grad_norm": 2.516789436340332,
"learning_rate": 4.6736842105263166e-06,
"loss": 0.6176,
"step": 7780
},
{
"epoch": 0.19,
"grad_norm": 4.267387866973877,
"learning_rate": 4.652631578947368e-06,
"loss": 0.6487,
"step": 7790
},
{
"epoch": 0.2,
"grad_norm": 5.96762228012085,
"learning_rate": 4.631578947368421e-06,
"loss": 0.7066,
"step": 7800
},
{
"epoch": 0.2,
"grad_norm": 4.345110893249512,
"learning_rate": 4.6105263157894745e-06,
"loss": 0.6072,
"step": 7810
},
{
"epoch": 0.2,
"grad_norm": 10.33462142944336,
"learning_rate": 4.589473684210526e-06,
"loss": 0.8211,
"step": 7820
},
{
"epoch": 0.2,
"grad_norm": 4.632289409637451,
"learning_rate": 4.568421052631579e-06,
"loss": 0.8335,
"step": 7830
},
{
"epoch": 0.2,
"grad_norm": 4.453967094421387,
"learning_rate": 4.547368421052632e-06,
"loss": 0.8331,
"step": 7840
},
{
"epoch": 0.2,
"grad_norm": 5.877091407775879,
"learning_rate": 4.526315789473685e-06,
"loss": 0.6793,
"step": 7850
},
{
"epoch": 0.2,
"grad_norm": 16.41980743408203,
"learning_rate": 4.505263157894737e-06,
"loss": 0.819,
"step": 7860
},
{
"epoch": 0.2,
"grad_norm": 3.1915693283081055,
"learning_rate": 4.48421052631579e-06,
"loss": 0.7217,
"step": 7870
},
{
"epoch": 0.2,
"grad_norm": 5.805244445800781,
"learning_rate": 4.463157894736842e-06,
"loss": 0.7146,
"step": 7880
},
{
"epoch": 0.2,
"grad_norm": 2.697472333908081,
"learning_rate": 4.442105263157896e-06,
"loss": 0.6748,
"step": 7890
},
{
"epoch": 0.2,
"grad_norm": 3.6001346111297607,
"learning_rate": 4.4210526315789476e-06,
"loss": 0.6972,
"step": 7900
},
{
"epoch": 0.2,
"grad_norm": 4.912445545196533,
"learning_rate": 4.4e-06,
"loss": 0.7157,
"step": 7910
},
{
"epoch": 0.2,
"grad_norm": 6.9912309646606445,
"learning_rate": 4.378947368421053e-06,
"loss": 0.5927,
"step": 7920
},
{
"epoch": 0.2,
"grad_norm": 4.380290985107422,
"learning_rate": 4.3578947368421055e-06,
"loss": 0.699,
"step": 7930
},
{
"epoch": 0.2,
"grad_norm": 4.024576663970947,
"learning_rate": 4.336842105263158e-06,
"loss": 0.8156,
"step": 7940
},
{
"epoch": 0.2,
"grad_norm": 3.523719310760498,
"learning_rate": 4.315789473684211e-06,
"loss": 0.7827,
"step": 7950
},
{
"epoch": 0.2,
"grad_norm": 10.055171966552734,
"learning_rate": 4.2947368421052635e-06,
"loss": 0.7142,
"step": 7960
},
{
"epoch": 0.2,
"grad_norm": 7.437203407287598,
"learning_rate": 4.273684210526316e-06,
"loss": 0.7184,
"step": 7970
},
{
"epoch": 0.2,
"grad_norm": 2.6910207271575928,
"learning_rate": 4.252631578947369e-06,
"loss": 0.7311,
"step": 7980
},
{
"epoch": 0.2,
"grad_norm": 12.729212760925293,
"learning_rate": 4.2315789473684215e-06,
"loss": 0.7629,
"step": 7990
},
{
"epoch": 0.2,
"grad_norm": 3.817344903945923,
"learning_rate": 4.210526315789474e-06,
"loss": 0.8676,
"step": 8000
},
{
"epoch": 0.2,
"eval_loss": 0.7396635413169861,
"eval_runtime": 67.9126,
"eval_samples_per_second": 14.725,
"eval_steps_per_second": 14.725,
"step": 8000
},
{
"epoch": 0.2,
"grad_norm": 5.193355083465576,
"learning_rate": 4.189473684210527e-06,
"loss": 0.7036,
"step": 8010
},
{
"epoch": 0.2,
"grad_norm": 3.617652177810669,
"learning_rate": 4.1684210526315794e-06,
"loss": 0.6547,
"step": 8020
},
{
"epoch": 0.2,
"grad_norm": 3.48286771774292,
"learning_rate": 4.147368421052632e-06,
"loss": 0.6756,
"step": 8030
},
{
"epoch": 0.2,
"grad_norm": 4.939229965209961,
"learning_rate": 4.126315789473685e-06,
"loss": 0.7157,
"step": 8040
},
{
"epoch": 0.2,
"grad_norm": 14.387231826782227,
"learning_rate": 4.105263157894737e-06,
"loss": 0.8052,
"step": 8050
},
{
"epoch": 0.2,
"grad_norm": 4.042211055755615,
"learning_rate": 4.08421052631579e-06,
"loss": 0.6733,
"step": 8060
},
{
"epoch": 0.2,
"grad_norm": 6.068091869354248,
"learning_rate": 4.063157894736842e-06,
"loss": 0.6172,
"step": 8070
},
{
"epoch": 0.2,
"grad_norm": 5.004486083984375,
"learning_rate": 4.042105263157895e-06,
"loss": 0.7888,
"step": 8080
},
{
"epoch": 0.2,
"grad_norm": 5.651116847991943,
"learning_rate": 4.021052631578948e-06,
"loss": 0.6979,
"step": 8090
},
{
"epoch": 0.2,
"grad_norm": 3.581594944000244,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7654,
"step": 8100
},
{
"epoch": 0.2,
"grad_norm": 2.6030330657958984,
"learning_rate": 3.9789473684210525e-06,
"loss": 0.7946,
"step": 8110
},
{
"epoch": 0.2,
"grad_norm": 5.385477542877197,
"learning_rate": 3.957894736842106e-06,
"loss": 0.7785,
"step": 8120
},
{
"epoch": 0.2,
"grad_norm": 5.688074588775635,
"learning_rate": 3.936842105263159e-06,
"loss": 0.7762,
"step": 8130
},
{
"epoch": 0.2,
"grad_norm": 1.7027924060821533,
"learning_rate": 3.9157894736842104e-06,
"loss": 0.6933,
"step": 8140
},
{
"epoch": 0.2,
"grad_norm": 5.239694118499756,
"learning_rate": 3.894736842105263e-06,
"loss": 0.8061,
"step": 8150
},
{
"epoch": 0.2,
"grad_norm": 4.3939032554626465,
"learning_rate": 3.873684210526316e-06,
"loss": 0.7537,
"step": 8160
},
{
"epoch": 0.2,
"grad_norm": 5.115386962890625,
"learning_rate": 3.852631578947369e-06,
"loss": 0.7025,
"step": 8170
},
{
"epoch": 0.2,
"grad_norm": 4.546750545501709,
"learning_rate": 3.831578947368421e-06,
"loss": 0.7108,
"step": 8180
},
{
"epoch": 0.2,
"grad_norm": 3.043384552001953,
"learning_rate": 3.810526315789474e-06,
"loss": 0.7506,
"step": 8190
},
{
"epoch": 0.2,
"grad_norm": 2.8117778301239014,
"learning_rate": 3.789473684210527e-06,
"loss": 0.773,
"step": 8200
},
{
"epoch": 0.21,
"grad_norm": 6.000233173370361,
"learning_rate": 3.768421052631579e-06,
"loss": 0.6902,
"step": 8210
},
{
"epoch": 0.21,
"grad_norm": 6.7739787101745605,
"learning_rate": 3.7473684210526317e-06,
"loss": 0.6397,
"step": 8220
},
{
"epoch": 0.21,
"grad_norm": 4.948480129241943,
"learning_rate": 3.7263157894736848e-06,
"loss": 0.6185,
"step": 8230
},
{
"epoch": 0.21,
"grad_norm": 4.269702434539795,
"learning_rate": 3.7052631578947374e-06,
"loss": 0.7487,
"step": 8240
},
{
"epoch": 0.21,
"grad_norm": 3.8336634635925293,
"learning_rate": 3.6842105263157896e-06,
"loss": 0.7805,
"step": 8250
},
{
"epoch": 0.21,
"grad_norm": 4.896543979644775,
"learning_rate": 3.6631578947368423e-06,
"loss": 0.645,
"step": 8260
},
{
"epoch": 0.21,
"grad_norm": 6.051191806793213,
"learning_rate": 3.642105263157895e-06,
"loss": 0.7477,
"step": 8270
},
{
"epoch": 0.21,
"grad_norm": 24.540451049804688,
"learning_rate": 3.621052631578948e-06,
"loss": 0.8168,
"step": 8280
},
{
"epoch": 0.21,
"grad_norm": 5.061807155609131,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.727,
"step": 8290
},
{
"epoch": 0.21,
"grad_norm": 2.3907368183135986,
"learning_rate": 3.578947368421053e-06,
"loss": 0.6614,
"step": 8300
},
{
"epoch": 0.21,
"grad_norm": 4.554809093475342,
"learning_rate": 3.5578947368421056e-06,
"loss": 0.6947,
"step": 8310
},
{
"epoch": 0.21,
"grad_norm": 3.7383534908294678,
"learning_rate": 3.536842105263158e-06,
"loss": 0.6171,
"step": 8320
},
{
"epoch": 0.21,
"grad_norm": 4.406937122344971,
"learning_rate": 3.515789473684211e-06,
"loss": 0.6102,
"step": 8330
},
{
"epoch": 0.21,
"grad_norm": 5.226219654083252,
"learning_rate": 3.4947368421052635e-06,
"loss": 0.7746,
"step": 8340
},
{
"epoch": 0.21,
"grad_norm": 6.249040126800537,
"learning_rate": 3.473684210526316e-06,
"loss": 0.7158,
"step": 8350
},
{
"epoch": 0.21,
"grad_norm": 6.806312084197998,
"learning_rate": 3.4526315789473684e-06,
"loss": 0.7249,
"step": 8360
},
{
"epoch": 0.21,
"grad_norm": 2.993473529815674,
"learning_rate": 3.4315789473684215e-06,
"loss": 0.826,
"step": 8370
},
{
"epoch": 0.21,
"grad_norm": 4.120741367340088,
"learning_rate": 3.410526315789474e-06,
"loss": 0.6238,
"step": 8380
},
{
"epoch": 0.21,
"grad_norm": 4.020960807800293,
"learning_rate": 3.3894736842105264e-06,
"loss": 0.6749,
"step": 8390
},
{
"epoch": 0.21,
"grad_norm": 6.000002384185791,
"learning_rate": 3.368421052631579e-06,
"loss": 0.7652,
"step": 8400
},
{
"epoch": 0.21,
"grad_norm": 8.221445083618164,
"learning_rate": 3.347368421052632e-06,
"loss": 0.7781,
"step": 8410
},
{
"epoch": 0.21,
"grad_norm": 5.850223541259766,
"learning_rate": 3.3263157894736848e-06,
"loss": 0.7555,
"step": 8420
},
{
"epoch": 0.21,
"grad_norm": 2.249915838241577,
"learning_rate": 3.305263157894737e-06,
"loss": 0.7305,
"step": 8430
},
{
"epoch": 0.21,
"grad_norm": 4.955141067504883,
"learning_rate": 3.2842105263157897e-06,
"loss": 0.6817,
"step": 8440
},
{
"epoch": 0.21,
"grad_norm": 2.4711403846740723,
"learning_rate": 3.2631578947368423e-06,
"loss": 0.683,
"step": 8450
},
{
"epoch": 0.21,
"grad_norm": 5.367486953735352,
"learning_rate": 3.2421052631578945e-06,
"loss": 0.6494,
"step": 8460
},
{
"epoch": 0.21,
"grad_norm": 3.283465623855591,
"learning_rate": 3.2210526315789476e-06,
"loss": 0.6092,
"step": 8470
},
{
"epoch": 0.21,
"grad_norm": 4.473137855529785,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.676,
"step": 8480
},
{
"epoch": 0.21,
"grad_norm": 3.177180528640747,
"learning_rate": 3.178947368421053e-06,
"loss": 0.6685,
"step": 8490
},
{
"epoch": 0.21,
"grad_norm": 4.735683441162109,
"learning_rate": 3.157894736842105e-06,
"loss": 0.7544,
"step": 8500
},
{
"epoch": 0.21,
"eval_loss": 0.7582711577415466,
"eval_runtime": 67.8631,
"eval_samples_per_second": 14.736,
"eval_steps_per_second": 14.736,
"step": 8500
},
{
"epoch": 0.21,
"grad_norm": 4.465471267700195,
"learning_rate": 3.1368421052631582e-06,
"loss": 0.8191,
"step": 8510
},
{
"epoch": 0.21,
"grad_norm": 3.8849751949310303,
"learning_rate": 3.115789473684211e-06,
"loss": 0.7078,
"step": 8520
},
{
"epoch": 0.21,
"grad_norm": 5.555447101593018,
"learning_rate": 3.094736842105263e-06,
"loss": 0.7332,
"step": 8530
},
{
"epoch": 0.21,
"grad_norm": 4.269344806671143,
"learning_rate": 3.0736842105263158e-06,
"loss": 0.7619,
"step": 8540
},
{
"epoch": 0.21,
"grad_norm": 5.792567729949951,
"learning_rate": 3.052631578947369e-06,
"loss": 0.6858,
"step": 8550
},
{
"epoch": 0.21,
"grad_norm": 4.095942974090576,
"learning_rate": 3.0315789473684215e-06,
"loss": 0.7793,
"step": 8560
},
{
"epoch": 0.21,
"grad_norm": 3.316791296005249,
"learning_rate": 3.0105263157894737e-06,
"loss": 0.666,
"step": 8570
},
{
"epoch": 0.21,
"grad_norm": 4.55336332321167,
"learning_rate": 2.9894736842105264e-06,
"loss": 0.7723,
"step": 8580
},
{
"epoch": 0.21,
"grad_norm": 7.5306315422058105,
"learning_rate": 2.9684210526315795e-06,
"loss": 0.7283,
"step": 8590
},
{
"epoch": 0.21,
"grad_norm": 3.935115337371826,
"learning_rate": 2.9473684210526317e-06,
"loss": 0.7843,
"step": 8600
},
{
"epoch": 0.22,
"grad_norm": 5.173915863037109,
"learning_rate": 2.9263157894736844e-06,
"loss": 0.6662,
"step": 8610
},
{
"epoch": 0.22,
"grad_norm": 3.5214264392852783,
"learning_rate": 2.905263157894737e-06,
"loss": 0.6887,
"step": 8620
},
{
"epoch": 0.22,
"grad_norm": 4.139004707336426,
"learning_rate": 2.88421052631579e-06,
"loss": 0.6778,
"step": 8630
},
{
"epoch": 0.22,
"grad_norm": 4.185042381286621,
"learning_rate": 2.8631578947368423e-06,
"loss": 0.9094,
"step": 8640
},
{
"epoch": 0.22,
"grad_norm": 3.3607513904571533,
"learning_rate": 2.842105263157895e-06,
"loss": 0.7918,
"step": 8650
},
{
"epoch": 0.22,
"grad_norm": 5.062870502471924,
"learning_rate": 2.8210526315789476e-06,
"loss": 0.7694,
"step": 8660
},
{
"epoch": 0.22,
"grad_norm": 5.099003791809082,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.7301,
"step": 8670
},
{
"epoch": 0.22,
"grad_norm": 5.512063026428223,
"learning_rate": 2.7789473684210525e-06,
"loss": 0.7887,
"step": 8680
},
{
"epoch": 0.22,
"grad_norm": 3.625652551651001,
"learning_rate": 2.7578947368421056e-06,
"loss": 0.7781,
"step": 8690
},
{
"epoch": 0.22,
"grad_norm": 2.8921008110046387,
"learning_rate": 2.7368421052631583e-06,
"loss": 0.7582,
"step": 8700
},
{
"epoch": 0.22,
"grad_norm": 10.71945571899414,
"learning_rate": 2.7157894736842105e-06,
"loss": 0.7234,
"step": 8710
},
{
"epoch": 0.22,
"grad_norm": 17.737136840820312,
"learning_rate": 2.694736842105263e-06,
"loss": 0.6298,
"step": 8720
},
{
"epoch": 0.22,
"grad_norm": 9.8464994430542,
"learning_rate": 2.6736842105263162e-06,
"loss": 0.7856,
"step": 8730
},
{
"epoch": 0.22,
"grad_norm": 7.925550937652588,
"learning_rate": 2.652631578947369e-06,
"loss": 0.8387,
"step": 8740
},
{
"epoch": 0.22,
"grad_norm": 3.530381441116333,
"learning_rate": 2.631578947368421e-06,
"loss": 0.8223,
"step": 8750
},
{
"epoch": 0.22,
"grad_norm": 6.403299808502197,
"learning_rate": 2.6105263157894738e-06,
"loss": 0.8079,
"step": 8760
},
{
"epoch": 0.22,
"grad_norm": 5.1753740310668945,
"learning_rate": 2.589473684210527e-06,
"loss": 0.7888,
"step": 8770
},
{
"epoch": 0.22,
"grad_norm": 2.760190725326538,
"learning_rate": 2.568421052631579e-06,
"loss": 0.7071,
"step": 8780
},
{
"epoch": 0.22,
"grad_norm": 5.183119297027588,
"learning_rate": 2.5473684210526317e-06,
"loss": 0.619,
"step": 8790
},
{
"epoch": 0.22,
"grad_norm": 5.66708517074585,
"learning_rate": 2.5263157894736844e-06,
"loss": 0.7888,
"step": 8800
},
{
"epoch": 0.22,
"grad_norm": 2.3660988807678223,
"learning_rate": 2.5052631578947375e-06,
"loss": 0.7466,
"step": 8810
},
{
"epoch": 0.22,
"grad_norm": 3.8384206295013428,
"learning_rate": 2.4842105263157897e-06,
"loss": 0.7371,
"step": 8820
},
{
"epoch": 0.22,
"grad_norm": 3.593717336654663,
"learning_rate": 2.4631578947368424e-06,
"loss": 0.5967,
"step": 8830
},
{
"epoch": 0.22,
"grad_norm": 2.778346538543701,
"learning_rate": 2.442105263157895e-06,
"loss": 0.6407,
"step": 8840
},
{
"epoch": 0.22,
"grad_norm": 10.841148376464844,
"learning_rate": 2.4210526315789477e-06,
"loss": 0.8172,
"step": 8850
},
{
"epoch": 0.22,
"grad_norm": 2.635694980621338,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.8135,
"step": 8860
},
{
"epoch": 0.22,
"grad_norm": 1.5510995388031006,
"learning_rate": 2.378947368421053e-06,
"loss": 0.8328,
"step": 8870
},
{
"epoch": 0.22,
"grad_norm": 3.770972967147827,
"learning_rate": 2.357894736842105e-06,
"loss": 0.6642,
"step": 8880
},
{
"epoch": 0.22,
"grad_norm": 5.756451606750488,
"learning_rate": 2.3368421052631583e-06,
"loss": 0.7484,
"step": 8890
},
{
"epoch": 0.22,
"grad_norm": 2.9202377796173096,
"learning_rate": 2.3157894736842105e-06,
"loss": 0.7381,
"step": 8900
},
{
"epoch": 0.22,
"grad_norm": 4.43782377243042,
"learning_rate": 2.294736842105263e-06,
"loss": 0.7915,
"step": 8910
},
{
"epoch": 0.22,
"grad_norm": 20.496152877807617,
"learning_rate": 2.273684210526316e-06,
"loss": 0.6872,
"step": 8920
},
{
"epoch": 0.22,
"grad_norm": 3.2591583728790283,
"learning_rate": 2.2526315789473685e-06,
"loss": 0.668,
"step": 8930
},
{
"epoch": 0.22,
"grad_norm": 2.23056960105896,
"learning_rate": 2.231578947368421e-06,
"loss": 0.6229,
"step": 8940
},
{
"epoch": 0.22,
"grad_norm": 5.419168949127197,
"learning_rate": 2.2105263157894738e-06,
"loss": 0.9534,
"step": 8950
},
{
"epoch": 0.22,
"grad_norm": 15.681089401245117,
"learning_rate": 2.1894736842105264e-06,
"loss": 0.782,
"step": 8960
},
{
"epoch": 0.22,
"grad_norm": 3.7693331241607666,
"learning_rate": 2.168421052631579e-06,
"loss": 0.8047,
"step": 8970
},
{
"epoch": 0.22,
"grad_norm": 3.4705393314361572,
"learning_rate": 2.1473684210526317e-06,
"loss": 0.7832,
"step": 8980
},
{
"epoch": 0.22,
"grad_norm": 4.295872688293457,
"learning_rate": 2.1263157894736844e-06,
"loss": 0.7355,
"step": 8990
},
{
"epoch": 0.23,
"grad_norm": 3.0480620861053467,
"learning_rate": 2.105263157894737e-06,
"loss": 0.6739,
"step": 9000
},
{
"epoch": 0.23,
"eval_loss": 0.7442497611045837,
"eval_runtime": 67.8767,
"eval_samples_per_second": 14.733,
"eval_steps_per_second": 14.733,
"step": 9000
},
{
"epoch": 0.23,
"grad_norm": 2.9723927974700928,
"learning_rate": 2.0842105263157897e-06,
"loss": 0.7003,
"step": 9010
},
{
"epoch": 0.23,
"grad_norm": 2.0932421684265137,
"learning_rate": 2.0631578947368424e-06,
"loss": 0.6897,
"step": 9020
},
{
"epoch": 0.23,
"grad_norm": 4.70625114440918,
"learning_rate": 2.042105263157895e-06,
"loss": 0.8106,
"step": 9030
},
{
"epoch": 0.23,
"grad_norm": 3.2763564586639404,
"learning_rate": 2.0210526315789477e-06,
"loss": 0.7387,
"step": 9040
},
{
"epoch": 0.23,
"grad_norm": 4.553431034088135,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7435,
"step": 9050
},
{
"epoch": 0.23,
"grad_norm": 5.36479377746582,
"learning_rate": 1.978947368421053e-06,
"loss": 0.7713,
"step": 9060
},
{
"epoch": 0.23,
"grad_norm": 4.923874855041504,
"learning_rate": 1.9578947368421052e-06,
"loss": 0.5508,
"step": 9070
},
{
"epoch": 0.23,
"grad_norm": 8.63404655456543,
"learning_rate": 1.936842105263158e-06,
"loss": 0.7323,
"step": 9080
},
{
"epoch": 0.23,
"grad_norm": 5.521135330200195,
"learning_rate": 1.9157894736842105e-06,
"loss": 0.699,
"step": 9090
},
{
"epoch": 0.23,
"grad_norm": 9.009405136108398,
"learning_rate": 1.8947368421052634e-06,
"loss": 0.789,
"step": 9100
},
{
"epoch": 0.23,
"grad_norm": 12.834007263183594,
"learning_rate": 1.8736842105263158e-06,
"loss": 0.7382,
"step": 9110
},
{
"epoch": 0.23,
"grad_norm": 3.753262758255005,
"learning_rate": 1.8526315789473687e-06,
"loss": 0.7035,
"step": 9120
},
{
"epoch": 0.23,
"grad_norm": 3.300708770751953,
"learning_rate": 1.8315789473684211e-06,
"loss": 0.7558,
"step": 9130
},
{
"epoch": 0.23,
"grad_norm": 4.416452884674072,
"learning_rate": 1.810526315789474e-06,
"loss": 0.6854,
"step": 9140
},
{
"epoch": 0.23,
"grad_norm": 7.664788722991943,
"learning_rate": 1.7894736842105265e-06,
"loss": 0.6951,
"step": 9150
},
{
"epoch": 0.23,
"grad_norm": 3.646073818206787,
"learning_rate": 1.768421052631579e-06,
"loss": 0.7472,
"step": 9160
},
{
"epoch": 0.23,
"grad_norm": 3.125991106033325,
"learning_rate": 1.7473684210526318e-06,
"loss": 0.6711,
"step": 9170
},
{
"epoch": 0.23,
"grad_norm": 5.308753967285156,
"learning_rate": 1.7263157894736842e-06,
"loss": 0.6393,
"step": 9180
},
{
"epoch": 0.23,
"grad_norm": 11.79830265045166,
"learning_rate": 1.705263157894737e-06,
"loss": 0.7358,
"step": 9190
},
{
"epoch": 0.23,
"grad_norm": 6.862399101257324,
"learning_rate": 1.6842105263157895e-06,
"loss": 0.8422,
"step": 9200
},
{
"epoch": 0.23,
"grad_norm": 5.3199968338012695,
"learning_rate": 1.6631578947368424e-06,
"loss": 0.6999,
"step": 9210
},
{
"epoch": 0.23,
"grad_norm": 3.263275146484375,
"learning_rate": 1.6421052631578948e-06,
"loss": 0.7122,
"step": 9220
},
{
"epoch": 0.23,
"grad_norm": 4.283051490783691,
"learning_rate": 1.6210526315789473e-06,
"loss": 0.7793,
"step": 9230
},
{
"epoch": 0.23,
"grad_norm": 2.0055785179138184,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.732,
"step": 9240
},
{
"epoch": 0.23,
"grad_norm": 4.184137344360352,
"learning_rate": 1.5789473684210526e-06,
"loss": 0.7339,
"step": 9250
},
{
"epoch": 0.23,
"grad_norm": 3.587636709213257,
"learning_rate": 1.5578947368421054e-06,
"loss": 0.8473,
"step": 9260
},
{
"epoch": 0.23,
"grad_norm": 8.189043045043945,
"learning_rate": 1.5368421052631579e-06,
"loss": 0.6498,
"step": 9270
},
{
"epoch": 0.23,
"grad_norm": 3.4272284507751465,
"learning_rate": 1.5157894736842108e-06,
"loss": 0.7676,
"step": 9280
},
{
"epoch": 0.23,
"grad_norm": 3.280287027359009,
"learning_rate": 1.4947368421052632e-06,
"loss": 0.6283,
"step": 9290
},
{
"epoch": 0.23,
"grad_norm": 8.722474098205566,
"learning_rate": 1.4736842105263159e-06,
"loss": 0.7555,
"step": 9300
},
{
"epoch": 0.23,
"grad_norm": 4.574818134307861,
"learning_rate": 1.4526315789473685e-06,
"loss": 0.7481,
"step": 9310
},
{
"epoch": 0.23,
"grad_norm": 3.0097527503967285,
"learning_rate": 1.4315789473684212e-06,
"loss": 0.6181,
"step": 9320
},
{
"epoch": 0.23,
"grad_norm": 6.725505352020264,
"learning_rate": 1.4105263157894738e-06,
"loss": 0.677,
"step": 9330
},
{
"epoch": 0.23,
"grad_norm": 2.934959888458252,
"learning_rate": 1.3894736842105263e-06,
"loss": 0.6932,
"step": 9340
},
{
"epoch": 0.23,
"grad_norm": 2.7491650581359863,
"learning_rate": 1.3684210526315791e-06,
"loss": 0.7361,
"step": 9350
},
{
"epoch": 0.23,
"grad_norm": 4.734315872192383,
"learning_rate": 1.3473684210526316e-06,
"loss": 0.6442,
"step": 9360
},
{
"epoch": 0.23,
"grad_norm": 4.301790714263916,
"learning_rate": 1.3263157894736844e-06,
"loss": 0.7642,
"step": 9370
},
{
"epoch": 0.23,
"grad_norm": 4.042958736419678,
"learning_rate": 1.3052631578947369e-06,
"loss": 0.7974,
"step": 9380
},
{
"epoch": 0.23,
"grad_norm": 4.941096782684326,
"learning_rate": 1.2842105263157895e-06,
"loss": 0.8603,
"step": 9390
},
{
"epoch": 0.23,
"grad_norm": 4.379117488861084,
"learning_rate": 1.2631578947368422e-06,
"loss": 0.8297,
"step": 9400
},
{
"epoch": 0.24,
"grad_norm": 6.3129048347473145,
"learning_rate": 1.2421052631578948e-06,
"loss": 0.7783,
"step": 9410
},
{
"epoch": 0.24,
"grad_norm": 5.5439133644104,
"learning_rate": 1.2210526315789475e-06,
"loss": 0.8122,
"step": 9420
},
{
"epoch": 0.24,
"grad_norm": 6.480744361877441,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.7779,
"step": 9430
},
{
"epoch": 0.24,
"grad_norm": 5.862485408782959,
"learning_rate": 1.1789473684210526e-06,
"loss": 0.6917,
"step": 9440
},
{
"epoch": 0.24,
"grad_norm": 5.7247443199157715,
"learning_rate": 1.1578947368421053e-06,
"loss": 0.7017,
"step": 9450
},
{
"epoch": 0.24,
"grad_norm": 8.194451332092285,
"learning_rate": 1.136842105263158e-06,
"loss": 0.7031,
"step": 9460
},
{
"epoch": 0.24,
"grad_norm": 8.057929992675781,
"learning_rate": 1.1157894736842106e-06,
"loss": 0.7116,
"step": 9470
},
{
"epoch": 0.24,
"grad_norm": 4.529337406158447,
"learning_rate": 1.0947368421052632e-06,
"loss": 0.8314,
"step": 9480
},
{
"epoch": 0.24,
"grad_norm": 7.412846565246582,
"learning_rate": 1.0736842105263159e-06,
"loss": 0.6448,
"step": 9490
},
{
"epoch": 0.24,
"grad_norm": 3.7076497077941895,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.6291,
"step": 9500
},
{
"epoch": 0.24,
"eval_loss": 0.7395394444465637,
"eval_runtime": 67.8841,
"eval_samples_per_second": 14.731,
"eval_steps_per_second": 14.731,
"step": 9500
},
{
"epoch": 0.24,
"grad_norm": 4.488115310668945,
"learning_rate": 1.0315789473684212e-06,
"loss": 0.8611,
"step": 9510
},
{
"epoch": 0.24,
"grad_norm": 1.6314383745193481,
"learning_rate": 1.0105263157894738e-06,
"loss": 0.7694,
"step": 9520
},
{
"epoch": 0.24,
"grad_norm": 5.290372848510742,
"learning_rate": 9.894736842105265e-07,
"loss": 0.7166,
"step": 9530
},
{
"epoch": 0.24,
"grad_norm": 3.1572625637054443,
"learning_rate": 9.68421052631579e-07,
"loss": 0.7649,
"step": 9540
},
{
"epoch": 0.24,
"grad_norm": 4.951930999755859,
"learning_rate": 9.473684210526317e-07,
"loss": 0.7057,
"step": 9550
},
{
"epoch": 0.24,
"grad_norm": 4.696636199951172,
"learning_rate": 9.263157894736844e-07,
"loss": 0.7853,
"step": 9560
},
{
"epoch": 0.24,
"grad_norm": 4.211262226104736,
"learning_rate": 9.05263157894737e-07,
"loss": 0.6612,
"step": 9570
},
{
"epoch": 0.24,
"grad_norm": 4.584897041320801,
"learning_rate": 8.842105263157895e-07,
"loss": 0.6393,
"step": 9580
},
{
"epoch": 0.24,
"grad_norm": 4.64282751083374,
"learning_rate": 8.631578947368421e-07,
"loss": 0.7915,
"step": 9590
},
{
"epoch": 0.24,
"grad_norm": 3.691389799118042,
"learning_rate": 8.421052631578948e-07,
"loss": 0.659,
"step": 9600
},
{
"epoch": 0.24,
"grad_norm": 4.740243911743164,
"learning_rate": 8.210526315789474e-07,
"loss": 0.7134,
"step": 9610
},
{
"epoch": 0.24,
"grad_norm": 6.811493873596191,
"learning_rate": 8.000000000000001e-07,
"loss": 0.8592,
"step": 9620
},
{
"epoch": 0.24,
"grad_norm": 3.2056334018707275,
"learning_rate": 7.789473684210527e-07,
"loss": 0.6753,
"step": 9630
},
{
"epoch": 0.24,
"grad_norm": 4.347885608673096,
"learning_rate": 7.578947368421054e-07,
"loss": 0.7476,
"step": 9640
},
{
"epoch": 0.24,
"grad_norm": 5.63771915435791,
"learning_rate": 7.368421052631579e-07,
"loss": 0.7649,
"step": 9650
},
{
"epoch": 0.24,
"grad_norm": 3.062124013900757,
"learning_rate": 7.157894736842106e-07,
"loss": 0.6792,
"step": 9660
},
{
"epoch": 0.24,
"grad_norm": 9.334321022033691,
"learning_rate": 6.947368421052631e-07,
"loss": 0.7626,
"step": 9670
},
{
"epoch": 0.24,
"grad_norm": 7.429685115814209,
"learning_rate": 6.736842105263158e-07,
"loss": 0.6943,
"step": 9680
},
{
"epoch": 0.24,
"grad_norm": 4.459277629852295,
"learning_rate": 6.526315789473684e-07,
"loss": 0.7838,
"step": 9690
},
{
"epoch": 0.24,
"grad_norm": 6.821927070617676,
"learning_rate": 6.315789473684211e-07,
"loss": 0.7103,
"step": 9700
},
{
"epoch": 0.24,
"grad_norm": 10.438909530639648,
"learning_rate": 6.105263157894738e-07,
"loss": 0.7509,
"step": 9710
},
{
"epoch": 0.24,
"grad_norm": 11.55811882019043,
"learning_rate": 5.894736842105263e-07,
"loss": 0.7623,
"step": 9720
},
{
"epoch": 0.24,
"grad_norm": 3.1809043884277344,
"learning_rate": 5.68421052631579e-07,
"loss": 0.6294,
"step": 9730
},
{
"epoch": 0.24,
"grad_norm": 5.337337970733643,
"learning_rate": 5.473684210526316e-07,
"loss": 0.763,
"step": 9740
},
{
"epoch": 0.24,
"grad_norm": 8.130523681640625,
"learning_rate": 5.263157894736843e-07,
"loss": 0.6404,
"step": 9750
},
{
"epoch": 0.24,
"grad_norm": 4.213668346405029,
"learning_rate": 5.052631578947369e-07,
"loss": 0.7379,
"step": 9760
},
{
"epoch": 0.24,
"grad_norm": 3.8605246543884277,
"learning_rate": 4.842105263157895e-07,
"loss": 0.7483,
"step": 9770
},
{
"epoch": 0.24,
"grad_norm": 4.358519077301025,
"learning_rate": 4.631578947368422e-07,
"loss": 0.6823,
"step": 9780
},
{
"epoch": 0.24,
"grad_norm": 2.9712955951690674,
"learning_rate": 4.421052631578947e-07,
"loss": 0.679,
"step": 9790
},
{
"epoch": 0.24,
"grad_norm": 6.285613059997559,
"learning_rate": 4.210526315789474e-07,
"loss": 0.7763,
"step": 9800
},
{
"epoch": 0.25,
"grad_norm": 2.434277296066284,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.8558,
"step": 9810
},
{
"epoch": 0.25,
"grad_norm": 7.880703449249268,
"learning_rate": 3.789473684210527e-07,
"loss": 0.7494,
"step": 9820
},
{
"epoch": 0.25,
"grad_norm": 11.698799133300781,
"learning_rate": 3.578947368421053e-07,
"loss": 0.6576,
"step": 9830
},
{
"epoch": 0.25,
"grad_norm": 3.2752954959869385,
"learning_rate": 3.368421052631579e-07,
"loss": 0.6494,
"step": 9840
},
{
"epoch": 0.25,
"grad_norm": 2.878567934036255,
"learning_rate": 3.1578947368421055e-07,
"loss": 0.6781,
"step": 9850
},
{
"epoch": 0.25,
"grad_norm": 3.6086246967315674,
"learning_rate": 2.9473684210526315e-07,
"loss": 0.7339,
"step": 9860
},
{
"epoch": 0.25,
"grad_norm": 5.403782844543457,
"learning_rate": 2.736842105263158e-07,
"loss": 0.7738,
"step": 9870
},
{
"epoch": 0.25,
"grad_norm": 4.487565994262695,
"learning_rate": 2.5263157894736846e-07,
"loss": 0.8165,
"step": 9880
},
{
"epoch": 0.25,
"grad_norm": 4.29118537902832,
"learning_rate": 2.315789473684211e-07,
"loss": 0.6272,
"step": 9890
},
{
"epoch": 0.25,
"grad_norm": 3.634309768676758,
"learning_rate": 2.105263157894737e-07,
"loss": 0.6641,
"step": 9900
},
{
"epoch": 0.25,
"grad_norm": 4.989073276519775,
"learning_rate": 1.8947368421052634e-07,
"loss": 0.7111,
"step": 9910
},
{
"epoch": 0.25,
"grad_norm": 5.606556415557861,
"learning_rate": 1.6842105263157895e-07,
"loss": 0.6112,
"step": 9920
},
{
"epoch": 0.25,
"grad_norm": 5.012443542480469,
"learning_rate": 1.4736842105263158e-07,
"loss": 0.6684,
"step": 9930
},
{
"epoch": 0.25,
"grad_norm": 6.287766933441162,
"learning_rate": 1.2631578947368423e-07,
"loss": 0.6687,
"step": 9940
},
{
"epoch": 0.25,
"grad_norm": 3.646402597427368,
"learning_rate": 1.0526315789473685e-07,
"loss": 0.6452,
"step": 9950
},
{
"epoch": 0.25,
"grad_norm": 7.9046950340271,
"learning_rate": 8.421052631578947e-08,
"loss": 0.7636,
"step": 9960
},
{
"epoch": 0.25,
"grad_norm": 4.733578681945801,
"learning_rate": 6.315789473684211e-08,
"loss": 0.6619,
"step": 9970
},
{
"epoch": 0.25,
"grad_norm": 2.342442274093628,
"learning_rate": 4.2105263157894737e-08,
"loss": 0.74,
"step": 9980
},
{
"epoch": 0.25,
"grad_norm": 4.0832839012146,
"learning_rate": 2.1052631578947368e-08,
"loss": 0.7314,
"step": 9990
},
{
"epoch": 0.25,
"grad_norm": 2.517941951751709,
"learning_rate": 0.0,
"loss": 0.755,
"step": 10000
},
{
"epoch": 0.25,
"eval_loss": 0.7402730584144592,
"eval_runtime": 67.899,
"eval_samples_per_second": 14.728,
"eval_steps_per_second": 14.728,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2500,
"total_flos": 1.6102125993984e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}