{ "best_metric": 0.7402730584144592, "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-10000", "epoch": 0.25, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.086414337158203, "learning_rate": 4.0000000000000003e-07, "loss": 0.7892, "step": 10 }, { "epoch": 0.0, "grad_norm": 8.478134155273438, "learning_rate": 8.000000000000001e-07, "loss": 0.7746, "step": 20 }, { "epoch": 0.0, "grad_norm": 5.574502468109131, "learning_rate": 1.2000000000000002e-06, "loss": 0.8222, "step": 30 }, { "epoch": 0.0, "grad_norm": 2.6497371196746826, "learning_rate": 1.6000000000000001e-06, "loss": 0.7423, "step": 40 }, { "epoch": 0.0, "grad_norm": 3.116753339767456, "learning_rate": 2.0000000000000003e-06, "loss": 0.7622, "step": 50 }, { "epoch": 0.0, "grad_norm": 3.179832696914673, "learning_rate": 2.4000000000000003e-06, "loss": 0.8183, "step": 60 }, { "epoch": 0.0, "grad_norm": 3.9869463443756104, "learning_rate": 2.8000000000000003e-06, "loss": 0.822, "step": 70 }, { "epoch": 0.0, "grad_norm": 5.093494415283203, "learning_rate": 3.2000000000000003e-06, "loss": 0.7966, "step": 80 }, { "epoch": 0.0, "grad_norm": 5.230633735656738, "learning_rate": 3.6000000000000003e-06, "loss": 0.8113, "step": 90 }, { "epoch": 0.0, "grad_norm": 9.374403953552246, "learning_rate": 4.000000000000001e-06, "loss": 0.7582, "step": 100 }, { "epoch": 0.0, "grad_norm": 6.465492248535156, "learning_rate": 4.4e-06, "loss": 0.7662, "step": 110 }, { "epoch": 0.0, "grad_norm": 6.279934883117676, "learning_rate": 4.800000000000001e-06, "loss": 0.8376, "step": 120 }, { "epoch": 0.0, "grad_norm": 5.799221992492676, "learning_rate": 5.2e-06, "loss": 0.7965, "step": 130 }, { "epoch": 0.0, "grad_norm": 3.222240686416626, "learning_rate": 5.600000000000001e-06, "loss": 0.8855, "step": 140 }, { "epoch": 0.0, "grad_norm": 9.009174346923828, "learning_rate": 6e-06, "loss": 0.8394, "step": 150 }, { "epoch": 0.0, "grad_norm": 8.040350914001465, "learning_rate": 6.4000000000000006e-06, "loss": 0.8426, "step": 160 }, { "epoch": 0.0, "grad_norm": 4.131030559539795, "learning_rate": 6.800000000000001e-06, "loss": 0.7747, "step": 170 }, { "epoch": 0.0, "grad_norm": 3.31986927986145, "learning_rate": 7.2000000000000005e-06, "loss": 0.7125, "step": 180 }, { "epoch": 0.0, "grad_norm": 5.7623395919799805, "learning_rate": 7.600000000000001e-06, "loss": 0.7854, "step": 190 }, { "epoch": 0.01, "grad_norm": 10.848206520080566, "learning_rate": 8.000000000000001e-06, "loss": 0.7756, "step": 200 }, { "epoch": 0.01, "grad_norm": 13.455166816711426, "learning_rate": 8.400000000000001e-06, "loss": 0.7894, "step": 210 }, { "epoch": 0.01, "grad_norm": 12.759767532348633, "learning_rate": 8.8e-06, "loss": 0.7454, "step": 220 }, { "epoch": 0.01, "grad_norm": 4.262899875640869, "learning_rate": 9.200000000000002e-06, "loss": 0.8555, "step": 230 }, { "epoch": 0.01, "grad_norm": 4.28985071182251, "learning_rate": 9.600000000000001e-06, "loss": 0.6845, "step": 240 }, { "epoch": 0.01, "grad_norm": 4.174241542816162, "learning_rate": 1e-05, "loss": 0.7983, "step": 250 }, { "epoch": 0.01, "grad_norm": 12.931599617004395, "learning_rate": 1.04e-05, "loss": 0.9041, "step": 260 }, { "epoch": 0.01, "grad_norm": 7.004627227783203, "learning_rate": 1.0800000000000002e-05, "loss": 0.817, "step": 270 }, { "epoch": 0.01, "grad_norm": 3.6102757453918457, "learning_rate": 1.1200000000000001e-05, "loss": 0.7292, "step": 280 }, { "epoch": 0.01, "grad_norm": 2.764902353286743, "learning_rate": 1.16e-05, "loss": 0.9042, "step": 290 }, { "epoch": 0.01, "grad_norm": 3.958317995071411, "learning_rate": 1.2e-05, "loss": 0.7539, "step": 300 }, { "epoch": 0.01, "grad_norm": 7.098923683166504, "learning_rate": 1.2400000000000002e-05, "loss": 0.7955, "step": 310 }, { "epoch": 0.01, "grad_norm": 12.129098892211914, "learning_rate": 1.2800000000000001e-05, "loss": 0.849, "step": 320 }, { "epoch": 0.01, "grad_norm": 2.054119825363159, "learning_rate": 1.3200000000000002e-05, "loss": 0.8645, "step": 330 }, { "epoch": 0.01, "grad_norm": 5.205028057098389, "learning_rate": 1.3600000000000002e-05, "loss": 0.8175, "step": 340 }, { "epoch": 0.01, "grad_norm": 2.614790439605713, "learning_rate": 1.4e-05, "loss": 0.8998, "step": 350 }, { "epoch": 0.01, "grad_norm": 2.9891204833984375, "learning_rate": 1.4400000000000001e-05, "loss": 0.8108, "step": 360 }, { "epoch": 0.01, "grad_norm": 4.152099609375, "learning_rate": 1.48e-05, "loss": 0.7855, "step": 370 }, { "epoch": 0.01, "grad_norm": 9.833850860595703, "learning_rate": 1.5200000000000002e-05, "loss": 0.7736, "step": 380 }, { "epoch": 0.01, "grad_norm": 3.849621295928955, "learning_rate": 1.5600000000000003e-05, "loss": 0.7668, "step": 390 }, { "epoch": 0.01, "grad_norm": 5.4542975425720215, "learning_rate": 1.6000000000000003e-05, "loss": 0.7781, "step": 400 }, { "epoch": 0.01, "grad_norm": 6.197661876678467, "learning_rate": 1.64e-05, "loss": 0.8654, "step": 410 }, { "epoch": 0.01, "grad_norm": 3.2606770992279053, "learning_rate": 1.6800000000000002e-05, "loss": 0.7565, "step": 420 }, { "epoch": 0.01, "grad_norm": 3.9680209159851074, "learning_rate": 1.72e-05, "loss": 0.7886, "step": 430 }, { "epoch": 0.01, "grad_norm": 18.749984741210938, "learning_rate": 1.76e-05, "loss": 0.7305, "step": 440 }, { "epoch": 0.01, "grad_norm": 5.822000503540039, "learning_rate": 1.8e-05, "loss": 0.7833, "step": 450 }, { "epoch": 0.01, "grad_norm": 12.999715805053711, "learning_rate": 1.8400000000000003e-05, "loss": 0.8483, "step": 460 }, { "epoch": 0.01, "grad_norm": 7.193736553192139, "learning_rate": 1.88e-05, "loss": 0.84, "step": 470 }, { "epoch": 0.01, "grad_norm": 12.573124885559082, "learning_rate": 1.9200000000000003e-05, "loss": 0.8437, "step": 480 }, { "epoch": 0.01, "grad_norm": 4.4221601486206055, "learning_rate": 1.9600000000000002e-05, "loss": 0.6836, "step": 490 }, { "epoch": 0.01, "grad_norm": 3.0399410724639893, "learning_rate": 2e-05, "loss": 0.8264, "step": 500 }, { "epoch": 0.01, "eval_loss": 0.8175864219665527, "eval_runtime": 67.7802, "eval_samples_per_second": 14.754, "eval_steps_per_second": 14.754, "step": 500 }, { "epoch": 0.01, "grad_norm": 3.971303701400757, "learning_rate": 1.9978947368421054e-05, "loss": 0.7385, "step": 510 }, { "epoch": 0.01, "grad_norm": 3.8043839931488037, "learning_rate": 1.9957894736842107e-05, "loss": 0.7826, "step": 520 }, { "epoch": 0.01, "grad_norm": 11.702253341674805, "learning_rate": 1.993684210526316e-05, "loss": 0.7971, "step": 530 }, { "epoch": 0.01, "grad_norm": 5.176826000213623, "learning_rate": 1.9915789473684212e-05, "loss": 0.748, "step": 540 }, { "epoch": 0.01, "grad_norm": 7.120133876800537, "learning_rate": 1.9894736842105265e-05, "loss": 0.8461, "step": 550 }, { "epoch": 0.01, "grad_norm": 12.286151885986328, "learning_rate": 1.9873684210526318e-05, "loss": 0.8335, "step": 560 }, { "epoch": 0.01, "grad_norm": 7.857172966003418, "learning_rate": 1.985263157894737e-05, "loss": 0.7231, "step": 570 }, { "epoch": 0.01, "grad_norm": 5.327859401702881, "learning_rate": 1.9831578947368423e-05, "loss": 0.877, "step": 580 }, { "epoch": 0.01, "grad_norm": 6.9340362548828125, "learning_rate": 1.9810526315789476e-05, "loss": 0.8984, "step": 590 }, { "epoch": 0.01, "grad_norm": 2.1034326553344727, "learning_rate": 1.9789473684210528e-05, "loss": 0.7045, "step": 600 }, { "epoch": 0.02, "grad_norm": 3.853721857070923, "learning_rate": 1.976842105263158e-05, "loss": 0.761, "step": 610 }, { "epoch": 0.02, "grad_norm": 7.6926398277282715, "learning_rate": 1.9747368421052633e-05, "loss": 0.9493, "step": 620 }, { "epoch": 0.02, "grad_norm": 6.261799335479736, "learning_rate": 1.9726315789473686e-05, "loss": 0.7719, "step": 630 }, { "epoch": 0.02, "grad_norm": 3.864114284515381, "learning_rate": 1.970526315789474e-05, "loss": 0.9406, "step": 640 }, { "epoch": 0.02, "grad_norm": 7.093533515930176, "learning_rate": 1.968421052631579e-05, "loss": 0.7951, "step": 650 }, { "epoch": 0.02, "grad_norm": 2.3724496364593506, "learning_rate": 1.9663157894736844e-05, "loss": 0.8648, "step": 660 }, { "epoch": 0.02, "grad_norm": 10.12341022491455, "learning_rate": 1.9642105263157897e-05, "loss": 0.7823, "step": 670 }, { "epoch": 0.02, "grad_norm": 2.80940842628479, "learning_rate": 1.962105263157895e-05, "loss": 0.706, "step": 680 }, { "epoch": 0.02, "grad_norm": 8.243487358093262, "learning_rate": 1.9600000000000002e-05, "loss": 0.8244, "step": 690 }, { "epoch": 0.02, "grad_norm": 11.420123100280762, "learning_rate": 1.9578947368421055e-05, "loss": 0.6753, "step": 700 }, { "epoch": 0.02, "grad_norm": 63.8618278503418, "learning_rate": 1.9557894736842107e-05, "loss": 0.8309, "step": 710 }, { "epoch": 0.02, "grad_norm": 4.521258354187012, "learning_rate": 1.953684210526316e-05, "loss": 0.8101, "step": 720 }, { "epoch": 0.02, "grad_norm": 2.9532318115234375, "learning_rate": 1.9515789473684213e-05, "loss": 0.8533, "step": 730 }, { "epoch": 0.02, "grad_norm": 3.792180061340332, "learning_rate": 1.9494736842105265e-05, "loss": 0.7573, "step": 740 }, { "epoch": 0.02, "grad_norm": 5.155513286590576, "learning_rate": 1.9473684210526318e-05, "loss": 0.8961, "step": 750 }, { "epoch": 0.02, "grad_norm": 9.195950508117676, "learning_rate": 1.945263157894737e-05, "loss": 0.8398, "step": 760 }, { "epoch": 0.02, "grad_norm": 6.699478626251221, "learning_rate": 1.9431578947368423e-05, "loss": 0.8018, "step": 770 }, { "epoch": 0.02, "grad_norm": 5.254507541656494, "learning_rate": 1.9410526315789476e-05, "loss": 0.8408, "step": 780 }, { "epoch": 0.02, "grad_norm": 4.351966857910156, "learning_rate": 1.9389473684210525e-05, "loss": 0.7323, "step": 790 }, { "epoch": 0.02, "grad_norm": 2.361276626586914, "learning_rate": 1.936842105263158e-05, "loss": 0.8401, "step": 800 }, { "epoch": 0.02, "grad_norm": 5.449990272521973, "learning_rate": 1.9347368421052634e-05, "loss": 0.726, "step": 810 }, { "epoch": 0.02, "grad_norm": 5.375738143920898, "learning_rate": 1.9326315789473687e-05, "loss": 0.8305, "step": 820 }, { "epoch": 0.02, "grad_norm": 2.601025342941284, "learning_rate": 1.930526315789474e-05, "loss": 0.9152, "step": 830 }, { "epoch": 0.02, "grad_norm": 12.153268814086914, "learning_rate": 1.9284210526315792e-05, "loss": 0.8423, "step": 840 }, { "epoch": 0.02, "grad_norm": 3.785663604736328, "learning_rate": 1.9263157894736845e-05, "loss": 0.7733, "step": 850 }, { "epoch": 0.02, "grad_norm": 10.162787437438965, "learning_rate": 1.9242105263157894e-05, "loss": 0.893, "step": 860 }, { "epoch": 0.02, "grad_norm": 3.871621608734131, "learning_rate": 1.922105263157895e-05, "loss": 0.798, "step": 870 }, { "epoch": 0.02, "grad_norm": 2.9919800758361816, "learning_rate": 1.9200000000000003e-05, "loss": 0.8484, "step": 880 }, { "epoch": 0.02, "grad_norm": 5.40109920501709, "learning_rate": 1.9178947368421055e-05, "loss": 0.9129, "step": 890 }, { "epoch": 0.02, "grad_norm": 6.794926643371582, "learning_rate": 1.9157894736842108e-05, "loss": 0.8687, "step": 900 }, { "epoch": 0.02, "grad_norm": 5.942440986633301, "learning_rate": 1.913684210526316e-05, "loss": 0.8564, "step": 910 }, { "epoch": 0.02, "grad_norm": 5.968307018280029, "learning_rate": 1.9115789473684213e-05, "loss": 0.8495, "step": 920 }, { "epoch": 0.02, "grad_norm": 8.425616264343262, "learning_rate": 1.9094736842105262e-05, "loss": 0.7242, "step": 930 }, { "epoch": 0.02, "grad_norm": 2.819301128387451, "learning_rate": 1.907368421052632e-05, "loss": 0.8381, "step": 940 }, { "epoch": 0.02, "grad_norm": 6.81688117980957, "learning_rate": 1.9052631578947368e-05, "loss": 0.8817, "step": 950 }, { "epoch": 0.02, "grad_norm": 5.102423191070557, "learning_rate": 1.9031578947368424e-05, "loss": 0.8274, "step": 960 }, { "epoch": 0.02, "grad_norm": 4.12994909286499, "learning_rate": 1.9010526315789476e-05, "loss": 0.7052, "step": 970 }, { "epoch": 0.02, "grad_norm": 5.15468692779541, "learning_rate": 1.898947368421053e-05, "loss": 0.772, "step": 980 }, { "epoch": 0.02, "grad_norm": 1.62323796749115, "learning_rate": 1.8968421052631582e-05, "loss": 0.7764, "step": 990 }, { "epoch": 0.03, "grad_norm": 2.546677589416504, "learning_rate": 1.894736842105263e-05, "loss": 0.8365, "step": 1000 }, { "epoch": 0.03, "eval_loss": 0.7952949404716492, "eval_runtime": 67.7544, "eval_samples_per_second": 14.759, "eval_steps_per_second": 14.759, "step": 1000 }, { "epoch": 0.03, "grad_norm": 9.28386402130127, "learning_rate": 1.8926315789473687e-05, "loss": 0.8765, "step": 1010 }, { "epoch": 0.03, "grad_norm": 7.3430304527282715, "learning_rate": 1.8905263157894736e-05, "loss": 0.8763, "step": 1020 }, { "epoch": 0.03, "grad_norm": 4.0531206130981445, "learning_rate": 1.8884210526315792e-05, "loss": 0.7943, "step": 1030 }, { "epoch": 0.03, "grad_norm": 3.028320074081421, "learning_rate": 1.886315789473684e-05, "loss": 0.836, "step": 1040 }, { "epoch": 0.03, "grad_norm": 3.3861188888549805, "learning_rate": 1.8842105263157898e-05, "loss": 0.7336, "step": 1050 }, { "epoch": 0.03, "grad_norm": 3.7832908630371094, "learning_rate": 1.882105263157895e-05, "loss": 0.9283, "step": 1060 }, { "epoch": 0.03, "grad_norm": 3.8170342445373535, "learning_rate": 1.88e-05, "loss": 0.7655, "step": 1070 }, { "epoch": 0.03, "grad_norm": 6.15322732925415, "learning_rate": 1.8778947368421056e-05, "loss": 0.9341, "step": 1080 }, { "epoch": 0.03, "grad_norm": 7.066686153411865, "learning_rate": 1.8757894736842105e-05, "loss": 0.85, "step": 1090 }, { "epoch": 0.03, "grad_norm": 2.986961603164673, "learning_rate": 1.873684210526316e-05, "loss": 0.8943, "step": 1100 }, { "epoch": 0.03, "grad_norm": 2.8456902503967285, "learning_rate": 1.871578947368421e-05, "loss": 0.8279, "step": 1110 }, { "epoch": 0.03, "grad_norm": 3.6177377700805664, "learning_rate": 1.8694736842105266e-05, "loss": 0.8192, "step": 1120 }, { "epoch": 0.03, "grad_norm": 14.768010139465332, "learning_rate": 1.8673684210526316e-05, "loss": 0.8005, "step": 1130 }, { "epoch": 0.03, "grad_norm": 11.347342491149902, "learning_rate": 1.8652631578947368e-05, "loss": 0.8081, "step": 1140 }, { "epoch": 0.03, "grad_norm": 4.0560150146484375, "learning_rate": 1.8631578947368424e-05, "loss": 0.9389, "step": 1150 }, { "epoch": 0.03, "grad_norm": 3.3164710998535156, "learning_rate": 1.8610526315789473e-05, "loss": 0.8501, "step": 1160 }, { "epoch": 0.03, "grad_norm": 11.112225532531738, "learning_rate": 1.858947368421053e-05, "loss": 0.7162, "step": 1170 }, { "epoch": 0.03, "grad_norm": 6.200588703155518, "learning_rate": 1.856842105263158e-05, "loss": 0.7448, "step": 1180 }, { "epoch": 0.03, "grad_norm": 6.573482513427734, "learning_rate": 1.8547368421052635e-05, "loss": 0.8071, "step": 1190 }, { "epoch": 0.03, "grad_norm": 5.153548717498779, "learning_rate": 1.8526315789473684e-05, "loss": 0.7957, "step": 1200 }, { "epoch": 0.03, "grad_norm": 5.3308305740356445, "learning_rate": 1.8505263157894737e-05, "loss": 0.7301, "step": 1210 }, { "epoch": 0.03, "grad_norm": 5.269808769226074, "learning_rate": 1.8484210526315793e-05, "loss": 0.8072, "step": 1220 }, { "epoch": 0.03, "grad_norm": 5.588324546813965, "learning_rate": 1.8463157894736842e-05, "loss": 0.8587, "step": 1230 }, { "epoch": 0.03, "grad_norm": 4.593557357788086, "learning_rate": 1.8442105263157898e-05, "loss": 0.856, "step": 1240 }, { "epoch": 0.03, "grad_norm": 5.2591094970703125, "learning_rate": 1.8421052631578947e-05, "loss": 0.7717, "step": 1250 }, { "epoch": 0.03, "grad_norm": 4.052567958831787, "learning_rate": 1.8400000000000003e-05, "loss": 0.7823, "step": 1260 }, { "epoch": 0.03, "grad_norm": 4.447838306427002, "learning_rate": 1.8378947368421053e-05, "loss": 0.83, "step": 1270 }, { "epoch": 0.03, "grad_norm": 4.029257774353027, "learning_rate": 1.8357894736842105e-05, "loss": 0.7504, "step": 1280 }, { "epoch": 0.03, "grad_norm": 9.053960800170898, "learning_rate": 1.8336842105263158e-05, "loss": 0.9074, "step": 1290 }, { "epoch": 0.03, "grad_norm": 2.2877705097198486, "learning_rate": 1.831578947368421e-05, "loss": 0.772, "step": 1300 }, { "epoch": 0.03, "grad_norm": 3.4482290744781494, "learning_rate": 1.8294736842105267e-05, "loss": 0.8658, "step": 1310 }, { "epoch": 0.03, "grad_norm": 6.684794902801514, "learning_rate": 1.8273684210526316e-05, "loss": 0.7848, "step": 1320 }, { "epoch": 0.03, "grad_norm": 3.553828716278076, "learning_rate": 1.8252631578947372e-05, "loss": 0.8219, "step": 1330 }, { "epoch": 0.03, "grad_norm": 2.5203397274017334, "learning_rate": 1.823157894736842e-05, "loss": 0.9071, "step": 1340 }, { "epoch": 0.03, "grad_norm": 4.961795806884766, "learning_rate": 1.8210526315789477e-05, "loss": 0.6542, "step": 1350 }, { "epoch": 0.03, "grad_norm": 3.663081645965576, "learning_rate": 1.8189473684210527e-05, "loss": 0.7402, "step": 1360 }, { "epoch": 0.03, "grad_norm": 8.785040855407715, "learning_rate": 1.816842105263158e-05, "loss": 0.7462, "step": 1370 }, { "epoch": 0.03, "grad_norm": 4.659074783325195, "learning_rate": 1.8147368421052632e-05, "loss": 0.6951, "step": 1380 }, { "epoch": 0.03, "grad_norm": 3.5885703563690186, "learning_rate": 1.8126315789473685e-05, "loss": 0.7008, "step": 1390 }, { "epoch": 0.04, "grad_norm": 3.1295347213745117, "learning_rate": 1.810526315789474e-05, "loss": 0.9103, "step": 1400 }, { "epoch": 0.04, "grad_norm": 2.4699888229370117, "learning_rate": 1.808421052631579e-05, "loss": 0.841, "step": 1410 }, { "epoch": 0.04, "grad_norm": 5.3273444175720215, "learning_rate": 1.8063157894736846e-05, "loss": 0.9041, "step": 1420 }, { "epoch": 0.04, "grad_norm": 5.149638652801514, "learning_rate": 1.8042105263157895e-05, "loss": 0.7784, "step": 1430 }, { "epoch": 0.04, "grad_norm": 3.4124910831451416, "learning_rate": 1.8021052631578948e-05, "loss": 0.8208, "step": 1440 }, { "epoch": 0.04, "grad_norm": 2.9231085777282715, "learning_rate": 1.8e-05, "loss": 0.7173, "step": 1450 }, { "epoch": 0.04, "grad_norm": 4.008113384246826, "learning_rate": 1.7978947368421053e-05, "loss": 0.7383, "step": 1460 }, { "epoch": 0.04, "grad_norm": 5.1748046875, "learning_rate": 1.795789473684211e-05, "loss": 0.8399, "step": 1470 }, { "epoch": 0.04, "grad_norm": 3.4990293979644775, "learning_rate": 1.793684210526316e-05, "loss": 0.6721, "step": 1480 }, { "epoch": 0.04, "grad_norm": 3.1186299324035645, "learning_rate": 1.7915789473684214e-05, "loss": 0.782, "step": 1490 }, { "epoch": 0.04, "grad_norm": 5.12732458114624, "learning_rate": 1.7894736842105264e-05, "loss": 0.7211, "step": 1500 }, { "epoch": 0.04, "eval_loss": 0.811568021774292, "eval_runtime": 67.7961, "eval_samples_per_second": 14.75, "eval_steps_per_second": 14.75, "step": 1500 }, { "epoch": 0.04, "grad_norm": 3.631096124649048, "learning_rate": 1.7873684210526316e-05, "loss": 0.7557, "step": 1510 }, { "epoch": 0.04, "grad_norm": 8.850045204162598, "learning_rate": 1.785263157894737e-05, "loss": 0.8757, "step": 1520 }, { "epoch": 0.04, "grad_norm": 3.1114978790283203, "learning_rate": 1.7831578947368422e-05, "loss": 0.7613, "step": 1530 }, { "epoch": 0.04, "grad_norm": 4.5038743019104, "learning_rate": 1.7810526315789474e-05, "loss": 0.8049, "step": 1540 }, { "epoch": 0.04, "grad_norm": 4.2331156730651855, "learning_rate": 1.7789473684210527e-05, "loss": 0.8277, "step": 1550 }, { "epoch": 0.04, "grad_norm": 5.05696964263916, "learning_rate": 1.7768421052631583e-05, "loss": 0.7973, "step": 1560 }, { "epoch": 0.04, "grad_norm": 2.1331920623779297, "learning_rate": 1.7747368421052632e-05, "loss": 0.7688, "step": 1570 }, { "epoch": 0.04, "grad_norm": 4.984541416168213, "learning_rate": 1.7726315789473685e-05, "loss": 0.7865, "step": 1580 }, { "epoch": 0.04, "grad_norm": 7.149406433105469, "learning_rate": 1.7705263157894738e-05, "loss": 0.7728, "step": 1590 }, { "epoch": 0.04, "grad_norm": 8.092243194580078, "learning_rate": 1.768421052631579e-05, "loss": 0.935, "step": 1600 }, { "epoch": 0.04, "grad_norm": 13.16551399230957, "learning_rate": 1.7663157894736843e-05, "loss": 0.8286, "step": 1610 }, { "epoch": 0.04, "grad_norm": 2.131350517272949, "learning_rate": 1.7642105263157896e-05, "loss": 0.7864, "step": 1620 }, { "epoch": 0.04, "grad_norm": 7.870023727416992, "learning_rate": 1.7621052631578948e-05, "loss": 0.8645, "step": 1630 }, { "epoch": 0.04, "grad_norm": 10.631692886352539, "learning_rate": 1.76e-05, "loss": 0.8473, "step": 1640 }, { "epoch": 0.04, "grad_norm": 6.421032905578613, "learning_rate": 1.7578947368421054e-05, "loss": 0.7868, "step": 1650 }, { "epoch": 0.04, "grad_norm": 4.57529878616333, "learning_rate": 1.7557894736842106e-05, "loss": 0.7882, "step": 1660 }, { "epoch": 0.04, "grad_norm": 3.8785624504089355, "learning_rate": 1.753684210526316e-05, "loss": 0.7543, "step": 1670 }, { "epoch": 0.04, "grad_norm": 5.722006320953369, "learning_rate": 1.751578947368421e-05, "loss": 0.9626, "step": 1680 }, { "epoch": 0.04, "grad_norm": 2.466771364212036, "learning_rate": 1.7494736842105264e-05, "loss": 0.783, "step": 1690 }, { "epoch": 0.04, "grad_norm": 3.072049856185913, "learning_rate": 1.7473684210526317e-05, "loss": 0.7503, "step": 1700 }, { "epoch": 0.04, "grad_norm": 5.768575668334961, "learning_rate": 1.745263157894737e-05, "loss": 0.8193, "step": 1710 }, { "epoch": 0.04, "grad_norm": 2.585022211074829, "learning_rate": 1.7431578947368422e-05, "loss": 0.8808, "step": 1720 }, { "epoch": 0.04, "grad_norm": 3.0711567401885986, "learning_rate": 1.7410526315789475e-05, "loss": 0.8098, "step": 1730 }, { "epoch": 0.04, "grad_norm": 3.3020272254943848, "learning_rate": 1.7389473684210527e-05, "loss": 0.7196, "step": 1740 }, { "epoch": 0.04, "grad_norm": 3.645238161087036, "learning_rate": 1.736842105263158e-05, "loss": 0.8904, "step": 1750 }, { "epoch": 0.04, "grad_norm": 6.018638610839844, "learning_rate": 1.7347368421052633e-05, "loss": 0.7937, "step": 1760 }, { "epoch": 0.04, "grad_norm": 3.629096746444702, "learning_rate": 1.7326315789473685e-05, "loss": 0.9171, "step": 1770 }, { "epoch": 0.04, "grad_norm": 2.5619189739227295, "learning_rate": 1.7305263157894738e-05, "loss": 0.9488, "step": 1780 }, { "epoch": 0.04, "grad_norm": 9.464752197265625, "learning_rate": 1.728421052631579e-05, "loss": 0.8459, "step": 1790 }, { "epoch": 0.04, "grad_norm": 3.9856364727020264, "learning_rate": 1.7263157894736843e-05, "loss": 0.8378, "step": 1800 }, { "epoch": 0.05, "grad_norm": 3.753553867340088, "learning_rate": 1.7242105263157896e-05, "loss": 0.8093, "step": 1810 }, { "epoch": 0.05, "grad_norm": 3.4593358039855957, "learning_rate": 1.722105263157895e-05, "loss": 0.7896, "step": 1820 }, { "epoch": 0.05, "grad_norm": 2.7163546085357666, "learning_rate": 1.72e-05, "loss": 0.7188, "step": 1830 }, { "epoch": 0.05, "grad_norm": 3.105628728866577, "learning_rate": 1.7178947368421054e-05, "loss": 0.7643, "step": 1840 }, { "epoch": 0.05, "grad_norm": 2.387368679046631, "learning_rate": 1.7157894736842107e-05, "loss": 0.8465, "step": 1850 }, { "epoch": 0.05, "grad_norm": 6.020385265350342, "learning_rate": 1.713684210526316e-05, "loss": 0.7798, "step": 1860 }, { "epoch": 0.05, "grad_norm": 4.560520172119141, "learning_rate": 1.7115789473684212e-05, "loss": 0.7704, "step": 1870 }, { "epoch": 0.05, "grad_norm": 15.739727973937988, "learning_rate": 1.7094736842105265e-05, "loss": 0.7148, "step": 1880 }, { "epoch": 0.05, "grad_norm": 5.79690408706665, "learning_rate": 1.7073684210526317e-05, "loss": 0.798, "step": 1890 }, { "epoch": 0.05, "grad_norm": 2.6939146518707275, "learning_rate": 1.705263157894737e-05, "loss": 0.7641, "step": 1900 }, { "epoch": 0.05, "grad_norm": 5.193384170532227, "learning_rate": 1.7031578947368423e-05, "loss": 0.7866, "step": 1910 }, { "epoch": 0.05, "grad_norm": 4.940731525421143, "learning_rate": 1.7010526315789475e-05, "loss": 0.8261, "step": 1920 }, { "epoch": 0.05, "grad_norm": 2.1812446117401123, "learning_rate": 1.6989473684210528e-05, "loss": 0.7973, "step": 1930 }, { "epoch": 0.05, "grad_norm": 3.7413289546966553, "learning_rate": 1.696842105263158e-05, "loss": 0.7818, "step": 1940 }, { "epoch": 0.05, "grad_norm": 4.024014472961426, "learning_rate": 1.6947368421052633e-05, "loss": 0.7237, "step": 1950 }, { "epoch": 0.05, "grad_norm": 3.0871291160583496, "learning_rate": 1.6926315789473686e-05, "loss": 0.772, "step": 1960 }, { "epoch": 0.05, "grad_norm": 3.28814435005188, "learning_rate": 1.690526315789474e-05, "loss": 0.7067, "step": 1970 }, { "epoch": 0.05, "grad_norm": 2.8241286277770996, "learning_rate": 1.688421052631579e-05, "loss": 0.8175, "step": 1980 }, { "epoch": 0.05, "grad_norm": 2.5942068099975586, "learning_rate": 1.6863157894736844e-05, "loss": 0.9265, "step": 1990 }, { "epoch": 0.05, "grad_norm": 6.6822662353515625, "learning_rate": 1.6842105263157896e-05, "loss": 0.8593, "step": 2000 }, { "epoch": 0.05, "eval_loss": 0.8064771890640259, "eval_runtime": 67.7887, "eval_samples_per_second": 14.752, "eval_steps_per_second": 14.752, "step": 2000 }, { "epoch": 0.05, "grad_norm": 7.032164573669434, "learning_rate": 1.682105263157895e-05, "loss": 0.8819, "step": 2010 }, { "epoch": 0.05, "grad_norm": 4.874982833862305, "learning_rate": 1.6800000000000002e-05, "loss": 0.8021, "step": 2020 }, { "epoch": 0.05, "grad_norm": 2.6172547340393066, "learning_rate": 1.6778947368421054e-05, "loss": 0.8017, "step": 2030 }, { "epoch": 0.05, "grad_norm": 10.659741401672363, "learning_rate": 1.6757894736842107e-05, "loss": 0.8896, "step": 2040 }, { "epoch": 0.05, "grad_norm": 6.189141750335693, "learning_rate": 1.673684210526316e-05, "loss": 0.7997, "step": 2050 }, { "epoch": 0.05, "grad_norm": 4.523468971252441, "learning_rate": 1.6715789473684212e-05, "loss": 0.8498, "step": 2060 }, { "epoch": 0.05, "grad_norm": 8.533658981323242, "learning_rate": 1.6694736842105265e-05, "loss": 0.8857, "step": 2070 }, { "epoch": 0.05, "grad_norm": 3.0041606426239014, "learning_rate": 1.6673684210526318e-05, "loss": 0.8112, "step": 2080 }, { "epoch": 0.05, "grad_norm": 5.055651664733887, "learning_rate": 1.665263157894737e-05, "loss": 0.7872, "step": 2090 }, { "epoch": 0.05, "grad_norm": 5.761922836303711, "learning_rate": 1.6631578947368423e-05, "loss": 0.7727, "step": 2100 }, { "epoch": 0.05, "grad_norm": 2.518223524093628, "learning_rate": 1.6610526315789476e-05, "loss": 0.7997, "step": 2110 }, { "epoch": 0.05, "grad_norm": 4.975761890411377, "learning_rate": 1.658947368421053e-05, "loss": 0.7457, "step": 2120 }, { "epoch": 0.05, "grad_norm": 3.2227561473846436, "learning_rate": 1.656842105263158e-05, "loss": 0.816, "step": 2130 }, { "epoch": 0.05, "grad_norm": 4.705923080444336, "learning_rate": 1.6547368421052634e-05, "loss": 0.8113, "step": 2140 }, { "epoch": 0.05, "grad_norm": 2.655057430267334, "learning_rate": 1.6526315789473686e-05, "loss": 0.7912, "step": 2150 }, { "epoch": 0.05, "grad_norm": 3.0186755657196045, "learning_rate": 1.650526315789474e-05, "loss": 0.8608, "step": 2160 }, { "epoch": 0.05, "grad_norm": 1.232386827468872, "learning_rate": 1.648421052631579e-05, "loss": 0.8549, "step": 2170 }, { "epoch": 0.05, "grad_norm": 11.968620300292969, "learning_rate": 1.6463157894736844e-05, "loss": 0.868, "step": 2180 }, { "epoch": 0.05, "grad_norm": 3.5853216648101807, "learning_rate": 1.6442105263157897e-05, "loss": 0.8388, "step": 2190 }, { "epoch": 0.06, "grad_norm": 2.375610589981079, "learning_rate": 1.642105263157895e-05, "loss": 0.9111, "step": 2200 }, { "epoch": 0.06, "grad_norm": 1.9734487533569336, "learning_rate": 1.64e-05, "loss": 0.7288, "step": 2210 }, { "epoch": 0.06, "grad_norm": 10.517192840576172, "learning_rate": 1.6378947368421055e-05, "loss": 0.698, "step": 2220 }, { "epoch": 0.06, "grad_norm": 4.183718204498291, "learning_rate": 1.6357894736842108e-05, "loss": 0.7759, "step": 2230 }, { "epoch": 0.06, "grad_norm": 3.9075675010681152, "learning_rate": 1.633684210526316e-05, "loss": 0.7829, "step": 2240 }, { "epoch": 0.06, "grad_norm": 5.287744998931885, "learning_rate": 1.6315789473684213e-05, "loss": 0.7057, "step": 2250 }, { "epoch": 0.06, "grad_norm": 4.977657318115234, "learning_rate": 1.6294736842105265e-05, "loss": 0.8346, "step": 2260 }, { "epoch": 0.06, "grad_norm": 7.196689128875732, "learning_rate": 1.6273684210526318e-05, "loss": 0.8508, "step": 2270 }, { "epoch": 0.06, "grad_norm": 2.467477798461914, "learning_rate": 1.6252631578947367e-05, "loss": 0.7179, "step": 2280 }, { "epoch": 0.06, "grad_norm": 7.059762954711914, "learning_rate": 1.6231578947368423e-05, "loss": 0.7549, "step": 2290 }, { "epoch": 0.06, "grad_norm": 3.980865955352783, "learning_rate": 1.6210526315789473e-05, "loss": 0.814, "step": 2300 }, { "epoch": 0.06, "grad_norm": 7.675939559936523, "learning_rate": 1.618947368421053e-05, "loss": 0.8227, "step": 2310 }, { "epoch": 0.06, "grad_norm": 3.530073642730713, "learning_rate": 1.616842105263158e-05, "loss": 0.8517, "step": 2320 }, { "epoch": 0.06, "grad_norm": 3.6851344108581543, "learning_rate": 1.6147368421052634e-05, "loss": 0.7684, "step": 2330 }, { "epoch": 0.06, "grad_norm": 5.206923961639404, "learning_rate": 1.6126315789473687e-05, "loss": 0.8199, "step": 2340 }, { "epoch": 0.06, "grad_norm": 5.220828056335449, "learning_rate": 1.6105263157894736e-05, "loss": 0.8871, "step": 2350 }, { "epoch": 0.06, "grad_norm": 3.5062482357025146, "learning_rate": 1.6084210526315792e-05, "loss": 0.8281, "step": 2360 }, { "epoch": 0.06, "grad_norm": 1.9830796718597412, "learning_rate": 1.606315789473684e-05, "loss": 0.8678, "step": 2370 }, { "epoch": 0.06, "grad_norm": 3.3255491256713867, "learning_rate": 1.6042105263157897e-05, "loss": 0.8337, "step": 2380 }, { "epoch": 0.06, "grad_norm": 5.259572505950928, "learning_rate": 1.6021052631578947e-05, "loss": 0.7954, "step": 2390 }, { "epoch": 0.06, "grad_norm": 3.6201376914978027, "learning_rate": 1.6000000000000003e-05, "loss": 0.818, "step": 2400 }, { "epoch": 0.06, "grad_norm": 3.3598544597625732, "learning_rate": 1.5978947368421055e-05, "loss": 0.7697, "step": 2410 }, { "epoch": 0.06, "grad_norm": 6.34808349609375, "learning_rate": 1.5957894736842105e-05, "loss": 0.6347, "step": 2420 }, { "epoch": 0.06, "grad_norm": 3.967682361602783, "learning_rate": 1.593684210526316e-05, "loss": 0.7178, "step": 2430 }, { "epoch": 0.06, "grad_norm": 10.222978591918945, "learning_rate": 1.591578947368421e-05, "loss": 0.7642, "step": 2440 }, { "epoch": 0.06, "grad_norm": 3.9339826107025146, "learning_rate": 1.5894736842105266e-05, "loss": 0.8197, "step": 2450 }, { "epoch": 0.06, "grad_norm": 2.3337771892547607, "learning_rate": 1.5873684210526315e-05, "loss": 0.9375, "step": 2460 }, { "epoch": 0.06, "grad_norm": 2.8479838371276855, "learning_rate": 1.585263157894737e-05, "loss": 0.9196, "step": 2470 }, { "epoch": 0.06, "grad_norm": 9.294541358947754, "learning_rate": 1.5831578947368424e-05, "loss": 0.7144, "step": 2480 }, { "epoch": 0.06, "grad_norm": 5.325323104858398, "learning_rate": 1.5810526315789473e-05, "loss": 0.7897, "step": 2490 }, { "epoch": 0.06, "grad_norm": 4.377369403839111, "learning_rate": 1.578947368421053e-05, "loss": 0.9008, "step": 2500 }, { "epoch": 0.06, "eval_loss": 0.8163847923278809, "eval_runtime": 67.7994, "eval_samples_per_second": 14.749, "eval_steps_per_second": 14.749, "step": 2500 }, { "epoch": 0.06, "grad_norm": 5.1105055809021, "learning_rate": 1.576842105263158e-05, "loss": 0.7897, "step": 2510 }, { "epoch": 0.06, "grad_norm": 3.321247100830078, "learning_rate": 1.5747368421052635e-05, "loss": 0.7394, "step": 2520 }, { "epoch": 0.06, "grad_norm": 2.1728689670562744, "learning_rate": 1.5726315789473684e-05, "loss": 0.7395, "step": 2530 }, { "epoch": 0.06, "grad_norm": 4.246960163116455, "learning_rate": 1.570526315789474e-05, "loss": 0.7825, "step": 2540 }, { "epoch": 0.06, "grad_norm": 4.518326282501221, "learning_rate": 1.568421052631579e-05, "loss": 0.8168, "step": 2550 }, { "epoch": 0.06, "grad_norm": 4.336541652679443, "learning_rate": 1.5663157894736842e-05, "loss": 0.8887, "step": 2560 }, { "epoch": 0.06, "grad_norm": 3.3204426765441895, "learning_rate": 1.5642105263157898e-05, "loss": 0.8257, "step": 2570 }, { "epoch": 0.06, "grad_norm": 9.327149391174316, "learning_rate": 1.5621052631578947e-05, "loss": 0.7896, "step": 2580 }, { "epoch": 0.06, "grad_norm": 1.575266718864441, "learning_rate": 1.5600000000000003e-05, "loss": 0.9021, "step": 2590 }, { "epoch": 0.07, "grad_norm": 2.955282688140869, "learning_rate": 1.5578947368421052e-05, "loss": 0.8115, "step": 2600 }, { "epoch": 0.07, "grad_norm": 6.098946571350098, "learning_rate": 1.555789473684211e-05, "loss": 0.647, "step": 2610 }, { "epoch": 0.07, "grad_norm": 3.354290723800659, "learning_rate": 1.5536842105263158e-05, "loss": 0.8033, "step": 2620 }, { "epoch": 0.07, "grad_norm": 4.7387518882751465, "learning_rate": 1.551578947368421e-05, "loss": 0.6904, "step": 2630 }, { "epoch": 0.07, "grad_norm": 7.594583034515381, "learning_rate": 1.5494736842105263e-05, "loss": 0.7914, "step": 2640 }, { "epoch": 0.07, "grad_norm": 2.994126081466675, "learning_rate": 1.5473684210526316e-05, "loss": 0.8019, "step": 2650 }, { "epoch": 0.07, "grad_norm": 5.478656768798828, "learning_rate": 1.545263157894737e-05, "loss": 0.7575, "step": 2660 }, { "epoch": 0.07, "grad_norm": 3.4734623432159424, "learning_rate": 1.543157894736842e-05, "loss": 0.7662, "step": 2670 }, { "epoch": 0.07, "grad_norm": 3.2532217502593994, "learning_rate": 1.5410526315789477e-05, "loss": 0.6782, "step": 2680 }, { "epoch": 0.07, "grad_norm": 4.5201520919799805, "learning_rate": 1.5389473684210526e-05, "loss": 0.7102, "step": 2690 }, { "epoch": 0.07, "grad_norm": 3.8668696880340576, "learning_rate": 1.536842105263158e-05, "loss": 0.8358, "step": 2700 }, { "epoch": 0.07, "grad_norm": 5.816726207733154, "learning_rate": 1.534736842105263e-05, "loss": 0.8439, "step": 2710 }, { "epoch": 0.07, "grad_norm": 3.014636516571045, "learning_rate": 1.5326315789473684e-05, "loss": 0.8699, "step": 2720 }, { "epoch": 0.07, "grad_norm": 3.465543270111084, "learning_rate": 1.530526315789474e-05, "loss": 0.7515, "step": 2730 }, { "epoch": 0.07, "grad_norm": 6.904135227203369, "learning_rate": 1.528421052631579e-05, "loss": 0.7578, "step": 2740 }, { "epoch": 0.07, "grad_norm": 3.1614532470703125, "learning_rate": 1.5263157894736846e-05, "loss": 0.6915, "step": 2750 }, { "epoch": 0.07, "grad_norm": 2.4877758026123047, "learning_rate": 1.5242105263157897e-05, "loss": 0.7607, "step": 2760 }, { "epoch": 0.07, "grad_norm": 6.346368312835693, "learning_rate": 1.5221052631578948e-05, "loss": 0.8558, "step": 2770 }, { "epoch": 0.07, "grad_norm": 4.319607734680176, "learning_rate": 1.5200000000000002e-05, "loss": 0.8349, "step": 2780 }, { "epoch": 0.07, "grad_norm": 3.130995750427246, "learning_rate": 1.5178947368421053e-05, "loss": 0.7747, "step": 2790 }, { "epoch": 0.07, "grad_norm": 1.8080275058746338, "learning_rate": 1.5157894736842107e-05, "loss": 0.696, "step": 2800 }, { "epoch": 0.07, "grad_norm": 3.184603214263916, "learning_rate": 1.5136842105263158e-05, "loss": 0.7874, "step": 2810 }, { "epoch": 0.07, "grad_norm": 2.7330257892608643, "learning_rate": 1.5115789473684212e-05, "loss": 0.6844, "step": 2820 }, { "epoch": 0.07, "grad_norm": 3.2653279304504395, "learning_rate": 1.5094736842105263e-05, "loss": 0.6429, "step": 2830 }, { "epoch": 0.07, "grad_norm": 3.826791524887085, "learning_rate": 1.5073684210526316e-05, "loss": 0.7868, "step": 2840 }, { "epoch": 0.07, "grad_norm": 5.656713008880615, "learning_rate": 1.505263157894737e-05, "loss": 0.81, "step": 2850 }, { "epoch": 0.07, "grad_norm": 2.6106789112091064, "learning_rate": 1.5031578947368421e-05, "loss": 0.7693, "step": 2860 }, { "epoch": 0.07, "grad_norm": 2.2952094078063965, "learning_rate": 1.5010526315789476e-05, "loss": 0.7727, "step": 2870 }, { "epoch": 0.07, "grad_norm": 2.8655829429626465, "learning_rate": 1.4989473684210527e-05, "loss": 0.7702, "step": 2880 }, { "epoch": 0.07, "grad_norm": 3.170299768447876, "learning_rate": 1.4968421052631581e-05, "loss": 0.884, "step": 2890 }, { "epoch": 0.07, "grad_norm": 10.48736572265625, "learning_rate": 1.4947368421052632e-05, "loss": 0.8281, "step": 2900 }, { "epoch": 0.07, "grad_norm": 6.244320392608643, "learning_rate": 1.4926315789473686e-05, "loss": 0.8549, "step": 2910 }, { "epoch": 0.07, "grad_norm": 9.334859848022461, "learning_rate": 1.4905263157894739e-05, "loss": 0.7232, "step": 2920 }, { "epoch": 0.07, "grad_norm": 6.034826755523682, "learning_rate": 1.488421052631579e-05, "loss": 0.7666, "step": 2930 }, { "epoch": 0.07, "grad_norm": 5.024431228637695, "learning_rate": 1.4863157894736844e-05, "loss": 0.7803, "step": 2940 }, { "epoch": 0.07, "grad_norm": 4.2071685791015625, "learning_rate": 1.4842105263157895e-05, "loss": 0.8844, "step": 2950 }, { "epoch": 0.07, "grad_norm": 2.319312810897827, "learning_rate": 1.482105263157895e-05, "loss": 0.7345, "step": 2960 }, { "epoch": 0.07, "grad_norm": 4.384433746337891, "learning_rate": 1.48e-05, "loss": 0.6718, "step": 2970 }, { "epoch": 0.07, "grad_norm": 3.6182382106781006, "learning_rate": 1.4778947368421055e-05, "loss": 0.7898, "step": 2980 }, { "epoch": 0.07, "grad_norm": 8.145679473876953, "learning_rate": 1.4757894736842106e-05, "loss": 0.7754, "step": 2990 }, { "epoch": 0.07, "grad_norm": 4.761884689331055, "learning_rate": 1.4736842105263159e-05, "loss": 0.7607, "step": 3000 }, { "epoch": 0.07, "eval_loss": 0.8005050420761108, "eval_runtime": 67.8219, "eval_samples_per_second": 14.744, "eval_steps_per_second": 14.744, "step": 3000 }, { "epoch": 0.08, "grad_norm": 9.505555152893066, "learning_rate": 1.4715789473684213e-05, "loss": 0.6883, "step": 3010 }, { "epoch": 0.08, "grad_norm": 4.4507155418396, "learning_rate": 1.4694736842105264e-05, "loss": 0.7203, "step": 3020 }, { "epoch": 0.08, "grad_norm": 7.255837440490723, "learning_rate": 1.4673684210526318e-05, "loss": 0.8193, "step": 3030 }, { "epoch": 0.08, "grad_norm": 6.857261657714844, "learning_rate": 1.465263157894737e-05, "loss": 0.6581, "step": 3040 }, { "epoch": 0.08, "grad_norm": 8.000073432922363, "learning_rate": 1.4631578947368424e-05, "loss": 0.7318, "step": 3050 }, { "epoch": 0.08, "grad_norm": 2.8200011253356934, "learning_rate": 1.4610526315789474e-05, "loss": 0.8176, "step": 3060 }, { "epoch": 0.08, "grad_norm": 4.028472423553467, "learning_rate": 1.4589473684210527e-05, "loss": 0.7712, "step": 3070 }, { "epoch": 0.08, "grad_norm": 3.2594337463378906, "learning_rate": 1.456842105263158e-05, "loss": 0.8032, "step": 3080 }, { "epoch": 0.08, "grad_norm": 3.751202344894409, "learning_rate": 1.4547368421052632e-05, "loss": 0.7905, "step": 3090 }, { "epoch": 0.08, "grad_norm": 4.235973834991455, "learning_rate": 1.4526315789473687e-05, "loss": 0.8402, "step": 3100 }, { "epoch": 0.08, "grad_norm": 2.327855348587036, "learning_rate": 1.4505263157894738e-05, "loss": 0.9027, "step": 3110 }, { "epoch": 0.08, "grad_norm": 5.911487102508545, "learning_rate": 1.4484210526315792e-05, "loss": 0.8241, "step": 3120 }, { "epoch": 0.08, "grad_norm": 2.3612990379333496, "learning_rate": 1.4463157894736843e-05, "loss": 0.7884, "step": 3130 }, { "epoch": 0.08, "grad_norm": 2.106682300567627, "learning_rate": 1.4442105263157896e-05, "loss": 0.7414, "step": 3140 }, { "epoch": 0.08, "grad_norm": 4.354177951812744, "learning_rate": 1.4421052631578948e-05, "loss": 0.7626, "step": 3150 }, { "epoch": 0.08, "grad_norm": 3.0019009113311768, "learning_rate": 1.4400000000000001e-05, "loss": 0.6853, "step": 3160 }, { "epoch": 0.08, "grad_norm": 3.787949562072754, "learning_rate": 1.4378947368421054e-05, "loss": 0.8105, "step": 3170 }, { "epoch": 0.08, "grad_norm": 5.0848469734191895, "learning_rate": 1.4357894736842106e-05, "loss": 0.7787, "step": 3180 }, { "epoch": 0.08, "grad_norm": 6.892744541168213, "learning_rate": 1.433684210526316e-05, "loss": 0.8883, "step": 3190 }, { "epoch": 0.08, "grad_norm": 3.717949390411377, "learning_rate": 1.4315789473684212e-05, "loss": 0.7458, "step": 3200 }, { "epoch": 0.08, "grad_norm": 3.4417831897735596, "learning_rate": 1.4294736842105263e-05, "loss": 0.7868, "step": 3210 }, { "epoch": 0.08, "grad_norm": 9.308151245117188, "learning_rate": 1.4273684210526317e-05, "loss": 0.8725, "step": 3220 }, { "epoch": 0.08, "grad_norm": 2.7036919593811035, "learning_rate": 1.425263157894737e-05, "loss": 0.8706, "step": 3230 }, { "epoch": 0.08, "grad_norm": 7.338090419769287, "learning_rate": 1.4231578947368422e-05, "loss": 0.7695, "step": 3240 }, { "epoch": 0.08, "grad_norm": 2.9678733348846436, "learning_rate": 1.4210526315789475e-05, "loss": 0.7259, "step": 3250 }, { "epoch": 0.08, "grad_norm": 3.3436050415039062, "learning_rate": 1.418947368421053e-05, "loss": 0.7193, "step": 3260 }, { "epoch": 0.08, "grad_norm": 2.23856520652771, "learning_rate": 1.416842105263158e-05, "loss": 0.6766, "step": 3270 }, { "epoch": 0.08, "grad_norm": 4.021206855773926, "learning_rate": 1.4147368421052631e-05, "loss": 0.888, "step": 3280 }, { "epoch": 0.08, "grad_norm": 7.063048839569092, "learning_rate": 1.4126315789473686e-05, "loss": 0.7543, "step": 3290 }, { "epoch": 0.08, "grad_norm": 4.777950763702393, "learning_rate": 1.4105263157894738e-05, "loss": 0.6239, "step": 3300 }, { "epoch": 0.08, "grad_norm": 3.2984225749969482, "learning_rate": 1.4084210526315791e-05, "loss": 0.8349, "step": 3310 }, { "epoch": 0.08, "grad_norm": 6.48808479309082, "learning_rate": 1.4063157894736844e-05, "loss": 0.8765, "step": 3320 }, { "epoch": 0.08, "grad_norm": 4.557926177978516, "learning_rate": 1.4042105263157896e-05, "loss": 0.8508, "step": 3330 }, { "epoch": 0.08, "grad_norm": 2.661513090133667, "learning_rate": 1.4021052631578949e-05, "loss": 0.7547, "step": 3340 }, { "epoch": 0.08, "grad_norm": 5.827274799346924, "learning_rate": 1.4e-05, "loss": 0.8556, "step": 3350 }, { "epoch": 0.08, "grad_norm": 3.0803449153900146, "learning_rate": 1.3978947368421054e-05, "loss": 0.8663, "step": 3360 }, { "epoch": 0.08, "grad_norm": 9.996018409729004, "learning_rate": 1.3957894736842105e-05, "loss": 0.68, "step": 3370 }, { "epoch": 0.08, "grad_norm": 5.443753242492676, "learning_rate": 1.393684210526316e-05, "loss": 0.7227, "step": 3380 }, { "epoch": 0.08, "grad_norm": 9.685049057006836, "learning_rate": 1.3915789473684212e-05, "loss": 0.7336, "step": 3390 }, { "epoch": 0.09, "grad_norm": 5.89941930770874, "learning_rate": 1.3894736842105265e-05, "loss": 0.7016, "step": 3400 }, { "epoch": 0.09, "grad_norm": 9.616964340209961, "learning_rate": 1.3873684210526317e-05, "loss": 0.7678, "step": 3410 }, { "epoch": 0.09, "grad_norm": 5.262804985046387, "learning_rate": 1.3852631578947368e-05, "loss": 0.8227, "step": 3420 }, { "epoch": 0.09, "grad_norm": 5.251176357269287, "learning_rate": 1.3831578947368423e-05, "loss": 0.71, "step": 3430 }, { "epoch": 0.09, "grad_norm": 5.626483917236328, "learning_rate": 1.3810526315789474e-05, "loss": 0.8204, "step": 3440 }, { "epoch": 0.09, "grad_norm": 6.990488052368164, "learning_rate": 1.3789473684210528e-05, "loss": 0.7745, "step": 3450 }, { "epoch": 0.09, "grad_norm": 7.510478496551514, "learning_rate": 1.3768421052631579e-05, "loss": 0.6286, "step": 3460 }, { "epoch": 0.09, "grad_norm": 5.006512641906738, "learning_rate": 1.3747368421052633e-05, "loss": 0.8146, "step": 3470 }, { "epoch": 0.09, "grad_norm": 6.388507843017578, "learning_rate": 1.3726315789473686e-05, "loss": 0.7914, "step": 3480 }, { "epoch": 0.09, "grad_norm": 5.217645168304443, "learning_rate": 1.3705263157894737e-05, "loss": 0.7219, "step": 3490 }, { "epoch": 0.09, "grad_norm": 6.257259368896484, "learning_rate": 1.3684210526315791e-05, "loss": 0.8404, "step": 3500 }, { "epoch": 0.09, "eval_loss": 0.8086790442466736, "eval_runtime": 67.9356, "eval_samples_per_second": 14.72, "eval_steps_per_second": 14.72, "step": 3500 }, { "epoch": 0.09, "grad_norm": 2.6733217239379883, "learning_rate": 1.3663157894736842e-05, "loss": 0.7936, "step": 3510 }, { "epoch": 0.09, "grad_norm": 2.0083932876586914, "learning_rate": 1.3642105263157897e-05, "loss": 0.7122, "step": 3520 }, { "epoch": 0.09, "grad_norm": 17.048171997070312, "learning_rate": 1.3621052631578948e-05, "loss": 0.7568, "step": 3530 }, { "epoch": 0.09, "grad_norm": 3.1675314903259277, "learning_rate": 1.3600000000000002e-05, "loss": 0.639, "step": 3540 }, { "epoch": 0.09, "grad_norm": 2.6769821643829346, "learning_rate": 1.3578947368421055e-05, "loss": 0.8691, "step": 3550 }, { "epoch": 0.09, "grad_norm": 3.516094207763672, "learning_rate": 1.3557894736842106e-05, "loss": 0.7681, "step": 3560 }, { "epoch": 0.09, "grad_norm": 2.8059396743774414, "learning_rate": 1.353684210526316e-05, "loss": 0.7387, "step": 3570 }, { "epoch": 0.09, "grad_norm": 4.531425952911377, "learning_rate": 1.3515789473684211e-05, "loss": 0.7559, "step": 3580 }, { "epoch": 0.09, "grad_norm": 9.721296310424805, "learning_rate": 1.3494736842105265e-05, "loss": 0.7816, "step": 3590 }, { "epoch": 0.09, "grad_norm": 6.60942268371582, "learning_rate": 1.3473684210526316e-05, "loss": 0.8203, "step": 3600 }, { "epoch": 0.09, "grad_norm": 2.4415578842163086, "learning_rate": 1.345263157894737e-05, "loss": 0.6731, "step": 3610 }, { "epoch": 0.09, "grad_norm": 3.322394847869873, "learning_rate": 1.3431578947368421e-05, "loss": 0.8578, "step": 3620 }, { "epoch": 0.09, "grad_norm": 2.429548978805542, "learning_rate": 1.3410526315789474e-05, "loss": 0.7576, "step": 3630 }, { "epoch": 0.09, "grad_norm": 3.9194421768188477, "learning_rate": 1.3389473684210528e-05, "loss": 0.8131, "step": 3640 }, { "epoch": 0.09, "grad_norm": 3.6766152381896973, "learning_rate": 1.336842105263158e-05, "loss": 0.9391, "step": 3650 }, { "epoch": 0.09, "grad_norm": 3.4084839820861816, "learning_rate": 1.3347368421052634e-05, "loss": 0.8242, "step": 3660 }, { "epoch": 0.09, "grad_norm": 3.2697949409484863, "learning_rate": 1.3326315789473685e-05, "loss": 0.7534, "step": 3670 }, { "epoch": 0.09, "grad_norm": 3.7821884155273438, "learning_rate": 1.3305263157894739e-05, "loss": 0.7237, "step": 3680 }, { "epoch": 0.09, "grad_norm": 4.925840854644775, "learning_rate": 1.328421052631579e-05, "loss": 0.8194, "step": 3690 }, { "epoch": 0.09, "grad_norm": 4.694246768951416, "learning_rate": 1.3263157894736843e-05, "loss": 0.7628, "step": 3700 }, { "epoch": 0.09, "grad_norm": 7.358584403991699, "learning_rate": 1.3242105263157895e-05, "loss": 0.9161, "step": 3710 }, { "epoch": 0.09, "grad_norm": 2.007431983947754, "learning_rate": 1.3221052631578948e-05, "loss": 0.6624, "step": 3720 }, { "epoch": 0.09, "grad_norm": 2.7626278400421143, "learning_rate": 1.3200000000000002e-05, "loss": 0.7662, "step": 3730 }, { "epoch": 0.09, "grad_norm": 2.4226157665252686, "learning_rate": 1.3178947368421053e-05, "loss": 0.809, "step": 3740 }, { "epoch": 0.09, "grad_norm": 3.5735135078430176, "learning_rate": 1.3157894736842108e-05, "loss": 0.6769, "step": 3750 }, { "epoch": 0.09, "grad_norm": 2.4251084327697754, "learning_rate": 1.3136842105263159e-05, "loss": 0.7785, "step": 3760 }, { "epoch": 0.09, "grad_norm": 2.311429977416992, "learning_rate": 1.3115789473684211e-05, "loss": 0.7536, "step": 3770 }, { "epoch": 0.09, "grad_norm": 3.2348074913024902, "learning_rate": 1.3094736842105264e-05, "loss": 0.8138, "step": 3780 }, { "epoch": 0.09, "grad_norm": 7.259544372558594, "learning_rate": 1.3073684210526317e-05, "loss": 0.8, "step": 3790 }, { "epoch": 0.1, "grad_norm": 3.99937105178833, "learning_rate": 1.305263157894737e-05, "loss": 0.7894, "step": 3800 }, { "epoch": 0.1, "grad_norm": 10.336478233337402, "learning_rate": 1.3031578947368422e-05, "loss": 0.7928, "step": 3810 }, { "epoch": 0.1, "grad_norm": 4.500198841094971, "learning_rate": 1.3010526315789476e-05, "loss": 0.7916, "step": 3820 }, { "epoch": 0.1, "grad_norm": 3.115521192550659, "learning_rate": 1.2989473684210527e-05, "loss": 0.7585, "step": 3830 }, { "epoch": 0.1, "grad_norm": 8.954665184020996, "learning_rate": 1.2968421052631578e-05, "loss": 0.7727, "step": 3840 }, { "epoch": 0.1, "grad_norm": 4.874253273010254, "learning_rate": 1.2947368421052633e-05, "loss": 0.7903, "step": 3850 }, { "epoch": 0.1, "grad_norm": 3.151484966278076, "learning_rate": 1.2926315789473685e-05, "loss": 0.7199, "step": 3860 }, { "epoch": 0.1, "grad_norm": 10.117889404296875, "learning_rate": 1.2905263157894738e-05, "loss": 0.7562, "step": 3870 }, { "epoch": 0.1, "grad_norm": 2.595205307006836, "learning_rate": 1.288421052631579e-05, "loss": 0.8167, "step": 3880 }, { "epoch": 0.1, "grad_norm": 3.4744372367858887, "learning_rate": 1.2863157894736845e-05, "loss": 0.7343, "step": 3890 }, { "epoch": 0.1, "grad_norm": 3.1740803718566895, "learning_rate": 1.2842105263157896e-05, "loss": 0.8754, "step": 3900 }, { "epoch": 0.1, "grad_norm": 7.299022197723389, "learning_rate": 1.2821052631578947e-05, "loss": 0.7379, "step": 3910 }, { "epoch": 0.1, "grad_norm": 3.2339208126068115, "learning_rate": 1.2800000000000001e-05, "loss": 0.771, "step": 3920 }, { "epoch": 0.1, "grad_norm": 2.5612077713012695, "learning_rate": 1.2778947368421054e-05, "loss": 0.7959, "step": 3930 }, { "epoch": 0.1, "grad_norm": 4.87350606918335, "learning_rate": 1.2757894736842106e-05, "loss": 0.7871, "step": 3940 }, { "epoch": 0.1, "grad_norm": 3.8318493366241455, "learning_rate": 1.2736842105263159e-05, "loss": 0.7502, "step": 3950 }, { "epoch": 0.1, "grad_norm": 4.797230243682861, "learning_rate": 1.2715789473684212e-05, "loss": 0.7241, "step": 3960 }, { "epoch": 0.1, "grad_norm": 4.037790775299072, "learning_rate": 1.2694736842105264e-05, "loss": 0.8642, "step": 3970 }, { "epoch": 0.1, "grad_norm": 4.736443042755127, "learning_rate": 1.2673684210526315e-05, "loss": 0.7672, "step": 3980 }, { "epoch": 0.1, "grad_norm": 3.449172258377075, "learning_rate": 1.265263157894737e-05, "loss": 0.7685, "step": 3990 }, { "epoch": 0.1, "grad_norm": 2.1893362998962402, "learning_rate": 1.263157894736842e-05, "loss": 0.6876, "step": 4000 }, { "epoch": 0.1, "eval_loss": 0.8031703233718872, "eval_runtime": 67.9677, "eval_samples_per_second": 14.713, "eval_steps_per_second": 14.713, "step": 4000 }, { "epoch": 0.1, "grad_norm": 3.5685079097747803, "learning_rate": 1.2610526315789475e-05, "loss": 0.7317, "step": 4010 }, { "epoch": 0.1, "grad_norm": 6.440120220184326, "learning_rate": 1.2589473684210528e-05, "loss": 0.7919, "step": 4020 }, { "epoch": 0.1, "grad_norm": 5.1870341300964355, "learning_rate": 1.256842105263158e-05, "loss": 0.6921, "step": 4030 }, { "epoch": 0.1, "grad_norm": 4.161406517028809, "learning_rate": 1.2547368421052633e-05, "loss": 0.7822, "step": 4040 }, { "epoch": 0.1, "grad_norm": 6.242280006408691, "learning_rate": 1.2526315789473684e-05, "loss": 0.8151, "step": 4050 }, { "epoch": 0.1, "grad_norm": 3.825861692428589, "learning_rate": 1.2505263157894738e-05, "loss": 0.7709, "step": 4060 }, { "epoch": 0.1, "grad_norm": 22.97239112854004, "learning_rate": 1.248421052631579e-05, "loss": 0.8456, "step": 4070 }, { "epoch": 0.1, "grad_norm": 3.718015193939209, "learning_rate": 1.2463157894736844e-05, "loss": 0.8354, "step": 4080 }, { "epoch": 0.1, "grad_norm": 3.266710042953491, "learning_rate": 1.2442105263157895e-05, "loss": 0.6444, "step": 4090 }, { "epoch": 0.1, "grad_norm": 4.711140155792236, "learning_rate": 1.2421052631578949e-05, "loss": 0.8418, "step": 4100 }, { "epoch": 0.1, "grad_norm": 2.8773484230041504, "learning_rate": 1.2400000000000002e-05, "loss": 0.6673, "step": 4110 }, { "epoch": 0.1, "grad_norm": 4.492387771606445, "learning_rate": 1.2378947368421053e-05, "loss": 0.7801, "step": 4120 }, { "epoch": 0.1, "grad_norm": 4.106402397155762, "learning_rate": 1.2357894736842107e-05, "loss": 0.718, "step": 4130 }, { "epoch": 0.1, "grad_norm": 4.770216941833496, "learning_rate": 1.2336842105263158e-05, "loss": 0.7546, "step": 4140 }, { "epoch": 0.1, "grad_norm": 3.7071616649627686, "learning_rate": 1.2315789473684212e-05, "loss": 0.8232, "step": 4150 }, { "epoch": 0.1, "grad_norm": 11.786856651306152, "learning_rate": 1.2294736842105263e-05, "loss": 0.845, "step": 4160 }, { "epoch": 0.1, "grad_norm": 2.190443515777588, "learning_rate": 1.2273684210526317e-05, "loss": 0.7656, "step": 4170 }, { "epoch": 0.1, "grad_norm": 6.3326239585876465, "learning_rate": 1.225263157894737e-05, "loss": 0.8145, "step": 4180 }, { "epoch": 0.1, "grad_norm": 2.735156297683716, "learning_rate": 1.2231578947368421e-05, "loss": 0.7637, "step": 4190 }, { "epoch": 0.1, "grad_norm": 44.92083740234375, "learning_rate": 1.2210526315789475e-05, "loss": 0.8358, "step": 4200 }, { "epoch": 0.11, "grad_norm": 5.335235595703125, "learning_rate": 1.2189473684210526e-05, "loss": 0.8564, "step": 4210 }, { "epoch": 0.11, "grad_norm": 6.2740349769592285, "learning_rate": 1.216842105263158e-05, "loss": 0.8443, "step": 4220 }, { "epoch": 0.11, "grad_norm": 5.290927410125732, "learning_rate": 1.2147368421052632e-05, "loss": 0.8041, "step": 4230 }, { "epoch": 0.11, "grad_norm": 9.460419654846191, "learning_rate": 1.2126315789473686e-05, "loss": 0.8054, "step": 4240 }, { "epoch": 0.11, "grad_norm": 3.963223934173584, "learning_rate": 1.2105263157894737e-05, "loss": 0.8104, "step": 4250 }, { "epoch": 0.11, "grad_norm": 5.091956615447998, "learning_rate": 1.208421052631579e-05, "loss": 0.7156, "step": 4260 }, { "epoch": 0.11, "grad_norm": 3.055617570877075, "learning_rate": 1.2063157894736844e-05, "loss": 0.835, "step": 4270 }, { "epoch": 0.11, "grad_norm": 3.4951014518737793, "learning_rate": 1.2042105263157895e-05, "loss": 0.8004, "step": 4280 }, { "epoch": 0.11, "grad_norm": 2.876716136932373, "learning_rate": 1.202105263157895e-05, "loss": 0.7324, "step": 4290 }, { "epoch": 0.11, "grad_norm": 2.0460751056671143, "learning_rate": 1.2e-05, "loss": 0.7856, "step": 4300 }, { "epoch": 0.11, "grad_norm": 2.417022943496704, "learning_rate": 1.1978947368421055e-05, "loss": 0.7649, "step": 4310 }, { "epoch": 0.11, "grad_norm": 5.435426235198975, "learning_rate": 1.1957894736842106e-05, "loss": 0.8354, "step": 4320 }, { "epoch": 0.11, "grad_norm": 3.5707461833953857, "learning_rate": 1.1936842105263158e-05, "loss": 0.7264, "step": 4330 }, { "epoch": 0.11, "grad_norm": 4.077741622924805, "learning_rate": 1.1915789473684211e-05, "loss": 0.6934, "step": 4340 }, { "epoch": 0.11, "grad_norm": 5.498834133148193, "learning_rate": 1.1894736842105264e-05, "loss": 0.7687, "step": 4350 }, { "epoch": 0.11, "grad_norm": 3.684948205947876, "learning_rate": 1.1873684210526318e-05, "loss": 0.9371, "step": 4360 }, { "epoch": 0.11, "grad_norm": 2.4451920986175537, "learning_rate": 1.1852631578947369e-05, "loss": 0.7904, "step": 4370 }, { "epoch": 0.11, "grad_norm": 3.1316657066345215, "learning_rate": 1.1831578947368423e-05, "loss": 0.6835, "step": 4380 }, { "epoch": 0.11, "grad_norm": 3.741140127182007, "learning_rate": 1.1810526315789474e-05, "loss": 0.8932, "step": 4390 }, { "epoch": 0.11, "grad_norm": 4.40395975112915, "learning_rate": 1.1789473684210527e-05, "loss": 0.9044, "step": 4400 }, { "epoch": 0.11, "grad_norm": 3.406174898147583, "learning_rate": 1.176842105263158e-05, "loss": 0.7924, "step": 4410 }, { "epoch": 0.11, "grad_norm": 6.892871379852295, "learning_rate": 1.1747368421052632e-05, "loss": 0.7356, "step": 4420 }, { "epoch": 0.11, "grad_norm": 5.855538368225098, "learning_rate": 1.1726315789473685e-05, "loss": 0.7543, "step": 4430 }, { "epoch": 0.11, "grad_norm": 4.951474666595459, "learning_rate": 1.1705263157894737e-05, "loss": 0.7416, "step": 4440 }, { "epoch": 0.11, "grad_norm": 3.314338207244873, "learning_rate": 1.1684210526315792e-05, "loss": 0.7764, "step": 4450 }, { "epoch": 0.11, "grad_norm": 5.176599502563477, "learning_rate": 1.1663157894736843e-05, "loss": 0.7658, "step": 4460 }, { "epoch": 0.11, "grad_norm": 5.275913238525391, "learning_rate": 1.1642105263157897e-05, "loss": 0.6621, "step": 4470 }, { "epoch": 0.11, "grad_norm": 3.1267452239990234, "learning_rate": 1.1621052631578948e-05, "loss": 0.7699, "step": 4480 }, { "epoch": 0.11, "grad_norm": 8.313840866088867, "learning_rate": 1.16e-05, "loss": 0.7262, "step": 4490 }, { "epoch": 0.11, "grad_norm": 8.310973167419434, "learning_rate": 1.1578947368421053e-05, "loss": 0.8871, "step": 4500 }, { "epoch": 0.11, "eval_loss": 0.7730265855789185, "eval_runtime": 67.9868, "eval_samples_per_second": 14.709, "eval_steps_per_second": 14.709, "step": 4500 }, { "epoch": 0.11, "grad_norm": 3.4506046772003174, "learning_rate": 1.1557894736842106e-05, "loss": 0.7579, "step": 4510 }, { "epoch": 0.11, "grad_norm": 3.864931106567383, "learning_rate": 1.153684210526316e-05, "loss": 0.899, "step": 4520 }, { "epoch": 0.11, "grad_norm": 5.998289108276367, "learning_rate": 1.1515789473684211e-05, "loss": 0.7924, "step": 4530 }, { "epoch": 0.11, "grad_norm": 4.859367370605469, "learning_rate": 1.1494736842105266e-05, "loss": 0.773, "step": 4540 }, { "epoch": 0.11, "grad_norm": 3.9851796627044678, "learning_rate": 1.1473684210526317e-05, "loss": 0.8665, "step": 4550 }, { "epoch": 0.11, "grad_norm": 5.357670783996582, "learning_rate": 1.145263157894737e-05, "loss": 0.7522, "step": 4560 }, { "epoch": 0.11, "grad_norm": 3.778637409210205, "learning_rate": 1.1431578947368422e-05, "loss": 0.6188, "step": 4570 }, { "epoch": 0.11, "grad_norm": 8.546213150024414, "learning_rate": 1.1410526315789475e-05, "loss": 0.7296, "step": 4580 }, { "epoch": 0.11, "grad_norm": 2.5466620922088623, "learning_rate": 1.1389473684210527e-05, "loss": 0.7774, "step": 4590 }, { "epoch": 0.12, "grad_norm": 4.85372257232666, "learning_rate": 1.136842105263158e-05, "loss": 0.7884, "step": 4600 }, { "epoch": 0.12, "grad_norm": 3.1757266521453857, "learning_rate": 1.1347368421052634e-05, "loss": 0.8008, "step": 4610 }, { "epoch": 0.12, "grad_norm": 2.7544124126434326, "learning_rate": 1.1326315789473685e-05, "loss": 0.7322, "step": 4620 }, { "epoch": 0.12, "grad_norm": 7.456575870513916, "learning_rate": 1.1305263157894736e-05, "loss": 0.7978, "step": 4630 }, { "epoch": 0.12, "grad_norm": 2.9789164066314697, "learning_rate": 1.128421052631579e-05, "loss": 0.617, "step": 4640 }, { "epoch": 0.12, "grad_norm": 4.361474514007568, "learning_rate": 1.1263157894736843e-05, "loss": 0.8214, "step": 4650 }, { "epoch": 0.12, "grad_norm": 14.45222282409668, "learning_rate": 1.1242105263157896e-05, "loss": 0.7183, "step": 4660 }, { "epoch": 0.12, "grad_norm": 4.7697906494140625, "learning_rate": 1.1221052631578949e-05, "loss": 0.729, "step": 4670 }, { "epoch": 0.12, "grad_norm": 4.225655555725098, "learning_rate": 1.1200000000000001e-05, "loss": 0.7627, "step": 4680 }, { "epoch": 0.12, "grad_norm": 4.067778587341309, "learning_rate": 1.1178947368421054e-05, "loss": 0.78, "step": 4690 }, { "epoch": 0.12, "grad_norm": 4.5654473304748535, "learning_rate": 1.1157894736842105e-05, "loss": 0.7178, "step": 4700 }, { "epoch": 0.12, "grad_norm": 1.7385423183441162, "learning_rate": 1.1136842105263159e-05, "loss": 0.9387, "step": 4710 }, { "epoch": 0.12, "grad_norm": 4.847338676452637, "learning_rate": 1.111578947368421e-05, "loss": 0.7951, "step": 4720 }, { "epoch": 0.12, "grad_norm": 2.739323377609253, "learning_rate": 1.1094736842105264e-05, "loss": 0.8198, "step": 4730 }, { "epoch": 0.12, "grad_norm": 5.23370885848999, "learning_rate": 1.1073684210526317e-05, "loss": 0.7462, "step": 4740 }, { "epoch": 0.12, "grad_norm": 4.970132350921631, "learning_rate": 1.105263157894737e-05, "loss": 0.6983, "step": 4750 }, { "epoch": 0.12, "grad_norm": 3.8072540760040283, "learning_rate": 1.1031578947368422e-05, "loss": 0.852, "step": 4760 }, { "epoch": 0.12, "grad_norm": 2.734208345413208, "learning_rate": 1.1010526315789473e-05, "loss": 0.8621, "step": 4770 }, { "epoch": 0.12, "grad_norm": 3.022127151489258, "learning_rate": 1.0989473684210528e-05, "loss": 0.7652, "step": 4780 }, { "epoch": 0.12, "grad_norm": 7.284844875335693, "learning_rate": 1.0968421052631579e-05, "loss": 0.7901, "step": 4790 }, { "epoch": 0.12, "grad_norm": 6.52205753326416, "learning_rate": 1.0947368421052633e-05, "loss": 0.8347, "step": 4800 }, { "epoch": 0.12, "grad_norm": 3.1662251949310303, "learning_rate": 1.0926315789473686e-05, "loss": 0.6105, "step": 4810 }, { "epoch": 0.12, "grad_norm": 6.027661323547363, "learning_rate": 1.0905263157894738e-05, "loss": 0.7447, "step": 4820 }, { "epoch": 0.12, "grad_norm": 9.989821434020996, "learning_rate": 1.0884210526315791e-05, "loss": 0.8144, "step": 4830 }, { "epoch": 0.12, "grad_norm": 3.886387825012207, "learning_rate": 1.0863157894736842e-05, "loss": 0.7702, "step": 4840 }, { "epoch": 0.12, "grad_norm": 8.8762845993042, "learning_rate": 1.0842105263157896e-05, "loss": 0.7305, "step": 4850 }, { "epoch": 0.12, "grad_norm": 5.934712886810303, "learning_rate": 1.0821052631578947e-05, "loss": 0.7849, "step": 4860 }, { "epoch": 0.12, "grad_norm": 2.3684771060943604, "learning_rate": 1.0800000000000002e-05, "loss": 0.8292, "step": 4870 }, { "epoch": 0.12, "grad_norm": 10.528717041015625, "learning_rate": 1.0778947368421053e-05, "loss": 0.8637, "step": 4880 }, { "epoch": 0.12, "grad_norm": 10.721526145935059, "learning_rate": 1.0757894736842107e-05, "loss": 0.7756, "step": 4890 }, { "epoch": 0.12, "grad_norm": 4.99760103225708, "learning_rate": 1.073684210526316e-05, "loss": 0.7928, "step": 4900 }, { "epoch": 0.12, "grad_norm": 8.126914978027344, "learning_rate": 1.071578947368421e-05, "loss": 0.7833, "step": 4910 }, { "epoch": 0.12, "grad_norm": 2.2220332622528076, "learning_rate": 1.0694736842105265e-05, "loss": 0.7819, "step": 4920 }, { "epoch": 0.12, "grad_norm": 7.0100321769714355, "learning_rate": 1.0673684210526316e-05, "loss": 0.8533, "step": 4930 }, { "epoch": 0.12, "grad_norm": 3.5343334674835205, "learning_rate": 1.065263157894737e-05, "loss": 0.7053, "step": 4940 }, { "epoch": 0.12, "grad_norm": 4.862158298492432, "learning_rate": 1.0631578947368421e-05, "loss": 0.7556, "step": 4950 }, { "epoch": 0.12, "grad_norm": 10.008291244506836, "learning_rate": 1.0610526315789476e-05, "loss": 0.8381, "step": 4960 }, { "epoch": 0.12, "grad_norm": 2.455188035964966, "learning_rate": 1.0589473684210526e-05, "loss": 0.749, "step": 4970 }, { "epoch": 0.12, "grad_norm": 5.882299423217773, "learning_rate": 1.0568421052631579e-05, "loss": 0.7797, "step": 4980 }, { "epoch": 0.12, "grad_norm": 5.7382001876831055, "learning_rate": 1.0547368421052633e-05, "loss": 0.8191, "step": 4990 }, { "epoch": 0.12, "grad_norm": 3.9528167247772217, "learning_rate": 1.0526315789473684e-05, "loss": 0.6382, "step": 5000 }, { "epoch": 0.12, "eval_loss": 0.7771185040473938, "eval_runtime": 67.9924, "eval_samples_per_second": 14.708, "eval_steps_per_second": 14.708, "step": 5000 }, { "epoch": 0.13, "grad_norm": 3.4830055236816406, "learning_rate": 1.0505263157894739e-05, "loss": 0.9099, "step": 5010 }, { "epoch": 0.13, "grad_norm": 5.7392096519470215, "learning_rate": 1.048421052631579e-05, "loss": 0.6423, "step": 5020 }, { "epoch": 0.13, "grad_norm": 2.720612049102783, "learning_rate": 1.0463157894736844e-05, "loss": 0.7826, "step": 5030 }, { "epoch": 0.13, "grad_norm": 3.0437145233154297, "learning_rate": 1.0442105263157895e-05, "loss": 0.7865, "step": 5040 }, { "epoch": 0.13, "grad_norm": 8.835311889648438, "learning_rate": 1.0421052631578948e-05, "loss": 0.7778, "step": 5050 }, { "epoch": 0.13, "grad_norm": 7.596973419189453, "learning_rate": 1.04e-05, "loss": 0.7381, "step": 5060 }, { "epoch": 0.13, "grad_norm": 4.108314037322998, "learning_rate": 1.0378947368421053e-05, "loss": 0.7689, "step": 5070 }, { "epoch": 0.13, "grad_norm": 3.865196704864502, "learning_rate": 1.0357894736842107e-05, "loss": 0.7785, "step": 5080 }, { "epoch": 0.13, "grad_norm": 3.4403493404388428, "learning_rate": 1.0336842105263158e-05, "loss": 0.8322, "step": 5090 }, { "epoch": 0.13, "grad_norm": 3.243029832839966, "learning_rate": 1.0315789473684213e-05, "loss": 0.6658, "step": 5100 }, { "epoch": 0.13, "grad_norm": 3.806818962097168, "learning_rate": 1.0294736842105264e-05, "loss": 0.781, "step": 5110 }, { "epoch": 0.13, "grad_norm": 3.820622205734253, "learning_rate": 1.0273684210526316e-05, "loss": 0.7499, "step": 5120 }, { "epoch": 0.13, "grad_norm": 4.203964710235596, "learning_rate": 1.0252631578947369e-05, "loss": 0.7702, "step": 5130 }, { "epoch": 0.13, "grad_norm": 2.803215503692627, "learning_rate": 1.0231578947368422e-05, "loss": 0.6291, "step": 5140 }, { "epoch": 0.13, "grad_norm": 5.486114978790283, "learning_rate": 1.0210526315789476e-05, "loss": 0.8124, "step": 5150 }, { "epoch": 0.13, "grad_norm": 7.74938440322876, "learning_rate": 1.0189473684210527e-05, "loss": 0.7735, "step": 5160 }, { "epoch": 0.13, "grad_norm": 4.10128116607666, "learning_rate": 1.0168421052631581e-05, "loss": 0.6809, "step": 5170 }, { "epoch": 0.13, "grad_norm": 6.844088554382324, "learning_rate": 1.0147368421052632e-05, "loss": 0.8294, "step": 5180 }, { "epoch": 0.13, "grad_norm": 4.329681873321533, "learning_rate": 1.0126315789473685e-05, "loss": 0.861, "step": 5190 }, { "epoch": 0.13, "grad_norm": 12.482446670532227, "learning_rate": 1.0105263157894738e-05, "loss": 0.7346, "step": 5200 }, { "epoch": 0.13, "grad_norm": 1.8471055030822754, "learning_rate": 1.008421052631579e-05, "loss": 0.7714, "step": 5210 }, { "epoch": 0.13, "grad_norm": 3.1509273052215576, "learning_rate": 1.0063157894736843e-05, "loss": 0.697, "step": 5220 }, { "epoch": 0.13, "grad_norm": 4.524876117706299, "learning_rate": 1.0042105263157896e-05, "loss": 0.8373, "step": 5230 }, { "epoch": 0.13, "grad_norm": 2.7305006980895996, "learning_rate": 1.002105263157895e-05, "loss": 0.7182, "step": 5240 }, { "epoch": 0.13, "grad_norm": 2.5194203853607178, "learning_rate": 1e-05, "loss": 0.794, "step": 5250 }, { "epoch": 0.13, "grad_norm": 14.967845916748047, "learning_rate": 9.978947368421053e-06, "loss": 0.7564, "step": 5260 }, { "epoch": 0.13, "grad_norm": 1.8730751276016235, "learning_rate": 9.957894736842106e-06, "loss": 0.726, "step": 5270 }, { "epoch": 0.13, "grad_norm": 2.1793789863586426, "learning_rate": 9.936842105263159e-06, "loss": 0.7019, "step": 5280 }, { "epoch": 0.13, "grad_norm": 5.0785651206970215, "learning_rate": 9.915789473684211e-06, "loss": 0.7771, "step": 5290 }, { "epoch": 0.13, "grad_norm": 9.810837745666504, "learning_rate": 9.894736842105264e-06, "loss": 0.7542, "step": 5300 }, { "epoch": 0.13, "grad_norm": 24.654855728149414, "learning_rate": 9.873684210526317e-06, "loss": 0.7928, "step": 5310 }, { "epoch": 0.13, "grad_norm": 3.083669424057007, "learning_rate": 9.85263157894737e-06, "loss": 0.8091, "step": 5320 }, { "epoch": 0.13, "grad_norm": 3.9507665634155273, "learning_rate": 9.831578947368422e-06, "loss": 0.7548, "step": 5330 }, { "epoch": 0.13, "grad_norm": 2.55362606048584, "learning_rate": 9.810526315789475e-06, "loss": 0.7804, "step": 5340 }, { "epoch": 0.13, "grad_norm": 3.572410821914673, "learning_rate": 9.789473684210527e-06, "loss": 0.748, "step": 5350 }, { "epoch": 0.13, "grad_norm": 3.70060658454895, "learning_rate": 9.76842105263158e-06, "loss": 0.7303, "step": 5360 }, { "epoch": 0.13, "grad_norm": 3.397512674331665, "learning_rate": 9.747368421052633e-06, "loss": 0.7209, "step": 5370 }, { "epoch": 0.13, "grad_norm": 2.797943592071533, "learning_rate": 9.726315789473685e-06, "loss": 0.9082, "step": 5380 }, { "epoch": 0.13, "grad_norm": 9.164168357849121, "learning_rate": 9.705263157894738e-06, "loss": 0.7995, "step": 5390 }, { "epoch": 0.14, "grad_norm": 6.297326564788818, "learning_rate": 9.68421052631579e-06, "loss": 0.7484, "step": 5400 }, { "epoch": 0.14, "grad_norm": 12.500905990600586, "learning_rate": 9.663157894736843e-06, "loss": 0.7291, "step": 5410 }, { "epoch": 0.14, "grad_norm": 3.1083016395568848, "learning_rate": 9.642105263157896e-06, "loss": 0.8064, "step": 5420 }, { "epoch": 0.14, "grad_norm": 4.058903694152832, "learning_rate": 9.621052631578947e-06, "loss": 0.7087, "step": 5430 }, { "epoch": 0.14, "grad_norm": 5.303778648376465, "learning_rate": 9.600000000000001e-06, "loss": 0.6257, "step": 5440 }, { "epoch": 0.14, "grad_norm": 2.8508620262145996, "learning_rate": 9.578947368421054e-06, "loss": 0.7423, "step": 5450 }, { "epoch": 0.14, "grad_norm": 5.9560956954956055, "learning_rate": 9.557894736842107e-06, "loss": 0.7304, "step": 5460 }, { "epoch": 0.14, "grad_norm": 2.8841540813446045, "learning_rate": 9.53684210526316e-06, "loss": 0.7768, "step": 5470 }, { "epoch": 0.14, "grad_norm": 2.6742358207702637, "learning_rate": 9.515789473684212e-06, "loss": 0.7618, "step": 5480 }, { "epoch": 0.14, "grad_norm": 4.105114936828613, "learning_rate": 9.494736842105265e-06, "loss": 0.7086, "step": 5490 }, { "epoch": 0.14, "grad_norm": 4.728137493133545, "learning_rate": 9.473684210526315e-06, "loss": 0.8313, "step": 5500 }, { "epoch": 0.14, "eval_loss": 0.7711445689201355, "eval_runtime": 67.9047, "eval_samples_per_second": 14.727, "eval_steps_per_second": 14.727, "step": 5500 }, { "epoch": 0.14, "grad_norm": 4.539173603057861, "learning_rate": 9.452631578947368e-06, "loss": 0.7231, "step": 5510 }, { "epoch": 0.14, "grad_norm": 4.742118835449219, "learning_rate": 9.43157894736842e-06, "loss": 0.8199, "step": 5520 }, { "epoch": 0.14, "grad_norm": 5.9068603515625, "learning_rate": 9.410526315789475e-06, "loss": 0.7615, "step": 5530 }, { "epoch": 0.14, "grad_norm": 7.106772422790527, "learning_rate": 9.389473684210528e-06, "loss": 0.7139, "step": 5540 }, { "epoch": 0.14, "grad_norm": 2.272012710571289, "learning_rate": 9.36842105263158e-06, "loss": 0.6264, "step": 5550 }, { "epoch": 0.14, "grad_norm": 14.025699615478516, "learning_rate": 9.347368421052633e-06, "loss": 0.7416, "step": 5560 }, { "epoch": 0.14, "grad_norm": 12.747345924377441, "learning_rate": 9.326315789473684e-06, "loss": 0.781, "step": 5570 }, { "epoch": 0.14, "grad_norm": 7.966195106506348, "learning_rate": 9.305263157894737e-06, "loss": 0.7503, "step": 5580 }, { "epoch": 0.14, "grad_norm": 3.3705811500549316, "learning_rate": 9.28421052631579e-06, "loss": 0.7704, "step": 5590 }, { "epoch": 0.14, "grad_norm": 5.239542007446289, "learning_rate": 9.263157894736842e-06, "loss": 0.6806, "step": 5600 }, { "epoch": 0.14, "grad_norm": 6.395047187805176, "learning_rate": 9.242105263157896e-06, "loss": 0.6961, "step": 5610 }, { "epoch": 0.14, "grad_norm": 3.807992458343506, "learning_rate": 9.221052631578949e-06, "loss": 0.769, "step": 5620 }, { "epoch": 0.14, "grad_norm": 3.8179049491882324, "learning_rate": 9.200000000000002e-06, "loss": 0.7515, "step": 5630 }, { "epoch": 0.14, "grad_norm": 4.826687812805176, "learning_rate": 9.178947368421053e-06, "loss": 0.7337, "step": 5640 }, { "epoch": 0.14, "grad_norm": 4.776168346405029, "learning_rate": 9.157894736842105e-06, "loss": 0.7173, "step": 5650 }, { "epoch": 0.14, "grad_norm": 4.10529088973999, "learning_rate": 9.136842105263158e-06, "loss": 0.7255, "step": 5660 }, { "epoch": 0.14, "grad_norm": 5.4715189933776855, "learning_rate": 9.11578947368421e-06, "loss": 0.8092, "step": 5670 }, { "epoch": 0.14, "grad_norm": 3.8921728134155273, "learning_rate": 9.094736842105263e-06, "loss": 0.6684, "step": 5680 }, { "epoch": 0.14, "grad_norm": 5.904684066772461, "learning_rate": 9.073684210526316e-06, "loss": 0.7804, "step": 5690 }, { "epoch": 0.14, "grad_norm": 9.521209716796875, "learning_rate": 9.05263157894737e-06, "loss": 0.793, "step": 5700 }, { "epoch": 0.14, "grad_norm": 11.125286102294922, "learning_rate": 9.031578947368423e-06, "loss": 0.8254, "step": 5710 }, { "epoch": 0.14, "grad_norm": 8.136049270629883, "learning_rate": 9.010526315789474e-06, "loss": 0.7475, "step": 5720 }, { "epoch": 0.14, "grad_norm": 2.4722092151641846, "learning_rate": 8.989473684210527e-06, "loss": 0.7268, "step": 5730 }, { "epoch": 0.14, "grad_norm": 3.330580711364746, "learning_rate": 8.96842105263158e-06, "loss": 0.7995, "step": 5740 }, { "epoch": 0.14, "grad_norm": 25.711868286132812, "learning_rate": 8.947368421052632e-06, "loss": 0.801, "step": 5750 }, { "epoch": 0.14, "grad_norm": 2.3957395553588867, "learning_rate": 8.926315789473685e-06, "loss": 0.6988, "step": 5760 }, { "epoch": 0.14, "grad_norm": 3.033153533935547, "learning_rate": 8.905263157894737e-06, "loss": 0.7378, "step": 5770 }, { "epoch": 0.14, "grad_norm": 4.359398365020752, "learning_rate": 8.884210526315792e-06, "loss": 0.7214, "step": 5780 }, { "epoch": 0.14, "grad_norm": 3.08485746383667, "learning_rate": 8.863157894736842e-06, "loss": 0.7034, "step": 5790 }, { "epoch": 0.14, "grad_norm": 4.156674385070801, "learning_rate": 8.842105263157895e-06, "loss": 0.7833, "step": 5800 }, { "epoch": 0.15, "grad_norm": 4.031563758850098, "learning_rate": 8.821052631578948e-06, "loss": 0.7385, "step": 5810 }, { "epoch": 0.15, "grad_norm": 9.957317352294922, "learning_rate": 8.8e-06, "loss": 0.8572, "step": 5820 }, { "epoch": 0.15, "grad_norm": 3.951910972595215, "learning_rate": 8.778947368421053e-06, "loss": 0.7374, "step": 5830 }, { "epoch": 0.15, "grad_norm": 5.296828746795654, "learning_rate": 8.757894736842106e-06, "loss": 0.7619, "step": 5840 }, { "epoch": 0.15, "grad_norm": 7.079039096832275, "learning_rate": 8.736842105263158e-06, "loss": 0.7842, "step": 5850 }, { "epoch": 0.15, "grad_norm": 4.972481727600098, "learning_rate": 8.715789473684211e-06, "loss": 0.7039, "step": 5860 }, { "epoch": 0.15, "grad_norm": 11.936322212219238, "learning_rate": 8.694736842105264e-06, "loss": 0.6701, "step": 5870 }, { "epoch": 0.15, "grad_norm": 4.164266586303711, "learning_rate": 8.673684210526316e-06, "loss": 0.7481, "step": 5880 }, { "epoch": 0.15, "grad_norm": 4.0412397384643555, "learning_rate": 8.652631578947369e-06, "loss": 0.8783, "step": 5890 }, { "epoch": 0.15, "grad_norm": 13.239718437194824, "learning_rate": 8.631578947368422e-06, "loss": 0.8639, "step": 5900 }, { "epoch": 0.15, "grad_norm": 5.553131103515625, "learning_rate": 8.610526315789474e-06, "loss": 0.7861, "step": 5910 }, { "epoch": 0.15, "grad_norm": 4.507501602172852, "learning_rate": 8.589473684210527e-06, "loss": 0.7526, "step": 5920 }, { "epoch": 0.15, "grad_norm": 3.70124888420105, "learning_rate": 8.56842105263158e-06, "loss": 0.8391, "step": 5930 }, { "epoch": 0.15, "grad_norm": 4.307315349578857, "learning_rate": 8.547368421052632e-06, "loss": 0.7253, "step": 5940 }, { "epoch": 0.15, "grad_norm": 12.232582092285156, "learning_rate": 8.526315789473685e-06, "loss": 0.8559, "step": 5950 }, { "epoch": 0.15, "grad_norm": 3.0924105644226074, "learning_rate": 8.505263157894738e-06, "loss": 0.6245, "step": 5960 }, { "epoch": 0.15, "grad_norm": 2.90191912651062, "learning_rate": 8.48421052631579e-06, "loss": 0.6643, "step": 5970 }, { "epoch": 0.15, "grad_norm": 3.4637041091918945, "learning_rate": 8.463157894736843e-06, "loss": 0.72, "step": 5980 }, { "epoch": 0.15, "grad_norm": 2.8273704051971436, "learning_rate": 8.442105263157896e-06, "loss": 0.7202, "step": 5990 }, { "epoch": 0.15, "grad_norm": 7.119280815124512, "learning_rate": 8.421052631578948e-06, "loss": 0.7047, "step": 6000 }, { "epoch": 0.15, "eval_loss": 0.7685219645500183, "eval_runtime": 67.892, "eval_samples_per_second": 14.729, "eval_steps_per_second": 14.729, "step": 6000 }, { "epoch": 0.15, "grad_norm": 4.9551520347595215, "learning_rate": 8.400000000000001e-06, "loss": 0.6911, "step": 6010 }, { "epoch": 0.15, "grad_norm": 2.9231200218200684, "learning_rate": 8.378947368421054e-06, "loss": 0.7942, "step": 6020 }, { "epoch": 0.15, "grad_norm": 7.254823684692383, "learning_rate": 8.357894736842106e-06, "loss": 0.7811, "step": 6030 }, { "epoch": 0.15, "grad_norm": 3.8563404083251953, "learning_rate": 8.336842105263159e-06, "loss": 0.7523, "step": 6040 }, { "epoch": 0.15, "grad_norm": 3.5061299800872803, "learning_rate": 8.315789473684212e-06, "loss": 0.6222, "step": 6050 }, { "epoch": 0.15, "grad_norm": 3.3213858604431152, "learning_rate": 8.294736842105264e-06, "loss": 0.7617, "step": 6060 }, { "epoch": 0.15, "grad_norm": 5.054555416107178, "learning_rate": 8.273684210526317e-06, "loss": 0.7333, "step": 6070 }, { "epoch": 0.15, "grad_norm": 3.5189318656921387, "learning_rate": 8.25263157894737e-06, "loss": 0.8676, "step": 6080 }, { "epoch": 0.15, "grad_norm": 4.989790439605713, "learning_rate": 8.231578947368422e-06, "loss": 0.6678, "step": 6090 }, { "epoch": 0.15, "grad_norm": 7.941010475158691, "learning_rate": 8.210526315789475e-06, "loss": 0.7317, "step": 6100 }, { "epoch": 0.15, "grad_norm": 6.6499247550964355, "learning_rate": 8.189473684210527e-06, "loss": 0.7484, "step": 6110 }, { "epoch": 0.15, "grad_norm": 3.512948513031006, "learning_rate": 8.16842105263158e-06, "loss": 0.8508, "step": 6120 }, { "epoch": 0.15, "grad_norm": 3.844045400619507, "learning_rate": 8.147368421052633e-06, "loss": 0.7468, "step": 6130 }, { "epoch": 0.15, "grad_norm": 2.620250701904297, "learning_rate": 8.126315789473684e-06, "loss": 0.6449, "step": 6140 }, { "epoch": 0.15, "grad_norm": 3.5233919620513916, "learning_rate": 8.105263157894736e-06, "loss": 0.7928, "step": 6150 }, { "epoch": 0.15, "grad_norm": 4.866186618804932, "learning_rate": 8.08421052631579e-06, "loss": 0.787, "step": 6160 }, { "epoch": 0.15, "grad_norm": 4.392407417297363, "learning_rate": 8.063157894736843e-06, "loss": 0.7746, "step": 6170 }, { "epoch": 0.15, "grad_norm": 6.6285176277160645, "learning_rate": 8.042105263157896e-06, "loss": 0.7304, "step": 6180 }, { "epoch": 0.15, "grad_norm": 2.571240186691284, "learning_rate": 8.021052631578949e-06, "loss": 0.7008, "step": 6190 }, { "epoch": 0.15, "grad_norm": 2.8306283950805664, "learning_rate": 8.000000000000001e-06, "loss": 0.834, "step": 6200 }, { "epoch": 0.16, "grad_norm": 2.5514955520629883, "learning_rate": 7.978947368421052e-06, "loss": 0.8136, "step": 6210 }, { "epoch": 0.16, "grad_norm": 8.471675872802734, "learning_rate": 7.957894736842105e-06, "loss": 0.8439, "step": 6220 }, { "epoch": 0.16, "grad_norm": 8.785553932189941, "learning_rate": 7.936842105263158e-06, "loss": 0.7763, "step": 6230 }, { "epoch": 0.16, "grad_norm": 5.334304332733154, "learning_rate": 7.915789473684212e-06, "loss": 0.7832, "step": 6240 }, { "epoch": 0.16, "grad_norm": 14.861701011657715, "learning_rate": 7.894736842105265e-06, "loss": 0.6889, "step": 6250 }, { "epoch": 0.16, "grad_norm": 2.040034770965576, "learning_rate": 7.873684210526317e-06, "loss": 0.7422, "step": 6260 }, { "epoch": 0.16, "grad_norm": 9.74354076385498, "learning_rate": 7.85263157894737e-06, "loss": 0.7765, "step": 6270 }, { "epoch": 0.16, "grad_norm": 3.4280757904052734, "learning_rate": 7.831578947368421e-06, "loss": 0.7465, "step": 6280 }, { "epoch": 0.16, "grad_norm": 6.530819416046143, "learning_rate": 7.810526315789474e-06, "loss": 0.8216, "step": 6290 }, { "epoch": 0.16, "grad_norm": 6.786412239074707, "learning_rate": 7.789473684210526e-06, "loss": 0.7694, "step": 6300 }, { "epoch": 0.16, "grad_norm": 4.896278381347656, "learning_rate": 7.768421052631579e-06, "loss": 0.8282, "step": 6310 }, { "epoch": 0.16, "grad_norm": 4.5938825607299805, "learning_rate": 7.747368421052631e-06, "loss": 0.6628, "step": 6320 }, { "epoch": 0.16, "grad_norm": 2.134136915206909, "learning_rate": 7.726315789473686e-06, "loss": 0.8061, "step": 6330 }, { "epoch": 0.16, "grad_norm": 7.497835159301758, "learning_rate": 7.705263157894738e-06, "loss": 0.8946, "step": 6340 }, { "epoch": 0.16, "grad_norm": 3.5185306072235107, "learning_rate": 7.68421052631579e-06, "loss": 0.6689, "step": 6350 }, { "epoch": 0.16, "grad_norm": 2.464015245437622, "learning_rate": 7.663157894736842e-06, "loss": 0.7758, "step": 6360 }, { "epoch": 0.16, "grad_norm": 2.803342580795288, "learning_rate": 7.642105263157895e-06, "loss": 0.7478, "step": 6370 }, { "epoch": 0.16, "grad_norm": 6.2652130126953125, "learning_rate": 7.621052631578948e-06, "loss": 0.7293, "step": 6380 }, { "epoch": 0.16, "grad_norm": 9.655146598815918, "learning_rate": 7.600000000000001e-06, "loss": 0.7454, "step": 6390 }, { "epoch": 0.16, "grad_norm": 5.041891574859619, "learning_rate": 7.578947368421054e-06, "loss": 0.8579, "step": 6400 }, { "epoch": 0.16, "grad_norm": 3.133237838745117, "learning_rate": 7.557894736842106e-06, "loss": 0.6662, "step": 6410 }, { "epoch": 0.16, "grad_norm": 7.207560062408447, "learning_rate": 7.536842105263158e-06, "loss": 0.8135, "step": 6420 }, { "epoch": 0.16, "grad_norm": 3.374864101409912, "learning_rate": 7.515789473684211e-06, "loss": 0.7514, "step": 6430 }, { "epoch": 0.16, "grad_norm": 4.067178249359131, "learning_rate": 7.494736842105263e-06, "loss": 0.7446, "step": 6440 }, { "epoch": 0.16, "grad_norm": 4.283421516418457, "learning_rate": 7.473684210526316e-06, "loss": 0.7955, "step": 6450 }, { "epoch": 0.16, "grad_norm": 3.092348098754883, "learning_rate": 7.4526315789473695e-06, "loss": 0.5471, "step": 6460 }, { "epoch": 0.16, "grad_norm": 9.400391578674316, "learning_rate": 7.431578947368422e-06, "loss": 0.7098, "step": 6470 }, { "epoch": 0.16, "grad_norm": 5.843224048614502, "learning_rate": 7.410526315789475e-06, "loss": 0.7943, "step": 6480 }, { "epoch": 0.16, "grad_norm": 3.5985705852508545, "learning_rate": 7.3894736842105275e-06, "loss": 0.8059, "step": 6490 }, { "epoch": 0.16, "grad_norm": 5.502979278564453, "learning_rate": 7.368421052631579e-06, "loss": 0.6236, "step": 6500 }, { "epoch": 0.16, "eval_loss": 0.7682243585586548, "eval_runtime": 67.9039, "eval_samples_per_second": 14.727, "eval_steps_per_second": 14.727, "step": 6500 }, { "epoch": 0.16, "grad_norm": 11.025419235229492, "learning_rate": 7.347368421052632e-06, "loss": 0.8343, "step": 6510 }, { "epoch": 0.16, "grad_norm": 3.4290804862976074, "learning_rate": 7.326315789473685e-06, "loss": 0.7572, "step": 6520 }, { "epoch": 0.16, "grad_norm": 3.0629210472106934, "learning_rate": 7.305263157894737e-06, "loss": 0.8245, "step": 6530 }, { "epoch": 0.16, "grad_norm": 5.065977573394775, "learning_rate": 7.28421052631579e-06, "loss": 0.6447, "step": 6540 }, { "epoch": 0.16, "grad_norm": 3.971541166305542, "learning_rate": 7.263157894736843e-06, "loss": 0.8688, "step": 6550 }, { "epoch": 0.16, "grad_norm": 3.4434573650360107, "learning_rate": 7.242105263157896e-06, "loss": 0.6749, "step": 6560 }, { "epoch": 0.16, "grad_norm": 4.323293685913086, "learning_rate": 7.221052631578948e-06, "loss": 0.7982, "step": 6570 }, { "epoch": 0.16, "grad_norm": 16.821266174316406, "learning_rate": 7.2000000000000005e-06, "loss": 0.7898, "step": 6580 }, { "epoch": 0.16, "grad_norm": 3.008687734603882, "learning_rate": 7.178947368421053e-06, "loss": 0.7375, "step": 6590 }, { "epoch": 0.17, "grad_norm": 3.629837989807129, "learning_rate": 7.157894736842106e-06, "loss": 0.7909, "step": 6600 }, { "epoch": 0.17, "grad_norm": 5.807744026184082, "learning_rate": 7.1368421052631585e-06, "loss": 0.621, "step": 6610 }, { "epoch": 0.17, "grad_norm": 3.9960129261016846, "learning_rate": 7.115789473684211e-06, "loss": 0.851, "step": 6620 }, { "epoch": 0.17, "grad_norm": 2.7165372371673584, "learning_rate": 7.094736842105265e-06, "loss": 0.7872, "step": 6630 }, { "epoch": 0.17, "grad_norm": 5.922586917877197, "learning_rate": 7.073684210526316e-06, "loss": 0.8822, "step": 6640 }, { "epoch": 0.17, "grad_norm": 9.046282768249512, "learning_rate": 7.052631578947369e-06, "loss": 0.7454, "step": 6650 }, { "epoch": 0.17, "grad_norm": 4.76317024230957, "learning_rate": 7.031578947368422e-06, "loss": 0.7116, "step": 6660 }, { "epoch": 0.17, "grad_norm": 4.31531286239624, "learning_rate": 7.010526315789474e-06, "loss": 0.7892, "step": 6670 }, { "epoch": 0.17, "grad_norm": 3.0895297527313232, "learning_rate": 6.989473684210527e-06, "loss": 0.7095, "step": 6680 }, { "epoch": 0.17, "grad_norm": 4.174783706665039, "learning_rate": 6.96842105263158e-06, "loss": 0.8007, "step": 6690 }, { "epoch": 0.17, "grad_norm": 4.1555280685424805, "learning_rate": 6.947368421052632e-06, "loss": 0.8274, "step": 6700 }, { "epoch": 0.17, "grad_norm": 3.173882246017456, "learning_rate": 6.926315789473684e-06, "loss": 0.6447, "step": 6710 }, { "epoch": 0.17, "grad_norm": 2.1489410400390625, "learning_rate": 6.905263157894737e-06, "loss": 0.7428, "step": 6720 }, { "epoch": 0.17, "grad_norm": 2.523904323577881, "learning_rate": 6.8842105263157895e-06, "loss": 0.8159, "step": 6730 }, { "epoch": 0.17, "grad_norm": 1.7494622468948364, "learning_rate": 6.863157894736843e-06, "loss": 0.863, "step": 6740 }, { "epoch": 0.17, "grad_norm": 2.552121639251709, "learning_rate": 6.842105263157896e-06, "loss": 0.7448, "step": 6750 }, { "epoch": 0.17, "grad_norm": 4.1907453536987305, "learning_rate": 6.821052631578948e-06, "loss": 0.6813, "step": 6760 }, { "epoch": 0.17, "grad_norm": 4.284384727478027, "learning_rate": 6.800000000000001e-06, "loss": 0.699, "step": 6770 }, { "epoch": 0.17, "grad_norm": 5.010688781738281, "learning_rate": 6.778947368421053e-06, "loss": 0.7803, "step": 6780 }, { "epoch": 0.17, "grad_norm": 2.5098397731781006, "learning_rate": 6.7578947368421054e-06, "loss": 0.767, "step": 6790 }, { "epoch": 0.17, "grad_norm": 2.8980441093444824, "learning_rate": 6.736842105263158e-06, "loss": 0.8084, "step": 6800 }, { "epoch": 0.17, "grad_norm": 3.8058199882507324, "learning_rate": 6.715789473684211e-06, "loss": 0.7214, "step": 6810 }, { "epoch": 0.17, "grad_norm": 2.3668529987335205, "learning_rate": 6.694736842105264e-06, "loss": 0.6759, "step": 6820 }, { "epoch": 0.17, "grad_norm": 5.715735912322998, "learning_rate": 6.673684210526317e-06, "loss": 0.7747, "step": 6830 }, { "epoch": 0.17, "grad_norm": 8.902985572814941, "learning_rate": 6.6526315789473695e-06, "loss": 0.8256, "step": 6840 }, { "epoch": 0.17, "grad_norm": 5.802920818328857, "learning_rate": 6.631578947368421e-06, "loss": 0.7682, "step": 6850 }, { "epoch": 0.17, "grad_norm": 9.218498229980469, "learning_rate": 6.610526315789474e-06, "loss": 0.7855, "step": 6860 }, { "epoch": 0.17, "grad_norm": 4.406294822692871, "learning_rate": 6.589473684210527e-06, "loss": 0.736, "step": 6870 }, { "epoch": 0.17, "grad_norm": 5.765889644622803, "learning_rate": 6.568421052631579e-06, "loss": 0.7073, "step": 6880 }, { "epoch": 0.17, "grad_norm": 2.910264015197754, "learning_rate": 6.547368421052632e-06, "loss": 0.7328, "step": 6890 }, { "epoch": 0.17, "grad_norm": 9.011739730834961, "learning_rate": 6.526315789473685e-06, "loss": 0.6798, "step": 6900 }, { "epoch": 0.17, "grad_norm": 8.296028137207031, "learning_rate": 6.505263157894738e-06, "loss": 0.7469, "step": 6910 }, { "epoch": 0.17, "grad_norm": 5.347682952880859, "learning_rate": 6.484210526315789e-06, "loss": 0.7143, "step": 6920 }, { "epoch": 0.17, "grad_norm": 5.903685092926025, "learning_rate": 6.463157894736843e-06, "loss": 0.7413, "step": 6930 }, { "epoch": 0.17, "grad_norm": 4.017665386199951, "learning_rate": 6.442105263157895e-06, "loss": 0.7569, "step": 6940 }, { "epoch": 0.17, "grad_norm": 2.3947088718414307, "learning_rate": 6.421052631578948e-06, "loss": 0.75, "step": 6950 }, { "epoch": 0.17, "grad_norm": 4.019251823425293, "learning_rate": 6.4000000000000006e-06, "loss": 0.7364, "step": 6960 }, { "epoch": 0.17, "grad_norm": 2.439628839492798, "learning_rate": 6.378947368421053e-06, "loss": 0.68, "step": 6970 }, { "epoch": 0.17, "grad_norm": 2.413942575454712, "learning_rate": 6.357894736842106e-06, "loss": 0.79, "step": 6980 }, { "epoch": 0.17, "grad_norm": 8.72237491607666, "learning_rate": 6.336842105263158e-06, "loss": 0.6678, "step": 6990 }, { "epoch": 0.17, "grad_norm": 3.9021055698394775, "learning_rate": 6.31578947368421e-06, "loss": 0.7169, "step": 7000 }, { "epoch": 0.17, "eval_loss": 0.7889605164527893, "eval_runtime": 67.8704, "eval_samples_per_second": 14.734, "eval_steps_per_second": 14.734, "step": 7000 }, { "epoch": 0.18, "grad_norm": 8.238909721374512, "learning_rate": 6.294736842105264e-06, "loss": 0.658, "step": 7010 }, { "epoch": 0.18, "grad_norm": 3.403461456298828, "learning_rate": 6.2736842105263165e-06, "loss": 0.8165, "step": 7020 }, { "epoch": 0.18, "grad_norm": 5.648688316345215, "learning_rate": 6.252631578947369e-06, "loss": 0.7506, "step": 7030 }, { "epoch": 0.18, "grad_norm": 2.380591630935669, "learning_rate": 6.231578947368422e-06, "loss": 0.8892, "step": 7040 }, { "epoch": 0.18, "grad_norm": 4.201750755310059, "learning_rate": 6.2105263157894745e-06, "loss": 0.7069, "step": 7050 }, { "epoch": 0.18, "grad_norm": 2.9994821548461914, "learning_rate": 6.189473684210526e-06, "loss": 0.6896, "step": 7060 }, { "epoch": 0.18, "grad_norm": 5.100094318389893, "learning_rate": 6.168421052631579e-06, "loss": 0.6241, "step": 7070 }, { "epoch": 0.18, "grad_norm": 3.88962721824646, "learning_rate": 6.1473684210526316e-06, "loss": 0.741, "step": 7080 }, { "epoch": 0.18, "grad_norm": 3.669283151626587, "learning_rate": 6.126315789473685e-06, "loss": 0.5153, "step": 7090 }, { "epoch": 0.18, "grad_norm": 6.010345458984375, "learning_rate": 6.105263157894738e-06, "loss": 0.7394, "step": 7100 }, { "epoch": 0.18, "grad_norm": 5.333982467651367, "learning_rate": 6.08421052631579e-06, "loss": 0.6423, "step": 7110 }, { "epoch": 0.18, "grad_norm": 2.0060064792633057, "learning_rate": 6.063157894736843e-06, "loss": 0.7073, "step": 7120 }, { "epoch": 0.18, "grad_norm": 3.618821144104004, "learning_rate": 6.042105263157895e-06, "loss": 0.7221, "step": 7130 }, { "epoch": 0.18, "grad_norm": 2.6231422424316406, "learning_rate": 6.0210526315789475e-06, "loss": 0.6748, "step": 7140 }, { "epoch": 0.18, "grad_norm": 7.207015514373779, "learning_rate": 6e-06, "loss": 0.7403, "step": 7150 }, { "epoch": 0.18, "grad_norm": 5.1877031326293945, "learning_rate": 5.978947368421053e-06, "loss": 0.6143, "step": 7160 }, { "epoch": 0.18, "grad_norm": 3.433973550796509, "learning_rate": 5.9578947368421055e-06, "loss": 0.6593, "step": 7170 }, { "epoch": 0.18, "grad_norm": 4.261890888214111, "learning_rate": 5.936842105263159e-06, "loss": 0.7119, "step": 7180 }, { "epoch": 0.18, "grad_norm": 2.4731180667877197, "learning_rate": 5.915789473684212e-06, "loss": 0.7764, "step": 7190 }, { "epoch": 0.18, "grad_norm": 3.540252923965454, "learning_rate": 5.8947368421052634e-06, "loss": 0.788, "step": 7200 }, { "epoch": 0.18, "grad_norm": 16.481884002685547, "learning_rate": 5.873684210526316e-06, "loss": 0.7411, "step": 7210 }, { "epoch": 0.18, "grad_norm": 5.3406548500061035, "learning_rate": 5.852631578947369e-06, "loss": 0.7703, "step": 7220 }, { "epoch": 0.18, "grad_norm": 5.786658763885498, "learning_rate": 5.831578947368421e-06, "loss": 0.7068, "step": 7230 }, { "epoch": 0.18, "grad_norm": 6.659720420837402, "learning_rate": 5.810526315789474e-06, "loss": 0.7287, "step": 7240 }, { "epoch": 0.18, "grad_norm": 2.9273788928985596, "learning_rate": 5.789473684210527e-06, "loss": 0.7059, "step": 7250 }, { "epoch": 0.18, "grad_norm": 5.475671768188477, "learning_rate": 5.76842105263158e-06, "loss": 0.7284, "step": 7260 }, { "epoch": 0.18, "grad_norm": 5.699868202209473, "learning_rate": 5.747368421052633e-06, "loss": 0.8036, "step": 7270 }, { "epoch": 0.18, "grad_norm": 3.518573045730591, "learning_rate": 5.726315789473685e-06, "loss": 0.7209, "step": 7280 }, { "epoch": 0.18, "grad_norm": 8.151999473571777, "learning_rate": 5.705263157894737e-06, "loss": 0.6903, "step": 7290 }, { "epoch": 0.18, "grad_norm": 4.088874340057373, "learning_rate": 5.68421052631579e-06, "loss": 0.7685, "step": 7300 }, { "epoch": 0.18, "grad_norm": 9.118200302124023, "learning_rate": 5.663157894736843e-06, "loss": 0.7256, "step": 7310 }, { "epoch": 0.18, "grad_norm": 6.765544414520264, "learning_rate": 5.642105263157895e-06, "loss": 0.8016, "step": 7320 }, { "epoch": 0.18, "grad_norm": 11.424837112426758, "learning_rate": 5.621052631578948e-06, "loss": 0.7721, "step": 7330 }, { "epoch": 0.18, "grad_norm": 5.862210750579834, "learning_rate": 5.600000000000001e-06, "loss": 0.6898, "step": 7340 }, { "epoch": 0.18, "grad_norm": 4.197153568267822, "learning_rate": 5.578947368421052e-06, "loss": 0.6907, "step": 7350 }, { "epoch": 0.18, "grad_norm": 6.712553977966309, "learning_rate": 5.557894736842105e-06, "loss": 0.773, "step": 7360 }, { "epoch": 0.18, "grad_norm": 4.968278408050537, "learning_rate": 5.5368421052631586e-06, "loss": 0.7892, "step": 7370 }, { "epoch": 0.18, "grad_norm": 3.8882153034210205, "learning_rate": 5.515789473684211e-06, "loss": 0.8365, "step": 7380 }, { "epoch": 0.18, "grad_norm": 6.6297197341918945, "learning_rate": 5.494736842105264e-06, "loss": 0.7374, "step": 7390 }, { "epoch": 0.18, "grad_norm": 2.2362327575683594, "learning_rate": 5.4736842105263165e-06, "loss": 0.8293, "step": 7400 }, { "epoch": 0.19, "grad_norm": 4.1008100509643555, "learning_rate": 5.452631578947369e-06, "loss": 0.7048, "step": 7410 }, { "epoch": 0.19, "grad_norm": 4.488921642303467, "learning_rate": 5.431578947368421e-06, "loss": 0.7902, "step": 7420 }, { "epoch": 0.19, "grad_norm": 3.7497622966766357, "learning_rate": 5.410526315789474e-06, "loss": 0.8359, "step": 7430 }, { "epoch": 0.19, "grad_norm": 3.192277193069458, "learning_rate": 5.389473684210526e-06, "loss": 0.7253, "step": 7440 }, { "epoch": 0.19, "grad_norm": 4.586243629455566, "learning_rate": 5.36842105263158e-06, "loss": 0.7588, "step": 7450 }, { "epoch": 0.19, "grad_norm": 3.424870729446411, "learning_rate": 5.3473684210526325e-06, "loss": 0.7268, "step": 7460 }, { "epoch": 0.19, "grad_norm": 28.807186126708984, "learning_rate": 5.326315789473685e-06, "loss": 0.7979, "step": 7470 }, { "epoch": 0.19, "grad_norm": 4.297823905944824, "learning_rate": 5.305263157894738e-06, "loss": 0.768, "step": 7480 }, { "epoch": 0.19, "grad_norm": 4.891976833343506, "learning_rate": 5.2842105263157896e-06, "loss": 0.7063, "step": 7490 }, { "epoch": 0.19, "grad_norm": 4.083632469177246, "learning_rate": 5.263157894736842e-06, "loss": 0.8102, "step": 7500 }, { "epoch": 0.19, "eval_loss": 0.7507393956184387, "eval_runtime": 67.8717, "eval_samples_per_second": 14.734, "eval_steps_per_second": 14.734, "step": 7500 }, { "epoch": 0.19, "grad_norm": 10.315424919128418, "learning_rate": 5.242105263157895e-06, "loss": 0.736, "step": 7510 }, { "epoch": 0.19, "grad_norm": 3.440877676010132, "learning_rate": 5.2210526315789475e-06, "loss": 0.799, "step": 7520 }, { "epoch": 0.19, "grad_norm": 2.361064910888672, "learning_rate": 5.2e-06, "loss": 0.832, "step": 7530 }, { "epoch": 0.19, "grad_norm": 2.1224961280822754, "learning_rate": 5.178947368421054e-06, "loss": 0.7118, "step": 7540 }, { "epoch": 0.19, "grad_norm": 4.9322614669799805, "learning_rate": 5.157894736842106e-06, "loss": 0.6614, "step": 7550 }, { "epoch": 0.19, "grad_norm": 4.812900066375732, "learning_rate": 5.136842105263158e-06, "loss": 0.8002, "step": 7560 }, { "epoch": 0.19, "grad_norm": 6.411820411682129, "learning_rate": 5.115789473684211e-06, "loss": 0.835, "step": 7570 }, { "epoch": 0.19, "grad_norm": 5.406981468200684, "learning_rate": 5.0947368421052635e-06, "loss": 0.8384, "step": 7580 }, { "epoch": 0.19, "grad_norm": 4.32007360458374, "learning_rate": 5.073684210526316e-06, "loss": 0.5798, "step": 7590 }, { "epoch": 0.19, "grad_norm": 4.640589714050293, "learning_rate": 5.052631578947369e-06, "loss": 0.7896, "step": 7600 }, { "epoch": 0.19, "grad_norm": 5.4717936515808105, "learning_rate": 5.0315789473684214e-06, "loss": 0.7829, "step": 7610 }, { "epoch": 0.19, "grad_norm": 2.995558261871338, "learning_rate": 5.010526315789475e-06, "loss": 0.7322, "step": 7620 }, { "epoch": 0.19, "grad_norm": 2.5911152362823486, "learning_rate": 4.989473684210527e-06, "loss": 0.7727, "step": 7630 }, { "epoch": 0.19, "grad_norm": 3.3829457759857178, "learning_rate": 4.968421052631579e-06, "loss": 0.7178, "step": 7640 }, { "epoch": 0.19, "grad_norm": 5.157157897949219, "learning_rate": 4.947368421052632e-06, "loss": 0.7241, "step": 7650 }, { "epoch": 0.19, "grad_norm": 6.205902099609375, "learning_rate": 4.926315789473685e-06, "loss": 0.7831, "step": 7660 }, { "epoch": 0.19, "grad_norm": 3.92594051361084, "learning_rate": 4.905263157894737e-06, "loss": 0.8057, "step": 7670 }, { "epoch": 0.19, "grad_norm": 4.578032493591309, "learning_rate": 4.88421052631579e-06, "loss": 0.8011, "step": 7680 }, { "epoch": 0.19, "grad_norm": 6.8539605140686035, "learning_rate": 4.863157894736843e-06, "loss": 0.7792, "step": 7690 }, { "epoch": 0.19, "grad_norm": 7.954685211181641, "learning_rate": 4.842105263157895e-06, "loss": 0.6691, "step": 7700 }, { "epoch": 0.19, "grad_norm": 2.0253312587738037, "learning_rate": 4.821052631578948e-06, "loss": 0.6483, "step": 7710 }, { "epoch": 0.19, "grad_norm": 8.230294227600098, "learning_rate": 4.800000000000001e-06, "loss": 0.8076, "step": 7720 }, { "epoch": 0.19, "grad_norm": 2.5444509983062744, "learning_rate": 4.778947368421053e-06, "loss": 0.7902, "step": 7730 }, { "epoch": 0.19, "grad_norm": 1.8759273290634155, "learning_rate": 4.757894736842106e-06, "loss": 0.7308, "step": 7740 }, { "epoch": 0.19, "grad_norm": 5.69119930267334, "learning_rate": 4.736842105263158e-06, "loss": 0.6605, "step": 7750 }, { "epoch": 0.19, "grad_norm": 7.020988941192627, "learning_rate": 4.71578947368421e-06, "loss": 0.7678, "step": 7760 }, { "epoch": 0.19, "grad_norm": 4.7685866355896, "learning_rate": 4.694736842105264e-06, "loss": 0.8022, "step": 7770 }, { "epoch": 0.19, "grad_norm": 2.516789436340332, "learning_rate": 4.6736842105263166e-06, "loss": 0.6176, "step": 7780 }, { "epoch": 0.19, "grad_norm": 4.267387866973877, "learning_rate": 4.652631578947368e-06, "loss": 0.6487, "step": 7790 }, { "epoch": 0.2, "grad_norm": 5.96762228012085, "learning_rate": 4.631578947368421e-06, "loss": 0.7066, "step": 7800 }, { "epoch": 0.2, "grad_norm": 4.345110893249512, "learning_rate": 4.6105263157894745e-06, "loss": 0.6072, "step": 7810 }, { "epoch": 0.2, "grad_norm": 10.33462142944336, "learning_rate": 4.589473684210526e-06, "loss": 0.8211, "step": 7820 }, { "epoch": 0.2, "grad_norm": 4.632289409637451, "learning_rate": 4.568421052631579e-06, "loss": 0.8335, "step": 7830 }, { "epoch": 0.2, "grad_norm": 4.453967094421387, "learning_rate": 4.547368421052632e-06, "loss": 0.8331, "step": 7840 }, { "epoch": 0.2, "grad_norm": 5.877091407775879, "learning_rate": 4.526315789473685e-06, "loss": 0.6793, "step": 7850 }, { "epoch": 0.2, "grad_norm": 16.41980743408203, "learning_rate": 4.505263157894737e-06, "loss": 0.819, "step": 7860 }, { "epoch": 0.2, "grad_norm": 3.1915693283081055, "learning_rate": 4.48421052631579e-06, "loss": 0.7217, "step": 7870 }, { "epoch": 0.2, "grad_norm": 5.805244445800781, "learning_rate": 4.463157894736842e-06, "loss": 0.7146, "step": 7880 }, { "epoch": 0.2, "grad_norm": 2.697472333908081, "learning_rate": 4.442105263157896e-06, "loss": 0.6748, "step": 7890 }, { "epoch": 0.2, "grad_norm": 3.6001346111297607, "learning_rate": 4.4210526315789476e-06, "loss": 0.6972, "step": 7900 }, { "epoch": 0.2, "grad_norm": 4.912445545196533, "learning_rate": 4.4e-06, "loss": 0.7157, "step": 7910 }, { "epoch": 0.2, "grad_norm": 6.9912309646606445, "learning_rate": 4.378947368421053e-06, "loss": 0.5927, "step": 7920 }, { "epoch": 0.2, "grad_norm": 4.380290985107422, "learning_rate": 4.3578947368421055e-06, "loss": 0.699, "step": 7930 }, { "epoch": 0.2, "grad_norm": 4.024576663970947, "learning_rate": 4.336842105263158e-06, "loss": 0.8156, "step": 7940 }, { "epoch": 0.2, "grad_norm": 3.523719310760498, "learning_rate": 4.315789473684211e-06, "loss": 0.7827, "step": 7950 }, { "epoch": 0.2, "grad_norm": 10.055171966552734, "learning_rate": 4.2947368421052635e-06, "loss": 0.7142, "step": 7960 }, { "epoch": 0.2, "grad_norm": 7.437203407287598, "learning_rate": 4.273684210526316e-06, "loss": 0.7184, "step": 7970 }, { "epoch": 0.2, "grad_norm": 2.6910207271575928, "learning_rate": 4.252631578947369e-06, "loss": 0.7311, "step": 7980 }, { "epoch": 0.2, "grad_norm": 12.729212760925293, "learning_rate": 4.2315789473684215e-06, "loss": 0.7629, "step": 7990 }, { "epoch": 0.2, "grad_norm": 3.817344903945923, "learning_rate": 4.210526315789474e-06, "loss": 0.8676, "step": 8000 }, { "epoch": 0.2, "eval_loss": 0.7396635413169861, "eval_runtime": 67.9126, "eval_samples_per_second": 14.725, "eval_steps_per_second": 14.725, "step": 8000 }, { "epoch": 0.2, "grad_norm": 5.193355083465576, "learning_rate": 4.189473684210527e-06, "loss": 0.7036, "step": 8010 }, { "epoch": 0.2, "grad_norm": 3.617652177810669, "learning_rate": 4.1684210526315794e-06, "loss": 0.6547, "step": 8020 }, { "epoch": 0.2, "grad_norm": 3.48286771774292, "learning_rate": 4.147368421052632e-06, "loss": 0.6756, "step": 8030 }, { "epoch": 0.2, "grad_norm": 4.939229965209961, "learning_rate": 4.126315789473685e-06, "loss": 0.7157, "step": 8040 }, { "epoch": 0.2, "grad_norm": 14.387231826782227, "learning_rate": 4.105263157894737e-06, "loss": 0.8052, "step": 8050 }, { "epoch": 0.2, "grad_norm": 4.042211055755615, "learning_rate": 4.08421052631579e-06, "loss": 0.6733, "step": 8060 }, { "epoch": 0.2, "grad_norm": 6.068091869354248, "learning_rate": 4.063157894736842e-06, "loss": 0.6172, "step": 8070 }, { "epoch": 0.2, "grad_norm": 5.004486083984375, "learning_rate": 4.042105263157895e-06, "loss": 0.7888, "step": 8080 }, { "epoch": 0.2, "grad_norm": 5.651116847991943, "learning_rate": 4.021052631578948e-06, "loss": 0.6979, "step": 8090 }, { "epoch": 0.2, "grad_norm": 3.581594944000244, "learning_rate": 4.000000000000001e-06, "loss": 0.7654, "step": 8100 }, { "epoch": 0.2, "grad_norm": 2.6030330657958984, "learning_rate": 3.9789473684210525e-06, "loss": 0.7946, "step": 8110 }, { "epoch": 0.2, "grad_norm": 5.385477542877197, "learning_rate": 3.957894736842106e-06, "loss": 0.7785, "step": 8120 }, { "epoch": 0.2, "grad_norm": 5.688074588775635, "learning_rate": 3.936842105263159e-06, "loss": 0.7762, "step": 8130 }, { "epoch": 0.2, "grad_norm": 1.7027924060821533, "learning_rate": 3.9157894736842104e-06, "loss": 0.6933, "step": 8140 }, { "epoch": 0.2, "grad_norm": 5.239694118499756, "learning_rate": 3.894736842105263e-06, "loss": 0.8061, "step": 8150 }, { "epoch": 0.2, "grad_norm": 4.3939032554626465, "learning_rate": 3.873684210526316e-06, "loss": 0.7537, "step": 8160 }, { "epoch": 0.2, "grad_norm": 5.115386962890625, "learning_rate": 3.852631578947369e-06, "loss": 0.7025, "step": 8170 }, { "epoch": 0.2, "grad_norm": 4.546750545501709, "learning_rate": 3.831578947368421e-06, "loss": 0.7108, "step": 8180 }, { "epoch": 0.2, "grad_norm": 3.043384552001953, "learning_rate": 3.810526315789474e-06, "loss": 0.7506, "step": 8190 }, { "epoch": 0.2, "grad_norm": 2.8117778301239014, "learning_rate": 3.789473684210527e-06, "loss": 0.773, "step": 8200 }, { "epoch": 0.21, "grad_norm": 6.000233173370361, "learning_rate": 3.768421052631579e-06, "loss": 0.6902, "step": 8210 }, { "epoch": 0.21, "grad_norm": 6.7739787101745605, "learning_rate": 3.7473684210526317e-06, "loss": 0.6397, "step": 8220 }, { "epoch": 0.21, "grad_norm": 4.948480129241943, "learning_rate": 3.7263157894736848e-06, "loss": 0.6185, "step": 8230 }, { "epoch": 0.21, "grad_norm": 4.269702434539795, "learning_rate": 3.7052631578947374e-06, "loss": 0.7487, "step": 8240 }, { "epoch": 0.21, "grad_norm": 3.8336634635925293, "learning_rate": 3.6842105263157896e-06, "loss": 0.7805, "step": 8250 }, { "epoch": 0.21, "grad_norm": 4.896543979644775, "learning_rate": 3.6631578947368423e-06, "loss": 0.645, "step": 8260 }, { "epoch": 0.21, "grad_norm": 6.051191806793213, "learning_rate": 3.642105263157895e-06, "loss": 0.7477, "step": 8270 }, { "epoch": 0.21, "grad_norm": 24.540451049804688, "learning_rate": 3.621052631578948e-06, "loss": 0.8168, "step": 8280 }, { "epoch": 0.21, "grad_norm": 5.061807155609131, "learning_rate": 3.6000000000000003e-06, "loss": 0.727, "step": 8290 }, { "epoch": 0.21, "grad_norm": 2.3907368183135986, "learning_rate": 3.578947368421053e-06, "loss": 0.6614, "step": 8300 }, { "epoch": 0.21, "grad_norm": 4.554809093475342, "learning_rate": 3.5578947368421056e-06, "loss": 0.6947, "step": 8310 }, { "epoch": 0.21, "grad_norm": 3.7383534908294678, "learning_rate": 3.536842105263158e-06, "loss": 0.6171, "step": 8320 }, { "epoch": 0.21, "grad_norm": 4.406937122344971, "learning_rate": 3.515789473684211e-06, "loss": 0.6102, "step": 8330 }, { "epoch": 0.21, "grad_norm": 5.226219654083252, "learning_rate": 3.4947368421052635e-06, "loss": 0.7746, "step": 8340 }, { "epoch": 0.21, "grad_norm": 6.249040126800537, "learning_rate": 3.473684210526316e-06, "loss": 0.7158, "step": 8350 }, { "epoch": 0.21, "grad_norm": 6.806312084197998, "learning_rate": 3.4526315789473684e-06, "loss": 0.7249, "step": 8360 }, { "epoch": 0.21, "grad_norm": 2.993473529815674, "learning_rate": 3.4315789473684215e-06, "loss": 0.826, "step": 8370 }, { "epoch": 0.21, "grad_norm": 4.120741367340088, "learning_rate": 3.410526315789474e-06, "loss": 0.6238, "step": 8380 }, { "epoch": 0.21, "grad_norm": 4.020960807800293, "learning_rate": 3.3894736842105264e-06, "loss": 0.6749, "step": 8390 }, { "epoch": 0.21, "grad_norm": 6.000002384185791, "learning_rate": 3.368421052631579e-06, "loss": 0.7652, "step": 8400 }, { "epoch": 0.21, "grad_norm": 8.221445083618164, "learning_rate": 3.347368421052632e-06, "loss": 0.7781, "step": 8410 }, { "epoch": 0.21, "grad_norm": 5.850223541259766, "learning_rate": 3.3263157894736848e-06, "loss": 0.7555, "step": 8420 }, { "epoch": 0.21, "grad_norm": 2.249915838241577, "learning_rate": 3.305263157894737e-06, "loss": 0.7305, "step": 8430 }, { "epoch": 0.21, "grad_norm": 4.955141067504883, "learning_rate": 3.2842105263157897e-06, "loss": 0.6817, "step": 8440 }, { "epoch": 0.21, "grad_norm": 2.4711403846740723, "learning_rate": 3.2631578947368423e-06, "loss": 0.683, "step": 8450 }, { "epoch": 0.21, "grad_norm": 5.367486953735352, "learning_rate": 3.2421052631578945e-06, "loss": 0.6494, "step": 8460 }, { "epoch": 0.21, "grad_norm": 3.283465623855591, "learning_rate": 3.2210526315789476e-06, "loss": 0.6092, "step": 8470 }, { "epoch": 0.21, "grad_norm": 4.473137855529785, "learning_rate": 3.2000000000000003e-06, "loss": 0.676, "step": 8480 }, { "epoch": 0.21, "grad_norm": 3.177180528640747, "learning_rate": 3.178947368421053e-06, "loss": 0.6685, "step": 8490 }, { "epoch": 0.21, "grad_norm": 4.735683441162109, "learning_rate": 3.157894736842105e-06, "loss": 0.7544, "step": 8500 }, { "epoch": 0.21, "eval_loss": 0.7582711577415466, "eval_runtime": 67.8631, "eval_samples_per_second": 14.736, "eval_steps_per_second": 14.736, "step": 8500 }, { "epoch": 0.21, "grad_norm": 4.465471267700195, "learning_rate": 3.1368421052631582e-06, "loss": 0.8191, "step": 8510 }, { "epoch": 0.21, "grad_norm": 3.8849751949310303, "learning_rate": 3.115789473684211e-06, "loss": 0.7078, "step": 8520 }, { "epoch": 0.21, "grad_norm": 5.555447101593018, "learning_rate": 3.094736842105263e-06, "loss": 0.7332, "step": 8530 }, { "epoch": 0.21, "grad_norm": 4.269344806671143, "learning_rate": 3.0736842105263158e-06, "loss": 0.7619, "step": 8540 }, { "epoch": 0.21, "grad_norm": 5.792567729949951, "learning_rate": 3.052631578947369e-06, "loss": 0.6858, "step": 8550 }, { "epoch": 0.21, "grad_norm": 4.095942974090576, "learning_rate": 3.0315789473684215e-06, "loss": 0.7793, "step": 8560 }, { "epoch": 0.21, "grad_norm": 3.316791296005249, "learning_rate": 3.0105263157894737e-06, "loss": 0.666, "step": 8570 }, { "epoch": 0.21, "grad_norm": 4.55336332321167, "learning_rate": 2.9894736842105264e-06, "loss": 0.7723, "step": 8580 }, { "epoch": 0.21, "grad_norm": 7.5306315422058105, "learning_rate": 2.9684210526315795e-06, "loss": 0.7283, "step": 8590 }, { "epoch": 0.21, "grad_norm": 3.935115337371826, "learning_rate": 2.9473684210526317e-06, "loss": 0.7843, "step": 8600 }, { "epoch": 0.22, "grad_norm": 5.173915863037109, "learning_rate": 2.9263157894736844e-06, "loss": 0.6662, "step": 8610 }, { "epoch": 0.22, "grad_norm": 3.5214264392852783, "learning_rate": 2.905263157894737e-06, "loss": 0.6887, "step": 8620 }, { "epoch": 0.22, "grad_norm": 4.139004707336426, "learning_rate": 2.88421052631579e-06, "loss": 0.6778, "step": 8630 }, { "epoch": 0.22, "grad_norm": 4.185042381286621, "learning_rate": 2.8631578947368423e-06, "loss": 0.9094, "step": 8640 }, { "epoch": 0.22, "grad_norm": 3.3607513904571533, "learning_rate": 2.842105263157895e-06, "loss": 0.7918, "step": 8650 }, { "epoch": 0.22, "grad_norm": 5.062870502471924, "learning_rate": 2.8210526315789476e-06, "loss": 0.7694, "step": 8660 }, { "epoch": 0.22, "grad_norm": 5.099003791809082, "learning_rate": 2.8000000000000003e-06, "loss": 0.7301, "step": 8670 }, { "epoch": 0.22, "grad_norm": 5.512063026428223, "learning_rate": 2.7789473684210525e-06, "loss": 0.7887, "step": 8680 }, { "epoch": 0.22, "grad_norm": 3.625652551651001, "learning_rate": 2.7578947368421056e-06, "loss": 0.7781, "step": 8690 }, { "epoch": 0.22, "grad_norm": 2.8921008110046387, "learning_rate": 2.7368421052631583e-06, "loss": 0.7582, "step": 8700 }, { "epoch": 0.22, "grad_norm": 10.71945571899414, "learning_rate": 2.7157894736842105e-06, "loss": 0.7234, "step": 8710 }, { "epoch": 0.22, "grad_norm": 17.737136840820312, "learning_rate": 2.694736842105263e-06, "loss": 0.6298, "step": 8720 }, { "epoch": 0.22, "grad_norm": 9.8464994430542, "learning_rate": 2.6736842105263162e-06, "loss": 0.7856, "step": 8730 }, { "epoch": 0.22, "grad_norm": 7.925550937652588, "learning_rate": 2.652631578947369e-06, "loss": 0.8387, "step": 8740 }, { "epoch": 0.22, "grad_norm": 3.530381441116333, "learning_rate": 2.631578947368421e-06, "loss": 0.8223, "step": 8750 }, { "epoch": 0.22, "grad_norm": 6.403299808502197, "learning_rate": 2.6105263157894738e-06, "loss": 0.8079, "step": 8760 }, { "epoch": 0.22, "grad_norm": 5.1753740310668945, "learning_rate": 2.589473684210527e-06, "loss": 0.7888, "step": 8770 }, { "epoch": 0.22, "grad_norm": 2.760190725326538, "learning_rate": 2.568421052631579e-06, "loss": 0.7071, "step": 8780 }, { "epoch": 0.22, "grad_norm": 5.183119297027588, "learning_rate": 2.5473684210526317e-06, "loss": 0.619, "step": 8790 }, { "epoch": 0.22, "grad_norm": 5.66708517074585, "learning_rate": 2.5263157894736844e-06, "loss": 0.7888, "step": 8800 }, { "epoch": 0.22, "grad_norm": 2.3660988807678223, "learning_rate": 2.5052631578947375e-06, "loss": 0.7466, "step": 8810 }, { "epoch": 0.22, "grad_norm": 3.8384206295013428, "learning_rate": 2.4842105263157897e-06, "loss": 0.7371, "step": 8820 }, { "epoch": 0.22, "grad_norm": 3.593717336654663, "learning_rate": 2.4631578947368424e-06, "loss": 0.5967, "step": 8830 }, { "epoch": 0.22, "grad_norm": 2.778346538543701, "learning_rate": 2.442105263157895e-06, "loss": 0.6407, "step": 8840 }, { "epoch": 0.22, "grad_norm": 10.841148376464844, "learning_rate": 2.4210526315789477e-06, "loss": 0.8172, "step": 8850 }, { "epoch": 0.22, "grad_norm": 2.635694980621338, "learning_rate": 2.4000000000000003e-06, "loss": 0.8135, "step": 8860 }, { "epoch": 0.22, "grad_norm": 1.5510995388031006, "learning_rate": 2.378947368421053e-06, "loss": 0.8328, "step": 8870 }, { "epoch": 0.22, "grad_norm": 3.770972967147827, "learning_rate": 2.357894736842105e-06, "loss": 0.6642, "step": 8880 }, { "epoch": 0.22, "grad_norm": 5.756451606750488, "learning_rate": 2.3368421052631583e-06, "loss": 0.7484, "step": 8890 }, { "epoch": 0.22, "grad_norm": 2.9202377796173096, "learning_rate": 2.3157894736842105e-06, "loss": 0.7381, "step": 8900 }, { "epoch": 0.22, "grad_norm": 4.43782377243042, "learning_rate": 2.294736842105263e-06, "loss": 0.7915, "step": 8910 }, { "epoch": 0.22, "grad_norm": 20.496152877807617, "learning_rate": 2.273684210526316e-06, "loss": 0.6872, "step": 8920 }, { "epoch": 0.22, "grad_norm": 3.2591583728790283, "learning_rate": 2.2526315789473685e-06, "loss": 0.668, "step": 8930 }, { "epoch": 0.22, "grad_norm": 2.23056960105896, "learning_rate": 2.231578947368421e-06, "loss": 0.6229, "step": 8940 }, { "epoch": 0.22, "grad_norm": 5.419168949127197, "learning_rate": 2.2105263157894738e-06, "loss": 0.9534, "step": 8950 }, { "epoch": 0.22, "grad_norm": 15.681089401245117, "learning_rate": 2.1894736842105264e-06, "loss": 0.782, "step": 8960 }, { "epoch": 0.22, "grad_norm": 3.7693331241607666, "learning_rate": 2.168421052631579e-06, "loss": 0.8047, "step": 8970 }, { "epoch": 0.22, "grad_norm": 3.4705393314361572, "learning_rate": 2.1473684210526317e-06, "loss": 0.7832, "step": 8980 }, { "epoch": 0.22, "grad_norm": 4.295872688293457, "learning_rate": 2.1263157894736844e-06, "loss": 0.7355, "step": 8990 }, { "epoch": 0.23, "grad_norm": 3.0480620861053467, "learning_rate": 2.105263157894737e-06, "loss": 0.6739, "step": 9000 }, { "epoch": 0.23, "eval_loss": 0.7442497611045837, "eval_runtime": 67.8767, "eval_samples_per_second": 14.733, "eval_steps_per_second": 14.733, "step": 9000 }, { "epoch": 0.23, "grad_norm": 2.9723927974700928, "learning_rate": 2.0842105263157897e-06, "loss": 0.7003, "step": 9010 }, { "epoch": 0.23, "grad_norm": 2.0932421684265137, "learning_rate": 2.0631578947368424e-06, "loss": 0.6897, "step": 9020 }, { "epoch": 0.23, "grad_norm": 4.70625114440918, "learning_rate": 2.042105263157895e-06, "loss": 0.8106, "step": 9030 }, { "epoch": 0.23, "grad_norm": 3.2763564586639404, "learning_rate": 2.0210526315789477e-06, "loss": 0.7387, "step": 9040 }, { "epoch": 0.23, "grad_norm": 4.553431034088135, "learning_rate": 2.0000000000000003e-06, "loss": 0.7435, "step": 9050 }, { "epoch": 0.23, "grad_norm": 5.36479377746582, "learning_rate": 1.978947368421053e-06, "loss": 0.7713, "step": 9060 }, { "epoch": 0.23, "grad_norm": 4.923874855041504, "learning_rate": 1.9578947368421052e-06, "loss": 0.5508, "step": 9070 }, { "epoch": 0.23, "grad_norm": 8.63404655456543, "learning_rate": 1.936842105263158e-06, "loss": 0.7323, "step": 9080 }, { "epoch": 0.23, "grad_norm": 5.521135330200195, "learning_rate": 1.9157894736842105e-06, "loss": 0.699, "step": 9090 }, { "epoch": 0.23, "grad_norm": 9.009405136108398, "learning_rate": 1.8947368421052634e-06, "loss": 0.789, "step": 9100 }, { "epoch": 0.23, "grad_norm": 12.834007263183594, "learning_rate": 1.8736842105263158e-06, "loss": 0.7382, "step": 9110 }, { "epoch": 0.23, "grad_norm": 3.753262758255005, "learning_rate": 1.8526315789473687e-06, "loss": 0.7035, "step": 9120 }, { "epoch": 0.23, "grad_norm": 3.300708770751953, "learning_rate": 1.8315789473684211e-06, "loss": 0.7558, "step": 9130 }, { "epoch": 0.23, "grad_norm": 4.416452884674072, "learning_rate": 1.810526315789474e-06, "loss": 0.6854, "step": 9140 }, { "epoch": 0.23, "grad_norm": 7.664788722991943, "learning_rate": 1.7894736842105265e-06, "loss": 0.6951, "step": 9150 }, { "epoch": 0.23, "grad_norm": 3.646073818206787, "learning_rate": 1.768421052631579e-06, "loss": 0.7472, "step": 9160 }, { "epoch": 0.23, "grad_norm": 3.125991106033325, "learning_rate": 1.7473684210526318e-06, "loss": 0.6711, "step": 9170 }, { "epoch": 0.23, "grad_norm": 5.308753967285156, "learning_rate": 1.7263157894736842e-06, "loss": 0.6393, "step": 9180 }, { "epoch": 0.23, "grad_norm": 11.79830265045166, "learning_rate": 1.705263157894737e-06, "loss": 0.7358, "step": 9190 }, { "epoch": 0.23, "grad_norm": 6.862399101257324, "learning_rate": 1.6842105263157895e-06, "loss": 0.8422, "step": 9200 }, { "epoch": 0.23, "grad_norm": 5.3199968338012695, "learning_rate": 1.6631578947368424e-06, "loss": 0.6999, "step": 9210 }, { "epoch": 0.23, "grad_norm": 3.263275146484375, "learning_rate": 1.6421052631578948e-06, "loss": 0.7122, "step": 9220 }, { "epoch": 0.23, "grad_norm": 4.283051490783691, "learning_rate": 1.6210526315789473e-06, "loss": 0.7793, "step": 9230 }, { "epoch": 0.23, "grad_norm": 2.0055785179138184, "learning_rate": 1.6000000000000001e-06, "loss": 0.732, "step": 9240 }, { "epoch": 0.23, "grad_norm": 4.184137344360352, "learning_rate": 1.5789473684210526e-06, "loss": 0.7339, "step": 9250 }, { "epoch": 0.23, "grad_norm": 3.587636709213257, "learning_rate": 1.5578947368421054e-06, "loss": 0.8473, "step": 9260 }, { "epoch": 0.23, "grad_norm": 8.189043045043945, "learning_rate": 1.5368421052631579e-06, "loss": 0.6498, "step": 9270 }, { "epoch": 0.23, "grad_norm": 3.4272284507751465, "learning_rate": 1.5157894736842108e-06, "loss": 0.7676, "step": 9280 }, { "epoch": 0.23, "grad_norm": 3.280287027359009, "learning_rate": 1.4947368421052632e-06, "loss": 0.6283, "step": 9290 }, { "epoch": 0.23, "grad_norm": 8.722474098205566, "learning_rate": 1.4736842105263159e-06, "loss": 0.7555, "step": 9300 }, { "epoch": 0.23, "grad_norm": 4.574818134307861, "learning_rate": 1.4526315789473685e-06, "loss": 0.7481, "step": 9310 }, { "epoch": 0.23, "grad_norm": 3.0097527503967285, "learning_rate": 1.4315789473684212e-06, "loss": 0.6181, "step": 9320 }, { "epoch": 0.23, "grad_norm": 6.725505352020264, "learning_rate": 1.4105263157894738e-06, "loss": 0.677, "step": 9330 }, { "epoch": 0.23, "grad_norm": 2.934959888458252, "learning_rate": 1.3894736842105263e-06, "loss": 0.6932, "step": 9340 }, { "epoch": 0.23, "grad_norm": 2.7491650581359863, "learning_rate": 1.3684210526315791e-06, "loss": 0.7361, "step": 9350 }, { "epoch": 0.23, "grad_norm": 4.734315872192383, "learning_rate": 1.3473684210526316e-06, "loss": 0.6442, "step": 9360 }, { "epoch": 0.23, "grad_norm": 4.301790714263916, "learning_rate": 1.3263157894736844e-06, "loss": 0.7642, "step": 9370 }, { "epoch": 0.23, "grad_norm": 4.042958736419678, "learning_rate": 1.3052631578947369e-06, "loss": 0.7974, "step": 9380 }, { "epoch": 0.23, "grad_norm": 4.941096782684326, "learning_rate": 1.2842105263157895e-06, "loss": 0.8603, "step": 9390 }, { "epoch": 0.23, "grad_norm": 4.379117488861084, "learning_rate": 1.2631578947368422e-06, "loss": 0.8297, "step": 9400 }, { "epoch": 0.24, "grad_norm": 6.3129048347473145, "learning_rate": 1.2421052631578948e-06, "loss": 0.7783, "step": 9410 }, { "epoch": 0.24, "grad_norm": 5.5439133644104, "learning_rate": 1.2210526315789475e-06, "loss": 0.8122, "step": 9420 }, { "epoch": 0.24, "grad_norm": 6.480744361877441, "learning_rate": 1.2000000000000002e-06, "loss": 0.7779, "step": 9430 }, { "epoch": 0.24, "grad_norm": 5.862485408782959, "learning_rate": 1.1789473684210526e-06, "loss": 0.6917, "step": 9440 }, { "epoch": 0.24, "grad_norm": 5.7247443199157715, "learning_rate": 1.1578947368421053e-06, "loss": 0.7017, "step": 9450 }, { "epoch": 0.24, "grad_norm": 8.194451332092285, "learning_rate": 1.136842105263158e-06, "loss": 0.7031, "step": 9460 }, { "epoch": 0.24, "grad_norm": 8.057929992675781, "learning_rate": 1.1157894736842106e-06, "loss": 0.7116, "step": 9470 }, { "epoch": 0.24, "grad_norm": 4.529337406158447, "learning_rate": 1.0947368421052632e-06, "loss": 0.8314, "step": 9480 }, { "epoch": 0.24, "grad_norm": 7.412846565246582, "learning_rate": 1.0736842105263159e-06, "loss": 0.6448, "step": 9490 }, { "epoch": 0.24, "grad_norm": 3.7076497077941895, "learning_rate": 1.0526315789473685e-06, "loss": 0.6291, "step": 9500 }, { "epoch": 0.24, "eval_loss": 0.7395394444465637, "eval_runtime": 67.8841, "eval_samples_per_second": 14.731, "eval_steps_per_second": 14.731, "step": 9500 }, { "epoch": 0.24, "grad_norm": 4.488115310668945, "learning_rate": 1.0315789473684212e-06, "loss": 0.8611, "step": 9510 }, { "epoch": 0.24, "grad_norm": 1.6314383745193481, "learning_rate": 1.0105263157894738e-06, "loss": 0.7694, "step": 9520 }, { "epoch": 0.24, "grad_norm": 5.290372848510742, "learning_rate": 9.894736842105265e-07, "loss": 0.7166, "step": 9530 }, { "epoch": 0.24, "grad_norm": 3.1572625637054443, "learning_rate": 9.68421052631579e-07, "loss": 0.7649, "step": 9540 }, { "epoch": 0.24, "grad_norm": 4.951930999755859, "learning_rate": 9.473684210526317e-07, "loss": 0.7057, "step": 9550 }, { "epoch": 0.24, "grad_norm": 4.696636199951172, "learning_rate": 9.263157894736844e-07, "loss": 0.7853, "step": 9560 }, { "epoch": 0.24, "grad_norm": 4.211262226104736, "learning_rate": 9.05263157894737e-07, "loss": 0.6612, "step": 9570 }, { "epoch": 0.24, "grad_norm": 4.584897041320801, "learning_rate": 8.842105263157895e-07, "loss": 0.6393, "step": 9580 }, { "epoch": 0.24, "grad_norm": 4.64282751083374, "learning_rate": 8.631578947368421e-07, "loss": 0.7915, "step": 9590 }, { "epoch": 0.24, "grad_norm": 3.691389799118042, "learning_rate": 8.421052631578948e-07, "loss": 0.659, "step": 9600 }, { "epoch": 0.24, "grad_norm": 4.740243911743164, "learning_rate": 8.210526315789474e-07, "loss": 0.7134, "step": 9610 }, { "epoch": 0.24, "grad_norm": 6.811493873596191, "learning_rate": 8.000000000000001e-07, "loss": 0.8592, "step": 9620 }, { "epoch": 0.24, "grad_norm": 3.2056334018707275, "learning_rate": 7.789473684210527e-07, "loss": 0.6753, "step": 9630 }, { "epoch": 0.24, "grad_norm": 4.347885608673096, "learning_rate": 7.578947368421054e-07, "loss": 0.7476, "step": 9640 }, { "epoch": 0.24, "grad_norm": 5.63771915435791, "learning_rate": 7.368421052631579e-07, "loss": 0.7649, "step": 9650 }, { "epoch": 0.24, "grad_norm": 3.062124013900757, "learning_rate": 7.157894736842106e-07, "loss": 0.6792, "step": 9660 }, { "epoch": 0.24, "grad_norm": 9.334321022033691, "learning_rate": 6.947368421052631e-07, "loss": 0.7626, "step": 9670 }, { "epoch": 0.24, "grad_norm": 7.429685115814209, "learning_rate": 6.736842105263158e-07, "loss": 0.6943, "step": 9680 }, { "epoch": 0.24, "grad_norm": 4.459277629852295, "learning_rate": 6.526315789473684e-07, "loss": 0.7838, "step": 9690 }, { "epoch": 0.24, "grad_norm": 6.821927070617676, "learning_rate": 6.315789473684211e-07, "loss": 0.7103, "step": 9700 }, { "epoch": 0.24, "grad_norm": 10.438909530639648, "learning_rate": 6.105263157894738e-07, "loss": 0.7509, "step": 9710 }, { "epoch": 0.24, "grad_norm": 11.55811882019043, "learning_rate": 5.894736842105263e-07, "loss": 0.7623, "step": 9720 }, { "epoch": 0.24, "grad_norm": 3.1809043884277344, "learning_rate": 5.68421052631579e-07, "loss": 0.6294, "step": 9730 }, { "epoch": 0.24, "grad_norm": 5.337337970733643, "learning_rate": 5.473684210526316e-07, "loss": 0.763, "step": 9740 }, { "epoch": 0.24, "grad_norm": 8.130523681640625, "learning_rate": 5.263157894736843e-07, "loss": 0.6404, "step": 9750 }, { "epoch": 0.24, "grad_norm": 4.213668346405029, "learning_rate": 5.052631578947369e-07, "loss": 0.7379, "step": 9760 }, { "epoch": 0.24, "grad_norm": 3.8605246543884277, "learning_rate": 4.842105263157895e-07, "loss": 0.7483, "step": 9770 }, { "epoch": 0.24, "grad_norm": 4.358519077301025, "learning_rate": 4.631578947368422e-07, "loss": 0.6823, "step": 9780 }, { "epoch": 0.24, "grad_norm": 2.9712955951690674, "learning_rate": 4.421052631578947e-07, "loss": 0.679, "step": 9790 }, { "epoch": 0.24, "grad_norm": 6.285613059997559, "learning_rate": 4.210526315789474e-07, "loss": 0.7763, "step": 9800 }, { "epoch": 0.25, "grad_norm": 2.434277296066284, "learning_rate": 4.0000000000000003e-07, "loss": 0.8558, "step": 9810 }, { "epoch": 0.25, "grad_norm": 7.880703449249268, "learning_rate": 3.789473684210527e-07, "loss": 0.7494, "step": 9820 }, { "epoch": 0.25, "grad_norm": 11.698799133300781, "learning_rate": 3.578947368421053e-07, "loss": 0.6576, "step": 9830 }, { "epoch": 0.25, "grad_norm": 3.2752954959869385, "learning_rate": 3.368421052631579e-07, "loss": 0.6494, "step": 9840 }, { "epoch": 0.25, "grad_norm": 2.878567934036255, "learning_rate": 3.1578947368421055e-07, "loss": 0.6781, "step": 9850 }, { "epoch": 0.25, "grad_norm": 3.6086246967315674, "learning_rate": 2.9473684210526315e-07, "loss": 0.7339, "step": 9860 }, { "epoch": 0.25, "grad_norm": 5.403782844543457, "learning_rate": 2.736842105263158e-07, "loss": 0.7738, "step": 9870 }, { "epoch": 0.25, "grad_norm": 4.487565994262695, "learning_rate": 2.5263157894736846e-07, "loss": 0.8165, "step": 9880 }, { "epoch": 0.25, "grad_norm": 4.29118537902832, "learning_rate": 2.315789473684211e-07, "loss": 0.6272, "step": 9890 }, { "epoch": 0.25, "grad_norm": 3.634309768676758, "learning_rate": 2.105263157894737e-07, "loss": 0.6641, "step": 9900 }, { "epoch": 0.25, "grad_norm": 4.989073276519775, "learning_rate": 1.8947368421052634e-07, "loss": 0.7111, "step": 9910 }, { "epoch": 0.25, "grad_norm": 5.606556415557861, "learning_rate": 1.6842105263157895e-07, "loss": 0.6112, "step": 9920 }, { "epoch": 0.25, "grad_norm": 5.012443542480469, "learning_rate": 1.4736842105263158e-07, "loss": 0.6684, "step": 9930 }, { "epoch": 0.25, "grad_norm": 6.287766933441162, "learning_rate": 1.2631578947368423e-07, "loss": 0.6687, "step": 9940 }, { "epoch": 0.25, "grad_norm": 3.646402597427368, "learning_rate": 1.0526315789473685e-07, "loss": 0.6452, "step": 9950 }, { "epoch": 0.25, "grad_norm": 7.9046950340271, "learning_rate": 8.421052631578947e-08, "loss": 0.7636, "step": 9960 }, { "epoch": 0.25, "grad_norm": 4.733578681945801, "learning_rate": 6.315789473684211e-08, "loss": 0.6619, "step": 9970 }, { "epoch": 0.25, "grad_norm": 2.342442274093628, "learning_rate": 4.2105263157894737e-08, "loss": 0.74, "step": 9980 }, { "epoch": 0.25, "grad_norm": 4.0832839012146, "learning_rate": 2.1052631578947368e-08, "loss": 0.7314, "step": 9990 }, { "epoch": 0.25, "grad_norm": 2.517941951751709, "learning_rate": 0.0, "loss": 0.755, "step": 10000 }, { "epoch": 0.25, "eval_loss": 0.7402730584144592, "eval_runtime": 67.899, "eval_samples_per_second": 14.728, "eval_steps_per_second": 14.728, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, "total_flos": 1.6102125993984e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }