|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9992532754056073, |
|
"eval_steps": 500, |
|
"global_step": 920, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010861448645713123, |
|
"grad_norm": 169.19541931152344, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 3.0856, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021722897291426246, |
|
"grad_norm": 10.523147583007812, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 3.0568, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03258434593713937, |
|
"grad_norm": 13.033576011657715, |
|
"learning_rate": 3e-06, |
|
"loss": 3.1183, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04344579458285249, |
|
"grad_norm": 14.1365966796875, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.9091, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05430724322856561, |
|
"grad_norm": 36.71495819091797, |
|
"learning_rate": 5e-06, |
|
"loss": 2.8426, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06516869187427873, |
|
"grad_norm": 14.660510063171387, |
|
"learning_rate": 6e-06, |
|
"loss": 2.7139, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07603014051999185, |
|
"grad_norm": 33.809818267822266, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 2.5489, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08689158916570498, |
|
"grad_norm": 17.694528579711914, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.5378, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0977530378114181, |
|
"grad_norm": 23.081829071044922, |
|
"learning_rate": 9e-06, |
|
"loss": 2.4152, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10861448645713122, |
|
"grad_norm": 13.884807586669922, |
|
"learning_rate": 1e-05, |
|
"loss": 2.364, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11947593510284434, |
|
"grad_norm": 10.270822525024414, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 2.5817, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13033738374855747, |
|
"grad_norm": 8.365082740783691, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.2702, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.14119883239427058, |
|
"grad_norm": 15.889869689941406, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 2.0207, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1520602810399837, |
|
"grad_norm": 7.615819454193115, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 2.0305, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.16292172968569682, |
|
"grad_norm": 10.326943397521973, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.0734, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17378317833140997, |
|
"grad_norm": 7.269619941711426, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.0625, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.18464462697712308, |
|
"grad_norm": 4.737296104431152, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 2.0665, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1955060756228362, |
|
"grad_norm": 5.883551597595215, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.0005, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20636752426854932, |
|
"grad_norm": 7.671362400054932, |
|
"learning_rate": 1.9e-05, |
|
"loss": 1.9652, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.21722897291426244, |
|
"grad_norm": 4.762546539306641, |
|
"learning_rate": 2e-05, |
|
"loss": 1.9471, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22809042155997555, |
|
"grad_norm": 5.938209056854248, |
|
"learning_rate": 2.1e-05, |
|
"loss": 1.8201, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.23895187020568867, |
|
"grad_norm": 4.879518985748291, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.9098, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24981331885140182, |
|
"grad_norm": 3.9282209873199463, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 1.7855, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.26067476749711493, |
|
"grad_norm": 5.627065181732178, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.9036, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.271536216142828, |
|
"grad_norm": 5.127110481262207, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.9096, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.28239766478854117, |
|
"grad_norm": 4.889794826507568, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.8139, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2932591134342543, |
|
"grad_norm": 6.027666091918945, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 1.8053, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3041205620799674, |
|
"grad_norm": 4.673439979553223, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.7594, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.31498201072568055, |
|
"grad_norm": 15.802154541015625, |
|
"learning_rate": 2.9e-05, |
|
"loss": 1.7326, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.32584345937139364, |
|
"grad_norm": 4.887749671936035, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8835, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3367049080171068, |
|
"grad_norm": 4.747198581695557, |
|
"learning_rate": 3.1e-05, |
|
"loss": 1.7743, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.34756635666281993, |
|
"grad_norm": 3.817439079284668, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.8374, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.358427805308533, |
|
"grad_norm": 8.614398002624512, |
|
"learning_rate": 3.3e-05, |
|
"loss": 1.8126, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.36928925395424617, |
|
"grad_norm": 4.607050895690918, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.7341, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.38015070259995926, |
|
"grad_norm": 5.569587707519531, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.7022, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3910121512456724, |
|
"grad_norm": 4.454966068267822, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.697, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4018735998913855, |
|
"grad_norm": 4.7365031242370605, |
|
"learning_rate": 3.7e-05, |
|
"loss": 1.7528, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.41273504853709864, |
|
"grad_norm": 5.313693523406982, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.7489, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4235964971828118, |
|
"grad_norm": 11.473447799682617, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 1.6342, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.43445794582852487, |
|
"grad_norm": 4.205170154571533, |
|
"learning_rate": 4e-05, |
|
"loss": 1.6882, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.445319394474238, |
|
"grad_norm": 4.324029445648193, |
|
"learning_rate": 4.1e-05, |
|
"loss": 1.6918, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4561808431199511, |
|
"grad_norm": 6.601579666137695, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.7205, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.46704229176566425, |
|
"grad_norm": 3.448477029800415, |
|
"learning_rate": 4.3e-05, |
|
"loss": 1.5888, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.47790374041137734, |
|
"grad_norm": 19.598926544189453, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.6581, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4887651890570905, |
|
"grad_norm": 4.1286702156066895, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.669, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.49962663770280363, |
|
"grad_norm": 7.4469404220581055, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.7023, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5104880863485167, |
|
"grad_norm": 3.3093454837799072, |
|
"learning_rate": 4.7e-05, |
|
"loss": 1.6477, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5213495349942299, |
|
"grad_norm": 7.244236946105957, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.6486, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.532210983639943, |
|
"grad_norm": 4.145318984985352, |
|
"learning_rate": 4.9e-05, |
|
"loss": 1.6016, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.543072432285656, |
|
"grad_norm": 4.232030868530273, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6154, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.543072432285656, |
|
"eval_loss": 1.4837769269943237, |
|
"eval_runtime": 32.1482, |
|
"eval_samples_per_second": 25.445, |
|
"eval_steps_per_second": 25.445, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5539338809313692, |
|
"grad_norm": 3.5671701431274414, |
|
"learning_rate": 4.880952380952381e-05, |
|
"loss": 1.6524, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5647953295770823, |
|
"grad_norm": 5.578204154968262, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 1.6353, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5756567782227955, |
|
"grad_norm": 7.8000993728637695, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 1.6009, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5865182268685086, |
|
"grad_norm": 10.637792587280273, |
|
"learning_rate": 4.523809523809524e-05, |
|
"loss": 1.5455, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5973796755142217, |
|
"grad_norm": 4.490563869476318, |
|
"learning_rate": 4.404761904761905e-05, |
|
"loss": 1.7016, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6082411241599348, |
|
"grad_norm": 4.277554512023926, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 1.6836, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.619102572805648, |
|
"grad_norm": 4.989715576171875, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.6839, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6299640214513611, |
|
"grad_norm": 5.333950042724609, |
|
"learning_rate": 4.047619047619048e-05, |
|
"loss": 1.6369, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6408254700970742, |
|
"grad_norm": 6.111197471618652, |
|
"learning_rate": 3.928571428571429e-05, |
|
"loss": 1.6566, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6516869187427873, |
|
"grad_norm": 9.309179306030273, |
|
"learning_rate": 3.809523809523809e-05, |
|
"loss": 1.6325, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6625483673885004, |
|
"grad_norm": 3.6230900287628174, |
|
"learning_rate": 3.690476190476191e-05, |
|
"loss": 1.5397, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6734098160342136, |
|
"grad_norm": 12.543122291564941, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 1.5934, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6842712646799267, |
|
"grad_norm": 8.352952003479004, |
|
"learning_rate": 3.4523809523809526e-05, |
|
"loss": 1.6255, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6951327133256399, |
|
"grad_norm": 3.2874395847320557, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.5855, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7059941619713529, |
|
"grad_norm": 4.011010646820068, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 1.6257, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.716855610617066, |
|
"grad_norm": 3.508124828338623, |
|
"learning_rate": 3.095238095238095e-05, |
|
"loss": 1.5792, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7277170592627792, |
|
"grad_norm": 8.244254112243652, |
|
"learning_rate": 2.9761904761904762e-05, |
|
"loss": 1.5836, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7385785079084923, |
|
"grad_norm": 4.174469947814941, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 1.6274, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7494399565542054, |
|
"grad_norm": 3.8175296783447266, |
|
"learning_rate": 2.7380952380952383e-05, |
|
"loss": 1.524, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7603014051999185, |
|
"grad_norm": 5.269782543182373, |
|
"learning_rate": 2.6190476190476192e-05, |
|
"loss": 1.6037, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7711628538456317, |
|
"grad_norm": 6.3031511306762695, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.5312, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7820243024913448, |
|
"grad_norm": 2.747074604034424, |
|
"learning_rate": 2.380952380952381e-05, |
|
"loss": 1.5516, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.792885751137058, |
|
"grad_norm": 3.978896141052246, |
|
"learning_rate": 2.261904761904762e-05, |
|
"loss": 1.6539, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.803747199782771, |
|
"grad_norm": 4.52466344833374, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 1.6088, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8146086484284841, |
|
"grad_norm": 5.761850357055664, |
|
"learning_rate": 2.023809523809524e-05, |
|
"loss": 1.578, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8254700970741973, |
|
"grad_norm": 4.5440192222595215, |
|
"learning_rate": 1.9047619047619046e-05, |
|
"loss": 1.5776, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8363315457199104, |
|
"grad_norm": 3.666665554046631, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 1.5747, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8471929943656236, |
|
"grad_norm": 4.0968403816223145, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.5851, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8580544430113366, |
|
"grad_norm": 4.229625225067139, |
|
"learning_rate": 1.5476190476190476e-05, |
|
"loss": 1.5634, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8689158916570497, |
|
"grad_norm": 30.42665672302246, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 1.5744, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8797773403027629, |
|
"grad_norm": 9.380242347717285, |
|
"learning_rate": 1.3095238095238096e-05, |
|
"loss": 1.5207, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.890638788948476, |
|
"grad_norm": 3.2494077682495117, |
|
"learning_rate": 1.1904761904761905e-05, |
|
"loss": 1.4783, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9015002375941892, |
|
"grad_norm": 7.472358703613281, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 1.5537, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9123616862399022, |
|
"grad_norm": 4.2244439125061035, |
|
"learning_rate": 9.523809523809523e-06, |
|
"loss": 1.5969, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9232231348856154, |
|
"grad_norm": 3.104196548461914, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.6525, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9340845835313285, |
|
"grad_norm": 5.433168411254883, |
|
"learning_rate": 7.142857142857143e-06, |
|
"loss": 1.5486, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9449460321770417, |
|
"grad_norm": 3.924849033355713, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 1.4945, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9558074808227547, |
|
"grad_norm": 3.4002134799957275, |
|
"learning_rate": 4.7619047619047615e-06, |
|
"loss": 1.5937, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.9666689294684678, |
|
"grad_norm": 5.583169460296631, |
|
"learning_rate": 3.5714285714285714e-06, |
|
"loss": 1.6058, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.977530378114181, |
|
"grad_norm": 7.243485927581787, |
|
"learning_rate": 2.3809523809523808e-06, |
|
"loss": 1.641, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9883918267598941, |
|
"grad_norm": 174.63710021972656, |
|
"learning_rate": 1.1904761904761904e-06, |
|
"loss": 1.6228, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9992532754056073, |
|
"grad_norm": 6.230108261108398, |
|
"learning_rate": 0.0, |
|
"loss": 1.4502, |
|
"step": 920 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 920, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5528070268502016.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|