{ "best_metric": 0.04311952739953995, "best_model_checkpoint": "t5/checkpoint-58320", "epoch": 100.0, "eval_steps": 500, "global_step": 486000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.14633552730083466, "learning_rate": 0.00099, "loss": 0.1231, "step": 4860 }, { "epoch": 1.0, "eval_accuracy": 0.6418053052843314, "eval_loss": 0.0717623308300972, "eval_runtime": 1333.7775, "eval_samples_per_second": 89.456, "eval_steps_per_second": 0.35, "step": 4860 }, { "epoch": 2.0, "grad_norm": 0.10493500530719757, "learning_rate": 0.00098, "loss": 0.0712, "step": 9720 }, { "epoch": 2.0, "eval_accuracy": 0.6884046431714369, "eval_loss": 0.06004703789949417, "eval_runtime": 1304.0684, "eval_samples_per_second": 91.494, "eval_steps_per_second": 0.358, "step": 9720 }, { "epoch": 3.0, "grad_norm": 0.09630604088306427, "learning_rate": 0.0009699999999999999, "loss": 0.0593, "step": 14580 }, { "epoch": 3.0, "eval_accuracy": 0.7176465658131835, "eval_loss": 0.05390430614352226, "eval_runtime": 1308.2854, "eval_samples_per_second": 91.2, "eval_steps_per_second": 0.357, "step": 14580 }, { "epoch": 4.0, "grad_norm": 0.07841313630342484, "learning_rate": 0.00096, "loss": 0.0519, "step": 19440 }, { "epoch": 4.0, "eval_accuracy": 0.730679294304991, "eval_loss": 0.050438590347766876, "eval_runtime": 1309.4742, "eval_samples_per_second": 91.117, "eval_steps_per_second": 0.357, "step": 19440 }, { "epoch": 5.0, "grad_norm": 0.07362372428178787, "learning_rate": 0.00095, "loss": 0.0464, "step": 24300 }, { "epoch": 5.0, "eval_accuracy": 0.7446423333193647, "eval_loss": 0.04836108162999153, "eval_runtime": 1308.1745, "eval_samples_per_second": 91.207, "eval_steps_per_second": 0.357, "step": 24300 }, { "epoch": 6.0, "grad_norm": 0.05613507702946663, "learning_rate": 0.00094, "loss": 0.0422, "step": 29160 }, { "epoch": 6.0, "eval_accuracy": 0.7498721870678456, "eval_loss": 0.04650866985321045, "eval_runtime": 1310.787, "eval_samples_per_second": 91.025, "eval_steps_per_second": 0.356, "step": 29160 }, { "epoch": 7.0, "grad_norm": 0.06997396796941757, "learning_rate": 0.00093, "loss": 0.0385, "step": 34020 }, { "epoch": 7.0, "eval_accuracy": 0.7611197250974312, "eval_loss": 0.04569365829229355, "eval_runtime": 1301.8483, "eval_samples_per_second": 91.65, "eval_steps_per_second": 0.359, "step": 34020 }, { "epoch": 8.0, "grad_norm": 0.054521311074495316, "learning_rate": 0.00092, "loss": 0.0354, "step": 38880 }, { "epoch": 8.0, "eval_accuracy": 0.7642039978208943, "eval_loss": 0.04475805535912514, "eval_runtime": 1307.7292, "eval_samples_per_second": 91.238, "eval_steps_per_second": 0.357, "step": 38880 }, { "epoch": 9.0, "grad_norm": 0.06338842958211899, "learning_rate": 0.00091, "loss": 0.0328, "step": 43740 }, { "epoch": 9.0, "eval_accuracy": 0.771554289066756, "eval_loss": 0.0442810133099556, "eval_runtime": 1304.569, "eval_samples_per_second": 91.459, "eval_steps_per_second": 0.358, "step": 43740 }, { "epoch": 10.0, "grad_norm": 0.054538544267416, "learning_rate": 0.0009000000000000001, "loss": 0.0304, "step": 48600 }, { "epoch": 10.0, "eval_accuracy": 0.7760130746343712, "eval_loss": 0.0437050461769104, "eval_runtime": 1308.322, "eval_samples_per_second": 91.197, "eval_steps_per_second": 0.357, "step": 48600 }, { "epoch": 11.0, "grad_norm": 0.06947464495897293, "learning_rate": 0.0008900000000000001, "loss": 0.0283, "step": 53460 }, { "epoch": 11.0, "eval_accuracy": 0.7780664627247202, "eval_loss": 0.04394479840993881, "eval_runtime": 1311.6831, "eval_samples_per_second": 90.963, "eval_steps_per_second": 0.356, "step": 53460 }, { "epoch": 12.0, "grad_norm": 0.05090058967471123, "learning_rate": 0.00088, "loss": 0.0264, "step": 58320 }, { "epoch": 12.0, "eval_accuracy": 0.7814943636592214, "eval_loss": 0.04311952739953995, "eval_runtime": 1303.6986, "eval_samples_per_second": 91.52, "eval_steps_per_second": 0.358, "step": 58320 }, { "epoch": 13.0, "grad_norm": 0.05226626992225647, "learning_rate": 0.00087, "loss": 0.0248, "step": 63180 }, { "epoch": 13.0, "eval_accuracy": 0.7855257092570087, "eval_loss": 0.04451437294483185, "eval_runtime": 1313.511, "eval_samples_per_second": 90.837, "eval_steps_per_second": 0.356, "step": 63180 }, { "epoch": 14.0, "grad_norm": 0.053579073399305344, "learning_rate": 0.00086, "loss": 0.0232, "step": 68040 }, { "epoch": 14.0, "eval_accuracy": 0.7867158362318233, "eval_loss": 0.04421268403530121, "eval_runtime": 1304.7705, "eval_samples_per_second": 91.445, "eval_steps_per_second": 0.358, "step": 68040 }, { "epoch": 15.0, "grad_norm": 0.052565447986125946, "learning_rate": 0.00085, "loss": 0.0218, "step": 72900 }, { "epoch": 15.0, "eval_accuracy": 0.7881657796588861, "eval_loss": 0.04461174085736275, "eval_runtime": 1306.4264, "eval_samples_per_second": 91.329, "eval_steps_per_second": 0.357, "step": 72900 }, { "epoch": 16.0, "grad_norm": 0.05223050341010094, "learning_rate": 0.00084, "loss": 0.0206, "step": 77760 }, { "epoch": 16.0, "eval_accuracy": 0.7900850689351716, "eval_loss": 0.046072401106357574, "eval_runtime": 1314.7849, "eval_samples_per_second": 90.749, "eval_steps_per_second": 0.355, "step": 77760 }, { "epoch": 17.0, "grad_norm": 0.04498209059238434, "learning_rate": 0.00083, "loss": 0.0194, "step": 82620 }, { "epoch": 17.0, "eval_accuracy": 0.7911410970959225, "eval_loss": 0.04610202834010124, "eval_runtime": 1308.5405, "eval_samples_per_second": 91.182, "eval_steps_per_second": 0.357, "step": 82620 }, { "epoch": 18.0, "grad_norm": 0.047790784388780594, "learning_rate": 0.00082, "loss": 0.0183, "step": 87480 }, { "epoch": 18.0, "eval_accuracy": 0.7914931064828395, "eval_loss": 0.04640175402164459, "eval_runtime": 1301.9032, "eval_samples_per_second": 91.647, "eval_steps_per_second": 0.359, "step": 87480 }, { "epoch": 19.0, "grad_norm": 0.04554256424307823, "learning_rate": 0.0008100000000000001, "loss": 0.0173, "step": 92340 }, { "epoch": 19.0, "eval_accuracy": 0.7922055064325525, "eval_loss": 0.046802300959825516, "eval_runtime": 1318.5946, "eval_samples_per_second": 90.486, "eval_steps_per_second": 0.354, "step": 92340 }, { "epoch": 20.0, "grad_norm": 0.051229629665613174, "learning_rate": 0.0008, "loss": 0.0166, "step": 97200 }, { "epoch": 20.0, "eval_accuracy": 0.7951556803419519, "eval_loss": 0.04811061546206474, "eval_runtime": 1304.3101, "eval_samples_per_second": 91.477, "eval_steps_per_second": 0.358, "step": 97200 }, { "epoch": 21.0, "grad_norm": 0.04701264947652817, "learning_rate": 0.00079, "loss": 0.0158, "step": 102060 }, { "epoch": 21.0, "eval_accuracy": 0.7942253698193856, "eval_loss": 0.04858441650867462, "eval_runtime": 1303.9515, "eval_samples_per_second": 91.503, "eval_steps_per_second": 0.358, "step": 102060 }, { "epoch": 22.0, "grad_norm": 0.07426326721906662, "learning_rate": 0.0007800000000000001, "loss": 0.015, "step": 106920 }, { "epoch": 22.0, "eval_accuracy": 0.7949126262414616, "eval_loss": 0.048401448875665665, "eval_runtime": 1304.9706, "eval_samples_per_second": 91.431, "eval_steps_per_second": 0.358, "step": 106920 }, { "epoch": 23.0, "grad_norm": 0.05088690295815468, "learning_rate": 0.0007700000000000001, "loss": 0.0143, "step": 111780 }, { "epoch": 23.0, "eval_accuracy": 0.7971001131458744, "eval_loss": 0.04964574798941612, "eval_runtime": 1311.3518, "eval_samples_per_second": 90.986, "eval_steps_per_second": 0.356, "step": 111780 }, { "epoch": 24.0, "grad_norm": 0.04760482534766197, "learning_rate": 0.00076, "loss": 0.0137, "step": 116640 }, { "epoch": 24.0, "eval_accuracy": 0.7959937979298496, "eval_loss": 0.049800001084804535, "eval_runtime": 1306.2706, "eval_samples_per_second": 91.34, "eval_steps_per_second": 0.358, "step": 116640 }, { "epoch": 25.0, "grad_norm": 0.040201518684625626, "learning_rate": 0.00075, "loss": 0.0131, "step": 121500 }, { "epoch": 25.0, "eval_accuracy": 0.7972677366634539, "eval_loss": 0.0510859489440918, "eval_runtime": 1307.6635, "eval_samples_per_second": 91.243, "eval_steps_per_second": 0.357, "step": 121500 }, { "epoch": 26.0, "grad_norm": 0.04697073623538017, "learning_rate": 0.00074, "loss": 0.0125, "step": 126360 }, { "epoch": 26.0, "eval_accuracy": 0.7971336378493903, "eval_loss": 0.05105246230959892, "eval_runtime": 1305.0277, "eval_samples_per_second": 91.427, "eval_steps_per_second": 0.358, "step": 126360 }, { "epoch": 27.0, "grad_norm": 0.035631682723760605, "learning_rate": 0.00073, "loss": 0.012, "step": 131220 }, { "epoch": 27.0, "eval_accuracy": 0.7990780706533127, "eval_loss": 0.051402851939201355, "eval_runtime": 1309.1883, "eval_samples_per_second": 91.137, "eval_steps_per_second": 0.357, "step": 131220 }, { "epoch": 28.0, "grad_norm": 0.053142111748456955, "learning_rate": 0.0007199999999999999, "loss": 0.0116, "step": 136080 }, { "epoch": 28.0, "eval_accuracy": 0.7987009177387587, "eval_loss": 0.05239921808242798, "eval_runtime": 1306.5799, "eval_samples_per_second": 91.319, "eval_steps_per_second": 0.357, "step": 136080 }, { "epoch": 29.0, "grad_norm": 0.04080447182059288, "learning_rate": 0.00071, "loss": 0.0111, "step": 140940 }, { "epoch": 29.0, "eval_accuracy": 0.7965804802413778, "eval_loss": 0.05250364542007446, "eval_runtime": 1309.3121, "eval_samples_per_second": 91.128, "eval_steps_per_second": 0.357, "step": 140940 }, { "epoch": 30.0, "grad_norm": 0.045455146580934525, "learning_rate": 0.0007, "loss": 0.0107, "step": 145800 }, { "epoch": 30.0, "eval_accuracy": 0.7984997695176633, "eval_loss": 0.0543711818754673, "eval_runtime": 1310.1708, "eval_samples_per_second": 91.068, "eval_steps_per_second": 0.356, "step": 145800 }, { "epoch": 31.0, "grad_norm": 0.03871888667345047, "learning_rate": 0.00069, "loss": 0.0104, "step": 150660 }, { "epoch": 31.0, "eval_accuracy": 0.7982399530654151, "eval_loss": 0.053769443184137344, "eval_runtime": 1306.4928, "eval_samples_per_second": 91.325, "eval_steps_per_second": 0.357, "step": 150660 }, { "epoch": 32.0, "grad_norm": 0.048963289707899094, "learning_rate": 0.00068, "loss": 0.01, "step": 155520 }, { "epoch": 32.0, "eval_accuracy": 0.8012404140300884, "eval_loss": 0.05514230951666832, "eval_runtime": 1308.1993, "eval_samples_per_second": 91.206, "eval_steps_per_second": 0.357, "step": 155520 }, { "epoch": 33.0, "grad_norm": 0.03987804055213928, "learning_rate": 0.00067, "loss": 0.0097, "step": 160380 }, { "epoch": 33.0, "eval_accuracy": 0.8019109081004064, "eval_loss": 0.055017631500959396, "eval_runtime": 1312.5155, "eval_samples_per_second": 90.906, "eval_steps_per_second": 0.356, "step": 160380 }, { "epoch": 34.0, "grad_norm": 0.0373803973197937, "learning_rate": 0.00066, "loss": 0.0094, "step": 165240 }, { "epoch": 34.0, "eval_accuracy": 0.7987009177387587, "eval_loss": 0.055196575820446014, "eval_runtime": 1306.0614, "eval_samples_per_second": 91.355, "eval_steps_per_second": 0.358, "step": 165240 }, { "epoch": 35.0, "grad_norm": 0.039517637342214584, "learning_rate": 0.0006500000000000001, "loss": 0.0091, "step": 170100 }, { "epoch": 35.0, "eval_accuracy": 0.8004777270251017, "eval_loss": 0.05661753937602043, "eval_runtime": 1306.965, "eval_samples_per_second": 91.292, "eval_steps_per_second": 0.357, "step": 170100 }, { "epoch": 36.0, "grad_norm": 0.04135722666978836, "learning_rate": 0.00064, "loss": 0.0088, "step": 174960 }, { "epoch": 36.0, "eval_accuracy": 0.8019025269245275, "eval_loss": 0.05708213895559311, "eval_runtime": 1305.0895, "eval_samples_per_second": 91.423, "eval_steps_per_second": 0.358, "step": 174960 }, { "epoch": 37.0, "grad_norm": 0.04340599477291107, "learning_rate": 0.00063, "loss": 0.0085, "step": 179820 }, { "epoch": 37.0, "eval_accuracy": 0.8015421363617315, "eval_loss": 0.0565766803920269, "eval_runtime": 1303.7543, "eval_samples_per_second": 91.516, "eval_steps_per_second": 0.358, "step": 179820 }, { "epoch": 38.0, "grad_norm": 0.042780667543411255, "learning_rate": 0.00062, "loss": 0.0082, "step": 184680 }, { "epoch": 38.0, "eval_accuracy": 0.7985919624523321, "eval_loss": 0.05795786902308464, "eval_runtime": 1306.8337, "eval_samples_per_second": 91.301, "eval_steps_per_second": 0.357, "step": 184680 }, { "epoch": 39.0, "grad_norm": 0.03298887610435486, "learning_rate": 0.00061, "loss": 0.008, "step": 189540 }, { "epoch": 39.0, "eval_accuracy": 0.8006537317185601, "eval_loss": 0.05666106194257736, "eval_runtime": 1303.7751, "eval_samples_per_second": 91.515, "eval_steps_per_second": 0.358, "step": 189540 }, { "epoch": 40.0, "grad_norm": 0.03825366497039795, "learning_rate": 0.0006, "loss": 0.0077, "step": 194400 }, { "epoch": 40.0, "eval_accuracy": 0.8003101035075221, "eval_loss": 0.05909406766295433, "eval_runtime": 1304.1065, "eval_samples_per_second": 91.492, "eval_steps_per_second": 0.358, "step": 194400 }, { "epoch": 41.0, "grad_norm": 0.049214523285627365, "learning_rate": 0.00059, "loss": 0.0075, "step": 199260 }, { "epoch": 41.0, "eval_accuracy": 0.8027406445124251, "eval_loss": 0.0589471310377121, "eval_runtime": 1305.1945, "eval_samples_per_second": 91.415, "eval_steps_per_second": 0.358, "step": 199260 }, { "epoch": 42.0, "grad_norm": 0.03445366024971008, "learning_rate": 0.00058, "loss": 0.0073, "step": 204120 }, { "epoch": 42.0, "eval_accuracy": 0.802765788040062, "eval_loss": 0.05833474174141884, "eval_runtime": 1304.7043, "eval_samples_per_second": 91.45, "eval_steps_per_second": 0.358, "step": 204120 }, { "epoch": 43.0, "grad_norm": 0.031152933835983276, "learning_rate": 0.00057, "loss": 0.007, "step": 208980 }, { "epoch": 43.0, "eval_accuracy": 0.8017851904622219, "eval_loss": 0.05955711379647255, "eval_runtime": 1302.9367, "eval_samples_per_second": 91.574, "eval_steps_per_second": 0.358, "step": 208980 }, { "epoch": 44.0, "grad_norm": 0.028931325301527977, "learning_rate": 0.0005600000000000001, "loss": 0.0069, "step": 213840 }, { "epoch": 44.0, "eval_accuracy": 0.803469806813896, "eval_loss": 0.05940761789679527, "eval_runtime": 1302.3209, "eval_samples_per_second": 91.617, "eval_steps_per_second": 0.359, "step": 213840 }, { "epoch": 45.0, "grad_norm": 0.03164521977305412, "learning_rate": 0.00055, "loss": 0.0066, "step": 218700 }, { "epoch": 45.0, "eval_accuracy": 0.803084272723463, "eval_loss": 0.0604814775288105, "eval_runtime": 1299.6461, "eval_samples_per_second": 91.806, "eval_steps_per_second": 0.359, "step": 218700 }, { "epoch": 46.0, "grad_norm": 0.09477687627077103, "learning_rate": 0.00054, "loss": 0.0065, "step": 223560 }, { "epoch": 46.0, "eval_accuracy": 0.803805053849055, "eval_loss": 0.05929319187998772, "eval_runtime": 1297.2788, "eval_samples_per_second": 91.973, "eval_steps_per_second": 0.36, "step": 223560 }, { "epoch": 47.0, "grad_norm": 0.032785411924123764, "learning_rate": 0.0005300000000000001, "loss": 0.0063, "step": 228420 }, { "epoch": 47.0, "eval_accuracy": 0.8049784184721116, "eval_loss": 0.06024543195962906, "eval_runtime": 1298.8006, "eval_samples_per_second": 91.866, "eval_steps_per_second": 0.36, "step": 228420 }, { "epoch": 48.0, "grad_norm": 0.03677200525999069, "learning_rate": 0.0005200000000000001, "loss": 0.006, "step": 233280 }, { "epoch": 48.0, "eval_accuracy": 0.8040145832460294, "eval_loss": 0.061811413615942, "eval_runtime": 1297.6127, "eval_samples_per_second": 91.95, "eval_steps_per_second": 0.36, "step": 233280 }, { "epoch": 49.0, "grad_norm": 0.030352266505360603, "learning_rate": 0.00051, "loss": 0.0059, "step": 238140 }, { "epoch": 49.0, "eval_accuracy": 0.8042241126430039, "eval_loss": 0.061159055680036545, "eval_runtime": 1299.1406, "eval_samples_per_second": 91.841, "eval_steps_per_second": 0.359, "step": 238140 }, { "epoch": 50.0, "grad_norm": 0.02934379130601883, "learning_rate": 0.0005, "loss": 0.0057, "step": 243000 }, { "epoch": 50.0, "eval_accuracy": 0.8055148137283661, "eval_loss": 0.06327831000089645, "eval_runtime": 1298.5116, "eval_samples_per_second": 91.886, "eval_steps_per_second": 0.36, "step": 243000 }, { "epoch": 51.0, "grad_norm": 0.023088792338967323, "learning_rate": 0.00049, "loss": 0.0055, "step": 247860 }, { "epoch": 51.0, "eval_accuracy": 0.8067971336378494, "eval_loss": 0.06312137842178345, "eval_runtime": 1302.1135, "eval_samples_per_second": 91.632, "eval_steps_per_second": 0.359, "step": 247860 }, { "epoch": 52.0, "grad_norm": 0.03648848831653595, "learning_rate": 0.00048, "loss": 0.0053, "step": 252720 }, { "epoch": 52.0, "eval_accuracy": 0.8061936889745631, "eval_loss": 0.06350181996822357, "eval_runtime": 1297.9213, "eval_samples_per_second": 91.928, "eval_steps_per_second": 0.36, "step": 252720 }, { "epoch": 53.0, "grad_norm": 0.03157039359211922, "learning_rate": 0.00047, "loss": 0.0051, "step": 257580 }, { "epoch": 53.0, "eval_accuracy": 0.8065121736579642, "eval_loss": 0.06361949443817139, "eval_runtime": 1305.9679, "eval_samples_per_second": 91.361, "eval_steps_per_second": 0.358, "step": 257580 }, { "epoch": 54.0, "grad_norm": 0.026564130559563637, "learning_rate": 0.00046, "loss": 0.005, "step": 262440 }, { "epoch": 54.0, "eval_accuracy": 0.8064535054268114, "eval_loss": 0.06370926648378372, "eval_runtime": 1302.6026, "eval_samples_per_second": 91.597, "eval_steps_per_second": 0.359, "step": 262440 }, { "epoch": 55.0, "grad_norm": 0.039359357208013535, "learning_rate": 0.00045000000000000004, "loss": 0.0048, "step": 267300 }, { "epoch": 55.0, "eval_accuracy": 0.8070569500900976, "eval_loss": 0.0649728775024414, "eval_runtime": 1301.934, "eval_samples_per_second": 91.644, "eval_steps_per_second": 0.359, "step": 267300 }, { "epoch": 56.0, "grad_norm": 0.02652502991259098, "learning_rate": 0.00044, "loss": 0.0047, "step": 272160 }, { "epoch": 56.0, "eval_accuracy": 0.80695637597955, "eval_loss": 0.06507979333400726, "eval_runtime": 1302.3742, "eval_samples_per_second": 91.613, "eval_steps_per_second": 0.359, "step": 272160 }, { "epoch": 57.0, "grad_norm": 0.04170479625463486, "learning_rate": 0.00043, "loss": 0.0045, "step": 277020 }, { "epoch": 57.0, "eval_accuracy": 0.8077442065121737, "eval_loss": 0.06572364270687103, "eval_runtime": 1303.9456, "eval_samples_per_second": 91.503, "eval_steps_per_second": 0.358, "step": 277020 }, { "epoch": 58.0, "grad_norm": 0.02988004870712757, "learning_rate": 0.00042, "loss": 0.0044, "step": 281880 }, { "epoch": 58.0, "eval_accuracy": 0.8076687759292629, "eval_loss": 0.06484715640544891, "eval_runtime": 1299.0165, "eval_samples_per_second": 91.85, "eval_steps_per_second": 0.36, "step": 281880 }, { "epoch": 59.0, "grad_norm": 0.031678713858127594, "learning_rate": 0.00041, "loss": 0.0042, "step": 286740 }, { "epoch": 59.0, "eval_accuracy": 0.8078950676779952, "eval_loss": 0.06634358316659927, "eval_runtime": 1304.0645, "eval_samples_per_second": 91.495, "eval_steps_per_second": 0.358, "step": 286740 }, { "epoch": 60.0, "grad_norm": 0.020897777751088142, "learning_rate": 0.0004, "loss": 0.0041, "step": 291600 }, { "epoch": 60.0, "eval_accuracy": 0.8078866865021163, "eval_loss": 0.0666716918349266, "eval_runtime": 1302.0029, "eval_samples_per_second": 91.64, "eval_steps_per_second": 0.359, "step": 291600 }, { "epoch": 61.0, "grad_norm": 0.03830067440867424, "learning_rate": 0.00039000000000000005, "loss": 0.004, "step": 296460 }, { "epoch": 61.0, "eval_accuracy": 0.8104597074969618, "eval_loss": 0.0661536380648613, "eval_runtime": 1303.993, "eval_samples_per_second": 91.5, "eval_steps_per_second": 0.358, "step": 296460 }, { "epoch": 62.0, "grad_norm": 0.025591198354959488, "learning_rate": 0.00038, "loss": 0.0037, "step": 301320 }, { "epoch": 62.0, "eval_accuracy": 0.809722164019612, "eval_loss": 0.06793326884508133, "eval_runtime": 1302.3775, "eval_samples_per_second": 91.613, "eval_steps_per_second": 0.359, "step": 301320 }, { "epoch": 63.0, "grad_norm": 0.029204251244664192, "learning_rate": 0.00037, "loss": 0.0036, "step": 306180 }, { "epoch": 63.0, "eval_accuracy": 0.8103172275070193, "eval_loss": 0.06969352066516876, "eval_runtime": 1309.132, "eval_samples_per_second": 91.141, "eval_steps_per_second": 0.357, "step": 306180 }, { "epoch": 64.0, "grad_norm": 0.029880277812480927, "learning_rate": 0.00035999999999999997, "loss": 0.0035, "step": 311040 }, { "epoch": 64.0, "eval_accuracy": 0.810057411054771, "eval_loss": 0.06940728425979614, "eval_runtime": 1308.9186, "eval_samples_per_second": 91.155, "eval_steps_per_second": 0.357, "step": 311040 }, { "epoch": 65.0, "grad_norm": 0.030717821791768074, "learning_rate": 0.00035, "loss": 0.0034, "step": 315900 }, { "epoch": 65.0, "eval_accuracy": 0.810082554582408, "eval_loss": 0.06925758719444275, "eval_runtime": 1308.5599, "eval_samples_per_second": 91.18, "eval_steps_per_second": 0.357, "step": 315900 }, { "epoch": 66.0, "grad_norm": 0.02681083045899868, "learning_rate": 0.00034, "loss": 0.0032, "step": 320760 }, { "epoch": 66.0, "eval_accuracy": 0.8104597074969618, "eval_loss": 0.07081950455904007, "eval_runtime": 1307.5379, "eval_samples_per_second": 91.252, "eval_steps_per_second": 0.357, "step": 320760 }, { "epoch": 67.0, "grad_norm": 0.023366352543234825, "learning_rate": 0.00033, "loss": 0.0031, "step": 325620 }, { "epoch": 67.0, "eval_accuracy": 0.8130411096676864, "eval_loss": 0.07085347920656204, "eval_runtime": 1306.1303, "eval_samples_per_second": 91.35, "eval_steps_per_second": 0.358, "step": 325620 }, { "epoch": 68.0, "grad_norm": 0.018703831359744072, "learning_rate": 0.00032, "loss": 0.0029, "step": 330480 }, { "epoch": 68.0, "eval_accuracy": 0.8118090768134769, "eval_loss": 0.07043693959712982, "eval_runtime": 1308.3789, "eval_samples_per_second": 91.193, "eval_steps_per_second": 0.357, "step": 330480 }, { "epoch": 69.0, "grad_norm": 0.021384961903095245, "learning_rate": 0.00031, "loss": 0.0028, "step": 335340 }, { "epoch": 69.0, "eval_accuracy": 0.8111721074466748, "eval_loss": 0.0710659921169281, "eval_runtime": 1306.3675, "eval_samples_per_second": 91.333, "eval_steps_per_second": 0.357, "step": 335340 }, { "epoch": 70.0, "grad_norm": 0.027229884639382362, "learning_rate": 0.0003, "loss": 0.0027, "step": 340200 }, { "epoch": 70.0, "eval_accuracy": 0.811775552109961, "eval_loss": 0.07277531921863556, "eval_runtime": 1306.0312, "eval_samples_per_second": 91.357, "eval_steps_per_second": 0.358, "step": 340200 }, { "epoch": 71.0, "grad_norm": 0.024667974561452866, "learning_rate": 0.00029, "loss": 0.0025, "step": 345060 }, { "epoch": 71.0, "eval_accuracy": 0.8127980555671961, "eval_loss": 0.07439424097537994, "eval_runtime": 1304.9752, "eval_samples_per_second": 91.431, "eval_steps_per_second": 0.358, "step": 345060 }, { "epoch": 72.0, "grad_norm": 0.019673120230436325, "learning_rate": 0.00028000000000000003, "loss": 0.0024, "step": 349920 }, { "epoch": 72.0, "eval_accuracy": 0.8129572979088966, "eval_loss": 0.07484369724988937, "eval_runtime": 1304.8727, "eval_samples_per_second": 91.438, "eval_steps_per_second": 0.358, "step": 349920 }, { "epoch": 73.0, "grad_norm": 0.023998018354177475, "learning_rate": 0.00027, "loss": 0.0023, "step": 354780 }, { "epoch": 73.0, "eval_accuracy": 0.8130662531953233, "eval_loss": 0.07576391845941544, "eval_runtime": 1307.7757, "eval_samples_per_second": 91.235, "eval_steps_per_second": 0.357, "step": 354780 }, { "epoch": 74.0, "grad_norm": 0.04219399765133858, "learning_rate": 0.00026000000000000003, "loss": 0.0022, "step": 359640 }, { "epoch": 74.0, "eval_accuracy": 0.8143485731048066, "eval_loss": 0.07631613314151764, "eval_runtime": 1306.0365, "eval_samples_per_second": 91.357, "eval_steps_per_second": 0.358, "step": 359640 }, { "epoch": 75.0, "grad_norm": 0.021946126595139503, "learning_rate": 0.00025, "loss": 0.0021, "step": 364500 }, { "epoch": 75.0, "eval_accuracy": 0.8144994342706282, "eval_loss": 0.07692206650972366, "eval_runtime": 1303.767, "eval_samples_per_second": 91.516, "eval_steps_per_second": 0.358, "step": 364500 }, { "epoch": 76.0, "grad_norm": 0.023164469748735428, "learning_rate": 0.00024, "loss": 0.0019, "step": 369360 }, { "epoch": 76.0, "eval_accuracy": 0.8133176884716926, "eval_loss": 0.0780106782913208, "eval_runtime": 1305.1212, "eval_samples_per_second": 91.421, "eval_steps_per_second": 0.358, "step": 369360 }, { "epoch": 77.0, "grad_norm": 0.02851826325058937, "learning_rate": 0.00023, "loss": 0.0018, "step": 374220 }, { "epoch": 77.0, "eval_accuracy": 0.8146335330846918, "eval_loss": 0.0777156725525856, "eval_runtime": 1304.8456, "eval_samples_per_second": 91.44, "eval_steps_per_second": 0.358, "step": 374220 }, { "epoch": 78.0, "grad_norm": 0.0253597479313612, "learning_rate": 0.00022, "loss": 0.0017, "step": 379080 }, { "epoch": 78.0, "eval_accuracy": 0.8162175753258182, "eval_loss": 0.07899600267410278, "eval_runtime": 1303.5402, "eval_samples_per_second": 91.532, "eval_steps_per_second": 0.358, "step": 379080 }, { "epoch": 79.0, "grad_norm": 0.02291404828429222, "learning_rate": 0.00021, "loss": 0.0016, "step": 383940 }, { "epoch": 79.0, "eval_accuracy": 0.8145245777982651, "eval_loss": 0.08102333545684814, "eval_runtime": 1304.8976, "eval_samples_per_second": 91.436, "eval_steps_per_second": 0.358, "step": 383940 }, { "epoch": 80.0, "grad_norm": 0.029658950865268707, "learning_rate": 0.0002, "loss": 0.0015, "step": 388800 }, { "epoch": 80.0, "eval_accuracy": 0.8161170012152705, "eval_loss": 0.08235077559947968, "eval_runtime": 1306.0263, "eval_samples_per_second": 91.357, "eval_steps_per_second": 0.358, "step": 388800 }, { "epoch": 81.0, "grad_norm": 0.02459796331822872, "learning_rate": 0.00019, "loss": 0.0014, "step": 393660 }, { "epoch": 81.0, "eval_accuracy": 0.815940996521812, "eval_loss": 0.08271630853414536, "eval_runtime": 1306.1311, "eval_samples_per_second": 91.35, "eval_steps_per_second": 0.358, "step": 393660 }, { "epoch": 82.0, "grad_norm": 0.01461075246334076, "learning_rate": 0.00017999999999999998, "loss": 0.0013, "step": 398520 }, { "epoch": 82.0, "eval_accuracy": 0.8159745212253279, "eval_loss": 0.08540969341993332, "eval_runtime": 1303.2922, "eval_samples_per_second": 91.549, "eval_steps_per_second": 0.358, "step": 398520 }, { "epoch": 83.0, "grad_norm": 0.011922557838261127, "learning_rate": 0.00017, "loss": 0.0012, "step": 403380 }, { "epoch": 83.0, "eval_accuracy": 0.8165025353057034, "eval_loss": 0.08608754724264145, "eval_runtime": 1300.8769, "eval_samples_per_second": 91.719, "eval_steps_per_second": 0.359, "step": 403380 }, { "epoch": 84.0, "grad_norm": 0.006066465750336647, "learning_rate": 0.00016, "loss": 0.0011, "step": 408240 }, { "epoch": 84.0, "eval_accuracy": 0.8179943846121611, "eval_loss": 0.0866456851363182, "eval_runtime": 1297.1839, "eval_samples_per_second": 91.98, "eval_steps_per_second": 0.36, "step": 408240 }, { "epoch": 85.0, "grad_norm": 0.019887538626790047, "learning_rate": 0.00015, "loss": 0.001, "step": 413100 }, { "epoch": 85.0, "eval_accuracy": 0.8174747517076646, "eval_loss": 0.08994536101818085, "eval_runtime": 1297.8948, "eval_samples_per_second": 91.93, "eval_steps_per_second": 0.36, "step": 413100 }, { "epoch": 86.0, "grad_norm": 0.019588502123951912, "learning_rate": 0.00014000000000000001, "loss": 0.0009, "step": 417960 }, { "epoch": 86.0, "eval_accuracy": 0.8186648786824792, "eval_loss": 0.08895347267389297, "eval_runtime": 1297.8927, "eval_samples_per_second": 91.93, "eval_steps_per_second": 0.36, "step": 417960 }, { "epoch": 87.0, "grad_norm": 0.019738251343369484, "learning_rate": 0.00013000000000000002, "loss": 0.0008, "step": 422820 }, { "epoch": 87.0, "eval_accuracy": 0.8184553492855048, "eval_loss": 0.09202321618795395, "eval_runtime": 1297.7904, "eval_samples_per_second": 91.937, "eval_steps_per_second": 0.36, "step": 422820 }, { "epoch": 88.0, "grad_norm": 0.0069356439635157585, "learning_rate": 0.00012, "loss": 0.0008, "step": 427680 }, { "epoch": 88.0, "eval_accuracy": 0.8189749821900013, "eval_loss": 0.09301886707544327, "eval_runtime": 1297.4851, "eval_samples_per_second": 91.959, "eval_steps_per_second": 0.36, "step": 427680 }, { "epoch": 89.0, "grad_norm": 0.0168699249625206, "learning_rate": 0.00011, "loss": 0.0007, "step": 432540 }, { "epoch": 89.0, "eval_accuracy": 0.8196370950844404, "eval_loss": 0.09475909918546677, "eval_runtime": 1297.2344, "eval_samples_per_second": 91.976, "eval_steps_per_second": 0.36, "step": 432540 }, { "epoch": 90.0, "grad_norm": 0.00997143518179655, "learning_rate": 0.0001, "loss": 0.0006, "step": 437400 }, { "epoch": 90.0, "eval_accuracy": 0.8189917445417592, "eval_loss": 0.09576508402824402, "eval_runtime": 1296.498, "eval_samples_per_second": 92.029, "eval_steps_per_second": 0.36, "step": 437400 }, { "epoch": 91.0, "grad_norm": 0.017046066001057625, "learning_rate": 8.999999999999999e-05, "loss": 0.0005, "step": 442260 }, { "epoch": 91.0, "eval_accuracy": 0.8195365209738926, "eval_loss": 0.09832222014665604, "eval_runtime": 1296.9323, "eval_samples_per_second": 91.998, "eval_steps_per_second": 0.36, "step": 442260 }, { "epoch": 92.0, "grad_norm": 0.01709928549826145, "learning_rate": 8e-05, "loss": 0.0005, "step": 447120 }, { "epoch": 92.0, "eval_accuracy": 0.8208523655868918, "eval_loss": 0.10070452094078064, "eval_runtime": 1296.8404, "eval_samples_per_second": 92.004, "eval_steps_per_second": 0.36, "step": 447120 }, { "epoch": 93.0, "grad_norm": 0.014434403739869595, "learning_rate": 7.000000000000001e-05, "loss": 0.0004, "step": 451980 }, { "epoch": 93.0, "eval_accuracy": 0.8202153962200897, "eval_loss": 0.10308787226676941, "eval_runtime": 1304.3584, "eval_samples_per_second": 91.474, "eval_steps_per_second": 0.358, "step": 451980 }, { "epoch": 94.0, "grad_norm": 0.014810960739850998, "learning_rate": 6e-05, "loss": 0.0004, "step": 456840 }, { "epoch": 94.0, "eval_accuracy": 0.82111218203914, "eval_loss": 0.10589364916086197, "eval_runtime": 1302.0953, "eval_samples_per_second": 91.633, "eval_steps_per_second": 0.359, "step": 456840 }, { "epoch": 95.0, "grad_norm": 0.0034873096738010645, "learning_rate": 5e-05, "loss": 0.0003, "step": 461700 }, { "epoch": 95.0, "eval_accuracy": 0.8215982902401207, "eval_loss": 0.10970806330442429, "eval_runtime": 1301.0636, "eval_samples_per_second": 91.706, "eval_steps_per_second": 0.359, "step": 461700 }, { "epoch": 96.0, "grad_norm": 0.002740664640441537, "learning_rate": 4e-05, "loss": 0.0003, "step": 466560 }, { "epoch": 96.0, "eval_accuracy": 0.8221179231446172, "eval_loss": 0.11456754803657532, "eval_runtime": 1302.7083, "eval_samples_per_second": 91.59, "eval_steps_per_second": 0.358, "step": 466560 }, { "epoch": 97.0, "grad_norm": 0.0014008020516484976, "learning_rate": 3e-05, "loss": 0.0002, "step": 471420 }, { "epoch": 97.0, "eval_accuracy": 0.8224364078280183, "eval_loss": 0.11764019727706909, "eval_runtime": 1302.3991, "eval_samples_per_second": 91.612, "eval_steps_per_second": 0.359, "step": 471420 }, { "epoch": 98.0, "grad_norm": 0.0054730623960494995, "learning_rate": 2e-05, "loss": 0.0002, "step": 476280 }, { "epoch": 98.0, "eval_accuracy": 0.8226543184008717, "eval_loss": 0.12025844305753708, "eval_runtime": 1302.1251, "eval_samples_per_second": 91.631, "eval_steps_per_second": 0.359, "step": 476280 }, { "epoch": 99.0, "grad_norm": 0.0020766761153936386, "learning_rate": 1e-05, "loss": 0.0002, "step": 481140 }, { "epoch": 99.0, "eval_accuracy": 0.8226878431043876, "eval_loss": 0.1223362609744072, "eval_runtime": 1301.9742, "eval_samples_per_second": 91.642, "eval_steps_per_second": 0.359, "step": 481140 }, { "epoch": 100.0, "grad_norm": 0.018789879977703094, "learning_rate": 0.0, "loss": 0.0001, "step": 486000 }, { "epoch": 100.0, "eval_accuracy": 0.8229895654360306, "eval_loss": 0.12378211319446564, "eval_runtime": 1301.6752, "eval_samples_per_second": 91.663, "eval_steps_per_second": 0.359, "step": 486000 } ], "logging_steps": 500, "max_steps": 486000, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 2.2219508552555553e+19, "train_batch_size": 256, "trial_name": null, "trial_params": null }