{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 200, "global_step": 5463, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005491488193300384, "grad_norm": 1.4225262906610563, "learning_rate": 0.0, "loss": 0.4574, "step": 1 }, { "epoch": 0.001098297638660077, "grad_norm": 1.2088518087838116, "learning_rate": 1.729696904450771e-07, "loss": 0.4484, "step": 2 }, { "epoch": 0.0016474464579901153, "grad_norm": 1.75482712948527, "learning_rate": 2.741504731167937e-07, "loss": 0.553, "step": 3 }, { "epoch": 0.002196595277320154, "grad_norm": 1.6476495401371742, "learning_rate": 3.459393808901542e-07, "loss": 0.4866, "step": 4 }, { "epoch": 0.0027457440966501922, "grad_norm": 1.168396277727929, "learning_rate": 4.016231838083946e-07, "loss": 0.4492, "step": 5 }, { "epoch": 0.0032948929159802307, "grad_norm": 1.8756805874064717, "learning_rate": 4.4712016356187073e-07, "loss": 0.5393, "step": 6 }, { "epoch": 0.003844041735310269, "grad_norm": 1.4295957251308191, "learning_rate": 4.855873118377673e-07, "loss": 0.4612, "step": 7 }, { "epoch": 0.004393190554640308, "grad_norm": 1.3019184497919265, "learning_rate": 5.189090713352312e-07, "loss": 0.4273, "step": 8 }, { "epoch": 0.004942339373970346, "grad_norm": 1.4301794602572218, "learning_rate": 5.483009462335874e-07, "loss": 0.4187, "step": 9 }, { "epoch": 0.0054914881933003845, "grad_norm": 1.1171001206048818, "learning_rate": 5.745928742534718e-07, "loss": 0.4334, "step": 10 }, { "epoch": 0.0060406370126304225, "grad_norm": 1.2023161985960118, "learning_rate": 5.983768161916053e-07, "loss": 0.4776, "step": 11 }, { "epoch": 0.006589785831960461, "grad_norm": 1.561592987562026, "learning_rate": 6.200898540069478e-07, "loss": 0.4274, "step": 12 }, { "epoch": 0.007138934651290499, "grad_norm": 1.0610561139148416, "learning_rate": 6.40063912557533e-07, "loss": 0.4131, "step": 13 }, { "epoch": 0.007688083470620538, "grad_norm": 1.0984338347008844, "learning_rate": 6.585570022828442e-07, "loss": 0.4548, "step": 14 }, { "epoch": 0.008237232289950576, "grad_norm": 1.267906923097231, "learning_rate": 6.757736569251883e-07, "loss": 0.4912, "step": 15 }, { "epoch": 0.008786381109280615, "grad_norm": 2.054936399665518, "learning_rate": 6.918787617803084e-07, "loss": 0.5067, "step": 16 }, { "epoch": 0.009335529928610654, "grad_norm": 1.0888894685956663, "learning_rate": 7.070071823568266e-07, "loss": 0.3836, "step": 17 }, { "epoch": 0.009884678747940691, "grad_norm": 0.933933401628967, "learning_rate": 7.212706366786644e-07, "loss": 0.4321, "step": 18 }, { "epoch": 0.01043382756727073, "grad_norm": 1.2062385692313455, "learning_rate": 7.34762707033463e-07, "loss": 0.4584, "step": 19 }, { "epoch": 0.010982976386600769, "grad_norm": 1.5470382581754798, "learning_rate": 7.475625646985488e-07, "loss": 0.5263, "step": 20 }, { "epoch": 0.011532125205930808, "grad_norm": 1.052236373995569, "learning_rate": 7.59737784954561e-07, "loss": 0.4506, "step": 21 }, { "epoch": 0.012081274025260845, "grad_norm": 1.0539356257460601, "learning_rate": 7.713465066366824e-07, "loss": 0.4079, "step": 22 }, { "epoch": 0.012630422844590884, "grad_norm": 0.7116874613543462, "learning_rate": 7.824391112483089e-07, "loss": 0.4279, "step": 23 }, { "epoch": 0.013179571663920923, "grad_norm": 0.9701267618020503, "learning_rate": 7.930595444520249e-07, "loss": 0.4392, "step": 24 }, { "epoch": 0.013728720483250962, "grad_norm": 0.7554895854135255, "learning_rate": 8.032463676167892e-07, "loss": 0.4201, "step": 25 }, { "epoch": 0.014277869302580999, "grad_norm": 0.9220843488420176, "learning_rate": 8.130336030026101e-07, "loss": 0.4435, "step": 26 }, { "epoch": 0.014827018121911038, "grad_norm": 1.1944285424449907, "learning_rate": 8.22451419350381e-07, "loss": 0.4906, "step": 27 }, { "epoch": 0.015376166941241077, "grad_norm": 0.7683622840626223, "learning_rate": 8.315266927279214e-07, "loss": 0.4315, "step": 28 }, { "epoch": 0.015925315760571115, "grad_norm": 0.8412199225414874, "learning_rate": 8.402834689152837e-07, "loss": 0.412, "step": 29 }, { "epoch": 0.016474464579901153, "grad_norm": 0.9195353340666571, "learning_rate": 8.487433473702655e-07, "loss": 0.425, "step": 30 }, { "epoch": 0.017023613399231193, "grad_norm": 1.014454974143959, "learning_rate": 8.569258022117608e-07, "loss": 0.4129, "step": 31 }, { "epoch": 0.01757276221856123, "grad_norm": 0.9286572043623594, "learning_rate": 8.648484522253854e-07, "loss": 0.4265, "step": 32 }, { "epoch": 0.018121911037891267, "grad_norm": 0.8888140783501768, "learning_rate": 8.725272893083989e-07, "loss": 0.4111, "step": 33 }, { "epoch": 0.018671059857221308, "grad_norm": 0.8673559176229574, "learning_rate": 8.799768728019036e-07, "loss": 0.4202, "step": 34 }, { "epoch": 0.019220208676551345, "grad_norm": 0.7808202407404673, "learning_rate": 8.87210495646162e-07, "loss": 0.4342, "step": 35 }, { "epoch": 0.019769357495881382, "grad_norm": 0.6796871324723183, "learning_rate": 8.942403271237415e-07, "loss": 0.3855, "step": 36 }, { "epoch": 0.020318506315211423, "grad_norm": 0.6081508652500198, "learning_rate": 9.010775360409045e-07, "loss": 0.4036, "step": 37 }, { "epoch": 0.02086765513454146, "grad_norm": 0.7420082429740513, "learning_rate": 9.077323974785401e-07, "loss": 0.4641, "step": 38 }, { "epoch": 0.0214168039538715, "grad_norm": 0.7524300594082499, "learning_rate": 9.142143856743266e-07, "loss": 0.4307, "step": 39 }, { "epoch": 0.021965952773201538, "grad_norm": 0.7009814899366995, "learning_rate": 9.205322551436259e-07, "loss": 0.4238, "step": 40 }, { "epoch": 0.022515101592531575, "grad_norm": 0.5935645569829775, "learning_rate": 9.266941117821921e-07, "loss": 0.4077, "step": 41 }, { "epoch": 0.023064250411861616, "grad_norm": 0.5282148216196204, "learning_rate": 9.32707475399638e-07, "loss": 0.4575, "step": 42 }, { "epoch": 0.023613399231191653, "grad_norm": 0.9126595655856949, "learning_rate": 9.38579334893854e-07, "loss": 0.4678, "step": 43 }, { "epoch": 0.02416254805052169, "grad_norm": 0.5936890079068945, "learning_rate": 9.443161970817594e-07, "loss": 0.3801, "step": 44 }, { "epoch": 0.02471169686985173, "grad_norm": 0.5387510022171181, "learning_rate": 9.499241300419819e-07, "loss": 0.4201, "step": 45 }, { "epoch": 0.025260845689181768, "grad_norm": 0.49338476111037977, "learning_rate": 9.55408801693386e-07, "loss": 0.4022, "step": 46 }, { "epoch": 0.02580999450851181, "grad_norm": 0.6924157033463041, "learning_rate": 9.60775514224357e-07, "loss": 0.4585, "step": 47 }, { "epoch": 0.026359143327841845, "grad_norm": 1.0357916266494314, "learning_rate": 9.66029234897102e-07, "loss": 0.4567, "step": 48 }, { "epoch": 0.026908292147171883, "grad_norm": 0.6202871224353081, "learning_rate": 9.711746236755347e-07, "loss": 0.3966, "step": 49 }, { "epoch": 0.027457440966501923, "grad_norm": 0.5651799439240893, "learning_rate": 9.762160580618663e-07, "loss": 0.4312, "step": 50 }, { "epoch": 0.02800658978583196, "grad_norm": 0.5999985257992868, "learning_rate": 9.811576554736202e-07, "loss": 0.3799, "step": 51 }, { "epoch": 0.028555738605161998, "grad_norm": 0.6818309292774629, "learning_rate": 9.860032934476873e-07, "loss": 0.3931, "step": 52 }, { "epoch": 0.029104887424492038, "grad_norm": 0.6041213206398276, "learning_rate": 9.907566279198219e-07, "loss": 0.4585, "step": 53 }, { "epoch": 0.029654036243822075, "grad_norm": 0.5836230976134994, "learning_rate": 9.954211097954582e-07, "loss": 0.3951, "step": 54 }, { "epoch": 0.030203185063152116, "grad_norm": 0.6090123753109918, "learning_rate": 1e-06, "loss": 0.4539, "step": 55 }, { "epoch": 0.030752333882482153, "grad_norm": 0.5963904836943319, "learning_rate": 9.999999156426468e-07, "loss": 0.4438, "step": 56 }, { "epoch": 0.03130148270181219, "grad_norm": 0.5258024706804068, "learning_rate": 9.99999662570616e-07, "loss": 0.4109, "step": 57 }, { "epoch": 0.03185063152114223, "grad_norm": 0.591275210526606, "learning_rate": 9.999992407839927e-07, "loss": 0.5208, "step": 58 }, { "epoch": 0.032399780340472265, "grad_norm": 0.5624958905139562, "learning_rate": 9.999986502829197e-07, "loss": 0.4294, "step": 59 }, { "epoch": 0.032948929159802305, "grad_norm": 0.535445043933981, "learning_rate": 9.999978910675956e-07, "loss": 0.4044, "step": 60 }, { "epoch": 0.033498077979132346, "grad_norm": 0.5665677363927388, "learning_rate": 9.999969631382771e-07, "loss": 0.3916, "step": 61 }, { "epoch": 0.034047226798462386, "grad_norm": 0.913452655593011, "learning_rate": 9.999958664952773e-07, "loss": 0.4627, "step": 62 }, { "epoch": 0.03459637561779242, "grad_norm": 0.6483781045962704, "learning_rate": 9.99994601138966e-07, "loss": 0.4445, "step": 63 }, { "epoch": 0.03514552443712246, "grad_norm": 0.5532138331818498, "learning_rate": 9.999931670697708e-07, "loss": 0.3963, "step": 64 }, { "epoch": 0.0356946732564525, "grad_norm": 0.5119120510194277, "learning_rate": 9.999915642881749e-07, "loss": 0.3909, "step": 65 }, { "epoch": 0.036243822075782535, "grad_norm": 0.4881257479615943, "learning_rate": 9.999897927947198e-07, "loss": 0.4045, "step": 66 }, { "epoch": 0.036792970895112576, "grad_norm": 0.680486722717849, "learning_rate": 9.999878525900026e-07, "loss": 0.4202, "step": 67 }, { "epoch": 0.037342119714442616, "grad_norm": 0.904382014225488, "learning_rate": 9.99985743674679e-07, "loss": 0.4007, "step": 68 }, { "epoch": 0.03789126853377265, "grad_norm": 0.5022031033791474, "learning_rate": 9.999834660494596e-07, "loss": 0.3872, "step": 69 }, { "epoch": 0.03844041735310269, "grad_norm": 0.5949380451119111, "learning_rate": 9.999810197151142e-07, "loss": 0.3993, "step": 70 }, { "epoch": 0.03898956617243273, "grad_norm": 0.42789905818799473, "learning_rate": 9.999784046724673e-07, "loss": 0.4247, "step": 71 }, { "epoch": 0.039538714991762765, "grad_norm": 0.4358760331006809, "learning_rate": 9.99975620922402e-07, "loss": 0.4127, "step": 72 }, { "epoch": 0.040087863811092805, "grad_norm": 0.39731808288616693, "learning_rate": 9.999726684658574e-07, "loss": 0.4059, "step": 73 }, { "epoch": 0.040637012630422846, "grad_norm": 0.5998558014410175, "learning_rate": 9.9996954730383e-07, "loss": 0.4605, "step": 74 }, { "epoch": 0.04118616144975288, "grad_norm": 0.7401577298826412, "learning_rate": 9.999662574373731e-07, "loss": 0.4501, "step": 75 }, { "epoch": 0.04173531026908292, "grad_norm": 0.4158209983397508, "learning_rate": 9.99962798867597e-07, "loss": 0.4015, "step": 76 }, { "epoch": 0.04228445908841296, "grad_norm": 0.41274797354801207, "learning_rate": 9.999591715956685e-07, "loss": 0.3902, "step": 77 }, { "epoch": 0.042833607907743, "grad_norm": 0.5850701714590281, "learning_rate": 9.999553756228119e-07, "loss": 0.4142, "step": 78 }, { "epoch": 0.043382756727073035, "grad_norm": 0.5032840143749217, "learning_rate": 9.999514109503082e-07, "loss": 0.403, "step": 79 }, { "epoch": 0.043931905546403076, "grad_norm": 0.43979292990216845, "learning_rate": 9.999472775794953e-07, "loss": 0.3966, "step": 80 }, { "epoch": 0.044481054365733116, "grad_norm": 0.40361288414169005, "learning_rate": 9.99942975511768e-07, "loss": 0.4216, "step": 81 }, { "epoch": 0.04503020318506315, "grad_norm": 0.39618981132075415, "learning_rate": 9.999385047485781e-07, "loss": 0.4044, "step": 82 }, { "epoch": 0.04557935200439319, "grad_norm": 0.7884738418257248, "learning_rate": 9.999338652914345e-07, "loss": 0.4311, "step": 83 }, { "epoch": 0.04612850082372323, "grad_norm": 0.583681056654287, "learning_rate": 9.999290571419028e-07, "loss": 0.4447, "step": 84 }, { "epoch": 0.046677649643053265, "grad_norm": 0.454936239495054, "learning_rate": 9.999240803016054e-07, "loss": 0.4046, "step": 85 }, { "epoch": 0.047226798462383306, "grad_norm": 0.6575839772121563, "learning_rate": 9.999189347722217e-07, "loss": 0.3932, "step": 86 }, { "epoch": 0.047775947281713346, "grad_norm": 0.5180410212467814, "learning_rate": 9.999136205554885e-07, "loss": 0.431, "step": 87 }, { "epoch": 0.04832509610104338, "grad_norm": 0.4137122043878319, "learning_rate": 9.99908137653199e-07, "loss": 0.389, "step": 88 }, { "epoch": 0.04887424492037342, "grad_norm": 0.3701143520536436, "learning_rate": 9.999024860672035e-07, "loss": 0.4106, "step": 89 }, { "epoch": 0.04942339373970346, "grad_norm": 0.5263597284937761, "learning_rate": 9.99896665799409e-07, "loss": 0.4214, "step": 90 }, { "epoch": 0.049972542559033495, "grad_norm": 0.47557602438131685, "learning_rate": 9.998906768517797e-07, "loss": 0.3858, "step": 91 }, { "epoch": 0.050521691378363535, "grad_norm": 0.47864372777099196, "learning_rate": 9.998845192263367e-07, "loss": 0.4032, "step": 92 }, { "epoch": 0.051070840197693576, "grad_norm": 0.4086842726878955, "learning_rate": 9.998781929251582e-07, "loss": 0.3923, "step": 93 }, { "epoch": 0.05161998901702362, "grad_norm": 0.41763438148578, "learning_rate": 9.998716979503789e-07, "loss": 0.359, "step": 94 }, { "epoch": 0.05216913783635365, "grad_norm": 0.41013402413321715, "learning_rate": 9.998650343041904e-07, "loss": 0.3892, "step": 95 }, { "epoch": 0.05271828665568369, "grad_norm": 0.3894742717828985, "learning_rate": 9.998582019888418e-07, "loss": 0.4144, "step": 96 }, { "epoch": 0.05326743547501373, "grad_norm": 0.496851285380317, "learning_rate": 9.998512010066385e-07, "loss": 0.3975, "step": 97 }, { "epoch": 0.053816584294343765, "grad_norm": 0.40890426232079896, "learning_rate": 9.998440313599432e-07, "loss": 0.4108, "step": 98 }, { "epoch": 0.054365733113673806, "grad_norm": 0.7032273015391635, "learning_rate": 9.998366930511754e-07, "loss": 0.4381, "step": 99 }, { "epoch": 0.054914881933003847, "grad_norm": 0.481860551076321, "learning_rate": 9.998291860828114e-07, "loss": 0.4481, "step": 100 }, { "epoch": 0.05546403075233388, "grad_norm": 0.3960595845175762, "learning_rate": 9.998215104573845e-07, "loss": 0.4003, "step": 101 }, { "epoch": 0.05601317957166392, "grad_norm": 0.5138990272499223, "learning_rate": 9.998136661774851e-07, "loss": 0.3848, "step": 102 }, { "epoch": 0.05656232839099396, "grad_norm": 0.46259719838647606, "learning_rate": 9.998056532457605e-07, "loss": 0.4253, "step": 103 }, { "epoch": 0.057111477210323995, "grad_norm": 0.4865639506859819, "learning_rate": 9.997974716649143e-07, "loss": 0.4173, "step": 104 }, { "epoch": 0.057660626029654036, "grad_norm": 0.5542792137654188, "learning_rate": 9.997891214377078e-07, "loss": 0.3924, "step": 105 }, { "epoch": 0.058209774848984076, "grad_norm": 0.3970870620860831, "learning_rate": 9.997806025669586e-07, "loss": 0.386, "step": 106 }, { "epoch": 0.05875892366831411, "grad_norm": 0.8684158596164165, "learning_rate": 9.99771915055542e-07, "loss": 0.3942, "step": 107 }, { "epoch": 0.05930807248764415, "grad_norm": 0.39883827363387453, "learning_rate": 9.997630589063896e-07, "loss": 0.3644, "step": 108 }, { "epoch": 0.05985722130697419, "grad_norm": 0.3977619556884487, "learning_rate": 9.997540341224897e-07, "loss": 0.3692, "step": 109 }, { "epoch": 0.06040637012630423, "grad_norm": 0.5584968084463278, "learning_rate": 9.997448407068881e-07, "loss": 0.3847, "step": 110 }, { "epoch": 0.060955518945634266, "grad_norm": 0.4901307252729202, "learning_rate": 9.997354786626871e-07, "loss": 0.3894, "step": 111 }, { "epoch": 0.061504667764964306, "grad_norm": 0.5357429553372771, "learning_rate": 9.99725947993046e-07, "loss": 0.3555, "step": 112 }, { "epoch": 0.06205381658429435, "grad_norm": 0.8108112801429885, "learning_rate": 9.99716248701181e-07, "loss": 0.3882, "step": 113 }, { "epoch": 0.06260296540362438, "grad_norm": 0.391691207820883, "learning_rate": 9.997063807903656e-07, "loss": 0.3741, "step": 114 }, { "epoch": 0.06315211422295441, "grad_norm": 0.3796279632840399, "learning_rate": 9.996963442639298e-07, "loss": 0.3228, "step": 115 }, { "epoch": 0.06370126304228446, "grad_norm": 0.42435147960702796, "learning_rate": 9.996861391252602e-07, "loss": 0.4165, "step": 116 }, { "epoch": 0.0642504118616145, "grad_norm": 0.5821283761287043, "learning_rate": 9.99675765377801e-07, "loss": 0.4116, "step": 117 }, { "epoch": 0.06479956068094453, "grad_norm": 0.5389945137755192, "learning_rate": 9.996652230250526e-07, "loss": 0.3862, "step": 118 }, { "epoch": 0.06534870950027458, "grad_norm": 0.4445522665144674, "learning_rate": 9.99654512070573e-07, "loss": 0.4287, "step": 119 }, { "epoch": 0.06589785831960461, "grad_norm": 0.5060430282410254, "learning_rate": 9.996436325179768e-07, "loss": 0.3616, "step": 120 }, { "epoch": 0.06644700713893466, "grad_norm": 0.30509818128951455, "learning_rate": 9.996325843709352e-07, "loss": 0.3365, "step": 121 }, { "epoch": 0.06699615595826469, "grad_norm": 0.515201460268608, "learning_rate": 9.996213676331764e-07, "loss": 0.4321, "step": 122 }, { "epoch": 0.06754530477759473, "grad_norm": 0.5446747484512473, "learning_rate": 9.99609982308486e-07, "loss": 0.4181, "step": 123 }, { "epoch": 0.06809445359692477, "grad_norm": 0.5024648771569943, "learning_rate": 9.99598428400706e-07, "loss": 0.3746, "step": 124 }, { "epoch": 0.0686436024162548, "grad_norm": 0.4652208294214632, "learning_rate": 9.995867059137356e-07, "loss": 0.3866, "step": 125 }, { "epoch": 0.06919275123558484, "grad_norm": 0.3408332502864401, "learning_rate": 9.9957481485153e-07, "loss": 0.359, "step": 126 }, { "epoch": 0.06974190005491489, "grad_norm": 0.4070667377883137, "learning_rate": 9.995627552181032e-07, "loss": 0.3273, "step": 127 }, { "epoch": 0.07029104887424492, "grad_norm": 0.36927260336226775, "learning_rate": 9.995505270175238e-07, "loss": 0.3594, "step": 128 }, { "epoch": 0.07084019769357495, "grad_norm": 0.4375876523488486, "learning_rate": 9.995381302539186e-07, "loss": 0.3958, "step": 129 }, { "epoch": 0.071389346512905, "grad_norm": 0.3792126627963935, "learning_rate": 9.995255649314714e-07, "loss": 0.3407, "step": 130 }, { "epoch": 0.07193849533223504, "grad_norm": 0.35222602094119543, "learning_rate": 9.995128310544225e-07, "loss": 0.3514, "step": 131 }, { "epoch": 0.07248764415156507, "grad_norm": 0.45819894844073455, "learning_rate": 9.994999286270687e-07, "loss": 0.3758, "step": 132 }, { "epoch": 0.07303679297089512, "grad_norm": 0.4262973424398369, "learning_rate": 9.994868576537646e-07, "loss": 0.3538, "step": 133 }, { "epoch": 0.07358594179022515, "grad_norm": 0.5336016620311667, "learning_rate": 9.994736181389207e-07, "loss": 0.4559, "step": 134 }, { "epoch": 0.07413509060955518, "grad_norm": 0.4136505217969873, "learning_rate": 9.99460210087005e-07, "loss": 0.4176, "step": 135 }, { "epoch": 0.07468423942888523, "grad_norm": 0.28382829003883114, "learning_rate": 9.994466335025425e-07, "loss": 0.3668, "step": 136 }, { "epoch": 0.07523338824821527, "grad_norm": 0.4975464706635873, "learning_rate": 9.994328883901146e-07, "loss": 0.414, "step": 137 }, { "epoch": 0.0757825370675453, "grad_norm": 0.45122723000653814, "learning_rate": 9.994189747543596e-07, "loss": 0.3959, "step": 138 }, { "epoch": 0.07633168588687535, "grad_norm": 0.5299261818855341, "learning_rate": 9.994048925999732e-07, "loss": 0.4173, "step": 139 }, { "epoch": 0.07688083470620538, "grad_norm": 0.46002340806867353, "learning_rate": 9.993906419317072e-07, "loss": 0.3942, "step": 140 }, { "epoch": 0.07742998352553541, "grad_norm": 0.3605976614381673, "learning_rate": 9.99376222754371e-07, "loss": 0.3802, "step": 141 }, { "epoch": 0.07797913234486546, "grad_norm": 0.5362236526756698, "learning_rate": 9.993616350728304e-07, "loss": 0.3779, "step": 142 }, { "epoch": 0.0785282811641955, "grad_norm": 0.34943871592531056, "learning_rate": 9.993468788920084e-07, "loss": 0.367, "step": 143 }, { "epoch": 0.07907742998352553, "grad_norm": 0.39683303312446844, "learning_rate": 9.993319542168843e-07, "loss": 0.421, "step": 144 }, { "epoch": 0.07962657880285558, "grad_norm": 0.3875959328068498, "learning_rate": 9.993168610524948e-07, "loss": 0.3809, "step": 145 }, { "epoch": 0.08017572762218561, "grad_norm": 0.4730306526908986, "learning_rate": 9.993015994039334e-07, "loss": 0.3524, "step": 146 }, { "epoch": 0.08072487644151564, "grad_norm": 0.4760955148163818, "learning_rate": 9.992861692763502e-07, "loss": 0.378, "step": 147 }, { "epoch": 0.08127402526084569, "grad_norm": 0.47121478486586815, "learning_rate": 9.992705706749524e-07, "loss": 0.3679, "step": 148 }, { "epoch": 0.08182317408017573, "grad_norm": 0.5283249542882976, "learning_rate": 9.992548036050038e-07, "loss": 0.3864, "step": 149 }, { "epoch": 0.08237232289950576, "grad_norm": 0.5002857967852061, "learning_rate": 9.992388680718255e-07, "loss": 0.387, "step": 150 }, { "epoch": 0.0829214717188358, "grad_norm": 0.679985230809286, "learning_rate": 9.99222764080795e-07, "loss": 0.3957, "step": 151 }, { "epoch": 0.08347062053816584, "grad_norm": 0.41311368019230177, "learning_rate": 9.992064916373466e-07, "loss": 0.3839, "step": 152 }, { "epoch": 0.08401976935749589, "grad_norm": 0.4091730337302347, "learning_rate": 9.99190050746972e-07, "loss": 0.412, "step": 153 }, { "epoch": 0.08456891817682592, "grad_norm": 0.4014223008819766, "learning_rate": 9.99173441415219e-07, "loss": 0.3561, "step": 154 }, { "epoch": 0.08511806699615596, "grad_norm": 0.40109303686643105, "learning_rate": 9.99156663647693e-07, "loss": 0.3372, "step": 155 }, { "epoch": 0.085667215815486, "grad_norm": 0.39806264515079226, "learning_rate": 9.991397174500556e-07, "loss": 0.3906, "step": 156 }, { "epoch": 0.08621636463481604, "grad_norm": 0.5062314911028993, "learning_rate": 9.991226028280257e-07, "loss": 0.3939, "step": 157 }, { "epoch": 0.08676551345414607, "grad_norm": 0.38662768415414206, "learning_rate": 9.99105319787379e-07, "loss": 0.3987, "step": 158 }, { "epoch": 0.08731466227347612, "grad_norm": 0.4590133256755766, "learning_rate": 9.990878683339475e-07, "loss": 0.3299, "step": 159 }, { "epoch": 0.08786381109280615, "grad_norm": 0.41380348003567224, "learning_rate": 9.990702484736205e-07, "loss": 0.4035, "step": 160 }, { "epoch": 0.08841295991213619, "grad_norm": 0.5438042147725886, "learning_rate": 9.990524602123444e-07, "loss": 0.348, "step": 161 }, { "epoch": 0.08896210873146623, "grad_norm": 0.5713386655453655, "learning_rate": 9.99034503556122e-07, "loss": 0.408, "step": 162 }, { "epoch": 0.08951125755079627, "grad_norm": 0.3783590546794582, "learning_rate": 9.990163785110126e-07, "loss": 0.3864, "step": 163 }, { "epoch": 0.0900604063701263, "grad_norm": 0.33683455763843745, "learning_rate": 9.989980850831331e-07, "loss": 0.347, "step": 164 }, { "epoch": 0.09060955518945635, "grad_norm": 0.4240240644529421, "learning_rate": 9.989796232786567e-07, "loss": 0.3352, "step": 165 }, { "epoch": 0.09115870400878638, "grad_norm": 0.5058278687597212, "learning_rate": 9.989609931038138e-07, "loss": 0.3935, "step": 166 }, { "epoch": 0.09170785282811642, "grad_norm": 0.3983611191440675, "learning_rate": 9.98942194564891e-07, "loss": 0.4081, "step": 167 }, { "epoch": 0.09225700164744646, "grad_norm": 0.4427158210708611, "learning_rate": 9.989232276682323e-07, "loss": 0.348, "step": 168 }, { "epoch": 0.0928061504667765, "grad_norm": 0.4520051829129437, "learning_rate": 9.989040924202386e-07, "loss": 0.4108, "step": 169 }, { "epoch": 0.09335529928610653, "grad_norm": 0.364594432913114, "learning_rate": 9.988847888273671e-07, "loss": 0.3583, "step": 170 }, { "epoch": 0.09390444810543658, "grad_norm": 0.3678552199329568, "learning_rate": 9.988653168961321e-07, "loss": 0.3307, "step": 171 }, { "epoch": 0.09445359692476661, "grad_norm": 0.5709532339625728, "learning_rate": 9.988456766331046e-07, "loss": 0.4148, "step": 172 }, { "epoch": 0.09500274574409664, "grad_norm": 0.3980035452670733, "learning_rate": 9.988258680449123e-07, "loss": 0.3975, "step": 173 }, { "epoch": 0.09555189456342669, "grad_norm": 0.42917281879579317, "learning_rate": 9.988058911382402e-07, "loss": 0.3384, "step": 174 }, { "epoch": 0.09610104338275673, "grad_norm": 0.3512767098796575, "learning_rate": 9.987857459198296e-07, "loss": 0.3722, "step": 175 }, { "epoch": 0.09665019220208676, "grad_norm": 0.6212737994624551, "learning_rate": 9.98765432396479e-07, "loss": 0.3965, "step": 176 }, { "epoch": 0.09719934102141681, "grad_norm": 0.5174479120329459, "learning_rate": 9.987449505750431e-07, "loss": 0.4035, "step": 177 }, { "epoch": 0.09774848984074684, "grad_norm": 0.4093324105938102, "learning_rate": 9.987243004624337e-07, "loss": 0.3578, "step": 178 }, { "epoch": 0.09829763866007687, "grad_norm": 0.34138629472472504, "learning_rate": 9.9870348206562e-07, "loss": 0.364, "step": 179 }, { "epoch": 0.09884678747940692, "grad_norm": 0.4285231430228904, "learning_rate": 9.986824953916272e-07, "loss": 0.3527, "step": 180 }, { "epoch": 0.09939593629873696, "grad_norm": 0.3971700866108027, "learning_rate": 9.986613404475373e-07, "loss": 0.3962, "step": 181 }, { "epoch": 0.09994508511806699, "grad_norm": 0.4082782904476488, "learning_rate": 9.986400172404894e-07, "loss": 0.3312, "step": 182 }, { "epoch": 0.10049423393739704, "grad_norm": 0.5190253973628005, "learning_rate": 9.986185257776794e-07, "loss": 0.3486, "step": 183 }, { "epoch": 0.10104338275672707, "grad_norm": 0.4795273521744964, "learning_rate": 9.985968660663599e-07, "loss": 0.3392, "step": 184 }, { "epoch": 0.1015925315760571, "grad_norm": 0.3460609246158477, "learning_rate": 9.985750381138403e-07, "loss": 0.3705, "step": 185 }, { "epoch": 0.10214168039538715, "grad_norm": 0.4452233441846428, "learning_rate": 9.985530419274865e-07, "loss": 0.2976, "step": 186 }, { "epoch": 0.10269082921471719, "grad_norm": 0.4553211629734614, "learning_rate": 9.985308775147213e-07, "loss": 0.3649, "step": 187 }, { "epoch": 0.10323997803404723, "grad_norm": 0.4796653268030492, "learning_rate": 9.98508544883025e-07, "loss": 0.3674, "step": 188 }, { "epoch": 0.10378912685337727, "grad_norm": 0.41997355091130645, "learning_rate": 9.984860440399334e-07, "loss": 0.3828, "step": 189 }, { "epoch": 0.1043382756727073, "grad_norm": 0.44852018455981013, "learning_rate": 9.9846337499304e-07, "loss": 0.3305, "step": 190 }, { "epoch": 0.10488742449203735, "grad_norm": 0.43346022282374863, "learning_rate": 9.984405377499948e-07, "loss": 0.3694, "step": 191 }, { "epoch": 0.10543657331136738, "grad_norm": 0.4345686434268374, "learning_rate": 9.984175323185042e-07, "loss": 0.3774, "step": 192 }, { "epoch": 0.10598572213069742, "grad_norm": 0.42529178344892793, "learning_rate": 9.983943587063322e-07, "loss": 0.3753, "step": 193 }, { "epoch": 0.10653487095002746, "grad_norm": 0.3495647195575569, "learning_rate": 9.983710169212986e-07, "loss": 0.3856, "step": 194 }, { "epoch": 0.1070840197693575, "grad_norm": 0.3526832472899213, "learning_rate": 9.983475069712806e-07, "loss": 0.3437, "step": 195 }, { "epoch": 0.10763316858868753, "grad_norm": 0.46510110816199074, "learning_rate": 9.98323828864212e-07, "loss": 0.3485, "step": 196 }, { "epoch": 0.10818231740801758, "grad_norm": 0.4348701784089849, "learning_rate": 9.98299982608083e-07, "loss": 0.3674, "step": 197 }, { "epoch": 0.10873146622734761, "grad_norm": 0.5501938433457089, "learning_rate": 9.982759682109414e-07, "loss": 0.4056, "step": 198 }, { "epoch": 0.10928061504667765, "grad_norm": 0.6244660890078381, "learning_rate": 9.982517856808903e-07, "loss": 0.4472, "step": 199 }, { "epoch": 0.10982976386600769, "grad_norm": 0.41988021802529674, "learning_rate": 9.982274350260913e-07, "loss": 0.3926, "step": 200 }, { "epoch": 0.10982976386600769, "eval_loss": 0.4592891335487366, "eval_runtime": 19.1098, "eval_samples_per_second": 23.182, "eval_steps_per_second": 0.994, "step": 200 }, { "epoch": 0.11037891268533773, "grad_norm": 0.4415435064751627, "learning_rate": 9.98202916254761e-07, "loss": 0.3667, "step": 201 }, { "epoch": 0.11092806150466776, "grad_norm": 0.35924372460927034, "learning_rate": 9.981782293751745e-07, "loss": 0.3733, "step": 202 }, { "epoch": 0.11147721032399781, "grad_norm": 0.38821674798414335, "learning_rate": 9.981533743956623e-07, "loss": 0.3668, "step": 203 }, { "epoch": 0.11202635914332784, "grad_norm": 0.40750498767827315, "learning_rate": 9.981283513246117e-07, "loss": 0.3779, "step": 204 }, { "epoch": 0.11257550796265788, "grad_norm": 0.44839782402832984, "learning_rate": 9.981031601704675e-07, "loss": 0.3933, "step": 205 }, { "epoch": 0.11312465678198792, "grad_norm": 0.5550047522714568, "learning_rate": 9.980778009417306e-07, "loss": 0.3648, "step": 206 }, { "epoch": 0.11367380560131796, "grad_norm": 0.3992747457977071, "learning_rate": 9.98052273646959e-07, "loss": 0.3474, "step": 207 }, { "epoch": 0.11422295442064799, "grad_norm": 0.395776988189523, "learning_rate": 9.98026578294767e-07, "loss": 0.3617, "step": 208 }, { "epoch": 0.11477210323997804, "grad_norm": 0.47873976999967466, "learning_rate": 9.980007148938257e-07, "loss": 0.3915, "step": 209 }, { "epoch": 0.11532125205930807, "grad_norm": 0.48073825558813443, "learning_rate": 9.979746834528634e-07, "loss": 0.3457, "step": 210 }, { "epoch": 0.1158704008786381, "grad_norm": 0.40926753855165304, "learning_rate": 9.979484839806645e-07, "loss": 0.3231, "step": 211 }, { "epoch": 0.11641954969796815, "grad_norm": 0.4951877426042713, "learning_rate": 9.979221164860707e-07, "loss": 0.3165, "step": 212 }, { "epoch": 0.11696869851729819, "grad_norm": 0.5632310747337004, "learning_rate": 9.978955809779797e-07, "loss": 0.3691, "step": 213 }, { "epoch": 0.11751784733662822, "grad_norm": 0.3411906362752352, "learning_rate": 9.978688774653462e-07, "loss": 0.3236, "step": 214 }, { "epoch": 0.11806699615595827, "grad_norm": 0.4589008696737782, "learning_rate": 9.978420059571822e-07, "loss": 0.3745, "step": 215 }, { "epoch": 0.1186161449752883, "grad_norm": 0.40534732619235747, "learning_rate": 9.978149664625552e-07, "loss": 0.3649, "step": 216 }, { "epoch": 0.11916529379461833, "grad_norm": 0.45724827266369716, "learning_rate": 9.977877589905903e-07, "loss": 0.3338, "step": 217 }, { "epoch": 0.11971444261394838, "grad_norm": 0.395872308929039, "learning_rate": 9.97760383550469e-07, "loss": 0.3511, "step": 218 }, { "epoch": 0.12026359143327842, "grad_norm": 0.37096602353182145, "learning_rate": 9.977328401514297e-07, "loss": 0.3407, "step": 219 }, { "epoch": 0.12081274025260846, "grad_norm": 0.39029994608220525, "learning_rate": 9.977051288027672e-07, "loss": 0.3656, "step": 220 }, { "epoch": 0.1213618890719385, "grad_norm": 0.4321717157067283, "learning_rate": 9.976772495138326e-07, "loss": 0.3745, "step": 221 }, { "epoch": 0.12191103789126853, "grad_norm": 0.4669317733485491, "learning_rate": 9.976492022940347e-07, "loss": 0.3879, "step": 222 }, { "epoch": 0.12246018671059858, "grad_norm": 0.41070948336327107, "learning_rate": 9.976209871528382e-07, "loss": 0.3824, "step": 223 }, { "epoch": 0.12300933552992861, "grad_norm": 0.31792415033613153, "learning_rate": 9.975926040997649e-07, "loss": 0.351, "step": 224 }, { "epoch": 0.12355848434925865, "grad_norm": 0.48492647343265355, "learning_rate": 9.975640531443926e-07, "loss": 0.3901, "step": 225 }, { "epoch": 0.1241076331685887, "grad_norm": 0.749265387589108, "learning_rate": 9.975353342963561e-07, "loss": 0.4107, "step": 226 }, { "epoch": 0.12465678198791873, "grad_norm": 0.48471643998709324, "learning_rate": 9.975064475653477e-07, "loss": 0.4175, "step": 227 }, { "epoch": 0.12520593080724876, "grad_norm": 0.4710683247831898, "learning_rate": 9.97477392961115e-07, "loss": 0.3562, "step": 228 }, { "epoch": 0.1257550796265788, "grad_norm": 0.3644446376775287, "learning_rate": 9.97448170493463e-07, "loss": 0.3559, "step": 229 }, { "epoch": 0.12630422844590883, "grad_norm": 0.3348740546399851, "learning_rate": 9.974187801722534e-07, "loss": 0.3428, "step": 230 }, { "epoch": 0.12685337726523888, "grad_norm": 0.3760807451612699, "learning_rate": 9.97389222007404e-07, "loss": 0.3615, "step": 231 }, { "epoch": 0.12740252608456892, "grad_norm": 0.4353511228168891, "learning_rate": 9.973594960088898e-07, "loss": 0.3922, "step": 232 }, { "epoch": 0.12795167490389894, "grad_norm": 0.405228004951534, "learning_rate": 9.973296021867424e-07, "loss": 0.289, "step": 233 }, { "epoch": 0.128500823723229, "grad_norm": 0.4599862585068726, "learning_rate": 9.972995405510496e-07, "loss": 0.4217, "step": 234 }, { "epoch": 0.12904997254255904, "grad_norm": 0.40813300867484475, "learning_rate": 9.97269311111956e-07, "loss": 0.3372, "step": 235 }, { "epoch": 0.12959912136188906, "grad_norm": 0.428181122274626, "learning_rate": 9.972389138796632e-07, "loss": 0.363, "step": 236 }, { "epoch": 0.1301482701812191, "grad_norm": 0.39017516006132724, "learning_rate": 9.97208348864429e-07, "loss": 0.3544, "step": 237 }, { "epoch": 0.13069741900054915, "grad_norm": 0.46677760996170964, "learning_rate": 9.971776160765678e-07, "loss": 0.341, "step": 238 }, { "epoch": 0.1312465678198792, "grad_norm": 0.3797991014913136, "learning_rate": 9.971467155264512e-07, "loss": 0.3389, "step": 239 }, { "epoch": 0.13179571663920922, "grad_norm": 0.4573302999433252, "learning_rate": 9.971156472245068e-07, "loss": 0.3472, "step": 240 }, { "epoch": 0.13234486545853927, "grad_norm": 0.39387593731789705, "learning_rate": 9.97084411181219e-07, "loss": 0.321, "step": 241 }, { "epoch": 0.13289401427786932, "grad_norm": 0.4709552965683147, "learning_rate": 9.970530074071288e-07, "loss": 0.3608, "step": 242 }, { "epoch": 0.13344316309719934, "grad_norm": 0.42579713480844766, "learning_rate": 9.970214359128335e-07, "loss": 0.3316, "step": 243 }, { "epoch": 0.13399231191652938, "grad_norm": 0.47601317716932834, "learning_rate": 9.96989696708988e-07, "loss": 0.3172, "step": 244 }, { "epoch": 0.13454146073585943, "grad_norm": 0.44384536282866577, "learning_rate": 9.969577898063025e-07, "loss": 0.3586, "step": 245 }, { "epoch": 0.13509060955518945, "grad_norm": 0.39505986923158526, "learning_rate": 9.969257152155448e-07, "loss": 0.3656, "step": 246 }, { "epoch": 0.1356397583745195, "grad_norm": 0.42418758008300533, "learning_rate": 9.968934729475387e-07, "loss": 0.3922, "step": 247 }, { "epoch": 0.13618890719384955, "grad_norm": 0.5093440412591362, "learning_rate": 9.968610630131648e-07, "loss": 0.3479, "step": 248 }, { "epoch": 0.13673805601317957, "grad_norm": 0.7139282967080007, "learning_rate": 9.968284854233602e-07, "loss": 0.3909, "step": 249 }, { "epoch": 0.1372872048325096, "grad_norm": 0.6529816407471987, "learning_rate": 9.967957401891189e-07, "loss": 0.4136, "step": 250 }, { "epoch": 0.13783635365183966, "grad_norm": 0.6216389978109844, "learning_rate": 9.967628273214908e-07, "loss": 0.3782, "step": 251 }, { "epoch": 0.13838550247116968, "grad_norm": 0.44445537712753524, "learning_rate": 9.967297468315833e-07, "loss": 0.3257, "step": 252 }, { "epoch": 0.13893465129049973, "grad_norm": 0.4020209287971924, "learning_rate": 9.966964987305595e-07, "loss": 0.3169, "step": 253 }, { "epoch": 0.13948380010982978, "grad_norm": 0.43357581260818745, "learning_rate": 9.96663083029639e-07, "loss": 0.3733, "step": 254 }, { "epoch": 0.1400329489291598, "grad_norm": 0.4795315250000879, "learning_rate": 9.966294997400994e-07, "loss": 0.4023, "step": 255 }, { "epoch": 0.14058209774848984, "grad_norm": 0.3859745136767488, "learning_rate": 9.965957488732731e-07, "loss": 0.352, "step": 256 }, { "epoch": 0.1411312465678199, "grad_norm": 0.4612583423759146, "learning_rate": 9.965618304405498e-07, "loss": 0.3563, "step": 257 }, { "epoch": 0.1416803953871499, "grad_norm": 0.4598385721150566, "learning_rate": 9.965277444533758e-07, "loss": 0.3574, "step": 258 }, { "epoch": 0.14222954420647996, "grad_norm": 0.570962890716166, "learning_rate": 9.96493490923254e-07, "loss": 0.3889, "step": 259 }, { "epoch": 0.14277869302581, "grad_norm": 0.41513443701763686, "learning_rate": 9.964590698617438e-07, "loss": 0.3026, "step": 260 }, { "epoch": 0.14332784184514002, "grad_norm": 0.49569746196966874, "learning_rate": 9.964244812804605e-07, "loss": 0.401, "step": 261 }, { "epoch": 0.14387699066447007, "grad_norm": 0.3925347823971518, "learning_rate": 9.96389725191077e-07, "loss": 0.3901, "step": 262 }, { "epoch": 0.14442613948380012, "grad_norm": 0.566322238816806, "learning_rate": 9.96354801605322e-07, "loss": 0.3576, "step": 263 }, { "epoch": 0.14497528830313014, "grad_norm": 0.39695676135462576, "learning_rate": 9.963197105349812e-07, "loss": 0.3789, "step": 264 }, { "epoch": 0.1455244371224602, "grad_norm": 0.5925334242414747, "learning_rate": 9.962844519918958e-07, "loss": 0.3467, "step": 265 }, { "epoch": 0.14607358594179023, "grad_norm": 0.60271074859749, "learning_rate": 9.962490259879652e-07, "loss": 0.3714, "step": 266 }, { "epoch": 0.14662273476112025, "grad_norm": 0.4822538904423981, "learning_rate": 9.962134325351439e-07, "loss": 0.3448, "step": 267 }, { "epoch": 0.1471718835804503, "grad_norm": 0.4667548317418967, "learning_rate": 9.96177671645443e-07, "loss": 0.3309, "step": 268 }, { "epoch": 0.14772103239978035, "grad_norm": 0.6527901023288586, "learning_rate": 9.961417433309311e-07, "loss": 0.3745, "step": 269 }, { "epoch": 0.14827018121911037, "grad_norm": 0.5089121492656247, "learning_rate": 9.961056476037324e-07, "loss": 0.334, "step": 270 }, { "epoch": 0.14881933003844042, "grad_norm": 0.44832682542367025, "learning_rate": 9.960693844760283e-07, "loss": 0.3245, "step": 271 }, { "epoch": 0.14936847885777046, "grad_norm": 0.44360234778223506, "learning_rate": 9.960329539600556e-07, "loss": 0.3703, "step": 272 }, { "epoch": 0.14991762767710048, "grad_norm": 0.6056027201575581, "learning_rate": 9.959963560681086e-07, "loss": 0.3529, "step": 273 }, { "epoch": 0.15046677649643053, "grad_norm": 0.5990019290407864, "learning_rate": 9.959595908125378e-07, "loss": 0.3643, "step": 274 }, { "epoch": 0.15101592531576058, "grad_norm": 0.3635109496239804, "learning_rate": 9.9592265820575e-07, "loss": 0.3782, "step": 275 }, { "epoch": 0.1515650741350906, "grad_norm": 0.46936898990133824, "learning_rate": 9.958855582602085e-07, "loss": 0.3327, "step": 276 }, { "epoch": 0.15211422295442065, "grad_norm": 0.3776537571444719, "learning_rate": 9.958482909884336e-07, "loss": 0.3685, "step": 277 }, { "epoch": 0.1526633717737507, "grad_norm": 0.41210296949155756, "learning_rate": 9.958108564030012e-07, "loss": 0.3322, "step": 278 }, { "epoch": 0.15321252059308071, "grad_norm": 0.48290939123842697, "learning_rate": 9.95773254516544e-07, "loss": 0.3345, "step": 279 }, { "epoch": 0.15376166941241076, "grad_norm": 0.382141670936933, "learning_rate": 9.957354853417515e-07, "loss": 0.3589, "step": 280 }, { "epoch": 0.1543108182317408, "grad_norm": 0.4175712720125532, "learning_rate": 9.956975488913697e-07, "loss": 0.3732, "step": 281 }, { "epoch": 0.15485996705107083, "grad_norm": 0.3988533984078127, "learning_rate": 9.956594451782e-07, "loss": 0.3254, "step": 282 }, { "epoch": 0.15540911587040088, "grad_norm": 0.5399841474010806, "learning_rate": 9.956211742151017e-07, "loss": 0.3533, "step": 283 }, { "epoch": 0.15595826468973092, "grad_norm": 0.3775942522914299, "learning_rate": 9.955827360149894e-07, "loss": 0.3164, "step": 284 }, { "epoch": 0.15650741350906094, "grad_norm": 0.5950048479549518, "learning_rate": 9.95544130590835e-07, "loss": 0.3749, "step": 285 }, { "epoch": 0.157056562328391, "grad_norm": 0.48837745189753445, "learning_rate": 9.955053579556659e-07, "loss": 0.3729, "step": 286 }, { "epoch": 0.15760571114772104, "grad_norm": 0.38176832343027967, "learning_rate": 9.95466418122567e-07, "loss": 0.3547, "step": 287 }, { "epoch": 0.15815485996705106, "grad_norm": 0.3954692202082432, "learning_rate": 9.954273111046783e-07, "loss": 0.3351, "step": 288 }, { "epoch": 0.1587040087863811, "grad_norm": 0.4419211445608487, "learning_rate": 9.953880369151978e-07, "loss": 0.3474, "step": 289 }, { "epoch": 0.15925315760571115, "grad_norm": 0.4666008595472022, "learning_rate": 9.953485955673785e-07, "loss": 0.3361, "step": 290 }, { "epoch": 0.15980230642504117, "grad_norm": 0.4559285279712608, "learning_rate": 9.953089870745308e-07, "loss": 0.3246, "step": 291 }, { "epoch": 0.16035145524437122, "grad_norm": 0.4129733759139695, "learning_rate": 9.952692114500208e-07, "loss": 0.3539, "step": 292 }, { "epoch": 0.16090060406370127, "grad_norm": 0.45498605264209285, "learning_rate": 9.952292687072713e-07, "loss": 0.3307, "step": 293 }, { "epoch": 0.1614497528830313, "grad_norm": 0.7105153458114862, "learning_rate": 9.95189158859762e-07, "loss": 0.3784, "step": 294 }, { "epoch": 0.16199890170236134, "grad_norm": 0.3617617956159836, "learning_rate": 9.951488819210278e-07, "loss": 0.3255, "step": 295 }, { "epoch": 0.16254805052169138, "grad_norm": 0.4289627087987424, "learning_rate": 9.95108437904661e-07, "loss": 0.3363, "step": 296 }, { "epoch": 0.1630971993410214, "grad_norm": 0.4483127161193595, "learning_rate": 9.950678268243102e-07, "loss": 0.337, "step": 297 }, { "epoch": 0.16364634816035145, "grad_norm": 0.47752519580381575, "learning_rate": 9.950270486936798e-07, "loss": 0.3238, "step": 298 }, { "epoch": 0.1641954969796815, "grad_norm": 0.4359700802585661, "learning_rate": 9.949861035265312e-07, "loss": 0.3274, "step": 299 }, { "epoch": 0.16474464579901152, "grad_norm": 0.45171067082593136, "learning_rate": 9.949449913366817e-07, "loss": 0.3437, "step": 300 }, { "epoch": 0.16529379461834157, "grad_norm": 0.41749672022873435, "learning_rate": 9.94903712138005e-07, "loss": 0.4048, "step": 301 }, { "epoch": 0.1658429434376716, "grad_norm": 0.38017600202400764, "learning_rate": 9.948622659444316e-07, "loss": 0.3336, "step": 302 }, { "epoch": 0.16639209225700163, "grad_norm": 0.43372715551315405, "learning_rate": 9.94820652769948e-07, "loss": 0.3673, "step": 303 }, { "epoch": 0.16694124107633168, "grad_norm": 0.42527734939799494, "learning_rate": 9.94778872628597e-07, "loss": 0.34, "step": 304 }, { "epoch": 0.16749038989566173, "grad_norm": 0.5650518043204806, "learning_rate": 9.947369255344778e-07, "loss": 0.3276, "step": 305 }, { "epoch": 0.16803953871499178, "grad_norm": 0.4868831880087055, "learning_rate": 9.946948115017462e-07, "loss": 0.3368, "step": 306 }, { "epoch": 0.1685886875343218, "grad_norm": 0.4492744346099952, "learning_rate": 9.946525305446142e-07, "loss": 0.3305, "step": 307 }, { "epoch": 0.16913783635365184, "grad_norm": 0.4516893606448487, "learning_rate": 9.946100826773497e-07, "loss": 0.3545, "step": 308 }, { "epoch": 0.1696869851729819, "grad_norm": 0.4485378202702623, "learning_rate": 9.945674679142776e-07, "loss": 0.3481, "step": 309 }, { "epoch": 0.1702361339923119, "grad_norm": 0.4932332975680576, "learning_rate": 9.945246862697789e-07, "loss": 0.3608, "step": 310 }, { "epoch": 0.17078528281164196, "grad_norm": 0.3717207417408704, "learning_rate": 9.944817377582905e-07, "loss": 0.3088, "step": 311 }, { "epoch": 0.171334431630972, "grad_norm": 0.37840003050223336, "learning_rate": 9.94438622394306e-07, "loss": 0.3609, "step": 312 }, { "epoch": 0.17188358045030203, "grad_norm": 0.4817829974733523, "learning_rate": 9.943953401923756e-07, "loss": 0.3328, "step": 313 }, { "epoch": 0.17243272926963207, "grad_norm": 0.3648487195651704, "learning_rate": 9.943518911671048e-07, "loss": 0.3285, "step": 314 }, { "epoch": 0.17298187808896212, "grad_norm": 0.4981300603086174, "learning_rate": 9.943082753331567e-07, "loss": 0.3949, "step": 315 }, { "epoch": 0.17353102690829214, "grad_norm": 0.48539853349156564, "learning_rate": 9.942644927052497e-07, "loss": 0.3359, "step": 316 }, { "epoch": 0.1740801757276222, "grad_norm": 0.47397270845794354, "learning_rate": 9.942205432981588e-07, "loss": 0.3564, "step": 317 }, { "epoch": 0.17462932454695224, "grad_norm": 0.3387854426456379, "learning_rate": 9.941764271267156e-07, "loss": 0.2923, "step": 318 }, { "epoch": 0.17517847336628226, "grad_norm": 0.5233782672135069, "learning_rate": 9.941321442058075e-07, "loss": 0.3302, "step": 319 }, { "epoch": 0.1757276221856123, "grad_norm": 0.5398165786661566, "learning_rate": 9.94087694550378e-07, "loss": 0.355, "step": 320 }, { "epoch": 0.17627677100494235, "grad_norm": 0.4110340753561315, "learning_rate": 9.94043078175428e-07, "loss": 0.3339, "step": 321 }, { "epoch": 0.17682591982427237, "grad_norm": 0.361861039508175, "learning_rate": 9.93998295096013e-07, "loss": 0.3528, "step": 322 }, { "epoch": 0.17737506864360242, "grad_norm": 0.4453360793221531, "learning_rate": 9.939533453272465e-07, "loss": 0.315, "step": 323 }, { "epoch": 0.17792421746293247, "grad_norm": 0.5017845392756807, "learning_rate": 9.939082288842968e-07, "loss": 0.4, "step": 324 }, { "epoch": 0.17847336628226249, "grad_norm": 0.39937084323677985, "learning_rate": 9.938629457823894e-07, "loss": 0.3554, "step": 325 }, { "epoch": 0.17902251510159253, "grad_norm": 0.508911692715381, "learning_rate": 9.93817496036805e-07, "loss": 0.3355, "step": 326 }, { "epoch": 0.17957166392092258, "grad_norm": 0.38494293982296035, "learning_rate": 9.93771879662882e-07, "loss": 0.3527, "step": 327 }, { "epoch": 0.1801208127402526, "grad_norm": 0.4027297806365311, "learning_rate": 9.93726096676014e-07, "loss": 0.3791, "step": 328 }, { "epoch": 0.18066996155958265, "grad_norm": 0.4758350347574514, "learning_rate": 9.936801470916509e-07, "loss": 0.3331, "step": 329 }, { "epoch": 0.1812191103789127, "grad_norm": 0.5093063799021175, "learning_rate": 9.93634030925299e-07, "loss": 0.3302, "step": 330 }, { "epoch": 0.18176825919824272, "grad_norm": 0.4062567803320839, "learning_rate": 9.935877481925212e-07, "loss": 0.3244, "step": 331 }, { "epoch": 0.18231740801757276, "grad_norm": 0.38813652430611273, "learning_rate": 9.935412989089358e-07, "loss": 0.3763, "step": 332 }, { "epoch": 0.1828665568369028, "grad_norm": 0.5484997009750862, "learning_rate": 9.93494683090218e-07, "loss": 0.3666, "step": 333 }, { "epoch": 0.18341570565623283, "grad_norm": 0.5308548289699279, "learning_rate": 9.934479007520986e-07, "loss": 0.4148, "step": 334 }, { "epoch": 0.18396485447556288, "grad_norm": 0.4362455388628846, "learning_rate": 9.93400951910365e-07, "loss": 0.3601, "step": 335 }, { "epoch": 0.18451400329489293, "grad_norm": 0.5214112933772652, "learning_rate": 9.933538365808612e-07, "loss": 0.3516, "step": 336 }, { "epoch": 0.18506315211422295, "grad_norm": 0.5459935043716072, "learning_rate": 9.93306554779486e-07, "loss": 0.3204, "step": 337 }, { "epoch": 0.185612300933553, "grad_norm": 0.4354546568259888, "learning_rate": 9.932591065221962e-07, "loss": 0.339, "step": 338 }, { "epoch": 0.18616144975288304, "grad_norm": 0.46089721608356077, "learning_rate": 9.93211491825003e-07, "loss": 0.3696, "step": 339 }, { "epoch": 0.18671059857221306, "grad_norm": 0.44629559276548253, "learning_rate": 9.931637107039754e-07, "loss": 0.3082, "step": 340 }, { "epoch": 0.1872597473915431, "grad_norm": 0.4618432560242684, "learning_rate": 9.931157631752371e-07, "loss": 0.3619, "step": 341 }, { "epoch": 0.18780889621087316, "grad_norm": 0.467116332557529, "learning_rate": 9.93067649254969e-07, "loss": 0.3588, "step": 342 }, { "epoch": 0.18835804503020318, "grad_norm": 0.4056585203016652, "learning_rate": 9.930193689594073e-07, "loss": 0.3059, "step": 343 }, { "epoch": 0.18890719384953322, "grad_norm": 0.364051333857049, "learning_rate": 9.929709223048455e-07, "loss": 0.3344, "step": 344 }, { "epoch": 0.18945634266886327, "grad_norm": 0.36147265732149836, "learning_rate": 9.929223093076322e-07, "loss": 0.3282, "step": 345 }, { "epoch": 0.1900054914881933, "grad_norm": 0.40663001496083007, "learning_rate": 9.928735299841727e-07, "loss": 0.3017, "step": 346 }, { "epoch": 0.19055464030752334, "grad_norm": 0.46893798236659495, "learning_rate": 9.92824584350928e-07, "loss": 0.3825, "step": 347 }, { "epoch": 0.19110378912685339, "grad_norm": 0.4554788869837212, "learning_rate": 9.927754724244154e-07, "loss": 0.3033, "step": 348 }, { "epoch": 0.1916529379461834, "grad_norm": 0.4986405237170547, "learning_rate": 9.927261942212086e-07, "loss": 0.3315, "step": 349 }, { "epoch": 0.19220208676551345, "grad_norm": 0.7660448652449976, "learning_rate": 9.926767497579368e-07, "loss": 0.3772, "step": 350 }, { "epoch": 0.1927512355848435, "grad_norm": 0.4034514573135684, "learning_rate": 9.926271390512863e-07, "loss": 0.3464, "step": 351 }, { "epoch": 0.19330038440417352, "grad_norm": 0.4837190340742054, "learning_rate": 9.925773621179983e-07, "loss": 0.3592, "step": 352 }, { "epoch": 0.19384953322350357, "grad_norm": 0.4174811670579678, "learning_rate": 9.925274189748711e-07, "loss": 0.3273, "step": 353 }, { "epoch": 0.19439868204283361, "grad_norm": 0.4045739352355624, "learning_rate": 9.924773096387583e-07, "loss": 0.3241, "step": 354 }, { "epoch": 0.19494783086216363, "grad_norm": 0.44473166066728204, "learning_rate": 9.924270341265703e-07, "loss": 0.3338, "step": 355 }, { "epoch": 0.19549697968149368, "grad_norm": 0.36575751472761997, "learning_rate": 9.92376592455273e-07, "loss": 0.3506, "step": 356 }, { "epoch": 0.19604612850082373, "grad_norm": 0.4798722835482685, "learning_rate": 9.923259846418886e-07, "loss": 0.3337, "step": 357 }, { "epoch": 0.19659527732015375, "grad_norm": 0.4544698745226302, "learning_rate": 9.922752107034955e-07, "loss": 0.3623, "step": 358 }, { "epoch": 0.1971444261394838, "grad_norm": 0.4011868522899694, "learning_rate": 9.922242706572279e-07, "loss": 0.3747, "step": 359 }, { "epoch": 0.19769357495881384, "grad_norm": 0.41895377789327076, "learning_rate": 9.921731645202763e-07, "loss": 0.3482, "step": 360 }, { "epoch": 0.19824272377814386, "grad_norm": 0.4960808386642329, "learning_rate": 9.921218923098872e-07, "loss": 0.3744, "step": 361 }, { "epoch": 0.1987918725974739, "grad_norm": 0.4346701572285728, "learning_rate": 9.920704540433632e-07, "loss": 0.332, "step": 362 }, { "epoch": 0.19934102141680396, "grad_norm": 0.4391749087620093, "learning_rate": 9.920188497380622e-07, "loss": 0.4105, "step": 363 }, { "epoch": 0.19989017023613398, "grad_norm": 0.46502915667274697, "learning_rate": 9.919670794113993e-07, "loss": 0.3201, "step": 364 }, { "epoch": 0.20043931905546403, "grad_norm": 0.3745778116593925, "learning_rate": 9.91915143080845e-07, "loss": 0.3353, "step": 365 }, { "epoch": 0.20098846787479407, "grad_norm": 0.480474338823275, "learning_rate": 9.918630407639258e-07, "loss": 0.3681, "step": 366 }, { "epoch": 0.2015376166941241, "grad_norm": 0.4572293832732304, "learning_rate": 9.918107724782245e-07, "loss": 0.3467, "step": 367 }, { "epoch": 0.20208676551345414, "grad_norm": 0.4301394422682637, "learning_rate": 9.917583382413792e-07, "loss": 0.3464, "step": 368 }, { "epoch": 0.2026359143327842, "grad_norm": 0.4455891905085749, "learning_rate": 9.917057380710854e-07, "loss": 0.3272, "step": 369 }, { "epoch": 0.2031850631521142, "grad_norm": 0.5152756392544137, "learning_rate": 9.916529719850927e-07, "loss": 0.337, "step": 370 }, { "epoch": 0.20373421197144426, "grad_norm": 0.33620823144229933, "learning_rate": 9.916000400012086e-07, "loss": 0.3225, "step": 371 }, { "epoch": 0.2042833607907743, "grad_norm": 0.48920858152010266, "learning_rate": 9.915469421372951e-07, "loss": 0.3467, "step": 372 }, { "epoch": 0.20483250961010435, "grad_norm": 0.42676704637335544, "learning_rate": 9.914936784112712e-07, "loss": 0.305, "step": 373 }, { "epoch": 0.20538165842943437, "grad_norm": 0.360908826691987, "learning_rate": 9.914402488411112e-07, "loss": 0.3359, "step": 374 }, { "epoch": 0.20593080724876442, "grad_norm": 0.45603145474492923, "learning_rate": 9.913866534448455e-07, "loss": 0.3011, "step": 375 }, { "epoch": 0.20647995606809447, "grad_norm": 0.3475347003368911, "learning_rate": 9.913328922405608e-07, "loss": 0.3579, "step": 376 }, { "epoch": 0.2070291048874245, "grad_norm": 0.38080924002398087, "learning_rate": 9.912789652463995e-07, "loss": 0.3089, "step": 377 }, { "epoch": 0.20757825370675453, "grad_norm": 0.5130024962034675, "learning_rate": 9.912248724805599e-07, "loss": 0.3427, "step": 378 }, { "epoch": 0.20812740252608458, "grad_norm": 0.4195870555367316, "learning_rate": 9.911706139612962e-07, "loss": 0.3309, "step": 379 }, { "epoch": 0.2086765513454146, "grad_norm": 0.3962602511086976, "learning_rate": 9.91116189706919e-07, "loss": 0.3273, "step": 380 }, { "epoch": 0.20922570016474465, "grad_norm": 0.42187454269421343, "learning_rate": 9.91061599735794e-07, "loss": 0.3309, "step": 381 }, { "epoch": 0.2097748489840747, "grad_norm": 0.7156265150667257, "learning_rate": 9.910068440663438e-07, "loss": 0.3885, "step": 382 }, { "epoch": 0.21032399780340472, "grad_norm": 0.48844455845770823, "learning_rate": 9.909519227170462e-07, "loss": 0.327, "step": 383 }, { "epoch": 0.21087314662273476, "grad_norm": 0.40278423009662806, "learning_rate": 9.90896835706435e-07, "loss": 0.333, "step": 384 }, { "epoch": 0.2114222954420648, "grad_norm": 0.4136773266220641, "learning_rate": 9.908415830531001e-07, "loss": 0.3112, "step": 385 }, { "epoch": 0.21197144426139483, "grad_norm": 0.4735432276249727, "learning_rate": 9.907861647756875e-07, "loss": 0.3505, "step": 386 }, { "epoch": 0.21252059308072488, "grad_norm": 0.5748703615776792, "learning_rate": 9.907305808928986e-07, "loss": 0.3604, "step": 387 }, { "epoch": 0.21306974190005493, "grad_norm": 0.5574444372579445, "learning_rate": 9.90674831423491e-07, "loss": 0.3052, "step": 388 }, { "epoch": 0.21361889071938495, "grad_norm": 0.47561555556499935, "learning_rate": 9.906189163862778e-07, "loss": 0.3821, "step": 389 }, { "epoch": 0.214168039538715, "grad_norm": 0.5230808399701486, "learning_rate": 9.905628358001286e-07, "loss": 0.3963, "step": 390 }, { "epoch": 0.21471718835804504, "grad_norm": 0.4833674871215576, "learning_rate": 9.905065896839685e-07, "loss": 0.2967, "step": 391 }, { "epoch": 0.21526633717737506, "grad_norm": 0.48558954395823517, "learning_rate": 9.904501780567783e-07, "loss": 0.2967, "step": 392 }, { "epoch": 0.2158154859967051, "grad_norm": 0.5202443193481654, "learning_rate": 9.903936009375951e-07, "loss": 0.3814, "step": 393 }, { "epoch": 0.21636463481603516, "grad_norm": 0.44496363289627866, "learning_rate": 9.903368583455112e-07, "loss": 0.3377, "step": 394 }, { "epoch": 0.21691378363536518, "grad_norm": 0.604333190934294, "learning_rate": 9.902799502996756e-07, "loss": 0.3327, "step": 395 }, { "epoch": 0.21746293245469522, "grad_norm": 0.40242662221613446, "learning_rate": 9.902228768192924e-07, "loss": 0.3188, "step": 396 }, { "epoch": 0.21801208127402527, "grad_norm": 0.4371578522370144, "learning_rate": 9.901656379236221e-07, "loss": 0.3034, "step": 397 }, { "epoch": 0.2185612300933553, "grad_norm": 0.3887168932521033, "learning_rate": 9.901082336319801e-07, "loss": 0.3485, "step": 398 }, { "epoch": 0.21911037891268534, "grad_norm": 0.46838390171823346, "learning_rate": 9.900506639637388e-07, "loss": 0.3148, "step": 399 }, { "epoch": 0.21965952773201539, "grad_norm": 0.5676664228092575, "learning_rate": 9.899929289383255e-07, "loss": 0.32, "step": 400 }, { "epoch": 0.21965952773201539, "eval_loss": 0.4228764772415161, "eval_runtime": 18.526, "eval_samples_per_second": 23.912, "eval_steps_per_second": 1.026, "step": 400 }, { "epoch": 0.2202086765513454, "grad_norm": 0.42390860934965974, "learning_rate": 9.89935028575224e-07, "loss": 0.3505, "step": 401 }, { "epoch": 0.22075782537067545, "grad_norm": 0.502187296561082, "learning_rate": 9.898769628939733e-07, "loss": 0.307, "step": 402 }, { "epoch": 0.2213069741900055, "grad_norm": 0.46942801548132196, "learning_rate": 9.898187319141685e-07, "loss": 0.3154, "step": 403 }, { "epoch": 0.22185612300933552, "grad_norm": 0.4747127982547361, "learning_rate": 9.897603356554602e-07, "loss": 0.3348, "step": 404 }, { "epoch": 0.22240527182866557, "grad_norm": 0.3863223529291394, "learning_rate": 9.897017741375553e-07, "loss": 0.3559, "step": 405 }, { "epoch": 0.22295442064799562, "grad_norm": 0.602147291296106, "learning_rate": 9.89643047380216e-07, "loss": 0.4005, "step": 406 }, { "epoch": 0.22350356946732564, "grad_norm": 0.5088089757894816, "learning_rate": 9.895841554032604e-07, "loss": 0.3228, "step": 407 }, { "epoch": 0.22405271828665568, "grad_norm": 0.41907597042910427, "learning_rate": 9.895250982265623e-07, "loss": 0.3393, "step": 408 }, { "epoch": 0.22460186710598573, "grad_norm": 0.44399642256849997, "learning_rate": 9.894658758700515e-07, "loss": 0.3604, "step": 409 }, { "epoch": 0.22515101592531575, "grad_norm": 1.0077425036179892, "learning_rate": 9.894064883537134e-07, "loss": 0.4804, "step": 410 }, { "epoch": 0.2257001647446458, "grad_norm": 0.45563398620135925, "learning_rate": 9.89346935697589e-07, "loss": 0.3046, "step": 411 }, { "epoch": 0.22624931356397585, "grad_norm": 1.004201405548782, "learning_rate": 9.89287217921775e-07, "loss": 0.3217, "step": 412 }, { "epoch": 0.22679846238330587, "grad_norm": 0.5717102983313682, "learning_rate": 9.892273350464241e-07, "loss": 0.3941, "step": 413 }, { "epoch": 0.2273476112026359, "grad_norm": 0.40676708183793037, "learning_rate": 9.891672870917443e-07, "loss": 0.3115, "step": 414 }, { "epoch": 0.22789676002196596, "grad_norm": 0.5754673510381386, "learning_rate": 9.891070740780001e-07, "loss": 0.3674, "step": 415 }, { "epoch": 0.22844590884129598, "grad_norm": 0.5466212066527785, "learning_rate": 9.89046696025511e-07, "loss": 0.3258, "step": 416 }, { "epoch": 0.22899505766062603, "grad_norm": 0.5214207015834531, "learning_rate": 9.889861529546518e-07, "loss": 0.3273, "step": 417 }, { "epoch": 0.22954420647995608, "grad_norm": 0.5770370955549603, "learning_rate": 9.889254448858543e-07, "loss": 0.3155, "step": 418 }, { "epoch": 0.2300933552992861, "grad_norm": 0.5659239065294162, "learning_rate": 9.888645718396048e-07, "loss": 0.3291, "step": 419 }, { "epoch": 0.23064250411861614, "grad_norm": 0.5331675770333455, "learning_rate": 9.888035338364458e-07, "loss": 0.3454, "step": 420 }, { "epoch": 0.2311916529379462, "grad_norm": 0.4993020562403641, "learning_rate": 9.887423308969757e-07, "loss": 0.3793, "step": 421 }, { "epoch": 0.2317408017572762, "grad_norm": 0.4403038899592374, "learning_rate": 9.88680963041848e-07, "loss": 0.3469, "step": 422 }, { "epoch": 0.23228995057660626, "grad_norm": 0.46570396264636643, "learning_rate": 9.886194302917718e-07, "loss": 0.2952, "step": 423 }, { "epoch": 0.2328390993959363, "grad_norm": 0.5520583196776558, "learning_rate": 9.885577326675123e-07, "loss": 0.3451, "step": 424 }, { "epoch": 0.23338824821526633, "grad_norm": 0.592935506523908, "learning_rate": 9.884958701898906e-07, "loss": 0.3436, "step": 425 }, { "epoch": 0.23393739703459637, "grad_norm": 0.381863023561074, "learning_rate": 9.884338428797823e-07, "loss": 0.3496, "step": 426 }, { "epoch": 0.23448654585392642, "grad_norm": 0.4081311020819649, "learning_rate": 9.8837165075812e-07, "loss": 0.3371, "step": 427 }, { "epoch": 0.23503569467325644, "grad_norm": 0.4145298686576029, "learning_rate": 9.883092938458906e-07, "loss": 0.3483, "step": 428 }, { "epoch": 0.2355848434925865, "grad_norm": 0.46947800565120756, "learning_rate": 9.88246772164138e-07, "loss": 0.3247, "step": 429 }, { "epoch": 0.23613399231191654, "grad_norm": 0.45028601564688364, "learning_rate": 9.881840857339603e-07, "loss": 0.3155, "step": 430 }, { "epoch": 0.23668314113124655, "grad_norm": 0.38861384435571195, "learning_rate": 9.881212345765125e-07, "loss": 0.3458, "step": 431 }, { "epoch": 0.2372322899505766, "grad_norm": 0.4087983048885045, "learning_rate": 9.880582187130037e-07, "loss": 0.3602, "step": 432 }, { "epoch": 0.23778143876990665, "grad_norm": 0.4451626928583509, "learning_rate": 9.879950381647004e-07, "loss": 0.2972, "step": 433 }, { "epoch": 0.23833058758923667, "grad_norm": 0.40197382302181084, "learning_rate": 9.879316929529227e-07, "loss": 0.3442, "step": 434 }, { "epoch": 0.23887973640856672, "grad_norm": 0.46154090432215, "learning_rate": 9.878681830990482e-07, "loss": 0.3248, "step": 435 }, { "epoch": 0.23942888522789676, "grad_norm": 0.5456605674792115, "learning_rate": 9.878045086245086e-07, "loss": 0.3336, "step": 436 }, { "epoch": 0.23997803404722678, "grad_norm": 0.5440045392154607, "learning_rate": 9.87740669550792e-07, "loss": 0.3114, "step": 437 }, { "epoch": 0.24052718286655683, "grad_norm": 0.43071394758946097, "learning_rate": 9.876766658994415e-07, "loss": 0.3272, "step": 438 }, { "epoch": 0.24107633168588688, "grad_norm": 0.383344813733855, "learning_rate": 9.87612497692056e-07, "loss": 0.3056, "step": 439 }, { "epoch": 0.24162548050521693, "grad_norm": 0.5017906990034449, "learning_rate": 9.875481649502897e-07, "loss": 0.3581, "step": 440 }, { "epoch": 0.24217462932454695, "grad_norm": 0.3908550031182706, "learning_rate": 9.87483667695853e-07, "loss": 0.3286, "step": 441 }, { "epoch": 0.242723778143877, "grad_norm": 0.568898227059734, "learning_rate": 9.87419005950511e-07, "loss": 0.3718, "step": 442 }, { "epoch": 0.24327292696320704, "grad_norm": 0.5045646022299286, "learning_rate": 9.87354179736085e-07, "loss": 0.3311, "step": 443 }, { "epoch": 0.24382207578253706, "grad_norm": 0.4389838124329114, "learning_rate": 9.872891890744511e-07, "loss": 0.3282, "step": 444 }, { "epoch": 0.2443712246018671, "grad_norm": 0.3868336363651343, "learning_rate": 9.872240339875414e-07, "loss": 0.301, "step": 445 }, { "epoch": 0.24492037342119716, "grad_norm": 0.5277805286968514, "learning_rate": 9.871587144973434e-07, "loss": 0.285, "step": 446 }, { "epoch": 0.24546952224052718, "grad_norm": 0.5246407034439886, "learning_rate": 9.870932306258998e-07, "loss": 0.3451, "step": 447 }, { "epoch": 0.24601867105985722, "grad_norm": 0.5173616515843543, "learning_rate": 9.870275823953094e-07, "loss": 0.2902, "step": 448 }, { "epoch": 0.24656781987918727, "grad_norm": 0.5486693791715967, "learning_rate": 9.869617698277256e-07, "loss": 0.297, "step": 449 }, { "epoch": 0.2471169686985173, "grad_norm": 0.4021717743743248, "learning_rate": 9.86895792945358e-07, "loss": 0.3276, "step": 450 }, { "epoch": 0.24766611751784734, "grad_norm": 0.45452302398819655, "learning_rate": 9.868296517704712e-07, "loss": 0.3392, "step": 451 }, { "epoch": 0.2482152663371774, "grad_norm": 0.39928704714891894, "learning_rate": 9.867633463253854e-07, "loss": 0.2873, "step": 452 }, { "epoch": 0.2487644151565074, "grad_norm": 0.6310486620676942, "learning_rate": 9.866968766324767e-07, "loss": 0.4076, "step": 453 }, { "epoch": 0.24931356397583745, "grad_norm": 0.49015974476959123, "learning_rate": 9.866302427141756e-07, "loss": 0.3256, "step": 454 }, { "epoch": 0.2498627127951675, "grad_norm": 0.4665908418347988, "learning_rate": 9.865634445929688e-07, "loss": 0.2981, "step": 455 }, { "epoch": 0.2504118616144975, "grad_norm": 0.44224726582538065, "learning_rate": 9.864964822913985e-07, "loss": 0.3123, "step": 456 }, { "epoch": 0.25096101043382757, "grad_norm": 0.4524761141687417, "learning_rate": 9.864293558320615e-07, "loss": 0.3094, "step": 457 }, { "epoch": 0.2515101592531576, "grad_norm": 0.4987568098773706, "learning_rate": 9.863620652376107e-07, "loss": 0.33, "step": 458 }, { "epoch": 0.25205930807248766, "grad_norm": 0.487182594814918, "learning_rate": 9.862946105307541e-07, "loss": 0.3318, "step": 459 }, { "epoch": 0.25260845689181766, "grad_norm": 0.45648480236265654, "learning_rate": 9.862269917342555e-07, "loss": 0.3134, "step": 460 }, { "epoch": 0.2531576057111477, "grad_norm": 0.49300967815141156, "learning_rate": 9.861592088709335e-07, "loss": 0.3525, "step": 461 }, { "epoch": 0.25370675453047775, "grad_norm": 0.3701771570295517, "learning_rate": 9.860912619636625e-07, "loss": 0.336, "step": 462 }, { "epoch": 0.2542559033498078, "grad_norm": 0.3706429817164138, "learning_rate": 9.860231510353717e-07, "loss": 0.306, "step": 463 }, { "epoch": 0.25480505216913785, "grad_norm": 0.5397276494382715, "learning_rate": 9.859548761090466e-07, "loss": 0.3404, "step": 464 }, { "epoch": 0.2553542009884679, "grad_norm": 0.5635005586188511, "learning_rate": 9.85886437207727e-07, "loss": 0.3423, "step": 465 }, { "epoch": 0.2559033498077979, "grad_norm": 0.4904923005097277, "learning_rate": 9.858178343545085e-07, "loss": 0.3386, "step": 466 }, { "epoch": 0.25645249862712793, "grad_norm": 0.5398129446008373, "learning_rate": 9.857490675725423e-07, "loss": 0.3322, "step": 467 }, { "epoch": 0.257001647446458, "grad_norm": 0.5476004333768724, "learning_rate": 9.856801368850347e-07, "loss": 0.3237, "step": 468 }, { "epoch": 0.25755079626578803, "grad_norm": 0.4837385163663176, "learning_rate": 9.856110423152472e-07, "loss": 0.3467, "step": 469 }, { "epoch": 0.2580999450851181, "grad_norm": 0.4292631708341571, "learning_rate": 9.855417838864964e-07, "loss": 0.3378, "step": 470 }, { "epoch": 0.2586490939044481, "grad_norm": 0.49297624374899757, "learning_rate": 9.854723616221547e-07, "loss": 0.2706, "step": 471 }, { "epoch": 0.2591982427237781, "grad_norm": 0.6009134590862357, "learning_rate": 9.854027755456494e-07, "loss": 0.3744, "step": 472 }, { "epoch": 0.25974739154310816, "grad_norm": 0.499450242151516, "learning_rate": 9.853330256804637e-07, "loss": 0.3075, "step": 473 }, { "epoch": 0.2602965403624382, "grad_norm": 0.48280821398488216, "learning_rate": 9.85263112050135e-07, "loss": 0.3468, "step": 474 }, { "epoch": 0.26084568918176826, "grad_norm": 0.41285380008388917, "learning_rate": 9.851930346782568e-07, "loss": 0.3569, "step": 475 }, { "epoch": 0.2613948380010983, "grad_norm": 0.5351488316687973, "learning_rate": 9.85122793588478e-07, "loss": 0.3331, "step": 476 }, { "epoch": 0.26194398682042835, "grad_norm": 0.5028414897022058, "learning_rate": 9.850523888045017e-07, "loss": 0.372, "step": 477 }, { "epoch": 0.2624931356397584, "grad_norm": 0.4735933901803555, "learning_rate": 9.849818203500874e-07, "loss": 0.2975, "step": 478 }, { "epoch": 0.2630422844590884, "grad_norm": 0.36759435988077777, "learning_rate": 9.849110882490492e-07, "loss": 0.3433, "step": 479 }, { "epoch": 0.26359143327841844, "grad_norm": 0.46104607271477915, "learning_rate": 9.848401925252565e-07, "loss": 0.3375, "step": 480 }, { "epoch": 0.2641405820977485, "grad_norm": 0.49767284968597836, "learning_rate": 9.847691332026344e-07, "loss": 0.2742, "step": 481 }, { "epoch": 0.26468973091707854, "grad_norm": 0.4351918538089829, "learning_rate": 9.846979103051624e-07, "loss": 0.3218, "step": 482 }, { "epoch": 0.2652388797364086, "grad_norm": 0.44627295269938216, "learning_rate": 9.846265238568757e-07, "loss": 0.2978, "step": 483 }, { "epoch": 0.26578802855573863, "grad_norm": 0.5349739099174223, "learning_rate": 9.845549738818645e-07, "loss": 0.3389, "step": 484 }, { "epoch": 0.2663371773750686, "grad_norm": 0.5124644709235744, "learning_rate": 9.844832604042745e-07, "loss": 0.3172, "step": 485 }, { "epoch": 0.26688632619439867, "grad_norm": 0.6418111086810989, "learning_rate": 9.844113834483061e-07, "loss": 0.3165, "step": 486 }, { "epoch": 0.2674354750137287, "grad_norm": 0.5545339025295388, "learning_rate": 9.843393430382155e-07, "loss": 0.3014, "step": 487 }, { "epoch": 0.26798462383305877, "grad_norm": 0.6592300615544398, "learning_rate": 9.842671391983135e-07, "loss": 0.3403, "step": 488 }, { "epoch": 0.2685337726523888, "grad_norm": 0.5104216097006875, "learning_rate": 9.841947719529659e-07, "loss": 0.3526, "step": 489 }, { "epoch": 0.26908292147171886, "grad_norm": 0.41034725870092115, "learning_rate": 9.841222413265942e-07, "loss": 0.3084, "step": 490 }, { "epoch": 0.26963207029104885, "grad_norm": 0.5407484438988068, "learning_rate": 9.840495473436752e-07, "loss": 0.3698, "step": 491 }, { "epoch": 0.2701812191103789, "grad_norm": 0.5530023755029136, "learning_rate": 9.8397669002874e-07, "loss": 0.3221, "step": 492 }, { "epoch": 0.27073036792970895, "grad_norm": 0.4166421182937445, "learning_rate": 9.839036694063754e-07, "loss": 0.3077, "step": 493 }, { "epoch": 0.271279516749039, "grad_norm": 0.6849734541567781, "learning_rate": 9.83830485501223e-07, "loss": 0.3319, "step": 494 }, { "epoch": 0.27182866556836904, "grad_norm": 0.4533952074845478, "learning_rate": 9.8375713833798e-07, "loss": 0.3188, "step": 495 }, { "epoch": 0.2723778143876991, "grad_norm": 0.4733529516975668, "learning_rate": 9.836836279413981e-07, "loss": 0.313, "step": 496 }, { "epoch": 0.2729269632070291, "grad_norm": 0.3758010717560732, "learning_rate": 9.836099543362845e-07, "loss": 0.29, "step": 497 }, { "epoch": 0.27347611202635913, "grad_norm": 0.6357215798182766, "learning_rate": 9.835361175475014e-07, "loss": 0.3559, "step": 498 }, { "epoch": 0.2740252608456892, "grad_norm": 0.44075419639782126, "learning_rate": 9.834621175999656e-07, "loss": 0.2992, "step": 499 }, { "epoch": 0.2745744096650192, "grad_norm": 0.3832583022306382, "learning_rate": 9.833879545186496e-07, "loss": 0.2806, "step": 500 }, { "epoch": 0.2751235584843493, "grad_norm": 0.4994007202886253, "learning_rate": 9.833136283285809e-07, "loss": 0.3425, "step": 501 }, { "epoch": 0.2756727073036793, "grad_norm": 0.4639517335419424, "learning_rate": 9.832391390548417e-07, "loss": 0.3235, "step": 502 }, { "epoch": 0.2762218561230093, "grad_norm": 0.508594139359251, "learning_rate": 9.831644867225692e-07, "loss": 0.3303, "step": 503 }, { "epoch": 0.27677100494233936, "grad_norm": 0.44897951727340707, "learning_rate": 9.83089671356956e-07, "loss": 0.2931, "step": 504 }, { "epoch": 0.2773201537616694, "grad_norm": 0.5360986959496855, "learning_rate": 9.830146929832497e-07, "loss": 0.3026, "step": 505 }, { "epoch": 0.27786930258099946, "grad_norm": 0.46558855157707546, "learning_rate": 9.829395516267524e-07, "loss": 0.3499, "step": 506 }, { "epoch": 0.2784184514003295, "grad_norm": 0.4142034337717624, "learning_rate": 9.828642473128217e-07, "loss": 0.3138, "step": 507 }, { "epoch": 0.27896760021965955, "grad_norm": 0.4189635437858386, "learning_rate": 9.8278878006687e-07, "loss": 0.3207, "step": 508 }, { "epoch": 0.27951674903898954, "grad_norm": 0.39559383258550235, "learning_rate": 9.827131499143647e-07, "loss": 0.3123, "step": 509 }, { "epoch": 0.2800658978583196, "grad_norm": 0.49778186676134906, "learning_rate": 9.826373568808282e-07, "loss": 0.337, "step": 510 }, { "epoch": 0.28061504667764964, "grad_norm": 0.5150403201162589, "learning_rate": 9.82561400991838e-07, "loss": 0.3322, "step": 511 }, { "epoch": 0.2811641954969797, "grad_norm": 0.46460221311660477, "learning_rate": 9.824852822730263e-07, "loss": 0.3322, "step": 512 }, { "epoch": 0.28171334431630973, "grad_norm": 0.4666271193920616, "learning_rate": 9.824090007500802e-07, "loss": 0.3054, "step": 513 }, { "epoch": 0.2822624931356398, "grad_norm": 0.46552853262562716, "learning_rate": 9.823325564487422e-07, "loss": 0.3143, "step": 514 }, { "epoch": 0.2828116419549698, "grad_norm": 0.7429484365482533, "learning_rate": 9.822559493948093e-07, "loss": 0.3501, "step": 515 }, { "epoch": 0.2833607907742998, "grad_norm": 0.485070342608985, "learning_rate": 9.821791796141335e-07, "loss": 0.3162, "step": 516 }, { "epoch": 0.28390993959362987, "grad_norm": 0.6414310908698505, "learning_rate": 9.821022471326217e-07, "loss": 0.3464, "step": 517 }, { "epoch": 0.2844590884129599, "grad_norm": 0.5547569678188016, "learning_rate": 9.820251519762361e-07, "loss": 0.2916, "step": 518 }, { "epoch": 0.28500823723228996, "grad_norm": 0.5534751713608991, "learning_rate": 9.819478941709933e-07, "loss": 0.3097, "step": 519 }, { "epoch": 0.28555738605162, "grad_norm": 0.3729588387904154, "learning_rate": 9.818704737429648e-07, "loss": 0.3549, "step": 520 }, { "epoch": 0.28610653487095, "grad_norm": 0.4515705048081947, "learning_rate": 9.817928907182773e-07, "loss": 0.3223, "step": 521 }, { "epoch": 0.28665568369028005, "grad_norm": 0.5633114309906939, "learning_rate": 9.81715145123112e-07, "loss": 0.3637, "step": 522 }, { "epoch": 0.2872048325096101, "grad_norm": 0.5583309472873978, "learning_rate": 9.816372369837058e-07, "loss": 0.3061, "step": 523 }, { "epoch": 0.28775398132894014, "grad_norm": 0.5645992858236648, "learning_rate": 9.81559166326349e-07, "loss": 0.3362, "step": 524 }, { "epoch": 0.2883031301482702, "grad_norm": 0.4261714536575483, "learning_rate": 9.814809331773882e-07, "loss": 0.339, "step": 525 }, { "epoch": 0.28885227896760024, "grad_norm": 0.468084747846434, "learning_rate": 9.81402537563224e-07, "loss": 0.2902, "step": 526 }, { "epoch": 0.28940142778693023, "grad_norm": 0.5508973041842398, "learning_rate": 9.813239795103118e-07, "loss": 0.3396, "step": 527 }, { "epoch": 0.2899505766062603, "grad_norm": 0.47663021394730964, "learning_rate": 9.812452590451625e-07, "loss": 0.2864, "step": 528 }, { "epoch": 0.2904997254255903, "grad_norm": 0.5248569052958911, "learning_rate": 9.81166376194341e-07, "loss": 0.3292, "step": 529 }, { "epoch": 0.2910488742449204, "grad_norm": 0.45095303752764915, "learning_rate": 9.810873309844674e-07, "loss": 0.3055, "step": 530 }, { "epoch": 0.2915980230642504, "grad_norm": 0.4316577045972638, "learning_rate": 9.810081234422168e-07, "loss": 0.3126, "step": 531 }, { "epoch": 0.29214717188358047, "grad_norm": 0.5426342316958122, "learning_rate": 9.809287535943186e-07, "loss": 0.3433, "step": 532 }, { "epoch": 0.29269632070291046, "grad_norm": 0.45129548620138626, "learning_rate": 9.80849221467557e-07, "loss": 0.295, "step": 533 }, { "epoch": 0.2932454695222405, "grad_norm": 0.4349520560713567, "learning_rate": 9.807695270887717e-07, "loss": 0.3176, "step": 534 }, { "epoch": 0.29379461834157056, "grad_norm": 0.4750205582822795, "learning_rate": 9.80689670484856e-07, "loss": 0.318, "step": 535 }, { "epoch": 0.2943437671609006, "grad_norm": 0.35635393599511017, "learning_rate": 9.80609651682759e-07, "loss": 0.3028, "step": 536 }, { "epoch": 0.29489291598023065, "grad_norm": 0.6318683320306482, "learning_rate": 9.80529470709484e-07, "loss": 0.3273, "step": 537 }, { "epoch": 0.2954420647995607, "grad_norm": 0.41845496479417515, "learning_rate": 9.804491275920891e-07, "loss": 0.3262, "step": 538 }, { "epoch": 0.2959912136188907, "grad_norm": 0.4784822041779314, "learning_rate": 9.803686223576873e-07, "loss": 0.3521, "step": 539 }, { "epoch": 0.29654036243822074, "grad_norm": 0.4881172617542623, "learning_rate": 9.80287955033446e-07, "loss": 0.3166, "step": 540 }, { "epoch": 0.2970895112575508, "grad_norm": 0.5887935373117399, "learning_rate": 9.802071256465871e-07, "loss": 0.3493, "step": 541 }, { "epoch": 0.29763866007688083, "grad_norm": 0.4421694064847873, "learning_rate": 9.801261342243882e-07, "loss": 0.2972, "step": 542 }, { "epoch": 0.2981878088962109, "grad_norm": 0.4926102490919537, "learning_rate": 9.800449807941805e-07, "loss": 0.2786, "step": 543 }, { "epoch": 0.29873695771554093, "grad_norm": 0.3943114407203231, "learning_rate": 9.799636653833503e-07, "loss": 0.2983, "step": 544 }, { "epoch": 0.299286106534871, "grad_norm": 0.5343510083262023, "learning_rate": 9.79882188019339e-07, "loss": 0.3005, "step": 545 }, { "epoch": 0.29983525535420097, "grad_norm": 0.40683853158379923, "learning_rate": 9.798005487296414e-07, "loss": 0.3554, "step": 546 }, { "epoch": 0.300384404173531, "grad_norm": 0.4227452823290433, "learning_rate": 9.797187475418085e-07, "loss": 0.3598, "step": 547 }, { "epoch": 0.30093355299286106, "grad_norm": 0.5216737746477398, "learning_rate": 9.796367844834448e-07, "loss": 0.2903, "step": 548 }, { "epoch": 0.3014827018121911, "grad_norm": 0.47930504282709674, "learning_rate": 9.795546595822099e-07, "loss": 0.2803, "step": 549 }, { "epoch": 0.30203185063152116, "grad_norm": 0.4129539214802851, "learning_rate": 9.794723728658183e-07, "loss": 0.3148, "step": 550 }, { "epoch": 0.3025809994508512, "grad_norm": 0.5261404252421711, "learning_rate": 9.79389924362038e-07, "loss": 0.3738, "step": 551 }, { "epoch": 0.3031301482701812, "grad_norm": 0.4518393459552854, "learning_rate": 9.793073140986928e-07, "loss": 0.2865, "step": 552 }, { "epoch": 0.30367929708951125, "grad_norm": 0.42913338660332895, "learning_rate": 9.792245421036605e-07, "loss": 0.3402, "step": 553 }, { "epoch": 0.3042284459088413, "grad_norm": 0.37538280309301175, "learning_rate": 9.791416084048735e-07, "loss": 0.347, "step": 554 }, { "epoch": 0.30477759472817134, "grad_norm": 0.47867070657374167, "learning_rate": 9.790585130303194e-07, "loss": 0.3229, "step": 555 }, { "epoch": 0.3053267435475014, "grad_norm": 0.5287224541697721, "learning_rate": 9.78975256008039e-07, "loss": 0.2912, "step": 556 }, { "epoch": 0.30587589236683144, "grad_norm": 0.6067703821218071, "learning_rate": 9.788918373661291e-07, "loss": 0.3871, "step": 557 }, { "epoch": 0.30642504118616143, "grad_norm": 0.5708585514938724, "learning_rate": 9.788082571327403e-07, "loss": 0.3505, "step": 558 }, { "epoch": 0.3069741900054915, "grad_norm": 0.5264752428886793, "learning_rate": 9.787245153360776e-07, "loss": 0.31, "step": 559 }, { "epoch": 0.3075233388248215, "grad_norm": 0.3697823456895624, "learning_rate": 9.786406120044012e-07, "loss": 0.3203, "step": 560 }, { "epoch": 0.30807248764415157, "grad_norm": 0.6486026958975594, "learning_rate": 9.785565471660249e-07, "loss": 0.3225, "step": 561 }, { "epoch": 0.3086216364634816, "grad_norm": 0.4376569906728734, "learning_rate": 9.784723208493178e-07, "loss": 0.3138, "step": 562 }, { "epoch": 0.30917078528281167, "grad_norm": 0.5791392718704185, "learning_rate": 9.78387933082703e-07, "loss": 0.3242, "step": 563 }, { "epoch": 0.30971993410214166, "grad_norm": 0.5969372123363916, "learning_rate": 9.783033838946583e-07, "loss": 0.3092, "step": 564 }, { "epoch": 0.3102690829214717, "grad_norm": 0.49006087125156034, "learning_rate": 9.782186733137163e-07, "loss": 0.2805, "step": 565 }, { "epoch": 0.31081823174080175, "grad_norm": 0.3876002518816803, "learning_rate": 9.781338013684633e-07, "loss": 0.3589, "step": 566 }, { "epoch": 0.3113673805601318, "grad_norm": 0.4634736741485902, "learning_rate": 9.780487680875403e-07, "loss": 0.3139, "step": 567 }, { "epoch": 0.31191652937946185, "grad_norm": 0.4101658048398333, "learning_rate": 9.779635734996432e-07, "loss": 0.2801, "step": 568 }, { "epoch": 0.3124656781987919, "grad_norm": 0.4085151582885634, "learning_rate": 9.778782176335223e-07, "loss": 0.3019, "step": 569 }, { "epoch": 0.3130148270181219, "grad_norm": 0.43294500024428756, "learning_rate": 9.777927005179814e-07, "loss": 0.3088, "step": 570 }, { "epoch": 0.31356397583745194, "grad_norm": 0.39046159391356045, "learning_rate": 9.7770702218188e-07, "loss": 0.2653, "step": 571 }, { "epoch": 0.314113124656782, "grad_norm": 0.5399396643524935, "learning_rate": 9.776211826541307e-07, "loss": 0.3196, "step": 572 }, { "epoch": 0.31466227347611203, "grad_norm": 0.3875407680219636, "learning_rate": 9.77535181963702e-07, "loss": 0.3062, "step": 573 }, { "epoch": 0.3152114222954421, "grad_norm": 0.7710894183016851, "learning_rate": 9.774490201396153e-07, "loss": 0.3637, "step": 574 }, { "epoch": 0.3157605711147721, "grad_norm": 0.6260393246599966, "learning_rate": 9.773626972109473e-07, "loss": 0.3272, "step": 575 }, { "epoch": 0.3163097199341021, "grad_norm": 0.47925775372440726, "learning_rate": 9.772762132068289e-07, "loss": 0.3634, "step": 576 }, { "epoch": 0.31685886875343217, "grad_norm": 0.5270539925513561, "learning_rate": 9.77189568156445e-07, "loss": 0.2908, "step": 577 }, { "epoch": 0.3174080175727622, "grad_norm": 0.6228008650500568, "learning_rate": 9.771027620890354e-07, "loss": 0.411, "step": 578 }, { "epoch": 0.31795716639209226, "grad_norm": 0.4044321698401574, "learning_rate": 9.770157950338937e-07, "loss": 0.3164, "step": 579 }, { "epoch": 0.3185063152114223, "grad_norm": 0.5729343020853266, "learning_rate": 9.769286670203684e-07, "loss": 0.3132, "step": 580 }, { "epoch": 0.31905546403075236, "grad_norm": 0.5291556395514854, "learning_rate": 9.768413780778617e-07, "loss": 0.3126, "step": 581 }, { "epoch": 0.31960461285008235, "grad_norm": 0.38093655467159043, "learning_rate": 9.767539282358303e-07, "loss": 0.2945, "step": 582 }, { "epoch": 0.3201537616694124, "grad_norm": 0.5146897586541881, "learning_rate": 9.766663175237855e-07, "loss": 0.289, "step": 583 }, { "epoch": 0.32070291048874244, "grad_norm": 0.6481568907692759, "learning_rate": 9.76578545971293e-07, "loss": 0.2729, "step": 584 }, { "epoch": 0.3212520593080725, "grad_norm": 0.5173196623975647, "learning_rate": 9.76490613607972e-07, "loss": 0.2613, "step": 585 }, { "epoch": 0.32180120812740254, "grad_norm": 0.7492756748904551, "learning_rate": 9.764025204634966e-07, "loss": 0.3042, "step": 586 }, { "epoch": 0.3223503569467326, "grad_norm": 0.4470615640514475, "learning_rate": 9.763142665675948e-07, "loss": 0.2581, "step": 587 }, { "epoch": 0.3228995057660626, "grad_norm": 0.44794315830164905, "learning_rate": 9.762258519500494e-07, "loss": 0.2843, "step": 588 }, { "epoch": 0.3234486545853926, "grad_norm": 0.49029693062976754, "learning_rate": 9.761372766406968e-07, "loss": 0.3074, "step": 589 }, { "epoch": 0.3239978034047227, "grad_norm": 0.605411935679811, "learning_rate": 9.76048540669428e-07, "loss": 0.3015, "step": 590 }, { "epoch": 0.3245469522240527, "grad_norm": 0.46431875631226277, "learning_rate": 9.75959644066188e-07, "loss": 0.2868, "step": 591 }, { "epoch": 0.32509610104338277, "grad_norm": 0.46579396920898236, "learning_rate": 9.758705868609762e-07, "loss": 0.3274, "step": 592 }, { "epoch": 0.3256452498627128, "grad_norm": 0.46574517082980155, "learning_rate": 9.757813690838464e-07, "loss": 0.3189, "step": 593 }, { "epoch": 0.3261943986820428, "grad_norm": 0.5517949321627059, "learning_rate": 9.756919907649059e-07, "loss": 0.3258, "step": 594 }, { "epoch": 0.32674354750137286, "grad_norm": 0.5641509784115365, "learning_rate": 9.756024519343169e-07, "loss": 0.2857, "step": 595 }, { "epoch": 0.3272926963207029, "grad_norm": 0.4618260776880444, "learning_rate": 9.755127526222953e-07, "loss": 0.2739, "step": 596 }, { "epoch": 0.32784184514003295, "grad_norm": 0.5145814405256917, "learning_rate": 9.754228928591113e-07, "loss": 0.3695, "step": 597 }, { "epoch": 0.328390993959363, "grad_norm": 0.43421024893078836, "learning_rate": 9.753328726750893e-07, "loss": 0.3141, "step": 598 }, { "epoch": 0.32894014277869305, "grad_norm": 0.4474944382114859, "learning_rate": 9.752426921006077e-07, "loss": 0.2972, "step": 599 }, { "epoch": 0.32948929159802304, "grad_norm": 0.5577674749637513, "learning_rate": 9.751523511660992e-07, "loss": 0.3282, "step": 600 }, { "epoch": 0.32948929159802304, "eval_loss": 0.3990951478481293, "eval_runtime": 21.3598, "eval_samples_per_second": 20.74, "eval_steps_per_second": 0.89, "step": 600 }, { "epoch": 0.3300384404173531, "grad_norm": 0.7535138789624793, "learning_rate": 9.750618499020507e-07, "loss": 0.2793, "step": 601 }, { "epoch": 0.33058758923668313, "grad_norm": 0.5596192387559347, "learning_rate": 9.749711883390028e-07, "loss": 0.2913, "step": 602 }, { "epoch": 0.3311367380560132, "grad_norm": 0.4916395046398006, "learning_rate": 9.748803665075505e-07, "loss": 0.2846, "step": 603 }, { "epoch": 0.3316858868753432, "grad_norm": 0.5387551035946881, "learning_rate": 9.74789384438343e-07, "loss": 0.3312, "step": 604 }, { "epoch": 0.3322350356946733, "grad_norm": 0.38088976392666973, "learning_rate": 9.74698242162083e-07, "loss": 0.3003, "step": 605 }, { "epoch": 0.33278418451400327, "grad_norm": 0.355638876086518, "learning_rate": 9.746069397095282e-07, "loss": 0.3218, "step": 606 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5147789610443237, "learning_rate": 9.745154771114893e-07, "loss": 0.3119, "step": 607 }, { "epoch": 0.33388248215266336, "grad_norm": 0.4174078167671674, "learning_rate": 9.74423854398832e-07, "loss": 0.3443, "step": 608 }, { "epoch": 0.3344316309719934, "grad_norm": 0.43132485845172025, "learning_rate": 9.743320716024752e-07, "loss": 0.2791, "step": 609 }, { "epoch": 0.33498077979132346, "grad_norm": 0.5576109280264451, "learning_rate": 9.742401287533924e-07, "loss": 0.2937, "step": 610 }, { "epoch": 0.3355299286106535, "grad_norm": 0.3849380469438766, "learning_rate": 9.741480258826108e-07, "loss": 0.305, "step": 611 }, { "epoch": 0.33607907742998355, "grad_norm": 0.4118908381245245, "learning_rate": 9.74055763021212e-07, "loss": 0.3115, "step": 612 }, { "epoch": 0.33662822624931354, "grad_norm": 0.5583608347142233, "learning_rate": 9.739633402003312e-07, "loss": 0.3299, "step": 613 }, { "epoch": 0.3371773750686436, "grad_norm": 0.970066195981141, "learning_rate": 9.738707574511575e-07, "loss": 0.492, "step": 614 }, { "epoch": 0.33772652388797364, "grad_norm": 0.47213379281722617, "learning_rate": 9.737780148049343e-07, "loss": 0.3081, "step": 615 }, { "epoch": 0.3382756727073037, "grad_norm": 0.4913912076262507, "learning_rate": 9.73685112292959e-07, "loss": 0.2741, "step": 616 }, { "epoch": 0.33882482152663373, "grad_norm": 0.4597155671956856, "learning_rate": 9.735920499465826e-07, "loss": 0.3032, "step": 617 }, { "epoch": 0.3393739703459638, "grad_norm": 0.3935996589821795, "learning_rate": 9.7349882779721e-07, "loss": 0.3091, "step": 618 }, { "epoch": 0.3399231191652938, "grad_norm": 0.4209399399615395, "learning_rate": 9.734054458763005e-07, "loss": 0.3454, "step": 619 }, { "epoch": 0.3404722679846238, "grad_norm": 0.6537994713114161, "learning_rate": 9.73311904215367e-07, "loss": 0.3546, "step": 620 }, { "epoch": 0.34102141680395387, "grad_norm": 0.48905936776390374, "learning_rate": 9.732182028459767e-07, "loss": 0.2848, "step": 621 }, { "epoch": 0.3415705656232839, "grad_norm": 0.4938866124667207, "learning_rate": 9.731243417997498e-07, "loss": 0.3066, "step": 622 }, { "epoch": 0.34211971444261396, "grad_norm": 0.5613140069418393, "learning_rate": 9.730303211083612e-07, "loss": 0.313, "step": 623 }, { "epoch": 0.342668863261944, "grad_norm": 0.6190131733066165, "learning_rate": 9.729361408035396e-07, "loss": 0.3107, "step": 624 }, { "epoch": 0.343218012081274, "grad_norm": 0.5156384523404802, "learning_rate": 9.728418009170672e-07, "loss": 0.3055, "step": 625 }, { "epoch": 0.34376716090060405, "grad_norm": 0.45191362956934983, "learning_rate": 9.7274730148078e-07, "loss": 0.2908, "step": 626 }, { "epoch": 0.3443163097199341, "grad_norm": 0.47085252608535416, "learning_rate": 9.726526425265684e-07, "loss": 0.3348, "step": 627 }, { "epoch": 0.34486545853926415, "grad_norm": 0.5525369365824818, "learning_rate": 9.72557824086376e-07, "loss": 0.3105, "step": 628 }, { "epoch": 0.3454146073585942, "grad_norm": 0.4510172443452184, "learning_rate": 9.724628461922012e-07, "loss": 0.3053, "step": 629 }, { "epoch": 0.34596375617792424, "grad_norm": 0.6535047047887291, "learning_rate": 9.723677088760948e-07, "loss": 0.2922, "step": 630 }, { "epoch": 0.34651290499725423, "grad_norm": 0.74351729819573, "learning_rate": 9.722724121701626e-07, "loss": 0.3051, "step": 631 }, { "epoch": 0.3470620538165843, "grad_norm": 0.5805013009226808, "learning_rate": 9.721769561065636e-07, "loss": 0.3594, "step": 632 }, { "epoch": 0.34761120263591433, "grad_norm": 0.5814129131140799, "learning_rate": 9.720813407175104e-07, "loss": 0.3167, "step": 633 }, { "epoch": 0.3481603514552444, "grad_norm": 0.6309209196880202, "learning_rate": 9.7198556603527e-07, "loss": 0.3734, "step": 634 }, { "epoch": 0.3487095002745744, "grad_norm": 0.46224009558769286, "learning_rate": 9.71889632092163e-07, "loss": 0.287, "step": 635 }, { "epoch": 0.34925864909390447, "grad_norm": 0.5272531876962709, "learning_rate": 9.71793538920563e-07, "loss": 0.279, "step": 636 }, { "epoch": 0.34980779791323446, "grad_norm": 0.44163137495470217, "learning_rate": 9.716972865528985e-07, "loss": 0.2678, "step": 637 }, { "epoch": 0.3503569467325645, "grad_norm": 0.6380350145592627, "learning_rate": 9.716008750216508e-07, "loss": 0.3261, "step": 638 }, { "epoch": 0.35090609555189456, "grad_norm": 0.6248761009397122, "learning_rate": 9.715043043593553e-07, "loss": 0.3116, "step": 639 }, { "epoch": 0.3514552443712246, "grad_norm": 0.510139799304016, "learning_rate": 9.71407574598601e-07, "loss": 0.3322, "step": 640 }, { "epoch": 0.35200439319055465, "grad_norm": 0.5148774048289841, "learning_rate": 9.713106857720308e-07, "loss": 0.2989, "step": 641 }, { "epoch": 0.3525535420098847, "grad_norm": 0.5052236723776224, "learning_rate": 9.712136379123408e-07, "loss": 0.2853, "step": 642 }, { "epoch": 0.3531026908292147, "grad_norm": 0.4195131561701459, "learning_rate": 9.711164310522813e-07, "loss": 0.3065, "step": 643 }, { "epoch": 0.35365183964854474, "grad_norm": 0.5377543477776671, "learning_rate": 9.710190652246561e-07, "loss": 0.3426, "step": 644 }, { "epoch": 0.3542009884678748, "grad_norm": 0.4133377610162905, "learning_rate": 9.709215404623225e-07, "loss": 0.281, "step": 645 }, { "epoch": 0.35475013728720484, "grad_norm": 0.41211088337271534, "learning_rate": 9.708238567981914e-07, "loss": 0.2874, "step": 646 }, { "epoch": 0.3552992861065349, "grad_norm": 0.5723847095442636, "learning_rate": 9.707260142652274e-07, "loss": 0.2726, "step": 647 }, { "epoch": 0.35584843492586493, "grad_norm": 0.5196741588284901, "learning_rate": 9.706280128964493e-07, "loss": 0.3138, "step": 648 }, { "epoch": 0.3563975837451949, "grad_norm": 0.4787700955419951, "learning_rate": 9.705298527249282e-07, "loss": 0.3165, "step": 649 }, { "epoch": 0.35694673256452497, "grad_norm": 0.6034763612698993, "learning_rate": 9.7043153378379e-07, "loss": 0.3168, "step": 650 }, { "epoch": 0.357495881383855, "grad_norm": 0.581155600436397, "learning_rate": 9.703330561062134e-07, "loss": 0.3249, "step": 651 }, { "epoch": 0.35804503020318507, "grad_norm": 0.4296014615750807, "learning_rate": 9.702344197254315e-07, "loss": 0.2664, "step": 652 }, { "epoch": 0.3585941790225151, "grad_norm": 0.4297859552951615, "learning_rate": 9.701356246747298e-07, "loss": 0.2974, "step": 653 }, { "epoch": 0.35914332784184516, "grad_norm": 0.3995106691189651, "learning_rate": 9.700366709874486e-07, "loss": 0.3457, "step": 654 }, { "epoch": 0.35969247666117515, "grad_norm": 0.500550982356711, "learning_rate": 9.699375586969807e-07, "loss": 0.2624, "step": 655 }, { "epoch": 0.3602416254805052, "grad_norm": 0.42314748190658086, "learning_rate": 9.69838287836773e-07, "loss": 0.2745, "step": 656 }, { "epoch": 0.36079077429983525, "grad_norm": 0.4876435943431558, "learning_rate": 9.697388584403256e-07, "loss": 0.3278, "step": 657 }, { "epoch": 0.3613399231191653, "grad_norm": 0.5284152926781756, "learning_rate": 9.696392705411926e-07, "loss": 0.3094, "step": 658 }, { "epoch": 0.36188907193849534, "grad_norm": 0.4483408446657781, "learning_rate": 9.69539524172981e-07, "loss": 0.3084, "step": 659 }, { "epoch": 0.3624382207578254, "grad_norm": 0.44898867507859125, "learning_rate": 9.694396193693517e-07, "loss": 0.2784, "step": 660 }, { "epoch": 0.3629873695771554, "grad_norm": 0.5226977215584937, "learning_rate": 9.693395561640185e-07, "loss": 0.3361, "step": 661 }, { "epoch": 0.36353651839648543, "grad_norm": 0.5608672121095176, "learning_rate": 9.692393345907495e-07, "loss": 0.3167, "step": 662 }, { "epoch": 0.3640856672158155, "grad_norm": 0.5626838918802252, "learning_rate": 9.691389546833655e-07, "loss": 0.3298, "step": 663 }, { "epoch": 0.3646348160351455, "grad_norm": 0.5047475049389516, "learning_rate": 9.690384164757413e-07, "loss": 0.3005, "step": 664 }, { "epoch": 0.3651839648544756, "grad_norm": 0.3981343843323678, "learning_rate": 9.689377200018044e-07, "loss": 0.3219, "step": 665 }, { "epoch": 0.3657331136738056, "grad_norm": 0.44119825489979947, "learning_rate": 9.688368652955367e-07, "loss": 0.2966, "step": 666 }, { "epoch": 0.3662822624931356, "grad_norm": 0.5118289344134571, "learning_rate": 9.687358523909724e-07, "loss": 0.3063, "step": 667 }, { "epoch": 0.36683141131246566, "grad_norm": 0.39175252246613007, "learning_rate": 9.686346813222e-07, "loss": 0.3028, "step": 668 }, { "epoch": 0.3673805601317957, "grad_norm": 0.40019068400355734, "learning_rate": 9.685333521233608e-07, "loss": 0.2826, "step": 669 }, { "epoch": 0.36792970895112576, "grad_norm": 0.37582466249933383, "learning_rate": 9.6843186482865e-07, "loss": 0.3016, "step": 670 }, { "epoch": 0.3684788577704558, "grad_norm": 0.5268600748409547, "learning_rate": 9.683302194723155e-07, "loss": 0.3324, "step": 671 }, { "epoch": 0.36902800658978585, "grad_norm": 0.5673301016874637, "learning_rate": 9.68228416088659e-07, "loss": 0.2707, "step": 672 }, { "epoch": 0.36957715540911584, "grad_norm": 0.45456325584425716, "learning_rate": 9.68126454712035e-07, "loss": 0.2942, "step": 673 }, { "epoch": 0.3701263042284459, "grad_norm": 0.5007265572559167, "learning_rate": 9.680243353768525e-07, "loss": 0.292, "step": 674 }, { "epoch": 0.37067545304777594, "grad_norm": 0.46899432742924696, "learning_rate": 9.679220581175725e-07, "loss": 0.3011, "step": 675 }, { "epoch": 0.371224601867106, "grad_norm": 0.49659269295043473, "learning_rate": 9.678196229687098e-07, "loss": 0.3087, "step": 676 }, { "epoch": 0.37177375068643603, "grad_norm": 0.37607093840017963, "learning_rate": 9.677170299648325e-07, "loss": 0.3142, "step": 677 }, { "epoch": 0.3723228995057661, "grad_norm": 0.4646101284343405, "learning_rate": 9.67614279140562e-07, "loss": 0.3097, "step": 678 }, { "epoch": 0.37287204832509613, "grad_norm": 0.4163066954353121, "learning_rate": 9.675113705305732e-07, "loss": 0.3203, "step": 679 }, { "epoch": 0.3734211971444261, "grad_norm": 0.5328637350114481, "learning_rate": 9.674083041695935e-07, "loss": 0.2683, "step": 680 }, { "epoch": 0.37397034596375617, "grad_norm": 0.5380563053012897, "learning_rate": 9.673050800924044e-07, "loss": 0.2924, "step": 681 }, { "epoch": 0.3745194947830862, "grad_norm": 0.5125953775604254, "learning_rate": 9.672016983338397e-07, "loss": 0.2721, "step": 682 }, { "epoch": 0.37506864360241626, "grad_norm": 0.5062247634748299, "learning_rate": 9.670981589287874e-07, "loss": 0.2848, "step": 683 }, { "epoch": 0.3756177924217463, "grad_norm": 0.4022672964926895, "learning_rate": 9.669944619121884e-07, "loss": 0.2706, "step": 684 }, { "epoch": 0.37616694124107636, "grad_norm": 0.4973441997825326, "learning_rate": 9.668906073190357e-07, "loss": 0.2799, "step": 685 }, { "epoch": 0.37671609006040635, "grad_norm": 0.5102836144006504, "learning_rate": 9.667865951843774e-07, "loss": 0.277, "step": 686 }, { "epoch": 0.3772652388797364, "grad_norm": 0.48942983713029503, "learning_rate": 9.666824255433135e-07, "loss": 0.3191, "step": 687 }, { "epoch": 0.37781438769906645, "grad_norm": 0.9668850905299259, "learning_rate": 9.665780984309974e-07, "loss": 0.3065, "step": 688 }, { "epoch": 0.3783635365183965, "grad_norm": 0.5647393299084974, "learning_rate": 9.66473613882635e-07, "loss": 0.3321, "step": 689 }, { "epoch": 0.37891268533772654, "grad_norm": 0.5229824452045684, "learning_rate": 9.66368971933487e-07, "loss": 0.3316, "step": 690 }, { "epoch": 0.3794618341570566, "grad_norm": 0.49594910353265875, "learning_rate": 9.662641726188658e-07, "loss": 0.2816, "step": 691 }, { "epoch": 0.3800109829763866, "grad_norm": 0.4255955812380013, "learning_rate": 9.661592159741372e-07, "loss": 0.3278, "step": 692 }, { "epoch": 0.3805601317957166, "grad_norm": 0.5090976935406557, "learning_rate": 9.6605410203472e-07, "loss": 0.319, "step": 693 }, { "epoch": 0.3811092806150467, "grad_norm": 0.5588716904428634, "learning_rate": 9.659488308360868e-07, "loss": 0.2722, "step": 694 }, { "epoch": 0.3816584294343767, "grad_norm": 0.42069896911120797, "learning_rate": 9.658434024137623e-07, "loss": 0.318, "step": 695 }, { "epoch": 0.38220757825370677, "grad_norm": 0.6003031262008706, "learning_rate": 9.65737816803325e-07, "loss": 0.3144, "step": 696 }, { "epoch": 0.3827567270730368, "grad_norm": 0.42317790502935254, "learning_rate": 9.65632074040406e-07, "loss": 0.3242, "step": 697 }, { "epoch": 0.3833058758923668, "grad_norm": 0.484015090721489, "learning_rate": 9.655261741606898e-07, "loss": 0.3168, "step": 698 }, { "epoch": 0.38385502471169686, "grad_norm": 0.4891502973693406, "learning_rate": 9.654201171999135e-07, "loss": 0.2901, "step": 699 }, { "epoch": 0.3844041735310269, "grad_norm": 0.6789623364281554, "learning_rate": 9.653139031938674e-07, "loss": 0.3461, "step": 700 }, { "epoch": 0.38495332235035695, "grad_norm": 0.510233112331138, "learning_rate": 9.652075321783948e-07, "loss": 0.2606, "step": 701 }, { "epoch": 0.385502471169687, "grad_norm": 0.4339892378573219, "learning_rate": 9.65101004189392e-07, "loss": 0.2764, "step": 702 }, { "epoch": 0.38605161998901705, "grad_norm": 0.44546824152165365, "learning_rate": 9.649943192628088e-07, "loss": 0.303, "step": 703 }, { "epoch": 0.38660076880834704, "grad_norm": 0.5077409772487453, "learning_rate": 9.648874774346466e-07, "loss": 0.2745, "step": 704 }, { "epoch": 0.3871499176276771, "grad_norm": 0.5351817859128496, "learning_rate": 9.647804787409609e-07, "loss": 0.3215, "step": 705 }, { "epoch": 0.38769906644700713, "grad_norm": 0.6654375581180654, "learning_rate": 9.646733232178602e-07, "loss": 0.3005, "step": 706 }, { "epoch": 0.3882482152663372, "grad_norm": 0.47479590504591296, "learning_rate": 9.64566010901505e-07, "loss": 0.3026, "step": 707 }, { "epoch": 0.38879736408566723, "grad_norm": 0.45767142869618566, "learning_rate": 9.644585418281095e-07, "loss": 0.3234, "step": 708 }, { "epoch": 0.3893465129049973, "grad_norm": 0.5351000334146468, "learning_rate": 9.643509160339405e-07, "loss": 0.3178, "step": 709 }, { "epoch": 0.38989566172432727, "grad_norm": 0.5491196154093239, "learning_rate": 9.642431335553179e-07, "loss": 0.3036, "step": 710 }, { "epoch": 0.3904448105436573, "grad_norm": 0.4953373720579792, "learning_rate": 9.641351944286141e-07, "loss": 0.309, "step": 711 }, { "epoch": 0.39099395936298736, "grad_norm": 0.529089999932197, "learning_rate": 9.640270986902546e-07, "loss": 0.2937, "step": 712 }, { "epoch": 0.3915431081823174, "grad_norm": 0.5021246446188063, "learning_rate": 9.639188463767179e-07, "loss": 0.2891, "step": 713 }, { "epoch": 0.39209225700164746, "grad_norm": 0.509633641940779, "learning_rate": 9.638104375245352e-07, "loss": 0.2892, "step": 714 }, { "epoch": 0.3926414058209775, "grad_norm": 0.4862639914462522, "learning_rate": 9.6370187217029e-07, "loss": 0.3117, "step": 715 }, { "epoch": 0.3931905546403075, "grad_norm": 0.446018700976694, "learning_rate": 9.635931503506197e-07, "loss": 0.2978, "step": 716 }, { "epoch": 0.39373970345963755, "grad_norm": 0.539543136781025, "learning_rate": 9.634842721022135e-07, "loss": 0.3292, "step": 717 }, { "epoch": 0.3942888522789676, "grad_norm": 0.49828782521512316, "learning_rate": 9.63375237461814e-07, "loss": 0.2967, "step": 718 }, { "epoch": 0.39483800109829764, "grad_norm": 0.480059502587545, "learning_rate": 9.632660464662165e-07, "loss": 0.3162, "step": 719 }, { "epoch": 0.3953871499176277, "grad_norm": 0.43445689864749637, "learning_rate": 9.631566991522687e-07, "loss": 0.2881, "step": 720 }, { "epoch": 0.39593629873695774, "grad_norm": 0.4342381150865927, "learning_rate": 9.630471955568714e-07, "loss": 0.2808, "step": 721 }, { "epoch": 0.39648544755628773, "grad_norm": 0.4536683984099135, "learning_rate": 9.629375357169778e-07, "loss": 0.2769, "step": 722 }, { "epoch": 0.3970345963756178, "grad_norm": 0.4446486023148765, "learning_rate": 9.628277196695944e-07, "loss": 0.2648, "step": 723 }, { "epoch": 0.3975837451949478, "grad_norm": 0.5346333487867275, "learning_rate": 9.627177474517799e-07, "loss": 0.3142, "step": 724 }, { "epoch": 0.39813289401427787, "grad_norm": 0.5358051956974806, "learning_rate": 9.62607619100646e-07, "loss": 0.3208, "step": 725 }, { "epoch": 0.3986820428336079, "grad_norm": 0.5732398102241596, "learning_rate": 9.624973346533567e-07, "loss": 0.323, "step": 726 }, { "epoch": 0.39923119165293797, "grad_norm": 0.48547846323551097, "learning_rate": 9.623868941471292e-07, "loss": 0.3343, "step": 727 }, { "epoch": 0.39978034047226796, "grad_norm": 1.2687221970572045, "learning_rate": 9.62276297619233e-07, "loss": 0.4866, "step": 728 }, { "epoch": 0.400329489291598, "grad_norm": 0.48601698432633017, "learning_rate": 9.621655451069901e-07, "loss": 0.281, "step": 729 }, { "epoch": 0.40087863811092805, "grad_norm": 0.6849417587452975, "learning_rate": 9.620546366477761e-07, "loss": 0.2849, "step": 730 }, { "epoch": 0.4014277869302581, "grad_norm": 0.4816077181452332, "learning_rate": 9.619435722790179e-07, "loss": 0.2682, "step": 731 }, { "epoch": 0.40197693574958815, "grad_norm": 0.4787268819031274, "learning_rate": 9.618323520381958e-07, "loss": 0.2727, "step": 732 }, { "epoch": 0.4025260845689182, "grad_norm": 0.5491644244679555, "learning_rate": 9.617209759628423e-07, "loss": 0.3453, "step": 733 }, { "epoch": 0.4030752333882482, "grad_norm": 0.512411114839357, "learning_rate": 9.61609444090543e-07, "loss": 0.3192, "step": 734 }, { "epoch": 0.40362438220757824, "grad_norm": 0.6185218965870894, "learning_rate": 9.61497756458936e-07, "loss": 0.3049, "step": 735 }, { "epoch": 0.4041735310269083, "grad_norm": 0.568231175264156, "learning_rate": 9.613859131057113e-07, "loss": 0.3052, "step": 736 }, { "epoch": 0.40472267984623833, "grad_norm": 0.6400632494660251, "learning_rate": 9.612739140686123e-07, "loss": 0.3194, "step": 737 }, { "epoch": 0.4052718286655684, "grad_norm": 1.0806023635161741, "learning_rate": 9.611617593854342e-07, "loss": 0.3057, "step": 738 }, { "epoch": 0.4058209774848984, "grad_norm": 0.6063220373539205, "learning_rate": 9.610494490940252e-07, "loss": 0.3173, "step": 739 }, { "epoch": 0.4063701263042284, "grad_norm": 0.4843971420219079, "learning_rate": 9.609369832322859e-07, "loss": 0.2956, "step": 740 }, { "epoch": 0.40691927512355847, "grad_norm": 0.4905855464007312, "learning_rate": 9.608243618381695e-07, "loss": 0.2789, "step": 741 }, { "epoch": 0.4074684239428885, "grad_norm": 0.7025421677093987, "learning_rate": 9.607115849496815e-07, "loss": 0.2977, "step": 742 }, { "epoch": 0.40801757276221856, "grad_norm": 0.5256562409723281, "learning_rate": 9.605986526048801e-07, "loss": 0.3047, "step": 743 }, { "epoch": 0.4085667215815486, "grad_norm": 0.4493202041470104, "learning_rate": 9.604855648418757e-07, "loss": 0.3122, "step": 744 }, { "epoch": 0.40911587040087866, "grad_norm": 0.4457684561463568, "learning_rate": 9.603723216988308e-07, "loss": 0.2819, "step": 745 }, { "epoch": 0.4096650192202087, "grad_norm": 0.3818819244872842, "learning_rate": 9.602589232139615e-07, "loss": 0.3033, "step": 746 }, { "epoch": 0.4102141680395387, "grad_norm": 0.5439363449127944, "learning_rate": 9.60145369425535e-07, "loss": 0.2589, "step": 747 }, { "epoch": 0.41076331685886874, "grad_norm": 0.6063742306547529, "learning_rate": 9.60031660371872e-07, "loss": 0.3205, "step": 748 }, { "epoch": 0.4113124656781988, "grad_norm": 0.4810223155119117, "learning_rate": 9.599177960913448e-07, "loss": 0.3043, "step": 749 }, { "epoch": 0.41186161449752884, "grad_norm": 0.6567690969443539, "learning_rate": 9.598037766223787e-07, "loss": 0.3811, "step": 750 }, { "epoch": 0.4124107633168589, "grad_norm": 0.396378898797153, "learning_rate": 9.596896020034507e-07, "loss": 0.2467, "step": 751 }, { "epoch": 0.41295991213618893, "grad_norm": 0.4171907276326462, "learning_rate": 9.595752722730908e-07, "loss": 0.2909, "step": 752 }, { "epoch": 0.4135090609555189, "grad_norm": 0.4366953949450168, "learning_rate": 9.594607874698812e-07, "loss": 0.2808, "step": 753 }, { "epoch": 0.414058209774849, "grad_norm": 0.5098541784738473, "learning_rate": 9.593461476324559e-07, "loss": 0.2941, "step": 754 }, { "epoch": 0.414607358594179, "grad_norm": 0.5771167137620298, "learning_rate": 9.592313527995018e-07, "loss": 0.3404, "step": 755 }, { "epoch": 0.41515650741350907, "grad_norm": 0.4122083489176511, "learning_rate": 9.59116403009758e-07, "loss": 0.3289, "step": 756 }, { "epoch": 0.4157056562328391, "grad_norm": 0.576141910264704, "learning_rate": 9.590012983020156e-07, "loss": 0.3254, "step": 757 }, { "epoch": 0.41625480505216916, "grad_norm": 0.4156118943499862, "learning_rate": 9.588860387151186e-07, "loss": 0.303, "step": 758 }, { "epoch": 0.41680395387149916, "grad_norm": 0.4149677926920236, "learning_rate": 9.587706242879626e-07, "loss": 0.2428, "step": 759 }, { "epoch": 0.4173531026908292, "grad_norm": 0.46849318941518386, "learning_rate": 9.586550550594957e-07, "loss": 0.3186, "step": 760 }, { "epoch": 0.41790225151015925, "grad_norm": 0.577918250208169, "learning_rate": 9.585393310687184e-07, "loss": 0.3931, "step": 761 }, { "epoch": 0.4184514003294893, "grad_norm": 0.4835356500718274, "learning_rate": 9.58423452354683e-07, "loss": 0.2647, "step": 762 }, { "epoch": 0.41900054914881935, "grad_norm": 0.7029114169052276, "learning_rate": 9.583074189564946e-07, "loss": 0.2908, "step": 763 }, { "epoch": 0.4195496979681494, "grad_norm": 0.4504448045494503, "learning_rate": 9.5819123091331e-07, "loss": 0.2758, "step": 764 }, { "epoch": 0.4200988467874794, "grad_norm": 0.4329513902572511, "learning_rate": 9.580748882643386e-07, "loss": 0.3147, "step": 765 }, { "epoch": 0.42064799560680943, "grad_norm": 0.44771741448741187, "learning_rate": 9.579583910488415e-07, "loss": 0.2814, "step": 766 }, { "epoch": 0.4211971444261395, "grad_norm": 0.46685480671831064, "learning_rate": 9.578417393061326e-07, "loss": 0.3293, "step": 767 }, { "epoch": 0.42174629324546953, "grad_norm": 0.48456073285859785, "learning_rate": 9.57724933075577e-07, "loss": 0.3201, "step": 768 }, { "epoch": 0.4222954420647996, "grad_norm": 0.4133689638287976, "learning_rate": 9.57607972396593e-07, "loss": 0.3207, "step": 769 }, { "epoch": 0.4228445908841296, "grad_norm": 0.5014614687785577, "learning_rate": 9.5749085730865e-07, "loss": 0.3088, "step": 770 }, { "epoch": 0.4233937397034596, "grad_norm": 0.4207848646541183, "learning_rate": 9.573735878512708e-07, "loss": 0.2839, "step": 771 }, { "epoch": 0.42394288852278966, "grad_norm": 0.36226359304686095, "learning_rate": 9.572561640640286e-07, "loss": 0.285, "step": 772 }, { "epoch": 0.4244920373421197, "grad_norm": 0.5327078794794589, "learning_rate": 9.571385859865505e-07, "loss": 0.2801, "step": 773 }, { "epoch": 0.42504118616144976, "grad_norm": 0.5065927036765948, "learning_rate": 9.57020853658514e-07, "loss": 0.3097, "step": 774 }, { "epoch": 0.4255903349807798, "grad_norm": 0.5327545308558694, "learning_rate": 9.5690296711965e-07, "loss": 0.3258, "step": 775 }, { "epoch": 0.42613948380010985, "grad_norm": 0.41681155903373474, "learning_rate": 9.567849264097408e-07, "loss": 0.3346, "step": 776 }, { "epoch": 0.42668863261943984, "grad_norm": 0.41704633743883057, "learning_rate": 9.566667315686204e-07, "loss": 0.2989, "step": 777 }, { "epoch": 0.4272377814387699, "grad_norm": 0.5319763283008776, "learning_rate": 9.565483826361754e-07, "loss": 0.2987, "step": 778 }, { "epoch": 0.42778693025809994, "grad_norm": 0.4842008761674137, "learning_rate": 9.564298796523443e-07, "loss": 0.3076, "step": 779 }, { "epoch": 0.42833607907743, "grad_norm": 0.45192872305885196, "learning_rate": 9.563112226571173e-07, "loss": 0.2882, "step": 780 }, { "epoch": 0.42888522789676004, "grad_norm": 0.41908768948092573, "learning_rate": 9.56192411690537e-07, "loss": 0.2961, "step": 781 }, { "epoch": 0.4294343767160901, "grad_norm": 0.5179482366434025, "learning_rate": 9.560734467926975e-07, "loss": 0.3037, "step": 782 }, { "epoch": 0.4299835255354201, "grad_norm": 0.4916582966779928, "learning_rate": 9.559543280037453e-07, "loss": 0.29, "step": 783 }, { "epoch": 0.4305326743547501, "grad_norm": 0.5700432679096007, "learning_rate": 9.558350553638786e-07, "loss": 0.294, "step": 784 }, { "epoch": 0.43108182317408017, "grad_norm": 0.5839314684230201, "learning_rate": 9.557156289133473e-07, "loss": 0.2831, "step": 785 }, { "epoch": 0.4316309719934102, "grad_norm": 0.48557340618273365, "learning_rate": 9.555960486924535e-07, "loss": 0.3188, "step": 786 }, { "epoch": 0.43218012081274026, "grad_norm": 0.4929418875926838, "learning_rate": 9.554763147415511e-07, "loss": 0.3325, "step": 787 }, { "epoch": 0.4327292696320703, "grad_norm": 0.3809242223425811, "learning_rate": 9.553564271010462e-07, "loss": 0.282, "step": 788 }, { "epoch": 0.4332784184514003, "grad_norm": 0.45802986076725805, "learning_rate": 9.55236385811396e-07, "loss": 0.2655, "step": 789 }, { "epoch": 0.43382756727073035, "grad_norm": 0.4024472187473021, "learning_rate": 9.551161909131102e-07, "loss": 0.302, "step": 790 }, { "epoch": 0.4343767160900604, "grad_norm": 0.4529326437615863, "learning_rate": 9.549958424467503e-07, "loss": 0.3127, "step": 791 }, { "epoch": 0.43492586490939045, "grad_norm": 0.5094681766059965, "learning_rate": 9.548753404529291e-07, "loss": 0.2948, "step": 792 }, { "epoch": 0.4354750137287205, "grad_norm": 0.4079607057950661, "learning_rate": 9.547546849723122e-07, "loss": 0.2973, "step": 793 }, { "epoch": 0.43602416254805054, "grad_norm": 0.4843818222476831, "learning_rate": 9.546338760456157e-07, "loss": 0.29, "step": 794 }, { "epoch": 0.43657331136738053, "grad_norm": 0.6200251252974922, "learning_rate": 9.545129137136088e-07, "loss": 0.326, "step": 795 }, { "epoch": 0.4371224601867106, "grad_norm": 0.5148061588557831, "learning_rate": 9.543917980171111e-07, "loss": 0.2919, "step": 796 }, { "epoch": 0.43767160900604063, "grad_norm": 0.4647719726184225, "learning_rate": 9.542705289969954e-07, "loss": 0.2861, "step": 797 }, { "epoch": 0.4382207578253707, "grad_norm": 0.5207409513182187, "learning_rate": 9.541491066941852e-07, "loss": 0.3313, "step": 798 }, { "epoch": 0.4387699066447007, "grad_norm": 0.3997787911557919, "learning_rate": 9.54027531149656e-07, "loss": 0.2881, "step": 799 }, { "epoch": 0.43931905546403077, "grad_norm": 0.6069803871029724, "learning_rate": 9.539058024044351e-07, "loss": 0.2835, "step": 800 }, { "epoch": 0.43931905546403077, "eval_loss": 0.3838886320590973, "eval_runtime": 20.625, "eval_samples_per_second": 21.479, "eval_steps_per_second": 0.921, "step": 800 }, { "epoch": 0.43986820428336076, "grad_norm": 0.5217888186382518, "learning_rate": 9.537839204996016e-07, "loss": 0.3011, "step": 801 }, { "epoch": 0.4404173531026908, "grad_norm": 0.43913261925358565, "learning_rate": 9.53661885476286e-07, "loss": 0.2978, "step": 802 }, { "epoch": 0.44096650192202086, "grad_norm": 0.5022079288969393, "learning_rate": 9.535396973756706e-07, "loss": 0.278, "step": 803 }, { "epoch": 0.4415156507413509, "grad_norm": 0.35775508016281304, "learning_rate": 9.534173562389896e-07, "loss": 0.3043, "step": 804 }, { "epoch": 0.44206479956068095, "grad_norm": 0.4096278279566925, "learning_rate": 9.532948621075284e-07, "loss": 0.3128, "step": 805 }, { "epoch": 0.442613948380011, "grad_norm": 0.4079260738482481, "learning_rate": 9.531722150226246e-07, "loss": 0.2912, "step": 806 }, { "epoch": 0.443163097199341, "grad_norm": 0.4919102979709681, "learning_rate": 9.530494150256666e-07, "loss": 0.3205, "step": 807 }, { "epoch": 0.44371224601867104, "grad_norm": 0.4988189752462686, "learning_rate": 9.529264621580951e-07, "loss": 0.2765, "step": 808 }, { "epoch": 0.4442613948380011, "grad_norm": 0.6162969675302271, "learning_rate": 9.528033564614021e-07, "loss": 0.3293, "step": 809 }, { "epoch": 0.44481054365733114, "grad_norm": 0.4048799307883794, "learning_rate": 9.526800979771314e-07, "loss": 0.3248, "step": 810 }, { "epoch": 0.4453596924766612, "grad_norm": 0.5073837004555553, "learning_rate": 9.525566867468781e-07, "loss": 0.2901, "step": 811 }, { "epoch": 0.44590884129599123, "grad_norm": 0.5405585042735882, "learning_rate": 9.524331228122888e-07, "loss": 0.3353, "step": 812 }, { "epoch": 0.4464579901153213, "grad_norm": 0.620403327298711, "learning_rate": 9.523094062150621e-07, "loss": 0.284, "step": 813 }, { "epoch": 0.44700713893465127, "grad_norm": 0.5736180586292634, "learning_rate": 9.521855369969475e-07, "loss": 0.333, "step": 814 }, { "epoch": 0.4475562877539813, "grad_norm": 0.510605890936949, "learning_rate": 9.520615151997465e-07, "loss": 0.2819, "step": 815 }, { "epoch": 0.44810543657331137, "grad_norm": 0.49143475086367644, "learning_rate": 9.519373408653117e-07, "loss": 0.2391, "step": 816 }, { "epoch": 0.4486545853926414, "grad_norm": 0.45201489835598707, "learning_rate": 9.518130140355475e-07, "loss": 0.2885, "step": 817 }, { "epoch": 0.44920373421197146, "grad_norm": 0.573725064161955, "learning_rate": 9.516885347524095e-07, "loss": 0.2839, "step": 818 }, { "epoch": 0.4497528830313015, "grad_norm": 0.47386243797489513, "learning_rate": 9.51563903057905e-07, "loss": 0.2696, "step": 819 }, { "epoch": 0.4503020318506315, "grad_norm": 0.47205463916067886, "learning_rate": 9.514391189940926e-07, "loss": 0.2523, "step": 820 }, { "epoch": 0.45085118066996155, "grad_norm": 0.45044665068911605, "learning_rate": 9.513141826030823e-07, "loss": 0.3123, "step": 821 }, { "epoch": 0.4514003294892916, "grad_norm": 0.4158611609735077, "learning_rate": 9.511890939270353e-07, "loss": 0.2925, "step": 822 }, { "epoch": 0.45194947830862164, "grad_norm": 0.5223967234947888, "learning_rate": 9.510638530081648e-07, "loss": 0.2975, "step": 823 }, { "epoch": 0.4524986271279517, "grad_norm": 0.5843315058304418, "learning_rate": 9.509384598887347e-07, "loss": 0.2739, "step": 824 }, { "epoch": 0.45304777594728174, "grad_norm": 0.6273015607567209, "learning_rate": 9.50812914611061e-07, "loss": 0.2956, "step": 825 }, { "epoch": 0.45359692476661173, "grad_norm": 0.5026457859686514, "learning_rate": 9.506872172175101e-07, "loss": 0.3232, "step": 826 }, { "epoch": 0.4541460735859418, "grad_norm": 0.5180912442648508, "learning_rate": 9.505613677505003e-07, "loss": 0.297, "step": 827 }, { "epoch": 0.4546952224052718, "grad_norm": 0.45552996393675466, "learning_rate": 9.504353662525014e-07, "loss": 0.2965, "step": 828 }, { "epoch": 0.4552443712246019, "grad_norm": 0.42650174704417904, "learning_rate": 9.503092127660342e-07, "loss": 0.2819, "step": 829 }, { "epoch": 0.4557935200439319, "grad_norm": 0.3817615029385988, "learning_rate": 9.501829073336708e-07, "loss": 0.3229, "step": 830 }, { "epoch": 0.45634266886326197, "grad_norm": 0.493972109103768, "learning_rate": 9.500564499980347e-07, "loss": 0.2791, "step": 831 }, { "epoch": 0.45689181768259196, "grad_norm": 0.4867861449067947, "learning_rate": 9.499298408018004e-07, "loss": 0.3061, "step": 832 }, { "epoch": 0.457440966501922, "grad_norm": 0.41619730088568113, "learning_rate": 9.498030797876939e-07, "loss": 0.254, "step": 833 }, { "epoch": 0.45799011532125206, "grad_norm": 0.536376759166917, "learning_rate": 9.496761669984926e-07, "loss": 0.3137, "step": 834 }, { "epoch": 0.4585392641405821, "grad_norm": 0.46274798693893776, "learning_rate": 9.495491024770246e-07, "loss": 0.2837, "step": 835 }, { "epoch": 0.45908841295991215, "grad_norm": 0.39714810413306906, "learning_rate": 9.494218862661698e-07, "loss": 0.2952, "step": 836 }, { "epoch": 0.4596375617792422, "grad_norm": 0.5090674507672917, "learning_rate": 9.492945184088585e-07, "loss": 0.2747, "step": 837 }, { "epoch": 0.4601867105985722, "grad_norm": 0.452786430103756, "learning_rate": 9.491669989480734e-07, "loss": 0.2996, "step": 838 }, { "epoch": 0.46073585941790224, "grad_norm": 0.6145939049047441, "learning_rate": 9.490393279268469e-07, "loss": 0.3071, "step": 839 }, { "epoch": 0.4612850082372323, "grad_norm": 0.4456318375354344, "learning_rate": 9.489115053882636e-07, "loss": 0.3, "step": 840 }, { "epoch": 0.46183415705656233, "grad_norm": 0.4978485181987157, "learning_rate": 9.487835313754589e-07, "loss": 0.2721, "step": 841 }, { "epoch": 0.4623833058758924, "grad_norm": 0.6289211533363681, "learning_rate": 9.486554059316193e-07, "loss": 0.2631, "step": 842 }, { "epoch": 0.46293245469522243, "grad_norm": 0.4062196482018521, "learning_rate": 9.485271290999822e-07, "loss": 0.2851, "step": 843 }, { "epoch": 0.4634816035145524, "grad_norm": 0.45900569659721774, "learning_rate": 9.483987009238366e-07, "loss": 0.284, "step": 844 }, { "epoch": 0.46403075233388247, "grad_norm": 0.4263527025139981, "learning_rate": 9.482701214465223e-07, "loss": 0.2828, "step": 845 }, { "epoch": 0.4645799011532125, "grad_norm": 0.45434760533612745, "learning_rate": 9.481413907114298e-07, "loss": 0.262, "step": 846 }, { "epoch": 0.46512904997254256, "grad_norm": 0.4728893975674042, "learning_rate": 9.480125087620013e-07, "loss": 0.2823, "step": 847 }, { "epoch": 0.4656781987918726, "grad_norm": 0.5007876177507758, "learning_rate": 9.478834756417297e-07, "loss": 0.3191, "step": 848 }, { "epoch": 0.46622734761120266, "grad_norm": 0.5662662290264328, "learning_rate": 9.477542913941587e-07, "loss": 0.3078, "step": 849 }, { "epoch": 0.46677649643053265, "grad_norm": 0.5068252855750915, "learning_rate": 9.476249560628831e-07, "loss": 0.3392, "step": 850 }, { "epoch": 0.4673256452498627, "grad_norm": 0.4912610553396206, "learning_rate": 9.474954696915494e-07, "loss": 0.3353, "step": 851 }, { "epoch": 0.46787479406919275, "grad_norm": 0.4954243974910444, "learning_rate": 9.473658323238539e-07, "loss": 0.257, "step": 852 }, { "epoch": 0.4684239428885228, "grad_norm": 0.5739821658616234, "learning_rate": 9.472360440035448e-07, "loss": 0.3647, "step": 853 }, { "epoch": 0.46897309170785284, "grad_norm": 0.4583902609163127, "learning_rate": 9.471061047744207e-07, "loss": 0.2645, "step": 854 }, { "epoch": 0.4695222405271829, "grad_norm": 0.45603431744449624, "learning_rate": 9.469760146803315e-07, "loss": 0.2948, "step": 855 }, { "epoch": 0.4700713893465129, "grad_norm": 0.43730155755645916, "learning_rate": 9.468457737651775e-07, "loss": 0.3249, "step": 856 }, { "epoch": 0.4706205381658429, "grad_norm": 0.4450165051325497, "learning_rate": 9.467153820729103e-07, "loss": 0.2847, "step": 857 }, { "epoch": 0.471169686985173, "grad_norm": 0.3873009851367693, "learning_rate": 9.465848396475326e-07, "loss": 0.3554, "step": 858 }, { "epoch": 0.471718835804503, "grad_norm": 0.44400554875399245, "learning_rate": 9.464541465330972e-07, "loss": 0.292, "step": 859 }, { "epoch": 0.47226798462383307, "grad_norm": 0.39940863324033565, "learning_rate": 9.463233027737086e-07, "loss": 0.3036, "step": 860 }, { "epoch": 0.4728171334431631, "grad_norm": 0.4563234904534277, "learning_rate": 9.461923084135215e-07, "loss": 0.2964, "step": 861 }, { "epoch": 0.4733662822624931, "grad_norm": 0.6470415644215131, "learning_rate": 9.460611634967417e-07, "loss": 0.2917, "step": 862 }, { "epoch": 0.47391543108182316, "grad_norm": 0.5910395927439237, "learning_rate": 9.45929868067626e-07, "loss": 0.2868, "step": 863 }, { "epoch": 0.4744645799011532, "grad_norm": 0.7232206855561645, "learning_rate": 9.457984221704815e-07, "loss": 0.3685, "step": 864 }, { "epoch": 0.47501372872048325, "grad_norm": 0.4846117391078469, "learning_rate": 9.456668258496663e-07, "loss": 0.3112, "step": 865 }, { "epoch": 0.4755628775398133, "grad_norm": 0.5525715056574515, "learning_rate": 9.455350791495896e-07, "loss": 0.3139, "step": 866 }, { "epoch": 0.47611202635914335, "grad_norm": 0.7880040011238945, "learning_rate": 9.454031821147109e-07, "loss": 0.2924, "step": 867 }, { "epoch": 0.47666117517847334, "grad_norm": 0.5151840132964554, "learning_rate": 9.452711347895407e-07, "loss": 0.3901, "step": 868 }, { "epoch": 0.4772103239978034, "grad_norm": 0.4040778589954771, "learning_rate": 9.451389372186399e-07, "loss": 0.2988, "step": 869 }, { "epoch": 0.47775947281713343, "grad_norm": 0.4587348256162564, "learning_rate": 9.450065894466205e-07, "loss": 0.3157, "step": 870 }, { "epoch": 0.4783086216364635, "grad_norm": 0.43821097109002627, "learning_rate": 9.448740915181448e-07, "loss": 0.2581, "step": 871 }, { "epoch": 0.47885777045579353, "grad_norm": 0.4609381652044799, "learning_rate": 9.447414434779262e-07, "loss": 0.2566, "step": 872 }, { "epoch": 0.4794069192751236, "grad_norm": 0.561922576075349, "learning_rate": 9.446086453707285e-07, "loss": 0.3319, "step": 873 }, { "epoch": 0.47995606809445357, "grad_norm": 0.46019066930117003, "learning_rate": 9.44475697241366e-07, "loss": 0.238, "step": 874 }, { "epoch": 0.4805052169137836, "grad_norm": 0.40837440613786424, "learning_rate": 9.443425991347038e-07, "loss": 0.3039, "step": 875 }, { "epoch": 0.48105436573311366, "grad_norm": 0.4350413843552751, "learning_rate": 9.442093510956578e-07, "loss": 0.296, "step": 876 }, { "epoch": 0.4816035145524437, "grad_norm": 0.6111074790447228, "learning_rate": 9.440759531691941e-07, "loss": 0.2763, "step": 877 }, { "epoch": 0.48215266337177376, "grad_norm": 0.43519180562596144, "learning_rate": 9.439424054003296e-07, "loss": 0.2751, "step": 878 }, { "epoch": 0.4827018121911038, "grad_norm": 0.4598670249779719, "learning_rate": 9.438087078341321e-07, "loss": 0.2993, "step": 879 }, { "epoch": 0.48325096101043385, "grad_norm": 0.47378295023804284, "learning_rate": 9.436748605157192e-07, "loss": 0.3243, "step": 880 }, { "epoch": 0.48380010982976385, "grad_norm": 0.4461236631512308, "learning_rate": 9.435408634902595e-07, "loss": 0.2682, "step": 881 }, { "epoch": 0.4843492586490939, "grad_norm": 0.7920326725183295, "learning_rate": 9.434067168029721e-07, "loss": 0.2738, "step": 882 }, { "epoch": 0.48489840746842394, "grad_norm": 0.44157805064535044, "learning_rate": 9.432724204991268e-07, "loss": 0.2692, "step": 883 }, { "epoch": 0.485447556287754, "grad_norm": 0.5442212924937905, "learning_rate": 9.431379746240433e-07, "loss": 0.3247, "step": 884 }, { "epoch": 0.48599670510708404, "grad_norm": 0.4581860021645979, "learning_rate": 9.430033792230924e-07, "loss": 0.266, "step": 885 }, { "epoch": 0.4865458539264141, "grad_norm": 0.5049337337515573, "learning_rate": 9.428686343416948e-07, "loss": 0.2949, "step": 886 }, { "epoch": 0.4870950027457441, "grad_norm": 0.45948676235725366, "learning_rate": 9.427337400253222e-07, "loss": 0.246, "step": 887 }, { "epoch": 0.4876441515650741, "grad_norm": 0.4014316553566119, "learning_rate": 9.425986963194964e-07, "loss": 0.3193, "step": 888 }, { "epoch": 0.48819330038440417, "grad_norm": 0.466582939409045, "learning_rate": 9.424635032697897e-07, "loss": 0.299, "step": 889 }, { "epoch": 0.4887424492037342, "grad_norm": 0.45486647734910446, "learning_rate": 9.423281609218244e-07, "loss": 0.3037, "step": 890 }, { "epoch": 0.48929159802306427, "grad_norm": 0.45736700020323157, "learning_rate": 9.421926693212741e-07, "loss": 0.2934, "step": 891 }, { "epoch": 0.4898407468423943, "grad_norm": 0.46611243538903646, "learning_rate": 9.420570285138622e-07, "loss": 0.2685, "step": 892 }, { "epoch": 0.4903898956617243, "grad_norm": 0.6729260072812459, "learning_rate": 9.41921238545362e-07, "loss": 0.3409, "step": 893 }, { "epoch": 0.49093904448105435, "grad_norm": 0.5186531107833405, "learning_rate": 9.417852994615979e-07, "loss": 0.3485, "step": 894 }, { "epoch": 0.4914881933003844, "grad_norm": 0.4523439174163831, "learning_rate": 9.416492113084443e-07, "loss": 0.2894, "step": 895 }, { "epoch": 0.49203734211971445, "grad_norm": 0.5607308198942792, "learning_rate": 9.41512974131826e-07, "loss": 0.2976, "step": 896 }, { "epoch": 0.4925864909390445, "grad_norm": 0.4997124535210658, "learning_rate": 9.413765879777182e-07, "loss": 0.3161, "step": 897 }, { "epoch": 0.49313563975837454, "grad_norm": 0.5930640137470536, "learning_rate": 9.412400528921457e-07, "loss": 0.2949, "step": 898 }, { "epoch": 0.49368478857770454, "grad_norm": 0.40320785859497194, "learning_rate": 9.411033689211843e-07, "loss": 0.2995, "step": 899 }, { "epoch": 0.4942339373970346, "grad_norm": 0.42578648193220026, "learning_rate": 9.4096653611096e-07, "loss": 0.2967, "step": 900 }, { "epoch": 0.49478308621636463, "grad_norm": 0.5942916136732092, "learning_rate": 9.408295545076487e-07, "loss": 0.3156, "step": 901 }, { "epoch": 0.4953322350356947, "grad_norm": 0.5386438289462784, "learning_rate": 9.406924241574767e-07, "loss": 0.3137, "step": 902 }, { "epoch": 0.4958813838550247, "grad_norm": 0.5022407514695376, "learning_rate": 9.405551451067201e-07, "loss": 0.2902, "step": 903 }, { "epoch": 0.4964305326743548, "grad_norm": 0.4962441291250635, "learning_rate": 9.404177174017059e-07, "loss": 0.3033, "step": 904 }, { "epoch": 0.49697968149368477, "grad_norm": 0.49621231233036023, "learning_rate": 9.402801410888109e-07, "loss": 0.2938, "step": 905 }, { "epoch": 0.4975288303130148, "grad_norm": 0.49731826368397647, "learning_rate": 9.401424162144617e-07, "loss": 0.3045, "step": 906 }, { "epoch": 0.49807797913234486, "grad_norm": 0.4166538106214972, "learning_rate": 9.400045428251357e-07, "loss": 0.2894, "step": 907 }, { "epoch": 0.4986271279516749, "grad_norm": 0.4691592328681605, "learning_rate": 9.398665209673601e-07, "loss": 0.2701, "step": 908 }, { "epoch": 0.49917627677100496, "grad_norm": 0.4461402006742555, "learning_rate": 9.39728350687712e-07, "loss": 0.328, "step": 909 }, { "epoch": 0.499725425590335, "grad_norm": 0.5621232251239702, "learning_rate": 9.395900320328187e-07, "loss": 0.2918, "step": 910 }, { "epoch": 0.500274574409665, "grad_norm": 0.5227577695217629, "learning_rate": 9.39451565049358e-07, "loss": 0.2888, "step": 911 }, { "epoch": 0.500823723228995, "grad_norm": 0.39227052219957875, "learning_rate": 9.39312949784057e-07, "loss": 0.3098, "step": 912 }, { "epoch": 0.5013728720483251, "grad_norm": 0.3768420830212145, "learning_rate": 9.391741862836936e-07, "loss": 0.3085, "step": 913 }, { "epoch": 0.5019220208676551, "grad_norm": 0.4619027272530282, "learning_rate": 9.390352745950952e-07, "loss": 0.2615, "step": 914 }, { "epoch": 0.5024711696869851, "grad_norm": 0.5721247327288269, "learning_rate": 9.388962147651392e-07, "loss": 0.3285, "step": 915 }, { "epoch": 0.5030203185063152, "grad_norm": 0.42653088411587964, "learning_rate": 9.387570068407536e-07, "loss": 0.2764, "step": 916 }, { "epoch": 0.5035694673256452, "grad_norm": 0.664657702924952, "learning_rate": 9.386176508689155e-07, "loss": 0.3561, "step": 917 }, { "epoch": 0.5041186161449753, "grad_norm": 0.5253474047699631, "learning_rate": 9.384781468966527e-07, "loss": 0.287, "step": 918 }, { "epoch": 0.5046677649643053, "grad_norm": 0.5336025783006878, "learning_rate": 9.383384949710427e-07, "loss": 0.2593, "step": 919 }, { "epoch": 0.5052169137836353, "grad_norm": 0.3921022334808134, "learning_rate": 9.381986951392127e-07, "loss": 0.2852, "step": 920 }, { "epoch": 0.5057660626029654, "grad_norm": 0.4819111234045801, "learning_rate": 9.380587474483399e-07, "loss": 0.28, "step": 921 }, { "epoch": 0.5063152114222954, "grad_norm": 0.5474448343933163, "learning_rate": 9.379186519456518e-07, "loss": 0.2978, "step": 922 }, { "epoch": 0.5068643602416255, "grad_norm": 0.8522125809139546, "learning_rate": 9.377784086784252e-07, "loss": 0.3143, "step": 923 }, { "epoch": 0.5074135090609555, "grad_norm": 0.4849817257556121, "learning_rate": 9.376380176939871e-07, "loss": 0.2821, "step": 924 }, { "epoch": 0.5079626578802856, "grad_norm": 0.41393907286812087, "learning_rate": 9.374974790397144e-07, "loss": 0.32, "step": 925 }, { "epoch": 0.5085118066996156, "grad_norm": 0.434081630367767, "learning_rate": 9.373567927630336e-07, "loss": 0.2928, "step": 926 }, { "epoch": 0.5090609555189456, "grad_norm": 0.4540361450635598, "learning_rate": 9.372159589114213e-07, "loss": 0.2816, "step": 927 }, { "epoch": 0.5096101043382757, "grad_norm": 0.4516371688883139, "learning_rate": 9.370749775324033e-07, "loss": 0.2847, "step": 928 }, { "epoch": 0.5101592531576057, "grad_norm": 0.6032813006104688, "learning_rate": 9.369338486735562e-07, "loss": 0.2767, "step": 929 }, { "epoch": 0.5107084019769358, "grad_norm": 0.49894381999492476, "learning_rate": 9.367925723825053e-07, "loss": 0.2897, "step": 930 }, { "epoch": 0.5112575507962658, "grad_norm": 0.4572027432433077, "learning_rate": 9.366511487069265e-07, "loss": 0.2908, "step": 931 }, { "epoch": 0.5118066996155958, "grad_norm": 0.46266433746076124, "learning_rate": 9.365095776945451e-07, "loss": 0.2546, "step": 932 }, { "epoch": 0.5123558484349259, "grad_norm": 0.5730402499622492, "learning_rate": 9.363678593931358e-07, "loss": 0.2847, "step": 933 }, { "epoch": 0.5129049972542559, "grad_norm": 0.4967244319281186, "learning_rate": 9.362259938505233e-07, "loss": 0.2771, "step": 934 }, { "epoch": 0.513454146073586, "grad_norm": 0.49356871672781477, "learning_rate": 9.360839811145824e-07, "loss": 0.2581, "step": 935 }, { "epoch": 0.514003294892916, "grad_norm": 0.39402326807548255, "learning_rate": 9.359418212332369e-07, "loss": 0.2736, "step": 936 }, { "epoch": 0.5145524437122461, "grad_norm": 0.5069437138906921, "learning_rate": 9.357995142544604e-07, "loss": 0.2816, "step": 937 }, { "epoch": 0.5151015925315761, "grad_norm": 0.6017536805081614, "learning_rate": 9.356570602262765e-07, "loss": 0.2937, "step": 938 }, { "epoch": 0.515650741350906, "grad_norm": 0.49563430068716785, "learning_rate": 9.355144591967578e-07, "loss": 0.2873, "step": 939 }, { "epoch": 0.5161998901702362, "grad_norm": 0.47672573468420487, "learning_rate": 9.353717112140276e-07, "loss": 0.2742, "step": 940 }, { "epoch": 0.5167490389895661, "grad_norm": 0.5376604255900301, "learning_rate": 9.352288163262575e-07, "loss": 0.2763, "step": 941 }, { "epoch": 0.5172981878088962, "grad_norm": 0.4313417427628322, "learning_rate": 9.350857745816693e-07, "loss": 0.3092, "step": 942 }, { "epoch": 0.5178473366282262, "grad_norm": 0.5700375023421917, "learning_rate": 9.349425860285346e-07, "loss": 0.305, "step": 943 }, { "epoch": 0.5183964854475562, "grad_norm": 0.5325081202379571, "learning_rate": 9.347992507151739e-07, "loss": 0.297, "step": 944 }, { "epoch": 0.5189456342668863, "grad_norm": 0.4698987291310082, "learning_rate": 9.34655768689958e-07, "loss": 0.2883, "step": 945 }, { "epoch": 0.5194947830862163, "grad_norm": 0.6051610218193799, "learning_rate": 9.345121400013067e-07, "loss": 0.2988, "step": 946 }, { "epoch": 0.5200439319055464, "grad_norm": 0.5064449536476456, "learning_rate": 9.343683646976891e-07, "loss": 0.3229, "step": 947 }, { "epoch": 0.5205930807248764, "grad_norm": 0.48144431665426274, "learning_rate": 9.342244428276242e-07, "loss": 0.2403, "step": 948 }, { "epoch": 0.5211422295442065, "grad_norm": 0.4466776162042638, "learning_rate": 9.340803744396804e-07, "loss": 0.3248, "step": 949 }, { "epoch": 0.5216913783635365, "grad_norm": 0.39529463847515395, "learning_rate": 9.339361595824755e-07, "loss": 0.2985, "step": 950 }, { "epoch": 0.5222405271828665, "grad_norm": 0.4980142634653844, "learning_rate": 9.337917983046766e-07, "loss": 0.2635, "step": 951 }, { "epoch": 0.5227896760021966, "grad_norm": 0.5515774761970018, "learning_rate": 9.336472906550005e-07, "loss": 0.2817, "step": 952 }, { "epoch": 0.5233388248215266, "grad_norm": 0.4278604719170521, "learning_rate": 9.335026366822129e-07, "loss": 0.2938, "step": 953 }, { "epoch": 0.5238879736408567, "grad_norm": 0.4656892942206506, "learning_rate": 9.333578364351294e-07, "loss": 0.2975, "step": 954 }, { "epoch": 0.5244371224601867, "grad_norm": 0.4170003565448647, "learning_rate": 9.332128899626148e-07, "loss": 0.3096, "step": 955 }, { "epoch": 0.5249862712795168, "grad_norm": 0.41154133521168373, "learning_rate": 9.330677973135831e-07, "loss": 0.297, "step": 956 }, { "epoch": 0.5255354200988468, "grad_norm": 0.5552848734845941, "learning_rate": 9.329225585369976e-07, "loss": 0.2872, "step": 957 }, { "epoch": 0.5260845689181768, "grad_norm": 0.5017420360248451, "learning_rate": 9.327771736818712e-07, "loss": 0.2718, "step": 958 }, { "epoch": 0.5266337177375069, "grad_norm": 0.48497670923181874, "learning_rate": 9.32631642797266e-07, "loss": 0.3055, "step": 959 }, { "epoch": 0.5271828665568369, "grad_norm": 0.4335856737583551, "learning_rate": 9.324859659322933e-07, "loss": 0.2555, "step": 960 }, { "epoch": 0.527732015376167, "grad_norm": 0.44596133464332316, "learning_rate": 9.323401431361133e-07, "loss": 0.3581, "step": 961 }, { "epoch": 0.528281164195497, "grad_norm": 0.6106061044251618, "learning_rate": 9.321941744579363e-07, "loss": 0.2935, "step": 962 }, { "epoch": 0.528830313014827, "grad_norm": 0.4147544086466501, "learning_rate": 9.320480599470209e-07, "loss": 0.2998, "step": 963 }, { "epoch": 0.5293794618341571, "grad_norm": 0.4794800558341023, "learning_rate": 9.319017996526759e-07, "loss": 0.2754, "step": 964 }, { "epoch": 0.5299286106534871, "grad_norm": 0.44534003351037493, "learning_rate": 9.317553936242583e-07, "loss": 0.2709, "step": 965 }, { "epoch": 0.5304777594728172, "grad_norm": 0.4867035947101275, "learning_rate": 9.31608841911175e-07, "loss": 0.2648, "step": 966 }, { "epoch": 0.5310269082921472, "grad_norm": 0.4209812485651194, "learning_rate": 9.314621445628818e-07, "loss": 0.2547, "step": 967 }, { "epoch": 0.5315760571114773, "grad_norm": 0.4681960942365018, "learning_rate": 9.313153016288834e-07, "loss": 0.2919, "step": 968 }, { "epoch": 0.5321252059308073, "grad_norm": 0.4454053676970525, "learning_rate": 9.311683131587341e-07, "loss": 0.282, "step": 969 }, { "epoch": 0.5326743547501372, "grad_norm": 0.5493578070784273, "learning_rate": 9.310211792020373e-07, "loss": 0.3053, "step": 970 }, { "epoch": 0.5332235035694673, "grad_norm": 0.5390349826204069, "learning_rate": 9.308738998084448e-07, "loss": 0.2845, "step": 971 }, { "epoch": 0.5337726523887973, "grad_norm": 0.3860137685475121, "learning_rate": 9.307264750276581e-07, "loss": 0.3687, "step": 972 }, { "epoch": 0.5343218012081274, "grad_norm": 0.4849612252791877, "learning_rate": 9.305789049094279e-07, "loss": 0.2742, "step": 973 }, { "epoch": 0.5348709500274574, "grad_norm": 0.3957981429710754, "learning_rate": 9.304311895035535e-07, "loss": 0.3143, "step": 974 }, { "epoch": 0.5354200988467874, "grad_norm": 0.523471832019245, "learning_rate": 9.302833288598835e-07, "loss": 0.2877, "step": 975 }, { "epoch": 0.5359692476661175, "grad_norm": 0.4404212152421518, "learning_rate": 9.301353230283152e-07, "loss": 0.2758, "step": 976 }, { "epoch": 0.5365183964854475, "grad_norm": 0.5394204270927017, "learning_rate": 9.299871720587954e-07, "loss": 0.3072, "step": 977 }, { "epoch": 0.5370675453047776, "grad_norm": 0.5823178998925181, "learning_rate": 9.298388760013194e-07, "loss": 0.2785, "step": 978 }, { "epoch": 0.5376166941241076, "grad_norm": 0.4260921009488124, "learning_rate": 9.296904349059318e-07, "loss": 0.2717, "step": 979 }, { "epoch": 0.5381658429434377, "grad_norm": 0.4019948136463715, "learning_rate": 9.295418488227257e-07, "loss": 0.2969, "step": 980 }, { "epoch": 0.5387149917627677, "grad_norm": 0.42484620847494486, "learning_rate": 9.293931178018437e-07, "loss": 0.3319, "step": 981 }, { "epoch": 0.5392641405820977, "grad_norm": 0.49589138464291516, "learning_rate": 9.292442418934771e-07, "loss": 0.3092, "step": 982 }, { "epoch": 0.5398132894014278, "grad_norm": 0.4523297379936676, "learning_rate": 9.290952211478659e-07, "loss": 0.2805, "step": 983 }, { "epoch": 0.5403624382207578, "grad_norm": 0.49903798190139037, "learning_rate": 9.28946055615299e-07, "loss": 0.3103, "step": 984 }, { "epoch": 0.5409115870400879, "grad_norm": 0.44253415856030676, "learning_rate": 9.287967453461146e-07, "loss": 0.2831, "step": 985 }, { "epoch": 0.5414607358594179, "grad_norm": 0.4916236801329548, "learning_rate": 9.28647290390699e-07, "loss": 0.2705, "step": 986 }, { "epoch": 0.5420098846787479, "grad_norm": 0.4326158839163807, "learning_rate": 9.284976907994881e-07, "loss": 0.2763, "step": 987 }, { "epoch": 0.542559033498078, "grad_norm": 0.39425862258101324, "learning_rate": 9.28347946622966e-07, "loss": 0.2584, "step": 988 }, { "epoch": 0.543108182317408, "grad_norm": 0.5147046280713691, "learning_rate": 9.28198057911666e-07, "loss": 0.2574, "step": 989 }, { "epoch": 0.5436573311367381, "grad_norm": 0.41079396351260283, "learning_rate": 9.2804802471617e-07, "loss": 0.2752, "step": 990 }, { "epoch": 0.5442064799560681, "grad_norm": 0.4386781914515836, "learning_rate": 9.278978470871086e-07, "loss": 0.2849, "step": 991 }, { "epoch": 0.5447556287753982, "grad_norm": 0.4376795852515167, "learning_rate": 9.277475250751613e-07, "loss": 0.2851, "step": 992 }, { "epoch": 0.5453047775947282, "grad_norm": 0.5137192521633446, "learning_rate": 9.275970587310562e-07, "loss": 0.3714, "step": 993 }, { "epoch": 0.5458539264140582, "grad_norm": 0.49699132352043934, "learning_rate": 9.274464481055702e-07, "loss": 0.251, "step": 994 }, { "epoch": 0.5464030752333883, "grad_norm": 0.44790179295453575, "learning_rate": 9.272956932495288e-07, "loss": 0.2999, "step": 995 }, { "epoch": 0.5469522240527183, "grad_norm": 0.4719079324155892, "learning_rate": 9.27144794213806e-07, "loss": 0.298, "step": 996 }, { "epoch": 0.5475013728720484, "grad_norm": 0.5108807732449081, "learning_rate": 9.269937510493249e-07, "loss": 0.3093, "step": 997 }, { "epoch": 0.5480505216913784, "grad_norm": 0.6127240020836249, "learning_rate": 9.26842563807057e-07, "loss": 0.2873, "step": 998 }, { "epoch": 0.5485996705107083, "grad_norm": 0.48710113873037447, "learning_rate": 9.266912325380225e-07, "loss": 0.2569, "step": 999 }, { "epoch": 0.5491488193300385, "grad_norm": 0.5054369705066603, "learning_rate": 9.2653975729329e-07, "loss": 0.333, "step": 1000 }, { "epoch": 0.5491488193300385, "eval_loss": 0.372631311416626, "eval_runtime": 18.5946, "eval_samples_per_second": 23.824, "eval_steps_per_second": 1.022, "step": 1000 }, { "epoch": 0.5496979681493684, "grad_norm": 0.5563534486868508, "learning_rate": 9.263881381239767e-07, "loss": 0.3179, "step": 1001 }, { "epoch": 0.5502471169686985, "grad_norm": 0.8165424003556195, "learning_rate": 9.262363750812487e-07, "loss": 0.3733, "step": 1002 }, { "epoch": 0.5507962657880285, "grad_norm": 0.45455738506506776, "learning_rate": 9.260844682163204e-07, "loss": 0.2961, "step": 1003 }, { "epoch": 0.5513454146073586, "grad_norm": 0.47059950329038075, "learning_rate": 9.259324175804547e-07, "loss": 0.2794, "step": 1004 }, { "epoch": 0.5518945634266886, "grad_norm": 0.39990353842343673, "learning_rate": 9.25780223224963e-07, "loss": 0.2729, "step": 1005 }, { "epoch": 0.5524437122460186, "grad_norm": 0.5200228849861063, "learning_rate": 9.256278852012054e-07, "loss": 0.2943, "step": 1006 }, { "epoch": 0.5529928610653487, "grad_norm": 0.4439072868789634, "learning_rate": 9.254754035605905e-07, "loss": 0.2851, "step": 1007 }, { "epoch": 0.5535420098846787, "grad_norm": 0.4409853570927373, "learning_rate": 9.253227783545751e-07, "loss": 0.2854, "step": 1008 }, { "epoch": 0.5540911587040088, "grad_norm": 0.45284311556082324, "learning_rate": 9.251700096346644e-07, "loss": 0.315, "step": 1009 }, { "epoch": 0.5546403075233388, "grad_norm": 0.39707003431281584, "learning_rate": 9.250170974524126e-07, "loss": 0.299, "step": 1010 }, { "epoch": 0.5551894563426688, "grad_norm": 0.4362985555384377, "learning_rate": 9.248640418594217e-07, "loss": 0.2665, "step": 1011 }, { "epoch": 0.5557386051619989, "grad_norm": 0.44207446986588456, "learning_rate": 9.247108429073423e-07, "loss": 0.2791, "step": 1012 }, { "epoch": 0.5562877539813289, "grad_norm": 0.420943626873848, "learning_rate": 9.245575006478735e-07, "loss": 0.25, "step": 1013 }, { "epoch": 0.556836902800659, "grad_norm": 0.48573056151952415, "learning_rate": 9.244040151327625e-07, "loss": 0.2608, "step": 1014 }, { "epoch": 0.557386051619989, "grad_norm": 0.4342806958191244, "learning_rate": 9.242503864138052e-07, "loss": 0.2636, "step": 1015 }, { "epoch": 0.5579352004393191, "grad_norm": 0.6293301005515782, "learning_rate": 9.240966145428457e-07, "loss": 0.2713, "step": 1016 }, { "epoch": 0.5584843492586491, "grad_norm": 0.5747765499108737, "learning_rate": 9.239426995717761e-07, "loss": 0.349, "step": 1017 }, { "epoch": 0.5590334980779791, "grad_norm": 0.5482605833191367, "learning_rate": 9.237886415525372e-07, "loss": 0.2673, "step": 1018 }, { "epoch": 0.5595826468973092, "grad_norm": 0.4409608254364869, "learning_rate": 9.236344405371177e-07, "loss": 0.2967, "step": 1019 }, { "epoch": 0.5601317957166392, "grad_norm": 0.40965160928332, "learning_rate": 9.23480096577555e-07, "loss": 0.2837, "step": 1020 }, { "epoch": 0.5606809445359693, "grad_norm": 0.5517526784895708, "learning_rate": 9.233256097259343e-07, "loss": 0.3186, "step": 1021 }, { "epoch": 0.5612300933552993, "grad_norm": 0.5093156398528925, "learning_rate": 9.231709800343895e-07, "loss": 0.3348, "step": 1022 }, { "epoch": 0.5617792421746294, "grad_norm": 0.6554873163717477, "learning_rate": 9.230162075551021e-07, "loss": 0.3147, "step": 1023 }, { "epoch": 0.5623283909939594, "grad_norm": 0.4039473778050129, "learning_rate": 9.228612923403022e-07, "loss": 0.2834, "step": 1024 }, { "epoch": 0.5628775398132894, "grad_norm": 0.4666790654415315, "learning_rate": 9.227062344422682e-07, "loss": 0.2667, "step": 1025 }, { "epoch": 0.5634266886326195, "grad_norm": 0.4435666165995333, "learning_rate": 9.225510339133262e-07, "loss": 0.3111, "step": 1026 }, { "epoch": 0.5639758374519495, "grad_norm": 0.5154742963928248, "learning_rate": 9.223956908058508e-07, "loss": 0.3089, "step": 1027 }, { "epoch": 0.5645249862712796, "grad_norm": 0.42878577706892507, "learning_rate": 9.222402051722642e-07, "loss": 0.2542, "step": 1028 }, { "epoch": 0.5650741350906096, "grad_norm": 0.5269383382001943, "learning_rate": 9.220845770650377e-07, "loss": 0.2752, "step": 1029 }, { "epoch": 0.5656232839099395, "grad_norm": 0.4495018623801755, "learning_rate": 9.219288065366896e-07, "loss": 0.2791, "step": 1030 }, { "epoch": 0.5661724327292696, "grad_norm": 0.5065307592932909, "learning_rate": 9.217728936397868e-07, "loss": 0.3145, "step": 1031 }, { "epoch": 0.5667215815485996, "grad_norm": 0.4391342148525691, "learning_rate": 9.216168384269443e-07, "loss": 0.2635, "step": 1032 }, { "epoch": 0.5672707303679297, "grad_norm": 0.5479487923582129, "learning_rate": 9.214606409508248e-07, "loss": 0.2907, "step": 1033 }, { "epoch": 0.5678198791872597, "grad_norm": 0.46579473113214453, "learning_rate": 9.213043012641393e-07, "loss": 0.2838, "step": 1034 }, { "epoch": 0.5683690280065898, "grad_norm": 0.44503717521250913, "learning_rate": 9.211478194196466e-07, "loss": 0.2758, "step": 1035 }, { "epoch": 0.5689181768259198, "grad_norm": 1.0394730117520932, "learning_rate": 9.209911954701537e-07, "loss": 0.4999, "step": 1036 }, { "epoch": 0.5694673256452498, "grad_norm": 0.3933071990766414, "learning_rate": 9.208344294685153e-07, "loss": 0.2714, "step": 1037 }, { "epoch": 0.5700164744645799, "grad_norm": 0.47083533744879774, "learning_rate": 9.206775214676342e-07, "loss": 0.2966, "step": 1038 }, { "epoch": 0.5705656232839099, "grad_norm": 0.41288928208193654, "learning_rate": 9.20520471520461e-07, "loss": 0.2672, "step": 1039 }, { "epoch": 0.57111477210324, "grad_norm": 0.4867781728778002, "learning_rate": 9.203632796799943e-07, "loss": 0.2913, "step": 1040 }, { "epoch": 0.57166392092257, "grad_norm": 0.4596785861988866, "learning_rate": 9.202059459992808e-07, "loss": 0.2471, "step": 1041 }, { "epoch": 0.5722130697419, "grad_norm": 0.5155292158510203, "learning_rate": 9.200484705314144e-07, "loss": 0.2834, "step": 1042 }, { "epoch": 0.5727622185612301, "grad_norm": 0.42303242668989033, "learning_rate": 9.198908533295377e-07, "loss": 0.2817, "step": 1043 }, { "epoch": 0.5733113673805601, "grad_norm": 0.5924247458314694, "learning_rate": 9.197330944468401e-07, "loss": 0.3294, "step": 1044 }, { "epoch": 0.5738605161998902, "grad_norm": 0.7890449041973231, "learning_rate": 9.195751939365601e-07, "loss": 0.3521, "step": 1045 }, { "epoch": 0.5744096650192202, "grad_norm": 0.6401070170444926, "learning_rate": 9.194171518519827e-07, "loss": 0.3124, "step": 1046 }, { "epoch": 0.5749588138385503, "grad_norm": 0.47296090750512837, "learning_rate": 9.192589682464416e-07, "loss": 0.3118, "step": 1047 }, { "epoch": 0.5755079626578803, "grad_norm": 0.5072279632489303, "learning_rate": 9.19100643173318e-07, "loss": 0.2501, "step": 1048 }, { "epoch": 0.5760571114772103, "grad_norm": 0.46312294508566104, "learning_rate": 9.189421766860408e-07, "loss": 0.256, "step": 1049 }, { "epoch": 0.5766062602965404, "grad_norm": 0.44863731340899177, "learning_rate": 9.187835688380864e-07, "loss": 0.2664, "step": 1050 }, { "epoch": 0.5771554091158704, "grad_norm": 0.45904360310654274, "learning_rate": 9.186248196829791e-07, "loss": 0.2644, "step": 1051 }, { "epoch": 0.5777045579352005, "grad_norm": 0.3688717426870435, "learning_rate": 9.184659292742909e-07, "loss": 0.2833, "step": 1052 }, { "epoch": 0.5782537067545305, "grad_norm": 0.488584232027721, "learning_rate": 9.183068976656418e-07, "loss": 0.266, "step": 1053 }, { "epoch": 0.5788028555738605, "grad_norm": 0.7392820483582149, "learning_rate": 9.181477249106986e-07, "loss": 0.2481, "step": 1054 }, { "epoch": 0.5793520043931906, "grad_norm": 0.4856031534132429, "learning_rate": 9.179884110631767e-07, "loss": 0.2941, "step": 1055 }, { "epoch": 0.5799011532125206, "grad_norm": 0.45023080124313786, "learning_rate": 9.178289561768382e-07, "loss": 0.2745, "step": 1056 }, { "epoch": 0.5804503020318507, "grad_norm": 0.5244343852114275, "learning_rate": 9.176693603054937e-07, "loss": 0.2645, "step": 1057 }, { "epoch": 0.5809994508511807, "grad_norm": 0.5355774488113496, "learning_rate": 9.175096235030006e-07, "loss": 0.2561, "step": 1058 }, { "epoch": 0.5815485996705108, "grad_norm": 0.4205549619981094, "learning_rate": 9.173497458232644e-07, "loss": 0.2708, "step": 1059 }, { "epoch": 0.5820977484898407, "grad_norm": 0.5046767029446393, "learning_rate": 9.171897273202379e-07, "loss": 0.2809, "step": 1060 }, { "epoch": 0.5826468973091707, "grad_norm": 0.4208047571407197, "learning_rate": 9.170295680479212e-07, "loss": 0.2365, "step": 1061 }, { "epoch": 0.5831960461285008, "grad_norm": 0.563195578366495, "learning_rate": 9.168692680603625e-07, "loss": 0.3115, "step": 1062 }, { "epoch": 0.5837451949478308, "grad_norm": 0.4229304487438865, "learning_rate": 9.167088274116569e-07, "loss": 0.3147, "step": 1063 }, { "epoch": 0.5842943437671609, "grad_norm": 0.49847873535309245, "learning_rate": 9.165482461559472e-07, "loss": 0.3496, "step": 1064 }, { "epoch": 0.5848434925864909, "grad_norm": 0.38940763478597096, "learning_rate": 9.163875243474237e-07, "loss": 0.297, "step": 1065 }, { "epoch": 0.5853926414058209, "grad_norm": 0.8981788759355992, "learning_rate": 9.162266620403243e-07, "loss": 0.2678, "step": 1066 }, { "epoch": 0.585941790225151, "grad_norm": 0.41067609206423245, "learning_rate": 9.160656592889339e-07, "loss": 0.2692, "step": 1067 }, { "epoch": 0.586490939044481, "grad_norm": 0.4943860716989215, "learning_rate": 9.15904516147585e-07, "loss": 0.2964, "step": 1068 }, { "epoch": 0.5870400878638111, "grad_norm": 0.4997805459811917, "learning_rate": 9.157432326706575e-07, "loss": 0.289, "step": 1069 }, { "epoch": 0.5875892366831411, "grad_norm": 0.3793351213782992, "learning_rate": 9.155818089125786e-07, "loss": 0.2623, "step": 1070 }, { "epoch": 0.5881383855024712, "grad_norm": 0.562855713320707, "learning_rate": 9.154202449278229e-07, "loss": 0.2628, "step": 1071 }, { "epoch": 0.5886875343218012, "grad_norm": 0.3754049716030077, "learning_rate": 9.152585407709124e-07, "loss": 0.269, "step": 1072 }, { "epoch": 0.5892366831411312, "grad_norm": 0.5819722568773006, "learning_rate": 9.150966964964161e-07, "loss": 0.2871, "step": 1073 }, { "epoch": 0.5897858319604613, "grad_norm": 0.43918964482111517, "learning_rate": 9.149347121589505e-07, "loss": 0.267, "step": 1074 }, { "epoch": 0.5903349807797913, "grad_norm": 0.7524912779221687, "learning_rate": 9.147725878131796e-07, "loss": 0.2776, "step": 1075 }, { "epoch": 0.5908841295991214, "grad_norm": 0.9002793616105111, "learning_rate": 9.146103235138141e-07, "loss": 0.3054, "step": 1076 }, { "epoch": 0.5914332784184514, "grad_norm": 0.44101669571912955, "learning_rate": 9.144479193156124e-07, "loss": 0.2936, "step": 1077 }, { "epoch": 0.5919824272377814, "grad_norm": 0.5867875151240506, "learning_rate": 9.142853752733799e-07, "loss": 0.3049, "step": 1078 }, { "epoch": 0.5925315760571115, "grad_norm": 0.7208086384724053, "learning_rate": 9.141226914419691e-07, "loss": 0.4554, "step": 1079 }, { "epoch": 0.5930807248764415, "grad_norm": 0.3874539214172494, "learning_rate": 9.139598678762799e-07, "loss": 0.2734, "step": 1080 }, { "epoch": 0.5936298736957716, "grad_norm": 0.6043320796954844, "learning_rate": 9.137969046312594e-07, "loss": 0.3125, "step": 1081 }, { "epoch": 0.5941790225151016, "grad_norm": 0.3912012788205948, "learning_rate": 9.136338017619014e-07, "loss": 0.2394, "step": 1082 }, { "epoch": 0.5947281713344317, "grad_norm": 0.5414117089228002, "learning_rate": 9.134705593232472e-07, "loss": 0.2594, "step": 1083 }, { "epoch": 0.5952773201537617, "grad_norm": 0.8054074074934338, "learning_rate": 9.133071773703853e-07, "loss": 0.3648, "step": 1084 }, { "epoch": 0.5958264689730917, "grad_norm": 0.5807567169129422, "learning_rate": 9.131436559584509e-07, "loss": 0.3265, "step": 1085 }, { "epoch": 0.5963756177924218, "grad_norm": 0.48697925601004677, "learning_rate": 9.129799951426264e-07, "loss": 0.2625, "step": 1086 }, { "epoch": 0.5969247666117518, "grad_norm": 0.44450817749726423, "learning_rate": 9.128161949781415e-07, "loss": 0.2723, "step": 1087 }, { "epoch": 0.5974739154310819, "grad_norm": 0.4146900771755306, "learning_rate": 9.126522555202727e-07, "loss": 0.2693, "step": 1088 }, { "epoch": 0.5980230642504119, "grad_norm": 0.48869787874134757, "learning_rate": 9.124881768243433e-07, "loss": 0.2819, "step": 1089 }, { "epoch": 0.598572213069742, "grad_norm": 0.5304503915276655, "learning_rate": 9.123239589457242e-07, "loss": 0.2943, "step": 1090 }, { "epoch": 0.599121361889072, "grad_norm": 0.40953396026247785, "learning_rate": 9.121596019398323e-07, "loss": 0.2805, "step": 1091 }, { "epoch": 0.5996705107084019, "grad_norm": 0.5587497644085375, "learning_rate": 9.119951058621326e-07, "loss": 0.289, "step": 1092 }, { "epoch": 0.600219659527732, "grad_norm": 0.45194476417255347, "learning_rate": 9.118304707681362e-07, "loss": 0.271, "step": 1093 }, { "epoch": 0.600768808347062, "grad_norm": 0.41864743967092277, "learning_rate": 9.116656967134015e-07, "loss": 0.2986, "step": 1094 }, { "epoch": 0.6013179571663921, "grad_norm": 0.513933227032906, "learning_rate": 9.115007837535336e-07, "loss": 0.2785, "step": 1095 }, { "epoch": 0.6018671059857221, "grad_norm": 0.49599002704508294, "learning_rate": 9.113357319441842e-07, "loss": 0.2996, "step": 1096 }, { "epoch": 0.6024162548050521, "grad_norm": 0.423546937059523, "learning_rate": 9.11170541341053e-07, "loss": 0.2763, "step": 1097 }, { "epoch": 0.6029654036243822, "grad_norm": 0.5185983592412352, "learning_rate": 9.110052119998851e-07, "loss": 0.2707, "step": 1098 }, { "epoch": 0.6035145524437122, "grad_norm": 0.45148826524054897, "learning_rate": 9.108397439764732e-07, "loss": 0.2809, "step": 1099 }, { "epoch": 0.6040637012630423, "grad_norm": 0.4660826413233851, "learning_rate": 9.106741373266568e-07, "loss": 0.2935, "step": 1100 }, { "epoch": 0.6046128500823723, "grad_norm": 0.5419396810694019, "learning_rate": 9.105083921063221e-07, "loss": 0.2828, "step": 1101 }, { "epoch": 0.6051619989017024, "grad_norm": 0.45922532712302805, "learning_rate": 9.103425083714016e-07, "loss": 0.2889, "step": 1102 }, { "epoch": 0.6057111477210324, "grad_norm": 0.4649333694966789, "learning_rate": 9.101764861778754e-07, "loss": 0.2643, "step": 1103 }, { "epoch": 0.6062602965403624, "grad_norm": 0.418968172335678, "learning_rate": 9.100103255817696e-07, "loss": 0.3125, "step": 1104 }, { "epoch": 0.6068094453596925, "grad_norm": 0.45190514849951635, "learning_rate": 9.098440266391574e-07, "loss": 0.2483, "step": 1105 }, { "epoch": 0.6073585941790225, "grad_norm": 0.4209969854467636, "learning_rate": 9.096775894061586e-07, "loss": 0.2743, "step": 1106 }, { "epoch": 0.6079077429983526, "grad_norm": 0.4409671357922739, "learning_rate": 9.095110139389395e-07, "loss": 0.3051, "step": 1107 }, { "epoch": 0.6084568918176826, "grad_norm": 0.5279492928832483, "learning_rate": 9.093443002937131e-07, "loss": 0.2817, "step": 1108 }, { "epoch": 0.6090060406370126, "grad_norm": 0.39225930694864325, "learning_rate": 9.091774485267395e-07, "loss": 0.2871, "step": 1109 }, { "epoch": 0.6095551894563427, "grad_norm": 0.5863271436048614, "learning_rate": 9.090104586943247e-07, "loss": 0.3082, "step": 1110 }, { "epoch": 0.6101043382756727, "grad_norm": 0.4788268336104399, "learning_rate": 9.088433308528217e-07, "loss": 0.2536, "step": 1111 }, { "epoch": 0.6106534870950028, "grad_norm": 0.5112645623805705, "learning_rate": 9.0867606505863e-07, "loss": 0.3205, "step": 1112 }, { "epoch": 0.6112026359143328, "grad_norm": 0.5725254861344884, "learning_rate": 9.085086613681957e-07, "loss": 0.2549, "step": 1113 }, { "epoch": 0.6117517847336629, "grad_norm": 0.46942712798410025, "learning_rate": 9.083411198380112e-07, "loss": 0.3318, "step": 1114 }, { "epoch": 0.6123009335529929, "grad_norm": 0.49984667196143134, "learning_rate": 9.081734405246158e-07, "loss": 0.3263, "step": 1115 }, { "epoch": 0.6128500823723229, "grad_norm": 0.3927550822295869, "learning_rate": 9.08005623484595e-07, "loss": 0.3072, "step": 1116 }, { "epoch": 0.613399231191653, "grad_norm": 0.48374518376634384, "learning_rate": 9.078376687745809e-07, "loss": 0.2971, "step": 1117 }, { "epoch": 0.613948380010983, "grad_norm": 0.5909556068133605, "learning_rate": 9.07669576451252e-07, "loss": 0.2788, "step": 1118 }, { "epoch": 0.6144975288303131, "grad_norm": 0.4514702540817506, "learning_rate": 9.075013465713333e-07, "loss": 0.3036, "step": 1119 }, { "epoch": 0.615046677649643, "grad_norm": 0.5730970328312417, "learning_rate": 9.073329791915959e-07, "loss": 0.2749, "step": 1120 }, { "epoch": 0.615595826468973, "grad_norm": 0.46720357212585395, "learning_rate": 9.071644743688581e-07, "loss": 0.2834, "step": 1121 }, { "epoch": 0.6161449752883031, "grad_norm": 0.4652048446584189, "learning_rate": 9.069958321599836e-07, "loss": 0.2516, "step": 1122 }, { "epoch": 0.6166941241076331, "grad_norm": 0.49211847409586273, "learning_rate": 9.068270526218835e-07, "loss": 0.2895, "step": 1123 }, { "epoch": 0.6172432729269632, "grad_norm": 0.5225658696718942, "learning_rate": 9.06658135811514e-07, "loss": 0.3083, "step": 1124 }, { "epoch": 0.6177924217462932, "grad_norm": 0.4732494876596653, "learning_rate": 9.064890817858786e-07, "loss": 0.2883, "step": 1125 }, { "epoch": 0.6183415705656233, "grad_norm": 0.7072333921465951, "learning_rate": 9.063198906020269e-07, "loss": 0.3051, "step": 1126 }, { "epoch": 0.6188907193849533, "grad_norm": 0.568772736573002, "learning_rate": 9.061505623170547e-07, "loss": 0.3023, "step": 1127 }, { "epoch": 0.6194398682042833, "grad_norm": 0.5089502546392174, "learning_rate": 9.05981096988104e-07, "loss": 0.3292, "step": 1128 }, { "epoch": 0.6199890170236134, "grad_norm": 0.5380404881624282, "learning_rate": 9.05811494672363e-07, "loss": 0.2967, "step": 1129 }, { "epoch": 0.6205381658429434, "grad_norm": 0.4092785459290651, "learning_rate": 9.056417554270662e-07, "loss": 0.3065, "step": 1130 }, { "epoch": 0.6210873146622735, "grad_norm": 0.49815959251268416, "learning_rate": 9.054718793094945e-07, "loss": 0.2608, "step": 1131 }, { "epoch": 0.6216364634816035, "grad_norm": 0.4908110343266882, "learning_rate": 9.053018663769749e-07, "loss": 0.2671, "step": 1132 }, { "epoch": 0.6221856123009335, "grad_norm": 0.47435273371221404, "learning_rate": 9.051317166868804e-07, "loss": 0.2569, "step": 1133 }, { "epoch": 0.6227347611202636, "grad_norm": 0.42499482377738906, "learning_rate": 9.049614302966302e-07, "loss": 0.2622, "step": 1134 }, { "epoch": 0.6232839099395936, "grad_norm": 0.47501304937790956, "learning_rate": 9.047910072636896e-07, "loss": 0.3218, "step": 1135 }, { "epoch": 0.6238330587589237, "grad_norm": 0.4507543358957904, "learning_rate": 9.046204476455703e-07, "loss": 0.2616, "step": 1136 }, { "epoch": 0.6243822075782537, "grad_norm": 0.5867199213464204, "learning_rate": 9.044497514998297e-07, "loss": 0.2828, "step": 1137 }, { "epoch": 0.6249313563975838, "grad_norm": 0.5004424648258832, "learning_rate": 9.042789188840718e-07, "loss": 0.2615, "step": 1138 }, { "epoch": 0.6254805052169138, "grad_norm": 0.5353090704470145, "learning_rate": 9.041079498559459e-07, "loss": 0.2824, "step": 1139 }, { "epoch": 0.6260296540362438, "grad_norm": 0.5116562529747475, "learning_rate": 9.039368444731479e-07, "loss": 0.2674, "step": 1140 }, { "epoch": 0.6265788028555739, "grad_norm": 0.5227186253081871, "learning_rate": 9.037656027934198e-07, "loss": 0.2479, "step": 1141 }, { "epoch": 0.6271279516749039, "grad_norm": 0.45780860128985457, "learning_rate": 9.03594224874549e-07, "loss": 0.2724, "step": 1142 }, { "epoch": 0.627677100494234, "grad_norm": 0.43436053325347795, "learning_rate": 9.034227107743694e-07, "loss": 0.2733, "step": 1143 }, { "epoch": 0.628226249313564, "grad_norm": 0.48117972611454424, "learning_rate": 9.032510605507606e-07, "loss": 0.2847, "step": 1144 }, { "epoch": 0.628775398132894, "grad_norm": 0.741175894481161, "learning_rate": 9.030792742616483e-07, "loss": 0.3489, "step": 1145 }, { "epoch": 0.6293245469522241, "grad_norm": 0.664681011052934, "learning_rate": 9.029073519650042e-07, "loss": 0.2996, "step": 1146 }, { "epoch": 0.629873695771554, "grad_norm": 0.5157222479953482, "learning_rate": 9.027352937188454e-07, "loss": 0.2821, "step": 1147 }, { "epoch": 0.6304228445908842, "grad_norm": 0.4836083017833757, "learning_rate": 9.025630995812354e-07, "loss": 0.2575, "step": 1148 }, { "epoch": 0.6309719934102141, "grad_norm": 0.459569266796869, "learning_rate": 9.023907696102835e-07, "loss": 0.3097, "step": 1149 }, { "epoch": 0.6315211422295443, "grad_norm": 0.4798189540098763, "learning_rate": 9.022183038641445e-07, "loss": 0.2927, "step": 1150 }, { "epoch": 0.6320702910488742, "grad_norm": 0.4987997800145966, "learning_rate": 9.020457024010195e-07, "loss": 0.2971, "step": 1151 }, { "epoch": 0.6326194398682042, "grad_norm": 1.9096532428061457, "learning_rate": 9.018729652791548e-07, "loss": 0.3226, "step": 1152 }, { "epoch": 0.6331685886875343, "grad_norm": 0.44216483961472397, "learning_rate": 9.01700092556843e-07, "loss": 0.2638, "step": 1153 }, { "epoch": 0.6337177375068643, "grad_norm": 0.7328629429496601, "learning_rate": 9.015270842924223e-07, "loss": 0.3098, "step": 1154 }, { "epoch": 0.6342668863261944, "grad_norm": 0.5685306920819805, "learning_rate": 9.013539405442766e-07, "loss": 0.2933, "step": 1155 }, { "epoch": 0.6348160351455244, "grad_norm": 0.5822974816059474, "learning_rate": 9.011806613708355e-07, "loss": 0.3205, "step": 1156 }, { "epoch": 0.6353651839648545, "grad_norm": 0.45522725658968266, "learning_rate": 9.010072468305742e-07, "loss": 0.2643, "step": 1157 }, { "epoch": 0.6359143327841845, "grad_norm": 0.4538908806885672, "learning_rate": 9.008336969820141e-07, "loss": 0.2785, "step": 1158 }, { "epoch": 0.6364634816035145, "grad_norm": 0.4777941850696039, "learning_rate": 9.006600118837214e-07, "loss": 0.2352, "step": 1159 }, { "epoch": 0.6370126304228446, "grad_norm": 0.4816371328153312, "learning_rate": 9.004861915943088e-07, "loss": 0.3168, "step": 1160 }, { "epoch": 0.6375617792421746, "grad_norm": 0.35654766955404366, "learning_rate": 9.003122361724341e-07, "loss": 0.2711, "step": 1161 }, { "epoch": 0.6381109280615047, "grad_norm": 0.4742952913712317, "learning_rate": 9.001381456768008e-07, "loss": 0.272, "step": 1162 }, { "epoch": 0.6386600768808347, "grad_norm": 0.6322978339601845, "learning_rate": 8.999639201661583e-07, "loss": 0.2821, "step": 1163 }, { "epoch": 0.6392092257001647, "grad_norm": 0.42119244942878137, "learning_rate": 8.997895596993008e-07, "loss": 0.2994, "step": 1164 }, { "epoch": 0.6397583745194948, "grad_norm": 0.43903748409154886, "learning_rate": 8.996150643350688e-07, "loss": 0.2734, "step": 1165 }, { "epoch": 0.6403075233388248, "grad_norm": 0.4761678888000459, "learning_rate": 8.994404341323483e-07, "loss": 0.2849, "step": 1166 }, { "epoch": 0.6408566721581549, "grad_norm": 0.737157789799894, "learning_rate": 8.992656691500703e-07, "loss": 0.3252, "step": 1167 }, { "epoch": 0.6414058209774849, "grad_norm": 0.5583112072557074, "learning_rate": 8.990907694472114e-07, "loss": 0.2866, "step": 1168 }, { "epoch": 0.641954969796815, "grad_norm": 0.5338646041317848, "learning_rate": 8.989157350827942e-07, "loss": 0.2616, "step": 1169 }, { "epoch": 0.642504118616145, "grad_norm": 0.45825004829388927, "learning_rate": 8.987405661158859e-07, "loss": 0.2327, "step": 1170 }, { "epoch": 0.643053267435475, "grad_norm": 0.4595477401487814, "learning_rate": 8.985652626055998e-07, "loss": 0.2727, "step": 1171 }, { "epoch": 0.6436024162548051, "grad_norm": 0.5748678681998864, "learning_rate": 8.983898246110944e-07, "loss": 0.2777, "step": 1172 }, { "epoch": 0.6441515650741351, "grad_norm": 0.3900795295711715, "learning_rate": 8.982142521915736e-07, "loss": 0.2868, "step": 1173 }, { "epoch": 0.6447007138934652, "grad_norm": 0.5190279935080138, "learning_rate": 8.980385454062865e-07, "loss": 0.2609, "step": 1174 }, { "epoch": 0.6452498627127952, "grad_norm": 0.5151140929715181, "learning_rate": 8.978627043145279e-07, "loss": 0.2938, "step": 1175 }, { "epoch": 0.6457990115321252, "grad_norm": 0.5037281238608102, "learning_rate": 8.976867289756374e-07, "loss": 0.2767, "step": 1176 }, { "epoch": 0.6463481603514553, "grad_norm": 0.40939636643133553, "learning_rate": 8.975106194490002e-07, "loss": 0.2758, "step": 1177 }, { "epoch": 0.6468973091707853, "grad_norm": 0.568518815783045, "learning_rate": 8.973343757940471e-07, "loss": 0.3135, "step": 1178 }, { "epoch": 0.6474464579901154, "grad_norm": 0.41842298614450435, "learning_rate": 8.971579980702533e-07, "loss": 0.3016, "step": 1179 }, { "epoch": 0.6479956068094453, "grad_norm": 0.5219504029737556, "learning_rate": 8.969814863371403e-07, "loss": 0.3162, "step": 1180 }, { "epoch": 0.6485447556287754, "grad_norm": 0.5358210196394527, "learning_rate": 8.968048406542741e-07, "loss": 0.291, "step": 1181 }, { "epoch": 0.6490939044481054, "grad_norm": 0.5116646850222143, "learning_rate": 8.966280610812662e-07, "loss": 0.2922, "step": 1182 }, { "epoch": 0.6496430532674354, "grad_norm": 0.4383781220072977, "learning_rate": 8.96451147677773e-07, "loss": 0.3135, "step": 1183 }, { "epoch": 0.6501922020867655, "grad_norm": 0.5597237164126757, "learning_rate": 8.962741005034965e-07, "loss": 0.303, "step": 1184 }, { "epoch": 0.6507413509060955, "grad_norm": 0.417182537956975, "learning_rate": 8.960969196181832e-07, "loss": 0.2525, "step": 1185 }, { "epoch": 0.6512904997254256, "grad_norm": 0.575169641090054, "learning_rate": 8.959196050816257e-07, "loss": 0.2823, "step": 1186 }, { "epoch": 0.6518396485447556, "grad_norm": 0.5038074897677616, "learning_rate": 8.957421569536607e-07, "loss": 0.3152, "step": 1187 }, { "epoch": 0.6523887973640856, "grad_norm": 0.5964828672055801, "learning_rate": 8.955645752941706e-07, "loss": 0.3339, "step": 1188 }, { "epoch": 0.6529379461834157, "grad_norm": 0.4617619091425538, "learning_rate": 8.953868601630826e-07, "loss": 0.2835, "step": 1189 }, { "epoch": 0.6534870950027457, "grad_norm": 0.5474386796376182, "learning_rate": 8.952090116203688e-07, "loss": 0.2871, "step": 1190 }, { "epoch": 0.6540362438220758, "grad_norm": 0.3957478138397106, "learning_rate": 8.950310297260468e-07, "loss": 0.3139, "step": 1191 }, { "epoch": 0.6545853926414058, "grad_norm": 0.4074071223396674, "learning_rate": 8.94852914540179e-07, "loss": 0.3019, "step": 1192 }, { "epoch": 0.6551345414607359, "grad_norm": 0.5430721111070543, "learning_rate": 8.946746661228726e-07, "loss": 0.2674, "step": 1193 }, { "epoch": 0.6556836902800659, "grad_norm": 0.4412845165835301, "learning_rate": 8.944962845342798e-07, "loss": 0.3119, "step": 1194 }, { "epoch": 0.6562328390993959, "grad_norm": 0.49710103161223135, "learning_rate": 8.943177698345978e-07, "loss": 0.2757, "step": 1195 }, { "epoch": 0.656781987918726, "grad_norm": 0.44532613058567916, "learning_rate": 8.941391220840688e-07, "loss": 0.3284, "step": 1196 }, { "epoch": 0.657331136738056, "grad_norm": 0.6192176398577527, "learning_rate": 8.939603413429798e-07, "loss": 0.2528, "step": 1197 }, { "epoch": 0.6578802855573861, "grad_norm": 0.5514717092770217, "learning_rate": 8.937814276716629e-07, "loss": 0.3178, "step": 1198 }, { "epoch": 0.6584294343767161, "grad_norm": 0.44903509899343486, "learning_rate": 8.936023811304946e-07, "loss": 0.3086, "step": 1199 }, { "epoch": 0.6589785831960461, "grad_norm": 0.4886647463373787, "learning_rate": 8.934232017798967e-07, "loss": 0.2399, "step": 1200 }, { "epoch": 0.6589785831960461, "eval_loss": 0.36352044343948364, "eval_runtime": 18.5912, "eval_samples_per_second": 23.828, "eval_steps_per_second": 1.022, "step": 1200 }, { "epoch": 0.6595277320153762, "grad_norm": 0.5219718221773981, "learning_rate": 8.932438896803355e-07, "loss": 0.3048, "step": 1201 }, { "epoch": 0.6600768808347062, "grad_norm": 0.4614849462021926, "learning_rate": 8.930644448923223e-07, "loss": 0.2797, "step": 1202 }, { "epoch": 0.6606260296540363, "grad_norm": 0.3677266225613913, "learning_rate": 8.928848674764132e-07, "loss": 0.2682, "step": 1203 }, { "epoch": 0.6611751784733663, "grad_norm": 0.5123719806838465, "learning_rate": 8.927051574932087e-07, "loss": 0.2978, "step": 1204 }, { "epoch": 0.6617243272926964, "grad_norm": 0.3747966025929731, "learning_rate": 8.925253150033546e-07, "loss": 0.2791, "step": 1205 }, { "epoch": 0.6622734761120264, "grad_norm": 0.5487569411632294, "learning_rate": 8.923453400675408e-07, "loss": 0.3291, "step": 1206 }, { "epoch": 0.6628226249313564, "grad_norm": 0.5138417883651462, "learning_rate": 8.921652327465026e-07, "loss": 0.2763, "step": 1207 }, { "epoch": 0.6633717737506865, "grad_norm": 0.5061801660213829, "learning_rate": 8.91984993101019e-07, "loss": 0.31, "step": 1208 }, { "epoch": 0.6639209225700164, "grad_norm": 0.3942828388637908, "learning_rate": 8.91804621191915e-07, "loss": 0.3092, "step": 1209 }, { "epoch": 0.6644700713893466, "grad_norm": 0.37763584539676537, "learning_rate": 8.916241170800589e-07, "loss": 0.2728, "step": 1210 }, { "epoch": 0.6650192202086765, "grad_norm": 0.5009550373251159, "learning_rate": 8.914434808263644e-07, "loss": 0.264, "step": 1211 }, { "epoch": 0.6655683690280065, "grad_norm": 0.7030277484483146, "learning_rate": 8.912627124917895e-07, "loss": 0.3135, "step": 1212 }, { "epoch": 0.6661175178473366, "grad_norm": 0.42488607273609486, "learning_rate": 8.910818121373369e-07, "loss": 0.2818, "step": 1213 }, { "epoch": 0.6666666666666666, "grad_norm": 0.493342392494313, "learning_rate": 8.909007798240539e-07, "loss": 0.2954, "step": 1214 }, { "epoch": 0.6672158154859967, "grad_norm": 0.5039194951032646, "learning_rate": 8.90719615613032e-07, "loss": 0.253, "step": 1215 }, { "epoch": 0.6677649643053267, "grad_norm": 0.42422715933104815, "learning_rate": 8.905383195654078e-07, "loss": 0.2852, "step": 1216 }, { "epoch": 0.6683141131246568, "grad_norm": 0.494352707797291, "learning_rate": 8.903568917423616e-07, "loss": 0.2828, "step": 1217 }, { "epoch": 0.6688632619439868, "grad_norm": 0.5665599561495389, "learning_rate": 8.901753322051189e-07, "loss": 0.2862, "step": 1218 }, { "epoch": 0.6694124107633168, "grad_norm": 0.542595992828151, "learning_rate": 8.899936410149496e-07, "loss": 0.2638, "step": 1219 }, { "epoch": 0.6699615595826469, "grad_norm": 0.5648813250199902, "learning_rate": 8.89811818233167e-07, "loss": 0.2909, "step": 1220 }, { "epoch": 0.6705107084019769, "grad_norm": 0.4160562735138924, "learning_rate": 8.896298639211304e-07, "loss": 0.2616, "step": 1221 }, { "epoch": 0.671059857221307, "grad_norm": 0.5413512791749923, "learning_rate": 8.894477781402422e-07, "loss": 0.2484, "step": 1222 }, { "epoch": 0.671609006040637, "grad_norm": 0.4330049685128148, "learning_rate": 8.892655609519497e-07, "loss": 0.2436, "step": 1223 }, { "epoch": 0.6721581548599671, "grad_norm": 0.5443521875616191, "learning_rate": 8.890832124177447e-07, "loss": 0.2954, "step": 1224 }, { "epoch": 0.6727073036792971, "grad_norm": 0.4563967506734848, "learning_rate": 8.88900732599163e-07, "loss": 0.289, "step": 1225 }, { "epoch": 0.6732564524986271, "grad_norm": 0.4754829406830648, "learning_rate": 8.887181215577846e-07, "loss": 0.2506, "step": 1226 }, { "epoch": 0.6738056013179572, "grad_norm": 0.4934327565736833, "learning_rate": 8.885353793552343e-07, "loss": 0.2692, "step": 1227 }, { "epoch": 0.6743547501372872, "grad_norm": 0.607866696262366, "learning_rate": 8.883525060531808e-07, "loss": 0.2768, "step": 1228 }, { "epoch": 0.6749038989566173, "grad_norm": 0.4946664009061249, "learning_rate": 8.881695017133371e-07, "loss": 0.2801, "step": 1229 }, { "epoch": 0.6754530477759473, "grad_norm": 0.392121091082154, "learning_rate": 8.879863663974604e-07, "loss": 0.271, "step": 1230 }, { "epoch": 0.6760021965952773, "grad_norm": 0.44037563428625903, "learning_rate": 8.878031001673519e-07, "loss": 0.2906, "step": 1231 }, { "epoch": 0.6765513454146074, "grad_norm": 0.48088457141067054, "learning_rate": 8.876197030848575e-07, "loss": 0.2677, "step": 1232 }, { "epoch": 0.6771004942339374, "grad_norm": 0.4869690369556779, "learning_rate": 8.874361752118669e-07, "loss": 0.2861, "step": 1233 }, { "epoch": 0.6776496430532675, "grad_norm": 0.5848282881506409, "learning_rate": 8.87252516610314e-07, "loss": 0.2982, "step": 1234 }, { "epoch": 0.6781987918725975, "grad_norm": 0.4614839190247446, "learning_rate": 8.870687273421766e-07, "loss": 0.289, "step": 1235 }, { "epoch": 0.6787479406919276, "grad_norm": 0.3705363391129515, "learning_rate": 8.868848074694772e-07, "loss": 0.2719, "step": 1236 }, { "epoch": 0.6792970895112576, "grad_norm": 0.4650582139957983, "learning_rate": 8.867007570542817e-07, "loss": 0.297, "step": 1237 }, { "epoch": 0.6798462383305875, "grad_norm": 0.535860936917416, "learning_rate": 8.865165761587002e-07, "loss": 0.3275, "step": 1238 }, { "epoch": 0.6803953871499177, "grad_norm": 0.5426168703649902, "learning_rate": 8.863322648448874e-07, "loss": 0.2955, "step": 1239 }, { "epoch": 0.6809445359692476, "grad_norm": 0.47271050132721426, "learning_rate": 8.861478231750413e-07, "loss": 0.2707, "step": 1240 }, { "epoch": 0.6814936847885777, "grad_norm": 0.6385607302689855, "learning_rate": 8.859632512114042e-07, "loss": 0.2789, "step": 1241 }, { "epoch": 0.6820428336079077, "grad_norm": 0.42936884881535814, "learning_rate": 8.857785490162621e-07, "loss": 0.3035, "step": 1242 }, { "epoch": 0.6825919824272377, "grad_norm": 0.4332168020023281, "learning_rate": 8.855937166519458e-07, "loss": 0.2469, "step": 1243 }, { "epoch": 0.6831411312465678, "grad_norm": 0.5340480171003407, "learning_rate": 8.854087541808288e-07, "loss": 0.3112, "step": 1244 }, { "epoch": 0.6836902800658978, "grad_norm": 0.5095766870692692, "learning_rate": 8.852236616653294e-07, "loss": 0.257, "step": 1245 }, { "epoch": 0.6842394288852279, "grad_norm": 0.44141398065536425, "learning_rate": 8.850384391679096e-07, "loss": 0.2878, "step": 1246 }, { "epoch": 0.6847885777045579, "grad_norm": 0.6618308344776433, "learning_rate": 8.84853086751075e-07, "loss": 0.2951, "step": 1247 }, { "epoch": 0.685337726523888, "grad_norm": 0.4747078229560669, "learning_rate": 8.84667604477375e-07, "loss": 0.2885, "step": 1248 }, { "epoch": 0.685886875343218, "grad_norm": 0.5916294272889184, "learning_rate": 8.844819924094037e-07, "loss": 0.282, "step": 1249 }, { "epoch": 0.686436024162548, "grad_norm": 0.4629434827118017, "learning_rate": 8.842962506097977e-07, "loss": 0.2939, "step": 1250 }, { "epoch": 0.6869851729818781, "grad_norm": 0.46407511347786573, "learning_rate": 8.841103791412382e-07, "loss": 0.278, "step": 1251 }, { "epoch": 0.6875343218012081, "grad_norm": 0.48788645650899126, "learning_rate": 8.839243780664502e-07, "loss": 0.2979, "step": 1252 }, { "epoch": 0.6880834706205382, "grad_norm": 0.5520260733586249, "learning_rate": 8.837382474482017e-07, "loss": 0.2631, "step": 1253 }, { "epoch": 0.6886326194398682, "grad_norm": 0.4871995962379758, "learning_rate": 8.835519873493054e-07, "loss": 0.2478, "step": 1254 }, { "epoch": 0.6891817682591982, "grad_norm": 0.5275575643120288, "learning_rate": 8.833655978326171e-07, "loss": 0.3386, "step": 1255 }, { "epoch": 0.6897309170785283, "grad_norm": 0.4106161686764015, "learning_rate": 8.831790789610363e-07, "loss": 0.2702, "step": 1256 }, { "epoch": 0.6902800658978583, "grad_norm": 0.45944059978808116, "learning_rate": 8.829924307975064e-07, "loss": 0.2721, "step": 1257 }, { "epoch": 0.6908292147171884, "grad_norm": 0.47343490936984833, "learning_rate": 8.828056534050141e-07, "loss": 0.272, "step": 1258 }, { "epoch": 0.6913783635365184, "grad_norm": 0.6942036848333579, "learning_rate": 8.8261874684659e-07, "loss": 0.2572, "step": 1259 }, { "epoch": 0.6919275123558485, "grad_norm": 0.5145323761806228, "learning_rate": 8.824317111853081e-07, "loss": 0.279, "step": 1260 }, { "epoch": 0.6924766611751785, "grad_norm": 0.6755598203686058, "learning_rate": 8.822445464842862e-07, "loss": 0.2495, "step": 1261 }, { "epoch": 0.6930258099945085, "grad_norm": 0.5214951093442063, "learning_rate": 8.820572528066853e-07, "loss": 0.3271, "step": 1262 }, { "epoch": 0.6935749588138386, "grad_norm": 0.49535338438592735, "learning_rate": 8.818698302157103e-07, "loss": 0.3022, "step": 1263 }, { "epoch": 0.6941241076331686, "grad_norm": 0.4880159994943199, "learning_rate": 8.816822787746092e-07, "loss": 0.3159, "step": 1264 }, { "epoch": 0.6946732564524987, "grad_norm": 0.4488171348359517, "learning_rate": 8.814945985466738e-07, "loss": 0.3095, "step": 1265 }, { "epoch": 0.6952224052718287, "grad_norm": 0.6174403081349206, "learning_rate": 8.813067895952394e-07, "loss": 0.3042, "step": 1266 }, { "epoch": 0.6957715540911587, "grad_norm": 0.46375370522692666, "learning_rate": 8.811188519836846e-07, "loss": 0.2649, "step": 1267 }, { "epoch": 0.6963207029104888, "grad_norm": 0.5588371042652637, "learning_rate": 8.809307857754312e-07, "loss": 0.2903, "step": 1268 }, { "epoch": 0.6968698517298187, "grad_norm": 0.45054402444046043, "learning_rate": 8.807425910339446e-07, "loss": 0.3007, "step": 1269 }, { "epoch": 0.6974190005491488, "grad_norm": 0.4799235119021597, "learning_rate": 8.805542678227339e-07, "loss": 0.2325, "step": 1270 }, { "epoch": 0.6979681493684788, "grad_norm": 0.5810408907664303, "learning_rate": 8.80365816205351e-07, "loss": 0.2604, "step": 1271 }, { "epoch": 0.6985172981878089, "grad_norm": 0.5558300105500974, "learning_rate": 8.801772362453914e-07, "loss": 0.2762, "step": 1272 }, { "epoch": 0.6990664470071389, "grad_norm": 0.4957095954875263, "learning_rate": 8.79988528006494e-07, "loss": 0.2444, "step": 1273 }, { "epoch": 0.6996155958264689, "grad_norm": 0.4776697394100596, "learning_rate": 8.797996915523408e-07, "loss": 0.2641, "step": 1274 }, { "epoch": 0.700164744645799, "grad_norm": 0.48438577774761393, "learning_rate": 8.796107269466572e-07, "loss": 0.2812, "step": 1275 }, { "epoch": 0.700713893465129, "grad_norm": 0.3866968789181543, "learning_rate": 8.794216342532116e-07, "loss": 0.2676, "step": 1276 }, { "epoch": 0.7012630422844591, "grad_norm": 0.5328756379378069, "learning_rate": 8.792324135358161e-07, "loss": 0.2631, "step": 1277 }, { "epoch": 0.7018121911037891, "grad_norm": 0.4761610142019809, "learning_rate": 8.790430648583255e-07, "loss": 0.2678, "step": 1278 }, { "epoch": 0.7023613399231191, "grad_norm": 0.43582459714242316, "learning_rate": 8.788535882846382e-07, "loss": 0.2902, "step": 1279 }, { "epoch": 0.7029104887424492, "grad_norm": 0.43844197771775184, "learning_rate": 8.786639838786953e-07, "loss": 0.2853, "step": 1280 }, { "epoch": 0.7034596375617792, "grad_norm": 0.37377779055777965, "learning_rate": 8.784742517044816e-07, "loss": 0.2431, "step": 1281 }, { "epoch": 0.7040087863811093, "grad_norm": 0.46856352563180786, "learning_rate": 8.782843918260245e-07, "loss": 0.3132, "step": 1282 }, { "epoch": 0.7045579352004393, "grad_norm": 0.47171398293104033, "learning_rate": 8.780944043073946e-07, "loss": 0.2755, "step": 1283 }, { "epoch": 0.7051070840197694, "grad_norm": 0.395798689755583, "learning_rate": 8.779042892127063e-07, "loss": 0.2834, "step": 1284 }, { "epoch": 0.7056562328390994, "grad_norm": 0.41506834692102745, "learning_rate": 8.777140466061158e-07, "loss": 0.2331, "step": 1285 }, { "epoch": 0.7062053816584294, "grad_norm": 0.5499373886807347, "learning_rate": 8.77523676551823e-07, "loss": 0.2488, "step": 1286 }, { "epoch": 0.7067545304777595, "grad_norm": 0.502761676178298, "learning_rate": 8.773331791140712e-07, "loss": 0.2883, "step": 1287 }, { "epoch": 0.7073036792970895, "grad_norm": 0.44450699010240075, "learning_rate": 8.771425543571461e-07, "loss": 0.2615, "step": 1288 }, { "epoch": 0.7078528281164196, "grad_norm": 0.4731553496496406, "learning_rate": 8.769518023453763e-07, "loss": 0.3101, "step": 1289 }, { "epoch": 0.7084019769357496, "grad_norm": 0.7167440579758683, "learning_rate": 8.767609231431338e-07, "loss": 0.3068, "step": 1290 }, { "epoch": 0.7089511257550797, "grad_norm": 0.5155775414683639, "learning_rate": 8.765699168148331e-07, "loss": 0.2544, "step": 1291 }, { "epoch": 0.7095002745744097, "grad_norm": 0.5667979731520204, "learning_rate": 8.763787834249322e-07, "loss": 0.274, "step": 1292 }, { "epoch": 0.7100494233937397, "grad_norm": 0.534421880769502, "learning_rate": 8.761875230379312e-07, "loss": 0.2486, "step": 1293 }, { "epoch": 0.7105985722130698, "grad_norm": 0.48974672517736045, "learning_rate": 8.759961357183736e-07, "loss": 0.2921, "step": 1294 }, { "epoch": 0.7111477210323998, "grad_norm": 0.4641955710008565, "learning_rate": 8.758046215308456e-07, "loss": 0.2745, "step": 1295 }, { "epoch": 0.7116968698517299, "grad_norm": 0.4526752045793606, "learning_rate": 8.756129805399758e-07, "loss": 0.2663, "step": 1296 }, { "epoch": 0.7122460186710599, "grad_norm": 0.5068314778193749, "learning_rate": 8.754212128104366e-07, "loss": 0.328, "step": 1297 }, { "epoch": 0.7127951674903898, "grad_norm": 0.4377201750861751, "learning_rate": 8.75229318406942e-07, "loss": 0.2478, "step": 1298 }, { "epoch": 0.71334431630972, "grad_norm": 0.4233086920358855, "learning_rate": 8.750372973942495e-07, "loss": 0.2689, "step": 1299 }, { "epoch": 0.7138934651290499, "grad_norm": 0.5060988338664691, "learning_rate": 8.748451498371594e-07, "loss": 0.2728, "step": 1300 }, { "epoch": 0.71444261394838, "grad_norm": 0.6158788054210272, "learning_rate": 8.746528758005139e-07, "loss": 0.3505, "step": 1301 }, { "epoch": 0.71499176276771, "grad_norm": 0.4833637139077168, "learning_rate": 8.744604753491989e-07, "loss": 0.2803, "step": 1302 }, { "epoch": 0.7155409115870401, "grad_norm": 0.4589880573250104, "learning_rate": 8.742679485481419e-07, "loss": 0.2657, "step": 1303 }, { "epoch": 0.7160900604063701, "grad_norm": 0.5251864252327677, "learning_rate": 8.740752954623142e-07, "loss": 0.2787, "step": 1304 }, { "epoch": 0.7166392092257001, "grad_norm": 0.4022266755575418, "learning_rate": 8.738825161567286e-07, "loss": 0.2864, "step": 1305 }, { "epoch": 0.7171883580450302, "grad_norm": 0.4096719701134503, "learning_rate": 8.736896106964414e-07, "loss": 0.297, "step": 1306 }, { "epoch": 0.7177375068643602, "grad_norm": 0.4946200871997498, "learning_rate": 8.73496579146551e-07, "loss": 0.2872, "step": 1307 }, { "epoch": 0.7182866556836903, "grad_norm": 0.4842141565048707, "learning_rate": 8.733034215721984e-07, "loss": 0.2837, "step": 1308 }, { "epoch": 0.7188358045030203, "grad_norm": 0.6009862157947654, "learning_rate": 8.731101380385669e-07, "loss": 0.3482, "step": 1309 }, { "epoch": 0.7193849533223503, "grad_norm": 0.47470169339397655, "learning_rate": 8.729167286108831e-07, "loss": 0.2833, "step": 1310 }, { "epoch": 0.7199341021416804, "grad_norm": 0.47604670579454933, "learning_rate": 8.72723193354415e-07, "loss": 0.2633, "step": 1311 }, { "epoch": 0.7204832509610104, "grad_norm": 0.5527021362912121, "learning_rate": 8.725295323344741e-07, "loss": 0.2653, "step": 1312 }, { "epoch": 0.7210323997803405, "grad_norm": 0.5122435177100294, "learning_rate": 8.723357456164137e-07, "loss": 0.2938, "step": 1313 }, { "epoch": 0.7215815485996705, "grad_norm": 0.5672461346022489, "learning_rate": 8.721418332656295e-07, "loss": 0.2923, "step": 1314 }, { "epoch": 0.7221306974190006, "grad_norm": 0.7152713263523053, "learning_rate": 8.719477953475598e-07, "loss": 0.2764, "step": 1315 }, { "epoch": 0.7226798462383306, "grad_norm": 0.5500128662785526, "learning_rate": 8.717536319276856e-07, "loss": 0.3, "step": 1316 }, { "epoch": 0.7232289950576606, "grad_norm": 0.5586130501725085, "learning_rate": 8.715593430715294e-07, "loss": 0.2944, "step": 1317 }, { "epoch": 0.7237781438769907, "grad_norm": 0.5554440952108585, "learning_rate": 8.71364928844657e-07, "loss": 0.2943, "step": 1318 }, { "epoch": 0.7243272926963207, "grad_norm": 0.6189797783583086, "learning_rate": 8.711703893126757e-07, "loss": 0.2692, "step": 1319 }, { "epoch": 0.7248764415156508, "grad_norm": 0.522359031404474, "learning_rate": 8.709757245412356e-07, "loss": 0.2767, "step": 1320 }, { "epoch": 0.7254255903349808, "grad_norm": 0.5043463005715629, "learning_rate": 8.707809345960288e-07, "loss": 0.2284, "step": 1321 }, { "epoch": 0.7259747391543108, "grad_norm": 0.4590657611646499, "learning_rate": 8.705860195427899e-07, "loss": 0.2389, "step": 1322 }, { "epoch": 0.7265238879736409, "grad_norm": 0.6369775868563855, "learning_rate": 8.703909794472951e-07, "loss": 0.2832, "step": 1323 }, { "epoch": 0.7270730367929709, "grad_norm": 0.7609135010847301, "learning_rate": 8.701958143753639e-07, "loss": 0.3253, "step": 1324 }, { "epoch": 0.727622185612301, "grad_norm": 0.4213279947596712, "learning_rate": 8.700005243928568e-07, "loss": 0.2828, "step": 1325 }, { "epoch": 0.728171334431631, "grad_norm": 0.6779155064964311, "learning_rate": 8.698051095656772e-07, "loss": 0.2817, "step": 1326 }, { "epoch": 0.7287204832509611, "grad_norm": 0.4845191829636317, "learning_rate": 8.696095699597704e-07, "loss": 0.2572, "step": 1327 }, { "epoch": 0.729269632070291, "grad_norm": 0.5153401960551395, "learning_rate": 8.694139056411237e-07, "loss": 0.2875, "step": 1328 }, { "epoch": 0.729818780889621, "grad_norm": 0.5058468993620223, "learning_rate": 8.692181166757668e-07, "loss": 0.2871, "step": 1329 }, { "epoch": 0.7303679297089511, "grad_norm": 0.49783065344280164, "learning_rate": 8.69022203129771e-07, "loss": 0.281, "step": 1330 }, { "epoch": 0.7309170785282811, "grad_norm": 0.4085419585784888, "learning_rate": 8.688261650692502e-07, "loss": 0.2575, "step": 1331 }, { "epoch": 0.7314662273476112, "grad_norm": 0.47672473572235635, "learning_rate": 8.686300025603596e-07, "loss": 0.3043, "step": 1332 }, { "epoch": 0.7320153761669412, "grad_norm": 0.3979098555475834, "learning_rate": 8.684337156692975e-07, "loss": 0.235, "step": 1333 }, { "epoch": 0.7325645249862712, "grad_norm": 0.44429932278469225, "learning_rate": 8.682373044623027e-07, "loss": 0.3324, "step": 1334 }, { "epoch": 0.7331136738056013, "grad_norm": 0.4931167519804408, "learning_rate": 8.680407690056573e-07, "loss": 0.2728, "step": 1335 }, { "epoch": 0.7336628226249313, "grad_norm": 0.47590947382458537, "learning_rate": 8.678441093656846e-07, "loss": 0.2768, "step": 1336 }, { "epoch": 0.7342119714442614, "grad_norm": 0.5032795306263175, "learning_rate": 8.676473256087499e-07, "loss": 0.2558, "step": 1337 }, { "epoch": 0.7347611202635914, "grad_norm": 0.4162609506004113, "learning_rate": 8.674504178012607e-07, "loss": 0.2556, "step": 1338 }, { "epoch": 0.7353102690829215, "grad_norm": 0.4489167714209613, "learning_rate": 8.672533860096659e-07, "loss": 0.2452, "step": 1339 }, { "epoch": 0.7358594179022515, "grad_norm": 0.4195370879737022, "learning_rate": 8.670562303004565e-07, "loss": 0.2885, "step": 1340 }, { "epoch": 0.7364085667215815, "grad_norm": 0.4491685290850181, "learning_rate": 8.668589507401653e-07, "loss": 0.2648, "step": 1341 }, { "epoch": 0.7369577155409116, "grad_norm": 0.5190688682422215, "learning_rate": 8.666615473953671e-07, "loss": 0.2787, "step": 1342 }, { "epoch": 0.7375068643602416, "grad_norm": 0.7516698455360568, "learning_rate": 8.66464020332678e-07, "loss": 0.3141, "step": 1343 }, { "epoch": 0.7380560131795717, "grad_norm": 0.487408097688339, "learning_rate": 8.662663696187562e-07, "loss": 0.2838, "step": 1344 }, { "epoch": 0.7386051619989017, "grad_norm": 0.40509673185127437, "learning_rate": 8.660685953203017e-07, "loss": 0.2481, "step": 1345 }, { "epoch": 0.7391543108182317, "grad_norm": 0.5643832142815581, "learning_rate": 8.658706975040555e-07, "loss": 0.256, "step": 1346 }, { "epoch": 0.7397034596375618, "grad_norm": 0.446450893772803, "learning_rate": 8.656726762368014e-07, "loss": 0.2628, "step": 1347 }, { "epoch": 0.7402526084568918, "grad_norm": 0.452043600960706, "learning_rate": 8.654745315853641e-07, "loss": 0.2833, "step": 1348 }, { "epoch": 0.7408017572762219, "grad_norm": 0.5486074290974142, "learning_rate": 8.6527626361661e-07, "loss": 0.2869, "step": 1349 }, { "epoch": 0.7413509060955519, "grad_norm": 0.42369563712940844, "learning_rate": 8.650778723974473e-07, "loss": 0.3202, "step": 1350 }, { "epoch": 0.741900054914882, "grad_norm": 0.47309936413098236, "learning_rate": 8.64879357994826e-07, "loss": 0.2788, "step": 1351 }, { "epoch": 0.742449203734212, "grad_norm": 0.6012847154507803, "learning_rate": 8.646807204757367e-07, "loss": 0.2553, "step": 1352 }, { "epoch": 0.742998352553542, "grad_norm": 0.42871876512675017, "learning_rate": 8.64481959907213e-07, "loss": 0.256, "step": 1353 }, { "epoch": 0.7435475013728721, "grad_norm": 0.4563399094153482, "learning_rate": 8.642830763563289e-07, "loss": 0.3013, "step": 1354 }, { "epoch": 0.7440966501922021, "grad_norm": 0.3912004419547005, "learning_rate": 8.640840698902003e-07, "loss": 0.2409, "step": 1355 }, { "epoch": 0.7446457990115322, "grad_norm": 0.5173591154484312, "learning_rate": 8.638849405759847e-07, "loss": 0.3006, "step": 1356 }, { "epoch": 0.7451949478308622, "grad_norm": 0.4005829169066144, "learning_rate": 8.636856884808808e-07, "loss": 0.3037, "step": 1357 }, { "epoch": 0.7457440966501923, "grad_norm": 0.38922793044449194, "learning_rate": 8.634863136721288e-07, "loss": 0.2579, "step": 1358 }, { "epoch": 0.7462932454695222, "grad_norm": 0.4508264446534602, "learning_rate": 8.632868162170103e-07, "loss": 0.2635, "step": 1359 }, { "epoch": 0.7468423942888522, "grad_norm": 0.4567553232369663, "learning_rate": 8.630871961828484e-07, "loss": 0.2628, "step": 1360 }, { "epoch": 0.7473915431081823, "grad_norm": 0.514179983180096, "learning_rate": 8.628874536370076e-07, "loss": 0.262, "step": 1361 }, { "epoch": 0.7479406919275123, "grad_norm": 0.40971659640124847, "learning_rate": 8.626875886468937e-07, "loss": 0.2833, "step": 1362 }, { "epoch": 0.7484898407468424, "grad_norm": 0.40738908686850644, "learning_rate": 8.624876012799533e-07, "loss": 0.2747, "step": 1363 }, { "epoch": 0.7490389895661724, "grad_norm": 0.5058590162965317, "learning_rate": 8.622874916036755e-07, "loss": 0.2823, "step": 1364 }, { "epoch": 0.7495881383855024, "grad_norm": 0.5722901454466316, "learning_rate": 8.620872596855894e-07, "loss": 0.3056, "step": 1365 }, { "epoch": 0.7501372872048325, "grad_norm": 0.511870789687394, "learning_rate": 8.618869055932661e-07, "loss": 0.2695, "step": 1366 }, { "epoch": 0.7506864360241625, "grad_norm": 0.6014990334065109, "learning_rate": 8.616864293943177e-07, "loss": 0.2664, "step": 1367 }, { "epoch": 0.7512355848434926, "grad_norm": 0.443095209160229, "learning_rate": 8.614858311563975e-07, "loss": 0.2617, "step": 1368 }, { "epoch": 0.7517847336628226, "grad_norm": 0.48572809263891076, "learning_rate": 8.612851109472e-07, "loss": 0.2766, "step": 1369 }, { "epoch": 0.7523338824821527, "grad_norm": 0.42169219052347473, "learning_rate": 8.61084268834461e-07, "loss": 0.2732, "step": 1370 }, { "epoch": 0.7528830313014827, "grad_norm": 0.43953598898290763, "learning_rate": 8.608833048859572e-07, "loss": 0.314, "step": 1371 }, { "epoch": 0.7534321801208127, "grad_norm": 0.6201686246822434, "learning_rate": 8.606822191695065e-07, "loss": 0.2944, "step": 1372 }, { "epoch": 0.7539813289401428, "grad_norm": 0.5150555502827397, "learning_rate": 8.604810117529679e-07, "loss": 0.2951, "step": 1373 }, { "epoch": 0.7545304777594728, "grad_norm": 0.47604597941304283, "learning_rate": 8.602796827042418e-07, "loss": 0.2532, "step": 1374 }, { "epoch": 0.7550796265788029, "grad_norm": 0.4432236198856403, "learning_rate": 8.600782320912689e-07, "loss": 0.2408, "step": 1375 }, { "epoch": 0.7556287753981329, "grad_norm": 0.5165007594893655, "learning_rate": 8.598766599820316e-07, "loss": 0.3305, "step": 1376 }, { "epoch": 0.7561779242174629, "grad_norm": 0.6534173229326804, "learning_rate": 8.596749664445531e-07, "loss": 0.354, "step": 1377 }, { "epoch": 0.756727073036793, "grad_norm": 0.4805234888708049, "learning_rate": 8.594731515468975e-07, "loss": 0.2419, "step": 1378 }, { "epoch": 0.757276221856123, "grad_norm": 0.5356490285713452, "learning_rate": 8.592712153571696e-07, "loss": 0.2599, "step": 1379 }, { "epoch": 0.7578253706754531, "grad_norm": 0.5440977222147195, "learning_rate": 8.590691579435157e-07, "loss": 0.2455, "step": 1380 }, { "epoch": 0.7583745194947831, "grad_norm": 0.7149038990020312, "learning_rate": 8.588669793741231e-07, "loss": 0.3209, "step": 1381 }, { "epoch": 0.7589236683141132, "grad_norm": 0.5262992947553964, "learning_rate": 8.586646797172189e-07, "loss": 0.2631, "step": 1382 }, { "epoch": 0.7594728171334432, "grad_norm": 0.44514385035522225, "learning_rate": 8.584622590410722e-07, "loss": 0.2577, "step": 1383 }, { "epoch": 0.7600219659527732, "grad_norm": 0.5079325668733327, "learning_rate": 8.582597174139925e-07, "loss": 0.3049, "step": 1384 }, { "epoch": 0.7605711147721033, "grad_norm": 0.44969742907551946, "learning_rate": 8.580570549043299e-07, "loss": 0.2711, "step": 1385 }, { "epoch": 0.7611202635914333, "grad_norm": 0.4943841266156845, "learning_rate": 8.578542715804758e-07, "loss": 0.2616, "step": 1386 }, { "epoch": 0.7616694124107634, "grad_norm": 0.5330860403259663, "learning_rate": 8.57651367510862e-07, "loss": 0.2876, "step": 1387 }, { "epoch": 0.7622185612300933, "grad_norm": 0.4064903031737812, "learning_rate": 8.574483427639612e-07, "loss": 0.262, "step": 1388 }, { "epoch": 0.7627677100494233, "grad_norm": 0.41749032530568897, "learning_rate": 8.572451974082867e-07, "loss": 0.2657, "step": 1389 }, { "epoch": 0.7633168588687534, "grad_norm": 0.48835804584658404, "learning_rate": 8.570419315123924e-07, "loss": 0.2429, "step": 1390 }, { "epoch": 0.7638660076880834, "grad_norm": 0.4556370122488247, "learning_rate": 8.568385451448735e-07, "loss": 0.2682, "step": 1391 }, { "epoch": 0.7644151565074135, "grad_norm": 0.43206927013875696, "learning_rate": 8.56635038374365e-07, "loss": 0.2706, "step": 1392 }, { "epoch": 0.7649643053267435, "grad_norm": 0.4483782884803002, "learning_rate": 8.564314112695432e-07, "loss": 0.2874, "step": 1393 }, { "epoch": 0.7655134541460736, "grad_norm": 0.49295950220252227, "learning_rate": 8.562276638991246e-07, "loss": 0.2491, "step": 1394 }, { "epoch": 0.7660626029654036, "grad_norm": 0.4399500999953082, "learning_rate": 8.560237963318664e-07, "loss": 0.252, "step": 1395 }, { "epoch": 0.7666117517847336, "grad_norm": 0.5450671017136176, "learning_rate": 8.558198086365665e-07, "loss": 0.3032, "step": 1396 }, { "epoch": 0.7671609006040637, "grad_norm": 0.49177675167959445, "learning_rate": 8.556157008820632e-07, "loss": 0.2994, "step": 1397 }, { "epoch": 0.7677100494233937, "grad_norm": 0.4008746012352519, "learning_rate": 8.554114731372352e-07, "loss": 0.2758, "step": 1398 }, { "epoch": 0.7682591982427238, "grad_norm": 0.4946118346918174, "learning_rate": 8.552071254710023e-07, "loss": 0.254, "step": 1399 }, { "epoch": 0.7688083470620538, "grad_norm": 0.4871183570035581, "learning_rate": 8.550026579523239e-07, "loss": 0.2438, "step": 1400 }, { "epoch": 0.7688083470620538, "eval_loss": 0.35636478662490845, "eval_runtime": 18.6032, "eval_samples_per_second": 23.813, "eval_steps_per_second": 1.021, "step": 1400 }, { "epoch": 0.7693574958813838, "grad_norm": 0.4170527615018649, "learning_rate": 8.547980706502001e-07, "loss": 0.2633, "step": 1401 }, { "epoch": 0.7699066447007139, "grad_norm": 0.5268912021997442, "learning_rate": 8.545933636336719e-07, "loss": 0.2606, "step": 1402 }, { "epoch": 0.7704557935200439, "grad_norm": 0.49446906035660454, "learning_rate": 8.543885369718203e-07, "loss": 0.2869, "step": 1403 }, { "epoch": 0.771004942339374, "grad_norm": 0.5065527120322841, "learning_rate": 8.541835907337668e-07, "loss": 0.2692, "step": 1404 }, { "epoch": 0.771554091158704, "grad_norm": 0.5285636269155132, "learning_rate": 8.539785249886733e-07, "loss": 0.2466, "step": 1405 }, { "epoch": 0.7721032399780341, "grad_norm": 0.522243530792344, "learning_rate": 8.537733398057416e-07, "loss": 0.2934, "step": 1406 }, { "epoch": 0.7726523887973641, "grad_norm": 0.6485427994215435, "learning_rate": 8.535680352542143e-07, "loss": 0.35, "step": 1407 }, { "epoch": 0.7732015376166941, "grad_norm": 0.7397283206685353, "learning_rate": 8.533626114033744e-07, "loss": 0.2852, "step": 1408 }, { "epoch": 0.7737506864360242, "grad_norm": 0.5276436529049572, "learning_rate": 8.531570683225443e-07, "loss": 0.2725, "step": 1409 }, { "epoch": 0.7742998352553542, "grad_norm": 0.6132530263761391, "learning_rate": 8.529514060810878e-07, "loss": 0.2633, "step": 1410 }, { "epoch": 0.7748489840746843, "grad_norm": 0.4413060943506678, "learning_rate": 8.527456247484079e-07, "loss": 0.2646, "step": 1411 }, { "epoch": 0.7753981328940143, "grad_norm": 0.5399502638660545, "learning_rate": 8.525397243939487e-07, "loss": 0.2624, "step": 1412 }, { "epoch": 0.7759472817133443, "grad_norm": 0.3822802249378209, "learning_rate": 8.523337050871933e-07, "loss": 0.2896, "step": 1413 }, { "epoch": 0.7764964305326744, "grad_norm": 0.46875358003904677, "learning_rate": 8.521275668976661e-07, "loss": 0.2694, "step": 1414 }, { "epoch": 0.7770455793520044, "grad_norm": 0.47612295964115026, "learning_rate": 8.519213098949311e-07, "loss": 0.3008, "step": 1415 }, { "epoch": 0.7775947281713345, "grad_norm": 0.4414768466019281, "learning_rate": 8.517149341485926e-07, "loss": 0.2913, "step": 1416 }, { "epoch": 0.7781438769906645, "grad_norm": 0.5430840569533861, "learning_rate": 8.515084397282943e-07, "loss": 0.2504, "step": 1417 }, { "epoch": 0.7786930258099946, "grad_norm": 0.5355086623243603, "learning_rate": 8.51301826703721e-07, "loss": 0.3026, "step": 1418 }, { "epoch": 0.7792421746293245, "grad_norm": 0.4129496054739296, "learning_rate": 8.510950951445967e-07, "loss": 0.2529, "step": 1419 }, { "epoch": 0.7797913234486545, "grad_norm": 0.48056228009166235, "learning_rate": 8.508882451206856e-07, "loss": 0.2817, "step": 1420 }, { "epoch": 0.7803404722679846, "grad_norm": 0.45143835664255116, "learning_rate": 8.50681276701792e-07, "loss": 0.2591, "step": 1421 }, { "epoch": 0.7808896210873146, "grad_norm": 0.5844080303955883, "learning_rate": 8.504741899577604e-07, "loss": 0.281, "step": 1422 }, { "epoch": 0.7814387699066447, "grad_norm": 0.443980549403915, "learning_rate": 8.502669849584749e-07, "loss": 0.2988, "step": 1423 }, { "epoch": 0.7819879187259747, "grad_norm": 0.5543252907821477, "learning_rate": 8.500596617738592e-07, "loss": 0.296, "step": 1424 }, { "epoch": 0.7825370675453048, "grad_norm": 0.6565632002621868, "learning_rate": 8.498522204738774e-07, "loss": 0.2763, "step": 1425 }, { "epoch": 0.7830862163646348, "grad_norm": 0.3881852587096809, "learning_rate": 8.496446611285333e-07, "loss": 0.2422, "step": 1426 }, { "epoch": 0.7836353651839648, "grad_norm": 0.42959050993454223, "learning_rate": 8.494369838078708e-07, "loss": 0.2631, "step": 1427 }, { "epoch": 0.7841845140032949, "grad_norm": 0.44569840872302136, "learning_rate": 8.49229188581973e-07, "loss": 0.2744, "step": 1428 }, { "epoch": 0.7847336628226249, "grad_norm": 1.1174408877850759, "learning_rate": 8.490212755209632e-07, "loss": 0.4636, "step": 1429 }, { "epoch": 0.785282811641955, "grad_norm": 0.5434410696983889, "learning_rate": 8.488132446950046e-07, "loss": 0.2587, "step": 1430 }, { "epoch": 0.785831960461285, "grad_norm": 0.5560706595266278, "learning_rate": 8.486050961742997e-07, "loss": 0.2613, "step": 1431 }, { "epoch": 0.786381109280615, "grad_norm": 0.54384047498177, "learning_rate": 8.48396830029091e-07, "loss": 0.2623, "step": 1432 }, { "epoch": 0.7869302580999451, "grad_norm": 0.43216756513519927, "learning_rate": 8.481884463296608e-07, "loss": 0.2748, "step": 1433 }, { "epoch": 0.7874794069192751, "grad_norm": 0.7750012230290634, "learning_rate": 8.479799451463307e-07, "loss": 0.3026, "step": 1434 }, { "epoch": 0.7880285557386052, "grad_norm": 0.6095409861050018, "learning_rate": 8.477713265494625e-07, "loss": 0.2432, "step": 1435 }, { "epoch": 0.7885777045579352, "grad_norm": 0.4239718598759618, "learning_rate": 8.475625906094569e-07, "loss": 0.2759, "step": 1436 }, { "epoch": 0.7891268533772653, "grad_norm": 0.5610702879251471, "learning_rate": 8.473537373967547e-07, "loss": 0.2728, "step": 1437 }, { "epoch": 0.7896760021965953, "grad_norm": 0.4245901185666458, "learning_rate": 8.471447669818364e-07, "loss": 0.2641, "step": 1438 }, { "epoch": 0.7902251510159253, "grad_norm": 0.5104216315685541, "learning_rate": 8.469356794352217e-07, "loss": 0.2843, "step": 1439 }, { "epoch": 0.7907742998352554, "grad_norm": 0.4919090033600878, "learning_rate": 8.467264748274697e-07, "loss": 0.2603, "step": 1440 }, { "epoch": 0.7913234486545854, "grad_norm": 0.44442568399620447, "learning_rate": 8.465171532291796e-07, "loss": 0.2754, "step": 1441 }, { "epoch": 0.7918725974739155, "grad_norm": 0.5876270472590753, "learning_rate": 8.463077147109895e-07, "loss": 0.2967, "step": 1442 }, { "epoch": 0.7924217462932455, "grad_norm": 0.5110066028762595, "learning_rate": 8.460981593435772e-07, "loss": 0.2684, "step": 1443 }, { "epoch": 0.7929708951125755, "grad_norm": 0.6078274569338576, "learning_rate": 8.458884871976601e-07, "loss": 0.2446, "step": 1444 }, { "epoch": 0.7935200439319056, "grad_norm": 0.45988828601242304, "learning_rate": 8.456786983439946e-07, "loss": 0.2589, "step": 1445 }, { "epoch": 0.7940691927512356, "grad_norm": 0.4930137310689949, "learning_rate": 8.454687928533768e-07, "loss": 0.2479, "step": 1446 }, { "epoch": 0.7946183415705657, "grad_norm": 0.4758903150056251, "learning_rate": 8.452587707966422e-07, "loss": 0.2735, "step": 1447 }, { "epoch": 0.7951674903898956, "grad_norm": 0.36398183444609655, "learning_rate": 8.450486322446652e-07, "loss": 0.2913, "step": 1448 }, { "epoch": 0.7957166392092258, "grad_norm": 0.4332182368957013, "learning_rate": 8.448383772683602e-07, "loss": 0.2685, "step": 1449 }, { "epoch": 0.7962657880285557, "grad_norm": 0.47589487027704447, "learning_rate": 8.446280059386801e-07, "loss": 0.2645, "step": 1450 }, { "epoch": 0.7968149368478857, "grad_norm": 0.46798540538360656, "learning_rate": 8.444175183266178e-07, "loss": 0.2723, "step": 1451 }, { "epoch": 0.7973640856672158, "grad_norm": 0.4431372380833636, "learning_rate": 8.44206914503205e-07, "loss": 0.2544, "step": 1452 }, { "epoch": 0.7979132344865458, "grad_norm": 0.4933695669895439, "learning_rate": 8.439961945395127e-07, "loss": 0.2945, "step": 1453 }, { "epoch": 0.7984623833058759, "grad_norm": 0.38662997940798843, "learning_rate": 8.437853585066511e-07, "loss": 0.2628, "step": 1454 }, { "epoch": 0.7990115321252059, "grad_norm": 0.5037608841621234, "learning_rate": 8.435744064757698e-07, "loss": 0.2695, "step": 1455 }, { "epoch": 0.7995606809445359, "grad_norm": 0.46011657437552106, "learning_rate": 8.43363338518057e-07, "loss": 0.2801, "step": 1456 }, { "epoch": 0.800109829763866, "grad_norm": 0.5080452829175076, "learning_rate": 8.431521547047406e-07, "loss": 0.2622, "step": 1457 }, { "epoch": 0.800658978583196, "grad_norm": 0.40704945372155243, "learning_rate": 8.429408551070875e-07, "loss": 0.2663, "step": 1458 }, { "epoch": 0.8012081274025261, "grad_norm": 0.5037992387479383, "learning_rate": 8.427294397964031e-07, "loss": 0.2623, "step": 1459 }, { "epoch": 0.8017572762218561, "grad_norm": 0.5787920228462682, "learning_rate": 8.425179088440326e-07, "loss": 0.2741, "step": 1460 }, { "epoch": 0.8023064250411862, "grad_norm": 0.5576306216976004, "learning_rate": 8.423062623213598e-07, "loss": 0.29, "step": 1461 }, { "epoch": 0.8028555738605162, "grad_norm": 0.5068193207052203, "learning_rate": 8.420945002998075e-07, "loss": 0.255, "step": 1462 }, { "epoch": 0.8034047226798462, "grad_norm": 0.39963592645745505, "learning_rate": 8.418826228508379e-07, "loss": 0.2752, "step": 1463 }, { "epoch": 0.8039538714991763, "grad_norm": 0.4626883793162904, "learning_rate": 8.416706300459514e-07, "loss": 0.2591, "step": 1464 }, { "epoch": 0.8045030203185063, "grad_norm": 0.4141047875203491, "learning_rate": 8.414585219566882e-07, "loss": 0.2785, "step": 1465 }, { "epoch": 0.8050521691378364, "grad_norm": 0.45701651764755485, "learning_rate": 8.412462986546268e-07, "loss": 0.264, "step": 1466 }, { "epoch": 0.8056013179571664, "grad_norm": 0.4885042421916514, "learning_rate": 8.410339602113845e-07, "loss": 0.3199, "step": 1467 }, { "epoch": 0.8061504667764964, "grad_norm": 0.4449814084236276, "learning_rate": 8.408215066986179e-07, "loss": 0.2687, "step": 1468 }, { "epoch": 0.8066996155958265, "grad_norm": 0.4126776323920407, "learning_rate": 8.406089381880224e-07, "loss": 0.271, "step": 1469 }, { "epoch": 0.8072487644151565, "grad_norm": 0.5622513520836145, "learning_rate": 8.403962547513319e-07, "loss": 0.3105, "step": 1470 }, { "epoch": 0.8077979132344866, "grad_norm": 0.69485160530137, "learning_rate": 8.40183456460319e-07, "loss": 0.2756, "step": 1471 }, { "epoch": 0.8083470620538166, "grad_norm": 0.5449552136015597, "learning_rate": 8.399705433867958e-07, "loss": 0.2963, "step": 1472 }, { "epoch": 0.8088962108731467, "grad_norm": 0.5221242893369673, "learning_rate": 8.39757515602612e-07, "loss": 0.328, "step": 1473 }, { "epoch": 0.8094453596924767, "grad_norm": 0.4725255648775804, "learning_rate": 8.395443731796571e-07, "loss": 0.2434, "step": 1474 }, { "epoch": 0.8099945085118067, "grad_norm": 0.6265059211400055, "learning_rate": 8.393311161898585e-07, "loss": 0.3046, "step": 1475 }, { "epoch": 0.8105436573311368, "grad_norm": 0.5923430603070216, "learning_rate": 8.391177447051829e-07, "loss": 0.32, "step": 1476 }, { "epoch": 0.8110928061504667, "grad_norm": 0.5038809553361293, "learning_rate": 8.389042587976352e-07, "loss": 0.2822, "step": 1477 }, { "epoch": 0.8116419549697969, "grad_norm": 0.47485516633956404, "learning_rate": 8.386906585392588e-07, "loss": 0.2382, "step": 1478 }, { "epoch": 0.8121911037891268, "grad_norm": 0.5509063112758457, "learning_rate": 8.38476944002136e-07, "loss": 0.295, "step": 1479 }, { "epoch": 0.8127402526084568, "grad_norm": 0.43549305118057363, "learning_rate": 8.382631152583877e-07, "loss": 0.2362, "step": 1480 }, { "epoch": 0.8132894014277869, "grad_norm": 0.6980314603753898, "learning_rate": 8.380491723801735e-07, "loss": 0.2978, "step": 1481 }, { "epoch": 0.8138385502471169, "grad_norm": 0.6456071749184628, "learning_rate": 8.378351154396906e-07, "loss": 0.2637, "step": 1482 }, { "epoch": 0.814387699066447, "grad_norm": 0.39628775126095983, "learning_rate": 8.37620944509176e-07, "loss": 0.2484, "step": 1483 }, { "epoch": 0.814936847885777, "grad_norm": 0.41786867787223875, "learning_rate": 8.37406659660904e-07, "loss": 0.26, "step": 1484 }, { "epoch": 0.8154859967051071, "grad_norm": 0.4721240881385976, "learning_rate": 8.371922609671877e-07, "loss": 0.2651, "step": 1485 }, { "epoch": 0.8160351455244371, "grad_norm": 0.6565824998450281, "learning_rate": 8.369777485003795e-07, "loss": 0.2691, "step": 1486 }, { "epoch": 0.8165842943437671, "grad_norm": 0.37212098655661985, "learning_rate": 8.367631223328688e-07, "loss": 0.2632, "step": 1487 }, { "epoch": 0.8171334431630972, "grad_norm": 0.5793290484177135, "learning_rate": 8.365483825370843e-07, "loss": 0.2453, "step": 1488 }, { "epoch": 0.8176825919824272, "grad_norm": 0.46560796272313215, "learning_rate": 8.363335291854928e-07, "loss": 0.2899, "step": 1489 }, { "epoch": 0.8182317408017573, "grad_norm": 0.5249878987308157, "learning_rate": 8.361185623505993e-07, "loss": 0.2969, "step": 1490 }, { "epoch": 0.8187808896210873, "grad_norm": 0.5477312781998844, "learning_rate": 8.359034821049471e-07, "loss": 0.2703, "step": 1491 }, { "epoch": 0.8193300384404174, "grad_norm": 0.4229478399076709, "learning_rate": 8.356882885211179e-07, "loss": 0.2568, "step": 1492 }, { "epoch": 0.8198791872597474, "grad_norm": 0.3989114869226699, "learning_rate": 8.354729816717319e-07, "loss": 0.3068, "step": 1493 }, { "epoch": 0.8204283360790774, "grad_norm": 0.3893644153432786, "learning_rate": 8.352575616294467e-07, "loss": 0.2478, "step": 1494 }, { "epoch": 0.8209774848984075, "grad_norm": 0.625194027542938, "learning_rate": 8.350420284669591e-07, "loss": 0.3414, "step": 1495 }, { "epoch": 0.8215266337177375, "grad_norm": 0.4856517126022426, "learning_rate": 8.348263822570034e-07, "loss": 0.2629, "step": 1496 }, { "epoch": 0.8220757825370676, "grad_norm": 0.567719728166833, "learning_rate": 8.346106230723523e-07, "loss": 0.2626, "step": 1497 }, { "epoch": 0.8226249313563976, "grad_norm": 0.46270203184098035, "learning_rate": 8.343947509858166e-07, "loss": 0.2897, "step": 1498 }, { "epoch": 0.8231740801757276, "grad_norm": 0.6255026255765539, "learning_rate": 8.341787660702448e-07, "loss": 0.278, "step": 1499 }, { "epoch": 0.8237232289950577, "grad_norm": 0.5472470169073899, "learning_rate": 8.339626683985244e-07, "loss": 0.261, "step": 1500 }, { "epoch": 0.8242723778143877, "grad_norm": 0.43252569961690646, "learning_rate": 8.337464580435802e-07, "loss": 0.3564, "step": 1501 }, { "epoch": 0.8248215266337178, "grad_norm": 0.4634700708502407, "learning_rate": 8.335301350783752e-07, "loss": 0.2852, "step": 1502 }, { "epoch": 0.8253706754530478, "grad_norm": 0.4341353948067242, "learning_rate": 8.333136995759105e-07, "loss": 0.2372, "step": 1503 }, { "epoch": 0.8259198242723779, "grad_norm": 0.45988440402985054, "learning_rate": 8.330971516092249e-07, "loss": 0.2538, "step": 1504 }, { "epoch": 0.8264689730917079, "grad_norm": 0.41219672711224953, "learning_rate": 8.328804912513956e-07, "loss": 0.2821, "step": 1505 }, { "epoch": 0.8270181219110379, "grad_norm": 0.4304928286185643, "learning_rate": 8.326637185755373e-07, "loss": 0.2555, "step": 1506 }, { "epoch": 0.827567270730368, "grad_norm": 0.4690696985243173, "learning_rate": 8.32446833654803e-07, "loss": 0.2904, "step": 1507 }, { "epoch": 0.828116419549698, "grad_norm": 0.5449421987324699, "learning_rate": 8.322298365623833e-07, "loss": 0.3081, "step": 1508 }, { "epoch": 0.828665568369028, "grad_norm": 0.6906409481835247, "learning_rate": 8.320127273715065e-07, "loss": 0.3215, "step": 1509 }, { "epoch": 0.829214717188358, "grad_norm": 0.4849914970257505, "learning_rate": 8.317955061554393e-07, "loss": 0.2745, "step": 1510 }, { "epoch": 0.829763866007688, "grad_norm": 0.4765773804767519, "learning_rate": 8.315781729874855e-07, "loss": 0.3207, "step": 1511 }, { "epoch": 0.8303130148270181, "grad_norm": 0.474877432271796, "learning_rate": 8.313607279409874e-07, "loss": 0.3053, "step": 1512 }, { "epoch": 0.8308621636463481, "grad_norm": 0.6175212247304712, "learning_rate": 8.311431710893244e-07, "loss": 0.2984, "step": 1513 }, { "epoch": 0.8314113124656782, "grad_norm": 0.5306669975291177, "learning_rate": 8.309255025059141e-07, "loss": 0.2681, "step": 1514 }, { "epoch": 0.8319604612850082, "grad_norm": 0.547468286168826, "learning_rate": 8.307077222642117e-07, "loss": 0.28, "step": 1515 }, { "epoch": 0.8325096101043383, "grad_norm": 0.5679207644320915, "learning_rate": 8.304898304377098e-07, "loss": 0.2703, "step": 1516 }, { "epoch": 0.8330587589236683, "grad_norm": 0.48076553384995413, "learning_rate": 8.302718270999388e-07, "loss": 0.2802, "step": 1517 }, { "epoch": 0.8336079077429983, "grad_norm": 0.4553089428815726, "learning_rate": 8.300537123244671e-07, "loss": 0.2929, "step": 1518 }, { "epoch": 0.8341570565623284, "grad_norm": 0.46295785861732336, "learning_rate": 8.298354861849003e-07, "loss": 0.2609, "step": 1519 }, { "epoch": 0.8347062053816584, "grad_norm": 0.5048380294612951, "learning_rate": 8.296171487548814e-07, "loss": 0.2666, "step": 1520 }, { "epoch": 0.8352553542009885, "grad_norm": 0.4741662351274767, "learning_rate": 8.293987001080917e-07, "loss": 0.2533, "step": 1521 }, { "epoch": 0.8358045030203185, "grad_norm": 0.7822656827612668, "learning_rate": 8.291801403182492e-07, "loss": 0.2639, "step": 1522 }, { "epoch": 0.8363536518396485, "grad_norm": 0.47538701149401524, "learning_rate": 8.2896146945911e-07, "loss": 0.2416, "step": 1523 }, { "epoch": 0.8369028006589786, "grad_norm": 0.431756000720763, "learning_rate": 8.287426876044673e-07, "loss": 0.2811, "step": 1524 }, { "epoch": 0.8374519494783086, "grad_norm": 0.40932839690669437, "learning_rate": 8.28523794828152e-07, "loss": 0.2658, "step": 1525 }, { "epoch": 0.8380010982976387, "grad_norm": 0.6878260459206037, "learning_rate": 8.283047912040322e-07, "loss": 0.2394, "step": 1526 }, { "epoch": 0.8385502471169687, "grad_norm": 0.7192636381792953, "learning_rate": 8.280856768060138e-07, "loss": 0.3324, "step": 1527 }, { "epoch": 0.8390993959362988, "grad_norm": 0.4480962450712991, "learning_rate": 8.278664517080397e-07, "loss": 0.2938, "step": 1528 }, { "epoch": 0.8396485447556288, "grad_norm": 0.6394549253969104, "learning_rate": 8.276471159840903e-07, "loss": 0.3068, "step": 1529 }, { "epoch": 0.8401976935749588, "grad_norm": 0.5232886697449434, "learning_rate": 8.274276697081837e-07, "loss": 0.306, "step": 1530 }, { "epoch": 0.8407468423942889, "grad_norm": 0.5367383002514323, "learning_rate": 8.27208112954374e-07, "loss": 0.281, "step": 1531 }, { "epoch": 0.8412959912136189, "grad_norm": 0.5399608090313538, "learning_rate": 8.269884457967544e-07, "loss": 0.2736, "step": 1532 }, { "epoch": 0.841845140032949, "grad_norm": 0.5448170871645851, "learning_rate": 8.267686683094542e-07, "loss": 0.2645, "step": 1533 }, { "epoch": 0.842394288852279, "grad_norm": 0.4770975157211897, "learning_rate": 8.265487805666401e-07, "loss": 0.2702, "step": 1534 }, { "epoch": 0.842943437671609, "grad_norm": 0.41030774560154143, "learning_rate": 8.263287826425163e-07, "loss": 0.3042, "step": 1535 }, { "epoch": 0.8434925864909391, "grad_norm": 0.4883096396011661, "learning_rate": 8.261086746113236e-07, "loss": 0.2332, "step": 1536 }, { "epoch": 0.844041735310269, "grad_norm": 0.3991964499511617, "learning_rate": 8.258884565473409e-07, "loss": 0.2564, "step": 1537 }, { "epoch": 0.8445908841295992, "grad_norm": 0.47163009935174277, "learning_rate": 8.256681285248832e-07, "loss": 0.2568, "step": 1538 }, { "epoch": 0.8451400329489291, "grad_norm": 0.5429149138226584, "learning_rate": 8.254476906183034e-07, "loss": 0.2817, "step": 1539 }, { "epoch": 0.8456891817682592, "grad_norm": 0.4356510363035574, "learning_rate": 8.252271429019911e-07, "loss": 0.2688, "step": 1540 }, { "epoch": 0.8462383305875892, "grad_norm": 0.439999350761338, "learning_rate": 8.250064854503731e-07, "loss": 0.2795, "step": 1541 }, { "epoch": 0.8467874794069192, "grad_norm": 0.44638633016895746, "learning_rate": 8.247857183379129e-07, "loss": 0.2866, "step": 1542 }, { "epoch": 0.8473366282262493, "grad_norm": 0.45874440539186045, "learning_rate": 8.245648416391115e-07, "loss": 0.2513, "step": 1543 }, { "epoch": 0.8478857770455793, "grad_norm": 0.4942048701143511, "learning_rate": 8.243438554285066e-07, "loss": 0.2914, "step": 1544 }, { "epoch": 0.8484349258649094, "grad_norm": 0.6485254391068528, "learning_rate": 8.241227597806729e-07, "loss": 0.2864, "step": 1545 }, { "epoch": 0.8489840746842394, "grad_norm": 0.44552925725187204, "learning_rate": 8.239015547702221e-07, "loss": 0.2774, "step": 1546 }, { "epoch": 0.8495332235035694, "grad_norm": 0.4177584816686149, "learning_rate": 8.236802404718024e-07, "loss": 0.2588, "step": 1547 }, { "epoch": 0.8500823723228995, "grad_norm": 0.5156423265438955, "learning_rate": 8.234588169600996e-07, "loss": 0.3068, "step": 1548 }, { "epoch": 0.8506315211422295, "grad_norm": 0.5431465932616798, "learning_rate": 8.232372843098359e-07, "loss": 0.2764, "step": 1549 }, { "epoch": 0.8511806699615596, "grad_norm": 0.6826306710079628, "learning_rate": 8.230156425957702e-07, "loss": 0.3305, "step": 1550 }, { "epoch": 0.8517298187808896, "grad_norm": 0.4120972365696, "learning_rate": 8.227938918926989e-07, "loss": 0.2594, "step": 1551 }, { "epoch": 0.8522789676002197, "grad_norm": 0.7054920171237341, "learning_rate": 8.225720322754542e-07, "loss": 0.2554, "step": 1552 }, { "epoch": 0.8528281164195497, "grad_norm": 0.39428553771727576, "learning_rate": 8.223500638189058e-07, "loss": 0.2572, "step": 1553 }, { "epoch": 0.8533772652388797, "grad_norm": 0.45096872189736265, "learning_rate": 8.221279865979597e-07, "loss": 0.282, "step": 1554 }, { "epoch": 0.8539264140582098, "grad_norm": 0.5198560645426167, "learning_rate": 8.21905800687559e-07, "loss": 0.2628, "step": 1555 }, { "epoch": 0.8544755628775398, "grad_norm": 0.5615601958452272, "learning_rate": 8.21683506162683e-07, "loss": 0.2633, "step": 1556 }, { "epoch": 0.8550247116968699, "grad_norm": 0.4267297415548566, "learning_rate": 8.214611030983483e-07, "loss": 0.2517, "step": 1557 }, { "epoch": 0.8555738605161999, "grad_norm": 0.5119102447203505, "learning_rate": 8.212385915696072e-07, "loss": 0.2737, "step": 1558 }, { "epoch": 0.85612300933553, "grad_norm": 0.4772402963106916, "learning_rate": 8.210159716515495e-07, "loss": 0.2641, "step": 1559 }, { "epoch": 0.85667215815486, "grad_norm": 0.4723282602362927, "learning_rate": 8.207932434193012e-07, "loss": 0.2744, "step": 1560 }, { "epoch": 0.85722130697419, "grad_norm": 0.5247984990965343, "learning_rate": 8.205704069480249e-07, "loss": 0.2976, "step": 1561 }, { "epoch": 0.8577704557935201, "grad_norm": 0.434901082740237, "learning_rate": 8.203474623129195e-07, "loss": 0.2678, "step": 1562 }, { "epoch": 0.8583196046128501, "grad_norm": 0.4559973411833468, "learning_rate": 8.201244095892209e-07, "loss": 0.2688, "step": 1563 }, { "epoch": 0.8588687534321802, "grad_norm": 0.4231064043390817, "learning_rate": 8.199012488522009e-07, "loss": 0.245, "step": 1564 }, { "epoch": 0.8594179022515102, "grad_norm": 0.47218772093751676, "learning_rate": 8.196779801771681e-07, "loss": 0.2673, "step": 1565 }, { "epoch": 0.8599670510708401, "grad_norm": 0.5520503565540403, "learning_rate": 8.194546036394674e-07, "loss": 0.2789, "step": 1566 }, { "epoch": 0.8605161998901703, "grad_norm": 0.37498723691525676, "learning_rate": 8.192311193144804e-07, "loss": 0.2733, "step": 1567 }, { "epoch": 0.8610653487095002, "grad_norm": 0.5149189703656774, "learning_rate": 8.190075272776248e-07, "loss": 0.277, "step": 1568 }, { "epoch": 0.8616144975288303, "grad_norm": 0.45090773049821525, "learning_rate": 8.187838276043543e-07, "loss": 0.2713, "step": 1569 }, { "epoch": 0.8621636463481603, "grad_norm": 0.4617050207480255, "learning_rate": 8.185600203701596e-07, "loss": 0.2529, "step": 1570 }, { "epoch": 0.8627127951674904, "grad_norm": 0.43678266883374833, "learning_rate": 8.183361056505673e-07, "loss": 0.2147, "step": 1571 }, { "epoch": 0.8632619439868204, "grad_norm": 0.6051509419557817, "learning_rate": 8.181120835211405e-07, "loss": 0.3373, "step": 1572 }, { "epoch": 0.8638110928061504, "grad_norm": 0.5152957267725544, "learning_rate": 8.178879540574782e-07, "loss": 0.264, "step": 1573 }, { "epoch": 0.8643602416254805, "grad_norm": 0.4642906848030533, "learning_rate": 8.176637173352161e-07, "loss": 0.2868, "step": 1574 }, { "epoch": 0.8649093904448105, "grad_norm": 0.40258635278783617, "learning_rate": 8.174393734300257e-07, "loss": 0.3055, "step": 1575 }, { "epoch": 0.8654585392641406, "grad_norm": 0.48691420533764, "learning_rate": 8.172149224176146e-07, "loss": 0.2804, "step": 1576 }, { "epoch": 0.8660076880834706, "grad_norm": 0.4105792581508396, "learning_rate": 8.169903643737269e-07, "loss": 0.2602, "step": 1577 }, { "epoch": 0.8665568369028006, "grad_norm": 0.3806674382835567, "learning_rate": 8.167656993741429e-07, "loss": 0.2679, "step": 1578 }, { "epoch": 0.8671059857221307, "grad_norm": 0.5759802343499267, "learning_rate": 8.165409274946785e-07, "loss": 0.2872, "step": 1579 }, { "epoch": 0.8676551345414607, "grad_norm": 0.41258408078017383, "learning_rate": 8.16316048811186e-07, "loss": 0.2541, "step": 1580 }, { "epoch": 0.8682042833607908, "grad_norm": 0.5494199768508434, "learning_rate": 8.160910633995537e-07, "loss": 0.2628, "step": 1581 }, { "epoch": 0.8687534321801208, "grad_norm": 0.4842256746977313, "learning_rate": 8.158659713357057e-07, "loss": 0.2945, "step": 1582 }, { "epoch": 0.8693025809994509, "grad_norm": 0.5161051509926529, "learning_rate": 8.156407726956027e-07, "loss": 0.3009, "step": 1583 }, { "epoch": 0.8698517298187809, "grad_norm": 0.6009319973252065, "learning_rate": 8.154154675552405e-07, "loss": 0.2802, "step": 1584 }, { "epoch": 0.8704008786381109, "grad_norm": 0.5254518705648258, "learning_rate": 8.151900559906515e-07, "loss": 0.2651, "step": 1585 }, { "epoch": 0.870950027457441, "grad_norm": 0.4269060890893298, "learning_rate": 8.149645380779037e-07, "loss": 0.2699, "step": 1586 }, { "epoch": 0.871499176276771, "grad_norm": 0.5526083810148167, "learning_rate": 8.147389138931011e-07, "loss": 0.3272, "step": 1587 }, { "epoch": 0.8720483250961011, "grad_norm": 0.4201904199832163, "learning_rate": 8.145131835123837e-07, "loss": 0.2719, "step": 1588 }, { "epoch": 0.8725974739154311, "grad_norm": 0.4512924499577039, "learning_rate": 8.14287347011927e-07, "loss": 0.2894, "step": 1589 }, { "epoch": 0.8731466227347611, "grad_norm": 0.5214547859034492, "learning_rate": 8.140614044679426e-07, "loss": 0.2892, "step": 1590 }, { "epoch": 0.8736957715540912, "grad_norm": 0.6194074355365053, "learning_rate": 8.138353559566779e-07, "loss": 0.2898, "step": 1591 }, { "epoch": 0.8742449203734212, "grad_norm": 0.5428445392822564, "learning_rate": 8.136092015544158e-07, "loss": 0.2935, "step": 1592 }, { "epoch": 0.8747940691927513, "grad_norm": 0.4094756190013001, "learning_rate": 8.133829413374749e-07, "loss": 0.2351, "step": 1593 }, { "epoch": 0.8753432180120813, "grad_norm": 0.669719460405006, "learning_rate": 8.131565753822101e-07, "loss": 0.3262, "step": 1594 }, { "epoch": 0.8758923668314114, "grad_norm": 0.5394528055741196, "learning_rate": 8.129301037650113e-07, "loss": 0.2798, "step": 1595 }, { "epoch": 0.8764415156507414, "grad_norm": 0.5481488572312055, "learning_rate": 8.127035265623042e-07, "loss": 0.2445, "step": 1596 }, { "epoch": 0.8769906644700713, "grad_norm": 0.4536830212420526, "learning_rate": 8.124768438505506e-07, "loss": 0.254, "step": 1597 }, { "epoch": 0.8775398132894014, "grad_norm": 0.461279718781452, "learning_rate": 8.122500557062474e-07, "loss": 0.2693, "step": 1598 }, { "epoch": 0.8780889621087314, "grad_norm": 0.4768690733712926, "learning_rate": 8.12023162205927e-07, "loss": 0.2621, "step": 1599 }, { "epoch": 0.8786381109280615, "grad_norm": 0.5067142684422133, "learning_rate": 8.117961634261582e-07, "loss": 0.2927, "step": 1600 }, { "epoch": 0.8786381109280615, "eval_loss": 0.3504696190357208, "eval_runtime": 18.5691, "eval_samples_per_second": 23.857, "eval_steps_per_second": 1.023, "step": 1600 }, { "epoch": 0.8791872597473915, "grad_norm": 0.5304329469767416, "learning_rate": 8.115690594435441e-07, "loss": 0.2862, "step": 1601 }, { "epoch": 0.8797364085667215, "grad_norm": 0.4200685970405138, "learning_rate": 8.113418503347243e-07, "loss": 0.219, "step": 1602 }, { "epoch": 0.8802855573860516, "grad_norm": 0.5422488027845013, "learning_rate": 8.111145361763734e-07, "loss": 0.2781, "step": 1603 }, { "epoch": 0.8808347062053816, "grad_norm": 0.5529610412765624, "learning_rate": 8.108871170452015e-07, "loss": 0.238, "step": 1604 }, { "epoch": 0.8813838550247117, "grad_norm": 0.4069361697324099, "learning_rate": 8.106595930179541e-07, "loss": 0.2576, "step": 1605 }, { "epoch": 0.8819330038440417, "grad_norm": 0.3989512929777735, "learning_rate": 8.104319641714126e-07, "loss": 0.2547, "step": 1606 }, { "epoch": 0.8824821526633718, "grad_norm": 0.48605701233827703, "learning_rate": 8.102042305823928e-07, "loss": 0.2427, "step": 1607 }, { "epoch": 0.8830313014827018, "grad_norm": 0.5582006189848949, "learning_rate": 8.099763923277469e-07, "loss": 0.3268, "step": 1608 }, { "epoch": 0.8835804503020318, "grad_norm": 0.47369094368169923, "learning_rate": 8.097484494843616e-07, "loss": 0.2925, "step": 1609 }, { "epoch": 0.8841295991213619, "grad_norm": 0.4348971617335604, "learning_rate": 8.09520402129159e-07, "loss": 0.3077, "step": 1610 }, { "epoch": 0.8846787479406919, "grad_norm": 0.5196076776534113, "learning_rate": 8.092922503390972e-07, "loss": 0.2912, "step": 1611 }, { "epoch": 0.885227896760022, "grad_norm": 0.6551376831089732, "learning_rate": 8.090639941911689e-07, "loss": 0.2991, "step": 1612 }, { "epoch": 0.885777045579352, "grad_norm": 0.43678167140136764, "learning_rate": 8.088356337624017e-07, "loss": 0.2673, "step": 1613 }, { "epoch": 0.886326194398682, "grad_norm": 0.6194670908851351, "learning_rate": 8.086071691298594e-07, "loss": 0.3134, "step": 1614 }, { "epoch": 0.8868753432180121, "grad_norm": 0.4830895566863416, "learning_rate": 8.083786003706402e-07, "loss": 0.2639, "step": 1615 }, { "epoch": 0.8874244920373421, "grad_norm": 0.38504093989091165, "learning_rate": 8.081499275618774e-07, "loss": 0.2847, "step": 1616 }, { "epoch": 0.8879736408566722, "grad_norm": 0.47735204318074786, "learning_rate": 8.079211507807399e-07, "loss": 0.2666, "step": 1617 }, { "epoch": 0.8885227896760022, "grad_norm": 0.4310478062500324, "learning_rate": 8.076922701044314e-07, "loss": 0.2595, "step": 1618 }, { "epoch": 0.8890719384953323, "grad_norm": 0.426374728954958, "learning_rate": 8.074632856101905e-07, "loss": 0.2852, "step": 1619 }, { "epoch": 0.8896210873146623, "grad_norm": 0.4297271659043643, "learning_rate": 8.072341973752914e-07, "loss": 0.2381, "step": 1620 }, { "epoch": 0.8901702361339923, "grad_norm": 0.4319160129566455, "learning_rate": 8.070050054770427e-07, "loss": 0.2623, "step": 1621 }, { "epoch": 0.8907193849533224, "grad_norm": 0.4582320243173229, "learning_rate": 8.067757099927881e-07, "loss": 0.3038, "step": 1622 }, { "epoch": 0.8912685337726524, "grad_norm": 0.37868902040646024, "learning_rate": 8.065463109999068e-07, "loss": 0.2919, "step": 1623 }, { "epoch": 0.8918176825919825, "grad_norm": 0.4519912345460703, "learning_rate": 8.063168085758121e-07, "loss": 0.256, "step": 1624 }, { "epoch": 0.8923668314113125, "grad_norm": 0.49305590228844964, "learning_rate": 8.060872027979527e-07, "loss": 0.2513, "step": 1625 }, { "epoch": 0.8929159802306426, "grad_norm": 0.4719066298778507, "learning_rate": 8.058574937438123e-07, "loss": 0.264, "step": 1626 }, { "epoch": 0.8934651290499726, "grad_norm": 0.5336147730971229, "learning_rate": 8.056276814909091e-07, "loss": 0.2606, "step": 1627 }, { "epoch": 0.8940142778693025, "grad_norm": 0.5464721736828236, "learning_rate": 8.053977661167961e-07, "loss": 0.2509, "step": 1628 }, { "epoch": 0.8945634266886326, "grad_norm": 0.5118130759243972, "learning_rate": 8.051677476990616e-07, "loss": 0.2685, "step": 1629 }, { "epoch": 0.8951125755079626, "grad_norm": 0.4319907093704611, "learning_rate": 8.04937626315328e-07, "loss": 0.2578, "step": 1630 }, { "epoch": 0.8956617243272927, "grad_norm": 0.4372698982339803, "learning_rate": 8.047074020432532e-07, "loss": 0.2677, "step": 1631 }, { "epoch": 0.8962108731466227, "grad_norm": 0.43375702875372335, "learning_rate": 8.044770749605289e-07, "loss": 0.2614, "step": 1632 }, { "epoch": 0.8967600219659527, "grad_norm": 0.44766386955005877, "learning_rate": 8.042466451448824e-07, "loss": 0.2747, "step": 1633 }, { "epoch": 0.8973091707852828, "grad_norm": 0.49185543197181786, "learning_rate": 8.040161126740752e-07, "loss": 0.2757, "step": 1634 }, { "epoch": 0.8978583196046128, "grad_norm": 0.47647185034478784, "learning_rate": 8.037854776259034e-07, "loss": 0.3097, "step": 1635 }, { "epoch": 0.8984074684239429, "grad_norm": 0.6229743998079772, "learning_rate": 8.035547400781979e-07, "loss": 0.2361, "step": 1636 }, { "epoch": 0.8989566172432729, "grad_norm": 0.5905638636785615, "learning_rate": 8.033239001088241e-07, "loss": 0.3194, "step": 1637 }, { "epoch": 0.899505766062603, "grad_norm": 0.456828043325492, "learning_rate": 8.030929577956821e-07, "loss": 0.2507, "step": 1638 }, { "epoch": 0.900054914881933, "grad_norm": 0.5360359258999062, "learning_rate": 8.028619132167063e-07, "loss": 0.3045, "step": 1639 }, { "epoch": 0.900604063701263, "grad_norm": 0.5143411214612951, "learning_rate": 8.026307664498657e-07, "loss": 0.2654, "step": 1640 }, { "epoch": 0.9011532125205931, "grad_norm": 0.538470184145661, "learning_rate": 8.023995175731638e-07, "loss": 0.2786, "step": 1641 }, { "epoch": 0.9017023613399231, "grad_norm": 0.4591072509050711, "learning_rate": 8.02168166664639e-07, "loss": 0.2398, "step": 1642 }, { "epoch": 0.9022515101592532, "grad_norm": 0.5138710450347816, "learning_rate": 8.01936713802363e-07, "loss": 0.2438, "step": 1643 }, { "epoch": 0.9028006589785832, "grad_norm": 0.4142712971863248, "learning_rate": 8.017051590644431e-07, "loss": 0.2776, "step": 1644 }, { "epoch": 0.9033498077979132, "grad_norm": 0.5144798657168941, "learning_rate": 8.014735025290202e-07, "loss": 0.2512, "step": 1645 }, { "epoch": 0.9038989566172433, "grad_norm": 0.4515667087998328, "learning_rate": 8.012417442742703e-07, "loss": 0.2444, "step": 1646 }, { "epoch": 0.9044481054365733, "grad_norm": 0.4855148714739395, "learning_rate": 8.010098843784028e-07, "loss": 0.2548, "step": 1647 }, { "epoch": 0.9049972542559034, "grad_norm": 0.47912310899290783, "learning_rate": 8.007779229196622e-07, "loss": 0.2629, "step": 1648 }, { "epoch": 0.9055464030752334, "grad_norm": 0.5129694422052712, "learning_rate": 8.005458599763267e-07, "loss": 0.2736, "step": 1649 }, { "epoch": 0.9060955518945635, "grad_norm": 0.43532731670312247, "learning_rate": 8.003136956267091e-07, "loss": 0.2608, "step": 1650 }, { "epoch": 0.9066447007138935, "grad_norm": 0.5285884742744585, "learning_rate": 8.000814299491565e-07, "loss": 0.3026, "step": 1651 }, { "epoch": 0.9071938495332235, "grad_norm": 0.516471533962854, "learning_rate": 7.998490630220497e-07, "loss": 0.2711, "step": 1652 }, { "epoch": 0.9077429983525536, "grad_norm": 0.43593238158953923, "learning_rate": 7.996165949238041e-07, "loss": 0.2671, "step": 1653 }, { "epoch": 0.9082921471718836, "grad_norm": 0.4467082812614068, "learning_rate": 7.99384025732869e-07, "loss": 0.3158, "step": 1654 }, { "epoch": 0.9088412959912137, "grad_norm": 0.9106844683253277, "learning_rate": 7.991513555277282e-07, "loss": 0.4324, "step": 1655 }, { "epoch": 0.9093904448105437, "grad_norm": 0.47262221420614964, "learning_rate": 7.989185843868993e-07, "loss": 0.2886, "step": 1656 }, { "epoch": 0.9099395936298736, "grad_norm": 0.5029359625093429, "learning_rate": 7.986857123889336e-07, "loss": 0.2684, "step": 1657 }, { "epoch": 0.9104887424492037, "grad_norm": 0.608385988374047, "learning_rate": 7.984527396124174e-07, "loss": 0.2503, "step": 1658 }, { "epoch": 0.9110378912685337, "grad_norm": 0.4238556930706127, "learning_rate": 7.982196661359698e-07, "loss": 0.2746, "step": 1659 }, { "epoch": 0.9115870400878638, "grad_norm": 0.42403113424925926, "learning_rate": 7.979864920382449e-07, "loss": 0.2727, "step": 1660 }, { "epoch": 0.9121361889071938, "grad_norm": 0.439014572595048, "learning_rate": 7.977532173979303e-07, "loss": 0.2696, "step": 1661 }, { "epoch": 0.9126853377265239, "grad_norm": 0.45385586171833314, "learning_rate": 7.975198422937477e-07, "loss": 0.2512, "step": 1662 }, { "epoch": 0.9132344865458539, "grad_norm": 0.6168347452883322, "learning_rate": 7.972863668044524e-07, "loss": 0.2595, "step": 1663 }, { "epoch": 0.9137836353651839, "grad_norm": 0.6654375343851714, "learning_rate": 7.970527910088338e-07, "loss": 0.2848, "step": 1664 }, { "epoch": 0.914332784184514, "grad_norm": 0.5236618734783829, "learning_rate": 7.968191149857152e-07, "loss": 0.2408, "step": 1665 }, { "epoch": 0.914881933003844, "grad_norm": 0.5161043973919437, "learning_rate": 7.965853388139539e-07, "loss": 0.2606, "step": 1666 }, { "epoch": 0.9154310818231741, "grad_norm": 0.5369182393620026, "learning_rate": 7.963514625724402e-07, "loss": 0.3086, "step": 1667 }, { "epoch": 0.9159802306425041, "grad_norm": 0.4409827118930272, "learning_rate": 7.96117486340099e-07, "loss": 0.2603, "step": 1668 }, { "epoch": 0.9165293794618341, "grad_norm": 0.44597728661676106, "learning_rate": 7.958834101958888e-07, "loss": 0.2326, "step": 1669 }, { "epoch": 0.9170785282811642, "grad_norm": 0.5629941370524331, "learning_rate": 7.956492342188015e-07, "loss": 0.2472, "step": 1670 }, { "epoch": 0.9176276771004942, "grad_norm": 0.4062052252940199, "learning_rate": 7.954149584878628e-07, "loss": 0.2827, "step": 1671 }, { "epoch": 0.9181768259198243, "grad_norm": 0.6324067401399381, "learning_rate": 7.951805830821323e-07, "loss": 0.2762, "step": 1672 }, { "epoch": 0.9187259747391543, "grad_norm": 0.49222416226976573, "learning_rate": 7.94946108080703e-07, "loss": 0.2779, "step": 1673 }, { "epoch": 0.9192751235584844, "grad_norm": 0.5447677537696793, "learning_rate": 7.947115335627017e-07, "loss": 0.259, "step": 1674 }, { "epoch": 0.9198242723778144, "grad_norm": 0.7217111109642705, "learning_rate": 7.944768596072884e-07, "loss": 0.3151, "step": 1675 }, { "epoch": 0.9203734211971444, "grad_norm": 0.45892564156643595, "learning_rate": 7.942420862936569e-07, "loss": 0.2401, "step": 1676 }, { "epoch": 0.9209225700164745, "grad_norm": 0.43348993272683223, "learning_rate": 7.940072137010348e-07, "loss": 0.2601, "step": 1677 }, { "epoch": 0.9214717188358045, "grad_norm": 0.4819214666969866, "learning_rate": 7.937722419086829e-07, "loss": 0.273, "step": 1678 }, { "epoch": 0.9220208676551346, "grad_norm": 0.5343198814948842, "learning_rate": 7.935371709958953e-07, "loss": 0.2921, "step": 1679 }, { "epoch": 0.9225700164744646, "grad_norm": 0.434012804711229, "learning_rate": 7.933020010420001e-07, "loss": 0.3039, "step": 1680 }, { "epoch": 0.9231191652937946, "grad_norm": 0.4982008110946288, "learning_rate": 7.930667321263583e-07, "loss": 0.3365, "step": 1681 }, { "epoch": 0.9236683141131247, "grad_norm": 0.4736340382316756, "learning_rate": 7.928313643283644e-07, "loss": 0.2361, "step": 1682 }, { "epoch": 0.9242174629324547, "grad_norm": 0.5454444242360594, "learning_rate": 7.925958977274464e-07, "loss": 0.2323, "step": 1683 }, { "epoch": 0.9247666117517848, "grad_norm": 0.47791707385936805, "learning_rate": 7.923603324030658e-07, "loss": 0.2589, "step": 1684 }, { "epoch": 0.9253157605711148, "grad_norm": 0.5211350113278819, "learning_rate": 7.92124668434717e-07, "loss": 0.3068, "step": 1685 }, { "epoch": 0.9258649093904449, "grad_norm": 0.4796579128979127, "learning_rate": 7.918889059019283e-07, "loss": 0.2769, "step": 1686 }, { "epoch": 0.9264140582097748, "grad_norm": 0.49232718745012527, "learning_rate": 7.916530448842604e-07, "loss": 0.2439, "step": 1687 }, { "epoch": 0.9269632070291048, "grad_norm": 0.4558609718678011, "learning_rate": 7.914170854613076e-07, "loss": 0.2695, "step": 1688 }, { "epoch": 0.9275123558484349, "grad_norm": 0.4682236398006378, "learning_rate": 7.911810277126981e-07, "loss": 0.3055, "step": 1689 }, { "epoch": 0.9280615046677649, "grad_norm": 0.5488883413713245, "learning_rate": 7.909448717180924e-07, "loss": 0.2791, "step": 1690 }, { "epoch": 0.928610653487095, "grad_norm": 0.6152742935623089, "learning_rate": 7.907086175571841e-07, "loss": 0.3109, "step": 1691 }, { "epoch": 0.929159802306425, "grad_norm": 0.47479565309022026, "learning_rate": 7.90472265309701e-07, "loss": 0.2983, "step": 1692 }, { "epoch": 0.9297089511257551, "grad_norm": 0.49437316447878904, "learning_rate": 7.902358150554027e-07, "loss": 0.246, "step": 1693 }, { "epoch": 0.9302580999450851, "grad_norm": 0.5376412305321454, "learning_rate": 7.899992668740826e-07, "loss": 0.3165, "step": 1694 }, { "epoch": 0.9308072487644151, "grad_norm": 0.5358008268830613, "learning_rate": 7.89762620845567e-07, "loss": 0.2645, "step": 1695 }, { "epoch": 0.9313563975837452, "grad_norm": 0.45118466846144795, "learning_rate": 7.895258770497154e-07, "loss": 0.2682, "step": 1696 }, { "epoch": 0.9319055464030752, "grad_norm": 0.5208255452849432, "learning_rate": 7.892890355664199e-07, "loss": 0.2207, "step": 1697 }, { "epoch": 0.9324546952224053, "grad_norm": 0.4731153441278961, "learning_rate": 7.890520964756058e-07, "loss": 0.2527, "step": 1698 }, { "epoch": 0.9330038440417353, "grad_norm": 0.4575028765301887, "learning_rate": 7.888150598572311e-07, "loss": 0.2402, "step": 1699 }, { "epoch": 0.9335529928610653, "grad_norm": 0.47803200976429167, "learning_rate": 7.885779257912876e-07, "loss": 0.2648, "step": 1700 }, { "epoch": 0.9341021416803954, "grad_norm": 0.5246748879091071, "learning_rate": 7.883406943577985e-07, "loss": 0.2737, "step": 1701 }, { "epoch": 0.9346512904997254, "grad_norm": 0.5588355495947951, "learning_rate": 7.881033656368212e-07, "loss": 0.252, "step": 1702 }, { "epoch": 0.9352004393190555, "grad_norm": 0.5183516008396057, "learning_rate": 7.878659397084453e-07, "loss": 0.3132, "step": 1703 }, { "epoch": 0.9357495881383855, "grad_norm": 0.4042180592091759, "learning_rate": 7.876284166527931e-07, "loss": 0.2299, "step": 1704 }, { "epoch": 0.9362987369577156, "grad_norm": 0.6813981873075216, "learning_rate": 7.873907965500201e-07, "loss": 0.2534, "step": 1705 }, { "epoch": 0.9368478857770456, "grad_norm": 0.4490906623651215, "learning_rate": 7.871530794803144e-07, "loss": 0.2608, "step": 1706 }, { "epoch": 0.9373970345963756, "grad_norm": 0.41357000685914597, "learning_rate": 7.869152655238965e-07, "loss": 0.2657, "step": 1707 }, { "epoch": 0.9379461834157057, "grad_norm": 0.5447434118355732, "learning_rate": 7.8667735476102e-07, "loss": 0.2722, "step": 1708 }, { "epoch": 0.9384953322350357, "grad_norm": 0.4530362436525533, "learning_rate": 7.86439347271971e-07, "loss": 0.2605, "step": 1709 }, { "epoch": 0.9390444810543658, "grad_norm": 0.491906698727782, "learning_rate": 7.862012431370681e-07, "loss": 0.2476, "step": 1710 }, { "epoch": 0.9395936298736958, "grad_norm": 0.39280246518759476, "learning_rate": 7.859630424366628e-07, "loss": 0.2809, "step": 1711 }, { "epoch": 0.9401427786930258, "grad_norm": 0.4930803786947981, "learning_rate": 7.857247452511393e-07, "loss": 0.2702, "step": 1712 }, { "epoch": 0.9406919275123559, "grad_norm": 0.5755809098505037, "learning_rate": 7.854863516609137e-07, "loss": 0.2313, "step": 1713 }, { "epoch": 0.9412410763316859, "grad_norm": 0.5049170443148283, "learning_rate": 7.852478617464354e-07, "loss": 0.2807, "step": 1714 }, { "epoch": 0.941790225151016, "grad_norm": 0.4925496057542274, "learning_rate": 7.850092755881855e-07, "loss": 0.2496, "step": 1715 }, { "epoch": 0.942339373970346, "grad_norm": 0.5374667297046075, "learning_rate": 7.847705932666786e-07, "loss": 0.2715, "step": 1716 }, { "epoch": 0.942888522789676, "grad_norm": 0.4641633013192042, "learning_rate": 7.84531814862461e-07, "loss": 0.3116, "step": 1717 }, { "epoch": 0.943437671609006, "grad_norm": 0.5050485118505725, "learning_rate": 7.842929404561114e-07, "loss": 0.2595, "step": 1718 }, { "epoch": 0.943986820428336, "grad_norm": 0.5580960097482179, "learning_rate": 7.840539701282412e-07, "loss": 0.2631, "step": 1719 }, { "epoch": 0.9445359692476661, "grad_norm": 0.5289379181835928, "learning_rate": 7.838149039594943e-07, "loss": 0.2737, "step": 1720 }, { "epoch": 0.9450851180669961, "grad_norm": 0.5371059822933462, "learning_rate": 7.835757420305465e-07, "loss": 0.3384, "step": 1721 }, { "epoch": 0.9456342668863262, "grad_norm": 0.40728473281221633, "learning_rate": 7.833364844221065e-07, "loss": 0.2973, "step": 1722 }, { "epoch": 0.9461834157056562, "grad_norm": 0.4919831080277447, "learning_rate": 7.830971312149143e-07, "loss": 0.2784, "step": 1723 }, { "epoch": 0.9467325645249862, "grad_norm": 0.4836977422329483, "learning_rate": 7.828576824897431e-07, "loss": 0.2543, "step": 1724 }, { "epoch": 0.9472817133443163, "grad_norm": 0.437616976352026, "learning_rate": 7.826181383273982e-07, "loss": 0.2612, "step": 1725 }, { "epoch": 0.9478308621636463, "grad_norm": 0.45875893844486315, "learning_rate": 7.823784988087166e-07, "loss": 0.2609, "step": 1726 }, { "epoch": 0.9483800109829764, "grad_norm": 0.5033621962179399, "learning_rate": 7.821387640145682e-07, "loss": 0.2758, "step": 1727 }, { "epoch": 0.9489291598023064, "grad_norm": 0.4257649240334981, "learning_rate": 7.818989340258543e-07, "loss": 0.2538, "step": 1728 }, { "epoch": 0.9494783086216365, "grad_norm": 0.42844787389189803, "learning_rate": 7.81659008923509e-07, "loss": 0.2584, "step": 1729 }, { "epoch": 0.9500274574409665, "grad_norm": 0.5846160464916133, "learning_rate": 7.81418988788498e-07, "loss": 0.27, "step": 1730 }, { "epoch": 0.9505766062602965, "grad_norm": 0.43208504923518043, "learning_rate": 7.811788737018192e-07, "loss": 0.2354, "step": 1731 }, { "epoch": 0.9511257550796266, "grad_norm": 0.44715498190261227, "learning_rate": 7.809386637445025e-07, "loss": 0.2667, "step": 1732 }, { "epoch": 0.9516749038989566, "grad_norm": 0.47207132803701624, "learning_rate": 7.806983589976103e-07, "loss": 0.2686, "step": 1733 }, { "epoch": 0.9522240527182867, "grad_norm": 0.4787218670132171, "learning_rate": 7.804579595422362e-07, "loss": 0.2435, "step": 1734 }, { "epoch": 0.9527732015376167, "grad_norm": 0.45628303919484353, "learning_rate": 7.802174654595065e-07, "loss": 0.2728, "step": 1735 }, { "epoch": 0.9533223503569467, "grad_norm": 0.38018083744914055, "learning_rate": 7.799768768305789e-07, "loss": 0.2429, "step": 1736 }, { "epoch": 0.9538714991762768, "grad_norm": 0.5112013260262124, "learning_rate": 7.797361937366432e-07, "loss": 0.2694, "step": 1737 }, { "epoch": 0.9544206479956068, "grad_norm": 0.45696805500952237, "learning_rate": 7.79495416258921e-07, "loss": 0.2677, "step": 1738 }, { "epoch": 0.9549697968149369, "grad_norm": 0.4835116137567483, "learning_rate": 7.792545444786661e-07, "loss": 0.2727, "step": 1739 }, { "epoch": 0.9555189456342669, "grad_norm": 0.4496758064298272, "learning_rate": 7.790135784771637e-07, "loss": 0.2414, "step": 1740 }, { "epoch": 0.956068094453597, "grad_norm": 0.4659569699239641, "learning_rate": 7.787725183357307e-07, "loss": 0.2512, "step": 1741 }, { "epoch": 0.956617243272927, "grad_norm": 0.3994140737240832, "learning_rate": 7.785313641357166e-07, "loss": 0.2714, "step": 1742 }, { "epoch": 0.957166392092257, "grad_norm": 0.40864015820755256, "learning_rate": 7.782901159585015e-07, "loss": 0.266, "step": 1743 }, { "epoch": 0.9577155409115871, "grad_norm": 0.5461071439054702, "learning_rate": 7.780487738854981e-07, "loss": 0.2685, "step": 1744 }, { "epoch": 0.958264689730917, "grad_norm": 0.5286460933222177, "learning_rate": 7.778073379981501e-07, "loss": 0.2635, "step": 1745 }, { "epoch": 0.9588138385502472, "grad_norm": 0.4810688479863782, "learning_rate": 7.775658083779335e-07, "loss": 0.2601, "step": 1746 }, { "epoch": 0.9593629873695771, "grad_norm": 0.4415328360929715, "learning_rate": 7.773241851063558e-07, "loss": 0.2582, "step": 1747 }, { "epoch": 0.9599121361889071, "grad_norm": 0.6042766028826041, "learning_rate": 7.770824682649557e-07, "loss": 0.2494, "step": 1748 }, { "epoch": 0.9604612850082372, "grad_norm": 0.45869160117730795, "learning_rate": 7.768406579353036e-07, "loss": 0.2498, "step": 1749 }, { "epoch": 0.9610104338275672, "grad_norm": 0.5918779105040332, "learning_rate": 7.76598754199002e-07, "loss": 0.2829, "step": 1750 }, { "epoch": 0.9615595826468973, "grad_norm": 0.5221014159845146, "learning_rate": 7.763567571376841e-07, "loss": 0.2558, "step": 1751 }, { "epoch": 0.9621087314662273, "grad_norm": 0.445451975309114, "learning_rate": 7.761146668330152e-07, "loss": 0.2386, "step": 1752 }, { "epoch": 0.9626578802855574, "grad_norm": 0.42868800693247866, "learning_rate": 7.758724833666919e-07, "loss": 0.2292, "step": 1753 }, { "epoch": 0.9632070291048874, "grad_norm": 0.4563793054125372, "learning_rate": 7.75630206820442e-07, "loss": 0.2541, "step": 1754 }, { "epoch": 0.9637561779242174, "grad_norm": 0.44344498132545906, "learning_rate": 7.753878372760251e-07, "loss": 0.2683, "step": 1755 }, { "epoch": 0.9643053267435475, "grad_norm": 0.3956813660687399, "learning_rate": 7.751453748152318e-07, "loss": 0.2523, "step": 1756 }, { "epoch": 0.9648544755628775, "grad_norm": 0.4489393997313342, "learning_rate": 7.749028195198843e-07, "loss": 0.2468, "step": 1757 }, { "epoch": 0.9654036243822076, "grad_norm": 0.5866565355532205, "learning_rate": 7.746601714718362e-07, "loss": 0.2968, "step": 1758 }, { "epoch": 0.9659527732015376, "grad_norm": 0.4945840896004298, "learning_rate": 7.744174307529725e-07, "loss": 0.2575, "step": 1759 }, { "epoch": 0.9665019220208677, "grad_norm": 0.5579849532829233, "learning_rate": 7.741745974452088e-07, "loss": 0.2362, "step": 1760 }, { "epoch": 0.9670510708401977, "grad_norm": 0.4707748415892224, "learning_rate": 7.739316716304924e-07, "loss": 0.2783, "step": 1761 }, { "epoch": 0.9676002196595277, "grad_norm": 0.8143075151003899, "learning_rate": 7.736886533908019e-07, "loss": 0.2757, "step": 1762 }, { "epoch": 0.9681493684788578, "grad_norm": 0.43106514200214663, "learning_rate": 7.734455428081473e-07, "loss": 0.2901, "step": 1763 }, { "epoch": 0.9686985172981878, "grad_norm": 0.43445368935880396, "learning_rate": 7.732023399645692e-07, "loss": 0.2912, "step": 1764 }, { "epoch": 0.9692476661175179, "grad_norm": 0.40441583855768226, "learning_rate": 7.729590449421396e-07, "loss": 0.2796, "step": 1765 }, { "epoch": 0.9697968149368479, "grad_norm": 0.5170178164470397, "learning_rate": 7.727156578229616e-07, "loss": 0.2677, "step": 1766 }, { "epoch": 0.9703459637561779, "grad_norm": 0.5443808284667313, "learning_rate": 7.724721786891695e-07, "loss": 0.2853, "step": 1767 }, { "epoch": 0.970895112575508, "grad_norm": 0.5927298437635197, "learning_rate": 7.722286076229284e-07, "loss": 0.283, "step": 1768 }, { "epoch": 0.971444261394838, "grad_norm": 0.6127684542817139, "learning_rate": 7.719849447064347e-07, "loss": 0.2522, "step": 1769 }, { "epoch": 0.9719934102141681, "grad_norm": 0.5315395611227436, "learning_rate": 7.717411900219155e-07, "loss": 0.2699, "step": 1770 }, { "epoch": 0.9725425590334981, "grad_norm": 0.5351063404606797, "learning_rate": 7.714973436516294e-07, "loss": 0.2599, "step": 1771 }, { "epoch": 0.9730917078528282, "grad_norm": 0.8519000592417668, "learning_rate": 7.712534056778649e-07, "loss": 0.2886, "step": 1772 }, { "epoch": 0.9736408566721582, "grad_norm": 0.3980995553922157, "learning_rate": 7.710093761829425e-07, "loss": 0.2725, "step": 1773 }, { "epoch": 0.9741900054914882, "grad_norm": 0.5201613757318241, "learning_rate": 7.707652552492134e-07, "loss": 0.2693, "step": 1774 }, { "epoch": 0.9747391543108183, "grad_norm": 0.558676030612656, "learning_rate": 7.705210429590589e-07, "loss": 0.303, "step": 1775 }, { "epoch": 0.9752883031301482, "grad_norm": 0.47812981544883876, "learning_rate": 7.70276739394892e-07, "loss": 0.2667, "step": 1776 }, { "epoch": 0.9758374519494784, "grad_norm": 0.547770104025705, "learning_rate": 7.70032344639156e-07, "loss": 0.2521, "step": 1777 }, { "epoch": 0.9763866007688083, "grad_norm": 0.48037326364239863, "learning_rate": 7.697878587743251e-07, "loss": 0.2281, "step": 1778 }, { "epoch": 0.9769357495881383, "grad_norm": 0.5667793390416505, "learning_rate": 7.695432818829046e-07, "loss": 0.2451, "step": 1779 }, { "epoch": 0.9774848984074684, "grad_norm": 0.42210473340749294, "learning_rate": 7.692986140474297e-07, "loss": 0.2484, "step": 1780 }, { "epoch": 0.9780340472267984, "grad_norm": 0.4090286636235274, "learning_rate": 7.69053855350467e-07, "loss": 0.3056, "step": 1781 }, { "epoch": 0.9785831960461285, "grad_norm": 0.4898439846013174, "learning_rate": 7.688090058746136e-07, "loss": 0.2671, "step": 1782 }, { "epoch": 0.9791323448654585, "grad_norm": 0.5181603399804232, "learning_rate": 7.685640657024973e-07, "loss": 0.271, "step": 1783 }, { "epoch": 0.9796814936847886, "grad_norm": 0.6101604825789101, "learning_rate": 7.683190349167761e-07, "loss": 0.2635, "step": 1784 }, { "epoch": 0.9802306425041186, "grad_norm": 0.5039024071563895, "learning_rate": 7.68073913600139e-07, "loss": 0.2558, "step": 1785 }, { "epoch": 0.9807797913234486, "grad_norm": 0.46516505002599134, "learning_rate": 7.678287018353054e-07, "loss": 0.2702, "step": 1786 }, { "epoch": 0.9813289401427787, "grad_norm": 0.46409827859508324, "learning_rate": 7.675833997050253e-07, "loss": 0.2528, "step": 1787 }, { "epoch": 0.9818780889621087, "grad_norm": 0.5400606318479371, "learning_rate": 7.673380072920788e-07, "loss": 0.2825, "step": 1788 }, { "epoch": 0.9824272377814388, "grad_norm": 0.5564284746876682, "learning_rate": 7.670925246792773e-07, "loss": 0.2511, "step": 1789 }, { "epoch": 0.9829763866007688, "grad_norm": 0.4923896529721891, "learning_rate": 7.668469519494619e-07, "loss": 0.2346, "step": 1790 }, { "epoch": 0.9835255354200988, "grad_norm": 0.39437563269135684, "learning_rate": 7.666012891855043e-07, "loss": 0.2284, "step": 1791 }, { "epoch": 0.9840746842394289, "grad_norm": 0.41906052169314323, "learning_rate": 7.663555364703066e-07, "loss": 0.2636, "step": 1792 }, { "epoch": 0.9846238330587589, "grad_norm": 0.4023326714682953, "learning_rate": 7.661096938868013e-07, "loss": 0.2512, "step": 1793 }, { "epoch": 0.985172981878089, "grad_norm": 0.4838286623553397, "learning_rate": 7.658637615179516e-07, "loss": 0.2323, "step": 1794 }, { "epoch": 0.985722130697419, "grad_norm": 0.5575885605065329, "learning_rate": 7.656177394467502e-07, "loss": 0.275, "step": 1795 }, { "epoch": 0.9862712795167491, "grad_norm": 0.4260296588110545, "learning_rate": 7.653716277562204e-07, "loss": 0.2806, "step": 1796 }, { "epoch": 0.9868204283360791, "grad_norm": 0.46565438595880204, "learning_rate": 7.651254265294163e-07, "loss": 0.2402, "step": 1797 }, { "epoch": 0.9873695771554091, "grad_norm": 0.4230243180213196, "learning_rate": 7.648791358494213e-07, "loss": 0.2655, "step": 1798 }, { "epoch": 0.9879187259747392, "grad_norm": 0.41038079843468256, "learning_rate": 7.646327557993495e-07, "loss": 0.2921, "step": 1799 }, { "epoch": 0.9884678747940692, "grad_norm": 0.4164321056674212, "learning_rate": 7.643862864623453e-07, "loss": 0.2521, "step": 1800 }, { "epoch": 0.9884678747940692, "eval_loss": 0.34528762102127075, "eval_runtime": 18.5439, "eval_samples_per_second": 23.889, "eval_steps_per_second": 1.025, "step": 1800 }, { "epoch": 0.9890170236133993, "grad_norm": 0.4836515649227938, "learning_rate": 7.641397279215829e-07, "loss": 0.2154, "step": 1801 }, { "epoch": 0.9895661724327293, "grad_norm": 0.46368817362105696, "learning_rate": 7.638930802602665e-07, "loss": 0.2586, "step": 1802 }, { "epoch": 0.9901153212520593, "grad_norm": 0.4070155869632062, "learning_rate": 7.636463435616312e-07, "loss": 0.2453, "step": 1803 }, { "epoch": 0.9906644700713894, "grad_norm": 0.45395483380394075, "learning_rate": 7.633995179089409e-07, "loss": 0.2608, "step": 1804 }, { "epoch": 0.9912136188907194, "grad_norm": 0.3913694336666719, "learning_rate": 7.631526033854905e-07, "loss": 0.2803, "step": 1805 }, { "epoch": 0.9917627677100495, "grad_norm": 0.6396163432276057, "learning_rate": 7.629056000746046e-07, "loss": 0.2946, "step": 1806 }, { "epoch": 0.9923119165293794, "grad_norm": 0.6048936670137287, "learning_rate": 7.626585080596375e-07, "loss": 0.3021, "step": 1807 }, { "epoch": 0.9928610653487095, "grad_norm": 0.5241290650324293, "learning_rate": 7.624113274239739e-07, "loss": 0.2267, "step": 1808 }, { "epoch": 0.9934102141680395, "grad_norm": 0.5999421099843125, "learning_rate": 7.621640582510277e-07, "loss": 0.2728, "step": 1809 }, { "epoch": 0.9939593629873695, "grad_norm": 0.5236069631135204, "learning_rate": 7.619167006242437e-07, "loss": 0.2887, "step": 1810 }, { "epoch": 0.9945085118066996, "grad_norm": 0.48630894161056804, "learning_rate": 7.616692546270956e-07, "loss": 0.3145, "step": 1811 }, { "epoch": 0.9950576606260296, "grad_norm": 0.4930936152558026, "learning_rate": 7.614217203430874e-07, "loss": 0.2494, "step": 1812 }, { "epoch": 0.9956068094453597, "grad_norm": 0.44139600354087577, "learning_rate": 7.611740978557531e-07, "loss": 0.2858, "step": 1813 }, { "epoch": 0.9961559582646897, "grad_norm": 0.4877427860404887, "learning_rate": 7.609263872486557e-07, "loss": 0.2584, "step": 1814 }, { "epoch": 0.9967051070840197, "grad_norm": 0.484164544649045, "learning_rate": 7.606785886053887e-07, "loss": 0.2517, "step": 1815 }, { "epoch": 0.9972542559033498, "grad_norm": 0.44687816830556487, "learning_rate": 7.60430702009575e-07, "loss": 0.2623, "step": 1816 }, { "epoch": 0.9978034047226798, "grad_norm": 0.5620673692002126, "learning_rate": 7.60182727544867e-07, "loss": 0.2746, "step": 1817 }, { "epoch": 0.9983525535420099, "grad_norm": 0.43170546576838636, "learning_rate": 7.599346652949471e-07, "loss": 0.2665, "step": 1818 }, { "epoch": 0.9989017023613399, "grad_norm": 0.4612571656231133, "learning_rate": 7.596865153435271e-07, "loss": 0.2755, "step": 1819 }, { "epoch": 0.99945085118067, "grad_norm": 0.47048451172823025, "learning_rate": 7.594382777743487e-07, "loss": 0.2623, "step": 1820 }, { "epoch": 1.0, "grad_norm": 0.6197317419503009, "learning_rate": 7.591899526711826e-07, "loss": 0.3271, "step": 1821 }, { "epoch": 1.00054914881933, "grad_norm": 0.41578437413931263, "learning_rate": 7.589415401178294e-07, "loss": 0.2675, "step": 1822 }, { "epoch": 1.00109829763866, "grad_norm": 0.4842886147580413, "learning_rate": 7.586930401981195e-07, "loss": 0.2516, "step": 1823 }, { "epoch": 1.00164744645799, "grad_norm": 0.5930650394982593, "learning_rate": 7.584444529959122e-07, "loss": 0.3383, "step": 1824 }, { "epoch": 1.0021965952773202, "grad_norm": 0.43158603874350837, "learning_rate": 7.581957785950966e-07, "loss": 0.2677, "step": 1825 }, { "epoch": 1.0027457440966503, "grad_norm": 0.4364266572708857, "learning_rate": 7.579470170795911e-07, "loss": 0.2454, "step": 1826 }, { "epoch": 1.0032948929159802, "grad_norm": 0.43961576365943744, "learning_rate": 7.576981685333436e-07, "loss": 0.2319, "step": 1827 }, { "epoch": 1.0038440417353103, "grad_norm": 0.5380930158053616, "learning_rate": 7.574492330403313e-07, "loss": 0.2559, "step": 1828 }, { "epoch": 1.0043931905546404, "grad_norm": 0.52376181353432, "learning_rate": 7.572002106845606e-07, "loss": 0.2791, "step": 1829 }, { "epoch": 1.0049423393739703, "grad_norm": 0.6060189145093499, "learning_rate": 7.569511015500678e-07, "loss": 0.2875, "step": 1830 }, { "epoch": 1.0054914881933004, "grad_norm": 0.4892904473365606, "learning_rate": 7.567019057209177e-07, "loss": 0.2662, "step": 1831 }, { "epoch": 1.0060406370126305, "grad_norm": 0.6393816945523433, "learning_rate": 7.564526232812048e-07, "loss": 0.32, "step": 1832 }, { "epoch": 1.0065897858319606, "grad_norm": 0.4849396292906131, "learning_rate": 7.562032543150527e-07, "loss": 0.2491, "step": 1833 }, { "epoch": 1.0071389346512905, "grad_norm": 0.5726582764023308, "learning_rate": 7.559537989066141e-07, "loss": 0.277, "step": 1834 }, { "epoch": 1.0076880834706206, "grad_norm": 0.5805280895481991, "learning_rate": 7.557042571400716e-07, "loss": 0.2913, "step": 1835 }, { "epoch": 1.0082372322899507, "grad_norm": 0.4523204870483219, "learning_rate": 7.554546290996356e-07, "loss": 0.2728, "step": 1836 }, { "epoch": 1.0087863811092805, "grad_norm": 0.4392558372785879, "learning_rate": 7.552049148695469e-07, "loss": 0.2389, "step": 1837 }, { "epoch": 1.0093355299286106, "grad_norm": 0.3978654258602959, "learning_rate": 7.549551145340746e-07, "loss": 0.2833, "step": 1838 }, { "epoch": 1.0098846787479407, "grad_norm": 0.6288463716170226, "learning_rate": 7.547052281775171e-07, "loss": 0.2683, "step": 1839 }, { "epoch": 1.0104338275672706, "grad_norm": 0.4363803769542127, "learning_rate": 7.54455255884202e-07, "loss": 0.2503, "step": 1840 }, { "epoch": 1.0109829763866007, "grad_norm": 0.47027345127633846, "learning_rate": 7.542051977384857e-07, "loss": 0.2408, "step": 1841 }, { "epoch": 1.0115321252059308, "grad_norm": 0.6418223290281864, "learning_rate": 7.539550538247533e-07, "loss": 0.3113, "step": 1842 }, { "epoch": 1.012081274025261, "grad_norm": 0.5411808923595263, "learning_rate": 7.537048242274196e-07, "loss": 0.2474, "step": 1843 }, { "epoch": 1.0126304228445908, "grad_norm": 0.47371677464110296, "learning_rate": 7.534545090309276e-07, "loss": 0.2742, "step": 1844 }, { "epoch": 1.013179571663921, "grad_norm": 0.4140319891610881, "learning_rate": 7.532041083197497e-07, "loss": 0.2312, "step": 1845 }, { "epoch": 1.013728720483251, "grad_norm": 0.47098614805721045, "learning_rate": 7.529536221783867e-07, "loss": 0.2358, "step": 1846 }, { "epoch": 1.014277869302581, "grad_norm": 0.489341438823661, "learning_rate": 7.527030506913686e-07, "loss": 0.2526, "step": 1847 }, { "epoch": 1.014827018121911, "grad_norm": 0.5073707910023101, "learning_rate": 7.524523939432538e-07, "loss": 0.2859, "step": 1848 }, { "epoch": 1.015376166941241, "grad_norm": 0.471135300684498, "learning_rate": 7.522016520186299e-07, "loss": 0.2367, "step": 1849 }, { "epoch": 1.0159253157605712, "grad_norm": 0.5408069547889525, "learning_rate": 7.519508250021129e-07, "loss": 0.2933, "step": 1850 }, { "epoch": 1.016474464579901, "grad_norm": 0.4927024626190959, "learning_rate": 7.516999129783479e-07, "loss": 0.2839, "step": 1851 }, { "epoch": 1.0170236133992312, "grad_norm": 0.455187961758037, "learning_rate": 7.514489160320083e-07, "loss": 0.2126, "step": 1852 }, { "epoch": 1.0175727622185613, "grad_norm": 0.5709789836912843, "learning_rate": 7.511978342477965e-07, "loss": 0.2922, "step": 1853 }, { "epoch": 1.0181219110378912, "grad_norm": 0.5311031354348835, "learning_rate": 7.509466677104432e-07, "loss": 0.2962, "step": 1854 }, { "epoch": 1.0186710598572213, "grad_norm": 0.46834918729813446, "learning_rate": 7.50695416504708e-07, "loss": 0.2854, "step": 1855 }, { "epoch": 1.0192202086765514, "grad_norm": 0.4601976267797454, "learning_rate": 7.504440807153787e-07, "loss": 0.2556, "step": 1856 }, { "epoch": 1.0197693574958815, "grad_norm": 0.8922349338001272, "learning_rate": 7.501926604272721e-07, "loss": 0.4347, "step": 1857 }, { "epoch": 1.0203185063152114, "grad_norm": 0.42685723914488466, "learning_rate": 7.49941155725233e-07, "loss": 0.2604, "step": 1858 }, { "epoch": 1.0208676551345415, "grad_norm": 0.4826636252760197, "learning_rate": 7.496895666941353e-07, "loss": 0.2563, "step": 1859 }, { "epoch": 1.0214168039538716, "grad_norm": 0.547492542573778, "learning_rate": 7.494378934188808e-07, "loss": 0.2679, "step": 1860 }, { "epoch": 1.0219659527732015, "grad_norm": 0.43265999361011936, "learning_rate": 7.491861359844e-07, "loss": 0.2276, "step": 1861 }, { "epoch": 1.0225151015925316, "grad_norm": 0.4875130613505053, "learning_rate": 7.489342944756519e-07, "loss": 0.2241, "step": 1862 }, { "epoch": 1.0230642504118617, "grad_norm": 0.5670497970250489, "learning_rate": 7.486823689776235e-07, "loss": 0.2196, "step": 1863 }, { "epoch": 1.0236133992311915, "grad_norm": 0.5044139270536077, "learning_rate": 7.484303595753307e-07, "loss": 0.2445, "step": 1864 }, { "epoch": 1.0241625480505216, "grad_norm": 0.6485812641315216, "learning_rate": 7.48178266353817e-07, "loss": 0.2694, "step": 1865 }, { "epoch": 1.0247116968698518, "grad_norm": 0.4818520361512355, "learning_rate": 7.479260893981548e-07, "loss": 0.2943, "step": 1866 }, { "epoch": 1.0252608456891819, "grad_norm": 0.5622173081405858, "learning_rate": 7.476738287934445e-07, "loss": 0.2483, "step": 1867 }, { "epoch": 1.0258099945085117, "grad_norm": 0.4978481006420446, "learning_rate": 7.474214846248148e-07, "loss": 0.2485, "step": 1868 }, { "epoch": 1.0263591433278418, "grad_norm": 0.4195966967278341, "learning_rate": 7.471690569774224e-07, "loss": 0.2662, "step": 1869 }, { "epoch": 1.026908292147172, "grad_norm": 0.5085901706757727, "learning_rate": 7.469165459364526e-07, "loss": 0.2416, "step": 1870 }, { "epoch": 1.0274574409665018, "grad_norm": 0.4389130842819843, "learning_rate": 7.466639515871183e-07, "loss": 0.2407, "step": 1871 }, { "epoch": 1.028006589785832, "grad_norm": 0.46093514222600845, "learning_rate": 7.464112740146612e-07, "loss": 0.2681, "step": 1872 }, { "epoch": 1.028555738605162, "grad_norm": 0.47502751939237914, "learning_rate": 7.4615851330435e-07, "loss": 0.2754, "step": 1873 }, { "epoch": 1.0291048874244921, "grad_norm": 0.6312240664022183, "learning_rate": 7.459056695414827e-07, "loss": 0.3064, "step": 1874 }, { "epoch": 1.029654036243822, "grad_norm": 0.3965834518947881, "learning_rate": 7.456527428113845e-07, "loss": 0.2849, "step": 1875 }, { "epoch": 1.0302031850631521, "grad_norm": 0.4507770290194585, "learning_rate": 7.45399733199409e-07, "loss": 0.2682, "step": 1876 }, { "epoch": 1.0307523338824822, "grad_norm": 0.49520375633819635, "learning_rate": 7.451466407909374e-07, "loss": 0.2795, "step": 1877 }, { "epoch": 1.031301482701812, "grad_norm": 0.41089911654635053, "learning_rate": 7.448934656713792e-07, "loss": 0.2687, "step": 1878 }, { "epoch": 1.0318506315211422, "grad_norm": 0.5360605994884401, "learning_rate": 7.446402079261718e-07, "loss": 0.2961, "step": 1879 }, { "epoch": 1.0323997803404723, "grad_norm": 0.48125365131606435, "learning_rate": 7.443868676407801e-07, "loss": 0.288, "step": 1880 }, { "epoch": 1.0329489291598024, "grad_norm": 0.3705486460941909, "learning_rate": 7.441334449006974e-07, "loss": 0.2579, "step": 1881 }, { "epoch": 1.0334980779791323, "grad_norm": 0.41412298669102643, "learning_rate": 7.438799397914442e-07, "loss": 0.2556, "step": 1882 }, { "epoch": 1.0340472267984624, "grad_norm": 0.4340679652922135, "learning_rate": 7.436263523985695e-07, "loss": 0.2577, "step": 1883 }, { "epoch": 1.0345963756177925, "grad_norm": 0.5985237083166384, "learning_rate": 7.433726828076496e-07, "loss": 0.2409, "step": 1884 }, { "epoch": 1.0351455244371224, "grad_norm": 0.4307423706499519, "learning_rate": 7.431189311042883e-07, "loss": 0.2515, "step": 1885 }, { "epoch": 1.0356946732564525, "grad_norm": 0.613209077905052, "learning_rate": 7.428650973741179e-07, "loss": 0.2938, "step": 1886 }, { "epoch": 1.0362438220757826, "grad_norm": 0.4686732770862708, "learning_rate": 7.426111817027976e-07, "loss": 0.2466, "step": 1887 }, { "epoch": 1.0367929708951125, "grad_norm": 0.48015529289259584, "learning_rate": 7.423571841760149e-07, "loss": 0.2461, "step": 1888 }, { "epoch": 1.0373421197144426, "grad_norm": 0.46120839265425806, "learning_rate": 7.421031048794843e-07, "loss": 0.273, "step": 1889 }, { "epoch": 1.0378912685337727, "grad_norm": 0.3666383063710738, "learning_rate": 7.418489438989485e-07, "loss": 0.2929, "step": 1890 }, { "epoch": 1.0384404173531028, "grad_norm": 0.5855318134964513, "learning_rate": 7.415947013201773e-07, "loss": 0.2643, "step": 1891 }, { "epoch": 1.0389895661724327, "grad_norm": 0.5622034772547164, "learning_rate": 7.413403772289678e-07, "loss": 0.2965, "step": 1892 }, { "epoch": 1.0395387149917628, "grad_norm": 0.4851139456124104, "learning_rate": 7.410859717111459e-07, "loss": 0.2934, "step": 1893 }, { "epoch": 1.0400878638110929, "grad_norm": 0.4526450023997006, "learning_rate": 7.408314848525634e-07, "loss": 0.2472, "step": 1894 }, { "epoch": 1.0406370126304227, "grad_norm": 0.510980318120092, "learning_rate": 7.405769167391005e-07, "loss": 0.2737, "step": 1895 }, { "epoch": 1.0411861614497528, "grad_norm": 0.5989982281270091, "learning_rate": 7.403222674566647e-07, "loss": 0.3381, "step": 1896 }, { "epoch": 1.041735310269083, "grad_norm": 0.5201095290469655, "learning_rate": 7.400675370911903e-07, "loss": 0.2846, "step": 1897 }, { "epoch": 1.042284459088413, "grad_norm": 0.5081609618306947, "learning_rate": 7.398127257286399e-07, "loss": 0.2573, "step": 1898 }, { "epoch": 1.042833607907743, "grad_norm": 1.328065481774891, "learning_rate": 7.395578334550026e-07, "loss": 0.2806, "step": 1899 }, { "epoch": 1.043382756727073, "grad_norm": 0.4588942803499214, "learning_rate": 7.393028603562952e-07, "loss": 0.2728, "step": 1900 }, { "epoch": 1.0439319055464031, "grad_norm": 0.5519204757660947, "learning_rate": 7.39047806518562e-07, "loss": 0.2763, "step": 1901 }, { "epoch": 1.044481054365733, "grad_norm": 0.4273072437038306, "learning_rate": 7.387926720278739e-07, "loss": 0.2467, "step": 1902 }, { "epoch": 1.0450302031850631, "grad_norm": 0.5603419599277042, "learning_rate": 7.385374569703296e-07, "loss": 0.3278, "step": 1903 }, { "epoch": 1.0455793520043932, "grad_norm": 0.46282749221860814, "learning_rate": 7.38282161432055e-07, "loss": 0.2517, "step": 1904 }, { "epoch": 1.0461285008237233, "grad_norm": 0.5234512677935415, "learning_rate": 7.380267854992024e-07, "loss": 0.2579, "step": 1905 }, { "epoch": 1.0466776496430532, "grad_norm": 0.4939694171578199, "learning_rate": 7.37771329257952e-07, "loss": 0.2263, "step": 1906 }, { "epoch": 1.0472267984623833, "grad_norm": 0.4612044728123528, "learning_rate": 7.375157927945111e-07, "loss": 0.3081, "step": 1907 }, { "epoch": 1.0477759472817134, "grad_norm": 0.42088853837759654, "learning_rate": 7.372601761951137e-07, "loss": 0.2223, "step": 1908 }, { "epoch": 1.0483250961010433, "grad_norm": 0.5144396275656794, "learning_rate": 7.37004479546021e-07, "loss": 0.2526, "step": 1909 }, { "epoch": 1.0488742449203734, "grad_norm": 0.5106578937429289, "learning_rate": 7.36748702933521e-07, "loss": 0.2532, "step": 1910 }, { "epoch": 1.0494233937397035, "grad_norm": 0.4233076360404985, "learning_rate": 7.36492846443929e-07, "loss": 0.2318, "step": 1911 }, { "epoch": 1.0499725425590336, "grad_norm": 0.5059375016312809, "learning_rate": 7.362369101635874e-07, "loss": 0.2462, "step": 1912 }, { "epoch": 1.0505216913783635, "grad_norm": 0.4903167549395599, "learning_rate": 7.359808941788647e-07, "loss": 0.265, "step": 1913 }, { "epoch": 1.0510708401976936, "grad_norm": 0.5830956306818907, "learning_rate": 7.357247985761574e-07, "loss": 0.3169, "step": 1914 }, { "epoch": 1.0516199890170237, "grad_norm": 0.4202106558059594, "learning_rate": 7.354686234418883e-07, "loss": 0.2556, "step": 1915 }, { "epoch": 1.0521691378363536, "grad_norm": 0.4937607561452683, "learning_rate": 7.352123688625066e-07, "loss": 0.2307, "step": 1916 }, { "epoch": 1.0527182866556837, "grad_norm": 0.5376362404685917, "learning_rate": 7.349560349244894e-07, "loss": 0.2798, "step": 1917 }, { "epoch": 1.0532674354750138, "grad_norm": 0.6365819727585691, "learning_rate": 7.346996217143394e-07, "loss": 0.2861, "step": 1918 }, { "epoch": 1.0538165842943437, "grad_norm": 0.5826156240105614, "learning_rate": 7.34443129318587e-07, "loss": 0.2863, "step": 1919 }, { "epoch": 1.0543657331136738, "grad_norm": 0.4598109134569838, "learning_rate": 7.341865578237888e-07, "loss": 0.267, "step": 1920 }, { "epoch": 1.0549148819330039, "grad_norm": 0.5167649148140777, "learning_rate": 7.33929907316528e-07, "loss": 0.2504, "step": 1921 }, { "epoch": 1.055464030752334, "grad_norm": 0.5735152148626464, "learning_rate": 7.336731778834151e-07, "loss": 0.2801, "step": 1922 }, { "epoch": 1.0560131795716639, "grad_norm": 0.6863260320534952, "learning_rate": 7.334163696110866e-07, "loss": 0.2954, "step": 1923 }, { "epoch": 1.056562328390994, "grad_norm": 0.5188716198417976, "learning_rate": 7.331594825862059e-07, "loss": 0.3047, "step": 1924 }, { "epoch": 1.057111477210324, "grad_norm": 0.524826132487868, "learning_rate": 7.329025168954629e-07, "loss": 0.2566, "step": 1925 }, { "epoch": 1.057660626029654, "grad_norm": 0.6298929407110873, "learning_rate": 7.326454726255738e-07, "loss": 0.2811, "step": 1926 }, { "epoch": 1.058209774848984, "grad_norm": 0.40592329840524316, "learning_rate": 7.323883498632821e-07, "loss": 0.2698, "step": 1927 }, { "epoch": 1.0587589236683141, "grad_norm": 0.5492557069011063, "learning_rate": 7.321311486953567e-07, "loss": 0.2753, "step": 1928 }, { "epoch": 1.0593080724876442, "grad_norm": 0.4372373730770258, "learning_rate": 7.318738692085939e-07, "loss": 0.2531, "step": 1929 }, { "epoch": 1.0598572213069741, "grad_norm": 0.4890468250542675, "learning_rate": 7.31616511489816e-07, "loss": 0.2619, "step": 1930 }, { "epoch": 1.0604063701263042, "grad_norm": 0.48686492305275036, "learning_rate": 7.313590756258717e-07, "loss": 0.2671, "step": 1931 }, { "epoch": 1.0609555189456343, "grad_norm": 0.5904849452947342, "learning_rate": 7.311015617036359e-07, "loss": 0.3122, "step": 1932 }, { "epoch": 1.0615046677649642, "grad_norm": 0.5514617975174491, "learning_rate": 7.308439698100103e-07, "loss": 0.2769, "step": 1933 }, { "epoch": 1.0620538165842943, "grad_norm": 0.4893794046153872, "learning_rate": 7.305863000319228e-07, "loss": 0.2741, "step": 1934 }, { "epoch": 1.0626029654036244, "grad_norm": 0.49855065944451776, "learning_rate": 7.303285524563271e-07, "loss": 0.2617, "step": 1935 }, { "epoch": 1.0631521142229543, "grad_norm": 0.5080905203056124, "learning_rate": 7.300707271702038e-07, "loss": 0.2751, "step": 1936 }, { "epoch": 1.0637012630422844, "grad_norm": 0.509009548533992, "learning_rate": 7.298128242605592e-07, "loss": 0.3173, "step": 1937 }, { "epoch": 1.0642504118616145, "grad_norm": 0.4902855440171685, "learning_rate": 7.295548438144264e-07, "loss": 0.2549, "step": 1938 }, { "epoch": 1.0647995606809446, "grad_norm": 0.48230088700943624, "learning_rate": 7.292967859188638e-07, "loss": 0.2957, "step": 1939 }, { "epoch": 1.0653487095002745, "grad_norm": 0.5915589072443312, "learning_rate": 7.290386506609567e-07, "loss": 0.2393, "step": 1940 }, { "epoch": 1.0658978583196046, "grad_norm": 0.5812434986966898, "learning_rate": 7.287804381278164e-07, "loss": 0.3074, "step": 1941 }, { "epoch": 1.0664470071389347, "grad_norm": 0.5742467416143907, "learning_rate": 7.2852214840658e-07, "loss": 0.259, "step": 1942 }, { "epoch": 1.0669961559582646, "grad_norm": 0.44473181705933523, "learning_rate": 7.282637815844105e-07, "loss": 0.2624, "step": 1943 }, { "epoch": 1.0675453047775947, "grad_norm": 0.43849484593757254, "learning_rate": 7.280053377484974e-07, "loss": 0.2433, "step": 1944 }, { "epoch": 1.0680944535969248, "grad_norm": 0.44154688572759393, "learning_rate": 7.277468169860562e-07, "loss": 0.266, "step": 1945 }, { "epoch": 1.068643602416255, "grad_norm": 0.47092833108215504, "learning_rate": 7.274882193843278e-07, "loss": 0.2513, "step": 1946 }, { "epoch": 1.0691927512355848, "grad_norm": 0.3867134794448388, "learning_rate": 7.272295450305793e-07, "loss": 0.288, "step": 1947 }, { "epoch": 1.0697419000549149, "grad_norm": 0.466449706152349, "learning_rate": 7.269707940121041e-07, "loss": 0.2798, "step": 1948 }, { "epoch": 1.070291048874245, "grad_norm": 0.5372199937100748, "learning_rate": 7.26711966416221e-07, "loss": 0.2769, "step": 1949 }, { "epoch": 1.0708401976935749, "grad_norm": 0.5021622348268454, "learning_rate": 7.264530623302746e-07, "loss": 0.2481, "step": 1950 }, { "epoch": 1.071389346512905, "grad_norm": 0.44674989073601945, "learning_rate": 7.261940818416358e-07, "loss": 0.2557, "step": 1951 }, { "epoch": 1.071938495332235, "grad_norm": 0.46583884140733256, "learning_rate": 7.259350250377007e-07, "loss": 0.278, "step": 1952 }, { "epoch": 1.0724876441515652, "grad_norm": 0.4669746396459837, "learning_rate": 7.256758920058916e-07, "loss": 0.2328, "step": 1953 }, { "epoch": 1.073036792970895, "grad_norm": 0.4707492711977006, "learning_rate": 7.254166828336562e-07, "loss": 0.2644, "step": 1954 }, { "epoch": 1.0735859417902252, "grad_norm": 0.3834561654166051, "learning_rate": 7.251573976084681e-07, "loss": 0.261, "step": 1955 }, { "epoch": 1.0741350906095553, "grad_norm": 0.4828302677996881, "learning_rate": 7.248980364178269e-07, "loss": 0.2441, "step": 1956 }, { "epoch": 1.0746842394288851, "grad_norm": 0.5274084959211662, "learning_rate": 7.246385993492566e-07, "loss": 0.239, "step": 1957 }, { "epoch": 1.0752333882482152, "grad_norm": 0.5159718381356101, "learning_rate": 7.243790864903085e-07, "loss": 0.2636, "step": 1958 }, { "epoch": 1.0757825370675453, "grad_norm": 0.5249058530492751, "learning_rate": 7.24119497928558e-07, "loss": 0.2444, "step": 1959 }, { "epoch": 1.0763316858868754, "grad_norm": 0.5123617875634998, "learning_rate": 7.238598337516072e-07, "loss": 0.2291, "step": 1960 }, { "epoch": 1.0768808347062053, "grad_norm": 0.4976553827927659, "learning_rate": 7.236000940470829e-07, "loss": 0.2631, "step": 1961 }, { "epoch": 1.0774299835255354, "grad_norm": 0.5989997251261129, "learning_rate": 7.233402789026376e-07, "loss": 0.2583, "step": 1962 }, { "epoch": 1.0779791323448655, "grad_norm": 0.475111591657095, "learning_rate": 7.230803884059497e-07, "loss": 0.2492, "step": 1963 }, { "epoch": 1.0785282811641954, "grad_norm": 0.4123066196218989, "learning_rate": 7.22820422644722e-07, "loss": 0.2771, "step": 1964 }, { "epoch": 1.0790774299835255, "grad_norm": 0.47652151034073204, "learning_rate": 7.225603817066842e-07, "loss": 0.2666, "step": 1965 }, { "epoch": 1.0796265788028556, "grad_norm": 0.46457087787614904, "learning_rate": 7.223002656795901e-07, "loss": 0.2641, "step": 1966 }, { "epoch": 1.0801757276221857, "grad_norm": 0.47053450897897553, "learning_rate": 7.220400746512191e-07, "loss": 0.2472, "step": 1967 }, { "epoch": 1.0807248764415156, "grad_norm": 0.5973322780378363, "learning_rate": 7.217798087093765e-07, "loss": 0.2961, "step": 1968 }, { "epoch": 1.0812740252608457, "grad_norm": 0.4725500720302298, "learning_rate": 7.21519467941892e-07, "loss": 0.2802, "step": 1969 }, { "epoch": 1.0818231740801758, "grad_norm": 0.4801608388435618, "learning_rate": 7.212590524366217e-07, "loss": 0.2658, "step": 1970 }, { "epoch": 1.0823723228995057, "grad_norm": 0.483371735435856, "learning_rate": 7.209985622814456e-07, "loss": 0.24, "step": 1971 }, { "epoch": 1.0829214717188358, "grad_norm": 0.5346929497518315, "learning_rate": 7.207379975642695e-07, "loss": 0.2363, "step": 1972 }, { "epoch": 1.083470620538166, "grad_norm": 0.48291327494312974, "learning_rate": 7.20477358373025e-07, "loss": 0.2688, "step": 1973 }, { "epoch": 1.084019769357496, "grad_norm": 0.5059346187117079, "learning_rate": 7.202166447956677e-07, "loss": 0.2747, "step": 1974 }, { "epoch": 1.0845689181768259, "grad_norm": 0.809059863872899, "learning_rate": 7.199558569201793e-07, "loss": 0.4501, "step": 1975 }, { "epoch": 1.085118066996156, "grad_norm": 0.5839461010903824, "learning_rate": 7.196949948345653e-07, "loss": 0.2674, "step": 1976 }, { "epoch": 1.085667215815486, "grad_norm": 0.7100556007052125, "learning_rate": 7.194340586268578e-07, "loss": 0.2816, "step": 1977 }, { "epoch": 1.086216364634816, "grad_norm": 0.4760876293933374, "learning_rate": 7.191730483851129e-07, "loss": 0.2714, "step": 1978 }, { "epoch": 1.086765513454146, "grad_norm": 0.5396483034010165, "learning_rate": 7.189119641974118e-07, "loss": 0.259, "step": 1979 }, { "epoch": 1.0873146622734762, "grad_norm": 0.5500828313729427, "learning_rate": 7.186508061518612e-07, "loss": 0.3548, "step": 1980 }, { "epoch": 1.087863811092806, "grad_norm": 0.49716425854017776, "learning_rate": 7.183895743365919e-07, "loss": 0.2554, "step": 1981 }, { "epoch": 1.0884129599121362, "grad_norm": 0.4757884185013021, "learning_rate": 7.181282688397602e-07, "loss": 0.2379, "step": 1982 }, { "epoch": 1.0889621087314663, "grad_norm": 0.4916505537302754, "learning_rate": 7.178668897495469e-07, "loss": 0.27, "step": 1983 }, { "epoch": 1.0895112575507964, "grad_norm": 0.4118877508007891, "learning_rate": 7.176054371541582e-07, "loss": 0.275, "step": 1984 }, { "epoch": 1.0900604063701262, "grad_norm": 0.5379753195409632, "learning_rate": 7.173439111418243e-07, "loss": 0.285, "step": 1985 }, { "epoch": 1.0906095551894563, "grad_norm": 0.48881307212765185, "learning_rate": 7.170823118008009e-07, "loss": 0.2928, "step": 1986 }, { "epoch": 1.0911587040087865, "grad_norm": 0.4795189129936405, "learning_rate": 7.168206392193678e-07, "loss": 0.245, "step": 1987 }, { "epoch": 1.0917078528281163, "grad_norm": 0.44271120457183727, "learning_rate": 7.165588934858303e-07, "loss": 0.2397, "step": 1988 }, { "epoch": 1.0922570016474464, "grad_norm": 0.47926309831555297, "learning_rate": 7.162970746885176e-07, "loss": 0.2293, "step": 1989 }, { "epoch": 1.0928061504667765, "grad_norm": 0.44594922292612355, "learning_rate": 7.16035182915784e-07, "loss": 0.2368, "step": 1990 }, { "epoch": 1.0933552992861064, "grad_norm": 0.6782100494947264, "learning_rate": 7.157732182560082e-07, "loss": 0.2882, "step": 1991 }, { "epoch": 1.0939044481054365, "grad_norm": 0.6672096767124965, "learning_rate": 7.15511180797594e-07, "loss": 0.2713, "step": 1992 }, { "epoch": 1.0944535969247666, "grad_norm": 0.431389267378025, "learning_rate": 7.15249070628969e-07, "loss": 0.3044, "step": 1993 }, { "epoch": 1.0950027457440967, "grad_norm": 0.4249831014754951, "learning_rate": 7.149868878385859e-07, "loss": 0.2422, "step": 1994 }, { "epoch": 1.0955518945634266, "grad_norm": 0.4857078877857519, "learning_rate": 7.147246325149218e-07, "loss": 0.2446, "step": 1995 }, { "epoch": 1.0961010433827567, "grad_norm": 0.617692271368372, "learning_rate": 7.144623047464779e-07, "loss": 0.2952, "step": 1996 }, { "epoch": 1.0966501922020868, "grad_norm": 0.5568761916886986, "learning_rate": 7.141999046217806e-07, "loss": 0.2864, "step": 1997 }, { "epoch": 1.0971993410214167, "grad_norm": 0.5293561416445784, "learning_rate": 7.1393743222938e-07, "loss": 0.2595, "step": 1998 }, { "epoch": 1.0977484898407468, "grad_norm": 0.5180200287324501, "learning_rate": 7.136748876578508e-07, "loss": 0.2537, "step": 1999 }, { "epoch": 1.098297638660077, "grad_norm": 0.3936420295809807, "learning_rate": 7.134122709957921e-07, "loss": 0.246, "step": 2000 }, { "epoch": 1.098297638660077, "eval_loss": 0.34057578444480896, "eval_runtime": 18.722, "eval_samples_per_second": 23.662, "eval_steps_per_second": 1.015, "step": 2000 }, { "epoch": 1.098846787479407, "grad_norm": 0.4214351543698219, "learning_rate": 7.131495823318278e-07, "loss": 0.2396, "step": 2001 }, { "epoch": 1.0993959362987369, "grad_norm": 0.44422758067504065, "learning_rate": 7.128868217546051e-07, "loss": 0.2881, "step": 2002 }, { "epoch": 1.099945085118067, "grad_norm": 0.5013125845106579, "learning_rate": 7.126239893527964e-07, "loss": 0.2792, "step": 2003 }, { "epoch": 1.100494233937397, "grad_norm": 0.533982878969831, "learning_rate": 7.123610852150975e-07, "loss": 0.274, "step": 2004 }, { "epoch": 1.101043382756727, "grad_norm": 0.4459148061825294, "learning_rate": 7.120981094302293e-07, "loss": 0.275, "step": 2005 }, { "epoch": 1.101592531576057, "grad_norm": 0.5100661828730729, "learning_rate": 7.118350620869363e-07, "loss": 0.2677, "step": 2006 }, { "epoch": 1.1021416803953872, "grad_norm": 0.48410171155944226, "learning_rate": 7.115719432739873e-07, "loss": 0.2836, "step": 2007 }, { "epoch": 1.1026908292147173, "grad_norm": 0.5380579317818759, "learning_rate": 7.11308753080175e-07, "loss": 0.283, "step": 2008 }, { "epoch": 1.1032399780340472, "grad_norm": 0.46313197561519037, "learning_rate": 7.11045491594317e-07, "loss": 0.2858, "step": 2009 }, { "epoch": 1.1037891268533773, "grad_norm": 0.4891923392124667, "learning_rate": 7.107821589052536e-07, "loss": 0.292, "step": 2010 }, { "epoch": 1.1043382756727074, "grad_norm": 0.46855250123693937, "learning_rate": 7.105187551018502e-07, "loss": 0.2559, "step": 2011 }, { "epoch": 1.1048874244920373, "grad_norm": 0.5241046486898976, "learning_rate": 7.102552802729958e-07, "loss": 0.2906, "step": 2012 }, { "epoch": 1.1054365733113674, "grad_norm": 0.5115287830834213, "learning_rate": 7.099917345076037e-07, "loss": 0.2681, "step": 2013 }, { "epoch": 1.1059857221306975, "grad_norm": 0.4389809327921077, "learning_rate": 7.097281178946109e-07, "loss": 0.2449, "step": 2014 }, { "epoch": 1.1065348709500276, "grad_norm": 0.4197311446414432, "learning_rate": 7.094644305229778e-07, "loss": 0.2562, "step": 2015 }, { "epoch": 1.1070840197693574, "grad_norm": 0.6275586745185258, "learning_rate": 7.092006724816896e-07, "loss": 0.2374, "step": 2016 }, { "epoch": 1.1076331685886875, "grad_norm": 0.5791198925530107, "learning_rate": 7.089368438597548e-07, "loss": 0.2661, "step": 2017 }, { "epoch": 1.1081823174080176, "grad_norm": 0.5080422059573922, "learning_rate": 7.086729447462057e-07, "loss": 0.2433, "step": 2018 }, { "epoch": 1.1087314662273475, "grad_norm": 0.538816618379428, "learning_rate": 7.08408975230099e-07, "loss": 0.2457, "step": 2019 }, { "epoch": 1.1092806150466776, "grad_norm": 0.5395042572172819, "learning_rate": 7.081449354005142e-07, "loss": 0.2553, "step": 2020 }, { "epoch": 1.1098297638660077, "grad_norm": 0.45801502622178425, "learning_rate": 7.078808253465551e-07, "loss": 0.2574, "step": 2021 }, { "epoch": 1.1103789126853378, "grad_norm": 0.5294022144766461, "learning_rate": 7.076166451573494e-07, "loss": 0.261, "step": 2022 }, { "epoch": 1.1109280615046677, "grad_norm": 0.4751407036783218, "learning_rate": 7.073523949220478e-07, "loss": 0.2569, "step": 2023 }, { "epoch": 1.1114772103239978, "grad_norm": 0.40491857274890025, "learning_rate": 7.070880747298252e-07, "loss": 0.2538, "step": 2024 }, { "epoch": 1.112026359143328, "grad_norm": 0.5685897807916839, "learning_rate": 7.0682368466988e-07, "loss": 0.3149, "step": 2025 }, { "epoch": 1.1125755079626578, "grad_norm": 0.551199924674557, "learning_rate": 7.06559224831434e-07, "loss": 0.2611, "step": 2026 }, { "epoch": 1.113124656781988, "grad_norm": 0.5561341930314058, "learning_rate": 7.062946953037327e-07, "loss": 0.2711, "step": 2027 }, { "epoch": 1.113673805601318, "grad_norm": 0.4435343747176299, "learning_rate": 7.06030096176045e-07, "loss": 0.2519, "step": 2028 }, { "epoch": 1.114222954420648, "grad_norm": 0.4197566778935351, "learning_rate": 7.057654275376635e-07, "loss": 0.2416, "step": 2029 }, { "epoch": 1.114772103239978, "grad_norm": 0.4148005145919897, "learning_rate": 7.055006894779038e-07, "loss": 0.2129, "step": 2030 }, { "epoch": 1.115321252059308, "grad_norm": 0.4406072147515118, "learning_rate": 7.052358820861058e-07, "loss": 0.2471, "step": 2031 }, { "epoch": 1.1158704008786382, "grad_norm": 0.4715665303241404, "learning_rate": 7.049710054516316e-07, "loss": 0.2744, "step": 2032 }, { "epoch": 1.116419549697968, "grad_norm": 0.543210091975553, "learning_rate": 7.047060596638679e-07, "loss": 0.3029, "step": 2033 }, { "epoch": 1.1169686985172982, "grad_norm": 0.44540658979555325, "learning_rate": 7.044410448122236e-07, "loss": 0.2495, "step": 2034 }, { "epoch": 1.1175178473366283, "grad_norm": 0.4778455656092044, "learning_rate": 7.041759609861316e-07, "loss": 0.2196, "step": 2035 }, { "epoch": 1.1180669961559582, "grad_norm": 0.4108049274170457, "learning_rate": 7.039108082750484e-07, "loss": 0.2855, "step": 2036 }, { "epoch": 1.1186161449752883, "grad_norm": 0.6344012446162273, "learning_rate": 7.036455867684525e-07, "loss": 0.3361, "step": 2037 }, { "epoch": 1.1191652937946184, "grad_norm": 0.47896641824630637, "learning_rate": 7.033802965558471e-07, "loss": 0.2488, "step": 2038 }, { "epoch": 1.1197144426139485, "grad_norm": 0.4859682210256004, "learning_rate": 7.031149377267574e-07, "loss": 0.2696, "step": 2039 }, { "epoch": 1.1202635914332784, "grad_norm": 0.6128079625103663, "learning_rate": 7.028495103707324e-07, "loss": 0.2487, "step": 2040 }, { "epoch": 1.1208127402526085, "grad_norm": 0.5375253310545763, "learning_rate": 7.025840145773441e-07, "loss": 0.2953, "step": 2041 }, { "epoch": 1.1213618890719386, "grad_norm": 0.5447963110673394, "learning_rate": 7.023184504361874e-07, "loss": 0.2275, "step": 2042 }, { "epoch": 1.1219110378912684, "grad_norm": 0.46751918113842783, "learning_rate": 7.020528180368805e-07, "loss": 0.2649, "step": 2043 }, { "epoch": 1.1224601867105986, "grad_norm": 0.5164872267535395, "learning_rate": 7.017871174690647e-07, "loss": 0.2354, "step": 2044 }, { "epoch": 1.1230093355299287, "grad_norm": 0.37121250720665727, "learning_rate": 7.015213488224039e-07, "loss": 0.2606, "step": 2045 }, { "epoch": 1.1235584843492585, "grad_norm": 0.5491751102779595, "learning_rate": 7.012555121865853e-07, "loss": 0.2766, "step": 2046 }, { "epoch": 1.1241076331685886, "grad_norm": 0.46326418719741624, "learning_rate": 7.009896076513191e-07, "loss": 0.2743, "step": 2047 }, { "epoch": 1.1246567819879187, "grad_norm": 0.3796077874222295, "learning_rate": 7.00723635306338e-07, "loss": 0.2438, "step": 2048 }, { "epoch": 1.1252059308072488, "grad_norm": 0.45247177404577776, "learning_rate": 7.004575952413982e-07, "loss": 0.2703, "step": 2049 }, { "epoch": 1.1257550796265787, "grad_norm": 0.511537686308602, "learning_rate": 7.001914875462784e-07, "loss": 0.2713, "step": 2050 }, { "epoch": 1.1263042284459088, "grad_norm": 0.40394344228233975, "learning_rate": 6.999253123107798e-07, "loss": 0.2645, "step": 2051 }, { "epoch": 1.126853377265239, "grad_norm": 0.4322337380009016, "learning_rate": 6.996590696247268e-07, "loss": 0.2697, "step": 2052 }, { "epoch": 1.1274025260845688, "grad_norm": 0.6304146195653357, "learning_rate": 6.99392759577967e-07, "loss": 0.2681, "step": 2053 }, { "epoch": 1.127951674903899, "grad_norm": 0.5796610147192044, "learning_rate": 6.991263822603697e-07, "loss": 0.28, "step": 2054 }, { "epoch": 1.128500823723229, "grad_norm": 0.44163012072057845, "learning_rate": 6.988599377618277e-07, "loss": 0.243, "step": 2055 }, { "epoch": 1.1290499725425591, "grad_norm": 0.7345339727406786, "learning_rate": 6.985934261722561e-07, "loss": 0.2687, "step": 2056 }, { "epoch": 1.129599121361889, "grad_norm": 0.5530745437452842, "learning_rate": 6.983268475815925e-07, "loss": 0.233, "step": 2057 }, { "epoch": 1.130148270181219, "grad_norm": 0.42956914146761865, "learning_rate": 6.98060202079798e-07, "loss": 0.2514, "step": 2058 }, { "epoch": 1.1306974190005492, "grad_norm": 0.394950067161238, "learning_rate": 6.977934897568551e-07, "loss": 0.2743, "step": 2059 }, { "epoch": 1.131246567819879, "grad_norm": 0.47320784491587936, "learning_rate": 6.975267107027694e-07, "loss": 0.2677, "step": 2060 }, { "epoch": 1.1317957166392092, "grad_norm": 0.4566836815984222, "learning_rate": 6.972598650075693e-07, "loss": 0.2222, "step": 2061 }, { "epoch": 1.1323448654585393, "grad_norm": 0.5279458488988885, "learning_rate": 6.969929527613051e-07, "loss": 0.2704, "step": 2062 }, { "epoch": 1.1328940142778694, "grad_norm": 0.44655930508101255, "learning_rate": 6.9672597405405e-07, "loss": 0.2294, "step": 2063 }, { "epoch": 1.1334431630971993, "grad_norm": 0.564628631594905, "learning_rate": 6.964589289758995e-07, "loss": 0.2675, "step": 2064 }, { "epoch": 1.1339923119165294, "grad_norm": 0.3847657136218291, "learning_rate": 6.961918176169715e-07, "loss": 0.2264, "step": 2065 }, { "epoch": 1.1345414607358595, "grad_norm": 0.43930699845250576, "learning_rate": 6.959246400674059e-07, "loss": 0.2445, "step": 2066 }, { "epoch": 1.1350906095551894, "grad_norm": 0.47231699866567417, "learning_rate": 6.956573964173657e-07, "loss": 0.2174, "step": 2067 }, { "epoch": 1.1356397583745195, "grad_norm": 0.47811092606348193, "learning_rate": 6.953900867570357e-07, "loss": 0.2703, "step": 2068 }, { "epoch": 1.1361889071938496, "grad_norm": 0.5014165167841236, "learning_rate": 6.951227111766229e-07, "loss": 0.2793, "step": 2069 }, { "epoch": 1.1367380560131797, "grad_norm": 0.4219943590198461, "learning_rate": 6.948552697663568e-07, "loss": 0.3047, "step": 2070 }, { "epoch": 1.1372872048325096, "grad_norm": 0.5171397486569508, "learning_rate": 6.94587762616489e-07, "loss": 0.2813, "step": 2071 }, { "epoch": 1.1378363536518397, "grad_norm": 0.4343622223122472, "learning_rate": 6.943201898172934e-07, "loss": 0.241, "step": 2072 }, { "epoch": 1.1383855024711698, "grad_norm": 0.42913308473769335, "learning_rate": 6.940525514590657e-07, "loss": 0.2534, "step": 2073 }, { "epoch": 1.1389346512904996, "grad_norm": 0.5067389680375134, "learning_rate": 6.937848476321244e-07, "loss": 0.2715, "step": 2074 }, { "epoch": 1.1394838001098297, "grad_norm": 0.44136904618348255, "learning_rate": 6.935170784268097e-07, "loss": 0.2364, "step": 2075 }, { "epoch": 1.1400329489291599, "grad_norm": 0.5461600370571491, "learning_rate": 6.93249243933483e-07, "loss": 0.2759, "step": 2076 }, { "epoch": 1.14058209774849, "grad_norm": 0.556403684418123, "learning_rate": 6.929813442425297e-07, "loss": 0.3059, "step": 2077 }, { "epoch": 1.1411312465678198, "grad_norm": 0.49028773016860017, "learning_rate": 6.927133794443552e-07, "loss": 0.2658, "step": 2078 }, { "epoch": 1.14168039538715, "grad_norm": 0.6142294814576017, "learning_rate": 6.924453496293883e-07, "loss": 0.3269, "step": 2079 }, { "epoch": 1.14222954420648, "grad_norm": 0.4334593647887194, "learning_rate": 6.921772548880789e-07, "loss": 0.2666, "step": 2080 }, { "epoch": 1.14277869302581, "grad_norm": 0.5841749149027798, "learning_rate": 6.919090953108993e-07, "loss": 0.3252, "step": 2081 }, { "epoch": 1.14332784184514, "grad_norm": 0.5172855884529847, "learning_rate": 6.916408709883432e-07, "loss": 0.2627, "step": 2082 }, { "epoch": 1.1438769906644701, "grad_norm": 0.5713609460606477, "learning_rate": 6.913725820109266e-07, "loss": 0.2772, "step": 2083 }, { "epoch": 1.1444261394838002, "grad_norm": 0.5302134133778692, "learning_rate": 6.911042284691872e-07, "loss": 0.2656, "step": 2084 }, { "epoch": 1.1449752883031301, "grad_norm": 0.4973967230332425, "learning_rate": 6.908358104536843e-07, "loss": 0.2605, "step": 2085 }, { "epoch": 1.1455244371224602, "grad_norm": 0.4673268626434139, "learning_rate": 6.905673280549993e-07, "loss": 0.2444, "step": 2086 }, { "epoch": 1.1460735859417903, "grad_norm": 0.4682573088269765, "learning_rate": 6.90298781363735e-07, "loss": 0.2248, "step": 2087 }, { "epoch": 1.1466227347611202, "grad_norm": 0.43816531275429343, "learning_rate": 6.900301704705158e-07, "loss": 0.2446, "step": 2088 }, { "epoch": 1.1471718835804503, "grad_norm": 0.41736772392728405, "learning_rate": 6.897614954659883e-07, "loss": 0.2468, "step": 2089 }, { "epoch": 1.1477210323997804, "grad_norm": 0.5686596989297472, "learning_rate": 6.894927564408202e-07, "loss": 0.247, "step": 2090 }, { "epoch": 1.1482701812191103, "grad_norm": 0.5074497667087635, "learning_rate": 6.892239534857013e-07, "loss": 0.2963, "step": 2091 }, { "epoch": 1.1488193300384404, "grad_norm": 0.5802737362297152, "learning_rate": 6.889550866913423e-07, "loss": 0.276, "step": 2092 }, { "epoch": 1.1493684788577705, "grad_norm": 0.47884515587841325, "learning_rate": 6.88686156148476e-07, "loss": 0.2751, "step": 2093 }, { "epoch": 1.1499176276771004, "grad_norm": 0.4628259021096923, "learning_rate": 6.884171619478568e-07, "loss": 0.2487, "step": 2094 }, { "epoch": 1.1504667764964305, "grad_norm": 0.4250249064931077, "learning_rate": 6.881481041802601e-07, "loss": 0.2659, "step": 2095 }, { "epoch": 1.1510159253157606, "grad_norm": 0.446772783883665, "learning_rate": 6.878789829364828e-07, "loss": 0.2645, "step": 2096 }, { "epoch": 1.1515650741350907, "grad_norm": 0.4952865235990931, "learning_rate": 6.876097983073437e-07, "loss": 0.2596, "step": 2097 }, { "epoch": 1.1521142229544206, "grad_norm": 0.5780570036531998, "learning_rate": 6.873405503836827e-07, "loss": 0.2373, "step": 2098 }, { "epoch": 1.1526633717737507, "grad_norm": 0.5874807740289929, "learning_rate": 6.870712392563611e-07, "loss": 0.2528, "step": 2099 }, { "epoch": 1.1532125205930808, "grad_norm": 0.5298117374415783, "learning_rate": 6.868018650162612e-07, "loss": 0.2903, "step": 2100 }, { "epoch": 1.1537616694124107, "grad_norm": 0.40908675157594043, "learning_rate": 6.865324277542869e-07, "loss": 0.2787, "step": 2101 }, { "epoch": 1.1543108182317408, "grad_norm": 0.4934033397721633, "learning_rate": 6.862629275613637e-07, "loss": 0.2856, "step": 2102 }, { "epoch": 1.1548599670510709, "grad_norm": 0.5388145674356845, "learning_rate": 6.859933645284376e-07, "loss": 0.2621, "step": 2103 }, { "epoch": 1.155409115870401, "grad_norm": 0.45639083282557674, "learning_rate": 6.857237387464765e-07, "loss": 0.2576, "step": 2104 }, { "epoch": 1.1559582646897308, "grad_norm": 0.5851889731414385, "learning_rate": 6.854540503064688e-07, "loss": 0.2808, "step": 2105 }, { "epoch": 1.156507413509061, "grad_norm": 0.43187611981818386, "learning_rate": 6.851842992994248e-07, "loss": 0.2435, "step": 2106 }, { "epoch": 1.157056562328391, "grad_norm": 0.52972603231515, "learning_rate": 6.849144858163754e-07, "loss": 0.2839, "step": 2107 }, { "epoch": 1.157605711147721, "grad_norm": 0.3605870813580465, "learning_rate": 6.846446099483726e-07, "loss": 0.2373, "step": 2108 }, { "epoch": 1.158154859967051, "grad_norm": 0.436749403383961, "learning_rate": 6.843746717864898e-07, "loss": 0.2734, "step": 2109 }, { "epoch": 1.1587040087863811, "grad_norm": 0.44144694431352005, "learning_rate": 6.841046714218209e-07, "loss": 0.2299, "step": 2110 }, { "epoch": 1.1592531576057112, "grad_norm": 0.4984618775558761, "learning_rate": 6.83834608945481e-07, "loss": 0.2781, "step": 2111 }, { "epoch": 1.1598023064250411, "grad_norm": 0.6043965526426754, "learning_rate": 6.835644844486067e-07, "loss": 0.2756, "step": 2112 }, { "epoch": 1.1603514552443712, "grad_norm": 0.5394594511776718, "learning_rate": 6.832942980223547e-07, "loss": 0.2896, "step": 2113 }, { "epoch": 1.1609006040637013, "grad_norm": 0.5769961522334025, "learning_rate": 6.83024049757903e-07, "loss": 0.2822, "step": 2114 }, { "epoch": 1.1614497528830312, "grad_norm": 0.45650487649307786, "learning_rate": 6.827537397464507e-07, "loss": 0.2175, "step": 2115 }, { "epoch": 1.1619989017023613, "grad_norm": 0.45461813764392106, "learning_rate": 6.824833680792172e-07, "loss": 0.3091, "step": 2116 }, { "epoch": 1.1625480505216914, "grad_norm": 0.4373880082156897, "learning_rate": 6.82212934847443e-07, "loss": 0.3065, "step": 2117 }, { "epoch": 1.1630971993410215, "grad_norm": 0.5695126690017801, "learning_rate": 6.819424401423894e-07, "loss": 0.2355, "step": 2118 }, { "epoch": 1.1636463481603514, "grad_norm": 0.5803660099912648, "learning_rate": 6.816718840553384e-07, "loss": 0.2636, "step": 2119 }, { "epoch": 1.1641954969796815, "grad_norm": 0.3907169640570221, "learning_rate": 6.814012666775928e-07, "loss": 0.2691, "step": 2120 }, { "epoch": 1.1647446457990116, "grad_norm": 0.5566375476157985, "learning_rate": 6.811305881004758e-07, "loss": 0.2294, "step": 2121 }, { "epoch": 1.1652937946183415, "grad_norm": 0.4369089606070346, "learning_rate": 6.808598484153315e-07, "loss": 0.2631, "step": 2122 }, { "epoch": 1.1658429434376716, "grad_norm": 0.41317936199448335, "learning_rate": 6.805890477135247e-07, "loss": 0.2501, "step": 2123 }, { "epoch": 1.1663920922570017, "grad_norm": 0.45122740739912137, "learning_rate": 6.803181860864406e-07, "loss": 0.2426, "step": 2124 }, { "epoch": 1.1669412410763318, "grad_norm": 0.5024729709068514, "learning_rate": 6.80047263625485e-07, "loss": 0.2583, "step": 2125 }, { "epoch": 1.1674903898956617, "grad_norm": 0.4717435664928225, "learning_rate": 6.797762804220843e-07, "loss": 0.2934, "step": 2126 }, { "epoch": 1.1680395387149918, "grad_norm": 0.598639358461304, "learning_rate": 6.795052365676854e-07, "loss": 0.2626, "step": 2127 }, { "epoch": 1.1685886875343219, "grad_norm": 0.5251482151620749, "learning_rate": 6.792341321537551e-07, "loss": 0.2678, "step": 2128 }, { "epoch": 1.1691378363536518, "grad_norm": 0.41750186755239965, "learning_rate": 6.78962967271782e-07, "loss": 0.2587, "step": 2129 }, { "epoch": 1.1696869851729819, "grad_norm": 0.43898996882748204, "learning_rate": 6.786917420132735e-07, "loss": 0.2399, "step": 2130 }, { "epoch": 1.170236133992312, "grad_norm": 0.8178925836590968, "learning_rate": 6.784204564697587e-07, "loss": 0.3316, "step": 2131 }, { "epoch": 1.170785282811642, "grad_norm": 0.713210199250082, "learning_rate": 6.781491107327863e-07, "loss": 0.2887, "step": 2132 }, { "epoch": 1.171334431630972, "grad_norm": 0.5199231649885091, "learning_rate": 6.77877704893925e-07, "loss": 0.2555, "step": 2133 }, { "epoch": 1.171883580450302, "grad_norm": 0.38952773013299286, "learning_rate": 6.776062390447649e-07, "loss": 0.2171, "step": 2134 }, { "epoch": 1.1724327292696322, "grad_norm": 0.4734649835268734, "learning_rate": 6.773347132769157e-07, "loss": 0.2498, "step": 2135 }, { "epoch": 1.172981878088962, "grad_norm": 0.5314347504365042, "learning_rate": 6.77063127682007e-07, "loss": 0.2612, "step": 2136 }, { "epoch": 1.1735310269082921, "grad_norm": 0.5761442175693938, "learning_rate": 6.767914823516891e-07, "loss": 0.2756, "step": 2137 }, { "epoch": 1.1740801757276222, "grad_norm": 0.6230565960967867, "learning_rate": 6.765197773776323e-07, "loss": 0.2802, "step": 2138 }, { "epoch": 1.1746293245469523, "grad_norm": 0.5339967050316438, "learning_rate": 6.76248012851527e-07, "loss": 0.2896, "step": 2139 }, { "epoch": 1.1751784733662822, "grad_norm": 0.4020019474671675, "learning_rate": 6.759761888650836e-07, "loss": 0.2788, "step": 2140 }, { "epoch": 1.1757276221856123, "grad_norm": 0.4480326148913986, "learning_rate": 6.75704305510033e-07, "loss": 0.2719, "step": 2141 }, { "epoch": 1.1762767710049424, "grad_norm": 0.5296871559178602, "learning_rate": 6.754323628781256e-07, "loss": 0.2692, "step": 2142 }, { "epoch": 1.1768259198242723, "grad_norm": 0.49631193645903, "learning_rate": 6.751603610611321e-07, "loss": 0.2572, "step": 2143 }, { "epoch": 1.1773750686436024, "grad_norm": 0.48103885412321784, "learning_rate": 6.748883001508428e-07, "loss": 0.2874, "step": 2144 }, { "epoch": 1.1779242174629325, "grad_norm": 0.3762440711682999, "learning_rate": 6.746161802390686e-07, "loss": 0.2643, "step": 2145 }, { "epoch": 1.1784733662822624, "grad_norm": 0.5509963932256832, "learning_rate": 6.743440014176397e-07, "loss": 0.2231, "step": 2146 }, { "epoch": 1.1790225151015925, "grad_norm": 0.4922225222351292, "learning_rate": 6.740717637784066e-07, "loss": 0.2823, "step": 2147 }, { "epoch": 1.1795716639209226, "grad_norm": 0.5191490587212442, "learning_rate": 6.737994674132394e-07, "loss": 0.2255, "step": 2148 }, { "epoch": 1.1801208127402525, "grad_norm": 0.430135653084065, "learning_rate": 6.735271124140283e-07, "loss": 0.2548, "step": 2149 }, { "epoch": 1.1806699615595826, "grad_norm": 0.6041380159321779, "learning_rate": 6.732546988726826e-07, "loss": 0.261, "step": 2150 }, { "epoch": 1.1812191103789127, "grad_norm": 0.7758435926062973, "learning_rate": 6.729822268811321e-07, "loss": 0.2759, "step": 2151 }, { "epoch": 1.1817682591982428, "grad_norm": 0.5450707824337424, "learning_rate": 6.727096965313262e-07, "loss": 0.2911, "step": 2152 }, { "epoch": 1.1823174080175727, "grad_norm": 0.47390364020595727, "learning_rate": 6.724371079152337e-07, "loss": 0.2626, "step": 2153 }, { "epoch": 1.1828665568369028, "grad_norm": 0.6348318611884215, "learning_rate": 6.721644611248433e-07, "loss": 0.2291, "step": 2154 }, { "epoch": 1.1834157056562329, "grad_norm": 0.6086643592240616, "learning_rate": 6.71891756252163e-07, "loss": 0.2545, "step": 2155 }, { "epoch": 1.1839648544755628, "grad_norm": 0.4568836736824567, "learning_rate": 6.716189933892209e-07, "loss": 0.2641, "step": 2156 }, { "epoch": 1.1845140032948929, "grad_norm": 0.45688844961111974, "learning_rate": 6.713461726280646e-07, "loss": 0.2594, "step": 2157 }, { "epoch": 1.185063152114223, "grad_norm": 0.4574225410506315, "learning_rate": 6.710732940607605e-07, "loss": 0.2614, "step": 2158 }, { "epoch": 1.185612300933553, "grad_norm": 0.5777129559039402, "learning_rate": 6.708003577793954e-07, "loss": 0.2734, "step": 2159 }, { "epoch": 1.186161449752883, "grad_norm": 0.5471811744350423, "learning_rate": 6.705273638760752e-07, "loss": 0.2598, "step": 2160 }, { "epoch": 1.186710598572213, "grad_norm": 0.48434331771972533, "learning_rate": 6.702543124429253e-07, "loss": 0.2532, "step": 2161 }, { "epoch": 1.1872597473915432, "grad_norm": 0.5328438659996083, "learning_rate": 6.699812035720906e-07, "loss": 0.2466, "step": 2162 }, { "epoch": 1.187808896210873, "grad_norm": 0.6049055274335498, "learning_rate": 6.697080373557352e-07, "loss": 0.2526, "step": 2163 }, { "epoch": 1.1883580450302031, "grad_norm": 0.5519016985830592, "learning_rate": 6.694348138860425e-07, "loss": 0.2407, "step": 2164 }, { "epoch": 1.1889071938495333, "grad_norm": 0.46036100366904986, "learning_rate": 6.691615332552154e-07, "loss": 0.2537, "step": 2165 }, { "epoch": 1.1894563426688634, "grad_norm": 0.4785366472196232, "learning_rate": 6.688881955554764e-07, "loss": 0.261, "step": 2166 }, { "epoch": 1.1900054914881932, "grad_norm": 0.5024024129155988, "learning_rate": 6.686148008790663e-07, "loss": 0.2688, "step": 2167 }, { "epoch": 1.1905546403075233, "grad_norm": 0.4873968713537025, "learning_rate": 6.683413493182461e-07, "loss": 0.263, "step": 2168 }, { "epoch": 1.1911037891268534, "grad_norm": 0.7152654341361451, "learning_rate": 6.680678409652957e-07, "loss": 0.3223, "step": 2169 }, { "epoch": 1.1916529379461833, "grad_norm": 0.4766889123590685, "learning_rate": 6.677942759125136e-07, "loss": 0.3251, "step": 2170 }, { "epoch": 1.1922020867655134, "grad_norm": 0.5314049439535203, "learning_rate": 6.675206542522188e-07, "loss": 0.2347, "step": 2171 }, { "epoch": 1.1927512355848435, "grad_norm": 0.47337531507436137, "learning_rate": 6.672469760767477e-07, "loss": 0.2853, "step": 2172 }, { "epoch": 1.1933003844041736, "grad_norm": 0.5995445290549404, "learning_rate": 6.66973241478457e-07, "loss": 0.2598, "step": 2173 }, { "epoch": 1.1938495332235035, "grad_norm": 0.4356276959607383, "learning_rate": 6.666994505497221e-07, "loss": 0.2306, "step": 2174 }, { "epoch": 1.1943986820428336, "grad_norm": 0.5211127414669526, "learning_rate": 6.664256033829369e-07, "loss": 0.2946, "step": 2175 }, { "epoch": 1.1949478308621637, "grad_norm": 0.405545643753939, "learning_rate": 6.661517000705154e-07, "loss": 0.2047, "step": 2176 }, { "epoch": 1.1954969796814936, "grad_norm": 0.4082555110920402, "learning_rate": 6.658777407048894e-07, "loss": 0.2602, "step": 2177 }, { "epoch": 1.1960461285008237, "grad_norm": 0.5039976872573545, "learning_rate": 6.656037253785101e-07, "loss": 0.2382, "step": 2178 }, { "epoch": 1.1965952773201538, "grad_norm": 0.467384634086635, "learning_rate": 6.653296541838478e-07, "loss": 0.2694, "step": 2179 }, { "epoch": 1.197144426139484, "grad_norm": 0.4733061718436572, "learning_rate": 6.650555272133912e-07, "loss": 0.3107, "step": 2180 }, { "epoch": 1.1976935749588138, "grad_norm": 0.43902449684695394, "learning_rate": 6.647813445596483e-07, "loss": 0.2776, "step": 2181 }, { "epoch": 1.198242723778144, "grad_norm": 0.5524664763531649, "learning_rate": 6.645071063151454e-07, "loss": 0.2498, "step": 2182 }, { "epoch": 1.198791872597474, "grad_norm": 0.5103790371570295, "learning_rate": 6.64232812572428e-07, "loss": 0.2571, "step": 2183 }, { "epoch": 1.1993410214168039, "grad_norm": 0.399707347690683, "learning_rate": 6.639584634240602e-07, "loss": 0.2989, "step": 2184 }, { "epoch": 1.199890170236134, "grad_norm": 0.49242600158546507, "learning_rate": 6.636840589626243e-07, "loss": 0.27, "step": 2185 }, { "epoch": 1.200439319055464, "grad_norm": 0.49997191726719, "learning_rate": 6.634095992807221e-07, "loss": 0.2782, "step": 2186 }, { "epoch": 1.2009884678747942, "grad_norm": 0.4598548272948519, "learning_rate": 6.631350844709737e-07, "loss": 0.2668, "step": 2187 }, { "epoch": 1.201537616694124, "grad_norm": 0.6873319689011373, "learning_rate": 6.628605146260174e-07, "loss": 0.2462, "step": 2188 }, { "epoch": 1.2020867655134542, "grad_norm": 0.3998755946484521, "learning_rate": 6.625858898385104e-07, "loss": 0.2939, "step": 2189 }, { "epoch": 1.2026359143327843, "grad_norm": 0.5351111947289814, "learning_rate": 6.623112102011289e-07, "loss": 0.258, "step": 2190 }, { "epoch": 1.2031850631521142, "grad_norm": 0.508085769994925, "learning_rate": 6.620364758065667e-07, "loss": 0.2641, "step": 2191 }, { "epoch": 1.2037342119714443, "grad_norm": 0.55888243333363, "learning_rate": 6.617616867475366e-07, "loss": 0.2585, "step": 2192 }, { "epoch": 1.2042833607907744, "grad_norm": 0.4938157902879376, "learning_rate": 6.614868431167701e-07, "loss": 0.2292, "step": 2193 }, { "epoch": 1.2048325096101045, "grad_norm": 0.5587859685736685, "learning_rate": 6.612119450070164e-07, "loss": 0.2579, "step": 2194 }, { "epoch": 1.2053816584294343, "grad_norm": 0.47235668111834467, "learning_rate": 6.609369925110437e-07, "loss": 0.2392, "step": 2195 }, { "epoch": 1.2059308072487644, "grad_norm": 0.5490419319348965, "learning_rate": 6.606619857216384e-07, "loss": 0.2449, "step": 2196 }, { "epoch": 1.2064799560680946, "grad_norm": 0.4822197201239783, "learning_rate": 6.603869247316051e-07, "loss": 0.2786, "step": 2197 }, { "epoch": 1.2070291048874244, "grad_norm": 0.5307496384306721, "learning_rate": 6.601118096337668e-07, "loss": 0.2261, "step": 2198 }, { "epoch": 1.2075782537067545, "grad_norm": 0.5091163573548654, "learning_rate": 6.598366405209645e-07, "loss": 0.2693, "step": 2199 }, { "epoch": 1.2081274025260846, "grad_norm": 0.42126907648894085, "learning_rate": 6.595614174860577e-07, "loss": 0.2272, "step": 2200 }, { "epoch": 1.2081274025260846, "eval_loss": 0.3368612229824066, "eval_runtime": 18.6625, "eval_samples_per_second": 23.737, "eval_steps_per_second": 1.018, "step": 2200 }, { "epoch": 1.2086765513454145, "grad_norm": 0.43173874800445644, "learning_rate": 6.59286140621924e-07, "loss": 0.2622, "step": 2201 }, { "epoch": 1.2092257001647446, "grad_norm": 0.5411498413668017, "learning_rate": 6.590108100214596e-07, "loss": 0.303, "step": 2202 }, { "epoch": 1.2097748489840747, "grad_norm": 0.4462051774365792, "learning_rate": 6.58735425777578e-07, "loss": 0.2574, "step": 2203 }, { "epoch": 1.2103239978034046, "grad_norm": 0.5938157804849086, "learning_rate": 6.584599879832112e-07, "loss": 0.2491, "step": 2204 }, { "epoch": 1.2108731466227347, "grad_norm": 0.44387246494698895, "learning_rate": 6.581844967313097e-07, "loss": 0.2294, "step": 2205 }, { "epoch": 1.2114222954420648, "grad_norm": 0.7786217814585868, "learning_rate": 6.579089521148412e-07, "loss": 0.2165, "step": 2206 }, { "epoch": 1.211971444261395, "grad_norm": 0.6231069612650889, "learning_rate": 6.576333542267925e-07, "loss": 0.2619, "step": 2207 }, { "epoch": 1.2125205930807248, "grad_norm": 0.5102568325667478, "learning_rate": 6.573577031601669e-07, "loss": 0.2296, "step": 2208 }, { "epoch": 1.213069741900055, "grad_norm": 0.5079946725298959, "learning_rate": 6.570819990079872e-07, "loss": 0.261, "step": 2209 }, { "epoch": 1.213618890719385, "grad_norm": 0.49278078610283893, "learning_rate": 6.568062418632928e-07, "loss": 0.2761, "step": 2210 }, { "epoch": 1.2141680395387149, "grad_norm": 0.5001108108788075, "learning_rate": 6.565304318191419e-07, "loss": 0.2622, "step": 2211 }, { "epoch": 1.214717188358045, "grad_norm": 0.5101032372498414, "learning_rate": 6.562545689686105e-07, "loss": 0.214, "step": 2212 }, { "epoch": 1.215266337177375, "grad_norm": 0.4635276398319039, "learning_rate": 6.559786534047916e-07, "loss": 0.2378, "step": 2213 }, { "epoch": 1.2158154859967052, "grad_norm": 0.6141265780112877, "learning_rate": 6.557026852207966e-07, "loss": 0.2588, "step": 2214 }, { "epoch": 1.216364634816035, "grad_norm": 0.4326858319468491, "learning_rate": 6.554266645097553e-07, "loss": 0.2495, "step": 2215 }, { "epoch": 1.2169137836353652, "grad_norm": 0.4565182435415461, "learning_rate": 6.551505913648135e-07, "loss": 0.2408, "step": 2216 }, { "epoch": 1.2174629324546953, "grad_norm": 0.4313385886030161, "learning_rate": 6.548744658791365e-07, "loss": 0.2864, "step": 2217 }, { "epoch": 1.2180120812740252, "grad_norm": 0.42757787582310125, "learning_rate": 6.545982881459063e-07, "loss": 0.2772, "step": 2218 }, { "epoch": 1.2185612300933553, "grad_norm": 0.6135586508110499, "learning_rate": 6.543220582583222e-07, "loss": 0.2618, "step": 2219 }, { "epoch": 1.2191103789126854, "grad_norm": 0.47535001423953216, "learning_rate": 6.540457763096022e-07, "loss": 0.2447, "step": 2220 }, { "epoch": 1.2196595277320155, "grad_norm": 0.45185378186179237, "learning_rate": 6.537694423929813e-07, "loss": 0.2389, "step": 2221 }, { "epoch": 1.2202086765513454, "grad_norm": 0.46088105353148384, "learning_rate": 6.534930566017116e-07, "loss": 0.2662, "step": 2222 }, { "epoch": 1.2207578253706755, "grad_norm": 0.5412847710183424, "learning_rate": 6.532166190290634e-07, "loss": 0.2229, "step": 2223 }, { "epoch": 1.2213069741900056, "grad_norm": 0.5463083189980393, "learning_rate": 6.52940129768324e-07, "loss": 0.2247, "step": 2224 }, { "epoch": 1.2218561230093354, "grad_norm": 0.49425146294854966, "learning_rate": 6.526635889127986e-07, "loss": 0.2384, "step": 2225 }, { "epoch": 1.2224052718286655, "grad_norm": 0.4851479181683103, "learning_rate": 6.523869965558094e-07, "loss": 0.2432, "step": 2226 }, { "epoch": 1.2229544206479956, "grad_norm": 0.5009516478893922, "learning_rate": 6.521103527906962e-07, "loss": 0.2311, "step": 2227 }, { "epoch": 1.2235035694673257, "grad_norm": 0.717558020742917, "learning_rate": 6.518336577108159e-07, "loss": 0.2592, "step": 2228 }, { "epoch": 1.2240527182866556, "grad_norm": 0.5183795004182595, "learning_rate": 6.515569114095431e-07, "loss": 0.2783, "step": 2229 }, { "epoch": 1.2246018671059857, "grad_norm": 0.3866864111750265, "learning_rate": 6.512801139802694e-07, "loss": 0.2163, "step": 2230 }, { "epoch": 1.2251510159253158, "grad_norm": 0.46290113518458137, "learning_rate": 6.510032655164037e-07, "loss": 0.2693, "step": 2231 }, { "epoch": 1.2257001647446457, "grad_norm": 0.4272308297689702, "learning_rate": 6.507263661113723e-07, "loss": 0.2587, "step": 2232 }, { "epoch": 1.2262493135639758, "grad_norm": 0.45163920893859505, "learning_rate": 6.504494158586183e-07, "loss": 0.2468, "step": 2233 }, { "epoch": 1.226798462383306, "grad_norm": 0.5738383287617221, "learning_rate": 6.501724148516026e-07, "loss": 0.2574, "step": 2234 }, { "epoch": 1.227347611202636, "grad_norm": 0.45125445739106435, "learning_rate": 6.498953631838022e-07, "loss": 0.234, "step": 2235 }, { "epoch": 1.227896760021966, "grad_norm": 0.43967210351881736, "learning_rate": 6.496182609487124e-07, "loss": 0.2484, "step": 2236 }, { "epoch": 1.228445908841296, "grad_norm": 0.5875977907965385, "learning_rate": 6.493411082398449e-07, "loss": 0.2629, "step": 2237 }, { "epoch": 1.2289950576606261, "grad_norm": 0.5347991927899313, "learning_rate": 6.490639051507282e-07, "loss": 0.2297, "step": 2238 }, { "epoch": 1.229544206479956, "grad_norm": 0.4881992460120709, "learning_rate": 6.487866517749087e-07, "loss": 0.2559, "step": 2239 }, { "epoch": 1.230093355299286, "grad_norm": 0.5231473624481398, "learning_rate": 6.485093482059487e-07, "loss": 0.2343, "step": 2240 }, { "epoch": 1.2306425041186162, "grad_norm": 0.43789417940924563, "learning_rate": 6.482319945374281e-07, "loss": 0.2332, "step": 2241 }, { "epoch": 1.2311916529379463, "grad_norm": 0.4563740639069909, "learning_rate": 6.479545908629436e-07, "loss": 0.2424, "step": 2242 }, { "epoch": 1.2317408017572762, "grad_norm": 0.5219025626713153, "learning_rate": 6.476771372761086e-07, "loss": 0.2879, "step": 2243 }, { "epoch": 1.2322899505766063, "grad_norm": 0.5540590324315269, "learning_rate": 6.473996338705538e-07, "loss": 0.2815, "step": 2244 }, { "epoch": 1.2328390993959364, "grad_norm": 0.5358830340500076, "learning_rate": 6.471220807399258e-07, "loss": 0.2716, "step": 2245 }, { "epoch": 1.2333882482152663, "grad_norm": 0.5241034280704541, "learning_rate": 6.46844477977889e-07, "loss": 0.2217, "step": 2246 }, { "epoch": 1.2339373970345964, "grad_norm": 0.42898456649126404, "learning_rate": 6.465668256781239e-07, "loss": 0.2063, "step": 2247 }, { "epoch": 1.2344865458539265, "grad_norm": 0.6288596020822125, "learning_rate": 6.462891239343282e-07, "loss": 0.2302, "step": 2248 }, { "epoch": 1.2350356946732564, "grad_norm": 0.4849890945681238, "learning_rate": 6.460113728402157e-07, "loss": 0.2808, "step": 2249 }, { "epoch": 1.2355848434925865, "grad_norm": 0.4743214066810768, "learning_rate": 6.45733572489517e-07, "loss": 0.216, "step": 2250 }, { "epoch": 1.2361339923119166, "grad_norm": 0.4346520676028857, "learning_rate": 6.454557229759802e-07, "loss": 0.2658, "step": 2251 }, { "epoch": 1.2366831411312464, "grad_norm": 0.4921299582866448, "learning_rate": 6.451778243933685e-07, "loss": 0.2489, "step": 2252 }, { "epoch": 1.2372322899505765, "grad_norm": 0.5305772672272118, "learning_rate": 6.448998768354627e-07, "loss": 0.2622, "step": 2253 }, { "epoch": 1.2377814387699067, "grad_norm": 0.544947416318495, "learning_rate": 6.446218803960602e-07, "loss": 0.2409, "step": 2254 }, { "epoch": 1.2383305875892368, "grad_norm": 0.5296315820812031, "learning_rate": 6.443438351689741e-07, "loss": 0.2688, "step": 2255 }, { "epoch": 1.2388797364085666, "grad_norm": 0.4260918953344602, "learning_rate": 6.440657412480348e-07, "loss": 0.2781, "step": 2256 }, { "epoch": 1.2394288852278967, "grad_norm": 0.43246584287674655, "learning_rate": 6.437875987270883e-07, "loss": 0.2483, "step": 2257 }, { "epoch": 1.2399780340472268, "grad_norm": 0.46545700892403996, "learning_rate": 6.435094076999979e-07, "loss": 0.2559, "step": 2258 }, { "epoch": 1.2405271828665567, "grad_norm": 0.4977965704907033, "learning_rate": 6.432311682606424e-07, "loss": 0.2914, "step": 2259 }, { "epoch": 1.2410763316858868, "grad_norm": 0.41980461778097733, "learning_rate": 6.429528805029178e-07, "loss": 0.2415, "step": 2260 }, { "epoch": 1.241625480505217, "grad_norm": 0.4945950621335048, "learning_rate": 6.426745445207356e-07, "loss": 0.2434, "step": 2261 }, { "epoch": 1.242174629324547, "grad_norm": 0.5706188656276096, "learning_rate": 6.423961604080242e-07, "loss": 0.2085, "step": 2262 }, { "epoch": 1.242723778143877, "grad_norm": 0.696326144316046, "learning_rate": 6.421177282587278e-07, "loss": 0.3354, "step": 2263 }, { "epoch": 1.243272926963207, "grad_norm": 0.43582593221838645, "learning_rate": 6.418392481668072e-07, "loss": 0.2611, "step": 2264 }, { "epoch": 1.2438220757825371, "grad_norm": 0.4350407387790691, "learning_rate": 6.415607202262388e-07, "loss": 0.2276, "step": 2265 }, { "epoch": 1.244371224601867, "grad_norm": 0.4842562141798357, "learning_rate": 6.41282144531016e-07, "loss": 0.2288, "step": 2266 }, { "epoch": 1.244920373421197, "grad_norm": 0.48553133766101747, "learning_rate": 6.410035211751474e-07, "loss": 0.247, "step": 2267 }, { "epoch": 1.2454695222405272, "grad_norm": 0.47871644827328985, "learning_rate": 6.407248502526584e-07, "loss": 0.2323, "step": 2268 }, { "epoch": 1.2460186710598573, "grad_norm": 0.41290906913161507, "learning_rate": 6.4044613185759e-07, "loss": 0.2436, "step": 2269 }, { "epoch": 1.2465678198791872, "grad_norm": 0.439177301848866, "learning_rate": 6.401673660839996e-07, "loss": 0.2239, "step": 2270 }, { "epoch": 1.2471169686985173, "grad_norm": 0.4698859836528455, "learning_rate": 6.398885530259603e-07, "loss": 0.2219, "step": 2271 }, { "epoch": 1.2476661175178474, "grad_norm": 0.42057132323746077, "learning_rate": 6.396096927775608e-07, "loss": 0.2156, "step": 2272 }, { "epoch": 1.2482152663371773, "grad_norm": 0.4235043642535411, "learning_rate": 6.393307854329069e-07, "loss": 0.2327, "step": 2273 }, { "epoch": 1.2487644151565074, "grad_norm": 0.4592560435417908, "learning_rate": 6.39051831086119e-07, "loss": 0.2404, "step": 2274 }, { "epoch": 1.2493135639758375, "grad_norm": 0.45954551262002075, "learning_rate": 6.387728298313343e-07, "loss": 0.2631, "step": 2275 }, { "epoch": 1.2498627127951676, "grad_norm": 0.4521472741078411, "learning_rate": 6.384937817627052e-07, "loss": 0.3033, "step": 2276 }, { "epoch": 1.2504118616144975, "grad_norm": 0.5170834467265043, "learning_rate": 6.382146869744001e-07, "loss": 0.2383, "step": 2277 }, { "epoch": 1.2509610104338276, "grad_norm": 0.4056977366801977, "learning_rate": 6.379355455606036e-07, "loss": 0.2901, "step": 2278 }, { "epoch": 1.2515101592531577, "grad_norm": 0.5621462505062037, "learning_rate": 6.376563576155149e-07, "loss": 0.2483, "step": 2279 }, { "epoch": 1.2520593080724876, "grad_norm": 0.5759149122716783, "learning_rate": 6.373771232333504e-07, "loss": 0.2368, "step": 2280 }, { "epoch": 1.2526084568918177, "grad_norm": 0.5583678983849059, "learning_rate": 6.370978425083411e-07, "loss": 0.2348, "step": 2281 }, { "epoch": 1.2531576057111478, "grad_norm": 0.5609094898378422, "learning_rate": 6.368185155347338e-07, "loss": 0.2264, "step": 2282 }, { "epoch": 1.2537067545304779, "grad_norm": 0.5260052374881327, "learning_rate": 6.365391424067915e-07, "loss": 0.2712, "step": 2283 }, { "epoch": 1.2542559033498077, "grad_norm": 0.5451011571179325, "learning_rate": 6.362597232187917e-07, "loss": 0.2381, "step": 2284 }, { "epoch": 1.2548050521691378, "grad_norm": 0.5356336125938774, "learning_rate": 6.359802580650287e-07, "loss": 0.2634, "step": 2285 }, { "epoch": 1.255354200988468, "grad_norm": 0.5011731436450628, "learning_rate": 6.357007470398114e-07, "loss": 0.2431, "step": 2286 }, { "epoch": 1.2559033498077978, "grad_norm": 0.4015543539043109, "learning_rate": 6.354211902374645e-07, "loss": 0.2351, "step": 2287 }, { "epoch": 1.256452498627128, "grad_norm": 0.47037413804392814, "learning_rate": 6.351415877523281e-07, "loss": 0.2684, "step": 2288 }, { "epoch": 1.257001647446458, "grad_norm": 0.664137721681803, "learning_rate": 6.34861939678758e-07, "loss": 0.3004, "step": 2289 }, { "epoch": 1.2575507962657881, "grad_norm": 0.4530660333600251, "learning_rate": 6.345822461111248e-07, "loss": 0.246, "step": 2290 }, { "epoch": 1.258099945085118, "grad_norm": 0.4742814296200423, "learning_rate": 6.343025071438147e-07, "loss": 0.2397, "step": 2291 }, { "epoch": 1.2586490939044481, "grad_norm": 0.47323844601798776, "learning_rate": 6.340227228712296e-07, "loss": 0.2552, "step": 2292 }, { "epoch": 1.259198242723778, "grad_norm": 0.39972734589093006, "learning_rate": 6.337428933877861e-07, "loss": 0.2259, "step": 2293 }, { "epoch": 1.259747391543108, "grad_norm": 0.497524206282316, "learning_rate": 6.334630187879167e-07, "loss": 0.2647, "step": 2294 }, { "epoch": 1.2602965403624382, "grad_norm": 0.5203691818206629, "learning_rate": 6.331830991660685e-07, "loss": 0.2977, "step": 2295 }, { "epoch": 1.2608456891817683, "grad_norm": 0.5492122693014649, "learning_rate": 6.329031346167041e-07, "loss": 0.2456, "step": 2296 }, { "epoch": 1.2613948380010984, "grad_norm": 0.5250374161432911, "learning_rate": 6.326231252343012e-07, "loss": 0.2185, "step": 2297 }, { "epoch": 1.2619439868204283, "grad_norm": 0.4622927218160168, "learning_rate": 6.323430711133527e-07, "loss": 0.2664, "step": 2298 }, { "epoch": 1.2624931356397584, "grad_norm": 0.6355792112496628, "learning_rate": 6.320629723483665e-07, "loss": 0.2761, "step": 2299 }, { "epoch": 1.2630422844590883, "grad_norm": 0.38958125945190264, "learning_rate": 6.317828290338659e-07, "loss": 0.2437, "step": 2300 }, { "epoch": 1.2635914332784184, "grad_norm": 0.485900382530048, "learning_rate": 6.315026412643886e-07, "loss": 0.2431, "step": 2301 }, { "epoch": 1.2641405820977485, "grad_norm": 0.4710573802841512, "learning_rate": 6.312224091344876e-07, "loss": 0.2492, "step": 2302 }, { "epoch": 1.2646897309170786, "grad_norm": 0.4378029888574416, "learning_rate": 6.309421327387312e-07, "loss": 0.2715, "step": 2303 }, { "epoch": 1.2652388797364087, "grad_norm": 0.5038211334071603, "learning_rate": 6.306618121717022e-07, "loss": 0.2218, "step": 2304 }, { "epoch": 1.2657880285557386, "grad_norm": 0.39342863304494163, "learning_rate": 6.303814475279985e-07, "loss": 0.2382, "step": 2305 }, { "epoch": 1.2663371773750687, "grad_norm": 0.5472122124005581, "learning_rate": 6.301010389022329e-07, "loss": 0.2476, "step": 2306 }, { "epoch": 1.2668863261943986, "grad_norm": 0.4204149320928284, "learning_rate": 6.298205863890329e-07, "loss": 0.2305, "step": 2307 }, { "epoch": 1.2674354750137287, "grad_norm": 0.48839023347011995, "learning_rate": 6.295400900830407e-07, "loss": 0.2399, "step": 2308 }, { "epoch": 1.2679846238330588, "grad_norm": 0.48381023915923504, "learning_rate": 6.29259550078914e-07, "loss": 0.2662, "step": 2309 }, { "epoch": 1.2685337726523889, "grad_norm": 0.46112448013887636, "learning_rate": 6.289789664713239e-07, "loss": 0.2899, "step": 2310 }, { "epoch": 1.269082921471719, "grad_norm": 0.5001991882835727, "learning_rate": 6.286983393549581e-07, "loss": 0.2586, "step": 2311 }, { "epoch": 1.2696320702910489, "grad_norm": 0.5270895575498046, "learning_rate": 6.28417668824517e-07, "loss": 0.2145, "step": 2312 }, { "epoch": 1.270181219110379, "grad_norm": 0.5899290181009109, "learning_rate": 6.28136954974717e-07, "loss": 0.2667, "step": 2313 }, { "epoch": 1.2707303679297088, "grad_norm": 0.5022821468700811, "learning_rate": 6.278561979002886e-07, "loss": 0.3089, "step": 2314 }, { "epoch": 1.271279516749039, "grad_norm": 0.528402273141273, "learning_rate": 6.275753976959767e-07, "loss": 0.2709, "step": 2315 }, { "epoch": 1.271828665568369, "grad_norm": 0.4359182504560581, "learning_rate": 6.272945544565416e-07, "loss": 0.2355, "step": 2316 }, { "epoch": 1.2723778143876991, "grad_norm": 0.39775066322042124, "learning_rate": 6.270136682767571e-07, "loss": 0.2522, "step": 2317 }, { "epoch": 1.272926963207029, "grad_norm": 0.48430113812871545, "learning_rate": 6.26732739251412e-07, "loss": 0.216, "step": 2318 }, { "epoch": 1.2734761120263591, "grad_norm": 0.5248439607982119, "learning_rate": 6.264517674753096e-07, "loss": 0.272, "step": 2319 }, { "epoch": 1.2740252608456892, "grad_norm": 0.4589812459543606, "learning_rate": 6.261707530432676e-07, "loss": 0.2425, "step": 2320 }, { "epoch": 1.2745744096650191, "grad_norm": 0.5294243029520208, "learning_rate": 6.258896960501177e-07, "loss": 0.2525, "step": 2321 }, { "epoch": 1.2751235584843492, "grad_norm": 0.4806539546141748, "learning_rate": 6.256085965907065e-07, "loss": 0.2698, "step": 2322 }, { "epoch": 1.2756727073036793, "grad_norm": 0.6216759041487745, "learning_rate": 6.253274547598948e-07, "loss": 0.2443, "step": 2323 }, { "epoch": 1.2762218561230094, "grad_norm": 0.3662546022104597, "learning_rate": 6.250462706525574e-07, "loss": 0.2642, "step": 2324 }, { "epoch": 1.2767710049423393, "grad_norm": 0.43009471837584184, "learning_rate": 6.247650443635837e-07, "loss": 0.2298, "step": 2325 }, { "epoch": 1.2773201537616694, "grad_norm": 0.5503477309160206, "learning_rate": 6.244837759878773e-07, "loss": 0.2651, "step": 2326 }, { "epoch": 1.2778693025809995, "grad_norm": 0.5037855423476184, "learning_rate": 6.242024656203556e-07, "loss": 0.2818, "step": 2327 }, { "epoch": 1.2784184514003294, "grad_norm": 0.45081964596908497, "learning_rate": 6.239211133559509e-07, "loss": 0.2422, "step": 2328 }, { "epoch": 1.2789676002196595, "grad_norm": 0.4384904442778476, "learning_rate": 6.236397192896089e-07, "loss": 0.2504, "step": 2329 }, { "epoch": 1.2795167490389896, "grad_norm": 0.6192084212713824, "learning_rate": 6.233582835162896e-07, "loss": 0.2529, "step": 2330 }, { "epoch": 1.2800658978583197, "grad_norm": 0.6199476574200089, "learning_rate": 6.230768061309679e-07, "loss": 0.2558, "step": 2331 }, { "epoch": 1.2806150466776496, "grad_norm": 0.44252914439257596, "learning_rate": 6.227952872286313e-07, "loss": 0.2827, "step": 2332 }, { "epoch": 1.2811641954969797, "grad_norm": 0.4212682730812276, "learning_rate": 6.225137269042824e-07, "loss": 0.2503, "step": 2333 }, { "epoch": 1.2817133443163098, "grad_norm": 0.5047942427404624, "learning_rate": 6.222321252529375e-07, "loss": 0.2514, "step": 2334 }, { "epoch": 1.2822624931356397, "grad_norm": 0.5303095165684414, "learning_rate": 6.219504823696264e-07, "loss": 0.2264, "step": 2335 }, { "epoch": 1.2828116419549698, "grad_norm": 0.5949753194922828, "learning_rate": 6.216687983493938e-07, "loss": 0.2713, "step": 2336 }, { "epoch": 1.2833607907742999, "grad_norm": 0.5717953850256383, "learning_rate": 6.213870732872971e-07, "loss": 0.3197, "step": 2337 }, { "epoch": 1.28390993959363, "grad_norm": 0.4909180922219964, "learning_rate": 6.211053072784085e-07, "loss": 0.27, "step": 2338 }, { "epoch": 1.2844590884129599, "grad_norm": 0.4575632010776429, "learning_rate": 6.208235004178135e-07, "loss": 0.2818, "step": 2339 }, { "epoch": 1.28500823723229, "grad_norm": 0.4638671824982353, "learning_rate": 6.205416528006116e-07, "loss": 0.2573, "step": 2340 }, { "epoch": 1.28555738605162, "grad_norm": 0.5360334357515462, "learning_rate": 6.202597645219158e-07, "loss": 0.2281, "step": 2341 }, { "epoch": 1.28610653487095, "grad_norm": 0.4988337616203063, "learning_rate": 6.199778356768533e-07, "loss": 0.2774, "step": 2342 }, { "epoch": 1.28665568369028, "grad_norm": 0.3520819615679302, "learning_rate": 6.196958663605645e-07, "loss": 0.249, "step": 2343 }, { "epoch": 1.2872048325096102, "grad_norm": 0.49948810745698624, "learning_rate": 6.194138566682038e-07, "loss": 0.2423, "step": 2344 }, { "epoch": 1.2877539813289403, "grad_norm": 0.5670759727891223, "learning_rate": 6.191318066949388e-07, "loss": 0.2937, "step": 2345 }, { "epoch": 1.2883031301482701, "grad_norm": 0.4802610824409761, "learning_rate": 6.188497165359514e-07, "loss": 0.2392, "step": 2346 }, { "epoch": 1.2888522789676002, "grad_norm": 0.46163550280303034, "learning_rate": 6.185675862864361e-07, "loss": 0.2625, "step": 2347 }, { "epoch": 1.2894014277869301, "grad_norm": 0.4941057724688069, "learning_rate": 6.18285416041602e-07, "loss": 0.2779, "step": 2348 }, { "epoch": 1.2899505766062602, "grad_norm": 0.38576050511059606, "learning_rate": 6.180032058966708e-07, "loss": 0.276, "step": 2349 }, { "epoch": 1.2904997254255903, "grad_norm": 0.5161901157687193, "learning_rate": 6.177209559468783e-07, "loss": 0.2367, "step": 2350 }, { "epoch": 1.2910488742449204, "grad_norm": 0.5185303933376392, "learning_rate": 6.174386662874731e-07, "loss": 0.2016, "step": 2351 }, { "epoch": 1.2915980230642505, "grad_norm": 0.3869660677666794, "learning_rate": 6.171563370137177e-07, "loss": 0.2188, "step": 2352 }, { "epoch": 1.2921471718835804, "grad_norm": 0.5150224562754442, "learning_rate": 6.168739682208883e-07, "loss": 0.2823, "step": 2353 }, { "epoch": 1.2926963207029105, "grad_norm": 0.48286002695917574, "learning_rate": 6.165915600042732e-07, "loss": 0.2515, "step": 2354 }, { "epoch": 1.2932454695222404, "grad_norm": 0.608242619759271, "learning_rate": 6.163091124591754e-07, "loss": 0.3008, "step": 2355 }, { "epoch": 1.2937946183415705, "grad_norm": 0.48636372186267857, "learning_rate": 6.160266256809101e-07, "loss": 0.2594, "step": 2356 }, { "epoch": 1.2943437671609006, "grad_norm": 0.44609050407570444, "learning_rate": 6.157440997648066e-07, "loss": 0.2729, "step": 2357 }, { "epoch": 1.2948929159802307, "grad_norm": 0.529375457396473, "learning_rate": 6.154615348062066e-07, "loss": 0.2274, "step": 2358 }, { "epoch": 1.2954420647995608, "grad_norm": 0.4845060690017235, "learning_rate": 6.151789309004653e-07, "loss": 0.2618, "step": 2359 }, { "epoch": 1.2959912136188907, "grad_norm": 0.5704097552229095, "learning_rate": 6.148962881429515e-07, "loss": 0.2568, "step": 2360 }, { "epoch": 1.2965403624382208, "grad_norm": 0.7886896850220623, "learning_rate": 6.146136066290466e-07, "loss": 0.2523, "step": 2361 }, { "epoch": 1.2970895112575507, "grad_norm": 0.5737922948999047, "learning_rate": 6.14330886454145e-07, "loss": 0.283, "step": 2362 }, { "epoch": 1.2976386600768808, "grad_norm": 0.5328075772317451, "learning_rate": 6.140481277136545e-07, "loss": 0.2308, "step": 2363 }, { "epoch": 1.2981878088962109, "grad_norm": 0.43702475603721397, "learning_rate": 6.137653305029959e-07, "loss": 0.2577, "step": 2364 }, { "epoch": 1.298736957715541, "grad_norm": 0.4837406904190512, "learning_rate": 6.134824949176025e-07, "loss": 0.2772, "step": 2365 }, { "epoch": 1.299286106534871, "grad_norm": 0.5833756849252187, "learning_rate": 6.131996210529211e-07, "loss": 0.2759, "step": 2366 }, { "epoch": 1.299835255354201, "grad_norm": 0.61194212598957, "learning_rate": 6.129167090044112e-07, "loss": 0.2722, "step": 2367 }, { "epoch": 1.300384404173531, "grad_norm": 0.49804604937916336, "learning_rate": 6.12633758867545e-07, "loss": 0.2483, "step": 2368 }, { "epoch": 1.300933552992861, "grad_norm": 0.5437653024539507, "learning_rate": 6.123507707378082e-07, "loss": 0.3221, "step": 2369 }, { "epoch": 1.301482701812191, "grad_norm": 0.7506257884529773, "learning_rate": 6.120677447106985e-07, "loss": 0.2646, "step": 2370 }, { "epoch": 1.3020318506315212, "grad_norm": 0.4401457582562464, "learning_rate": 6.117846808817265e-07, "loss": 0.2229, "step": 2371 }, { "epoch": 1.3025809994508513, "grad_norm": 0.3913317399340691, "learning_rate": 6.115015793464166e-07, "loss": 0.2438, "step": 2372 }, { "epoch": 1.3031301482701811, "grad_norm": 0.5488944304946382, "learning_rate": 6.112184402003043e-07, "loss": 0.2681, "step": 2373 }, { "epoch": 1.3036792970895112, "grad_norm": 0.5214923752537053, "learning_rate": 6.109352635389393e-07, "loss": 0.2476, "step": 2374 }, { "epoch": 1.3042284459088413, "grad_norm": 0.5386440607410913, "learning_rate": 6.10652049457883e-07, "loss": 0.254, "step": 2375 }, { "epoch": 1.3047775947281712, "grad_norm": 0.459380186973586, "learning_rate": 6.103687980527096e-07, "loss": 0.2572, "step": 2376 }, { "epoch": 1.3053267435475013, "grad_norm": 0.44055375050561413, "learning_rate": 6.100855094190063e-07, "loss": 0.2573, "step": 2377 }, { "epoch": 1.3058758923668314, "grad_norm": 0.556157564051401, "learning_rate": 6.09802183652372e-07, "loss": 0.2286, "step": 2378 }, { "epoch": 1.3064250411861615, "grad_norm": 0.4735665444049734, "learning_rate": 6.095188208484196e-07, "loss": 0.2626, "step": 2379 }, { "epoch": 1.3069741900054914, "grad_norm": 0.5276252703781577, "learning_rate": 6.092354211027728e-07, "loss": 0.2729, "step": 2380 }, { "epoch": 1.3075233388248215, "grad_norm": 0.41114834934992206, "learning_rate": 6.089519845110691e-07, "loss": 0.2392, "step": 2381 }, { "epoch": 1.3080724876441516, "grad_norm": 0.47880102615558046, "learning_rate": 6.086685111689578e-07, "loss": 0.2416, "step": 2382 }, { "epoch": 1.3086216364634815, "grad_norm": 0.529909762649163, "learning_rate": 6.083850011721005e-07, "loss": 0.2222, "step": 2383 }, { "epoch": 1.3091707852828116, "grad_norm": 0.5119668027464246, "learning_rate": 6.081014546161715e-07, "loss": 0.2052, "step": 2384 }, { "epoch": 1.3097199341021417, "grad_norm": 0.4980503614782221, "learning_rate": 6.078178715968572e-07, "loss": 0.2347, "step": 2385 }, { "epoch": 1.3102690829214718, "grad_norm": 0.4396007619534081, "learning_rate": 6.075342522098568e-07, "loss": 0.2392, "step": 2386 }, { "epoch": 1.3108182317408017, "grad_norm": 0.399550761859906, "learning_rate": 6.072505965508809e-07, "loss": 0.2772, "step": 2387 }, { "epoch": 1.3113673805601318, "grad_norm": 0.5174675878715941, "learning_rate": 6.069669047156531e-07, "loss": 0.2655, "step": 2388 }, { "epoch": 1.311916529379462, "grad_norm": 0.4691218248171227, "learning_rate": 6.06683176799909e-07, "loss": 0.2527, "step": 2389 }, { "epoch": 1.3124656781987918, "grad_norm": 0.3996865937564217, "learning_rate": 6.063994128993962e-07, "loss": 0.2358, "step": 2390 }, { "epoch": 1.3130148270181219, "grad_norm": 0.5172584538084732, "learning_rate": 6.061156131098747e-07, "loss": 0.2295, "step": 2391 }, { "epoch": 1.313563975837452, "grad_norm": 0.45495144427974676, "learning_rate": 6.058317775271161e-07, "loss": 0.2852, "step": 2392 }, { "epoch": 1.314113124656782, "grad_norm": 0.46218713079339824, "learning_rate": 6.055479062469049e-07, "loss": 0.2768, "step": 2393 }, { "epoch": 1.314662273476112, "grad_norm": 0.471574396130733, "learning_rate": 6.052639993650371e-07, "loss": 0.235, "step": 2394 }, { "epoch": 1.315211422295442, "grad_norm": 0.426589813019802, "learning_rate": 6.049800569773205e-07, "loss": 0.2944, "step": 2395 }, { "epoch": 1.3157605711147722, "grad_norm": 0.5787804052387165, "learning_rate": 6.046960791795755e-07, "loss": 0.2671, "step": 2396 }, { "epoch": 1.316309719934102, "grad_norm": 0.696148070546797, "learning_rate": 6.044120660676341e-07, "loss": 0.2613, "step": 2397 }, { "epoch": 1.3168588687534322, "grad_norm": 0.477230687781496, "learning_rate": 6.041280177373403e-07, "loss": 0.2609, "step": 2398 }, { "epoch": 1.3174080175727623, "grad_norm": 0.6922259431504914, "learning_rate": 6.038439342845498e-07, "loss": 0.2646, "step": 2399 }, { "epoch": 1.3179571663920924, "grad_norm": 0.43326734156572505, "learning_rate": 6.035598158051304e-07, "loss": 0.25, "step": 2400 }, { "epoch": 1.3179571663920924, "eval_loss": 0.3342709541320801, "eval_runtime": 18.6615, "eval_samples_per_second": 23.739, "eval_steps_per_second": 1.018, "step": 2400 }, { "epoch": 1.3185063152114223, "grad_norm": 0.5582642862296483, "learning_rate": 6.032756623949617e-07, "loss": 0.2237, "step": 2401 }, { "epoch": 1.3190554640307524, "grad_norm": 0.6070163777152277, "learning_rate": 6.029914741499348e-07, "loss": 0.2473, "step": 2402 }, { "epoch": 1.3196046128500822, "grad_norm": 0.46983240480105454, "learning_rate": 6.027072511659529e-07, "loss": 0.2637, "step": 2403 }, { "epoch": 1.3201537616694123, "grad_norm": 0.5640555729076856, "learning_rate": 6.024229935389307e-07, "loss": 0.2846, "step": 2404 }, { "epoch": 1.3207029104887424, "grad_norm": 0.4701649920665535, "learning_rate": 6.021387013647949e-07, "loss": 0.2348, "step": 2405 }, { "epoch": 1.3212520593080725, "grad_norm": 0.413143040151184, "learning_rate": 6.018543747394832e-07, "loss": 0.2576, "step": 2406 }, { "epoch": 1.3218012081274026, "grad_norm": 0.5772483056703775, "learning_rate": 6.015700137589456e-07, "loss": 0.258, "step": 2407 }, { "epoch": 1.3223503569467325, "grad_norm": 0.4289702048272331, "learning_rate": 6.012856185191437e-07, "loss": 0.2904, "step": 2408 }, { "epoch": 1.3228995057660626, "grad_norm": 0.4529811899966002, "learning_rate": 6.010011891160501e-07, "loss": 0.2551, "step": 2409 }, { "epoch": 1.3234486545853925, "grad_norm": 0.4727635072521221, "learning_rate": 6.007167256456494e-07, "loss": 0.2906, "step": 2410 }, { "epoch": 1.3239978034047226, "grad_norm": 0.48081553901207286, "learning_rate": 6.004322282039376e-07, "loss": 0.2549, "step": 2411 }, { "epoch": 1.3245469522240527, "grad_norm": 0.4640237959522978, "learning_rate": 6.001476968869217e-07, "loss": 0.2852, "step": 2412 }, { "epoch": 1.3250961010433828, "grad_norm": 0.5688770709079247, "learning_rate": 5.998631317906211e-07, "loss": 0.282, "step": 2413 }, { "epoch": 1.325645249862713, "grad_norm": 0.4770977407895928, "learning_rate": 5.995785330110655e-07, "loss": 0.2513, "step": 2414 }, { "epoch": 1.3261943986820428, "grad_norm": 0.5102072642963997, "learning_rate": 5.99293900644297e-07, "loss": 0.258, "step": 2415 }, { "epoch": 1.326743547501373, "grad_norm": 0.47970741903012687, "learning_rate": 5.990092347863681e-07, "loss": 0.2415, "step": 2416 }, { "epoch": 1.3272926963207028, "grad_norm": 0.4310159790090193, "learning_rate": 5.987245355333433e-07, "loss": 0.2532, "step": 2417 }, { "epoch": 1.327841845140033, "grad_norm": 0.4263900358912085, "learning_rate": 5.984398029812982e-07, "loss": 0.2312, "step": 2418 }, { "epoch": 1.328390993959363, "grad_norm": 0.5669634055433357, "learning_rate": 5.981550372263194e-07, "loss": 0.2703, "step": 2419 }, { "epoch": 1.328940142778693, "grad_norm": 0.3807539676192585, "learning_rate": 5.978702383645047e-07, "loss": 0.2191, "step": 2420 }, { "epoch": 1.329489291598023, "grad_norm": 0.47136000678226764, "learning_rate": 5.975854064919633e-07, "loss": 0.2571, "step": 2421 }, { "epoch": 1.330038440417353, "grad_norm": 0.4041068511004413, "learning_rate": 5.973005417048157e-07, "loss": 0.2603, "step": 2422 }, { "epoch": 1.3305875892366832, "grad_norm": 0.551995659922425, "learning_rate": 5.97015644099193e-07, "loss": 0.2459, "step": 2423 }, { "epoch": 1.331136738056013, "grad_norm": 0.48285534348477194, "learning_rate": 5.967307137712379e-07, "loss": 0.2208, "step": 2424 }, { "epoch": 1.3316858868753432, "grad_norm": 0.46239674419313187, "learning_rate": 5.964457508171035e-07, "loss": 0.2396, "step": 2425 }, { "epoch": 1.3322350356946733, "grad_norm": 0.4840796632741789, "learning_rate": 5.961607553329546e-07, "loss": 0.2819, "step": 2426 }, { "epoch": 1.3327841845140034, "grad_norm": 0.42200141093854815, "learning_rate": 5.95875727414967e-07, "loss": 0.2655, "step": 2427 }, { "epoch": 1.3333333333333333, "grad_norm": 0.47428505447130925, "learning_rate": 5.955906671593264e-07, "loss": 0.2841, "step": 2428 }, { "epoch": 1.3338824821526634, "grad_norm": 0.4781691837979046, "learning_rate": 5.953055746622304e-07, "loss": 0.2322, "step": 2429 }, { "epoch": 1.3344316309719935, "grad_norm": 0.5036167432226426, "learning_rate": 5.950204500198875e-07, "loss": 0.307, "step": 2430 }, { "epoch": 1.3349807797913233, "grad_norm": 0.42585647119331005, "learning_rate": 5.947352933285163e-07, "loss": 0.2745, "step": 2431 }, { "epoch": 1.3355299286106534, "grad_norm": 0.4329544521371594, "learning_rate": 5.944501046843472e-07, "loss": 0.2707, "step": 2432 }, { "epoch": 1.3360790774299836, "grad_norm": 0.6444549695104927, "learning_rate": 5.941648841836203e-07, "loss": 0.3025, "step": 2433 }, { "epoch": 1.3366282262493137, "grad_norm": 0.40606083158676226, "learning_rate": 5.938796319225875e-07, "loss": 0.2349, "step": 2434 }, { "epoch": 1.3371773750686435, "grad_norm": 0.47466180277619985, "learning_rate": 5.935943479975108e-07, "loss": 0.2485, "step": 2435 }, { "epoch": 1.3377265238879736, "grad_norm": 0.39685006021856656, "learning_rate": 5.933090325046628e-07, "loss": 0.241, "step": 2436 }, { "epoch": 1.3382756727073037, "grad_norm": 0.492738848161958, "learning_rate": 5.93023685540327e-07, "loss": 0.2567, "step": 2437 }, { "epoch": 1.3388248215266336, "grad_norm": 0.49493265820804205, "learning_rate": 5.927383072007977e-07, "loss": 0.2569, "step": 2438 }, { "epoch": 1.3393739703459637, "grad_norm": 0.542842745466424, "learning_rate": 5.924528975823797e-07, "loss": 0.2795, "step": 2439 }, { "epoch": 1.3399231191652938, "grad_norm": 0.4239838438167338, "learning_rate": 5.921674567813877e-07, "loss": 0.2557, "step": 2440 }, { "epoch": 1.340472267984624, "grad_norm": 0.4966233028405723, "learning_rate": 5.91881984894148e-07, "loss": 0.2496, "step": 2441 }, { "epoch": 1.3410214168039538, "grad_norm": 0.5519329968073597, "learning_rate": 5.915964820169965e-07, "loss": 0.2565, "step": 2442 }, { "epoch": 1.341570565623284, "grad_norm": 0.7372612067644724, "learning_rate": 5.913109482462799e-07, "loss": 0.2613, "step": 2443 }, { "epoch": 1.342119714442614, "grad_norm": 0.4796087341765942, "learning_rate": 5.910253836783555e-07, "loss": 0.2801, "step": 2444 }, { "epoch": 1.342668863261944, "grad_norm": 0.4536777578695405, "learning_rate": 5.907397884095909e-07, "loss": 0.268, "step": 2445 }, { "epoch": 1.343218012081274, "grad_norm": 0.5109361683793235, "learning_rate": 5.904541625363636e-07, "loss": 0.264, "step": 2446 }, { "epoch": 1.343767160900604, "grad_norm": 0.41996241567495374, "learning_rate": 5.901685061550622e-07, "loss": 0.2869, "step": 2447 }, { "epoch": 1.3443163097199342, "grad_norm": 0.5003238455776378, "learning_rate": 5.89882819362085e-07, "loss": 0.3071, "step": 2448 }, { "epoch": 1.344865458539264, "grad_norm": 0.4697344593713444, "learning_rate": 5.895971022538409e-07, "loss": 0.2568, "step": 2449 }, { "epoch": 1.3454146073585942, "grad_norm": 0.40731067950571975, "learning_rate": 5.893113549267485e-07, "loss": 0.2122, "step": 2450 }, { "epoch": 1.3459637561779243, "grad_norm": 0.5396908951315318, "learning_rate": 5.890255774772377e-07, "loss": 0.2813, "step": 2451 }, { "epoch": 1.3465129049972542, "grad_norm": 0.5313718306793825, "learning_rate": 5.887397700017474e-07, "loss": 0.3161, "step": 2452 }, { "epoch": 1.3470620538165843, "grad_norm": 0.47293868100635245, "learning_rate": 5.88453932596727e-07, "loss": 0.2389, "step": 2453 }, { "epoch": 1.3476112026359144, "grad_norm": 0.5193843611130018, "learning_rate": 5.881680653586365e-07, "loss": 0.2494, "step": 2454 }, { "epoch": 1.3481603514552445, "grad_norm": 0.5043662695880984, "learning_rate": 5.87882168383945e-07, "loss": 0.2277, "step": 2455 }, { "epoch": 1.3487095002745744, "grad_norm": 0.44989104679921166, "learning_rate": 5.875962417691327e-07, "loss": 0.2243, "step": 2456 }, { "epoch": 1.3492586490939045, "grad_norm": 0.6009330874211384, "learning_rate": 5.873102856106892e-07, "loss": 0.2492, "step": 2457 }, { "epoch": 1.3498077979132344, "grad_norm": 0.5758412385916694, "learning_rate": 5.87024300005114e-07, "loss": 0.2785, "step": 2458 }, { "epoch": 1.3503569467325645, "grad_norm": 0.5857406608086171, "learning_rate": 5.867382850489168e-07, "loss": 0.3015, "step": 2459 }, { "epoch": 1.3509060955518946, "grad_norm": 0.599070986036983, "learning_rate": 5.864522408386171e-07, "loss": 0.2387, "step": 2460 }, { "epoch": 1.3514552443712247, "grad_norm": 0.5288798150966122, "learning_rate": 5.861661674707444e-07, "loss": 0.2274, "step": 2461 }, { "epoch": 1.3520043931905548, "grad_norm": 0.4573537222909525, "learning_rate": 5.858800650418375e-07, "loss": 0.2865, "step": 2462 }, { "epoch": 1.3525535420098846, "grad_norm": 0.5769587325016156, "learning_rate": 5.85593933648446e-07, "loss": 0.3223, "step": 2463 }, { "epoch": 1.3531026908292147, "grad_norm": 0.5953040386204894, "learning_rate": 5.853077733871283e-07, "loss": 0.2422, "step": 2464 }, { "epoch": 1.3536518396485446, "grad_norm": 0.5418375818296769, "learning_rate": 5.850215843544533e-07, "loss": 0.2603, "step": 2465 }, { "epoch": 1.3542009884678747, "grad_norm": 0.5155977732195074, "learning_rate": 5.847353666469988e-07, "loss": 0.285, "step": 2466 }, { "epoch": 1.3547501372872048, "grad_norm": 0.4692721414597744, "learning_rate": 5.844491203613531e-07, "loss": 0.2382, "step": 2467 }, { "epoch": 1.355299286106535, "grad_norm": 0.4578483513598431, "learning_rate": 5.841628455941135e-07, "loss": 0.2494, "step": 2468 }, { "epoch": 1.355848434925865, "grad_norm": 0.45991865497465073, "learning_rate": 5.838765424418875e-07, "loss": 0.2398, "step": 2469 }, { "epoch": 1.356397583745195, "grad_norm": 0.4556434117858961, "learning_rate": 5.835902110012916e-07, "loss": 0.2242, "step": 2470 }, { "epoch": 1.356946732564525, "grad_norm": 0.45410712187680624, "learning_rate": 5.833038513689523e-07, "loss": 0.2664, "step": 2471 }, { "epoch": 1.357495881383855, "grad_norm": 0.5445284448606036, "learning_rate": 5.830174636415052e-07, "loss": 0.2714, "step": 2472 }, { "epoch": 1.358045030203185, "grad_norm": 0.4794969236306016, "learning_rate": 5.827310479155959e-07, "loss": 0.2502, "step": 2473 }, { "epoch": 1.3585941790225151, "grad_norm": 0.5442755281048148, "learning_rate": 5.82444604287879e-07, "loss": 0.2825, "step": 2474 }, { "epoch": 1.3591433278418452, "grad_norm": 0.5662614867465593, "learning_rate": 5.821581328550184e-07, "loss": 0.2532, "step": 2475 }, { "epoch": 1.359692476661175, "grad_norm": 0.5073370931357127, "learning_rate": 5.818716337136884e-07, "loss": 0.2464, "step": 2476 }, { "epoch": 1.3602416254805052, "grad_norm": 0.5424531251966654, "learning_rate": 5.815851069605711e-07, "loss": 0.2616, "step": 2477 }, { "epoch": 1.3607907742998353, "grad_norm": 0.52104895308535, "learning_rate": 5.812985526923591e-07, "loss": 0.3444, "step": 2478 }, { "epoch": 1.3613399231191652, "grad_norm": 0.4914302070386866, "learning_rate": 5.810119710057538e-07, "loss": 0.2669, "step": 2479 }, { "epoch": 1.3618890719384953, "grad_norm": 0.49525888737768364, "learning_rate": 5.807253619974662e-07, "loss": 0.2473, "step": 2480 }, { "epoch": 1.3624382207578254, "grad_norm": 0.45723054270017466, "learning_rate": 5.804387257642161e-07, "loss": 0.2381, "step": 2481 }, { "epoch": 1.3629873695771555, "grad_norm": 0.6350684233287965, "learning_rate": 5.801520624027325e-07, "loss": 0.2564, "step": 2482 }, { "epoch": 1.3635365183964854, "grad_norm": 0.6547472277532573, "learning_rate": 5.79865372009754e-07, "loss": 0.3224, "step": 2483 }, { "epoch": 1.3640856672158155, "grad_norm": 0.6118024936507656, "learning_rate": 5.795786546820281e-07, "loss": 0.3172, "step": 2484 }, { "epoch": 1.3646348160351456, "grad_norm": 0.562626551495939, "learning_rate": 5.79291910516311e-07, "loss": 0.2578, "step": 2485 }, { "epoch": 1.3651839648544755, "grad_norm": 0.6253248867173034, "learning_rate": 5.790051396093685e-07, "loss": 0.2506, "step": 2486 }, { "epoch": 1.3657331136738056, "grad_norm": 0.5364924692535818, "learning_rate": 5.787183420579751e-07, "loss": 0.2306, "step": 2487 }, { "epoch": 1.3662822624931357, "grad_norm": 0.4977927712948899, "learning_rate": 5.784315179589147e-07, "loss": 0.2373, "step": 2488 }, { "epoch": 1.3668314113124658, "grad_norm": 0.480445926524663, "learning_rate": 5.781446674089795e-07, "loss": 0.2312, "step": 2489 }, { "epoch": 1.3673805601317957, "grad_norm": 0.47823587381587335, "learning_rate": 5.778577905049712e-07, "loss": 0.2514, "step": 2490 }, { "epoch": 1.3679297089511258, "grad_norm": 0.5089134721308354, "learning_rate": 5.775708873437002e-07, "loss": 0.2489, "step": 2491 }, { "epoch": 1.3684788577704559, "grad_norm": 0.543664384488189, "learning_rate": 5.772839580219855e-07, "loss": 0.2633, "step": 2492 }, { "epoch": 1.3690280065897857, "grad_norm": 0.5718583790966169, "learning_rate": 5.769970026366558e-07, "loss": 0.239, "step": 2493 }, { "epoch": 1.3695771554091158, "grad_norm": 0.636404273665429, "learning_rate": 5.767100212845469e-07, "loss": 0.2265, "step": 2494 }, { "epoch": 1.370126304228446, "grad_norm": 0.43856841466032964, "learning_rate": 5.764230140625055e-07, "loss": 0.2565, "step": 2495 }, { "epoch": 1.370675453047776, "grad_norm": 0.5263825798160457, "learning_rate": 5.761359810673854e-07, "loss": 0.2261, "step": 2496 }, { "epoch": 1.371224601867106, "grad_norm": 0.508098245950131, "learning_rate": 5.758489223960499e-07, "loss": 0.2793, "step": 2497 }, { "epoch": 1.371773750686436, "grad_norm": 0.45843999919833206, "learning_rate": 5.755618381453705e-07, "loss": 0.2365, "step": 2498 }, { "epoch": 1.3723228995057661, "grad_norm": 0.42731087370538556, "learning_rate": 5.752747284122278e-07, "loss": 0.2247, "step": 2499 }, { "epoch": 1.372872048325096, "grad_norm": 0.4160735941327004, "learning_rate": 5.749875932935106e-07, "loss": 0.2756, "step": 2500 }, { "epoch": 1.3734211971444261, "grad_norm": 0.4423198050966064, "learning_rate": 5.747004328861164e-07, "loss": 0.2425, "step": 2501 }, { "epoch": 1.3739703459637562, "grad_norm": 0.5042669213959131, "learning_rate": 5.744132472869513e-07, "loss": 0.2408, "step": 2502 }, { "epoch": 1.3745194947830863, "grad_norm": 0.5972578833811647, "learning_rate": 5.741260365929299e-07, "loss": 0.2335, "step": 2503 }, { "epoch": 1.3750686436024162, "grad_norm": 0.43908733900145513, "learning_rate": 5.738388009009752e-07, "loss": 0.2528, "step": 2504 }, { "epoch": 1.3756177924217463, "grad_norm": 0.4893645423258682, "learning_rate": 5.735515403080186e-07, "loss": 0.2635, "step": 2505 }, { "epoch": 1.3761669412410764, "grad_norm": 0.4935391899408243, "learning_rate": 5.732642549110001e-07, "loss": 0.2575, "step": 2506 }, { "epoch": 1.3767160900604063, "grad_norm": 0.5785809231834036, "learning_rate": 5.72976944806868e-07, "loss": 0.2845, "step": 2507 }, { "epoch": 1.3772652388797364, "grad_norm": 0.6537242709620882, "learning_rate": 5.726896100925786e-07, "loss": 0.3096, "step": 2508 }, { "epoch": 1.3778143876990665, "grad_norm": 0.4137915299865373, "learning_rate": 5.72402250865097e-07, "loss": 0.2323, "step": 2509 }, { "epoch": 1.3783635365183966, "grad_norm": 0.5136053530625145, "learning_rate": 5.721148672213963e-07, "loss": 0.2168, "step": 2510 }, { "epoch": 1.3789126853377265, "grad_norm": 0.4159143943694344, "learning_rate": 5.718274592584578e-07, "loss": 0.2794, "step": 2511 }, { "epoch": 1.3794618341570566, "grad_norm": 0.5419850159318301, "learning_rate": 5.715400270732712e-07, "loss": 0.2312, "step": 2512 }, { "epoch": 1.3800109829763865, "grad_norm": 0.4186260239621793, "learning_rate": 5.712525707628341e-07, "loss": 0.2495, "step": 2513 }, { "epoch": 1.3805601317957166, "grad_norm": 0.47562755374013893, "learning_rate": 5.709650904241527e-07, "loss": 0.263, "step": 2514 }, { "epoch": 1.3811092806150467, "grad_norm": 0.5295503656180506, "learning_rate": 5.70677586154241e-07, "loss": 0.2605, "step": 2515 }, { "epoch": 1.3816584294343768, "grad_norm": 0.46895869887789937, "learning_rate": 5.703900580501208e-07, "loss": 0.2222, "step": 2516 }, { "epoch": 1.3822075782537069, "grad_norm": 0.4913421000376898, "learning_rate": 5.701025062088224e-07, "loss": 0.2349, "step": 2517 }, { "epoch": 1.3827567270730368, "grad_norm": 0.803768407645122, "learning_rate": 5.698149307273842e-07, "loss": 0.2827, "step": 2518 }, { "epoch": 1.3833058758923669, "grad_norm": 0.5402814334114402, "learning_rate": 5.695273317028519e-07, "loss": 0.2939, "step": 2519 }, { "epoch": 1.3838550247116967, "grad_norm": 0.5907488125202154, "learning_rate": 5.692397092322799e-07, "loss": 0.2706, "step": 2520 }, { "epoch": 1.3844041735310268, "grad_norm": 0.4638994814653254, "learning_rate": 5.6895206341273e-07, "loss": 0.2343, "step": 2521 }, { "epoch": 1.384953322350357, "grad_norm": 0.5495487458929103, "learning_rate": 5.686643943412721e-07, "loss": 0.2576, "step": 2522 }, { "epoch": 1.385502471169687, "grad_norm": 0.576275510503027, "learning_rate": 5.68376702114984e-07, "loss": 0.2188, "step": 2523 }, { "epoch": 1.3860516199890172, "grad_norm": 0.4657395568890571, "learning_rate": 5.680889868309512e-07, "loss": 0.2529, "step": 2524 }, { "epoch": 1.386600768808347, "grad_norm": 0.5293121771997737, "learning_rate": 5.67801248586267e-07, "loss": 0.2514, "step": 2525 }, { "epoch": 1.3871499176276771, "grad_norm": 0.7043331956655084, "learning_rate": 5.675134874780324e-07, "loss": 0.3011, "step": 2526 }, { "epoch": 1.387699066447007, "grad_norm": 0.5158144099322439, "learning_rate": 5.672257036033563e-07, "loss": 0.2636, "step": 2527 }, { "epoch": 1.3882482152663371, "grad_norm": 0.4884975221293426, "learning_rate": 5.66937897059355e-07, "loss": 0.2743, "step": 2528 }, { "epoch": 1.3887973640856672, "grad_norm": 0.45123586354908496, "learning_rate": 5.666500679431527e-07, "loss": 0.2593, "step": 2529 }, { "epoch": 1.3893465129049973, "grad_norm": 0.4325611101252934, "learning_rate": 5.663622163518809e-07, "loss": 0.2467, "step": 2530 }, { "epoch": 1.3898956617243272, "grad_norm": 0.43465666319707863, "learning_rate": 5.660743423826794e-07, "loss": 0.2574, "step": 2531 }, { "epoch": 1.3904448105436573, "grad_norm": 0.5231091803343897, "learning_rate": 5.657864461326948e-07, "loss": 0.2795, "step": 2532 }, { "epoch": 1.3909939593629874, "grad_norm": 0.5357165792952369, "learning_rate": 5.654985276990812e-07, "loss": 0.2226, "step": 2533 }, { "epoch": 1.3915431081823173, "grad_norm": 0.509134167848142, "learning_rate": 5.652105871790007e-07, "loss": 0.2304, "step": 2534 }, { "epoch": 1.3920922570016474, "grad_norm": 0.6021000356961625, "learning_rate": 5.649226246696227e-07, "loss": 0.2812, "step": 2535 }, { "epoch": 1.3926414058209775, "grad_norm": 0.5313372727157338, "learning_rate": 5.646346402681239e-07, "loss": 0.2286, "step": 2536 }, { "epoch": 1.3931905546403076, "grad_norm": 0.5913908856058032, "learning_rate": 5.643466340716884e-07, "loss": 0.2776, "step": 2537 }, { "epoch": 1.3937397034596375, "grad_norm": 0.5329024596754857, "learning_rate": 5.640586061775076e-07, "loss": 0.2174, "step": 2538 }, { "epoch": 1.3942888522789676, "grad_norm": 0.4571165121305411, "learning_rate": 5.637705566827805e-07, "loss": 0.283, "step": 2539 }, { "epoch": 1.3948380010982977, "grad_norm": 0.44971009680316143, "learning_rate": 5.63482485684713e-07, "loss": 0.2193, "step": 2540 }, { "epoch": 1.3953871499176276, "grad_norm": 0.5243972938723075, "learning_rate": 5.631943932805184e-07, "loss": 0.2332, "step": 2541 }, { "epoch": 1.3959362987369577, "grad_norm": 0.6058049010178508, "learning_rate": 5.629062795674176e-07, "loss": 0.2506, "step": 2542 }, { "epoch": 1.3964854475562878, "grad_norm": 0.5101434796348728, "learning_rate": 5.626181446426381e-07, "loss": 0.2515, "step": 2543 }, { "epoch": 1.3970345963756179, "grad_norm": 0.4129566191730494, "learning_rate": 5.623299886034148e-07, "loss": 0.248, "step": 2544 }, { "epoch": 1.3975837451949478, "grad_norm": 0.4633916935043192, "learning_rate": 5.620418115469897e-07, "loss": 0.223, "step": 2545 }, { "epoch": 1.3981328940142779, "grad_norm": 0.437336720089803, "learning_rate": 5.617536135706123e-07, "loss": 0.2577, "step": 2546 }, { "epoch": 1.398682042833608, "grad_norm": 0.5145684666117926, "learning_rate": 5.614653947715384e-07, "loss": 0.2439, "step": 2547 }, { "epoch": 1.3992311916529379, "grad_norm": 0.42040693609440827, "learning_rate": 5.611771552470314e-07, "loss": 0.2589, "step": 2548 }, { "epoch": 1.399780340472268, "grad_norm": 0.46380505194661154, "learning_rate": 5.608888950943615e-07, "loss": 0.2593, "step": 2549 }, { "epoch": 1.400329489291598, "grad_norm": 0.475476396017217, "learning_rate": 5.60600614410806e-07, "loss": 0.2959, "step": 2550 }, { "epoch": 1.4008786381109282, "grad_norm": 0.4021118305109138, "learning_rate": 5.603123132936488e-07, "loss": 0.2564, "step": 2551 }, { "epoch": 1.401427786930258, "grad_norm": 0.7449958522114163, "learning_rate": 5.600239918401809e-07, "loss": 0.3015, "step": 2552 }, { "epoch": 1.4019769357495881, "grad_norm": 0.4923443819161266, "learning_rate": 5.597356501477004e-07, "loss": 0.2796, "step": 2553 }, { "epoch": 1.4025260845689183, "grad_norm": 0.5396102012256275, "learning_rate": 5.59447288313512e-07, "loss": 0.2577, "step": 2554 }, { "epoch": 1.4030752333882481, "grad_norm": 0.46616252587884144, "learning_rate": 5.59158906434927e-07, "loss": 0.2478, "step": 2555 }, { "epoch": 1.4036243822075782, "grad_norm": 0.4940803585234028, "learning_rate": 5.588705046092635e-07, "loss": 0.275, "step": 2556 }, { "epoch": 1.4041735310269083, "grad_norm": 0.49968851038405493, "learning_rate": 5.585820829338468e-07, "loss": 0.2337, "step": 2557 }, { "epoch": 1.4047226798462384, "grad_norm": 0.5900768026188389, "learning_rate": 5.582936415060086e-07, "loss": 0.2702, "step": 2558 }, { "epoch": 1.4052718286655683, "grad_norm": 0.4433791718295024, "learning_rate": 5.580051804230872e-07, "loss": 0.2835, "step": 2559 }, { "epoch": 1.4058209774848984, "grad_norm": 0.5546085479721924, "learning_rate": 5.577166997824275e-07, "loss": 0.2538, "step": 2560 }, { "epoch": 1.4063701263042283, "grad_norm": 0.49338905195185423, "learning_rate": 5.574281996813811e-07, "loss": 0.26, "step": 2561 }, { "epoch": 1.4069192751235584, "grad_norm": 0.43563317633591875, "learning_rate": 5.571396802173062e-07, "loss": 0.2182, "step": 2562 }, { "epoch": 1.4074684239428885, "grad_norm": 0.5110553028461078, "learning_rate": 5.568511414875675e-07, "loss": 0.2453, "step": 2563 }, { "epoch": 1.4080175727622186, "grad_norm": 0.47177296049403633, "learning_rate": 5.565625835895361e-07, "loss": 0.2822, "step": 2564 }, { "epoch": 1.4085667215815487, "grad_norm": 0.5496409664971602, "learning_rate": 5.562740066205898e-07, "loss": 0.2947, "step": 2565 }, { "epoch": 1.4091158704008786, "grad_norm": 0.39637733569512956, "learning_rate": 5.559854106781127e-07, "loss": 0.2709, "step": 2566 }, { "epoch": 1.4096650192202087, "grad_norm": 0.6151857303727866, "learning_rate": 5.556967958594953e-07, "loss": 0.2568, "step": 2567 }, { "epoch": 1.4102141680395386, "grad_norm": 0.5803853484798582, "learning_rate": 5.55408162262134e-07, "loss": 0.2518, "step": 2568 }, { "epoch": 1.4107633168588687, "grad_norm": 0.4794455894749099, "learning_rate": 5.551195099834326e-07, "loss": 0.2411, "step": 2569 }, { "epoch": 1.4113124656781988, "grad_norm": 0.7649530136773677, "learning_rate": 5.548308391208007e-07, "loss": 0.3087, "step": 2570 }, { "epoch": 1.411861614497529, "grad_norm": 0.4666597748023007, "learning_rate": 5.545421497716533e-07, "loss": 0.2248, "step": 2571 }, { "epoch": 1.412410763316859, "grad_norm": 0.5121423297813796, "learning_rate": 5.542534420334132e-07, "loss": 0.2334, "step": 2572 }, { "epoch": 1.4129599121361889, "grad_norm": 0.43375497668417196, "learning_rate": 5.539647160035084e-07, "loss": 0.2429, "step": 2573 }, { "epoch": 1.413509060955519, "grad_norm": 0.5666466346825227, "learning_rate": 5.536759717793731e-07, "loss": 0.2768, "step": 2574 }, { "epoch": 1.4140582097748489, "grad_norm": 0.4708220637290964, "learning_rate": 5.53387209458448e-07, "loss": 0.243, "step": 2575 }, { "epoch": 1.414607358594179, "grad_norm": 0.3731129077020286, "learning_rate": 5.530984291381798e-07, "loss": 0.2716, "step": 2576 }, { "epoch": 1.415156507413509, "grad_norm": 0.4546681364342084, "learning_rate": 5.52809630916021e-07, "loss": 0.2629, "step": 2577 }, { "epoch": 1.4157056562328392, "grad_norm": 0.6906878348430655, "learning_rate": 5.525208148894306e-07, "loss": 0.2748, "step": 2578 }, { "epoch": 1.4162548050521693, "grad_norm": 0.46942272826029585, "learning_rate": 5.522319811558732e-07, "loss": 0.2676, "step": 2579 }, { "epoch": 1.4168039538714992, "grad_norm": 0.48851766663718893, "learning_rate": 5.519431298128196e-07, "loss": 0.2085, "step": 2580 }, { "epoch": 1.4173531026908293, "grad_norm": 0.504752960820369, "learning_rate": 5.516542609577467e-07, "loss": 0.2415, "step": 2581 }, { "epoch": 1.4179022515101591, "grad_norm": 0.605966929059181, "learning_rate": 5.513653746881365e-07, "loss": 0.2698, "step": 2582 }, { "epoch": 1.4184514003294892, "grad_norm": 0.573766959448316, "learning_rate": 5.510764711014782e-07, "loss": 0.2603, "step": 2583 }, { "epoch": 1.4190005491488193, "grad_norm": 0.5586293384385642, "learning_rate": 5.507875502952657e-07, "loss": 0.2489, "step": 2584 }, { "epoch": 1.4195496979681494, "grad_norm": 0.546072716851964, "learning_rate": 5.504986123669993e-07, "loss": 0.2878, "step": 2585 }, { "epoch": 1.4200988467874793, "grad_norm": 0.436997322045323, "learning_rate": 5.502096574141844e-07, "loss": 0.2502, "step": 2586 }, { "epoch": 1.4206479956068094, "grad_norm": 0.5397995331885239, "learning_rate": 5.499206855343336e-07, "loss": 0.2827, "step": 2587 }, { "epoch": 1.4211971444261395, "grad_norm": 0.680275200376474, "learning_rate": 5.496316968249634e-07, "loss": 0.2992, "step": 2588 }, { "epoch": 1.4217462932454694, "grad_norm": 0.4541928028488033, "learning_rate": 5.493426913835973e-07, "loss": 0.3032, "step": 2589 }, { "epoch": 1.4222954420647995, "grad_norm": 0.5513539341347823, "learning_rate": 5.490536693077639e-07, "loss": 0.251, "step": 2590 }, { "epoch": 1.4228445908841296, "grad_norm": 0.45338952528699095, "learning_rate": 5.487646306949973e-07, "loss": 0.2233, "step": 2591 }, { "epoch": 1.4233937397034597, "grad_norm": 0.5300731874273379, "learning_rate": 5.484755756428378e-07, "loss": 0.2369, "step": 2592 }, { "epoch": 1.4239428885227896, "grad_norm": 0.5631313003508217, "learning_rate": 5.481865042488303e-07, "loss": 0.2341, "step": 2593 }, { "epoch": 1.4244920373421197, "grad_norm": 0.39539025393663096, "learning_rate": 5.478974166105261e-07, "loss": 0.2461, "step": 2594 }, { "epoch": 1.4250411861614498, "grad_norm": 0.4185735648095177, "learning_rate": 5.476083128254817e-07, "loss": 0.2175, "step": 2595 }, { "epoch": 1.4255903349807797, "grad_norm": 0.4015870712824598, "learning_rate": 5.473191929912586e-07, "loss": 0.2488, "step": 2596 }, { "epoch": 1.4261394838001098, "grad_norm": 0.5286556184113277, "learning_rate": 5.470300572054246e-07, "loss": 0.2803, "step": 2597 }, { "epoch": 1.42668863261944, "grad_norm": 0.45533696118154177, "learning_rate": 5.467409055655519e-07, "loss": 0.26, "step": 2598 }, { "epoch": 1.42723778143877, "grad_norm": 0.43986376000385236, "learning_rate": 5.464517381692188e-07, "loss": 0.2551, "step": 2599 }, { "epoch": 1.4277869302580999, "grad_norm": 0.5337924992622999, "learning_rate": 5.461625551140085e-07, "loss": 0.2679, "step": 2600 }, { "epoch": 1.4277869302580999, "eval_loss": 0.3316677212715149, "eval_runtime": 18.662, "eval_samples_per_second": 23.738, "eval_steps_per_second": 1.018, "step": 2600 }, { "epoch": 1.42833607907743, "grad_norm": 0.506140021645145, "learning_rate": 5.458733564975097e-07, "loss": 0.2353, "step": 2601 }, { "epoch": 1.42888522789676, "grad_norm": 0.42814061027411326, "learning_rate": 5.455841424173163e-07, "loss": 0.2567, "step": 2602 }, { "epoch": 1.42943437671609, "grad_norm": 0.5344115049008108, "learning_rate": 5.452949129710275e-07, "loss": 0.2852, "step": 2603 }, { "epoch": 1.42998352553542, "grad_norm": 0.47070608186687296, "learning_rate": 5.450056682562473e-07, "loss": 0.2435, "step": 2604 }, { "epoch": 1.4305326743547502, "grad_norm": 0.6258021311931141, "learning_rate": 5.447164083705852e-07, "loss": 0.2634, "step": 2605 }, { "epoch": 1.4310818231740803, "grad_norm": 0.45701378140338594, "learning_rate": 5.44427133411656e-07, "loss": 0.2232, "step": 2606 }, { "epoch": 1.4316309719934102, "grad_norm": 0.3758460619300753, "learning_rate": 5.441378434770793e-07, "loss": 0.2566, "step": 2607 }, { "epoch": 1.4321801208127403, "grad_norm": 0.56930879889863, "learning_rate": 5.438485386644793e-07, "loss": 0.2415, "step": 2608 }, { "epoch": 1.4327292696320704, "grad_norm": 0.5159955252920533, "learning_rate": 5.435592190714865e-07, "loss": 0.2395, "step": 2609 }, { "epoch": 1.4332784184514002, "grad_norm": 1.4458408876279631, "learning_rate": 5.432698847957349e-07, "loss": 0.2974, "step": 2610 }, { "epoch": 1.4338275672707304, "grad_norm": 0.5688117102745841, "learning_rate": 5.429805359348647e-07, "loss": 0.2404, "step": 2611 }, { "epoch": 1.4343767160900605, "grad_norm": 0.46225900616139803, "learning_rate": 5.426911725865199e-07, "loss": 0.2384, "step": 2612 }, { "epoch": 1.4349258649093906, "grad_norm": 0.5214826644002363, "learning_rate": 5.424017948483504e-07, "loss": 0.2606, "step": 2613 }, { "epoch": 1.4354750137287204, "grad_norm": 0.44529699562222247, "learning_rate": 5.421124028180108e-07, "loss": 0.2177, "step": 2614 }, { "epoch": 1.4360241625480505, "grad_norm": 0.47321200397728697, "learning_rate": 5.418229965931594e-07, "loss": 0.2409, "step": 2615 }, { "epoch": 1.4365733113673804, "grad_norm": 0.5576254840904888, "learning_rate": 5.415335762714609e-07, "loss": 0.2344, "step": 2616 }, { "epoch": 1.4371224601867105, "grad_norm": 0.42250710323786533, "learning_rate": 5.412441419505838e-07, "loss": 0.2515, "step": 2617 }, { "epoch": 1.4376716090060406, "grad_norm": 0.4731986090222987, "learning_rate": 5.409546937282013e-07, "loss": 0.2467, "step": 2618 }, { "epoch": 1.4382207578253707, "grad_norm": 0.4880657685005266, "learning_rate": 5.406652317019916e-07, "loss": 0.2683, "step": 2619 }, { "epoch": 1.4387699066447008, "grad_norm": 0.43509023256039153, "learning_rate": 5.403757559696376e-07, "loss": 0.261, "step": 2620 }, { "epoch": 1.4393190554640307, "grad_norm": 0.6029958800878207, "learning_rate": 5.400862666288265e-07, "loss": 0.2525, "step": 2621 }, { "epoch": 1.4398682042833608, "grad_norm": 0.47333294480863286, "learning_rate": 5.397967637772505e-07, "loss": 0.2344, "step": 2622 }, { "epoch": 1.4404173531026907, "grad_norm": 0.5498391862105979, "learning_rate": 5.39507247512606e-07, "loss": 0.2805, "step": 2623 }, { "epoch": 1.4409665019220208, "grad_norm": 0.5792132187858705, "learning_rate": 5.392177179325941e-07, "loss": 0.2677, "step": 2624 }, { "epoch": 1.441515650741351, "grad_norm": 0.45634207142877636, "learning_rate": 5.389281751349205e-07, "loss": 0.2695, "step": 2625 }, { "epoch": 1.442064799560681, "grad_norm": 0.5111337837092467, "learning_rate": 5.38638619217295e-07, "loss": 0.2347, "step": 2626 }, { "epoch": 1.4426139483800111, "grad_norm": 0.5226584799766982, "learning_rate": 5.383490502774321e-07, "loss": 0.2445, "step": 2627 }, { "epoch": 1.443163097199341, "grad_norm": 0.6101541611251752, "learning_rate": 5.38059468413051e-07, "loss": 0.2609, "step": 2628 }, { "epoch": 1.443712246018671, "grad_norm": 0.616957255212788, "learning_rate": 5.377698737218742e-07, "loss": 0.2199, "step": 2629 }, { "epoch": 1.444261394838001, "grad_norm": 0.5183692949811273, "learning_rate": 5.374802663016299e-07, "loss": 0.2941, "step": 2630 }, { "epoch": 1.444810543657331, "grad_norm": 0.5860769920445142, "learning_rate": 5.371906462500499e-07, "loss": 0.2391, "step": 2631 }, { "epoch": 1.4453596924766612, "grad_norm": 0.4543579287778307, "learning_rate": 5.369010136648698e-07, "loss": 0.2356, "step": 2632 }, { "epoch": 1.4459088412959913, "grad_norm": 0.4858947003640477, "learning_rate": 5.366113686438304e-07, "loss": 0.2688, "step": 2633 }, { "epoch": 1.4464579901153214, "grad_norm": 0.39022698174573645, "learning_rate": 5.36321711284676e-07, "loss": 0.2425, "step": 2634 }, { "epoch": 1.4470071389346513, "grad_norm": 0.38506034226710706, "learning_rate": 5.360320416851552e-07, "loss": 0.2415, "step": 2635 }, { "epoch": 1.4475562877539814, "grad_norm": 0.6389120861997443, "learning_rate": 5.357423599430212e-07, "loss": 0.25, "step": 2636 }, { "epoch": 1.4481054365733113, "grad_norm": 0.4498904953062231, "learning_rate": 5.354526661560305e-07, "loss": 0.2901, "step": 2637 }, { "epoch": 1.4486545853926414, "grad_norm": 0.5173101027188047, "learning_rate": 5.351629604219444e-07, "loss": 0.2732, "step": 2638 }, { "epoch": 1.4492037342119715, "grad_norm": 0.4845016181242439, "learning_rate": 5.348732428385276e-07, "loss": 0.2719, "step": 2639 }, { "epoch": 1.4497528830313016, "grad_norm": 0.4825193919964431, "learning_rate": 5.345835135035493e-07, "loss": 0.2437, "step": 2640 }, { "epoch": 1.4503020318506314, "grad_norm": 0.41436395796893055, "learning_rate": 5.342937725147824e-07, "loss": 0.2313, "step": 2641 }, { "epoch": 1.4508511806699615, "grad_norm": 0.46689583167545246, "learning_rate": 5.34004019970004e-07, "loss": 0.2203, "step": 2642 }, { "epoch": 1.4514003294892917, "grad_norm": 0.4408505165786209, "learning_rate": 5.337142559669947e-07, "loss": 0.2763, "step": 2643 }, { "epoch": 1.4519494783086215, "grad_norm": 0.6343247874570068, "learning_rate": 5.334244806035393e-07, "loss": 0.2719, "step": 2644 }, { "epoch": 1.4524986271279516, "grad_norm": 0.39150453636226124, "learning_rate": 5.331346939774262e-07, "loss": 0.2212, "step": 2645 }, { "epoch": 1.4530477759472817, "grad_norm": 0.5350730065114141, "learning_rate": 5.328448961864476e-07, "loss": 0.2622, "step": 2646 }, { "epoch": 1.4535969247666118, "grad_norm": 0.4290653147645335, "learning_rate": 5.325550873284002e-07, "loss": 0.2453, "step": 2647 }, { "epoch": 1.4541460735859417, "grad_norm": 0.42234693864061046, "learning_rate": 5.322652675010831e-07, "loss": 0.2351, "step": 2648 }, { "epoch": 1.4546952224052718, "grad_norm": 0.5251760271542221, "learning_rate": 5.319754368022999e-07, "loss": 0.257, "step": 2649 }, { "epoch": 1.455244371224602, "grad_norm": 0.6055466949052233, "learning_rate": 5.316855953298581e-07, "loss": 0.2956, "step": 2650 }, { "epoch": 1.4557935200439318, "grad_norm": 0.42472652979370273, "learning_rate": 5.313957431815683e-07, "loss": 0.2299, "step": 2651 }, { "epoch": 1.456342668863262, "grad_norm": 0.4978694832161189, "learning_rate": 5.311058804552451e-07, "loss": 0.2561, "step": 2652 }, { "epoch": 1.456891817682592, "grad_norm": 0.5700744751729543, "learning_rate": 5.308160072487063e-07, "loss": 0.2548, "step": 2653 }, { "epoch": 1.4574409665019221, "grad_norm": 0.5785968744616552, "learning_rate": 5.305261236597736e-07, "loss": 0.2384, "step": 2654 }, { "epoch": 1.457990115321252, "grad_norm": 0.6136199304080862, "learning_rate": 5.30236229786272e-07, "loss": 0.297, "step": 2655 }, { "epoch": 1.458539264140582, "grad_norm": 0.50998355629445, "learning_rate": 5.299463257260298e-07, "loss": 0.2589, "step": 2656 }, { "epoch": 1.4590884129599122, "grad_norm": 0.5930326135534739, "learning_rate": 5.296564115768791e-07, "loss": 0.2141, "step": 2657 }, { "epoch": 1.459637561779242, "grad_norm": 0.5905736821105259, "learning_rate": 5.293664874366553e-07, "loss": 0.2429, "step": 2658 }, { "epoch": 1.4601867105985722, "grad_norm": 0.4964043577377883, "learning_rate": 5.290765534031969e-07, "loss": 0.3254, "step": 2659 }, { "epoch": 1.4607358594179023, "grad_norm": 0.5603752880220949, "learning_rate": 5.287866095743462e-07, "loss": 0.2859, "step": 2660 }, { "epoch": 1.4612850082372324, "grad_norm": 0.6600462159729269, "learning_rate": 5.284966560479485e-07, "loss": 0.3033, "step": 2661 }, { "epoch": 1.4618341570565623, "grad_norm": 0.4699250880858402, "learning_rate": 5.282066929218524e-07, "loss": 0.2528, "step": 2662 }, { "epoch": 1.4623833058758924, "grad_norm": 0.5797770154985005, "learning_rate": 5.279167202939098e-07, "loss": 0.2465, "step": 2663 }, { "epoch": 1.4629324546952225, "grad_norm": 0.583065554331777, "learning_rate": 5.276267382619757e-07, "loss": 0.3069, "step": 2664 }, { "epoch": 1.4634816035145524, "grad_norm": 0.5649048337148856, "learning_rate": 5.273367469239083e-07, "loss": 0.283, "step": 2665 }, { "epoch": 1.4640307523338825, "grad_norm": 0.6121639525245796, "learning_rate": 5.270467463775691e-07, "loss": 0.2612, "step": 2666 }, { "epoch": 1.4645799011532126, "grad_norm": 0.45398134711720356, "learning_rate": 5.267567367208227e-07, "loss": 0.2453, "step": 2667 }, { "epoch": 1.4651290499725427, "grad_norm": 0.4282179423529771, "learning_rate": 5.264667180515365e-07, "loss": 0.2805, "step": 2668 }, { "epoch": 1.4656781987918726, "grad_norm": 0.5531656173222034, "learning_rate": 5.261766904675813e-07, "loss": 0.2605, "step": 2669 }, { "epoch": 1.4662273476112027, "grad_norm": 0.5565252140594215, "learning_rate": 5.258866540668305e-07, "loss": 0.2953, "step": 2670 }, { "epoch": 1.4667764964305325, "grad_norm": 0.4665867633885385, "learning_rate": 5.255966089471607e-07, "loss": 0.2815, "step": 2671 }, { "epoch": 1.4673256452498626, "grad_norm": 0.6569944690830295, "learning_rate": 5.253065552064517e-07, "loss": 0.2333, "step": 2672 }, { "epoch": 1.4678747940691927, "grad_norm": 0.572554215877364, "learning_rate": 5.250164929425858e-07, "loss": 0.2892, "step": 2673 }, { "epoch": 1.4684239428885228, "grad_norm": 0.4238094446643601, "learning_rate": 5.247264222534483e-07, "loss": 0.3005, "step": 2674 }, { "epoch": 1.468973091707853, "grad_norm": 0.40406640471735505, "learning_rate": 5.244363432369274e-07, "loss": 0.2355, "step": 2675 }, { "epoch": 1.4695222405271828, "grad_norm": 0.43470440123019133, "learning_rate": 5.241462559909142e-07, "loss": 0.244, "step": 2676 }, { "epoch": 1.470071389346513, "grad_norm": 0.4149616378586847, "learning_rate": 5.23856160613302e-07, "loss": 0.2605, "step": 2677 }, { "epoch": 1.4706205381658428, "grad_norm": 0.47488965214710716, "learning_rate": 5.235660572019879e-07, "loss": 0.2556, "step": 2678 }, { "epoch": 1.471169686985173, "grad_norm": 0.4834643430587485, "learning_rate": 5.23275945854871e-07, "loss": 0.2627, "step": 2679 }, { "epoch": 1.471718835804503, "grad_norm": 0.46479757121507154, "learning_rate": 5.229858266698527e-07, "loss": 0.2378, "step": 2680 }, { "epoch": 1.4722679846238331, "grad_norm": 0.45106727718447925, "learning_rate": 5.226956997448381e-07, "loss": 0.2366, "step": 2681 }, { "epoch": 1.4728171334431632, "grad_norm": 0.6245623747590792, "learning_rate": 5.224055651777341e-07, "loss": 0.2245, "step": 2682 }, { "epoch": 1.473366282262493, "grad_norm": 0.5333705901248235, "learning_rate": 5.221154230664503e-07, "loss": 0.2585, "step": 2683 }, { "epoch": 1.4739154310818232, "grad_norm": 0.4454217985917844, "learning_rate": 5.218252735088994e-07, "loss": 0.2709, "step": 2684 }, { "epoch": 1.474464579901153, "grad_norm": 0.492677155289305, "learning_rate": 5.215351166029958e-07, "loss": 0.2333, "step": 2685 }, { "epoch": 1.4750137287204832, "grad_norm": 0.541343606657762, "learning_rate": 5.212449524466568e-07, "loss": 0.2484, "step": 2686 }, { "epoch": 1.4755628775398133, "grad_norm": 0.599707183663272, "learning_rate": 5.209547811378024e-07, "loss": 0.2643, "step": 2687 }, { "epoch": 1.4761120263591434, "grad_norm": 0.4764911838342862, "learning_rate": 5.206646027743542e-07, "loss": 0.2394, "step": 2688 }, { "epoch": 1.4766611751784733, "grad_norm": 0.702106968173194, "learning_rate": 5.203744174542373e-07, "loss": 0.2314, "step": 2689 }, { "epoch": 1.4772103239978034, "grad_norm": 0.5433508843543885, "learning_rate": 5.200842252753783e-07, "loss": 0.257, "step": 2690 }, { "epoch": 1.4777594728171335, "grad_norm": 0.39340349662844637, "learning_rate": 5.197940263357064e-07, "loss": 0.2511, "step": 2691 }, { "epoch": 1.4783086216364634, "grad_norm": 0.47094471464433973, "learning_rate": 5.195038207331526e-07, "loss": 0.2188, "step": 2692 }, { "epoch": 1.4788577704557935, "grad_norm": 0.6779354378610623, "learning_rate": 5.192136085656513e-07, "loss": 0.2733, "step": 2693 }, { "epoch": 1.4794069192751236, "grad_norm": 0.4980524396937037, "learning_rate": 5.189233899311382e-07, "loss": 0.2618, "step": 2694 }, { "epoch": 1.4799560680944537, "grad_norm": 0.468198827537501, "learning_rate": 5.186331649275513e-07, "loss": 0.2579, "step": 2695 }, { "epoch": 1.4805052169137836, "grad_norm": 0.6309454274631521, "learning_rate": 5.183429336528308e-07, "loss": 0.2329, "step": 2696 }, { "epoch": 1.4810543657331137, "grad_norm": 0.6001086497412195, "learning_rate": 5.18052696204919e-07, "loss": 0.2316, "step": 2697 }, { "epoch": 1.4816035145524438, "grad_norm": 0.5296571386786367, "learning_rate": 5.177624526817605e-07, "loss": 0.2621, "step": 2698 }, { "epoch": 1.4821526633717736, "grad_norm": 0.5165644413207069, "learning_rate": 5.174722031813019e-07, "loss": 0.2635, "step": 2699 }, { "epoch": 1.4827018121911038, "grad_norm": 0.4407409227683134, "learning_rate": 5.171819478014915e-07, "loss": 0.3084, "step": 2700 }, { "epoch": 1.4832509610104339, "grad_norm": 0.46859597156517513, "learning_rate": 5.1689168664028e-07, "loss": 0.2609, "step": 2701 }, { "epoch": 1.483800109829764, "grad_norm": 0.45523213938016316, "learning_rate": 5.166014197956197e-07, "loss": 0.2758, "step": 2702 }, { "epoch": 1.4843492586490938, "grad_norm": 0.42492725823899286, "learning_rate": 5.163111473654649e-07, "loss": 0.2527, "step": 2703 }, { "epoch": 1.484898407468424, "grad_norm": 0.5647787170303656, "learning_rate": 5.160208694477719e-07, "loss": 0.2488, "step": 2704 }, { "epoch": 1.485447556287754, "grad_norm": 0.5579652793177825, "learning_rate": 5.157305861404989e-07, "loss": 0.2609, "step": 2705 }, { "epoch": 1.485996705107084, "grad_norm": 0.49343176120630367, "learning_rate": 5.154402975416059e-07, "loss": 0.2449, "step": 2706 }, { "epoch": 1.486545853926414, "grad_norm": 1.3426944652586155, "learning_rate": 5.151500037490544e-07, "loss": 0.3039, "step": 2707 }, { "epoch": 1.4870950027457441, "grad_norm": 0.4756923426018933, "learning_rate": 5.148597048608079e-07, "loss": 0.2405, "step": 2708 }, { "epoch": 1.4876441515650742, "grad_norm": 0.5196741383174757, "learning_rate": 5.145694009748316e-07, "loss": 0.2588, "step": 2709 }, { "epoch": 1.4881933003844041, "grad_norm": 0.4485991373916076, "learning_rate": 5.142790921890923e-07, "loss": 0.2388, "step": 2710 }, { "epoch": 1.4887424492037342, "grad_norm": 0.6206980603521447, "learning_rate": 5.139887786015589e-07, "loss": 0.2534, "step": 2711 }, { "epoch": 1.4892915980230643, "grad_norm": 0.4207048720145005, "learning_rate": 5.136984603102011e-07, "loss": 0.2493, "step": 2712 }, { "epoch": 1.4898407468423942, "grad_norm": 0.4686330635148352, "learning_rate": 5.134081374129908e-07, "loss": 0.2236, "step": 2713 }, { "epoch": 1.4903898956617243, "grad_norm": 0.3882104550111546, "learning_rate": 5.13117810007901e-07, "loss": 0.2929, "step": 2714 }, { "epoch": 1.4909390444810544, "grad_norm": 0.610081166844695, "learning_rate": 5.128274781929069e-07, "loss": 0.2396, "step": 2715 }, { "epoch": 1.4914881933003845, "grad_norm": 0.4324690403827052, "learning_rate": 5.125371420659848e-07, "loss": 0.2799, "step": 2716 }, { "epoch": 1.4920373421197144, "grad_norm": 0.44893638495618865, "learning_rate": 5.122468017251123e-07, "loss": 0.2493, "step": 2717 }, { "epoch": 1.4925864909390445, "grad_norm": 0.6030894086321118, "learning_rate": 5.119564572682684e-07, "loss": 0.2486, "step": 2718 }, { "epoch": 1.4931356397583746, "grad_norm": 0.5319080072435015, "learning_rate": 5.116661087934339e-07, "loss": 0.2611, "step": 2719 }, { "epoch": 1.4936847885777045, "grad_norm": 0.4904603796543683, "learning_rate": 5.113757563985905e-07, "loss": 0.2143, "step": 2720 }, { "epoch": 1.4942339373970346, "grad_norm": 0.3943689438120682, "learning_rate": 5.110854001817218e-07, "loss": 0.2338, "step": 2721 }, { "epoch": 1.4947830862163647, "grad_norm": 0.4280408818761351, "learning_rate": 5.107950402408117e-07, "loss": 0.2532, "step": 2722 }, { "epoch": 1.4953322350356948, "grad_norm": 0.4712825699658468, "learning_rate": 5.105046766738468e-07, "loss": 0.2693, "step": 2723 }, { "epoch": 1.4958813838550247, "grad_norm": 0.6279286985478026, "learning_rate": 5.102143095788136e-07, "loss": 0.2684, "step": 2724 }, { "epoch": 1.4964305326743548, "grad_norm": 0.4676177476682958, "learning_rate": 5.099239390537003e-07, "loss": 0.2419, "step": 2725 }, { "epoch": 1.4969796814936847, "grad_norm": 0.4783012679225463, "learning_rate": 5.096335651964962e-07, "loss": 0.229, "step": 2726 }, { "epoch": 1.4975288303130148, "grad_norm": 0.5479412453284265, "learning_rate": 5.093431881051923e-07, "loss": 0.2677, "step": 2727 }, { "epoch": 1.4980779791323449, "grad_norm": 0.5287442832379158, "learning_rate": 5.090528078777796e-07, "loss": 0.2456, "step": 2728 }, { "epoch": 1.498627127951675, "grad_norm": 0.4726957226628252, "learning_rate": 5.087624246122509e-07, "loss": 0.2559, "step": 2729 }, { "epoch": 1.499176276771005, "grad_norm": 0.6389936118238199, "learning_rate": 5.084720384065998e-07, "loss": 0.2526, "step": 2730 }, { "epoch": 1.499725425590335, "grad_norm": 0.6294137990739049, "learning_rate": 5.081816493588209e-07, "loss": 0.2595, "step": 2731 }, { "epoch": 1.500274574409665, "grad_norm": 0.5661325736011077, "learning_rate": 5.078912575669102e-07, "loss": 0.2553, "step": 2732 }, { "epoch": 1.500823723228995, "grad_norm": 0.44453987711884313, "learning_rate": 5.076008631288639e-07, "loss": 0.2507, "step": 2733 }, { "epoch": 1.501372872048325, "grad_norm": 0.41941441739540863, "learning_rate": 5.073104661426795e-07, "loss": 0.2575, "step": 2734 }, { "epoch": 1.5019220208676551, "grad_norm": 0.4024798561247544, "learning_rate": 5.070200667063552e-07, "loss": 0.2869, "step": 2735 }, { "epoch": 1.5024711696869852, "grad_norm": 0.4851995634394856, "learning_rate": 5.0672966491789e-07, "loss": 0.2117, "step": 2736 }, { "epoch": 1.5030203185063153, "grad_norm": 0.49586482497733075, "learning_rate": 5.064392608752842e-07, "loss": 0.2213, "step": 2737 }, { "epoch": 1.5035694673256452, "grad_norm": 0.610983778213739, "learning_rate": 5.061488546765381e-07, "loss": 0.2996, "step": 2738 }, { "epoch": 1.5041186161449753, "grad_norm": 0.4901422886842051, "learning_rate": 5.058584464196535e-07, "loss": 0.2479, "step": 2739 }, { "epoch": 1.5046677649643052, "grad_norm": 0.4217227682290988, "learning_rate": 5.05568036202632e-07, "loss": 0.2812, "step": 2740 }, { "epoch": 1.5052169137836353, "grad_norm": 0.4728839540110898, "learning_rate": 5.052776241234765e-07, "loss": 0.2468, "step": 2741 }, { "epoch": 1.5057660626029654, "grad_norm": 0.4811207959723812, "learning_rate": 5.049872102801907e-07, "loss": 0.2514, "step": 2742 }, { "epoch": 1.5063152114222955, "grad_norm": 0.6262813840019746, "learning_rate": 5.04696794770778e-07, "loss": 0.2567, "step": 2743 }, { "epoch": 1.5068643602416256, "grad_norm": 0.6070475350339779, "learning_rate": 5.044063776932435e-07, "loss": 0.2682, "step": 2744 }, { "epoch": 1.5074135090609555, "grad_norm": 0.5430928628778702, "learning_rate": 5.04115959145592e-07, "loss": 0.3159, "step": 2745 }, { "epoch": 1.5079626578802856, "grad_norm": 0.42694634996063924, "learning_rate": 5.038255392258292e-07, "loss": 0.28, "step": 2746 }, { "epoch": 1.5085118066996155, "grad_norm": 0.4997085437107576, "learning_rate": 5.035351180319607e-07, "loss": 0.2726, "step": 2747 }, { "epoch": 1.5090609555189456, "grad_norm": 0.491112701998596, "learning_rate": 5.032446956619933e-07, "loss": 0.2488, "step": 2748 }, { "epoch": 1.5096101043382757, "grad_norm": 0.5624050785288854, "learning_rate": 5.02954272213934e-07, "loss": 0.2479, "step": 2749 }, { "epoch": 1.5101592531576058, "grad_norm": 0.46811582611774843, "learning_rate": 5.026638477857898e-07, "loss": 0.261, "step": 2750 }, { "epoch": 1.510708401976936, "grad_norm": 0.5568232500099091, "learning_rate": 5.023734224755682e-07, "loss": 0.2668, "step": 2751 }, { "epoch": 1.5112575507962658, "grad_norm": 0.504604675938705, "learning_rate": 5.020829963812772e-07, "loss": 0.2334, "step": 2752 }, { "epoch": 1.5118066996155957, "grad_norm": 0.4298497785727823, "learning_rate": 5.017925696009246e-07, "loss": 0.25, "step": 2753 }, { "epoch": 1.5123558484349258, "grad_norm": 0.5428490430779151, "learning_rate": 5.015021422325191e-07, "loss": 0.2309, "step": 2754 }, { "epoch": 1.5129049972542559, "grad_norm": 0.4758247364992933, "learning_rate": 5.012117143740691e-07, "loss": 0.2321, "step": 2755 }, { "epoch": 1.513454146073586, "grad_norm": 0.6265576010153409, "learning_rate": 5.009212861235835e-07, "loss": 0.2389, "step": 2756 }, { "epoch": 1.514003294892916, "grad_norm": 0.45926949556654206, "learning_rate": 5.006308575790705e-07, "loss": 0.2486, "step": 2757 }, { "epoch": 1.5145524437122462, "grad_norm": 0.5247480638344076, "learning_rate": 5.003404288385398e-07, "loss": 0.2453, "step": 2758 }, { "epoch": 1.515101592531576, "grad_norm": 0.49608735657361896, "learning_rate": 5.0005e-07, "loss": 0.2524, "step": 2759 }, { "epoch": 1.515650741350906, "grad_norm": 0.5732476954448429, "learning_rate": 4.997595711614601e-07, "loss": 0.2566, "step": 2760 }, { "epoch": 1.516199890170236, "grad_norm": 0.5339956472299764, "learning_rate": 4.994691424209294e-07, "loss": 0.2865, "step": 2761 }, { "epoch": 1.5167490389895661, "grad_norm": 0.511260164508659, "learning_rate": 4.991787138764166e-07, "loss": 0.2451, "step": 2762 }, { "epoch": 1.5172981878088962, "grad_norm": 0.46790112493431585, "learning_rate": 4.988882856259308e-07, "loss": 0.2518, "step": 2763 }, { "epoch": 1.5178473366282264, "grad_norm": 0.5529417222910061, "learning_rate": 4.985978577674808e-07, "loss": 0.2793, "step": 2764 }, { "epoch": 1.5183964854475562, "grad_norm": 0.5244153677044221, "learning_rate": 4.983074303990752e-07, "loss": 0.2449, "step": 2765 }, { "epoch": 1.5189456342668863, "grad_norm": 0.5624430538268501, "learning_rate": 4.980170036187228e-07, "loss": 0.2653, "step": 2766 }, { "epoch": 1.5194947830862162, "grad_norm": 0.5570505670582001, "learning_rate": 4.977265775244318e-07, "loss": 0.2805, "step": 2767 }, { "epoch": 1.5200439319055463, "grad_norm": 0.5538178841106489, "learning_rate": 4.974361522142103e-07, "loss": 0.3197, "step": 2768 }, { "epoch": 1.5205930807248764, "grad_norm": 0.3895195377805498, "learning_rate": 4.971457277860661e-07, "loss": 0.2549, "step": 2769 }, { "epoch": 1.5211422295442065, "grad_norm": 0.5339255523984222, "learning_rate": 4.968553043380066e-07, "loss": 0.2525, "step": 2770 }, { "epoch": 1.5216913783635366, "grad_norm": 0.5030742725309424, "learning_rate": 4.965648819680394e-07, "loss": 0.2389, "step": 2771 }, { "epoch": 1.5222405271828665, "grad_norm": 0.4073478753597533, "learning_rate": 4.962744607741711e-07, "loss": 0.2592, "step": 2772 }, { "epoch": 1.5227896760021966, "grad_norm": 0.47508916853304656, "learning_rate": 4.959840408544082e-07, "loss": 0.2232, "step": 2773 }, { "epoch": 1.5233388248215265, "grad_norm": 0.45696601647300056, "learning_rate": 4.956936223067565e-07, "loss": 0.2431, "step": 2774 }, { "epoch": 1.5238879736408566, "grad_norm": 0.578582244142802, "learning_rate": 4.954032052292219e-07, "loss": 0.2869, "step": 2775 }, { "epoch": 1.5244371224601867, "grad_norm": 0.4447708933681857, "learning_rate": 4.951127897198094e-07, "loss": 0.2543, "step": 2776 }, { "epoch": 1.5249862712795168, "grad_norm": 0.5865695691582431, "learning_rate": 4.948223758765233e-07, "loss": 0.2434, "step": 2777 }, { "epoch": 1.525535420098847, "grad_norm": 0.4724304171462187, "learning_rate": 4.945319637973682e-07, "loss": 0.2439, "step": 2778 }, { "epoch": 1.5260845689181768, "grad_norm": 0.5146074905577956, "learning_rate": 4.942415535803467e-07, "loss": 0.2325, "step": 2779 }, { "epoch": 1.526633717737507, "grad_norm": 0.5559587529345579, "learning_rate": 4.939511453234618e-07, "loss": 0.2748, "step": 2780 }, { "epoch": 1.5271828665568368, "grad_norm": 0.4366947372514167, "learning_rate": 4.936607391247159e-07, "loss": 0.2659, "step": 2781 }, { "epoch": 1.5277320153761669, "grad_norm": 0.5169265145637658, "learning_rate": 4.933703350821099e-07, "loss": 0.2852, "step": 2782 }, { "epoch": 1.528281164195497, "grad_norm": 0.616839517904532, "learning_rate": 4.930799332936451e-07, "loss": 0.2503, "step": 2783 }, { "epoch": 1.528830313014827, "grad_norm": 0.4454459922958409, "learning_rate": 4.927895338573206e-07, "loss": 0.2705, "step": 2784 }, { "epoch": 1.5293794618341572, "grad_norm": 0.46594004554378604, "learning_rate": 4.924991368711361e-07, "loss": 0.2615, "step": 2785 }, { "epoch": 1.529928610653487, "grad_norm": 0.4909687962361828, "learning_rate": 4.922087424330898e-07, "loss": 0.2449, "step": 2786 }, { "epoch": 1.5304777594728172, "grad_norm": 0.7049498724185411, "learning_rate": 4.919183506411788e-07, "loss": 0.2905, "step": 2787 }, { "epoch": 1.531026908292147, "grad_norm": 0.4732403124777763, "learning_rate": 4.916279615934001e-07, "loss": 0.2602, "step": 2788 }, { "epoch": 1.5315760571114772, "grad_norm": 0.5182010829093522, "learning_rate": 4.913375753877492e-07, "loss": 0.2672, "step": 2789 }, { "epoch": 1.5321252059308073, "grad_norm": 0.5026743147169596, "learning_rate": 4.910471921222205e-07, "loss": 0.2625, "step": 2790 }, { "epoch": 1.5326743547501374, "grad_norm": 0.44690930349256075, "learning_rate": 4.907568118948077e-07, "loss": 0.249, "step": 2791 }, { "epoch": 1.5332235035694675, "grad_norm": 0.5718200409503246, "learning_rate": 4.904664348035035e-07, "loss": 0.2351, "step": 2792 }, { "epoch": 1.5337726523887973, "grad_norm": 0.5512620151554342, "learning_rate": 4.901760609462997e-07, "loss": 0.2733, "step": 2793 }, { "epoch": 1.5343218012081274, "grad_norm": 0.699421534858735, "learning_rate": 4.898856904211865e-07, "loss": 0.2359, "step": 2794 }, { "epoch": 1.5348709500274573, "grad_norm": 0.509253187397344, "learning_rate": 4.895953233261532e-07, "loss": 0.2556, "step": 2795 }, { "epoch": 1.5354200988467874, "grad_norm": 0.5551986883696253, "learning_rate": 4.893049597591881e-07, "loss": 0.2609, "step": 2796 }, { "epoch": 1.5359692476661175, "grad_norm": 0.5507101551011097, "learning_rate": 4.890145998182782e-07, "loss": 0.2663, "step": 2797 }, { "epoch": 1.5365183964854476, "grad_norm": 0.5518083046074918, "learning_rate": 4.887242436014094e-07, "loss": 0.225, "step": 2798 }, { "epoch": 1.5370675453047777, "grad_norm": 0.5100213412006098, "learning_rate": 4.884338912065661e-07, "loss": 0.2843, "step": 2799 }, { "epoch": 1.5376166941241076, "grad_norm": 0.5627764538731767, "learning_rate": 4.881435427317318e-07, "loss": 0.259, "step": 2800 }, { "epoch": 1.5376166941241076, "eval_loss": 0.3302614986896515, "eval_runtime": 18.6514, "eval_samples_per_second": 23.752, "eval_steps_per_second": 1.019, "step": 2800 }, { "epoch": 1.5381658429434377, "grad_norm": 0.5079239392088903, "learning_rate": 4.878531982748878e-07, "loss": 0.221, "step": 2801 }, { "epoch": 1.5387149917627676, "grad_norm": 0.47189967828123247, "learning_rate": 4.875628579340152e-07, "loss": 0.2636, "step": 2802 }, { "epoch": 1.5392641405820977, "grad_norm": 0.46554472572458666, "learning_rate": 4.872725218070929e-07, "loss": 0.2355, "step": 2803 }, { "epoch": 1.5398132894014278, "grad_norm": 0.5274608123664462, "learning_rate": 4.869821899920989e-07, "loss": 0.2428, "step": 2804 }, { "epoch": 1.540362438220758, "grad_norm": 0.4220397869253295, "learning_rate": 4.866918625870093e-07, "loss": 0.2543, "step": 2805 }, { "epoch": 1.540911587040088, "grad_norm": 0.5095811165668853, "learning_rate": 4.864015396897991e-07, "loss": 0.2538, "step": 2806 }, { "epoch": 1.541460735859418, "grad_norm": 0.5697834414711409, "learning_rate": 4.861112213984412e-07, "loss": 0.2641, "step": 2807 }, { "epoch": 1.5420098846787478, "grad_norm": 0.5035818037724551, "learning_rate": 4.858209078109075e-07, "loss": 0.2464, "step": 2808 }, { "epoch": 1.5425590334980779, "grad_norm": 0.4456871751261883, "learning_rate": 4.855305990251683e-07, "loss": 0.2321, "step": 2809 }, { "epoch": 1.543108182317408, "grad_norm": 0.5023359837200629, "learning_rate": 4.852402951391921e-07, "loss": 0.2935, "step": 2810 }, { "epoch": 1.543657331136738, "grad_norm": 0.5208603097744479, "learning_rate": 4.849499962509457e-07, "loss": 0.2673, "step": 2811 }, { "epoch": 1.5442064799560682, "grad_norm": 0.4771329519172141, "learning_rate": 4.846597024583941e-07, "loss": 0.2294, "step": 2812 }, { "epoch": 1.5447556287753983, "grad_norm": 0.4586798953661012, "learning_rate": 4.843694138595009e-07, "loss": 0.2713, "step": 2813 }, { "epoch": 1.5453047775947282, "grad_norm": 0.4293257092269013, "learning_rate": 4.840791305522279e-07, "loss": 0.2615, "step": 2814 }, { "epoch": 1.545853926414058, "grad_norm": 0.5131812310994579, "learning_rate": 4.837888526345351e-07, "loss": 0.2406, "step": 2815 }, { "epoch": 1.5464030752333882, "grad_norm": 0.41261192051933104, "learning_rate": 4.834985802043805e-07, "loss": 0.2561, "step": 2816 }, { "epoch": 1.5469522240527183, "grad_norm": 0.5330911791964046, "learning_rate": 4.832083133597201e-07, "loss": 0.2512, "step": 2817 }, { "epoch": 1.5475013728720484, "grad_norm": 0.4793069942382472, "learning_rate": 4.829180521985084e-07, "loss": 0.2337, "step": 2818 }, { "epoch": 1.5480505216913785, "grad_norm": 0.6861835783449852, "learning_rate": 4.826277968186981e-07, "loss": 0.3115, "step": 2819 }, { "epoch": 1.5485996705107083, "grad_norm": 0.47167697189091795, "learning_rate": 4.823375473182394e-07, "loss": 0.2505, "step": 2820 }, { "epoch": 1.5491488193300385, "grad_norm": 0.4335175847578868, "learning_rate": 4.820473037950809e-07, "loss": 0.2408, "step": 2821 }, { "epoch": 1.5496979681493683, "grad_norm": 0.5367871943997862, "learning_rate": 4.817570663471693e-07, "loss": 0.2811, "step": 2822 }, { "epoch": 1.5502471169686984, "grad_norm": 0.516667475918073, "learning_rate": 4.814668350724488e-07, "loss": 0.2534, "step": 2823 }, { "epoch": 1.5507962657880285, "grad_norm": 0.4311751040848791, "learning_rate": 4.811766100688619e-07, "loss": 0.2216, "step": 2824 }, { "epoch": 1.5513454146073586, "grad_norm": 0.47806329077048054, "learning_rate": 4.808863914343485e-07, "loss": 0.2548, "step": 2825 }, { "epoch": 1.5518945634266887, "grad_norm": 0.5159309659668649, "learning_rate": 4.805961792668472e-07, "loss": 0.2399, "step": 2826 }, { "epoch": 1.5524437122460186, "grad_norm": 0.40036973603765497, "learning_rate": 4.803059736642939e-07, "loss": 0.243, "step": 2827 }, { "epoch": 1.5529928610653487, "grad_norm": 0.4839111243014504, "learning_rate": 4.800157747246218e-07, "loss": 0.2011, "step": 2828 }, { "epoch": 1.5535420098846786, "grad_norm": 0.5146675444979022, "learning_rate": 4.797255825457627e-07, "loss": 0.281, "step": 2829 }, { "epoch": 1.5540911587040087, "grad_norm": 0.3958161927252821, "learning_rate": 4.794353972256456e-07, "loss": 0.2519, "step": 2830 }, { "epoch": 1.5546403075233388, "grad_norm": 0.5641128052728285, "learning_rate": 4.791452188621977e-07, "loss": 0.2344, "step": 2831 }, { "epoch": 1.555189456342669, "grad_norm": 0.47553884372946037, "learning_rate": 4.788550475533431e-07, "loss": 0.2475, "step": 2832 }, { "epoch": 1.555738605161999, "grad_norm": 0.5227082372446495, "learning_rate": 4.785648833970044e-07, "loss": 0.2716, "step": 2833 }, { "epoch": 1.556287753981329, "grad_norm": 0.5457467193642, "learning_rate": 4.782747264911008e-07, "loss": 0.2448, "step": 2834 }, { "epoch": 1.556836902800659, "grad_norm": 0.519459123520073, "learning_rate": 4.779845769335496e-07, "loss": 0.2485, "step": 2835 }, { "epoch": 1.5573860516199889, "grad_norm": 0.43250234112820973, "learning_rate": 4.776944348222659e-07, "loss": 0.2387, "step": 2836 }, { "epoch": 1.557935200439319, "grad_norm": 0.4219763667737606, "learning_rate": 4.774043002551619e-07, "loss": 0.2414, "step": 2837 }, { "epoch": 1.558484349258649, "grad_norm": 0.49870717394184294, "learning_rate": 4.771141733301474e-07, "loss": 0.2549, "step": 2838 }, { "epoch": 1.5590334980779792, "grad_norm": 0.5180297132312424, "learning_rate": 4.7682405414512914e-07, "loss": 0.2339, "step": 2839 }, { "epoch": 1.5595826468973093, "grad_norm": 0.4023740770065276, "learning_rate": 4.765339427980121e-07, "loss": 0.2531, "step": 2840 }, { "epoch": 1.5601317957166392, "grad_norm": 0.6759746217795632, "learning_rate": 4.7624383938669795e-07, "loss": 0.2457, "step": 2841 }, { "epoch": 1.5606809445359693, "grad_norm": 0.4582921175709837, "learning_rate": 4.7595374400908586e-07, "loss": 0.1942, "step": 2842 }, { "epoch": 1.5612300933552992, "grad_norm": 0.4408915341293445, "learning_rate": 4.7566365676307254e-07, "loss": 0.24, "step": 2843 }, { "epoch": 1.5617792421746293, "grad_norm": 0.5412665892755424, "learning_rate": 4.753735777465517e-07, "loss": 0.2497, "step": 2844 }, { "epoch": 1.5623283909939594, "grad_norm": 0.5910076589366715, "learning_rate": 4.750835070574143e-07, "loss": 0.2483, "step": 2845 }, { "epoch": 1.5628775398132895, "grad_norm": 0.47649446864655787, "learning_rate": 4.747934447935483e-07, "loss": 0.2403, "step": 2846 }, { "epoch": 1.5634266886326196, "grad_norm": 0.5713036156338777, "learning_rate": 4.745033910528392e-07, "loss": 0.2355, "step": 2847 }, { "epoch": 1.5639758374519495, "grad_norm": 0.5221159320948752, "learning_rate": 4.742133459331695e-07, "loss": 0.2497, "step": 2848 }, { "epoch": 1.5645249862712796, "grad_norm": 0.4792748310165088, "learning_rate": 4.739233095324189e-07, "loss": 0.2677, "step": 2849 }, { "epoch": 1.5650741350906094, "grad_norm": 0.5071728225173472, "learning_rate": 4.736332819484636e-07, "loss": 0.244, "step": 2850 }, { "epoch": 1.5656232839099395, "grad_norm": 0.5242730812459762, "learning_rate": 4.733432632791774e-07, "loss": 0.2359, "step": 2851 }, { "epoch": 1.5661724327292696, "grad_norm": 0.620705334260142, "learning_rate": 4.730532536224308e-07, "loss": 0.2556, "step": 2852 }, { "epoch": 1.5667215815485998, "grad_norm": 0.4966527226893142, "learning_rate": 4.7276325307609167e-07, "loss": 0.259, "step": 2853 }, { "epoch": 1.5672707303679299, "grad_norm": 0.5311382760062338, "learning_rate": 4.7247326173802443e-07, "loss": 0.2991, "step": 2854 }, { "epoch": 1.5678198791872597, "grad_norm": 0.5417483707486203, "learning_rate": 4.721832797060904e-07, "loss": 0.2616, "step": 2855 }, { "epoch": 1.5683690280065898, "grad_norm": 0.5628074090904469, "learning_rate": 4.718933070781476e-07, "loss": 0.2635, "step": 2856 }, { "epoch": 1.5689181768259197, "grad_norm": 0.43905958760313846, "learning_rate": 4.7160334395205145e-07, "loss": 0.2276, "step": 2857 }, { "epoch": 1.5694673256452498, "grad_norm": 0.4854982643494795, "learning_rate": 4.713133904256537e-07, "loss": 0.2446, "step": 2858 }, { "epoch": 1.57001647446458, "grad_norm": 0.42413907062845707, "learning_rate": 4.7102344659680295e-07, "loss": 0.251, "step": 2859 }, { "epoch": 1.57056562328391, "grad_norm": 0.5144845224316664, "learning_rate": 4.7073351256334485e-07, "loss": 0.2338, "step": 2860 }, { "epoch": 1.5711147721032401, "grad_norm": 0.4153310956832003, "learning_rate": 4.70443588423121e-07, "loss": 0.2533, "step": 2861 }, { "epoch": 1.57166392092257, "grad_norm": 0.46665532721938946, "learning_rate": 4.701536742739703e-07, "loss": 0.2849, "step": 2862 }, { "epoch": 1.5722130697419, "grad_norm": 0.5837429769653172, "learning_rate": 4.698637702137281e-07, "loss": 0.2501, "step": 2863 }, { "epoch": 1.57276221856123, "grad_norm": 0.5773208981833295, "learning_rate": 4.695738763402263e-07, "loss": 0.2606, "step": 2864 }, { "epoch": 1.57331136738056, "grad_norm": 0.5369744819112152, "learning_rate": 4.692839927512936e-07, "loss": 0.2297, "step": 2865 }, { "epoch": 1.5738605161998902, "grad_norm": 0.5004406673236037, "learning_rate": 4.689941195447549e-07, "loss": 0.261, "step": 2866 }, { "epoch": 1.5744096650192203, "grad_norm": 0.5129416586224244, "learning_rate": 4.6870425681843176e-07, "loss": 0.2501, "step": 2867 }, { "epoch": 1.5749588138385504, "grad_norm": 0.4909208400078727, "learning_rate": 4.6841440467014196e-07, "loss": 0.2621, "step": 2868 }, { "epoch": 1.5755079626578803, "grad_norm": 0.5365978804718112, "learning_rate": 4.6812456319770005e-07, "loss": 0.3015, "step": 2869 }, { "epoch": 1.5760571114772102, "grad_norm": 0.5630410598392107, "learning_rate": 4.6783473249891695e-07, "loss": 0.2805, "step": 2870 }, { "epoch": 1.5766062602965403, "grad_norm": 0.4858405245119762, "learning_rate": 4.6754491267160003e-07, "loss": 0.2371, "step": 2871 }, { "epoch": 1.5771554091158704, "grad_norm": 0.4915165415925142, "learning_rate": 4.672551038135523e-07, "loss": 0.2625, "step": 2872 }, { "epoch": 1.5777045579352005, "grad_norm": 0.5179868761116622, "learning_rate": 4.6696530602257377e-07, "loss": 0.2886, "step": 2873 }, { "epoch": 1.5782537067545306, "grad_norm": 0.5767617509982645, "learning_rate": 4.666755193964607e-07, "loss": 0.2261, "step": 2874 }, { "epoch": 1.5788028555738605, "grad_norm": 0.5371114574703987, "learning_rate": 4.663857440330052e-07, "loss": 0.2732, "step": 2875 }, { "epoch": 1.5793520043931906, "grad_norm": 0.5254404425452375, "learning_rate": 4.660959800299958e-07, "loss": 0.2201, "step": 2876 }, { "epoch": 1.5799011532125204, "grad_norm": 0.5920513853064838, "learning_rate": 4.658062274852177e-07, "loss": 0.2594, "step": 2877 }, { "epoch": 1.5804503020318506, "grad_norm": 0.45988148115272065, "learning_rate": 4.655164864964507e-07, "loss": 0.2548, "step": 2878 }, { "epoch": 1.5809994508511807, "grad_norm": 0.4890117316439837, "learning_rate": 4.6522675716147246e-07, "loss": 0.2612, "step": 2879 }, { "epoch": 1.5815485996705108, "grad_norm": 0.6594903247878485, "learning_rate": 4.6493703957805577e-07, "loss": 0.2809, "step": 2880 }, { "epoch": 1.5820977484898409, "grad_norm": 0.46956046295488574, "learning_rate": 4.6464733384396937e-07, "loss": 0.2512, "step": 2881 }, { "epoch": 1.5826468973091707, "grad_norm": 0.5311362838617913, "learning_rate": 4.643576400569788e-07, "loss": 0.2458, "step": 2882 }, { "epoch": 1.5831960461285008, "grad_norm": 0.4114939896171229, "learning_rate": 4.6406795831484474e-07, "loss": 0.2386, "step": 2883 }, { "epoch": 1.5837451949478307, "grad_norm": 0.4748702916828854, "learning_rate": 4.6377828871532406e-07, "loss": 0.2091, "step": 2884 }, { "epoch": 1.5842943437671608, "grad_norm": 0.4888204489371328, "learning_rate": 4.6348863135616967e-07, "loss": 0.2239, "step": 2885 }, { "epoch": 1.584843492586491, "grad_norm": 0.44056629172269635, "learning_rate": 4.631989863351301e-07, "loss": 0.2071, "step": 2886 }, { "epoch": 1.585392641405821, "grad_norm": 0.4337837133265149, "learning_rate": 4.629093537499501e-07, "loss": 0.2311, "step": 2887 }, { "epoch": 1.5859417902251511, "grad_norm": 0.5317649078035698, "learning_rate": 4.6261973369837e-07, "loss": 0.259, "step": 2888 }, { "epoch": 1.586490939044481, "grad_norm": 0.5759756900852464, "learning_rate": 4.623301262781257e-07, "loss": 0.2461, "step": 2889 }, { "epoch": 1.5870400878638111, "grad_norm": 0.4703292757241802, "learning_rate": 4.620405315869491e-07, "loss": 0.2526, "step": 2890 }, { "epoch": 1.587589236683141, "grad_norm": 0.5109377423745527, "learning_rate": 4.617509497225678e-07, "loss": 0.2369, "step": 2891 }, { "epoch": 1.588138385502471, "grad_norm": 0.4790144687244557, "learning_rate": 4.61461380782705e-07, "loss": 0.2666, "step": 2892 }, { "epoch": 1.5886875343218012, "grad_norm": 0.42019280365685124, "learning_rate": 4.6117182486507956e-07, "loss": 0.2445, "step": 2893 }, { "epoch": 1.5892366831411313, "grad_norm": 0.8250619793909048, "learning_rate": 4.60882282067406e-07, "loss": 0.2254, "step": 2894 }, { "epoch": 1.5897858319604614, "grad_norm": 0.5050084157595379, "learning_rate": 4.6059275248739403e-07, "loss": 0.2502, "step": 2895 }, { "epoch": 1.5903349807797913, "grad_norm": 0.6483260235065413, "learning_rate": 4.6030323622274955e-07, "loss": 0.2665, "step": 2896 }, { "epoch": 1.5908841295991214, "grad_norm": 0.9964039036691177, "learning_rate": 4.600137333711735e-07, "loss": 0.4358, "step": 2897 }, { "epoch": 1.5914332784184513, "grad_norm": 0.42513017428971134, "learning_rate": 4.5972424403036235e-07, "loss": 0.238, "step": 2898 }, { "epoch": 1.5919824272377814, "grad_norm": 0.4474820186996626, "learning_rate": 4.5943476829800855e-07, "loss": 0.2518, "step": 2899 }, { "epoch": 1.5925315760571115, "grad_norm": 0.5755466069582273, "learning_rate": 4.5914530627179874e-07, "loss": 0.2588, "step": 2900 }, { "epoch": 1.5930807248764416, "grad_norm": 0.5813029226034779, "learning_rate": 4.5885585804941625e-07, "loss": 0.2491, "step": 2901 }, { "epoch": 1.5936298736957717, "grad_norm": 0.5109848564477342, "learning_rate": 4.5856642372853897e-07, "loss": 0.2672, "step": 2902 }, { "epoch": 1.5941790225151016, "grad_norm": 0.5573530714024166, "learning_rate": 4.5827700340684033e-07, "loss": 0.2252, "step": 2903 }, { "epoch": 1.5947281713344317, "grad_norm": 0.5316256991857502, "learning_rate": 4.579875971819892e-07, "loss": 0.2401, "step": 2904 }, { "epoch": 1.5952773201537616, "grad_norm": 0.45888881091440503, "learning_rate": 4.576982051516494e-07, "loss": 0.2523, "step": 2905 }, { "epoch": 1.5958264689730917, "grad_norm": 0.600405516725246, "learning_rate": 4.5740882741348003e-07, "loss": 0.2582, "step": 2906 }, { "epoch": 1.5963756177924218, "grad_norm": 0.5128721216329234, "learning_rate": 4.5711946406513537e-07, "loss": 0.2937, "step": 2907 }, { "epoch": 1.5969247666117519, "grad_norm": 0.4440745041487263, "learning_rate": 4.56830115204265e-07, "loss": 0.2338, "step": 2908 }, { "epoch": 1.597473915431082, "grad_norm": 0.47602391207624395, "learning_rate": 4.5654078092851355e-07, "loss": 0.1956, "step": 2909 }, { "epoch": 1.5980230642504119, "grad_norm": 0.49377983855292046, "learning_rate": 4.562514613355207e-07, "loss": 0.2515, "step": 2910 }, { "epoch": 1.598572213069742, "grad_norm": 0.4529738345543638, "learning_rate": 4.559621565229209e-07, "loss": 0.2605, "step": 2911 }, { "epoch": 1.5991213618890718, "grad_norm": 0.4692302722186548, "learning_rate": 4.55672866588344e-07, "loss": 0.249, "step": 2912 }, { "epoch": 1.599670510708402, "grad_norm": 0.4241445715442823, "learning_rate": 4.553835916294147e-07, "loss": 0.2417, "step": 2913 }, { "epoch": 1.600219659527732, "grad_norm": 0.46085066528851054, "learning_rate": 4.550943317437527e-07, "loss": 0.2092, "step": 2914 }, { "epoch": 1.6007688083470621, "grad_norm": 0.7157498983630428, "learning_rate": 4.5480508702897244e-07, "loss": 0.2866, "step": 2915 }, { "epoch": 1.6013179571663922, "grad_norm": 0.47809016786972625, "learning_rate": 4.545158575826838e-07, "loss": 0.2467, "step": 2916 }, { "epoch": 1.6018671059857221, "grad_norm": 0.5618657293761034, "learning_rate": 4.5422664350249024e-07, "loss": 0.2364, "step": 2917 }, { "epoch": 1.602416254805052, "grad_norm": 0.43233356556320074, "learning_rate": 4.539374448859915e-07, "loss": 0.2402, "step": 2918 }, { "epoch": 1.6029654036243821, "grad_norm": 0.7321003951557213, "learning_rate": 4.536482618307813e-07, "loss": 0.3063, "step": 2919 }, { "epoch": 1.6035145524437122, "grad_norm": 0.43415863774815755, "learning_rate": 4.5335909443444804e-07, "loss": 0.2425, "step": 2920 }, { "epoch": 1.6040637012630423, "grad_norm": 0.5596418258407864, "learning_rate": 4.530699427945755e-07, "loss": 0.2438, "step": 2921 }, { "epoch": 1.6046128500823724, "grad_norm": 0.3735831192460479, "learning_rate": 4.5278080700874135e-07, "loss": 0.2149, "step": 2922 }, { "epoch": 1.6051619989017025, "grad_norm": 0.5462719096950205, "learning_rate": 4.5249168717451836e-07, "loss": 0.2358, "step": 2923 }, { "epoch": 1.6057111477210324, "grad_norm": 0.4586230731201346, "learning_rate": 4.522025833894739e-07, "loss": 0.2801, "step": 2924 }, { "epoch": 1.6062602965403623, "grad_norm": 0.5270734219029176, "learning_rate": 4.519134957511697e-07, "loss": 0.2956, "step": 2925 }, { "epoch": 1.6068094453596924, "grad_norm": 0.4319738323864442, "learning_rate": 4.516244243571623e-07, "loss": 0.2584, "step": 2926 }, { "epoch": 1.6073585941790225, "grad_norm": 0.5245532255376102, "learning_rate": 4.5133536930500275e-07, "loss": 0.2119, "step": 2927 }, { "epoch": 1.6079077429983526, "grad_norm": 0.5092278036719082, "learning_rate": 4.5104633069223623e-07, "loss": 0.2885, "step": 2928 }, { "epoch": 1.6084568918176827, "grad_norm": 0.5176222098633811, "learning_rate": 4.5075730861640263e-07, "loss": 0.2648, "step": 2929 }, { "epoch": 1.6090060406370126, "grad_norm": 0.47960674212322696, "learning_rate": 4.504683031750365e-07, "loss": 0.3016, "step": 2930 }, { "epoch": 1.6095551894563427, "grad_norm": 0.6056835781585965, "learning_rate": 4.5017931446566645e-07, "loss": 0.2549, "step": 2931 }, { "epoch": 1.6101043382756726, "grad_norm": 0.5065387766463945, "learning_rate": 4.4989034258581554e-07, "loss": 0.2848, "step": 2932 }, { "epoch": 1.6106534870950027, "grad_norm": 0.4253787830554584, "learning_rate": 4.496013876330009e-07, "loss": 0.2213, "step": 2933 }, { "epoch": 1.6112026359143328, "grad_norm": 0.4924427881477454, "learning_rate": 4.493124497047343e-07, "loss": 0.2763, "step": 2934 }, { "epoch": 1.6117517847336629, "grad_norm": 0.5894542706753981, "learning_rate": 4.490235288985218e-07, "loss": 0.2547, "step": 2935 }, { "epoch": 1.612300933552993, "grad_norm": 0.4289044751118621, "learning_rate": 4.4873462531186336e-07, "loss": 0.2465, "step": 2936 }, { "epoch": 1.6128500823723229, "grad_norm": 0.4284522949866835, "learning_rate": 4.484457390422533e-07, "loss": 0.256, "step": 2937 }, { "epoch": 1.613399231191653, "grad_norm": 0.46967226013457086, "learning_rate": 4.4815687018718034e-07, "loss": 0.2395, "step": 2938 }, { "epoch": 1.6139483800109828, "grad_norm": 0.5372992493207225, "learning_rate": 4.478680188441268e-07, "loss": 0.2456, "step": 2939 }, { "epoch": 1.614497528830313, "grad_norm": 0.4558068624278018, "learning_rate": 4.475791851105694e-07, "loss": 0.2242, "step": 2940 }, { "epoch": 1.615046677649643, "grad_norm": 0.5065016767222446, "learning_rate": 4.4729036908397897e-07, "loss": 0.2326, "step": 2941 }, { "epoch": 1.6155958264689732, "grad_norm": 0.49429867215469064, "learning_rate": 4.470015708618202e-07, "loss": 0.2504, "step": 2942 }, { "epoch": 1.6161449752883033, "grad_norm": 0.5180948292580143, "learning_rate": 4.4671279054155196e-07, "loss": 0.2364, "step": 2943 }, { "epoch": 1.6166941241076331, "grad_norm": 0.4372385560713874, "learning_rate": 4.4642402822062693e-07, "loss": 0.1974, "step": 2944 }, { "epoch": 1.6172432729269632, "grad_norm": 0.6174486625859185, "learning_rate": 4.461352839964916e-07, "loss": 0.3246, "step": 2945 }, { "epoch": 1.6177924217462931, "grad_norm": 0.4371823318333394, "learning_rate": 4.458465579665866e-07, "loss": 0.2382, "step": 2946 }, { "epoch": 1.6183415705656232, "grad_norm": 0.38280662823048006, "learning_rate": 4.455578502283465e-07, "loss": 0.2617, "step": 2947 }, { "epoch": 1.6188907193849533, "grad_norm": 0.5114612302213691, "learning_rate": 4.452691608791994e-07, "loss": 0.2317, "step": 2948 }, { "epoch": 1.6194398682042834, "grad_norm": 0.5964975038278817, "learning_rate": 4.449804900165673e-07, "loss": 0.2241, "step": 2949 }, { "epoch": 1.6199890170236135, "grad_norm": 0.4855671845593758, "learning_rate": 4.44691837737866e-07, "loss": 0.2514, "step": 2950 }, { "epoch": 1.6205381658429434, "grad_norm": 0.7675216385164546, "learning_rate": 4.444032041405049e-07, "loss": 0.3687, "step": 2951 }, { "epoch": 1.6210873146622735, "grad_norm": 0.4842077014594088, "learning_rate": 4.441145893218873e-07, "loss": 0.2206, "step": 2952 }, { "epoch": 1.6216364634816034, "grad_norm": 0.4759565469441726, "learning_rate": 4.4382599337941014e-07, "loss": 0.2446, "step": 2953 }, { "epoch": 1.6221856123009335, "grad_norm": 0.45472033527722416, "learning_rate": 4.435374164104639e-07, "loss": 0.293, "step": 2954 }, { "epoch": 1.6227347611202636, "grad_norm": 0.45031174769397214, "learning_rate": 4.432488585124326e-07, "loss": 0.2354, "step": 2955 }, { "epoch": 1.6232839099395937, "grad_norm": 0.5195973926986616, "learning_rate": 4.429603197826938e-07, "loss": 0.2423, "step": 2956 }, { "epoch": 1.6238330587589238, "grad_norm": 0.5096366177397833, "learning_rate": 4.426718003186189e-07, "loss": 0.2297, "step": 2957 }, { "epoch": 1.6243822075782537, "grad_norm": 0.5905113957478257, "learning_rate": 4.4238330021757256e-07, "loss": 0.2547, "step": 2958 }, { "epoch": 1.6249313563975838, "grad_norm": 0.4565435197256595, "learning_rate": 4.420948195769127e-07, "loss": 0.2542, "step": 2959 }, { "epoch": 1.6254805052169137, "grad_norm": 0.455943454240101, "learning_rate": 4.4180635849399134e-07, "loss": 0.243, "step": 2960 }, { "epoch": 1.6260296540362438, "grad_norm": 0.4164779031807216, "learning_rate": 4.415179170661532e-07, "loss": 0.2381, "step": 2961 }, { "epoch": 1.6265788028555739, "grad_norm": 0.595340020939723, "learning_rate": 4.412294953907365e-07, "loss": 0.2429, "step": 2962 }, { "epoch": 1.627127951674904, "grad_norm": 0.6312148140574662, "learning_rate": 4.4094109356507307e-07, "loss": 0.2387, "step": 2963 }, { "epoch": 1.627677100494234, "grad_norm": 0.4595678941474588, "learning_rate": 4.40652711686488e-07, "loss": 0.2327, "step": 2964 }, { "epoch": 1.628226249313564, "grad_norm": 0.5612732792189951, "learning_rate": 4.403643498522996e-07, "loss": 0.2236, "step": 2965 }, { "epoch": 1.6287753981328938, "grad_norm": 0.4130375784010273, "learning_rate": 4.400760081598191e-07, "loss": 0.261, "step": 2966 }, { "epoch": 1.629324546952224, "grad_norm": 0.80573332259453, "learning_rate": 4.397876867063512e-07, "loss": 0.2718, "step": 2967 }, { "epoch": 1.629873695771554, "grad_norm": 0.5073023708688011, "learning_rate": 4.3949938558919403e-07, "loss": 0.2437, "step": 2968 }, { "epoch": 1.6304228445908842, "grad_norm": 0.6603871450293803, "learning_rate": 4.392111049056385e-07, "loss": 0.2148, "step": 2969 }, { "epoch": 1.6309719934102143, "grad_norm": 0.583614769125041, "learning_rate": 4.3892284475296857e-07, "loss": 0.2409, "step": 2970 }, { "epoch": 1.6315211422295444, "grad_norm": 0.5203909843693727, "learning_rate": 4.3863460522846176e-07, "loss": 0.236, "step": 2971 }, { "epoch": 1.6320702910488742, "grad_norm": 0.5894065647013814, "learning_rate": 4.3834638642938786e-07, "loss": 0.2718, "step": 2972 }, { "epoch": 1.6326194398682041, "grad_norm": 0.4757715836098419, "learning_rate": 4.3805818845301025e-07, "loss": 0.277, "step": 2973 }, { "epoch": 1.6331685886875342, "grad_norm": 0.6137710197467259, "learning_rate": 4.3777001139658524e-07, "loss": 0.2649, "step": 2974 }, { "epoch": 1.6337177375068643, "grad_norm": 0.45116394860291625, "learning_rate": 4.3748185535736196e-07, "loss": 0.241, "step": 2975 }, { "epoch": 1.6342668863261944, "grad_norm": 0.5311190405784784, "learning_rate": 4.3719372043258254e-07, "loss": 0.2415, "step": 2976 }, { "epoch": 1.6348160351455245, "grad_norm": 0.6620057299676249, "learning_rate": 4.369056067194815e-07, "loss": 0.2432, "step": 2977 }, { "epoch": 1.6353651839648546, "grad_norm": 0.4879195016935656, "learning_rate": 4.3661751431528703e-07, "loss": 0.2338, "step": 2978 }, { "epoch": 1.6359143327841845, "grad_norm": 0.5464755861012252, "learning_rate": 4.3632944331721954e-07, "loss": 0.2476, "step": 2979 }, { "epoch": 1.6364634816035144, "grad_norm": 0.5635139062204975, "learning_rate": 4.3604139382249224e-07, "loss": 0.2515, "step": 2980 }, { "epoch": 1.6370126304228445, "grad_norm": 0.5224059360813856, "learning_rate": 4.3575336592831153e-07, "loss": 0.2449, "step": 2981 }, { "epoch": 1.6375617792421746, "grad_norm": 0.5869731629868171, "learning_rate": 4.3546535973187603e-07, "loss": 0.2375, "step": 2982 }, { "epoch": 1.6381109280615047, "grad_norm": 0.4615835679590841, "learning_rate": 4.351773753303772e-07, "loss": 0.2849, "step": 2983 }, { "epoch": 1.6386600768808348, "grad_norm": 0.5504512368035249, "learning_rate": 4.3488941282099927e-07, "loss": 0.2296, "step": 2984 }, { "epoch": 1.6392092257001647, "grad_norm": 0.46984498389223545, "learning_rate": 4.346014723009188e-07, "loss": 0.2399, "step": 2985 }, { "epoch": 1.6397583745194948, "grad_norm": 0.55724959746297, "learning_rate": 4.3431355386730536e-07, "loss": 0.2436, "step": 2986 }, { "epoch": 1.6403075233388247, "grad_norm": 0.49595845804379907, "learning_rate": 4.3402565761732063e-07, "loss": 0.2256, "step": 2987 }, { "epoch": 1.6408566721581548, "grad_norm": 0.5661298030128081, "learning_rate": 4.337377836481191e-07, "loss": 0.2426, "step": 2988 }, { "epoch": 1.6414058209774849, "grad_norm": 0.5694723374714927, "learning_rate": 4.334499320568474e-07, "loss": 0.2575, "step": 2989 }, { "epoch": 1.641954969796815, "grad_norm": 0.45196041404453813, "learning_rate": 4.3316210294064496e-07, "loss": 0.2122, "step": 2990 }, { "epoch": 1.642504118616145, "grad_norm": 0.464440428690339, "learning_rate": 4.328742963966437e-07, "loss": 0.2192, "step": 2991 }, { "epoch": 1.643053267435475, "grad_norm": 0.532738911522231, "learning_rate": 4.325865125219675e-07, "loss": 0.2703, "step": 2992 }, { "epoch": 1.643602416254805, "grad_norm": 0.5140227276067537, "learning_rate": 4.322987514137331e-07, "loss": 0.2321, "step": 2993 }, { "epoch": 1.644151565074135, "grad_norm": 0.5598448935949341, "learning_rate": 4.320110131690487e-07, "loss": 0.3058, "step": 2994 }, { "epoch": 1.644700713893465, "grad_norm": 0.6395590360057778, "learning_rate": 4.317232978850159e-07, "loss": 0.2661, "step": 2995 }, { "epoch": 1.6452498627127952, "grad_norm": 0.4245140397625883, "learning_rate": 4.314356056587279e-07, "loss": 0.2654, "step": 2996 }, { "epoch": 1.6457990115321253, "grad_norm": 0.6458112749863569, "learning_rate": 4.311479365872699e-07, "loss": 0.2059, "step": 2997 }, { "epoch": 1.6463481603514554, "grad_norm": 0.4576900188920051, "learning_rate": 4.3086029076772025e-07, "loss": 0.2247, "step": 2998 }, { "epoch": 1.6468973091707853, "grad_norm": 0.4685017041085208, "learning_rate": 4.305726682971481e-07, "loss": 0.2431, "step": 2999 }, { "epoch": 1.6474464579901154, "grad_norm": 0.5015590452693746, "learning_rate": 4.302850692726159e-07, "loss": 0.2525, "step": 3000 }, { "epoch": 1.6474464579901154, "eval_loss": 0.32768309116363525, "eval_runtime": 18.6661, "eval_samples_per_second": 23.733, "eval_steps_per_second": 1.018, "step": 3000 }, { "epoch": 1.6479956068094452, "grad_norm": 0.5321880247918745, "learning_rate": 4.2999749379117755e-07, "loss": 0.226, "step": 3001 }, { "epoch": 1.6485447556287753, "grad_norm": 0.5016436617857616, "learning_rate": 4.2970994194987916e-07, "loss": 0.2387, "step": 3002 }, { "epoch": 1.6490939044481054, "grad_norm": 0.4711429385206377, "learning_rate": 4.29422413845759e-07, "loss": 0.2668, "step": 3003 }, { "epoch": 1.6496430532674355, "grad_norm": 0.5024320098710312, "learning_rate": 4.2913490957584725e-07, "loss": 0.2489, "step": 3004 }, { "epoch": 1.6501922020867656, "grad_norm": 0.5101561852971753, "learning_rate": 4.2884742923716586e-07, "loss": 0.239, "step": 3005 }, { "epoch": 1.6507413509060955, "grad_norm": 0.44747282520626047, "learning_rate": 4.285599729267289e-07, "loss": 0.2242, "step": 3006 }, { "epoch": 1.6512904997254256, "grad_norm": 0.5446473872220932, "learning_rate": 4.2827254074154226e-07, "loss": 0.2545, "step": 3007 }, { "epoch": 1.6518396485447555, "grad_norm": 0.4644172645656962, "learning_rate": 4.279851327786038e-07, "loss": 0.3082, "step": 3008 }, { "epoch": 1.6523887973640856, "grad_norm": 0.38494898532173805, "learning_rate": 4.276977491349031e-07, "loss": 0.2493, "step": 3009 }, { "epoch": 1.6529379461834157, "grad_norm": 0.5527168698358322, "learning_rate": 4.274103899074215e-07, "loss": 0.2193, "step": 3010 }, { "epoch": 1.6534870950027458, "grad_norm": 0.5531703585469031, "learning_rate": 4.27123055193132e-07, "loss": 0.2681, "step": 3011 }, { "epoch": 1.654036243822076, "grad_norm": 0.8028292194172078, "learning_rate": 4.268357450889998e-07, "loss": 0.2343, "step": 3012 }, { "epoch": 1.6545853926414058, "grad_norm": 0.5005892526439495, "learning_rate": 4.2654845969198133e-07, "loss": 0.2488, "step": 3013 }, { "epoch": 1.655134541460736, "grad_norm": 0.4240992579928759, "learning_rate": 4.262611990990247e-07, "loss": 0.2698, "step": 3014 }, { "epoch": 1.6556836902800658, "grad_norm": 0.5390274157023256, "learning_rate": 4.2597396340707024e-07, "loss": 0.2347, "step": 3015 }, { "epoch": 1.656232839099396, "grad_norm": 0.407457690983785, "learning_rate": 4.256867527130487e-07, "loss": 0.2454, "step": 3016 }, { "epoch": 1.656781987918726, "grad_norm": 0.567694031133933, "learning_rate": 4.2539956711388363e-07, "loss": 0.2617, "step": 3017 }, { "epoch": 1.657331136738056, "grad_norm": 0.5975179865111244, "learning_rate": 4.251124067064895e-07, "loss": 0.254, "step": 3018 }, { "epoch": 1.6578802855573862, "grad_norm": 0.5648313514852326, "learning_rate": 4.248252715877722e-07, "loss": 0.2823, "step": 3019 }, { "epoch": 1.658429434376716, "grad_norm": 0.48146795297414174, "learning_rate": 4.245381618546296e-07, "loss": 0.2656, "step": 3020 }, { "epoch": 1.658978583196046, "grad_norm": 0.5884997084633288, "learning_rate": 4.242510776039501e-07, "loss": 0.247, "step": 3021 }, { "epoch": 1.659527732015376, "grad_norm": 0.40603972320622483, "learning_rate": 4.2396401893261457e-07, "loss": 0.282, "step": 3022 }, { "epoch": 1.6600768808347062, "grad_norm": 0.4674583345888558, "learning_rate": 4.2367698593749444e-07, "loss": 0.2364, "step": 3023 }, { "epoch": 1.6606260296540363, "grad_norm": 0.5658847248246237, "learning_rate": 4.233899787154529e-07, "loss": 0.2607, "step": 3024 }, { "epoch": 1.6611751784733664, "grad_norm": 0.4368981402855396, "learning_rate": 4.2310299736334435e-07, "loss": 0.2381, "step": 3025 }, { "epoch": 1.6617243272926965, "grad_norm": 0.5643001206864895, "learning_rate": 4.228160419780145e-07, "loss": 0.2748, "step": 3026 }, { "epoch": 1.6622734761120264, "grad_norm": 0.43976914199520706, "learning_rate": 4.225291126562999e-07, "loss": 0.2514, "step": 3027 }, { "epoch": 1.6628226249313562, "grad_norm": 0.4494410236378608, "learning_rate": 4.2224220949502873e-07, "loss": 0.25, "step": 3028 }, { "epoch": 1.6633717737506863, "grad_norm": 0.5527842355106057, "learning_rate": 4.2195533259102053e-07, "loss": 0.2795, "step": 3029 }, { "epoch": 1.6639209225700164, "grad_norm": 0.5640588007644353, "learning_rate": 4.2166848204108527e-07, "loss": 0.2404, "step": 3030 }, { "epoch": 1.6644700713893466, "grad_norm": 0.6545184724717047, "learning_rate": 4.213816579420249e-07, "loss": 0.2909, "step": 3031 }, { "epoch": 1.6650192202086767, "grad_norm": 0.5910175990205282, "learning_rate": 4.2109486039063155e-07, "loss": 0.2251, "step": 3032 }, { "epoch": 1.6655683690280065, "grad_norm": 0.5683617619745938, "learning_rate": 4.208080894836891e-07, "loss": 0.291, "step": 3033 }, { "epoch": 1.6661175178473366, "grad_norm": 0.4400071920958627, "learning_rate": 4.2052134531797195e-07, "loss": 0.2845, "step": 3034 }, { "epoch": 1.6666666666666665, "grad_norm": 0.664955262231542, "learning_rate": 4.2023462799024594e-07, "loss": 0.2823, "step": 3035 }, { "epoch": 1.6672158154859966, "grad_norm": 0.4348603489456307, "learning_rate": 4.199479375972673e-07, "loss": 0.2295, "step": 3036 }, { "epoch": 1.6677649643053267, "grad_norm": 0.5921591066388595, "learning_rate": 4.19661274235784e-07, "loss": 0.2268, "step": 3037 }, { "epoch": 1.6683141131246568, "grad_norm": 0.46590129443327344, "learning_rate": 4.193746380025338e-07, "loss": 0.2378, "step": 3038 }, { "epoch": 1.668863261943987, "grad_norm": 0.5381341974645318, "learning_rate": 4.1908802899424613e-07, "loss": 0.2619, "step": 3039 }, { "epoch": 1.6694124107633168, "grad_norm": 0.47844780942334575, "learning_rate": 4.1880144730764096e-07, "loss": 0.2662, "step": 3040 }, { "epoch": 1.669961559582647, "grad_norm": 0.44402481051341247, "learning_rate": 4.185148930394288e-07, "loss": 0.2621, "step": 3041 }, { "epoch": 1.6705107084019768, "grad_norm": 0.4071245777064109, "learning_rate": 4.1822836628631176e-07, "loss": 0.2404, "step": 3042 }, { "epoch": 1.671059857221307, "grad_norm": 0.4723422532810768, "learning_rate": 4.1794186714498154e-07, "loss": 0.2385, "step": 3043 }, { "epoch": 1.671609006040637, "grad_norm": 0.4566559495746314, "learning_rate": 4.176553957121211e-07, "loss": 0.236, "step": 3044 }, { "epoch": 1.672158154859967, "grad_norm": 0.4081581679093222, "learning_rate": 4.17368952084404e-07, "loss": 0.2489, "step": 3045 }, { "epoch": 1.6727073036792972, "grad_norm": 0.5154388675068508, "learning_rate": 4.1708253635849464e-07, "loss": 0.2631, "step": 3046 }, { "epoch": 1.673256452498627, "grad_norm": 0.4720196217539502, "learning_rate": 4.167961486310477e-07, "loss": 0.2293, "step": 3047 }, { "epoch": 1.6738056013179572, "grad_norm": 0.3739883659854765, "learning_rate": 4.165097889987085e-07, "loss": 0.2627, "step": 3048 }, { "epoch": 1.674354750137287, "grad_norm": 0.4986845999349851, "learning_rate": 4.162234575581126e-07, "loss": 0.2416, "step": 3049 }, { "epoch": 1.6749038989566172, "grad_norm": 0.4291976326128812, "learning_rate": 4.159371544058864e-07, "loss": 0.2425, "step": 3050 }, { "epoch": 1.6754530477759473, "grad_norm": 0.47218705680666195, "learning_rate": 4.156508796386469e-07, "loss": 0.2339, "step": 3051 }, { "epoch": 1.6760021965952774, "grad_norm": 0.48858651592646163, "learning_rate": 4.153646333530012e-07, "loss": 0.2503, "step": 3052 }, { "epoch": 1.6765513454146075, "grad_norm": 0.4245944408227513, "learning_rate": 4.150784156455469e-07, "loss": 0.2271, "step": 3053 }, { "epoch": 1.6771004942339374, "grad_norm": 0.4755699562336762, "learning_rate": 4.147922266128718e-07, "loss": 0.2638, "step": 3054 }, { "epoch": 1.6776496430532675, "grad_norm": 0.44170874335767984, "learning_rate": 4.14506066351554e-07, "loss": 0.2599, "step": 3055 }, { "epoch": 1.6781987918725974, "grad_norm": 0.5659207277110574, "learning_rate": 4.1421993495816244e-07, "loss": 0.2634, "step": 3056 }, { "epoch": 1.6787479406919275, "grad_norm": 0.41536777601673885, "learning_rate": 4.1393383252925576e-07, "loss": 0.219, "step": 3057 }, { "epoch": 1.6792970895112576, "grad_norm": 0.5545571789442999, "learning_rate": 4.1364775916138283e-07, "loss": 0.2945, "step": 3058 }, { "epoch": 1.6798462383305877, "grad_norm": 0.5636197241564115, "learning_rate": 4.133617149510832e-07, "loss": 0.3089, "step": 3059 }, { "epoch": 1.6803953871499178, "grad_norm": 0.4611510670906123, "learning_rate": 4.1307569999488594e-07, "loss": 0.2348, "step": 3060 }, { "epoch": 1.6809445359692476, "grad_norm": 0.5169147010074834, "learning_rate": 4.127897143893108e-07, "loss": 0.2886, "step": 3061 }, { "epoch": 1.6814936847885777, "grad_norm": 0.44274446558278335, "learning_rate": 4.1250375823086714e-07, "loss": 0.2342, "step": 3062 }, { "epoch": 1.6820428336079076, "grad_norm": 0.4768397816729602, "learning_rate": 4.1221783161605483e-07, "loss": 0.2369, "step": 3063 }, { "epoch": 1.6825919824272377, "grad_norm": 0.5264918258891385, "learning_rate": 4.119319346413636e-07, "loss": 0.2602, "step": 3064 }, { "epoch": 1.6831411312465678, "grad_norm": 0.6069052234793488, "learning_rate": 4.11646067403273e-07, "loss": 0.2749, "step": 3065 }, { "epoch": 1.683690280065898, "grad_norm": 0.42850816209249276, "learning_rate": 4.113602299982527e-07, "loss": 0.2609, "step": 3066 }, { "epoch": 1.684239428885228, "grad_norm": 0.4603559031022033, "learning_rate": 4.1107442252276225e-07, "loss": 0.2471, "step": 3067 }, { "epoch": 1.684788577704558, "grad_norm": 0.5446735982756209, "learning_rate": 4.107886450732513e-07, "loss": 0.2756, "step": 3068 }, { "epoch": 1.685337726523888, "grad_norm": 0.5394376541359203, "learning_rate": 4.1050289774615916e-07, "loss": 0.2377, "step": 3069 }, { "epoch": 1.685886875343218, "grad_norm": 0.45254873948708396, "learning_rate": 4.102171806379151e-07, "loss": 0.2346, "step": 3070 }, { "epoch": 1.686436024162548, "grad_norm": 0.5343895926456478, "learning_rate": 4.099314938449379e-07, "loss": 0.2701, "step": 3071 }, { "epoch": 1.6869851729818781, "grad_norm": 0.5108232989065905, "learning_rate": 4.0964583746363635e-07, "loss": 0.2409, "step": 3072 }, { "epoch": 1.6875343218012082, "grad_norm": 0.5231078379118408, "learning_rate": 4.0936021159040915e-07, "loss": 0.2786, "step": 3073 }, { "epoch": 1.6880834706205383, "grad_norm": 0.5152354640813975, "learning_rate": 4.0907461632164447e-07, "loss": 0.2575, "step": 3074 }, { "epoch": 1.6886326194398682, "grad_norm": 0.5472219437014276, "learning_rate": 4.087890517537202e-07, "loss": 0.2332, "step": 3075 }, { "epoch": 1.689181768259198, "grad_norm": 0.38895894219446736, "learning_rate": 4.085035179830036e-07, "loss": 0.2304, "step": 3076 }, { "epoch": 1.6897309170785282, "grad_norm": 0.5220282932782652, "learning_rate": 4.0821801510585205e-07, "loss": 0.2685, "step": 3077 }, { "epoch": 1.6902800658978583, "grad_norm": 0.5260645108427713, "learning_rate": 4.079325432186122e-07, "loss": 0.2433, "step": 3078 }, { "epoch": 1.6908292147171884, "grad_norm": 0.5034220690315326, "learning_rate": 4.076471024176202e-07, "loss": 0.2558, "step": 3079 }, { "epoch": 1.6913783635365185, "grad_norm": 0.5551764237867175, "learning_rate": 4.07361692799202e-07, "loss": 0.2574, "step": 3080 }, { "epoch": 1.6919275123558486, "grad_norm": 0.5404895842846217, "learning_rate": 4.070763144596729e-07, "loss": 0.2607, "step": 3081 }, { "epoch": 1.6924766611751785, "grad_norm": 0.5640804741252682, "learning_rate": 4.067909674953373e-07, "loss": 0.2453, "step": 3082 }, { "epoch": 1.6930258099945084, "grad_norm": 0.5786779535412679, "learning_rate": 4.0650565200248933e-07, "loss": 0.2513, "step": 3083 }, { "epoch": 1.6935749588138385, "grad_norm": 0.4598450893297382, "learning_rate": 4.062203680774124e-07, "loss": 0.2418, "step": 3084 }, { "epoch": 1.6941241076331686, "grad_norm": 0.44104779650131115, "learning_rate": 4.059351158163796e-07, "loss": 0.2484, "step": 3085 }, { "epoch": 1.6946732564524987, "grad_norm": 0.47550985987042466, "learning_rate": 4.056498953156529e-07, "loss": 0.238, "step": 3086 }, { "epoch": 1.6952224052718288, "grad_norm": 0.6137502707603291, "learning_rate": 4.053647066714837e-07, "loss": 0.2755, "step": 3087 }, { "epoch": 1.6957715540911587, "grad_norm": 0.5543583922579137, "learning_rate": 4.0507954998011265e-07, "loss": 0.2988, "step": 3088 }, { "epoch": 1.6963207029104888, "grad_norm": 0.5893322682434013, "learning_rate": 4.0479442533776955e-07, "loss": 0.2442, "step": 3089 }, { "epoch": 1.6968698517298186, "grad_norm": 0.45276982487131684, "learning_rate": 4.0450933284067366e-07, "loss": 0.2465, "step": 3090 }, { "epoch": 1.6974190005491487, "grad_norm": 0.5278766670600922, "learning_rate": 4.042242725850331e-07, "loss": 0.2502, "step": 3091 }, { "epoch": 1.6979681493684788, "grad_norm": 0.4758375739314072, "learning_rate": 4.0393924466704534e-07, "loss": 0.2442, "step": 3092 }, { "epoch": 1.698517298187809, "grad_norm": 0.5885298342925978, "learning_rate": 4.0365424918289644e-07, "loss": 0.2368, "step": 3093 }, { "epoch": 1.699066447007139, "grad_norm": 0.5152674847901895, "learning_rate": 4.0336928622876215e-07, "loss": 0.2194, "step": 3094 }, { "epoch": 1.699615595826469, "grad_norm": 0.49793174083045577, "learning_rate": 4.0308435590080705e-07, "loss": 0.2816, "step": 3095 }, { "epoch": 1.700164744645799, "grad_norm": 0.4300896716494866, "learning_rate": 4.0279945829518423e-07, "loss": 0.231, "step": 3096 }, { "epoch": 1.700713893465129, "grad_norm": 0.49250786620509346, "learning_rate": 4.025145935080368e-07, "loss": 0.258, "step": 3097 }, { "epoch": 1.701263042284459, "grad_norm": 0.5053087407780013, "learning_rate": 4.0222976163549536e-07, "loss": 0.2708, "step": 3098 }, { "epoch": 1.7018121911037891, "grad_norm": 0.7154528004056316, "learning_rate": 4.0194496277368075e-07, "loss": 0.2989, "step": 3099 }, { "epoch": 1.7023613399231192, "grad_norm": 0.5934695872602223, "learning_rate": 4.0166019701870184e-07, "loss": 0.2905, "step": 3100 }, { "epoch": 1.7029104887424493, "grad_norm": 0.5199988431031511, "learning_rate": 4.0137546446665647e-07, "loss": 0.2362, "step": 3101 }, { "epoch": 1.7034596375617792, "grad_norm": 0.4673650758981907, "learning_rate": 4.010907652136318e-07, "loss": 0.2466, "step": 3102 }, { "epoch": 1.7040087863811093, "grad_norm": 0.44362318001821655, "learning_rate": 4.008060993557031e-07, "loss": 0.2822, "step": 3103 }, { "epoch": 1.7045579352004392, "grad_norm": 0.4690346733420638, "learning_rate": 4.005214669889345e-07, "loss": 0.2859, "step": 3104 }, { "epoch": 1.7051070840197693, "grad_norm": 0.5514408605871878, "learning_rate": 4.0023686820937904e-07, "loss": 0.2279, "step": 3105 }, { "epoch": 1.7056562328390994, "grad_norm": 0.4465984950657351, "learning_rate": 3.999523031130782e-07, "loss": 0.2262, "step": 3106 }, { "epoch": 1.7062053816584295, "grad_norm": 0.5291215831599918, "learning_rate": 3.996677717960624e-07, "loss": 0.2255, "step": 3107 }, { "epoch": 1.7067545304777596, "grad_norm": 0.46027985056910864, "learning_rate": 3.993832743543506e-07, "loss": 0.2574, "step": 3108 }, { "epoch": 1.7073036792970895, "grad_norm": 0.5165453949960084, "learning_rate": 3.990988108839499e-07, "loss": 0.2551, "step": 3109 }, { "epoch": 1.7078528281164196, "grad_norm": 0.4490921720324647, "learning_rate": 3.988143814808562e-07, "loss": 0.2299, "step": 3110 }, { "epoch": 1.7084019769357495, "grad_norm": 0.5796011417052417, "learning_rate": 3.985299862410542e-07, "loss": 0.307, "step": 3111 }, { "epoch": 1.7089511257550796, "grad_norm": 0.5328523502035938, "learning_rate": 3.9824562526051676e-07, "loss": 0.2241, "step": 3112 }, { "epoch": 1.7095002745744097, "grad_norm": 0.4315466244748324, "learning_rate": 3.9796129863520525e-07, "loss": 0.2432, "step": 3113 }, { "epoch": 1.7100494233937398, "grad_norm": 0.5330556906878818, "learning_rate": 3.976770064610694e-07, "loss": 0.2579, "step": 3114 }, { "epoch": 1.7105985722130699, "grad_norm": 0.6080950836886879, "learning_rate": 3.973927488340471e-07, "loss": 0.2288, "step": 3115 }, { "epoch": 1.7111477210323998, "grad_norm": 0.45139733692259465, "learning_rate": 3.971085258500652e-07, "loss": 0.2501, "step": 3116 }, { "epoch": 1.7116968698517299, "grad_norm": 0.6968899326731548, "learning_rate": 3.9682433760503837e-07, "loss": 0.2212, "step": 3117 }, { "epoch": 1.7122460186710597, "grad_norm": 0.5001687952993538, "learning_rate": 3.965401841948694e-07, "loss": 0.232, "step": 3118 }, { "epoch": 1.7127951674903898, "grad_norm": 0.42450290855908773, "learning_rate": 3.9625606571545024e-07, "loss": 0.2794, "step": 3119 }, { "epoch": 1.71334431630972, "grad_norm": 0.41209379030662363, "learning_rate": 3.959719822626597e-07, "loss": 0.2359, "step": 3120 }, { "epoch": 1.71389346512905, "grad_norm": 0.5675986413898608, "learning_rate": 3.9568793393236584e-07, "loss": 0.3001, "step": 3121 }, { "epoch": 1.7144426139483802, "grad_norm": 0.5105181315278182, "learning_rate": 3.9540392082042445e-07, "loss": 0.2616, "step": 3122 }, { "epoch": 1.71499176276771, "grad_norm": 0.49757355196998587, "learning_rate": 3.9511994302267937e-07, "loss": 0.2756, "step": 3123 }, { "epoch": 1.7155409115870401, "grad_norm": 0.4234705397472543, "learning_rate": 3.948360006349629e-07, "loss": 0.2279, "step": 3124 }, { "epoch": 1.71609006040637, "grad_norm": 0.45087437876275227, "learning_rate": 3.945520937530951e-07, "loss": 0.2425, "step": 3125 }, { "epoch": 1.7166392092257001, "grad_norm": 0.3960204243058321, "learning_rate": 3.942682224728839e-07, "loss": 0.2591, "step": 3126 }, { "epoch": 1.7171883580450302, "grad_norm": 0.5467669386972142, "learning_rate": 3.9398438689012534e-07, "loss": 0.2301, "step": 3127 }, { "epoch": 1.7177375068643603, "grad_norm": 0.5064991548688178, "learning_rate": 3.937005871006038e-07, "loss": 0.2691, "step": 3128 }, { "epoch": 1.7182866556836904, "grad_norm": 0.6087739473357225, "learning_rate": 3.9341682320009094e-07, "loss": 0.2398, "step": 3129 }, { "epoch": 1.7188358045030203, "grad_norm": 0.649749375518376, "learning_rate": 3.9313309528434693e-07, "loss": 0.3193, "step": 3130 }, { "epoch": 1.7193849533223502, "grad_norm": 0.5693126558938569, "learning_rate": 3.928494034491192e-07, "loss": 0.3099, "step": 3131 }, { "epoch": 1.7199341021416803, "grad_norm": 0.627447786560114, "learning_rate": 3.925657477901433e-07, "loss": 0.2403, "step": 3132 }, { "epoch": 1.7204832509610104, "grad_norm": 0.5176273730919496, "learning_rate": 3.922821284031428e-07, "loss": 0.2338, "step": 3133 }, { "epoch": 1.7210323997803405, "grad_norm": 0.42653543726467824, "learning_rate": 3.919985453838286e-07, "loss": 0.2325, "step": 3134 }, { "epoch": 1.7215815485996706, "grad_norm": 0.5938867839631261, "learning_rate": 3.917149988278995e-07, "loss": 0.2314, "step": 3135 }, { "epoch": 1.7221306974190007, "grad_norm": 0.46559949079715124, "learning_rate": 3.9143148883104245e-07, "loss": 0.2181, "step": 3136 }, { "epoch": 1.7226798462383306, "grad_norm": 0.5826948530045537, "learning_rate": 3.911480154889308e-07, "loss": 0.2622, "step": 3137 }, { "epoch": 1.7232289950576605, "grad_norm": 0.4790098693673044, "learning_rate": 3.9086457889722714e-07, "loss": 0.2267, "step": 3138 }, { "epoch": 1.7237781438769906, "grad_norm": 0.5398186957105922, "learning_rate": 3.9058117915158045e-07, "loss": 0.2485, "step": 3139 }, { "epoch": 1.7243272926963207, "grad_norm": 0.48776148280007303, "learning_rate": 3.902978163476278e-07, "loss": 0.2418, "step": 3140 }, { "epoch": 1.7248764415156508, "grad_norm": 0.44243516779979836, "learning_rate": 3.900144905809939e-07, "loss": 0.2854, "step": 3141 }, { "epoch": 1.7254255903349809, "grad_norm": 0.43468136136378044, "learning_rate": 3.8973120194729047e-07, "loss": 0.2516, "step": 3142 }, { "epoch": 1.7259747391543108, "grad_norm": 0.5795905403945281, "learning_rate": 3.8944795054211714e-07, "loss": 0.2697, "step": 3143 }, { "epoch": 1.7265238879736409, "grad_norm": 0.501383559017645, "learning_rate": 3.8916473646106073e-07, "loss": 0.2557, "step": 3144 }, { "epoch": 1.7270730367929708, "grad_norm": 0.5969208261131543, "learning_rate": 3.888815597996956e-07, "loss": 0.2819, "step": 3145 }, { "epoch": 1.7276221856123009, "grad_norm": 0.6574309701900565, "learning_rate": 3.8859842065358344e-07, "loss": 0.2502, "step": 3146 }, { "epoch": 1.728171334431631, "grad_norm": 0.47809567820512366, "learning_rate": 3.8831531911827347e-07, "loss": 0.2136, "step": 3147 }, { "epoch": 1.728720483250961, "grad_norm": 0.4990969370217534, "learning_rate": 3.8803225528930166e-07, "loss": 0.2463, "step": 3148 }, { "epoch": 1.7292696320702912, "grad_norm": 0.5446776872440375, "learning_rate": 3.877492292621918e-07, "loss": 0.266, "step": 3149 }, { "epoch": 1.729818780889621, "grad_norm": 0.6854649798885917, "learning_rate": 3.8746624113245487e-07, "loss": 0.2274, "step": 3150 }, { "epoch": 1.7303679297089511, "grad_norm": 0.4490608893339381, "learning_rate": 3.871832909955888e-07, "loss": 0.2257, "step": 3151 }, { "epoch": 1.730917078528281, "grad_norm": 0.4650256158242345, "learning_rate": 3.8690037894707897e-07, "loss": 0.264, "step": 3152 }, { "epoch": 1.7314662273476111, "grad_norm": 0.4402758653056426, "learning_rate": 3.866175050823975e-07, "loss": 0.2323, "step": 3153 }, { "epoch": 1.7320153761669412, "grad_norm": 0.452233599288595, "learning_rate": 3.863346694970041e-07, "loss": 0.239, "step": 3154 }, { "epoch": 1.7325645249862713, "grad_norm": 0.4767185355255673, "learning_rate": 3.8605187228634537e-07, "loss": 0.2672, "step": 3155 }, { "epoch": 1.7331136738056014, "grad_norm": 0.4199317265503855, "learning_rate": 3.857691135458549e-07, "loss": 0.2456, "step": 3156 }, { "epoch": 1.7336628226249313, "grad_norm": 0.5390364099938132, "learning_rate": 3.854863933709533e-07, "loss": 0.2227, "step": 3157 }, { "epoch": 1.7342119714442614, "grad_norm": 0.48385503439722066, "learning_rate": 3.852037118570484e-07, "loss": 0.2242, "step": 3158 }, { "epoch": 1.7347611202635913, "grad_norm": 0.4468525853930528, "learning_rate": 3.849210690995346e-07, "loss": 0.2603, "step": 3159 }, { "epoch": 1.7353102690829214, "grad_norm": 0.506347106145234, "learning_rate": 3.846384651937935e-07, "loss": 0.2465, "step": 3160 }, { "epoch": 1.7358594179022515, "grad_norm": 0.47819259152248483, "learning_rate": 3.843559002351935e-07, "loss": 0.2278, "step": 3161 }, { "epoch": 1.7364085667215816, "grad_norm": 0.5426082721839471, "learning_rate": 3.840733743190897e-07, "loss": 0.247, "step": 3162 }, { "epoch": 1.7369577155409117, "grad_norm": 0.47794256238335486, "learning_rate": 3.837908875408246e-07, "loss": 0.2339, "step": 3163 }, { "epoch": 1.7375068643602416, "grad_norm": 0.5178755229089875, "learning_rate": 3.835084399957267e-07, "loss": 0.2361, "step": 3164 }, { "epoch": 1.7380560131795717, "grad_norm": 0.6085767036546321, "learning_rate": 3.832260317791118e-07, "loss": 0.2676, "step": 3165 }, { "epoch": 1.7386051619989016, "grad_norm": 0.59198103852928, "learning_rate": 3.8294366298628205e-07, "loss": 0.2874, "step": 3166 }, { "epoch": 1.7391543108182317, "grad_norm": 0.5707234253565714, "learning_rate": 3.8266133371252685e-07, "loss": 0.2579, "step": 3167 }, { "epoch": 1.7397034596375618, "grad_norm": 0.40381937534860596, "learning_rate": 3.8237904405312176e-07, "loss": 0.2159, "step": 3168 }, { "epoch": 1.740252608456892, "grad_norm": 0.48095654262877086, "learning_rate": 3.820967941033293e-07, "loss": 0.2842, "step": 3169 }, { "epoch": 1.740801757276222, "grad_norm": 0.9403320984223763, "learning_rate": 3.8181458395839814e-07, "loss": 0.4686, "step": 3170 }, { "epoch": 1.7413509060955519, "grad_norm": 0.5686372139645117, "learning_rate": 3.8153241371356387e-07, "loss": 0.2682, "step": 3171 }, { "epoch": 1.741900054914882, "grad_norm": 0.5594098645763892, "learning_rate": 3.8125028346404877e-07, "loss": 0.2843, "step": 3172 }, { "epoch": 1.7424492037342119, "grad_norm": 0.4907330043285017, "learning_rate": 3.809681933050612e-07, "loss": 0.2389, "step": 3173 }, { "epoch": 1.742998352553542, "grad_norm": 0.5919787133788562, "learning_rate": 3.806861433317964e-07, "loss": 0.2304, "step": 3174 }, { "epoch": 1.743547501372872, "grad_norm": 0.45987727566740183, "learning_rate": 3.8040413363943566e-07, "loss": 0.2383, "step": 3175 }, { "epoch": 1.7440966501922022, "grad_norm": 0.4470820653165698, "learning_rate": 3.801221643231467e-07, "loss": 0.2254, "step": 3176 }, { "epoch": 1.7446457990115323, "grad_norm": 0.5258470182980035, "learning_rate": 3.7984023547808413e-07, "loss": 0.2005, "step": 3177 }, { "epoch": 1.7451949478308622, "grad_norm": 0.47036590120171595, "learning_rate": 3.7955834719938846e-07, "loss": 0.2691, "step": 3178 }, { "epoch": 1.7457440966501923, "grad_norm": 0.5159167021826463, "learning_rate": 3.792764995821864e-07, "loss": 0.2325, "step": 3179 }, { "epoch": 1.7462932454695221, "grad_norm": 0.5269752323244645, "learning_rate": 3.789946927215915e-07, "loss": 0.2448, "step": 3180 }, { "epoch": 1.7468423942888522, "grad_norm": 0.4794105661476877, "learning_rate": 3.787129267127029e-07, "loss": 0.2883, "step": 3181 }, { "epoch": 1.7473915431081823, "grad_norm": 0.44920477675043563, "learning_rate": 3.7843120165060627e-07, "loss": 0.2428, "step": 3182 }, { "epoch": 1.7479406919275124, "grad_norm": 0.6126053014623515, "learning_rate": 3.781495176303734e-07, "loss": 0.2475, "step": 3183 }, { "epoch": 1.7484898407468425, "grad_norm": 0.578415721406623, "learning_rate": 3.778678747470625e-07, "loss": 0.2313, "step": 3184 }, { "epoch": 1.7490389895661724, "grad_norm": 0.4291465808488704, "learning_rate": 3.775862730957176e-07, "loss": 0.2274, "step": 3185 }, { "epoch": 1.7495881383855023, "grad_norm": 0.4346473547603644, "learning_rate": 3.7730471277136873e-07, "loss": 0.2413, "step": 3186 }, { "epoch": 1.7501372872048324, "grad_norm": 0.44539489679557687, "learning_rate": 3.7702319386903226e-07, "loss": 0.2572, "step": 3187 }, { "epoch": 1.7506864360241625, "grad_norm": 0.5458412759554239, "learning_rate": 3.767417164837102e-07, "loss": 0.2389, "step": 3188 }, { "epoch": 1.7512355848434926, "grad_norm": 0.5770531557851083, "learning_rate": 3.7646028071039116e-07, "loss": 0.219, "step": 3189 }, { "epoch": 1.7517847336628227, "grad_norm": 0.46108234758931604, "learning_rate": 3.7617888664404913e-07, "loss": 0.2703, "step": 3190 }, { "epoch": 1.7523338824821528, "grad_norm": 0.4912210098982382, "learning_rate": 3.7589753437964443e-07, "loss": 0.3073, "step": 3191 }, { "epoch": 1.7528830313014827, "grad_norm": 0.7505786550244599, "learning_rate": 3.7561622401212283e-07, "loss": 0.2649, "step": 3192 }, { "epoch": 1.7534321801208126, "grad_norm": 0.41835237808575687, "learning_rate": 3.753349556364162e-07, "loss": 0.2188, "step": 3193 }, { "epoch": 1.7539813289401427, "grad_norm": 0.462376610471587, "learning_rate": 3.7505372934744263e-07, "loss": 0.2526, "step": 3194 }, { "epoch": 1.7545304777594728, "grad_norm": 0.6157864425204377, "learning_rate": 3.7477254524010523e-07, "loss": 0.2877, "step": 3195 }, { "epoch": 1.755079626578803, "grad_norm": 0.37951275263709683, "learning_rate": 3.744914034092936e-07, "loss": 0.257, "step": 3196 }, { "epoch": 1.755628775398133, "grad_norm": 0.49268215522244563, "learning_rate": 3.742103039498823e-07, "loss": 0.2472, "step": 3197 }, { "epoch": 1.7561779242174629, "grad_norm": 0.48769839553429445, "learning_rate": 3.739292469567325e-07, "loss": 0.2508, "step": 3198 }, { "epoch": 1.756727073036793, "grad_norm": 0.4251522379520149, "learning_rate": 3.7364823252469033e-07, "loss": 0.2451, "step": 3199 }, { "epoch": 1.7572762218561229, "grad_norm": 0.5453702661324836, "learning_rate": 3.733672607485879e-07, "loss": 0.2479, "step": 3200 }, { "epoch": 1.7572762218561229, "eval_loss": 0.32651084661483765, "eval_runtime": 18.6705, "eval_samples_per_second": 23.727, "eval_steps_per_second": 1.018, "step": 3200 }, { "epoch": 1.757825370675453, "grad_norm": 0.4903260712000596, "learning_rate": 3.7308633172324283e-07, "loss": 0.2282, "step": 3201 }, { "epoch": 1.758374519494783, "grad_norm": 0.4486805009652084, "learning_rate": 3.7280544554345846e-07, "loss": 0.2306, "step": 3202 }, { "epoch": 1.7589236683141132, "grad_norm": 0.5809799612838047, "learning_rate": 3.725246023040232e-07, "loss": 0.2739, "step": 3203 }, { "epoch": 1.7594728171334433, "grad_norm": 0.6060920429058457, "learning_rate": 3.7224380209971153e-07, "loss": 0.2624, "step": 3204 }, { "epoch": 1.7600219659527732, "grad_norm": 0.5634604983694165, "learning_rate": 3.7196304502528297e-07, "loss": 0.2484, "step": 3205 }, { "epoch": 1.7605711147721033, "grad_norm": 0.5038282148433267, "learning_rate": 3.71682331175483e-07, "loss": 0.3132, "step": 3206 }, { "epoch": 1.7611202635914331, "grad_norm": 0.6483740917769496, "learning_rate": 3.7140166064504205e-07, "loss": 0.2869, "step": 3207 }, { "epoch": 1.7616694124107632, "grad_norm": 0.5641040189469726, "learning_rate": 3.71121033528676e-07, "loss": 0.2613, "step": 3208 }, { "epoch": 1.7622185612300933, "grad_norm": 0.7041012804385147, "learning_rate": 3.708404499210862e-07, "loss": 0.2822, "step": 3209 }, { "epoch": 1.7627677100494235, "grad_norm": 1.0419248578923135, "learning_rate": 3.7055990991695916e-07, "loss": 0.2868, "step": 3210 }, { "epoch": 1.7633168588687536, "grad_norm": 0.46563562491042026, "learning_rate": 3.702794136109672e-07, "loss": 0.2698, "step": 3211 }, { "epoch": 1.7638660076880834, "grad_norm": 0.4853378410871895, "learning_rate": 3.6999896109776713e-07, "loss": 0.2616, "step": 3212 }, { "epoch": 1.7644151565074135, "grad_norm": 0.4546893316616232, "learning_rate": 3.697185524720016e-07, "loss": 0.2428, "step": 3213 }, { "epoch": 1.7649643053267434, "grad_norm": 0.5519007941147956, "learning_rate": 3.694381878282978e-07, "loss": 0.2403, "step": 3214 }, { "epoch": 1.7655134541460735, "grad_norm": 0.4528682823390998, "learning_rate": 3.691578672612688e-07, "loss": 0.265, "step": 3215 }, { "epoch": 1.7660626029654036, "grad_norm": 0.4953911294349023, "learning_rate": 3.6887759086551235e-07, "loss": 0.2646, "step": 3216 }, { "epoch": 1.7666117517847337, "grad_norm": 0.4707344059943829, "learning_rate": 3.685973587356114e-07, "loss": 0.228, "step": 3217 }, { "epoch": 1.7671609006040638, "grad_norm": 0.5598948318476332, "learning_rate": 3.6831717096613426e-07, "loss": 0.281, "step": 3218 }, { "epoch": 1.7677100494233937, "grad_norm": 0.5801101030852658, "learning_rate": 3.6803702765163337e-07, "loss": 0.2487, "step": 3219 }, { "epoch": 1.7682591982427238, "grad_norm": 0.4717302770850839, "learning_rate": 3.6775692888664723e-07, "loss": 0.2389, "step": 3220 }, { "epoch": 1.7688083470620537, "grad_norm": 0.446720715673289, "learning_rate": 3.6747687476569883e-07, "loss": 0.2276, "step": 3221 }, { "epoch": 1.7693574958813838, "grad_norm": 0.4158763452649629, "learning_rate": 3.671968653832959e-07, "loss": 0.2236, "step": 3222 }, { "epoch": 1.769906644700714, "grad_norm": 0.4693401163218621, "learning_rate": 3.669169008339315e-07, "loss": 0.2094, "step": 3223 }, { "epoch": 1.770455793520044, "grad_norm": 0.48881947504972323, "learning_rate": 3.6663698121208335e-07, "loss": 0.2659, "step": 3224 }, { "epoch": 1.771004942339374, "grad_norm": 0.48686041646318634, "learning_rate": 3.663571066122139e-07, "loss": 0.2526, "step": 3225 }, { "epoch": 1.771554091158704, "grad_norm": 0.545055480574814, "learning_rate": 3.660772771287706e-07, "loss": 0.2464, "step": 3226 }, { "epoch": 1.772103239978034, "grad_norm": 0.4794289708321269, "learning_rate": 3.6579749285618526e-07, "loss": 0.2081, "step": 3227 }, { "epoch": 1.772652388797364, "grad_norm": 0.507152922032444, "learning_rate": 3.655177538888753e-07, "loss": 0.2634, "step": 3228 }, { "epoch": 1.773201537616694, "grad_norm": 0.514742983722494, "learning_rate": 3.652380603212422e-07, "loss": 0.2687, "step": 3229 }, { "epoch": 1.7737506864360242, "grad_norm": 0.5844575171720781, "learning_rate": 3.6495841224767187e-07, "loss": 0.3007, "step": 3230 }, { "epoch": 1.7742998352553543, "grad_norm": 0.5095379695529898, "learning_rate": 3.6467880976253546e-07, "loss": 0.24, "step": 3231 }, { "epoch": 1.7748489840746844, "grad_norm": 0.4665299076642937, "learning_rate": 3.6439925296018855e-07, "loss": 0.2715, "step": 3232 }, { "epoch": 1.7753981328940143, "grad_norm": 0.564394016760433, "learning_rate": 3.6411974193497124e-07, "loss": 0.2711, "step": 3233 }, { "epoch": 1.7759472817133442, "grad_norm": 0.5983560688762918, "learning_rate": 3.638402767812081e-07, "loss": 0.2964, "step": 3234 }, { "epoch": 1.7764964305326743, "grad_norm": 0.5658658730567936, "learning_rate": 3.635608575932087e-07, "loss": 0.2441, "step": 3235 }, { "epoch": 1.7770455793520044, "grad_norm": 0.5308114066814922, "learning_rate": 3.6328148446526614e-07, "loss": 0.2381, "step": 3236 }, { "epoch": 1.7775947281713345, "grad_norm": 0.5556881448340648, "learning_rate": 3.6300215749165895e-07, "loss": 0.2229, "step": 3237 }, { "epoch": 1.7781438769906646, "grad_norm": 0.4337866717149387, "learning_rate": 3.627228767666496e-07, "loss": 0.2351, "step": 3238 }, { "epoch": 1.7786930258099947, "grad_norm": 0.5481610194304144, "learning_rate": 3.624436423844849e-07, "loss": 0.2915, "step": 3239 }, { "epoch": 1.7792421746293245, "grad_norm": 0.512099984685357, "learning_rate": 3.621644544393966e-07, "loss": 0.2603, "step": 3240 }, { "epoch": 1.7797913234486544, "grad_norm": 0.5051440651140077, "learning_rate": 3.6188531302559984e-07, "loss": 0.2201, "step": 3241 }, { "epoch": 1.7803404722679845, "grad_norm": 0.47662284382696746, "learning_rate": 3.6160621823729476e-07, "loss": 0.2821, "step": 3242 }, { "epoch": 1.7808896210873146, "grad_norm": 0.3649006325811564, "learning_rate": 3.6132717016866567e-07, "loss": 0.2186, "step": 3243 }, { "epoch": 1.7814387699066447, "grad_norm": 0.5482396022855541, "learning_rate": 3.6104816891388073e-07, "loss": 0.2222, "step": 3244 }, { "epoch": 1.7819879187259748, "grad_norm": 0.511544758700288, "learning_rate": 3.60769214567093e-07, "loss": 0.2574, "step": 3245 }, { "epoch": 1.782537067545305, "grad_norm": 0.5246072658336789, "learning_rate": 3.604903072224391e-07, "loss": 0.2773, "step": 3246 }, { "epoch": 1.7830862163646348, "grad_norm": 0.5869311556169076, "learning_rate": 3.602114469740399e-07, "loss": 0.2609, "step": 3247 }, { "epoch": 1.7836353651839647, "grad_norm": 0.467989409545172, "learning_rate": 3.5993263391600037e-07, "loss": 0.2199, "step": 3248 }, { "epoch": 1.7841845140032948, "grad_norm": 0.5192217759404012, "learning_rate": 3.5965386814240987e-07, "loss": 0.2379, "step": 3249 }, { "epoch": 1.784733662822625, "grad_norm": 0.5701503659754419, "learning_rate": 3.593751497473416e-07, "loss": 0.259, "step": 3250 }, { "epoch": 1.785282811641955, "grad_norm": 0.5643240402143217, "learning_rate": 3.5909647882485266e-07, "loss": 0.2308, "step": 3251 }, { "epoch": 1.7858319604612851, "grad_norm": 0.5328041780015402, "learning_rate": 3.588178554689842e-07, "loss": 0.2214, "step": 3252 }, { "epoch": 1.786381109280615, "grad_norm": 0.5426625576103913, "learning_rate": 3.585392797737611e-07, "loss": 0.2166, "step": 3253 }, { "epoch": 1.786930258099945, "grad_norm": 0.4940794330177362, "learning_rate": 3.5826075183319286e-07, "loss": 0.2693, "step": 3254 }, { "epoch": 1.787479406919275, "grad_norm": 0.48448048737137134, "learning_rate": 3.579822717412722e-07, "loss": 0.278, "step": 3255 }, { "epoch": 1.788028555738605, "grad_norm": 0.5043826849912504, "learning_rate": 3.5770383959197575e-07, "loss": 0.2633, "step": 3256 }, { "epoch": 1.7885777045579352, "grad_norm": 0.5112786403034757, "learning_rate": 3.574254554792645e-07, "loss": 0.2284, "step": 3257 }, { "epoch": 1.7891268533772653, "grad_norm": 0.4758405516370161, "learning_rate": 3.5714711949708226e-07, "loss": 0.2379, "step": 3258 }, { "epoch": 1.7896760021965954, "grad_norm": 0.899476067865799, "learning_rate": 3.5686883173935763e-07, "loss": 0.3947, "step": 3259 }, { "epoch": 1.7902251510159253, "grad_norm": 0.5641049914931646, "learning_rate": 3.565905923000022e-07, "loss": 0.2643, "step": 3260 }, { "epoch": 1.7907742998352554, "grad_norm": 0.7342891116262842, "learning_rate": 3.563124012729116e-07, "loss": 0.2914, "step": 3261 }, { "epoch": 1.7913234486545853, "grad_norm": 0.512582626025038, "learning_rate": 3.5603425875196534e-07, "loss": 0.2101, "step": 3262 }, { "epoch": 1.7918725974739154, "grad_norm": 0.58488868171998, "learning_rate": 3.557561648310259e-07, "loss": 0.2982, "step": 3263 }, { "epoch": 1.7924217462932455, "grad_norm": 0.7268295553867393, "learning_rate": 3.5547811960393985e-07, "loss": 0.2613, "step": 3264 }, { "epoch": 1.7929708951125756, "grad_norm": 0.5802114773102711, "learning_rate": 3.5520012316453713e-07, "loss": 0.2425, "step": 3265 }, { "epoch": 1.7935200439319057, "grad_norm": 0.5748311928173353, "learning_rate": 3.549221756066315e-07, "loss": 0.247, "step": 3266 }, { "epoch": 1.7940691927512356, "grad_norm": 0.47113767207193835, "learning_rate": 3.5464427702401996e-07, "loss": 0.2442, "step": 3267 }, { "epoch": 1.7946183415705657, "grad_norm": 0.40046543758087666, "learning_rate": 3.54366427510483e-07, "loss": 0.2387, "step": 3268 }, { "epoch": 1.7951674903898955, "grad_norm": 0.5703506825836538, "learning_rate": 3.5408862715978447e-07, "loss": 0.2606, "step": 3269 }, { "epoch": 1.7957166392092256, "grad_norm": 0.7577000751785343, "learning_rate": 3.5381087606567186e-07, "loss": 0.2624, "step": 3270 }, { "epoch": 1.7962657880285557, "grad_norm": 0.4937723803464274, "learning_rate": 3.5353317432187606e-07, "loss": 0.2512, "step": 3271 }, { "epoch": 1.7968149368478858, "grad_norm": 0.5193242962032641, "learning_rate": 3.53255522022111e-07, "loss": 0.2516, "step": 3272 }, { "epoch": 1.797364085667216, "grad_norm": 0.4701291006188349, "learning_rate": 3.529779192600743e-07, "loss": 0.2672, "step": 3273 }, { "epoch": 1.7979132344865458, "grad_norm": 0.422034434328806, "learning_rate": 3.527003661294464e-07, "loss": 0.2642, "step": 3274 }, { "epoch": 1.798462383305876, "grad_norm": 0.4939452338842813, "learning_rate": 3.524228627238913e-07, "loss": 0.2201, "step": 3275 }, { "epoch": 1.7990115321252058, "grad_norm": 0.5613767994813553, "learning_rate": 3.5214540913705635e-07, "loss": 0.2228, "step": 3276 }, { "epoch": 1.799560680944536, "grad_norm": 0.562746067724081, "learning_rate": 3.5186800546257184e-07, "loss": 0.2581, "step": 3277 }, { "epoch": 1.800109829763866, "grad_norm": 0.43113111352859795, "learning_rate": 3.5159065179405116e-07, "loss": 0.215, "step": 3278 }, { "epoch": 1.8006589785831961, "grad_norm": 0.566018725884266, "learning_rate": 3.5131334822509134e-07, "loss": 0.2443, "step": 3279 }, { "epoch": 1.8012081274025262, "grad_norm": 0.5677231417463248, "learning_rate": 3.510360948492716e-07, "loss": 0.2718, "step": 3280 }, { "epoch": 1.801757276221856, "grad_norm": 0.4536306527881557, "learning_rate": 3.507588917601551e-07, "loss": 0.2459, "step": 3281 }, { "epoch": 1.8023064250411862, "grad_norm": 0.6184050401266717, "learning_rate": 3.504817390512875e-07, "loss": 0.2357, "step": 3282 }, { "epoch": 1.802855573860516, "grad_norm": 2.1524517582018334, "learning_rate": 3.502046368161977e-07, "loss": 0.246, "step": 3283 }, { "epoch": 1.8034047226798462, "grad_norm": 0.44682765231633387, "learning_rate": 3.4992758514839767e-07, "loss": 0.2236, "step": 3284 }, { "epoch": 1.8039538714991763, "grad_norm": 0.5462948191383195, "learning_rate": 3.496505841413818e-07, "loss": 0.2359, "step": 3285 }, { "epoch": 1.8045030203185064, "grad_norm": 0.4010125054167588, "learning_rate": 3.4937363388862783e-07, "loss": 0.2292, "step": 3286 }, { "epoch": 1.8050521691378365, "grad_norm": 0.54869646098954, "learning_rate": 3.4909673448359624e-07, "loss": 0.2776, "step": 3287 }, { "epoch": 1.8056013179571664, "grad_norm": 0.6495084283304535, "learning_rate": 3.4881988601973055e-07, "loss": 0.2488, "step": 3288 }, { "epoch": 1.8061504667764963, "grad_norm": 0.5258360983492615, "learning_rate": 3.485430885904569e-07, "loss": 0.2687, "step": 3289 }, { "epoch": 1.8066996155958264, "grad_norm": 0.46800944299671204, "learning_rate": 3.4826634228918414e-07, "loss": 0.245, "step": 3290 }, { "epoch": 1.8072487644151565, "grad_norm": 0.4299142812589447, "learning_rate": 3.4798964720930393e-07, "loss": 0.2449, "step": 3291 }, { "epoch": 1.8077979132344866, "grad_norm": 1.2017761612083697, "learning_rate": 3.477130034441906e-07, "loss": 0.4559, "step": 3292 }, { "epoch": 1.8083470620538167, "grad_norm": 0.5119317605734569, "learning_rate": 3.4743641108720135e-07, "loss": 0.233, "step": 3293 }, { "epoch": 1.8088962108731468, "grad_norm": 0.5050380547877428, "learning_rate": 3.471598702316759e-07, "loss": 0.2697, "step": 3294 }, { "epoch": 1.8094453596924767, "grad_norm": 0.5954131961798498, "learning_rate": 3.468833809709368e-07, "loss": 0.238, "step": 3295 }, { "epoch": 1.8099945085118065, "grad_norm": 0.5516829060282231, "learning_rate": 3.466069433982884e-07, "loss": 0.2457, "step": 3296 }, { "epoch": 1.8105436573311366, "grad_norm": 0.45441878738749614, "learning_rate": 3.463305576070188e-07, "loss": 0.2516, "step": 3297 }, { "epoch": 1.8110928061504667, "grad_norm": 0.5462777838903079, "learning_rate": 3.460542236903977e-07, "loss": 0.2525, "step": 3298 }, { "epoch": 1.8116419549697969, "grad_norm": 0.47309562175546327, "learning_rate": 3.457779417416776e-07, "loss": 0.2307, "step": 3299 }, { "epoch": 1.812191103789127, "grad_norm": 0.4240047692737509, "learning_rate": 3.455017118540938e-07, "loss": 0.2505, "step": 3300 }, { "epoch": 1.8127402526084568, "grad_norm": 0.5762786980741748, "learning_rate": 3.4522553412086353e-07, "loss": 0.285, "step": 3301 }, { "epoch": 1.813289401427787, "grad_norm": 0.5259079217713657, "learning_rate": 3.4494940863518646e-07, "loss": 0.2522, "step": 3302 }, { "epoch": 1.8138385502471168, "grad_norm": 0.5426784610316483, "learning_rate": 3.446733354902448e-07, "loss": 0.2899, "step": 3303 }, { "epoch": 1.814387699066447, "grad_norm": 0.4451335710910681, "learning_rate": 3.443973147792031e-07, "loss": 0.2415, "step": 3304 }, { "epoch": 1.814936847885777, "grad_norm": 0.4617689131159005, "learning_rate": 3.441213465952084e-07, "loss": 0.2791, "step": 3305 }, { "epoch": 1.8154859967051071, "grad_norm": 0.5168043908442046, "learning_rate": 3.438454310313896e-07, "loss": 0.2387, "step": 3306 }, { "epoch": 1.8160351455244372, "grad_norm": 0.6146801816851613, "learning_rate": 3.43569568180858e-07, "loss": 0.2959, "step": 3307 }, { "epoch": 1.8165842943437671, "grad_norm": 0.6069504528694258, "learning_rate": 3.432937581367073e-07, "loss": 0.2476, "step": 3308 }, { "epoch": 1.8171334431630972, "grad_norm": 0.48388766222356844, "learning_rate": 3.430180009920129e-07, "loss": 0.2613, "step": 3309 }, { "epoch": 1.817682591982427, "grad_norm": 0.48494068353860936, "learning_rate": 3.4274229683983304e-07, "loss": 0.2912, "step": 3310 }, { "epoch": 1.8182317408017572, "grad_norm": 0.5403158036794079, "learning_rate": 3.4246664577320765e-07, "loss": 0.2287, "step": 3311 }, { "epoch": 1.8187808896210873, "grad_norm": 0.5326817938968692, "learning_rate": 3.421910478851588e-07, "loss": 0.2312, "step": 3312 }, { "epoch": 1.8193300384404174, "grad_norm": 0.4645358000272507, "learning_rate": 3.4191550326869036e-07, "loss": 0.2544, "step": 3313 }, { "epoch": 1.8198791872597475, "grad_norm": 0.45602159824573074, "learning_rate": 3.4164001201678875e-07, "loss": 0.2687, "step": 3314 }, { "epoch": 1.8204283360790774, "grad_norm": 0.506816176667018, "learning_rate": 3.413645742224221e-07, "loss": 0.2716, "step": 3315 }, { "epoch": 1.8209774848984075, "grad_norm": 0.5293502363415263, "learning_rate": 3.4108918997854033e-07, "loss": 0.2548, "step": 3316 }, { "epoch": 1.8215266337177374, "grad_norm": 0.5408978460323642, "learning_rate": 3.40813859378076e-07, "loss": 0.2634, "step": 3317 }, { "epoch": 1.8220757825370675, "grad_norm": 0.5542902630733466, "learning_rate": 3.405385825139424e-07, "loss": 0.2401, "step": 3318 }, { "epoch": 1.8226249313563976, "grad_norm": 0.5583277538987553, "learning_rate": 3.402633594790357e-07, "loss": 0.2298, "step": 3319 }, { "epoch": 1.8231740801757277, "grad_norm": 0.4737070848412397, "learning_rate": 3.3998819036623334e-07, "loss": 0.278, "step": 3320 }, { "epoch": 1.8237232289950578, "grad_norm": 0.5411237796452439, "learning_rate": 3.397130752683948e-07, "loss": 0.2631, "step": 3321 }, { "epoch": 1.8242723778143877, "grad_norm": 0.4800744141929021, "learning_rate": 3.3943801427836147e-07, "loss": 0.2619, "step": 3322 }, { "epoch": 1.8248215266337178, "grad_norm": 0.5408293775671056, "learning_rate": 3.3916300748895615e-07, "loss": 0.2472, "step": 3323 }, { "epoch": 1.8253706754530477, "grad_norm": 0.537518396279777, "learning_rate": 3.388880549929836e-07, "loss": 0.2422, "step": 3324 }, { "epoch": 1.8259198242723778, "grad_norm": 0.5445276380746559, "learning_rate": 3.3861315688322995e-07, "loss": 0.2627, "step": 3325 }, { "epoch": 1.8264689730917079, "grad_norm": 0.6465367905144461, "learning_rate": 3.3833831325246327e-07, "loss": 0.2387, "step": 3326 }, { "epoch": 1.827018121911038, "grad_norm": 0.6305545886703103, "learning_rate": 3.3806352419343334e-07, "loss": 0.2545, "step": 3327 }, { "epoch": 1.827567270730368, "grad_norm": 0.4842206344722125, "learning_rate": 3.3778878979887125e-07, "loss": 0.2063, "step": 3328 }, { "epoch": 1.828116419549698, "grad_norm": 0.5541104374469734, "learning_rate": 3.3751411016148963e-07, "loss": 0.2574, "step": 3329 }, { "epoch": 1.828665568369028, "grad_norm": 0.43378948519634647, "learning_rate": 3.372394853739827e-07, "loss": 0.2492, "step": 3330 }, { "epoch": 1.829214717188358, "grad_norm": 0.40222017261294285, "learning_rate": 3.3696491552902635e-07, "loss": 0.2522, "step": 3331 }, { "epoch": 1.829763866007688, "grad_norm": 0.5094872750381726, "learning_rate": 3.3669040071927783e-07, "loss": 0.2362, "step": 3332 }, { "epoch": 1.8303130148270181, "grad_norm": 0.5265832019033582, "learning_rate": 3.364159410373755e-07, "loss": 0.2619, "step": 3333 }, { "epoch": 1.8308621636463482, "grad_norm": 0.5543041495016467, "learning_rate": 3.3614153657594006e-07, "loss": 0.2511, "step": 3334 }, { "epoch": 1.8314113124656783, "grad_norm": 0.5691177331659057, "learning_rate": 3.35867187427572e-07, "loss": 0.2538, "step": 3335 }, { "epoch": 1.8319604612850082, "grad_norm": 0.45534277969542813, "learning_rate": 3.355928936848546e-07, "loss": 0.183, "step": 3336 }, { "epoch": 1.8325096101043383, "grad_norm": 0.49743842436769414, "learning_rate": 3.3531865544035184e-07, "loss": 0.2128, "step": 3337 }, { "epoch": 1.8330587589236682, "grad_norm": 0.5037559589518178, "learning_rate": 3.3504447278660867e-07, "loss": 0.2175, "step": 3338 }, { "epoch": 1.8336079077429983, "grad_norm": 0.4787201459836962, "learning_rate": 3.347703458161524e-07, "loss": 0.2323, "step": 3339 }, { "epoch": 1.8341570565623284, "grad_norm": 0.4747446443338989, "learning_rate": 3.3449627462149e-07, "loss": 0.2302, "step": 3340 }, { "epoch": 1.8347062053816585, "grad_norm": 0.4057185011756123, "learning_rate": 3.342222592951107e-07, "loss": 0.2408, "step": 3341 }, { "epoch": 1.8352553542009886, "grad_norm": 0.5403296029683571, "learning_rate": 3.339482999294847e-07, "loss": 0.2215, "step": 3342 }, { "epoch": 1.8358045030203185, "grad_norm": 0.5943642653391777, "learning_rate": 3.3367439661706293e-07, "loss": 0.2748, "step": 3343 }, { "epoch": 1.8363536518396484, "grad_norm": 0.4129034959731085, "learning_rate": 3.334005494502779e-07, "loss": 0.2491, "step": 3344 }, { "epoch": 1.8369028006589785, "grad_norm": 0.5002790970553266, "learning_rate": 3.33126758521543e-07, "loss": 0.2974, "step": 3345 }, { "epoch": 1.8374519494783086, "grad_norm": 0.5174459510775207, "learning_rate": 3.3285302392325233e-07, "loss": 0.2508, "step": 3346 }, { "epoch": 1.8380010982976387, "grad_norm": 0.5027383483532301, "learning_rate": 3.3257934574778126e-07, "loss": 0.2524, "step": 3347 }, { "epoch": 1.8385502471169688, "grad_norm": 0.5751787865850971, "learning_rate": 3.323057240874862e-07, "loss": 0.2245, "step": 3348 }, { "epoch": 1.839099395936299, "grad_norm": 0.5875041255322028, "learning_rate": 3.320321590347044e-07, "loss": 0.2432, "step": 3349 }, { "epoch": 1.8396485447556288, "grad_norm": 0.49012613857855764, "learning_rate": 3.3175865068175403e-07, "loss": 0.2516, "step": 3350 }, { "epoch": 1.8401976935749587, "grad_norm": 0.47469708730575483, "learning_rate": 3.3148519912093387e-07, "loss": 0.2149, "step": 3351 }, { "epoch": 1.8407468423942888, "grad_norm": 0.5787021560105725, "learning_rate": 3.3121180444452373e-07, "loss": 0.2494, "step": 3352 }, { "epoch": 1.8412959912136189, "grad_norm": 0.4949237819687673, "learning_rate": 3.3093846674478455e-07, "loss": 0.231, "step": 3353 }, { "epoch": 1.841845140032949, "grad_norm": 0.5387514125064902, "learning_rate": 3.306651861139575e-07, "loss": 0.2346, "step": 3354 }, { "epoch": 1.842394288852279, "grad_norm": 0.6502984452848531, "learning_rate": 3.303919626442647e-07, "loss": 0.2576, "step": 3355 }, { "epoch": 1.842943437671609, "grad_norm": 0.514665672711255, "learning_rate": 3.3011879642790947e-07, "loss": 0.2291, "step": 3356 }, { "epoch": 1.843492586490939, "grad_norm": 0.5011657894212321, "learning_rate": 3.298456875570746e-07, "loss": 0.2766, "step": 3357 }, { "epoch": 1.844041735310269, "grad_norm": 0.4965709040699433, "learning_rate": 3.2957263612392477e-07, "loss": 0.244, "step": 3358 }, { "epoch": 1.844590884129599, "grad_norm": 0.44766157578846966, "learning_rate": 3.292996422206047e-07, "loss": 0.2473, "step": 3359 }, { "epoch": 1.8451400329489291, "grad_norm": 0.6333386369232549, "learning_rate": 3.2902670593923946e-07, "loss": 0.2303, "step": 3360 }, { "epoch": 1.8456891817682592, "grad_norm": 0.46904986279500355, "learning_rate": 3.287538273719356e-07, "loss": 0.244, "step": 3361 }, { "epoch": 1.8462383305875893, "grad_norm": 0.47952786079284393, "learning_rate": 3.284810066107791e-07, "loss": 0.2725, "step": 3362 }, { "epoch": 1.8467874794069192, "grad_norm": 0.5417386364279941, "learning_rate": 3.2820824374783695e-07, "loss": 0.2637, "step": 3363 }, { "epoch": 1.8473366282262493, "grad_norm": 0.5475352527154584, "learning_rate": 3.2793553887515674e-07, "loss": 0.2313, "step": 3364 }, { "epoch": 1.8478857770455792, "grad_norm": 0.5229136802125284, "learning_rate": 3.276628920847662e-07, "loss": 0.2438, "step": 3365 }, { "epoch": 1.8484349258649093, "grad_norm": 0.46736828229650823, "learning_rate": 3.2739030346867377e-07, "loss": 0.2259, "step": 3366 }, { "epoch": 1.8489840746842394, "grad_norm": 0.44848653284047074, "learning_rate": 3.271177731188679e-07, "loss": 0.244, "step": 3367 }, { "epoch": 1.8495332235035695, "grad_norm": 0.5167429015228152, "learning_rate": 3.2684530112731746e-07, "loss": 0.2685, "step": 3368 }, { "epoch": 1.8500823723228996, "grad_norm": 0.4507284177451039, "learning_rate": 3.2657288758597176e-07, "loss": 0.2377, "step": 3369 }, { "epoch": 1.8506315211422295, "grad_norm": 0.5828584903515878, "learning_rate": 3.263005325867605e-07, "loss": 0.2954, "step": 3370 }, { "epoch": 1.8511806699615596, "grad_norm": 0.713476964102636, "learning_rate": 3.260282362215933e-07, "loss": 0.2486, "step": 3371 }, { "epoch": 1.8517298187808895, "grad_norm": 0.545337100163429, "learning_rate": 3.257559985823603e-07, "loss": 0.2553, "step": 3372 }, { "epoch": 1.8522789676002196, "grad_norm": 0.37106111183336316, "learning_rate": 3.254838197609315e-07, "loss": 0.2773, "step": 3373 }, { "epoch": 1.8528281164195497, "grad_norm": 0.5001622745163616, "learning_rate": 3.252116998491572e-07, "loss": 0.2421, "step": 3374 }, { "epoch": 1.8533772652388798, "grad_norm": 0.4840128639367088, "learning_rate": 3.24939638938868e-07, "loss": 0.2814, "step": 3375 }, { "epoch": 1.85392641405821, "grad_norm": 0.3916739295962748, "learning_rate": 3.246676371218744e-07, "loss": 0.2504, "step": 3376 }, { "epoch": 1.8544755628775398, "grad_norm": 0.4475002867875725, "learning_rate": 3.2439569448996686e-07, "loss": 0.2324, "step": 3377 }, { "epoch": 1.8550247116968699, "grad_norm": 0.5275915896438861, "learning_rate": 3.2412381113491623e-07, "loss": 0.248, "step": 3378 }, { "epoch": 1.8555738605161998, "grad_norm": 0.4353998259510274, "learning_rate": 3.23851987148473e-07, "loss": 0.2068, "step": 3379 }, { "epoch": 1.8561230093355299, "grad_norm": 0.6358772487505907, "learning_rate": 3.235802226223677e-07, "loss": 0.2777, "step": 3380 }, { "epoch": 1.85667215815486, "grad_norm": 0.6134352381471003, "learning_rate": 3.233085176483109e-07, "loss": 0.2779, "step": 3381 }, { "epoch": 1.85722130697419, "grad_norm": 0.5711003635531834, "learning_rate": 3.23036872317993e-07, "loss": 0.2319, "step": 3382 }, { "epoch": 1.8577704557935202, "grad_norm": 0.47049117151547104, "learning_rate": 3.227652867230843e-07, "loss": 0.2081, "step": 3383 }, { "epoch": 1.85831960461285, "grad_norm": 0.49170460434593816, "learning_rate": 3.22493760955235e-07, "loss": 0.2548, "step": 3384 }, { "epoch": 1.8588687534321802, "grad_norm": 0.47086735681156167, "learning_rate": 3.22222295106075e-07, "loss": 0.2275, "step": 3385 }, { "epoch": 1.85941790225151, "grad_norm": 0.49382018819762935, "learning_rate": 3.2195088926721384e-07, "loss": 0.2476, "step": 3386 }, { "epoch": 1.8599670510708401, "grad_norm": 0.5279623403524477, "learning_rate": 3.216795435302413e-07, "loss": 0.2399, "step": 3387 }, { "epoch": 1.8605161998901703, "grad_norm": 0.4754473271036646, "learning_rate": 3.214082579867264e-07, "loss": 0.259, "step": 3388 }, { "epoch": 1.8610653487095004, "grad_norm": 0.45962946804898536, "learning_rate": 3.2113703272821816e-07, "loss": 0.2587, "step": 3389 }, { "epoch": 1.8616144975288305, "grad_norm": 0.4840917537347479, "learning_rate": 3.2086586784624487e-07, "loss": 0.283, "step": 3390 }, { "epoch": 1.8621636463481603, "grad_norm": 0.49234760335355526, "learning_rate": 3.205947634323147e-07, "loss": 0.2413, "step": 3391 }, { "epoch": 1.8627127951674904, "grad_norm": 0.453726410943816, "learning_rate": 3.2032371957791564e-07, "loss": 0.2055, "step": 3392 }, { "epoch": 1.8632619439868203, "grad_norm": 0.6519542418912082, "learning_rate": 3.200527363745149e-07, "loss": 0.2732, "step": 3393 }, { "epoch": 1.8638110928061504, "grad_norm": 0.45802618419018054, "learning_rate": 3.1978181391355916e-07, "loss": 0.2412, "step": 3394 }, { "epoch": 1.8643602416254805, "grad_norm": 0.5231012394047956, "learning_rate": 3.1951095228647516e-07, "loss": 0.2564, "step": 3395 }, { "epoch": 1.8649093904448106, "grad_norm": 0.6105060454516515, "learning_rate": 3.1924015158466837e-07, "loss": 0.2917, "step": 3396 }, { "epoch": 1.8654585392641407, "grad_norm": 0.520991227776756, "learning_rate": 3.189694118995242e-07, "loss": 0.2467, "step": 3397 }, { "epoch": 1.8660076880834706, "grad_norm": 0.4760909224745546, "learning_rate": 3.186987333224073e-07, "loss": 0.2304, "step": 3398 }, { "epoch": 1.8665568369028005, "grad_norm": 0.5315733801345761, "learning_rate": 3.1842811594466145e-07, "loss": 0.2676, "step": 3399 }, { "epoch": 1.8671059857221306, "grad_norm": 0.6339503831065317, "learning_rate": 3.181575598576106e-07, "loss": 0.2206, "step": 3400 }, { "epoch": 1.8671059857221306, "eval_loss": 0.3253461718559265, "eval_runtime": 18.6946, "eval_samples_per_second": 23.697, "eval_steps_per_second": 1.016, "step": 3400 }, { "epoch": 1.8676551345414607, "grad_norm": 0.621225604830136, "learning_rate": 3.1788706515255703e-07, "loss": 0.2168, "step": 3401 }, { "epoch": 1.8682042833607908, "grad_norm": 0.4668938231581029, "learning_rate": 3.1761663192078285e-07, "loss": 0.2342, "step": 3402 }, { "epoch": 1.868753432180121, "grad_norm": 0.48267350744936544, "learning_rate": 3.173462602535492e-07, "loss": 0.2439, "step": 3403 }, { "epoch": 1.869302580999451, "grad_norm": 0.4769968008709638, "learning_rate": 3.170759502420968e-07, "loss": 0.2246, "step": 3404 }, { "epoch": 1.869851729818781, "grad_norm": 0.561999375905934, "learning_rate": 3.1680570197764523e-07, "loss": 0.2241, "step": 3405 }, { "epoch": 1.8704008786381108, "grad_norm": 0.5396921113371931, "learning_rate": 3.165355155513934e-07, "loss": 0.196, "step": 3406 }, { "epoch": 1.8709500274574409, "grad_norm": 0.5467559691250319, "learning_rate": 3.16265391054519e-07, "loss": 0.2593, "step": 3407 }, { "epoch": 1.871499176276771, "grad_norm": 0.7266026109914191, "learning_rate": 3.159953285781792e-07, "loss": 0.3206, "step": 3408 }, { "epoch": 1.872048325096101, "grad_norm": 0.4534962872981325, "learning_rate": 3.1572532821351035e-07, "loss": 0.2563, "step": 3409 }, { "epoch": 1.8725974739154312, "grad_norm": 0.4942119231837779, "learning_rate": 3.1545539005162735e-07, "loss": 0.2551, "step": 3410 }, { "epoch": 1.873146622734761, "grad_norm": 0.5206176123358769, "learning_rate": 3.151855141836247e-07, "loss": 0.2449, "step": 3411 }, { "epoch": 1.8736957715540912, "grad_norm": 0.5132252752366697, "learning_rate": 3.149157007005752e-07, "loss": 0.2412, "step": 3412 }, { "epoch": 1.874244920373421, "grad_norm": 0.787143973000236, "learning_rate": 3.1464594969353115e-07, "loss": 0.2841, "step": 3413 }, { "epoch": 1.8747940691927512, "grad_norm": 0.5094105932039332, "learning_rate": 3.143762612535236e-07, "loss": 0.2616, "step": 3414 }, { "epoch": 1.8753432180120813, "grad_norm": 0.4942261865413558, "learning_rate": 3.141066354715625e-07, "loss": 0.2572, "step": 3415 }, { "epoch": 1.8758923668314114, "grad_norm": 0.5080046242839126, "learning_rate": 3.138370724386362e-07, "loss": 0.2582, "step": 3416 }, { "epoch": 1.8764415156507415, "grad_norm": 0.5377222566433009, "learning_rate": 3.13567572245713e-07, "loss": 0.2412, "step": 3417 }, { "epoch": 1.8769906644700713, "grad_norm": 0.4442819450254423, "learning_rate": 3.1329813498373886e-07, "loss": 0.2378, "step": 3418 }, { "epoch": 1.8775398132894014, "grad_norm": 0.46640323843178394, "learning_rate": 3.1302876074363896e-07, "loss": 0.25, "step": 3419 }, { "epoch": 1.8780889621087313, "grad_norm": 0.49314153111608217, "learning_rate": 3.127594496163172e-07, "loss": 0.2343, "step": 3420 }, { "epoch": 1.8786381109280614, "grad_norm": 0.5668384776222555, "learning_rate": 3.124902016926561e-07, "loss": 0.2524, "step": 3421 }, { "epoch": 1.8791872597473915, "grad_norm": 0.6377310032623316, "learning_rate": 3.122210170635171e-07, "loss": 0.2618, "step": 3422 }, { "epoch": 1.8797364085667216, "grad_norm": 0.5092031029825205, "learning_rate": 3.1195189581974004e-07, "loss": 0.2725, "step": 3423 }, { "epoch": 1.8802855573860517, "grad_norm": 0.5096082849109681, "learning_rate": 3.1168283805214326e-07, "loss": 0.2234, "step": 3424 }, { "epoch": 1.8808347062053816, "grad_norm": 0.4481000777424056, "learning_rate": 3.1141384385152383e-07, "loss": 0.2336, "step": 3425 }, { "epoch": 1.8813838550247117, "grad_norm": 0.6002745810595073, "learning_rate": 3.111449133086577e-07, "loss": 0.2937, "step": 3426 }, { "epoch": 1.8819330038440416, "grad_norm": 0.44636072584504, "learning_rate": 3.1087604651429876e-07, "loss": 0.2439, "step": 3427 }, { "epoch": 1.8824821526633717, "grad_norm": 0.5381264811098155, "learning_rate": 3.106072435591798e-07, "loss": 0.2427, "step": 3428 }, { "epoch": 1.8830313014827018, "grad_norm": 0.5793866095485689, "learning_rate": 3.103385045340118e-07, "loss": 0.2389, "step": 3429 }, { "epoch": 1.883580450302032, "grad_norm": 0.37894530138232296, "learning_rate": 3.100698295294842e-07, "loss": 0.2387, "step": 3430 }, { "epoch": 1.884129599121362, "grad_norm": 0.520801339899228, "learning_rate": 3.0980121863626506e-07, "loss": 0.3204, "step": 3431 }, { "epoch": 1.884678747940692, "grad_norm": 0.49529620861758034, "learning_rate": 3.095326719450007e-07, "loss": 0.2405, "step": 3432 }, { "epoch": 1.885227896760022, "grad_norm": 0.45988611331257506, "learning_rate": 3.092641895463157e-07, "loss": 0.2391, "step": 3433 }, { "epoch": 1.8857770455793519, "grad_norm": 0.4217670019251589, "learning_rate": 3.089957715308128e-07, "loss": 0.2176, "step": 3434 }, { "epoch": 1.886326194398682, "grad_norm": 0.5640913476568804, "learning_rate": 3.0872741798907337e-07, "loss": 0.2394, "step": 3435 }, { "epoch": 1.886875343218012, "grad_norm": 0.48773088253283076, "learning_rate": 3.084591290116569e-07, "loss": 0.2493, "step": 3436 }, { "epoch": 1.8874244920373422, "grad_norm": 0.6007741500051827, "learning_rate": 3.081909046891007e-07, "loss": 0.2425, "step": 3437 }, { "epoch": 1.8879736408566723, "grad_norm": 0.5142447056479005, "learning_rate": 3.0792274511192103e-07, "loss": 0.2222, "step": 3438 }, { "epoch": 1.8885227896760022, "grad_norm": 0.4078709312343764, "learning_rate": 3.0765465037061176e-07, "loss": 0.2398, "step": 3439 }, { "epoch": 1.8890719384953323, "grad_norm": 0.4681143045054727, "learning_rate": 3.0738662055564474e-07, "loss": 0.2735, "step": 3440 }, { "epoch": 1.8896210873146622, "grad_norm": 0.48117129035144396, "learning_rate": 3.071186557574705e-07, "loss": 0.239, "step": 3441 }, { "epoch": 1.8901702361339923, "grad_norm": 0.4733367001994018, "learning_rate": 3.068507560665168e-07, "loss": 0.2005, "step": 3442 }, { "epoch": 1.8907193849533224, "grad_norm": 0.4714859656673274, "learning_rate": 3.0658292157319047e-07, "loss": 0.2383, "step": 3443 }, { "epoch": 1.8912685337726525, "grad_norm": 0.4941764977268472, "learning_rate": 3.063151523678755e-07, "loss": 0.2693, "step": 3444 }, { "epoch": 1.8918176825919826, "grad_norm": 0.4687583054869288, "learning_rate": 3.060474485409342e-07, "loss": 0.2559, "step": 3445 }, { "epoch": 1.8923668314113125, "grad_norm": 0.39587052487787744, "learning_rate": 3.057798101827067e-07, "loss": 0.2273, "step": 3446 }, { "epoch": 1.8929159802306426, "grad_norm": 0.5401863754440832, "learning_rate": 3.0551223738351095e-07, "loss": 0.2551, "step": 3447 }, { "epoch": 1.8934651290499724, "grad_norm": 0.44288774422393146, "learning_rate": 3.0524473023364324e-07, "loss": 0.2147, "step": 3448 }, { "epoch": 1.8940142778693025, "grad_norm": 0.4692914808594869, "learning_rate": 3.0497728882337715e-07, "loss": 0.24, "step": 3449 }, { "epoch": 1.8945634266886326, "grad_norm": 0.4966494669074835, "learning_rate": 3.0470991324296445e-07, "loss": 0.2443, "step": 3450 }, { "epoch": 1.8951125755079627, "grad_norm": 0.44355694562598424, "learning_rate": 3.0444260358263427e-07, "loss": 0.2669, "step": 3451 }, { "epoch": 1.8956617243272929, "grad_norm": 0.45874816369881105, "learning_rate": 3.041753599325941e-07, "loss": 0.2717, "step": 3452 }, { "epoch": 1.8962108731466227, "grad_norm": 0.5612745481377038, "learning_rate": 3.039081823830286e-07, "loss": 0.2576, "step": 3453 }, { "epoch": 1.8967600219659526, "grad_norm": 0.4401127331815031, "learning_rate": 3.036410710241004e-07, "loss": 0.2378, "step": 3454 }, { "epoch": 1.8973091707852827, "grad_norm": 0.5590536605079881, "learning_rate": 3.033740259459501e-07, "loss": 0.2553, "step": 3455 }, { "epoch": 1.8978583196046128, "grad_norm": 0.3964795653782331, "learning_rate": 3.031070472386949e-07, "loss": 0.2223, "step": 3456 }, { "epoch": 1.898407468423943, "grad_norm": 0.5081343904334528, "learning_rate": 3.028401349924308e-07, "loss": 0.2579, "step": 3457 }, { "epoch": 1.898956617243273, "grad_norm": 0.6032386389083711, "learning_rate": 3.025732892972306e-07, "loss": 0.2557, "step": 3458 }, { "epoch": 1.8995057660626031, "grad_norm": 0.6850028431438505, "learning_rate": 3.0230651024314484e-07, "loss": 0.2739, "step": 3459 }, { "epoch": 1.900054914881933, "grad_norm": 0.491731791959141, "learning_rate": 3.0203979792020196e-07, "loss": 0.2309, "step": 3460 }, { "epoch": 1.900604063701263, "grad_norm": 0.5856933081034505, "learning_rate": 3.0177315241840736e-07, "loss": 0.2584, "step": 3461 }, { "epoch": 1.901153212520593, "grad_norm": 0.7204728664034864, "learning_rate": 3.0150657382774396e-07, "loss": 0.2644, "step": 3462 }, { "epoch": 1.901702361339923, "grad_norm": 0.4154287994693699, "learning_rate": 3.012400622381724e-07, "loss": 0.2168, "step": 3463 }, { "epoch": 1.9022515101592532, "grad_norm": 0.4574514949048822, "learning_rate": 3.0097361773963025e-07, "loss": 0.2398, "step": 3464 }, { "epoch": 1.9028006589785833, "grad_norm": 0.5492084644887225, "learning_rate": 3.00707240422033e-07, "loss": 0.2025, "step": 3465 }, { "epoch": 1.9033498077979132, "grad_norm": 0.385074751717564, "learning_rate": 3.004409303752731e-07, "loss": 0.2552, "step": 3466 }, { "epoch": 1.9038989566172433, "grad_norm": 0.5363834118827968, "learning_rate": 3.0017468768922036e-07, "loss": 0.2806, "step": 3467 }, { "epoch": 1.9044481054365732, "grad_norm": 0.5411701116909793, "learning_rate": 2.999085124537217e-07, "loss": 0.2507, "step": 3468 }, { "epoch": 1.9049972542559033, "grad_norm": 0.5117768984504683, "learning_rate": 2.9964240475860174e-07, "loss": 0.2471, "step": 3469 }, { "epoch": 1.9055464030752334, "grad_norm": 0.4404353668899281, "learning_rate": 2.99376364693662e-07, "loss": 0.2815, "step": 3470 }, { "epoch": 1.9060955518945635, "grad_norm": 0.4788918464678227, "learning_rate": 2.991103923486809e-07, "loss": 0.2333, "step": 3471 }, { "epoch": 1.9066447007138936, "grad_norm": 0.5785813725644967, "learning_rate": 2.988444878134148e-07, "loss": 0.2222, "step": 3472 }, { "epoch": 1.9071938495332235, "grad_norm": 0.4232710318241545, "learning_rate": 2.9857865117759607e-07, "loss": 0.2382, "step": 3473 }, { "epoch": 1.9077429983525536, "grad_norm": 0.52991427976696, "learning_rate": 2.983128825309353e-07, "loss": 0.2437, "step": 3474 }, { "epoch": 1.9082921471718834, "grad_norm": 0.5550804056179012, "learning_rate": 2.980471819631194e-07, "loss": 0.2688, "step": 3475 }, { "epoch": 1.9088412959912135, "grad_norm": 0.52100450679202, "learning_rate": 2.9778154956381246e-07, "loss": 0.2296, "step": 3476 }, { "epoch": 1.9093904448105437, "grad_norm": 0.5542332546738085, "learning_rate": 2.97515985422656e-07, "loss": 0.2807, "step": 3477 }, { "epoch": 1.9099395936298738, "grad_norm": 0.6044029928486673, "learning_rate": 2.9725048962926757e-07, "loss": 0.2589, "step": 3478 }, { "epoch": 1.9104887424492039, "grad_norm": 0.4866484505258523, "learning_rate": 2.969850622732426e-07, "loss": 0.2316, "step": 3479 }, { "epoch": 1.9110378912685337, "grad_norm": 0.4953313876800655, "learning_rate": 2.967197034441529e-07, "loss": 0.2546, "step": 3480 }, { "epoch": 1.9115870400878638, "grad_norm": 0.7121089140791472, "learning_rate": 2.964544132315473e-07, "loss": 0.2661, "step": 3481 }, { "epoch": 1.9121361889071937, "grad_norm": 0.43189198642144205, "learning_rate": 2.961891917249516e-07, "loss": 0.2404, "step": 3482 }, { "epoch": 1.9126853377265238, "grad_norm": 0.4650863471224219, "learning_rate": 2.959240390138683e-07, "loss": 0.2601, "step": 3483 }, { "epoch": 1.913234486545854, "grad_norm": 0.5929447563504545, "learning_rate": 2.9565895518777647e-07, "loss": 0.2323, "step": 3484 }, { "epoch": 1.913783635365184, "grad_norm": 0.40527071770482453, "learning_rate": 2.9539394033613216e-07, "loss": 0.2515, "step": 3485 }, { "epoch": 1.9143327841845141, "grad_norm": 0.8125714935134585, "learning_rate": 2.9512899454836826e-07, "loss": 0.292, "step": 3486 }, { "epoch": 1.914881933003844, "grad_norm": 0.6883562582271736, "learning_rate": 2.948641179138942e-07, "loss": 0.3109, "step": 3487 }, { "epoch": 1.9154310818231741, "grad_norm": 0.5018155614481865, "learning_rate": 2.9459931052209617e-07, "loss": 0.2455, "step": 3488 }, { "epoch": 1.915980230642504, "grad_norm": 0.43482533807172197, "learning_rate": 2.943345724623366e-07, "loss": 0.2647, "step": 3489 }, { "epoch": 1.916529379461834, "grad_norm": 0.5132415898751335, "learning_rate": 2.940699038239549e-07, "loss": 0.2127, "step": 3490 }, { "epoch": 1.9170785282811642, "grad_norm": 0.45603569157175095, "learning_rate": 2.938053046962673e-07, "loss": 0.255, "step": 3491 }, { "epoch": 1.9176276771004943, "grad_norm": 0.4516091899983879, "learning_rate": 2.9354077516856593e-07, "loss": 0.235, "step": 3492 }, { "epoch": 1.9181768259198244, "grad_norm": 0.5579624955390328, "learning_rate": 2.932763153301199e-07, "loss": 0.2356, "step": 3493 }, { "epoch": 1.9187259747391543, "grad_norm": 0.46280204989417323, "learning_rate": 2.930119252701748e-07, "loss": 0.2702, "step": 3494 }, { "epoch": 1.9192751235584844, "grad_norm": 0.42000327519408087, "learning_rate": 2.927476050779522e-07, "loss": 0.2864, "step": 3495 }, { "epoch": 1.9198242723778143, "grad_norm": 0.5388514746794255, "learning_rate": 2.9248335484265064e-07, "loss": 0.2892, "step": 3496 }, { "epoch": 1.9203734211971444, "grad_norm": 0.5100549631576503, "learning_rate": 2.922191746534448e-07, "loss": 0.2065, "step": 3497 }, { "epoch": 1.9209225700164745, "grad_norm": 0.5544308013162146, "learning_rate": 2.9195506459948584e-07, "loss": 0.2872, "step": 3498 }, { "epoch": 1.9214717188358046, "grad_norm": 0.482773426027997, "learning_rate": 2.9169102476990117e-07, "loss": 0.241, "step": 3499 }, { "epoch": 1.9220208676551347, "grad_norm": 0.5885970173162935, "learning_rate": 2.9142705525379417e-07, "loss": 0.2451, "step": 3500 }, { "epoch": 1.9225700164744646, "grad_norm": 0.4753095417820037, "learning_rate": 2.9116315614024524e-07, "loss": 0.2964, "step": 3501 }, { "epoch": 1.9231191652937945, "grad_norm": 0.4821001546327114, "learning_rate": 2.9089932751831046e-07, "loss": 0.2287, "step": 3502 }, { "epoch": 1.9236683141131246, "grad_norm": 0.5030692832079965, "learning_rate": 2.906355694770222e-07, "loss": 0.236, "step": 3503 }, { "epoch": 1.9242174629324547, "grad_norm": 0.8496379344842603, "learning_rate": 2.903718821053891e-07, "loss": 0.2729, "step": 3504 }, { "epoch": 1.9247666117517848, "grad_norm": 0.5785599421220556, "learning_rate": 2.901082654923962e-07, "loss": 0.2235, "step": 3505 }, { "epoch": 1.9253157605711149, "grad_norm": 0.9781381882079483, "learning_rate": 2.898447197270041e-07, "loss": 0.2572, "step": 3506 }, { "epoch": 1.925864909390445, "grad_norm": 0.5339835928687925, "learning_rate": 2.8958124489814984e-07, "loss": 0.2758, "step": 3507 }, { "epoch": 1.9264140582097748, "grad_norm": 0.48705531471153135, "learning_rate": 2.893178410947466e-07, "loss": 0.2161, "step": 3508 }, { "epoch": 1.9269632070291047, "grad_norm": 0.609102743577628, "learning_rate": 2.8905450840568315e-07, "loss": 0.2846, "step": 3509 }, { "epoch": 1.9275123558484348, "grad_norm": 0.46646050526114474, "learning_rate": 2.8879124691982495e-07, "loss": 0.2427, "step": 3510 }, { "epoch": 1.928061504667765, "grad_norm": 0.5085058742071942, "learning_rate": 2.885280567260127e-07, "loss": 0.2391, "step": 3511 }, { "epoch": 1.928610653487095, "grad_norm": 0.47156540177633205, "learning_rate": 2.8826493791306385e-07, "loss": 0.2462, "step": 3512 }, { "epoch": 1.9291598023064251, "grad_norm": 0.4961401790197671, "learning_rate": 2.880018905697707e-07, "loss": 0.2497, "step": 3513 }, { "epoch": 1.9297089511257552, "grad_norm": 0.5758535825905837, "learning_rate": 2.8773891478490243e-07, "loss": 0.2568, "step": 3514 }, { "epoch": 1.9302580999450851, "grad_norm": 0.539416186952519, "learning_rate": 2.8747601064720375e-07, "loss": 0.259, "step": 3515 }, { "epoch": 1.930807248764415, "grad_norm": 0.4670349550398961, "learning_rate": 2.8721317824539506e-07, "loss": 0.26, "step": 3516 }, { "epoch": 1.931356397583745, "grad_norm": 0.46194109371862896, "learning_rate": 2.869504176681723e-07, "loss": 0.2583, "step": 3517 }, { "epoch": 1.9319055464030752, "grad_norm": 0.49477642024608054, "learning_rate": 2.866877290042077e-07, "loss": 0.288, "step": 3518 }, { "epoch": 1.9324546952224053, "grad_norm": 0.49374123699041367, "learning_rate": 2.864251123421493e-07, "loss": 0.2093, "step": 3519 }, { "epoch": 1.9330038440417354, "grad_norm": 0.41737011331805424, "learning_rate": 2.8616256777062005e-07, "loss": 0.2337, "step": 3520 }, { "epoch": 1.9335529928610653, "grad_norm": 0.48499202992710083, "learning_rate": 2.8590009537821944e-07, "loss": 0.278, "step": 3521 }, { "epoch": 1.9341021416803954, "grad_norm": 0.5005301142547031, "learning_rate": 2.856376952535221e-07, "loss": 0.2182, "step": 3522 }, { "epoch": 1.9346512904997253, "grad_norm": 0.46972440779576397, "learning_rate": 2.8537536748507825e-07, "loss": 0.185, "step": 3523 }, { "epoch": 1.9352004393190554, "grad_norm": 0.4960262447388615, "learning_rate": 2.8511311216141394e-07, "loss": 0.2252, "step": 3524 }, { "epoch": 1.9357495881383855, "grad_norm": 0.4767517721703788, "learning_rate": 2.8485092937103097e-07, "loss": 0.2459, "step": 3525 }, { "epoch": 1.9362987369577156, "grad_norm": 0.5033764727629504, "learning_rate": 2.845888192024059e-07, "loss": 0.2443, "step": 3526 }, { "epoch": 1.9368478857770457, "grad_norm": 0.41232197287742584, "learning_rate": 2.8432678174399174e-07, "loss": 0.2736, "step": 3527 }, { "epoch": 1.9373970345963756, "grad_norm": 0.5101825855825676, "learning_rate": 2.8406481708421595e-07, "loss": 0.2533, "step": 3528 }, { "epoch": 1.9379461834157057, "grad_norm": 0.5581123936733484, "learning_rate": 2.8380292531148245e-07, "loss": 0.2855, "step": 3529 }, { "epoch": 1.9384953322350356, "grad_norm": 0.5321908556518186, "learning_rate": 2.8354110651416975e-07, "loss": 0.2003, "step": 3530 }, { "epoch": 1.9390444810543657, "grad_norm": 0.4039345443259138, "learning_rate": 2.8327936078063196e-07, "loss": 0.2502, "step": 3531 }, { "epoch": 1.9395936298736958, "grad_norm": 0.5358782010635976, "learning_rate": 2.8301768819919915e-07, "loss": 0.2571, "step": 3532 }, { "epoch": 1.9401427786930259, "grad_norm": 0.5140520490582524, "learning_rate": 2.8275608885817574e-07, "loss": 0.2803, "step": 3533 }, { "epoch": 1.940691927512356, "grad_norm": 0.4766739576154817, "learning_rate": 2.8249456284584177e-07, "loss": 0.2745, "step": 3534 }, { "epoch": 1.9412410763316859, "grad_norm": 0.49560530532484964, "learning_rate": 2.822331102504529e-07, "loss": 0.2792, "step": 3535 }, { "epoch": 1.941790225151016, "grad_norm": 0.6728348026040153, "learning_rate": 2.819717311602398e-07, "loss": 0.2834, "step": 3536 }, { "epoch": 1.9423393739703458, "grad_norm": 0.44623214683764584, "learning_rate": 2.8171042566340796e-07, "loss": 0.2562, "step": 3537 }, { "epoch": 1.942888522789676, "grad_norm": 0.5329961545712149, "learning_rate": 2.814491938481388e-07, "loss": 0.2642, "step": 3538 }, { "epoch": 1.943437671609006, "grad_norm": 0.5382481918275226, "learning_rate": 2.8118803580258813e-07, "loss": 0.226, "step": 3539 }, { "epoch": 1.9439868204283361, "grad_norm": 0.4294502804872192, "learning_rate": 2.8092695161488707e-07, "loss": 0.23, "step": 3540 }, { "epoch": 1.9445359692476663, "grad_norm": 0.5181966375096064, "learning_rate": 2.80665941373142e-07, "loss": 0.1869, "step": 3541 }, { "epoch": 1.9450851180669961, "grad_norm": 0.6269697155225653, "learning_rate": 2.8040500516543463e-07, "loss": 0.2309, "step": 3542 }, { "epoch": 1.9456342668863262, "grad_norm": 0.6137518663724708, "learning_rate": 2.8014414307982106e-07, "loss": 0.2249, "step": 3543 }, { "epoch": 1.9461834157056561, "grad_norm": 0.5313339834262895, "learning_rate": 2.798833552043323e-07, "loss": 0.2905, "step": 3544 }, { "epoch": 1.9467325645249862, "grad_norm": 0.47623669426324017, "learning_rate": 2.796226416269749e-07, "loss": 0.2482, "step": 3545 }, { "epoch": 1.9472817133443163, "grad_norm": 0.4970381567156964, "learning_rate": 2.793620024357304e-07, "loss": 0.2247, "step": 3546 }, { "epoch": 1.9478308621636464, "grad_norm": 0.5149144769041067, "learning_rate": 2.791014377185545e-07, "loss": 0.2254, "step": 3547 }, { "epoch": 1.9483800109829765, "grad_norm": 0.4714395363889867, "learning_rate": 2.788409475633782e-07, "loss": 0.2171, "step": 3548 }, { "epoch": 1.9489291598023064, "grad_norm": 0.5546220391096439, "learning_rate": 2.7858053205810775e-07, "loss": 0.2507, "step": 3549 }, { "epoch": 1.9494783086216365, "grad_norm": 0.5715019993948425, "learning_rate": 2.7832019129062354e-07, "loss": 0.2274, "step": 3550 }, { "epoch": 1.9500274574409664, "grad_norm": 0.5221463850932294, "learning_rate": 2.780599253487809e-07, "loss": 0.248, "step": 3551 }, { "epoch": 1.9505766062602965, "grad_norm": 0.6335872984457526, "learning_rate": 2.7779973432040985e-07, "loss": 0.2545, "step": 3552 }, { "epoch": 1.9511257550796266, "grad_norm": 0.516797373127701, "learning_rate": 2.775396182933158e-07, "loss": 0.2929, "step": 3553 }, { "epoch": 1.9516749038989567, "grad_norm": 0.508830977419197, "learning_rate": 2.7727957735527797e-07, "loss": 0.2417, "step": 3554 }, { "epoch": 1.9522240527182868, "grad_norm": 0.5296110612524638, "learning_rate": 2.770196115940504e-07, "loss": 0.2285, "step": 3555 }, { "epoch": 1.9527732015376167, "grad_norm": 0.4311907075317863, "learning_rate": 2.7675972109736246e-07, "loss": 0.2235, "step": 3556 }, { "epoch": 1.9533223503569466, "grad_norm": 0.5738238851652983, "learning_rate": 2.7649990595291714e-07, "loss": 0.3177, "step": 3557 }, { "epoch": 1.9538714991762767, "grad_norm": 0.5212528638403917, "learning_rate": 2.762401662483927e-07, "loss": 0.2731, "step": 3558 }, { "epoch": 1.9544206479956068, "grad_norm": 0.5662470614689857, "learning_rate": 2.759805020714419e-07, "loss": 0.2553, "step": 3559 }, { "epoch": 1.9549697968149369, "grad_norm": 0.478234869839864, "learning_rate": 2.7572091350969166e-07, "loss": 0.2559, "step": 3560 }, { "epoch": 1.955518945634267, "grad_norm": 0.4569328627608001, "learning_rate": 2.754614006507433e-07, "loss": 0.2507, "step": 3561 }, { "epoch": 1.956068094453597, "grad_norm": 0.49520417970736014, "learning_rate": 2.7520196358217316e-07, "loss": 0.2685, "step": 3562 }, { "epoch": 1.956617243272927, "grad_norm": 0.6050404354330664, "learning_rate": 2.749426023915318e-07, "loss": 0.2357, "step": 3563 }, { "epoch": 1.9571663920922568, "grad_norm": 0.5574042803861196, "learning_rate": 2.746833171663437e-07, "loss": 0.2732, "step": 3564 }, { "epoch": 1.957715540911587, "grad_norm": 0.3887222643172956, "learning_rate": 2.744241079941085e-07, "loss": 0.2374, "step": 3565 }, { "epoch": 1.958264689730917, "grad_norm": 0.4481860368134919, "learning_rate": 2.741649749622992e-07, "loss": 0.2913, "step": 3566 }, { "epoch": 1.9588138385502472, "grad_norm": 0.5775864143543111, "learning_rate": 2.7390591815836426e-07, "loss": 0.2871, "step": 3567 }, { "epoch": 1.9593629873695773, "grad_norm": 0.4395967867609971, "learning_rate": 2.736469376697253e-07, "loss": 0.2116, "step": 3568 }, { "epoch": 1.9599121361889071, "grad_norm": 0.47823803480965066, "learning_rate": 2.733880335837789e-07, "loss": 0.2255, "step": 3569 }, { "epoch": 1.9604612850082372, "grad_norm": 0.5883628932934369, "learning_rate": 2.7312920598789584e-07, "loss": 0.2271, "step": 3570 }, { "epoch": 1.9610104338275671, "grad_norm": 0.507651767944914, "learning_rate": 2.728704549694207e-07, "loss": 0.2408, "step": 3571 }, { "epoch": 1.9615595826468972, "grad_norm": 0.5376614816173333, "learning_rate": 2.7261178061567225e-07, "loss": 0.2269, "step": 3572 }, { "epoch": 1.9621087314662273, "grad_norm": 0.49546209558327126, "learning_rate": 2.723531830139439e-07, "loss": 0.2244, "step": 3573 }, { "epoch": 1.9626578802855574, "grad_norm": 0.7384842611794925, "learning_rate": 2.7209466225150247e-07, "loss": 0.2602, "step": 3574 }, { "epoch": 1.9632070291048875, "grad_norm": 0.5869937728032119, "learning_rate": 2.718362184155894e-07, "loss": 0.2227, "step": 3575 }, { "epoch": 1.9637561779242174, "grad_norm": 0.5177536804884671, "learning_rate": 2.715778515934201e-07, "loss": 0.2656, "step": 3576 }, { "epoch": 1.9643053267435475, "grad_norm": 0.5577516716783074, "learning_rate": 2.713195618721837e-07, "loss": 0.257, "step": 3577 }, { "epoch": 1.9648544755628774, "grad_norm": 0.5940239868467071, "learning_rate": 2.710613493390432e-07, "loss": 0.2235, "step": 3578 }, { "epoch": 1.9654036243822075, "grad_norm": 0.5068552014319503, "learning_rate": 2.7080321408113615e-07, "loss": 0.2024, "step": 3579 }, { "epoch": 1.9659527732015376, "grad_norm": 0.49311385759555865, "learning_rate": 2.7054515618557375e-07, "loss": 0.2476, "step": 3580 }, { "epoch": 1.9665019220208677, "grad_norm": 0.4699113785027886, "learning_rate": 2.702871757394407e-07, "loss": 0.2094, "step": 3581 }, { "epoch": 1.9670510708401978, "grad_norm": 0.47229523733660006, "learning_rate": 2.700292728297963e-07, "loss": 0.2264, "step": 3582 }, { "epoch": 1.9676002196595277, "grad_norm": 0.4694630373614784, "learning_rate": 2.697714475436729e-07, "loss": 0.255, "step": 3583 }, { "epoch": 1.9681493684788578, "grad_norm": 0.46771376675508464, "learning_rate": 2.695136999680773e-07, "loss": 0.2307, "step": 3584 }, { "epoch": 1.9686985172981877, "grad_norm": 0.44396687966745196, "learning_rate": 2.6925603018998966e-07, "loss": 0.2485, "step": 3585 }, { "epoch": 1.9692476661175178, "grad_norm": 0.44524074931127644, "learning_rate": 2.6899843829636395e-07, "loss": 0.2739, "step": 3586 }, { "epoch": 1.9697968149368479, "grad_norm": 0.589200210770938, "learning_rate": 2.6874092437412855e-07, "loss": 0.2931, "step": 3587 }, { "epoch": 1.970345963756178, "grad_norm": 0.47753013862159893, "learning_rate": 2.68483488510184e-07, "loss": 0.2236, "step": 3588 }, { "epoch": 1.970895112575508, "grad_norm": 0.5227814670826163, "learning_rate": 2.6822613079140597e-07, "loss": 0.2315, "step": 3589 }, { "epoch": 1.971444261394838, "grad_norm": 0.4266441980724434, "learning_rate": 2.679688513046433e-07, "loss": 0.2207, "step": 3590 }, { "epoch": 1.971993410214168, "grad_norm": 0.4613314865314026, "learning_rate": 2.6771165013671785e-07, "loss": 0.2426, "step": 3591 }, { "epoch": 1.972542559033498, "grad_norm": 0.7017836692745851, "learning_rate": 2.67454527374426e-07, "loss": 0.2831, "step": 3592 }, { "epoch": 1.973091707852828, "grad_norm": 0.6804160652458832, "learning_rate": 2.6719748310453714e-07, "loss": 0.25, "step": 3593 }, { "epoch": 1.9736408566721582, "grad_norm": 0.5091180316829615, "learning_rate": 2.669405174137942e-07, "loss": 0.269, "step": 3594 }, { "epoch": 1.9741900054914883, "grad_norm": 0.48702463995146106, "learning_rate": 2.666836303889134e-07, "loss": 0.2365, "step": 3595 }, { "epoch": 1.9747391543108184, "grad_norm": 0.44661616246263575, "learning_rate": 2.664268221165848e-07, "loss": 0.2264, "step": 3596 }, { "epoch": 1.9752883031301482, "grad_norm": 0.5647887117299537, "learning_rate": 2.661700926834719e-07, "loss": 0.2458, "step": 3597 }, { "epoch": 1.9758374519494784, "grad_norm": 0.7156215820950582, "learning_rate": 2.6591344217621136e-07, "loss": 0.2926, "step": 3598 }, { "epoch": 1.9763866007688082, "grad_norm": 0.5652838793747974, "learning_rate": 2.6565687068141306e-07, "loss": 0.2483, "step": 3599 }, { "epoch": 1.9769357495881383, "grad_norm": 0.43264227965890273, "learning_rate": 2.654003782856605e-07, "loss": 0.2261, "step": 3600 }, { "epoch": 1.9769357495881383, "eval_loss": 0.32394054532051086, "eval_runtime": 18.6725, "eval_samples_per_second": 23.725, "eval_steps_per_second": 1.018, "step": 3600 }, { "epoch": 1.9774848984074684, "grad_norm": 0.6006732363553414, "learning_rate": 2.651439650755107e-07, "loss": 0.2306, "step": 3601 }, { "epoch": 1.9780340472267985, "grad_norm": 0.43265428377104304, "learning_rate": 2.6488763113749316e-07, "loss": 0.2423, "step": 3602 }, { "epoch": 1.9785831960461286, "grad_norm": 0.4812925459313404, "learning_rate": 2.646313765581116e-07, "loss": 0.2558, "step": 3603 }, { "epoch": 1.9791323448654585, "grad_norm": 0.9687244444160324, "learning_rate": 2.643752014238427e-07, "loss": 0.3615, "step": 3604 }, { "epoch": 1.9796814936847886, "grad_norm": 0.5550788339791402, "learning_rate": 2.641191058211353e-07, "loss": 0.288, "step": 3605 }, { "epoch": 1.9802306425041185, "grad_norm": 0.47120334809379805, "learning_rate": 2.6386308983641265e-07, "loss": 0.2641, "step": 3606 }, { "epoch": 1.9807797913234486, "grad_norm": 0.5795971406042949, "learning_rate": 2.63607153556071e-07, "loss": 0.2564, "step": 3607 }, { "epoch": 1.9813289401427787, "grad_norm": 0.41810703743001404, "learning_rate": 2.6335129706647904e-07, "loss": 0.2483, "step": 3608 }, { "epoch": 1.9818780889621088, "grad_norm": 0.5061621132036708, "learning_rate": 2.630955204539792e-07, "loss": 0.236, "step": 3609 }, { "epoch": 1.982427237781439, "grad_norm": 0.820991755189141, "learning_rate": 2.628398238048862e-07, "loss": 0.2757, "step": 3610 }, { "epoch": 1.9829763866007688, "grad_norm": 0.5066048790646377, "learning_rate": 2.625842072054889e-07, "loss": 0.2445, "step": 3611 }, { "epoch": 1.9835255354200987, "grad_norm": 0.4935582575850033, "learning_rate": 2.623286707420479e-07, "loss": 0.2539, "step": 3612 }, { "epoch": 1.9840746842394288, "grad_norm": 0.5648924305374863, "learning_rate": 2.6207321450079757e-07, "loss": 0.2352, "step": 3613 }, { "epoch": 1.984623833058759, "grad_norm": 0.45576977983877387, "learning_rate": 2.6181783856794516e-07, "loss": 0.228, "step": 3614 }, { "epoch": 1.985172981878089, "grad_norm": 0.44968516032300915, "learning_rate": 2.6156254302967043e-07, "loss": 0.2313, "step": 3615 }, { "epoch": 1.985722130697419, "grad_norm": 0.42093194093827885, "learning_rate": 2.6130732797212605e-07, "loss": 0.2325, "step": 3616 }, { "epoch": 1.9862712795167492, "grad_norm": 0.4888210529399285, "learning_rate": 2.61052193481438e-07, "loss": 0.2778, "step": 3617 }, { "epoch": 1.986820428336079, "grad_norm": 0.500435220620874, "learning_rate": 2.6079713964370476e-07, "loss": 0.2284, "step": 3618 }, { "epoch": 1.987369577155409, "grad_norm": 0.5204022908253535, "learning_rate": 2.605421665449974e-07, "loss": 0.2354, "step": 3619 }, { "epoch": 1.987918725974739, "grad_norm": 0.5594751912788976, "learning_rate": 2.602872742713602e-07, "loss": 0.1998, "step": 3620 }, { "epoch": 1.9884678747940692, "grad_norm": 0.48481399736334296, "learning_rate": 2.600324629088098e-07, "loss": 0.2534, "step": 3621 }, { "epoch": 1.9890170236133993, "grad_norm": 0.5950512650619166, "learning_rate": 2.597777325433354e-07, "loss": 0.2515, "step": 3622 }, { "epoch": 1.9895661724327294, "grad_norm": 0.4864217013229987, "learning_rate": 2.5952308326089933e-07, "loss": 0.2537, "step": 3623 }, { "epoch": 1.9901153212520593, "grad_norm": 0.5592803340178949, "learning_rate": 2.592685151474366e-07, "loss": 0.2615, "step": 3624 }, { "epoch": 1.9906644700713894, "grad_norm": 0.5036800182919009, "learning_rate": 2.5901402828885405e-07, "loss": 0.2597, "step": 3625 }, { "epoch": 1.9912136188907192, "grad_norm": 0.510648531093211, "learning_rate": 2.5875962277103215e-07, "loss": 0.2357, "step": 3626 }, { "epoch": 1.9917627677100493, "grad_norm": 0.4747778790674222, "learning_rate": 2.5850529867982287e-07, "loss": 0.2186, "step": 3627 }, { "epoch": 1.9923119165293794, "grad_norm": 0.5567352724280351, "learning_rate": 2.582510561010517e-07, "loss": 0.2797, "step": 3628 }, { "epoch": 1.9928610653487095, "grad_norm": 0.45626220406813023, "learning_rate": 2.5799689512051566e-07, "loss": 0.2197, "step": 3629 }, { "epoch": 1.9934102141680397, "grad_norm": 0.4986417900459531, "learning_rate": 2.5774281582398505e-07, "loss": 0.2151, "step": 3630 }, { "epoch": 1.9939593629873695, "grad_norm": 0.5717684346993673, "learning_rate": 2.574888182972024e-07, "loss": 0.2311, "step": 3631 }, { "epoch": 1.9945085118066996, "grad_norm": 0.47070901347542604, "learning_rate": 2.5723490262588226e-07, "loss": 0.2663, "step": 3632 }, { "epoch": 1.9950576606260295, "grad_norm": 0.5017313535995244, "learning_rate": 2.569810688957117e-07, "loss": 0.2418, "step": 3633 }, { "epoch": 1.9956068094453596, "grad_norm": 0.6641539261341836, "learning_rate": 2.567273171923505e-07, "loss": 0.2822, "step": 3634 }, { "epoch": 1.9961559582646897, "grad_norm": 0.5058400435451753, "learning_rate": 2.5647364760143046e-07, "loss": 0.247, "step": 3635 }, { "epoch": 1.9967051070840198, "grad_norm": 0.5190345756668289, "learning_rate": 2.5622006020855556e-07, "loss": 0.2404, "step": 3636 }, { "epoch": 1.99725425590335, "grad_norm": 0.4651825098744332, "learning_rate": 2.559665550993027e-07, "loss": 0.2277, "step": 3637 }, { "epoch": 1.9978034047226798, "grad_norm": 0.5409533537145913, "learning_rate": 2.5571313235922e-07, "loss": 0.2398, "step": 3638 }, { "epoch": 1.99835255354201, "grad_norm": 0.48219384988012653, "learning_rate": 2.554597920738282e-07, "loss": 0.2437, "step": 3639 }, { "epoch": 1.9989017023613398, "grad_norm": 0.446420724986946, "learning_rate": 2.5520653432862067e-07, "loss": 0.2465, "step": 3640 }, { "epoch": 1.99945085118067, "grad_norm": 0.560738757985528, "learning_rate": 2.549533592090627e-07, "loss": 0.2333, "step": 3641 }, { "epoch": 2.0, "grad_norm": 0.522990454926645, "learning_rate": 2.547002668005913e-07, "loss": 0.3024, "step": 3642 }, { "epoch": 2.00054914881933, "grad_norm": 0.4913238742866613, "learning_rate": 2.544472571886156e-07, "loss": 0.2508, "step": 3643 }, { "epoch": 2.00109829763866, "grad_norm": 0.5890426877099678, "learning_rate": 2.541943304585173e-07, "loss": 0.2331, "step": 3644 }, { "epoch": 2.0016474464579903, "grad_norm": 0.5080251643304994, "learning_rate": 2.5394148669565e-07, "loss": 0.2173, "step": 3645 }, { "epoch": 2.00219659527732, "grad_norm": 0.44952741736715174, "learning_rate": 2.5368872598533884e-07, "loss": 0.2295, "step": 3646 }, { "epoch": 2.00274574409665, "grad_norm": 0.524021575698407, "learning_rate": 2.534360484128815e-07, "loss": 0.2506, "step": 3647 }, { "epoch": 2.00329489291598, "grad_norm": 0.5158759558335935, "learning_rate": 2.531834540635473e-07, "loss": 0.2503, "step": 3648 }, { "epoch": 2.0038440417353103, "grad_norm": 0.5547007620685842, "learning_rate": 2.5293094302257757e-07, "loss": 0.2736, "step": 3649 }, { "epoch": 2.0043931905546404, "grad_norm": 0.5083564571381022, "learning_rate": 2.5267851537518517e-07, "loss": 0.2623, "step": 3650 }, { "epoch": 2.0049423393739705, "grad_norm": 0.4403522819862046, "learning_rate": 2.524261712065553e-07, "loss": 0.2615, "step": 3651 }, { "epoch": 2.0054914881933006, "grad_norm": 0.4358984123669056, "learning_rate": 2.5217391060184514e-07, "loss": 0.2255, "step": 3652 }, { "epoch": 2.0060406370126302, "grad_norm": 0.45244034398901245, "learning_rate": 2.5192173364618305e-07, "loss": 0.2241, "step": 3653 }, { "epoch": 2.0065897858319603, "grad_norm": 0.4788176702233535, "learning_rate": 2.5166964042466933e-07, "loss": 0.268, "step": 3654 }, { "epoch": 2.0071389346512905, "grad_norm": 0.4612233064431962, "learning_rate": 2.514176310223765e-07, "loss": 0.2261, "step": 3655 }, { "epoch": 2.0076880834706206, "grad_norm": 0.5659295130333686, "learning_rate": 2.5116570552434815e-07, "loss": 0.2846, "step": 3656 }, { "epoch": 2.0082372322899507, "grad_norm": 0.4493441227221307, "learning_rate": 2.5091386401559986e-07, "loss": 0.2796, "step": 3657 }, { "epoch": 2.0087863811092808, "grad_norm": 0.5306724533364384, "learning_rate": 2.5066210658111925e-07, "loss": 0.2283, "step": 3658 }, { "epoch": 2.009335529928611, "grad_norm": 0.4661457411455569, "learning_rate": 2.504104333058649e-07, "loss": 0.2593, "step": 3659 }, { "epoch": 2.0098846787479405, "grad_norm": 0.4706345804215834, "learning_rate": 2.50158844274767e-07, "loss": 0.222, "step": 3660 }, { "epoch": 2.0104338275672706, "grad_norm": 0.36380555660343594, "learning_rate": 2.499073395727279e-07, "loss": 0.2238, "step": 3661 }, { "epoch": 2.0109829763866007, "grad_norm": 0.514270903659223, "learning_rate": 2.4965591928462133e-07, "loss": 0.2471, "step": 3662 }, { "epoch": 2.011532125205931, "grad_norm": 0.4595319827290595, "learning_rate": 2.49404583495292e-07, "loss": 0.2211, "step": 3663 }, { "epoch": 2.012081274025261, "grad_norm": 0.5108482565124504, "learning_rate": 2.491533322895568e-07, "loss": 0.2179, "step": 3664 }, { "epoch": 2.012630422844591, "grad_norm": 0.4418248432507296, "learning_rate": 2.4890216575220346e-07, "loss": 0.1961, "step": 3665 }, { "epoch": 2.013179571663921, "grad_norm": 0.5602175894663437, "learning_rate": 2.486510839679917e-07, "loss": 0.2121, "step": 3666 }, { "epoch": 2.013728720483251, "grad_norm": 0.479482566881761, "learning_rate": 2.484000870216521e-07, "loss": 0.2356, "step": 3667 }, { "epoch": 2.014277869302581, "grad_norm": 0.5451192328961294, "learning_rate": 2.48149174997887e-07, "loss": 0.2199, "step": 3668 }, { "epoch": 2.014827018121911, "grad_norm": 0.4751755961725391, "learning_rate": 2.4789834798137023e-07, "loss": 0.2187, "step": 3669 }, { "epoch": 2.015376166941241, "grad_norm": 0.4247030647692184, "learning_rate": 2.476476060567464e-07, "loss": 0.2448, "step": 3670 }, { "epoch": 2.015925315760571, "grad_norm": 0.4889571587715578, "learning_rate": 2.4739694930863154e-07, "loss": 0.2401, "step": 3671 }, { "epoch": 2.0164744645799013, "grad_norm": 0.5445404840598219, "learning_rate": 2.471463778216134e-07, "loss": 0.2522, "step": 3672 }, { "epoch": 2.017023613399231, "grad_norm": 0.4251242325513694, "learning_rate": 2.4689589168025025e-07, "loss": 0.2479, "step": 3673 }, { "epoch": 2.017572762218561, "grad_norm": 0.49828441205774543, "learning_rate": 2.466454909690722e-07, "loss": 0.2422, "step": 3674 }, { "epoch": 2.018121911037891, "grad_norm": 0.42696144251965235, "learning_rate": 2.463951757725804e-07, "loss": 0.2272, "step": 3675 }, { "epoch": 2.0186710598572213, "grad_norm": 0.5634285239560771, "learning_rate": 2.461449461752468e-07, "loss": 0.2193, "step": 3676 }, { "epoch": 2.0192202086765514, "grad_norm": 0.5673945137309203, "learning_rate": 2.458948022615144e-07, "loss": 0.2329, "step": 3677 }, { "epoch": 2.0197693574958815, "grad_norm": 0.5791208428472995, "learning_rate": 2.456447441157979e-07, "loss": 0.2709, "step": 3678 }, { "epoch": 2.0203185063152116, "grad_norm": 0.521930087551579, "learning_rate": 2.453947718224829e-07, "loss": 0.249, "step": 3679 }, { "epoch": 2.0208676551345413, "grad_norm": 0.4568367999451592, "learning_rate": 2.4514488546592537e-07, "loss": 0.2173, "step": 3680 }, { "epoch": 2.0214168039538714, "grad_norm": 0.4385463580231092, "learning_rate": 2.448950851304531e-07, "loss": 0.1912, "step": 3681 }, { "epoch": 2.0219659527732015, "grad_norm": 0.5235594459331956, "learning_rate": 2.446453709003643e-07, "loss": 0.2277, "step": 3682 }, { "epoch": 2.0225151015925316, "grad_norm": 0.45791817816673974, "learning_rate": 2.443957428599285e-07, "loss": 0.2309, "step": 3683 }, { "epoch": 2.0230642504118617, "grad_norm": 0.4945584772320004, "learning_rate": 2.441462010933857e-07, "loss": 0.2668, "step": 3684 }, { "epoch": 2.0236133992311918, "grad_norm": 0.499310769481328, "learning_rate": 2.4389674568494716e-07, "loss": 0.2126, "step": 3685 }, { "epoch": 2.024162548050522, "grad_norm": 0.524986527507172, "learning_rate": 2.436473767187954e-07, "loss": 0.2744, "step": 3686 }, { "epoch": 2.0247116968698515, "grad_norm": 0.47052143792597523, "learning_rate": 2.433980942790824e-07, "loss": 0.2469, "step": 3687 }, { "epoch": 2.0252608456891816, "grad_norm": 0.5317884135434877, "learning_rate": 2.431488984499322e-07, "loss": 0.2246, "step": 3688 }, { "epoch": 2.0258099945085117, "grad_norm": 0.5329157293653187, "learning_rate": 2.428997893154393e-07, "loss": 0.2407, "step": 3689 }, { "epoch": 2.026359143327842, "grad_norm": 0.5024132645160868, "learning_rate": 2.4265076695966873e-07, "loss": 0.3134, "step": 3690 }, { "epoch": 2.026908292147172, "grad_norm": 0.5931602083943923, "learning_rate": 2.4240183146665636e-07, "loss": 0.2293, "step": 3691 }, { "epoch": 2.027457440966502, "grad_norm": 0.5750470369442647, "learning_rate": 2.42152982920409e-07, "loss": 0.2907, "step": 3692 }, { "epoch": 2.028006589785832, "grad_norm": 0.5384362284972389, "learning_rate": 2.4190422140490353e-07, "loss": 0.289, "step": 3693 }, { "epoch": 2.028555738605162, "grad_norm": 0.6641661829926486, "learning_rate": 2.4165554700408784e-07, "loss": 0.236, "step": 3694 }, { "epoch": 2.029104887424492, "grad_norm": 0.5166578794021188, "learning_rate": 2.414069598018804e-07, "loss": 0.2813, "step": 3695 }, { "epoch": 2.029654036243822, "grad_norm": 0.5265678694848518, "learning_rate": 2.4115845988217057e-07, "loss": 0.2142, "step": 3696 }, { "epoch": 2.030203185063152, "grad_norm": 0.47410386001414817, "learning_rate": 2.409100473288175e-07, "loss": 0.2498, "step": 3697 }, { "epoch": 2.030752333882482, "grad_norm": 0.48895341463331876, "learning_rate": 2.4066172222565136e-07, "loss": 0.2118, "step": 3698 }, { "epoch": 2.0313014827018123, "grad_norm": 0.4741926194621475, "learning_rate": 2.404134846564727e-07, "loss": 0.2523, "step": 3699 }, { "epoch": 2.0318506315211424, "grad_norm": 0.5511789681328455, "learning_rate": 2.401653347050529e-07, "loss": 0.2604, "step": 3700 }, { "epoch": 2.032399780340472, "grad_norm": 0.48089572061100344, "learning_rate": 2.3991727245513293e-07, "loss": 0.2568, "step": 3701 }, { "epoch": 2.032948929159802, "grad_norm": 0.6666944723346373, "learning_rate": 2.3966929799042484e-07, "loss": 0.2271, "step": 3702 }, { "epoch": 2.0334980779791323, "grad_norm": 0.4396406877108257, "learning_rate": 2.3942141139461136e-07, "loss": 0.2485, "step": 3703 }, { "epoch": 2.0340472267984624, "grad_norm": 0.40949006014845785, "learning_rate": 2.391736127513443e-07, "loss": 0.2624, "step": 3704 }, { "epoch": 2.0345963756177925, "grad_norm": 0.5933709531190807, "learning_rate": 2.389259021442469e-07, "loss": 0.2334, "step": 3705 }, { "epoch": 2.0351455244371226, "grad_norm": 0.4982454202896799, "learning_rate": 2.3867827965691256e-07, "loss": 0.2179, "step": 3706 }, { "epoch": 2.0356946732564527, "grad_norm": 0.5901651404969978, "learning_rate": 2.3843074537290435e-07, "loss": 0.2415, "step": 3707 }, { "epoch": 2.0362438220757824, "grad_norm": 0.5306472157925395, "learning_rate": 2.381832993757564e-07, "loss": 0.2338, "step": 3708 }, { "epoch": 2.0367929708951125, "grad_norm": 0.6982664677160196, "learning_rate": 2.3793594174897228e-07, "loss": 0.3125, "step": 3709 }, { "epoch": 2.0373421197144426, "grad_norm": 0.53708927366282, "learning_rate": 2.3768867257602638e-07, "loss": 0.2446, "step": 3710 }, { "epoch": 2.0378912685337727, "grad_norm": 0.5591794106420714, "learning_rate": 2.3744149194036255e-07, "loss": 0.2632, "step": 3711 }, { "epoch": 2.0384404173531028, "grad_norm": 0.5474616318706089, "learning_rate": 2.3719439992539537e-07, "loss": 0.2802, "step": 3712 }, { "epoch": 2.038989566172433, "grad_norm": 0.4685594000787345, "learning_rate": 2.3694739661450942e-07, "loss": 0.2537, "step": 3713 }, { "epoch": 2.039538714991763, "grad_norm": 0.5497578317726087, "learning_rate": 2.3670048209105916e-07, "loss": 0.2472, "step": 3714 }, { "epoch": 2.0400878638110926, "grad_norm": 0.4759317287317202, "learning_rate": 2.3645365643836883e-07, "loss": 0.2422, "step": 3715 }, { "epoch": 2.0406370126304227, "grad_norm": 0.543921832696685, "learning_rate": 2.362069197397333e-07, "loss": 0.2381, "step": 3716 }, { "epoch": 2.041186161449753, "grad_norm": 0.6741609031094414, "learning_rate": 2.3596027207841718e-07, "loss": 0.293, "step": 3717 }, { "epoch": 2.041735310269083, "grad_norm": 0.5048027411023959, "learning_rate": 2.3571371353765465e-07, "loss": 0.2727, "step": 3718 }, { "epoch": 2.042284459088413, "grad_norm": 0.4948460004611238, "learning_rate": 2.3546724420065052e-07, "loss": 0.2934, "step": 3719 }, { "epoch": 2.042833607907743, "grad_norm": 0.47749876201506164, "learning_rate": 2.3522086415057892e-07, "loss": 0.2441, "step": 3720 }, { "epoch": 2.0433827567270733, "grad_norm": 0.4855937909171848, "learning_rate": 2.3497457347058383e-07, "loss": 0.264, "step": 3721 }, { "epoch": 2.043931905546403, "grad_norm": 0.5566052795249073, "learning_rate": 2.347283722437795e-07, "loss": 0.2005, "step": 3722 }, { "epoch": 2.044481054365733, "grad_norm": 0.49769009933463193, "learning_rate": 2.3448226055324988e-07, "loss": 0.2423, "step": 3723 }, { "epoch": 2.045030203185063, "grad_norm": 0.45765774246114027, "learning_rate": 2.3423623848204838e-07, "loss": 0.227, "step": 3724 }, { "epoch": 2.0455793520043932, "grad_norm": 0.4684956250690747, "learning_rate": 2.339903061131986e-07, "loss": 0.2497, "step": 3725 }, { "epoch": 2.0461285008237233, "grad_norm": 0.4850885699692227, "learning_rate": 2.3374446352969334e-07, "loss": 0.2531, "step": 3726 }, { "epoch": 2.0466776496430534, "grad_norm": 0.5762772916999483, "learning_rate": 2.3349871081449584e-07, "loss": 0.2483, "step": 3727 }, { "epoch": 2.047226798462383, "grad_norm": 0.9858405558390863, "learning_rate": 2.3325304805053813e-07, "loss": 0.3359, "step": 3728 }, { "epoch": 2.047775947281713, "grad_norm": 0.5405403754986029, "learning_rate": 2.3300747532072259e-07, "loss": 0.2844, "step": 3729 }, { "epoch": 2.0483250961010433, "grad_norm": 0.5968157813374904, "learning_rate": 2.3276199270792115e-07, "loss": 0.2556, "step": 3730 }, { "epoch": 2.0488742449203734, "grad_norm": 0.4559604562747986, "learning_rate": 2.3251660029497493e-07, "loss": 0.2613, "step": 3731 }, { "epoch": 2.0494233937397035, "grad_norm": 0.6698311755335101, "learning_rate": 2.3227129816469465e-07, "loss": 0.3045, "step": 3732 }, { "epoch": 2.0499725425590336, "grad_norm": 0.4872289244983262, "learning_rate": 2.3202608639986094e-07, "loss": 0.2128, "step": 3733 }, { "epoch": 2.0505216913783637, "grad_norm": 0.5772283582422382, "learning_rate": 2.3178096508322396e-07, "loss": 0.2739, "step": 3734 }, { "epoch": 2.0510708401976934, "grad_norm": 0.584930686828448, "learning_rate": 2.3153593429750263e-07, "loss": 0.2598, "step": 3735 }, { "epoch": 2.0516199890170235, "grad_norm": 0.47094955435821995, "learning_rate": 2.3129099412538632e-07, "loss": 0.2283, "step": 3736 }, { "epoch": 2.0521691378363536, "grad_norm": 0.45991486335144016, "learning_rate": 2.310461446495331e-07, "loss": 0.2389, "step": 3737 }, { "epoch": 2.0527182866556837, "grad_norm": 0.57405142005613, "learning_rate": 2.3080138595257034e-07, "loss": 0.2794, "step": 3738 }, { "epoch": 2.053267435475014, "grad_norm": 0.4832220333411837, "learning_rate": 2.3055671811709545e-07, "loss": 0.2083, "step": 3739 }, { "epoch": 2.053816584294344, "grad_norm": 0.4814050010028005, "learning_rate": 2.303121412256749e-07, "loss": 0.2296, "step": 3740 }, { "epoch": 2.054365733113674, "grad_norm": 0.47238033740331364, "learning_rate": 2.3006765536084415e-07, "loss": 0.2336, "step": 3741 }, { "epoch": 2.0549148819330036, "grad_norm": 0.48366892342414747, "learning_rate": 2.298232606051081e-07, "loss": 0.2692, "step": 3742 }, { "epoch": 2.0554640307523337, "grad_norm": 0.4576644957872836, "learning_rate": 2.2957895704094107e-07, "loss": 0.2244, "step": 3743 }, { "epoch": 2.056013179571664, "grad_norm": 0.544131458687228, "learning_rate": 2.2933474475078672e-07, "loss": 0.2591, "step": 3744 }, { "epoch": 2.056562328390994, "grad_norm": 0.52715649908787, "learning_rate": 2.2909062381705738e-07, "loss": 0.2651, "step": 3745 }, { "epoch": 2.057111477210324, "grad_norm": 0.6577435472712102, "learning_rate": 2.28846594322135e-07, "loss": 0.2148, "step": 3746 }, { "epoch": 2.057660626029654, "grad_norm": 0.49543446218548004, "learning_rate": 2.286026563483707e-07, "loss": 0.2645, "step": 3747 }, { "epoch": 2.0582097748489843, "grad_norm": 0.48652722294534334, "learning_rate": 2.2835880997808452e-07, "loss": 0.2218, "step": 3748 }, { "epoch": 2.058758923668314, "grad_norm": 0.5418631667098508, "learning_rate": 2.2811505529356525e-07, "loss": 0.2675, "step": 3749 }, { "epoch": 2.059308072487644, "grad_norm": 0.5638116910798441, "learning_rate": 2.2787139237707142e-07, "loss": 0.2651, "step": 3750 }, { "epoch": 2.059857221306974, "grad_norm": 0.47532490014027007, "learning_rate": 2.276278213108305e-07, "loss": 0.2527, "step": 3751 }, { "epoch": 2.0604063701263042, "grad_norm": 0.5046365348091819, "learning_rate": 2.2738434217703845e-07, "loss": 0.2678, "step": 3752 }, { "epoch": 2.0609555189456343, "grad_norm": 0.4684628862107452, "learning_rate": 2.2714095505786043e-07, "loss": 0.2808, "step": 3753 }, { "epoch": 2.0615046677649644, "grad_norm": 0.49971409070025663, "learning_rate": 2.2689766003543092e-07, "loss": 0.266, "step": 3754 }, { "epoch": 2.0620538165842945, "grad_norm": 0.5453090647639149, "learning_rate": 2.266544571918527e-07, "loss": 0.2372, "step": 3755 }, { "epoch": 2.062602965403624, "grad_norm": 0.46607772246298784, "learning_rate": 2.2641134660919794e-07, "loss": 0.2122, "step": 3756 }, { "epoch": 2.0631521142229543, "grad_norm": 0.5628090209722687, "learning_rate": 2.2616832836950768e-07, "loss": 0.24, "step": 3757 }, { "epoch": 2.0637012630422844, "grad_norm": 0.4282866311731098, "learning_rate": 2.2592540255479147e-07, "loss": 0.2517, "step": 3758 }, { "epoch": 2.0642504118616145, "grad_norm": 0.6044630348501255, "learning_rate": 2.256825692470276e-07, "loss": 0.2344, "step": 3759 }, { "epoch": 2.0647995606809446, "grad_norm": 0.5774481563162135, "learning_rate": 2.2543982852816358e-07, "loss": 0.2576, "step": 3760 }, { "epoch": 2.0653487095002747, "grad_norm": 0.45519867540968767, "learning_rate": 2.2519718048011563e-07, "loss": 0.2087, "step": 3761 }, { "epoch": 2.065897858319605, "grad_norm": 0.5417368969411511, "learning_rate": 2.2495462518476815e-07, "loss": 0.2109, "step": 3762 }, { "epoch": 2.0664470071389345, "grad_norm": 0.3908792725996102, "learning_rate": 2.24712162723975e-07, "loss": 0.2117, "step": 3763 }, { "epoch": 2.0669961559582646, "grad_norm": 0.5074637351207244, "learning_rate": 2.2446979317955798e-07, "loss": 0.2663, "step": 3764 }, { "epoch": 2.0675453047775947, "grad_norm": 0.48395573281234366, "learning_rate": 2.2422751663330825e-07, "loss": 0.2516, "step": 3765 }, { "epoch": 2.068094453596925, "grad_norm": 0.50185767696027, "learning_rate": 2.2398533316698473e-07, "loss": 0.2139, "step": 3766 }, { "epoch": 2.068643602416255, "grad_norm": 0.39385996109314064, "learning_rate": 2.237432428623158e-07, "loss": 0.2435, "step": 3767 }, { "epoch": 2.069192751235585, "grad_norm": 0.5282952753837166, "learning_rate": 2.23501245800998e-07, "loss": 0.2591, "step": 3768 }, { "epoch": 2.069741900054915, "grad_norm": 0.5051684722163543, "learning_rate": 2.232593420646964e-07, "loss": 0.2451, "step": 3769 }, { "epoch": 2.0702910488742448, "grad_norm": 0.5759166501310068, "learning_rate": 2.2301753173504435e-07, "loss": 0.2071, "step": 3770 }, { "epoch": 2.070840197693575, "grad_norm": 0.5554354398589165, "learning_rate": 2.2277581489364427e-07, "loss": 0.2418, "step": 3771 }, { "epoch": 2.071389346512905, "grad_norm": 0.6074187367943809, "learning_rate": 2.225341916220664e-07, "loss": 0.2464, "step": 3772 }, { "epoch": 2.071938495332235, "grad_norm": 0.5100792478591916, "learning_rate": 2.2229266200184982e-07, "loss": 0.2188, "step": 3773 }, { "epoch": 2.072487644151565, "grad_norm": 0.4474649241624674, "learning_rate": 2.2205122611450203e-07, "loss": 0.2158, "step": 3774 }, { "epoch": 2.0730367929708953, "grad_norm": 0.5056464396208762, "learning_rate": 2.2180988404149858e-07, "loss": 0.2181, "step": 3775 }, { "epoch": 2.073585941790225, "grad_norm": 0.4524101987449746, "learning_rate": 2.2156863586428345e-07, "loss": 0.2192, "step": 3776 }, { "epoch": 2.074135090609555, "grad_norm": 0.4821130558052258, "learning_rate": 2.213274816642691e-07, "loss": 0.2227, "step": 3777 }, { "epoch": 2.074684239428885, "grad_norm": 0.5370172777596177, "learning_rate": 2.2108642152283632e-07, "loss": 0.2448, "step": 3778 }, { "epoch": 2.0752333882482152, "grad_norm": 0.49649226010859776, "learning_rate": 2.2084545552133377e-07, "loss": 0.2188, "step": 3779 }, { "epoch": 2.0757825370675453, "grad_norm": 0.5363380279314732, "learning_rate": 2.2060458374107887e-07, "loss": 0.2503, "step": 3780 }, { "epoch": 2.0763316858868754, "grad_norm": 0.5222857862752548, "learning_rate": 2.203638062633567e-07, "loss": 0.2589, "step": 3781 }, { "epoch": 2.0768808347062055, "grad_norm": 0.4330076218078724, "learning_rate": 2.2012312316942114e-07, "loss": 0.1966, "step": 3782 }, { "epoch": 2.077429983525535, "grad_norm": 0.41076589476972525, "learning_rate": 2.1988253454049338e-07, "loss": 0.2269, "step": 3783 }, { "epoch": 2.0779791323448653, "grad_norm": 0.4842394243581011, "learning_rate": 2.1964204045776354e-07, "loss": 0.2263, "step": 3784 }, { "epoch": 2.0785282811641954, "grad_norm": 0.5536101759418829, "learning_rate": 2.1940164100238987e-07, "loss": 0.2244, "step": 3785 }, { "epoch": 2.0790774299835255, "grad_norm": 0.5055241417302644, "learning_rate": 2.1916133625549752e-07, "loss": 0.2547, "step": 3786 }, { "epoch": 2.0796265788028556, "grad_norm": 0.6893800562974802, "learning_rate": 2.189211262981809e-07, "loss": 0.2134, "step": 3787 }, { "epoch": 2.0801757276221857, "grad_norm": 0.45605780828887804, "learning_rate": 2.1868101121150215e-07, "loss": 0.2271, "step": 3788 }, { "epoch": 2.080724876441516, "grad_norm": 0.5305595435770265, "learning_rate": 2.1844099107649098e-07, "loss": 0.2451, "step": 3789 }, { "epoch": 2.0812740252608455, "grad_norm": 0.4558087135623228, "learning_rate": 2.1820106597414552e-07, "loss": 0.2343, "step": 3790 }, { "epoch": 2.0818231740801756, "grad_norm": 0.4866574220451204, "learning_rate": 2.1796123598543176e-07, "loss": 0.2667, "step": 3791 }, { "epoch": 2.0823723228995057, "grad_norm": 0.4521099309667922, "learning_rate": 2.1772150119128337e-07, "loss": 0.2898, "step": 3792 }, { "epoch": 2.082921471718836, "grad_norm": 0.47755378233209367, "learning_rate": 2.1748186167260182e-07, "loss": 0.2671, "step": 3793 }, { "epoch": 2.083470620538166, "grad_norm": 0.5074995119966317, "learning_rate": 2.1724231751025682e-07, "loss": 0.2196, "step": 3794 }, { "epoch": 2.084019769357496, "grad_norm": 0.5878029375941489, "learning_rate": 2.1700286878508575e-07, "loss": 0.2571, "step": 3795 }, { "epoch": 2.084568918176826, "grad_norm": 0.3962019074412491, "learning_rate": 2.1676351557789374e-07, "loss": 0.2601, "step": 3796 }, { "epoch": 2.0851180669961558, "grad_norm": 0.5069977074525809, "learning_rate": 2.1652425796945342e-07, "loss": 0.2313, "step": 3797 }, { "epoch": 2.085667215815486, "grad_norm": 0.48424608577258116, "learning_rate": 2.1628509604050555e-07, "loss": 0.2629, "step": 3798 }, { "epoch": 2.086216364634816, "grad_norm": 0.43422893700090626, "learning_rate": 2.1604602987175869e-07, "loss": 0.2253, "step": 3799 }, { "epoch": 2.086765513454146, "grad_norm": 0.4378541739451928, "learning_rate": 2.1580705954388853e-07, "loss": 0.2195, "step": 3800 }, { "epoch": 2.086765513454146, "eval_loss": 0.32376018166542053, "eval_runtime": 18.6635, "eval_samples_per_second": 23.736, "eval_steps_per_second": 1.018, "step": 3800 }, { "epoch": 2.087314662273476, "grad_norm": 0.44835065325415924, "learning_rate": 2.155681851375389e-07, "loss": 0.2464, "step": 3801 }, { "epoch": 2.0878638110928063, "grad_norm": 0.4203034524653535, "learning_rate": 2.1532940673332145e-07, "loss": 0.2427, "step": 3802 }, { "epoch": 2.0884129599121364, "grad_norm": 0.5156345649781823, "learning_rate": 2.150907244118144e-07, "loss": 0.1809, "step": 3803 }, { "epoch": 2.088962108731466, "grad_norm": 0.4733299234827269, "learning_rate": 2.1485213825356465e-07, "loss": 0.2605, "step": 3804 }, { "epoch": 2.089511257550796, "grad_norm": 0.43428342259449787, "learning_rate": 2.1461364833908639e-07, "loss": 0.2136, "step": 3805 }, { "epoch": 2.0900604063701262, "grad_norm": 0.4493739008877683, "learning_rate": 2.1437525474886072e-07, "loss": 0.2291, "step": 3806 }, { "epoch": 2.0906095551894563, "grad_norm": 0.49859453829732686, "learning_rate": 2.1413695756333722e-07, "loss": 0.2744, "step": 3807 }, { "epoch": 2.0911587040087865, "grad_norm": 0.4666041875816198, "learning_rate": 2.138987568629319e-07, "loss": 0.2166, "step": 3808 }, { "epoch": 2.0917078528281166, "grad_norm": 0.47040347298971286, "learning_rate": 2.1366065272802916e-07, "loss": 0.2462, "step": 3809 }, { "epoch": 2.0922570016474467, "grad_norm": 0.5370374026317133, "learning_rate": 2.1342264523898002e-07, "loss": 0.2566, "step": 3810 }, { "epoch": 2.0928061504667763, "grad_norm": 0.43451435409501626, "learning_rate": 2.131847344761034e-07, "loss": 0.2608, "step": 3811 }, { "epoch": 2.0933552992861064, "grad_norm": 0.48277301974793396, "learning_rate": 2.129469205196856e-07, "loss": 0.2288, "step": 3812 }, { "epoch": 2.0939044481054365, "grad_norm": 0.5986344646388713, "learning_rate": 2.1270920344997992e-07, "loss": 0.2645, "step": 3813 }, { "epoch": 2.0944535969247666, "grad_norm": 0.5817618984398569, "learning_rate": 2.1247158334720682e-07, "loss": 0.2352, "step": 3814 }, { "epoch": 2.0950027457440967, "grad_norm": 0.49920736071095206, "learning_rate": 2.1223406029155464e-07, "loss": 0.2961, "step": 3815 }, { "epoch": 2.095551894563427, "grad_norm": 0.5042657754474339, "learning_rate": 2.119966343631788e-07, "loss": 0.2604, "step": 3816 }, { "epoch": 2.096101043382757, "grad_norm": 0.6238536057071339, "learning_rate": 2.117593056422014e-07, "loss": 0.223, "step": 3817 }, { "epoch": 2.0966501922020866, "grad_norm": 0.48103575923231945, "learning_rate": 2.1152207420871258e-07, "loss": 0.2703, "step": 3818 }, { "epoch": 2.0971993410214167, "grad_norm": 0.4944920323125504, "learning_rate": 2.1128494014276896e-07, "loss": 0.2668, "step": 3819 }, { "epoch": 2.097748489840747, "grad_norm": 1.1698406483269248, "learning_rate": 2.1104790352439438e-07, "loss": 0.4154, "step": 3820 }, { "epoch": 2.098297638660077, "grad_norm": 0.5817018983620151, "learning_rate": 2.1081096443358012e-07, "loss": 0.2501, "step": 3821 }, { "epoch": 2.098846787479407, "grad_norm": 0.5829269175913295, "learning_rate": 2.105741229502847e-07, "loss": 0.2451, "step": 3822 }, { "epoch": 2.099395936298737, "grad_norm": 0.54771538874892, "learning_rate": 2.103373791544329e-07, "loss": 0.246, "step": 3823 }, { "epoch": 2.099945085118067, "grad_norm": 0.4884146239005686, "learning_rate": 2.1010073312591745e-07, "loss": 0.2794, "step": 3824 }, { "epoch": 2.100494233937397, "grad_norm": 0.4313857458398654, "learning_rate": 2.0986418494459728e-07, "loss": 0.2601, "step": 3825 }, { "epoch": 2.101043382756727, "grad_norm": 0.5389038758455438, "learning_rate": 2.096277346902991e-07, "loss": 0.2417, "step": 3826 }, { "epoch": 2.101592531576057, "grad_norm": 0.44093859615205977, "learning_rate": 2.0939138244281573e-07, "loss": 0.2352, "step": 3827 }, { "epoch": 2.102141680395387, "grad_norm": 0.5182439145384664, "learning_rate": 2.0915512828190753e-07, "loss": 0.2662, "step": 3828 }, { "epoch": 2.1026908292147173, "grad_norm": 0.46981164250174345, "learning_rate": 2.0891897228730185e-07, "loss": 0.249, "step": 3829 }, { "epoch": 2.1032399780340474, "grad_norm": 0.5498105127851545, "learning_rate": 2.0868291453869236e-07, "loss": 0.2696, "step": 3830 }, { "epoch": 2.1037891268533775, "grad_norm": 0.5089161702593171, "learning_rate": 2.084469551157397e-07, "loss": 0.2374, "step": 3831 }, { "epoch": 2.104338275672707, "grad_norm": 0.5217421166675897, "learning_rate": 2.082110940980717e-07, "loss": 0.2213, "step": 3832 }, { "epoch": 2.1048874244920373, "grad_norm": 0.4910412197627618, "learning_rate": 2.0797533156528289e-07, "loss": 0.2286, "step": 3833 }, { "epoch": 2.1054365733113674, "grad_norm": 0.4612081084092483, "learning_rate": 2.0773966759693407e-07, "loss": 0.2322, "step": 3834 }, { "epoch": 2.1059857221306975, "grad_norm": 0.4837269622859846, "learning_rate": 2.0750410227255355e-07, "loss": 0.2537, "step": 3835 }, { "epoch": 2.1065348709500276, "grad_norm": 0.49360714895182956, "learning_rate": 2.0726863567163574e-07, "loss": 0.2692, "step": 3836 }, { "epoch": 2.1070840197693577, "grad_norm": 0.6477791973519866, "learning_rate": 2.0703326787364184e-07, "loss": 0.2729, "step": 3837 }, { "epoch": 2.1076331685886873, "grad_norm": 0.4402285014292362, "learning_rate": 2.0679799895799984e-07, "loss": 0.2245, "step": 3838 }, { "epoch": 2.1081823174080174, "grad_norm": 0.4973315046775512, "learning_rate": 2.0656282900410465e-07, "loss": 0.2357, "step": 3839 }, { "epoch": 2.1087314662273475, "grad_norm": 0.479704554789677, "learning_rate": 2.0632775809131726e-07, "loss": 0.2585, "step": 3840 }, { "epoch": 2.1092806150466776, "grad_norm": 0.4869899078769164, "learning_rate": 2.0609278629896518e-07, "loss": 0.2636, "step": 3841 }, { "epoch": 2.1098297638660077, "grad_norm": 0.5585917548592232, "learning_rate": 2.05857913706343e-07, "loss": 0.2112, "step": 3842 }, { "epoch": 2.110378912685338, "grad_norm": 0.710217625113275, "learning_rate": 2.056231403927117e-07, "loss": 0.244, "step": 3843 }, { "epoch": 2.110928061504668, "grad_norm": 0.39836715100691994, "learning_rate": 2.053884664372983e-07, "loss": 0.2284, "step": 3844 }, { "epoch": 2.1114772103239976, "grad_norm": 0.4458689999105017, "learning_rate": 2.0515389191929678e-07, "loss": 0.2548, "step": 3845 }, { "epoch": 2.1120263591433277, "grad_norm": 0.5247679999941065, "learning_rate": 2.0491941691786757e-07, "loss": 0.2369, "step": 3846 }, { "epoch": 2.112575507962658, "grad_norm": 0.5087362219602194, "learning_rate": 2.0468504151213714e-07, "loss": 0.2411, "step": 3847 }, { "epoch": 2.113124656781988, "grad_norm": 0.5468988295400588, "learning_rate": 2.0445076578119845e-07, "loss": 0.2032, "step": 3848 }, { "epoch": 2.113673805601318, "grad_norm": 0.47950365843329795, "learning_rate": 2.0421658980411106e-07, "loss": 0.2104, "step": 3849 }, { "epoch": 2.114222954420648, "grad_norm": 0.3730845942111509, "learning_rate": 2.0398251365990088e-07, "loss": 0.2192, "step": 3850 }, { "epoch": 2.114772103239978, "grad_norm": 0.45984650798458504, "learning_rate": 2.0374853742755986e-07, "loss": 0.2162, "step": 3851 }, { "epoch": 2.115321252059308, "grad_norm": 0.4534019289876038, "learning_rate": 2.035146611860462e-07, "loss": 0.2434, "step": 3852 }, { "epoch": 2.115870400878638, "grad_norm": 0.46751695961336376, "learning_rate": 2.0328088501428477e-07, "loss": 0.233, "step": 3853 }, { "epoch": 2.116419549697968, "grad_norm": 0.4765868265720958, "learning_rate": 2.0304720899116616e-07, "loss": 0.2156, "step": 3854 }, { "epoch": 2.116968698517298, "grad_norm": 0.4621921873096323, "learning_rate": 2.0281363319554756e-07, "loss": 0.2551, "step": 3855 }, { "epoch": 2.1175178473366283, "grad_norm": 0.5497337891261651, "learning_rate": 2.0258015770625238e-07, "loss": 0.2263, "step": 3856 }, { "epoch": 2.1180669961559584, "grad_norm": 0.5482509629339043, "learning_rate": 2.0234678260206976e-07, "loss": 0.2734, "step": 3857 }, { "epoch": 2.1186161449752885, "grad_norm": 0.4433565742325993, "learning_rate": 2.0211350796175513e-07, "loss": 0.2204, "step": 3858 }, { "epoch": 2.119165293794618, "grad_norm": 0.5116139811620837, "learning_rate": 2.018803338640302e-07, "loss": 0.2278, "step": 3859 }, { "epoch": 2.1197144426139483, "grad_norm": 0.48212694774789133, "learning_rate": 2.0164726038758276e-07, "loss": 0.2534, "step": 3860 }, { "epoch": 2.1202635914332784, "grad_norm": 0.5388688753747408, "learning_rate": 2.0141428761106629e-07, "loss": 0.204, "step": 3861 }, { "epoch": 2.1208127402526085, "grad_norm": 0.48245621013780615, "learning_rate": 2.0118141561310085e-07, "loss": 0.2798, "step": 3862 }, { "epoch": 2.1213618890719386, "grad_norm": 0.4260629640231019, "learning_rate": 2.0094864447227168e-07, "loss": 0.239, "step": 3863 }, { "epoch": 2.1219110378912687, "grad_norm": 0.5264671645008749, "learning_rate": 2.0071597426713094e-07, "loss": 0.2631, "step": 3864 }, { "epoch": 2.1224601867105988, "grad_norm": 0.5708440471622848, "learning_rate": 2.0048340507619593e-07, "loss": 0.2464, "step": 3865 }, { "epoch": 2.1230093355299284, "grad_norm": 0.5607639535724869, "learning_rate": 2.002509369779502e-07, "loss": 0.2247, "step": 3866 }, { "epoch": 2.1235584843492585, "grad_norm": 0.4672443926232953, "learning_rate": 2.0001857005084348e-07, "loss": 0.2548, "step": 3867 }, { "epoch": 2.1241076331685886, "grad_norm": 0.3878676680814944, "learning_rate": 1.9978630437329086e-07, "loss": 0.2814, "step": 3868 }, { "epoch": 2.1246567819879187, "grad_norm": 0.5503157690141373, "learning_rate": 1.9955414002367327e-07, "loss": 0.2204, "step": 3869 }, { "epoch": 2.125205930807249, "grad_norm": 0.5181351304480719, "learning_rate": 1.9932207708033785e-07, "loss": 0.2632, "step": 3870 }, { "epoch": 2.125755079626579, "grad_norm": 0.5040983743257278, "learning_rate": 1.990901156215971e-07, "loss": 0.2393, "step": 3871 }, { "epoch": 2.1263042284459086, "grad_norm": 0.39817991322482477, "learning_rate": 1.988582557257296e-07, "loss": 0.2627, "step": 3872 }, { "epoch": 2.1268533772652387, "grad_norm": 0.5551303017650082, "learning_rate": 1.9862649747097967e-07, "loss": 0.2556, "step": 3873 }, { "epoch": 2.127402526084569, "grad_norm": 0.4220738732677619, "learning_rate": 1.9839484093555707e-07, "loss": 0.2284, "step": 3874 }, { "epoch": 2.127951674903899, "grad_norm": 0.47052582757112177, "learning_rate": 1.9816328619763706e-07, "loss": 0.2609, "step": 3875 }, { "epoch": 2.128500823723229, "grad_norm": 0.4152980893138041, "learning_rate": 1.979318333353611e-07, "loss": 0.2511, "step": 3876 }, { "epoch": 2.129049972542559, "grad_norm": 0.49090859663218683, "learning_rate": 1.9770048242683616e-07, "loss": 0.2538, "step": 3877 }, { "epoch": 2.1295991213618892, "grad_norm": 0.48114834075147034, "learning_rate": 1.9746923355013425e-07, "loss": 0.2674, "step": 3878 }, { "epoch": 2.130148270181219, "grad_norm": 0.503727489866788, "learning_rate": 1.9723808678329377e-07, "loss": 0.2595, "step": 3879 }, { "epoch": 2.130697419000549, "grad_norm": 0.5008553988693354, "learning_rate": 1.9700704220431785e-07, "loss": 0.2448, "step": 3880 }, { "epoch": 2.131246567819879, "grad_norm": 0.49410063409296384, "learning_rate": 1.967760998911759e-07, "loss": 0.2815, "step": 3881 }, { "epoch": 2.131795716639209, "grad_norm": 0.497035474782331, "learning_rate": 1.9654525992180203e-07, "loss": 0.2366, "step": 3882 }, { "epoch": 2.1323448654585393, "grad_norm": 0.5782493695908826, "learning_rate": 1.9631452237409648e-07, "loss": 0.2298, "step": 3883 }, { "epoch": 2.1328940142778694, "grad_norm": 0.4393680726477153, "learning_rate": 1.9608388732592495e-07, "loss": 0.285, "step": 3884 }, { "epoch": 2.1334431630971995, "grad_norm": 0.4928816193499844, "learning_rate": 1.9585335485511763e-07, "loss": 0.2413, "step": 3885 }, { "epoch": 2.133992311916529, "grad_norm": 0.5164242630041197, "learning_rate": 1.9562292503947107e-07, "loss": 0.265, "step": 3886 }, { "epoch": 2.1345414607358593, "grad_norm": 0.6824778741241071, "learning_rate": 1.9539259795674698e-07, "loss": 0.2297, "step": 3887 }, { "epoch": 2.1350906095551894, "grad_norm": 0.5217338025973458, "learning_rate": 1.9516237368467194e-07, "loss": 0.2346, "step": 3888 }, { "epoch": 2.1356397583745195, "grad_norm": 0.6202267849174579, "learning_rate": 1.9493225230093831e-07, "loss": 0.2259, "step": 3889 }, { "epoch": 2.1361889071938496, "grad_norm": 0.47296869159058896, "learning_rate": 1.9470223388320386e-07, "loss": 0.2318, "step": 3890 }, { "epoch": 2.1367380560131797, "grad_norm": 0.5412293417627534, "learning_rate": 1.9447231850909103e-07, "loss": 0.242, "step": 3891 }, { "epoch": 2.13728720483251, "grad_norm": 0.4307234486208026, "learning_rate": 1.942425062561877e-07, "loss": 0.2326, "step": 3892 }, { "epoch": 2.1378363536518394, "grad_norm": 0.491847468059347, "learning_rate": 1.9401279720204712e-07, "loss": 0.2477, "step": 3893 }, { "epoch": 2.1383855024711695, "grad_norm": 0.4957298364097486, "learning_rate": 1.937831914241879e-07, "loss": 0.2552, "step": 3894 }, { "epoch": 2.1389346512904996, "grad_norm": 0.597020614326026, "learning_rate": 1.935536890000933e-07, "loss": 0.2489, "step": 3895 }, { "epoch": 2.1394838001098297, "grad_norm": 0.425252952318177, "learning_rate": 1.9332429000721178e-07, "loss": 0.2408, "step": 3896 }, { "epoch": 2.14003294892916, "grad_norm": 0.5941925186049234, "learning_rate": 1.9309499452295727e-07, "loss": 0.3189, "step": 3897 }, { "epoch": 2.14058209774849, "grad_norm": 0.4277229126926347, "learning_rate": 1.9286580262470858e-07, "loss": 0.2522, "step": 3898 }, { "epoch": 2.14113124656782, "grad_norm": 0.4731538800259954, "learning_rate": 1.9263671438980938e-07, "loss": 0.2911, "step": 3899 }, { "epoch": 2.1416803953871497, "grad_norm": 0.5085825572256085, "learning_rate": 1.9240772989556855e-07, "loss": 0.2495, "step": 3900 }, { "epoch": 2.14222954420648, "grad_norm": 1.217562512470521, "learning_rate": 1.9217884921926027e-07, "loss": 0.4166, "step": 3901 }, { "epoch": 2.14277869302581, "grad_norm": 0.43371609794287097, "learning_rate": 1.919500724381227e-07, "loss": 0.2491, "step": 3902 }, { "epoch": 2.14332784184514, "grad_norm": 0.4917208768632998, "learning_rate": 1.9172139962935987e-07, "loss": 0.2538, "step": 3903 }, { "epoch": 2.14387699066447, "grad_norm": 0.45810023421376667, "learning_rate": 1.9149283087014064e-07, "loss": 0.2746, "step": 3904 }, { "epoch": 2.1444261394838002, "grad_norm": 0.4372699453223447, "learning_rate": 1.912643662375982e-07, "loss": 0.2112, "step": 3905 }, { "epoch": 2.1449752883031303, "grad_norm": 0.43792039356987905, "learning_rate": 1.9103600580883106e-07, "loss": 0.2409, "step": 3906 }, { "epoch": 2.14552443712246, "grad_norm": 0.5609103810209406, "learning_rate": 1.908077496609027e-07, "loss": 0.276, "step": 3907 }, { "epoch": 2.14607358594179, "grad_norm": 0.42324528721595817, "learning_rate": 1.9057959787084098e-07, "loss": 0.2607, "step": 3908 }, { "epoch": 2.14662273476112, "grad_norm": 0.4715374997362142, "learning_rate": 1.9035155051563847e-07, "loss": 0.2111, "step": 3909 }, { "epoch": 2.1471718835804503, "grad_norm": 0.49983634210228417, "learning_rate": 1.9012360767225304e-07, "loss": 0.2451, "step": 3910 }, { "epoch": 2.1477210323997804, "grad_norm": 0.4661224660618886, "learning_rate": 1.898957694176071e-07, "loss": 0.2685, "step": 3911 }, { "epoch": 2.1482701812191105, "grad_norm": 0.4300289303413506, "learning_rate": 1.8966803582858745e-07, "loss": 0.2542, "step": 3912 }, { "epoch": 2.1488193300384406, "grad_norm": 0.4619576387584257, "learning_rate": 1.894404069820457e-07, "loss": 0.2225, "step": 3913 }, { "epoch": 2.1493684788577703, "grad_norm": 0.5031134581390188, "learning_rate": 1.8921288295479842e-07, "loss": 0.1993, "step": 3914 }, { "epoch": 2.1499176276771004, "grad_norm": 0.47178946077145767, "learning_rate": 1.8898546382362663e-07, "loss": 0.2321, "step": 3915 }, { "epoch": 2.1504667764964305, "grad_norm": 0.543018371515395, "learning_rate": 1.8875814966527565e-07, "loss": 0.2398, "step": 3916 }, { "epoch": 2.1510159253157606, "grad_norm": 0.49361401099524627, "learning_rate": 1.885309405564559e-07, "loss": 0.2135, "step": 3917 }, { "epoch": 2.1515650741350907, "grad_norm": 0.5388751454699288, "learning_rate": 1.8830383657384193e-07, "loss": 0.2438, "step": 3918 }, { "epoch": 2.152114222954421, "grad_norm": 0.5655443709542992, "learning_rate": 1.8807683779407294e-07, "loss": 0.2579, "step": 3919 }, { "epoch": 2.152663371773751, "grad_norm": 0.46217653882996385, "learning_rate": 1.8784994429375265e-07, "loss": 0.2658, "step": 3920 }, { "epoch": 2.1532125205930805, "grad_norm": 0.5998136405598767, "learning_rate": 1.8762315614944943e-07, "loss": 0.2581, "step": 3921 }, { "epoch": 2.1537616694124107, "grad_norm": 0.417790543709564, "learning_rate": 1.8739647343769571e-07, "loss": 0.223, "step": 3922 }, { "epoch": 2.1543108182317408, "grad_norm": 0.5439570688326004, "learning_rate": 1.8716989623498882e-07, "loss": 0.2454, "step": 3923 }, { "epoch": 2.154859967051071, "grad_norm": 0.38858703529285105, "learning_rate": 1.8694342461778987e-07, "loss": 0.2413, "step": 3924 }, { "epoch": 2.155409115870401, "grad_norm": 0.5126797149447225, "learning_rate": 1.8671705866252507e-07, "loss": 0.202, "step": 3925 }, { "epoch": 2.155958264689731, "grad_norm": 0.3872490208690947, "learning_rate": 1.8649079844558418e-07, "loss": 0.2347, "step": 3926 }, { "epoch": 2.156507413509061, "grad_norm": 0.5346932465994957, "learning_rate": 1.8626464404332194e-07, "loss": 0.2743, "step": 3927 }, { "epoch": 2.157056562328391, "grad_norm": 0.5104704753214021, "learning_rate": 1.8603859553205726e-07, "loss": 0.2579, "step": 3928 }, { "epoch": 2.157605711147721, "grad_norm": 0.44627563518851293, "learning_rate": 1.8581265298807296e-07, "loss": 0.2235, "step": 3929 }, { "epoch": 2.158154859967051, "grad_norm": 0.5252819958388432, "learning_rate": 1.8558681648761622e-07, "loss": 0.2501, "step": 3930 }, { "epoch": 2.158704008786381, "grad_norm": 0.5322887516973847, "learning_rate": 1.8536108610689877e-07, "loss": 0.211, "step": 3931 }, { "epoch": 2.1592531576057112, "grad_norm": 0.4791201013389418, "learning_rate": 1.8513546192209635e-07, "loss": 0.2299, "step": 3932 }, { "epoch": 2.1598023064250413, "grad_norm": 0.48616022178910523, "learning_rate": 1.8490994400934848e-07, "loss": 0.2426, "step": 3933 }, { "epoch": 2.1603514552443714, "grad_norm": 0.5260594505768331, "learning_rate": 1.8468453244475954e-07, "loss": 0.2564, "step": 3934 }, { "epoch": 2.160900604063701, "grad_norm": 0.4914406655976773, "learning_rate": 1.8445922730439746e-07, "loss": 0.2643, "step": 3935 }, { "epoch": 2.161449752883031, "grad_norm": 0.5328852501050172, "learning_rate": 1.8423402866429425e-07, "loss": 0.2216, "step": 3936 }, { "epoch": 2.1619989017023613, "grad_norm": 0.4707301182259253, "learning_rate": 1.8400893660044627e-07, "loss": 0.272, "step": 3937 }, { "epoch": 2.1625480505216914, "grad_norm": 0.5433831647293316, "learning_rate": 1.8378395118881397e-07, "loss": 0.2202, "step": 3938 }, { "epoch": 2.1630971993410215, "grad_norm": 0.42325050341266984, "learning_rate": 1.8355907250532147e-07, "loss": 0.2303, "step": 3939 }, { "epoch": 2.1636463481603516, "grad_norm": 0.5721251469300657, "learning_rate": 1.833343006258571e-07, "loss": 0.2647, "step": 3940 }, { "epoch": 2.1641954969796817, "grad_norm": 0.46529600474690497, "learning_rate": 1.8310963562627295e-07, "loss": 0.2635, "step": 3941 }, { "epoch": 2.1647446457990114, "grad_norm": 0.563222643702606, "learning_rate": 1.8288507758238547e-07, "loss": 0.2419, "step": 3942 }, { "epoch": 2.1652937946183415, "grad_norm": 0.4081828292050501, "learning_rate": 1.826606265699744e-07, "loss": 0.2625, "step": 3943 }, { "epoch": 2.1658429434376716, "grad_norm": 0.4530398764917165, "learning_rate": 1.824362826647838e-07, "loss": 0.2732, "step": 3944 }, { "epoch": 2.1663920922570017, "grad_norm": 0.5222420863608187, "learning_rate": 1.8221204594252177e-07, "loss": 0.2733, "step": 3945 }, { "epoch": 2.166941241076332, "grad_norm": 0.6618122020250323, "learning_rate": 1.8198791647885958e-07, "loss": 0.2984, "step": 3946 }, { "epoch": 2.167490389895662, "grad_norm": 0.5269650152741272, "learning_rate": 1.8176389434943267e-07, "loss": 0.2706, "step": 3947 }, { "epoch": 2.168039538714992, "grad_norm": 0.5129607360245605, "learning_rate": 1.8153997962984046e-07, "loss": 0.254, "step": 3948 }, { "epoch": 2.1685886875343217, "grad_norm": 0.4855314552044702, "learning_rate": 1.8131617239564572e-07, "loss": 0.2485, "step": 3949 }, { "epoch": 2.1691378363536518, "grad_norm": 0.49262213174779745, "learning_rate": 1.8109247272237514e-07, "loss": 0.2591, "step": 3950 }, { "epoch": 2.169686985172982, "grad_norm": 0.44830683718731024, "learning_rate": 1.808688806855195e-07, "loss": 0.2602, "step": 3951 }, { "epoch": 2.170236133992312, "grad_norm": 0.6136236495959323, "learning_rate": 1.8064539636053255e-07, "loss": 0.2747, "step": 3952 }, { "epoch": 2.170785282811642, "grad_norm": 0.672191262207508, "learning_rate": 1.8042201982283185e-07, "loss": 0.3057, "step": 3953 }, { "epoch": 2.171334431630972, "grad_norm": 0.447855063441009, "learning_rate": 1.8019875114779902e-07, "loss": 0.2123, "step": 3954 }, { "epoch": 2.171883580450302, "grad_norm": 0.5069398622137821, "learning_rate": 1.7997559041077915e-07, "loss": 0.2471, "step": 3955 }, { "epoch": 2.172432729269632, "grad_norm": 0.5361294945466532, "learning_rate": 1.7975253768708054e-07, "loss": 0.2365, "step": 3956 }, { "epoch": 2.172981878088962, "grad_norm": 0.6224601248513151, "learning_rate": 1.7952959305197513e-07, "loss": 0.215, "step": 3957 }, { "epoch": 2.173531026908292, "grad_norm": 0.5168587742842273, "learning_rate": 1.7930675658069868e-07, "loss": 0.2629, "step": 3958 }, { "epoch": 2.1740801757276222, "grad_norm": 0.5258211594824284, "learning_rate": 1.790840283484505e-07, "loss": 0.223, "step": 3959 }, { "epoch": 2.1746293245469523, "grad_norm": 0.41397066435051055, "learning_rate": 1.7886140843039277e-07, "loss": 0.2183, "step": 3960 }, { "epoch": 2.1751784733662825, "grad_norm": 0.5916820698763788, "learning_rate": 1.7863889690165168e-07, "loss": 0.262, "step": 3961 }, { "epoch": 2.175727622185612, "grad_norm": 0.44775479293884324, "learning_rate": 1.7841649383731686e-07, "loss": 0.2783, "step": 3962 }, { "epoch": 2.176276771004942, "grad_norm": 0.4606341896234673, "learning_rate": 1.7819419931244104e-07, "loss": 0.2333, "step": 3963 }, { "epoch": 2.1768259198242723, "grad_norm": 0.5536635371358996, "learning_rate": 1.7797201340204019e-07, "loss": 0.2304, "step": 3964 }, { "epoch": 2.1773750686436024, "grad_norm": 0.486297622134742, "learning_rate": 1.7774993618109423e-07, "loss": 0.2248, "step": 3965 }, { "epoch": 2.1779242174629325, "grad_norm": 0.523951370415108, "learning_rate": 1.7752796772454567e-07, "loss": 0.2684, "step": 3966 }, { "epoch": 2.1784733662822626, "grad_norm": 0.4708976868499049, "learning_rate": 1.773061081073011e-07, "loss": 0.2529, "step": 3967 }, { "epoch": 2.1790225151015927, "grad_norm": 0.5273083356613111, "learning_rate": 1.7708435740422958e-07, "loss": 0.254, "step": 3968 }, { "epoch": 2.1795716639209224, "grad_norm": 0.5195379750212576, "learning_rate": 1.7686271569016418e-07, "loss": 0.2283, "step": 3969 }, { "epoch": 2.1801208127402525, "grad_norm": 1.1111378467366022, "learning_rate": 1.7664118303990036e-07, "loss": 0.2481, "step": 3970 }, { "epoch": 2.1806699615595826, "grad_norm": 0.5094539722800087, "learning_rate": 1.764197595281975e-07, "loss": 0.2321, "step": 3971 }, { "epoch": 2.1812191103789127, "grad_norm": 0.5543417892199277, "learning_rate": 1.7619844522977807e-07, "loss": 0.257, "step": 3972 }, { "epoch": 2.181768259198243, "grad_norm": 0.4368811340571751, "learning_rate": 1.7597724021932723e-07, "loss": 0.2554, "step": 3973 }, { "epoch": 2.182317408017573, "grad_norm": 0.5215848285605968, "learning_rate": 1.7575614457149336e-07, "loss": 0.219, "step": 3974 }, { "epoch": 2.182866556836903, "grad_norm": 0.48589017065286566, "learning_rate": 1.755351583608884e-07, "loss": 0.2306, "step": 3975 }, { "epoch": 2.1834157056562327, "grad_norm": 0.5080207906688253, "learning_rate": 1.7531428166208705e-07, "loss": 0.2147, "step": 3976 }, { "epoch": 2.1839648544755628, "grad_norm": 0.44408588542997157, "learning_rate": 1.7509351454962684e-07, "loss": 0.2345, "step": 3977 }, { "epoch": 2.184514003294893, "grad_norm": 0.5653806576790686, "learning_rate": 1.748728570980088e-07, "loss": 0.2128, "step": 3978 }, { "epoch": 2.185063152114223, "grad_norm": 0.3957641151179787, "learning_rate": 1.7465230938169658e-07, "loss": 0.2201, "step": 3979 }, { "epoch": 2.185612300933553, "grad_norm": 0.5583549334857763, "learning_rate": 1.7443187147511676e-07, "loss": 0.2166, "step": 3980 }, { "epoch": 2.186161449752883, "grad_norm": 0.5544191422615505, "learning_rate": 1.7421154345265905e-07, "loss": 0.2723, "step": 3981 }, { "epoch": 2.186710598572213, "grad_norm": 0.5012992234043202, "learning_rate": 1.7399132538867637e-07, "loss": 0.2341, "step": 3982 }, { "epoch": 2.187259747391543, "grad_norm": 0.49661256429628026, "learning_rate": 1.7377121735748376e-07, "loss": 0.22, "step": 3983 }, { "epoch": 2.187808896210873, "grad_norm": 0.5815286108172486, "learning_rate": 1.7355121943335991e-07, "loss": 0.2687, "step": 3984 }, { "epoch": 2.188358045030203, "grad_norm": 0.47338165728106446, "learning_rate": 1.7333133169054572e-07, "loss": 0.2523, "step": 3985 }, { "epoch": 2.1889071938495333, "grad_norm": 0.4074535547435078, "learning_rate": 1.7311155420324557e-07, "loss": 0.2485, "step": 3986 }, { "epoch": 2.1894563426688634, "grad_norm": 0.5842482722695632, "learning_rate": 1.7289188704562588e-07, "loss": 0.2753, "step": 3987 }, { "epoch": 2.1900054914881935, "grad_norm": 0.503630989785303, "learning_rate": 1.7267233029181638e-07, "loss": 0.232, "step": 3988 }, { "epoch": 2.190554640307523, "grad_norm": 0.5080407192677118, "learning_rate": 1.7245288401590955e-07, "loss": 0.2207, "step": 3989 }, { "epoch": 2.191103789126853, "grad_norm": 0.44232842031396075, "learning_rate": 1.7223354829196025e-07, "loss": 0.2272, "step": 3990 }, { "epoch": 2.1916529379461833, "grad_norm": 0.5701332853466663, "learning_rate": 1.720143231939861e-07, "loss": 0.2555, "step": 3991 }, { "epoch": 2.1922020867655134, "grad_norm": 0.5214866224133593, "learning_rate": 1.7179520879596768e-07, "loss": 0.2368, "step": 3992 }, { "epoch": 2.1927512355848435, "grad_norm": 0.4714742898345364, "learning_rate": 1.7157620517184806e-07, "loss": 0.2475, "step": 3993 }, { "epoch": 2.1933003844041736, "grad_norm": 0.5685763605519386, "learning_rate": 1.713573123955327e-07, "loss": 0.2593, "step": 3994 }, { "epoch": 2.1938495332235037, "grad_norm": 0.6082992119483821, "learning_rate": 1.7113853054089006e-07, "loss": 0.2467, "step": 3995 }, { "epoch": 2.1943986820428334, "grad_norm": 0.4996313225914673, "learning_rate": 1.7091985968175087e-07, "loss": 0.257, "step": 3996 }, { "epoch": 2.1949478308621635, "grad_norm": 0.4858029386986306, "learning_rate": 1.7070129989190832e-07, "loss": 0.2402, "step": 3997 }, { "epoch": 2.1954969796814936, "grad_norm": 0.5147496668729445, "learning_rate": 1.7048285124511844e-07, "loss": 0.2684, "step": 3998 }, { "epoch": 2.1960461285008237, "grad_norm": 0.6161496259139179, "learning_rate": 1.7026451381509976e-07, "loss": 0.21, "step": 3999 }, { "epoch": 2.196595277320154, "grad_norm": 0.4815709985299374, "learning_rate": 1.70046287675533e-07, "loss": 0.2623, "step": 4000 }, { "epoch": 2.196595277320154, "eval_loss": 0.322488009929657, "eval_runtime": 18.6785, "eval_samples_per_second": 23.717, "eval_steps_per_second": 1.017, "step": 4000 }, { "epoch": 2.197144426139484, "grad_norm": 0.6414869548260358, "learning_rate": 1.6982817290006112e-07, "loss": 0.2525, "step": 4001 }, { "epoch": 2.197693574958814, "grad_norm": 0.4105625253986696, "learning_rate": 1.696101695622902e-07, "loss": 0.2442, "step": 4002 }, { "epoch": 2.1982427237781437, "grad_norm": 0.4620457673593199, "learning_rate": 1.6939227773578836e-07, "loss": 0.2443, "step": 4003 }, { "epoch": 2.1987918725974738, "grad_norm": 0.5645562413226259, "learning_rate": 1.6917449749408576e-07, "loss": 0.2851, "step": 4004 }, { "epoch": 2.199341021416804, "grad_norm": 0.4889406909450005, "learning_rate": 1.6895682891067544e-07, "loss": 0.2739, "step": 4005 }, { "epoch": 2.199890170236134, "grad_norm": 0.6484999147281099, "learning_rate": 1.687392720590126e-07, "loss": 0.2605, "step": 4006 }, { "epoch": 2.200439319055464, "grad_norm": 0.5146071446875392, "learning_rate": 1.6852182701251455e-07, "loss": 0.2317, "step": 4007 }, { "epoch": 2.200988467874794, "grad_norm": 0.41168765729540324, "learning_rate": 1.683044938445608e-07, "loss": 0.2818, "step": 4008 }, { "epoch": 2.2015376166941243, "grad_norm": 0.539069737040092, "learning_rate": 1.680872726284934e-07, "loss": 0.2349, "step": 4009 }, { "epoch": 2.202086765513454, "grad_norm": 0.4869059414028793, "learning_rate": 1.6787016343761678e-07, "loss": 0.2308, "step": 4010 }, { "epoch": 2.202635914332784, "grad_norm": 0.5087358252541315, "learning_rate": 1.6765316634519707e-07, "loss": 0.2546, "step": 4011 }, { "epoch": 2.203185063152114, "grad_norm": 0.45386299329004964, "learning_rate": 1.6743628142446264e-07, "loss": 0.2121, "step": 4012 }, { "epoch": 2.2037342119714443, "grad_norm": 0.5280596873106801, "learning_rate": 1.6721950874860454e-07, "loss": 0.2551, "step": 4013 }, { "epoch": 2.2042833607907744, "grad_norm": 1.02622346579664, "learning_rate": 1.670028483907751e-07, "loss": 0.4209, "step": 4014 }, { "epoch": 2.2048325096101045, "grad_norm": 0.43413318763392755, "learning_rate": 1.6678630042408952e-07, "loss": 0.2374, "step": 4015 }, { "epoch": 2.2053816584294346, "grad_norm": 0.43318741293930946, "learning_rate": 1.6656986492162478e-07, "loss": 0.2331, "step": 4016 }, { "epoch": 2.2059308072487642, "grad_norm": 0.489768975704594, "learning_rate": 1.6635354195641985e-07, "loss": 0.2421, "step": 4017 }, { "epoch": 2.2064799560680943, "grad_norm": 0.46233173815457396, "learning_rate": 1.6613733160147554e-07, "loss": 0.2585, "step": 4018 }, { "epoch": 2.2070291048874244, "grad_norm": 0.5509875085989772, "learning_rate": 1.6592123392975505e-07, "loss": 0.2445, "step": 4019 }, { "epoch": 2.2075782537067545, "grad_norm": 0.520179801161191, "learning_rate": 1.6570524901418356e-07, "loss": 0.2334, "step": 4020 }, { "epoch": 2.2081274025260846, "grad_norm": 0.5163865066842153, "learning_rate": 1.654893769276477e-07, "loss": 0.2159, "step": 4021 }, { "epoch": 2.2086765513454147, "grad_norm": 0.4984183004108988, "learning_rate": 1.652736177429966e-07, "loss": 0.3069, "step": 4022 }, { "epoch": 2.209225700164745, "grad_norm": 0.42803332023888097, "learning_rate": 1.6505797153304082e-07, "loss": 0.2599, "step": 4023 }, { "epoch": 2.2097748489840745, "grad_norm": 0.5260417229374884, "learning_rate": 1.6484243837055327e-07, "loss": 0.2287, "step": 4024 }, { "epoch": 2.2103239978034046, "grad_norm": 0.5136301521855394, "learning_rate": 1.6462701832826814e-07, "loss": 0.2382, "step": 4025 }, { "epoch": 2.2108731466227347, "grad_norm": 0.5088282106460268, "learning_rate": 1.6441171147888187e-07, "loss": 0.2285, "step": 4026 }, { "epoch": 2.211422295442065, "grad_norm": 0.45164567314033205, "learning_rate": 1.6419651789505285e-07, "loss": 0.2727, "step": 4027 }, { "epoch": 2.211971444261395, "grad_norm": 0.603651328225906, "learning_rate": 1.639814376494008e-07, "loss": 0.2463, "step": 4028 }, { "epoch": 2.212520593080725, "grad_norm": 0.485303946701434, "learning_rate": 1.6376647081450717e-07, "loss": 0.235, "step": 4029 }, { "epoch": 2.213069741900055, "grad_norm": 0.6459471769862432, "learning_rate": 1.6355161746291568e-07, "loss": 0.2565, "step": 4030 }, { "epoch": 2.213618890719385, "grad_norm": 0.6934801056751915, "learning_rate": 1.633368776671311e-07, "loss": 0.3024, "step": 4031 }, { "epoch": 2.214168039538715, "grad_norm": 0.577355114181319, "learning_rate": 1.6312225149962038e-07, "loss": 0.2506, "step": 4032 }, { "epoch": 2.214717188358045, "grad_norm": 0.5525564996174658, "learning_rate": 1.6290773903281215e-07, "loss": 0.2512, "step": 4033 }, { "epoch": 2.215266337177375, "grad_norm": 0.5917190480644012, "learning_rate": 1.626933403390962e-07, "loss": 0.2301, "step": 4034 }, { "epoch": 2.215815485996705, "grad_norm": 0.5862396722785724, "learning_rate": 1.624790554908241e-07, "loss": 0.2596, "step": 4035 }, { "epoch": 2.2163646348160353, "grad_norm": 0.5124609754820473, "learning_rate": 1.622648845603092e-07, "loss": 0.3049, "step": 4036 }, { "epoch": 2.2169137836353654, "grad_norm": 0.5090848377694975, "learning_rate": 1.6205082761982656e-07, "loss": 0.2601, "step": 4037 }, { "epoch": 2.217462932454695, "grad_norm": 0.5332019121630734, "learning_rate": 1.6183688474161207e-07, "loss": 0.247, "step": 4038 }, { "epoch": 2.218012081274025, "grad_norm": 0.4836118264097785, "learning_rate": 1.61623055997864e-07, "loss": 0.2577, "step": 4039 }, { "epoch": 2.2185612300933553, "grad_norm": 0.4831484679955604, "learning_rate": 1.6140934146074122e-07, "loss": 0.2323, "step": 4040 }, { "epoch": 2.2191103789126854, "grad_norm": 0.4385352786071897, "learning_rate": 1.6119574120236496e-07, "loss": 0.2439, "step": 4041 }, { "epoch": 2.2196595277320155, "grad_norm": 0.44689419860704854, "learning_rate": 1.6098225529481705e-07, "loss": 0.2456, "step": 4042 }, { "epoch": 2.2202086765513456, "grad_norm": 0.5279272153579984, "learning_rate": 1.6076888381014133e-07, "loss": 0.2743, "step": 4043 }, { "epoch": 2.2207578253706757, "grad_norm": 0.47364791781897286, "learning_rate": 1.6055562682034306e-07, "loss": 0.2756, "step": 4044 }, { "epoch": 2.2213069741900053, "grad_norm": 0.4418247722099023, "learning_rate": 1.6034248439738808e-07, "loss": 0.2634, "step": 4045 }, { "epoch": 2.2218561230093354, "grad_norm": 0.6459587624664268, "learning_rate": 1.601294566132043e-07, "loss": 0.2784, "step": 4046 }, { "epoch": 2.2224052718286655, "grad_norm": 0.5170173724091901, "learning_rate": 1.5991654353968095e-07, "loss": 0.2439, "step": 4047 }, { "epoch": 2.2229544206479956, "grad_norm": 0.46719571459521797, "learning_rate": 1.597037452486681e-07, "loss": 0.2325, "step": 4048 }, { "epoch": 2.2235035694673257, "grad_norm": 0.6445040020296987, "learning_rate": 1.5949106181197745e-07, "loss": 0.2374, "step": 4049 }, { "epoch": 2.224052718286656, "grad_norm": 0.646825191451552, "learning_rate": 1.59278493301382e-07, "loss": 0.2676, "step": 4050 }, { "epoch": 2.224601867105986, "grad_norm": 0.4673100278924, "learning_rate": 1.5906603978861559e-07, "loss": 0.2476, "step": 4051 }, { "epoch": 2.2251510159253156, "grad_norm": 0.5407872055791325, "learning_rate": 1.5885370134537327e-07, "loss": 0.2062, "step": 4052 }, { "epoch": 2.2257001647446457, "grad_norm": 0.4708126322166367, "learning_rate": 1.5864147804331166e-07, "loss": 0.239, "step": 4053 }, { "epoch": 2.226249313563976, "grad_norm": 0.5738989256146745, "learning_rate": 1.5842936995404848e-07, "loss": 0.235, "step": 4054 }, { "epoch": 2.226798462383306, "grad_norm": 0.4213264483050754, "learning_rate": 1.5821737714916222e-07, "loss": 0.2569, "step": 4055 }, { "epoch": 2.227347611202636, "grad_norm": 0.4920217189253605, "learning_rate": 1.5800549970019243e-07, "loss": 0.2559, "step": 4056 }, { "epoch": 2.227896760021966, "grad_norm": 0.49807406052357484, "learning_rate": 1.5779373767864017e-07, "loss": 0.2486, "step": 4057 }, { "epoch": 2.228445908841296, "grad_norm": 0.41236021888645474, "learning_rate": 1.5758209115596746e-07, "loss": 0.2685, "step": 4058 }, { "epoch": 2.228995057660626, "grad_norm": 0.5765654989844405, "learning_rate": 1.5737056020359682e-07, "loss": 0.2405, "step": 4059 }, { "epoch": 2.229544206479956, "grad_norm": 0.4717565799493322, "learning_rate": 1.5715914489291244e-07, "loss": 0.2028, "step": 4060 }, { "epoch": 2.230093355299286, "grad_norm": 0.43428493381873695, "learning_rate": 1.5694784529525938e-07, "loss": 0.2317, "step": 4061 }, { "epoch": 2.230642504118616, "grad_norm": 0.44660722205331316, "learning_rate": 1.5673666148194295e-07, "loss": 0.226, "step": 4062 }, { "epoch": 2.2311916529379463, "grad_norm": 0.7357898715805092, "learning_rate": 1.565255935242302e-07, "loss": 0.2154, "step": 4063 }, { "epoch": 2.2317408017572764, "grad_norm": 0.4598953633135372, "learning_rate": 1.563146414933489e-07, "loss": 0.241, "step": 4064 }, { "epoch": 2.232289950576606, "grad_norm": 0.4815475185276992, "learning_rate": 1.5610380546048723e-07, "loss": 0.2458, "step": 4065 }, { "epoch": 2.232839099395936, "grad_norm": 0.46722349964631205, "learning_rate": 1.5589308549679504e-07, "loss": 0.2958, "step": 4066 }, { "epoch": 2.2333882482152663, "grad_norm": 0.4280929222132165, "learning_rate": 1.5568248167338217e-07, "loss": 0.2783, "step": 4067 }, { "epoch": 2.2339373970345964, "grad_norm": 0.6224497554969846, "learning_rate": 1.5547199406131993e-07, "loss": 0.2571, "step": 4068 }, { "epoch": 2.2344865458539265, "grad_norm": 0.689987558103597, "learning_rate": 1.5526162273163983e-07, "loss": 0.2701, "step": 4069 }, { "epoch": 2.2350356946732566, "grad_norm": 0.4536144589688099, "learning_rate": 1.5505136775533463e-07, "loss": 0.2664, "step": 4070 }, { "epoch": 2.2355848434925867, "grad_norm": 0.44457846028632664, "learning_rate": 1.548412292033578e-07, "loss": 0.2524, "step": 4071 }, { "epoch": 2.2361339923119163, "grad_norm": 0.5144545991105619, "learning_rate": 1.5463120714662322e-07, "loss": 0.2293, "step": 4072 }, { "epoch": 2.2366831411312464, "grad_norm": 0.494227605491344, "learning_rate": 1.5442130165600538e-07, "loss": 0.2399, "step": 4073 }, { "epoch": 2.2372322899505765, "grad_norm": 0.5653225293394155, "learning_rate": 1.5421151280233982e-07, "loss": 0.2508, "step": 4074 }, { "epoch": 2.2377814387699067, "grad_norm": 0.4994693054488046, "learning_rate": 1.5400184065642272e-07, "loss": 0.2268, "step": 4075 }, { "epoch": 2.2383305875892368, "grad_norm": 0.4689069691469193, "learning_rate": 1.5379228528901043e-07, "loss": 0.2931, "step": 4076 }, { "epoch": 2.238879736408567, "grad_norm": 0.6373272585422195, "learning_rate": 1.5358284677082042e-07, "loss": 0.2347, "step": 4077 }, { "epoch": 2.239428885227897, "grad_norm": 0.5542172172071798, "learning_rate": 1.5337352517253032e-07, "loss": 0.2371, "step": 4078 }, { "epoch": 2.2399780340472266, "grad_norm": 0.53591832041364, "learning_rate": 1.5316432056477836e-07, "loss": 0.2347, "step": 4079 }, { "epoch": 2.2405271828665567, "grad_norm": 0.5276035884714745, "learning_rate": 1.5295523301816346e-07, "loss": 0.2234, "step": 4080 }, { "epoch": 2.241076331685887, "grad_norm": 0.41176842871437136, "learning_rate": 1.527462626032452e-07, "loss": 0.2099, "step": 4081 }, { "epoch": 2.241625480505217, "grad_norm": 0.4666858806452414, "learning_rate": 1.5253740939054306e-07, "loss": 0.2212, "step": 4082 }, { "epoch": 2.242174629324547, "grad_norm": 0.4843293028575849, "learning_rate": 1.5232867345053764e-07, "loss": 0.2774, "step": 4083 }, { "epoch": 2.242723778143877, "grad_norm": 0.5509966873108286, "learning_rate": 1.5212005485366918e-07, "loss": 0.259, "step": 4084 }, { "epoch": 2.2432729269632072, "grad_norm": 0.5969395767924995, "learning_rate": 1.5191155367033924e-07, "loss": 0.2954, "step": 4085 }, { "epoch": 2.243822075782537, "grad_norm": 0.6112912825644582, "learning_rate": 1.5170316997090892e-07, "loss": 0.2618, "step": 4086 }, { "epoch": 2.244371224601867, "grad_norm": 0.47695269373134713, "learning_rate": 1.5149490382570017e-07, "loss": 0.2208, "step": 4087 }, { "epoch": 2.244920373421197, "grad_norm": 0.5348327908424828, "learning_rate": 1.5128675530499537e-07, "loss": 0.285, "step": 4088 }, { "epoch": 2.245469522240527, "grad_norm": 0.49428017049523315, "learning_rate": 1.5107872447903681e-07, "loss": 0.2247, "step": 4089 }, { "epoch": 2.2460186710598573, "grad_norm": 0.5260675279454065, "learning_rate": 1.5087081141802696e-07, "loss": 0.277, "step": 4090 }, { "epoch": 2.2465678198791874, "grad_norm": 0.4333691550355185, "learning_rate": 1.5066301619212916e-07, "loss": 0.2315, "step": 4091 }, { "epoch": 2.247116968698517, "grad_norm": 0.46681953633637757, "learning_rate": 1.5045533887146663e-07, "loss": 0.2437, "step": 4092 }, { "epoch": 2.247666117517847, "grad_norm": 0.43472436170194234, "learning_rate": 1.5024777952612255e-07, "loss": 0.2199, "step": 4093 }, { "epoch": 2.2482152663371773, "grad_norm": 0.3900325184868594, "learning_rate": 1.500403382261409e-07, "loss": 0.2391, "step": 4094 }, { "epoch": 2.2487644151565074, "grad_norm": 0.4997800989330255, "learning_rate": 1.4983301504152536e-07, "loss": 0.2198, "step": 4095 }, { "epoch": 2.2493135639758375, "grad_norm": 0.48044212882901877, "learning_rate": 1.4962581004223954e-07, "loss": 0.2373, "step": 4096 }, { "epoch": 2.2498627127951676, "grad_norm": 0.6828704663809463, "learning_rate": 1.4941872329820787e-07, "loss": 0.2848, "step": 4097 }, { "epoch": 2.2504118616144977, "grad_norm": 0.4851853837271703, "learning_rate": 1.4921175487931452e-07, "loss": 0.2422, "step": 4098 }, { "epoch": 2.2509610104338273, "grad_norm": 0.4223467889212768, "learning_rate": 1.490049048554035e-07, "loss": 0.2341, "step": 4099 }, { "epoch": 2.2515101592531575, "grad_norm": 0.804796153082989, "learning_rate": 1.4879817329627905e-07, "loss": 0.2767, "step": 4100 }, { "epoch": 2.2520593080724876, "grad_norm": 0.5008326743721045, "learning_rate": 1.4859156027170557e-07, "loss": 0.2691, "step": 4101 }, { "epoch": 2.2526084568918177, "grad_norm": 0.4789662682726989, "learning_rate": 1.4838506585140746e-07, "loss": 0.241, "step": 4102 }, { "epoch": 2.2531576057111478, "grad_norm": 0.5204002515437153, "learning_rate": 1.481786901050687e-07, "loss": 0.2855, "step": 4103 }, { "epoch": 2.253706754530478, "grad_norm": 0.44641206706778724, "learning_rate": 1.4797243310233368e-07, "loss": 0.2435, "step": 4104 }, { "epoch": 2.254255903349808, "grad_norm": 0.4257416843597193, "learning_rate": 1.4776629491280663e-07, "loss": 0.2354, "step": 4105 }, { "epoch": 2.2548050521691376, "grad_norm": 0.40645721899295234, "learning_rate": 1.4756027560605144e-07, "loss": 0.2239, "step": 4106 }, { "epoch": 2.2553542009884677, "grad_norm": 0.5542036143621838, "learning_rate": 1.4735437525159197e-07, "loss": 0.2051, "step": 4107 }, { "epoch": 2.255903349807798, "grad_norm": 0.5349724416344412, "learning_rate": 1.4714859391891208e-07, "loss": 0.2252, "step": 4108 }, { "epoch": 2.256452498627128, "grad_norm": 0.5051684785400459, "learning_rate": 1.4694293167745558e-07, "loss": 0.2852, "step": 4109 }, { "epoch": 2.257001647446458, "grad_norm": 0.469247869430049, "learning_rate": 1.4673738859662574e-07, "loss": 0.2155, "step": 4110 }, { "epoch": 2.257550796265788, "grad_norm": 0.5772923371348587, "learning_rate": 1.4653196474578557e-07, "loss": 0.2255, "step": 4111 }, { "epoch": 2.2580999450851182, "grad_norm": 0.4824546686346936, "learning_rate": 1.4632666019425845e-07, "loss": 0.2101, "step": 4112 }, { "epoch": 2.258649093904448, "grad_norm": 0.5374394105576086, "learning_rate": 1.461214750113267e-07, "loss": 0.2365, "step": 4113 }, { "epoch": 2.259198242723778, "grad_norm": 0.42661273111054115, "learning_rate": 1.4591640926623304e-07, "loss": 0.2303, "step": 4114 }, { "epoch": 2.259747391543108, "grad_norm": 0.49170551201387735, "learning_rate": 1.4571146302817958e-07, "loss": 0.2261, "step": 4115 }, { "epoch": 2.260296540362438, "grad_norm": 0.4792400377244397, "learning_rate": 1.4550663636632815e-07, "loss": 0.2247, "step": 4116 }, { "epoch": 2.2608456891817683, "grad_norm": 0.5174771700880091, "learning_rate": 1.4530192934979993e-07, "loss": 0.2932, "step": 4117 }, { "epoch": 2.2613948380010984, "grad_norm": 0.48536259945038496, "learning_rate": 1.450973420476762e-07, "loss": 0.2744, "step": 4118 }, { "epoch": 2.2619439868204285, "grad_norm": 0.47287259796667797, "learning_rate": 1.448928745289978e-07, "loss": 0.2601, "step": 4119 }, { "epoch": 2.262493135639758, "grad_norm": 0.5177657418212257, "learning_rate": 1.446885268627646e-07, "loss": 0.2379, "step": 4120 }, { "epoch": 2.2630422844590883, "grad_norm": 0.6116402132194474, "learning_rate": 1.4448429911793683e-07, "loss": 0.2399, "step": 4121 }, { "epoch": 2.2635914332784184, "grad_norm": 0.48870409654320696, "learning_rate": 1.4428019136343343e-07, "loss": 0.2247, "step": 4122 }, { "epoch": 2.2641405820977485, "grad_norm": 0.43384224228435914, "learning_rate": 1.4407620366813365e-07, "loss": 0.2178, "step": 4123 }, { "epoch": 2.2646897309170786, "grad_norm": 0.6020925406231517, "learning_rate": 1.438723361008754e-07, "loss": 0.2423, "step": 4124 }, { "epoch": 2.2652388797364087, "grad_norm": 0.46034038408160577, "learning_rate": 1.436685887304567e-07, "loss": 0.2195, "step": 4125 }, { "epoch": 2.265788028555739, "grad_norm": 0.4027509383441927, "learning_rate": 1.4346496162563496e-07, "loss": 0.2132, "step": 4126 }, { "epoch": 2.2663371773750685, "grad_norm": 0.4291865889254372, "learning_rate": 1.432614548551266e-07, "loss": 0.2573, "step": 4127 }, { "epoch": 2.2668863261943986, "grad_norm": 0.4991068444154207, "learning_rate": 1.4305806848760748e-07, "loss": 0.2392, "step": 4128 }, { "epoch": 2.2674354750137287, "grad_norm": 0.526838059294347, "learning_rate": 1.4285480259171346e-07, "loss": 0.2817, "step": 4129 }, { "epoch": 2.2679846238330588, "grad_norm": 0.50959453292258, "learning_rate": 1.426516572360388e-07, "loss": 0.2276, "step": 4130 }, { "epoch": 2.268533772652389, "grad_norm": 0.5983390269755672, "learning_rate": 1.4244863248913789e-07, "loss": 0.2743, "step": 4131 }, { "epoch": 2.269082921471719, "grad_norm": 0.4584851259788919, "learning_rate": 1.4224572841952415e-07, "loss": 0.2313, "step": 4132 }, { "epoch": 2.269632070291049, "grad_norm": 0.5201587455828883, "learning_rate": 1.4204294509567013e-07, "loss": 0.2858, "step": 4133 }, { "epoch": 2.2701812191103787, "grad_norm": 0.41876205462654115, "learning_rate": 1.4184028258600756e-07, "loss": 0.2502, "step": 4134 }, { "epoch": 2.270730367929709, "grad_norm": 0.4640150705458623, "learning_rate": 1.4163774095892772e-07, "loss": 0.273, "step": 4135 }, { "epoch": 2.271279516749039, "grad_norm": 0.46891782931303666, "learning_rate": 1.414353202827811e-07, "loss": 0.2476, "step": 4136 }, { "epoch": 2.271828665568369, "grad_norm": 0.4752361435919389, "learning_rate": 1.4123302062587685e-07, "loss": 0.2374, "step": 4137 }, { "epoch": 2.272377814387699, "grad_norm": 0.6349830260268547, "learning_rate": 1.4103084205648407e-07, "loss": 0.2081, "step": 4138 }, { "epoch": 2.2729269632070292, "grad_norm": 0.6378449238164671, "learning_rate": 1.408287846428303e-07, "loss": 0.2051, "step": 4139 }, { "epoch": 2.2734761120263594, "grad_norm": 0.5101686674793757, "learning_rate": 1.4062684845310263e-07, "loss": 0.2736, "step": 4140 }, { "epoch": 2.274025260845689, "grad_norm": 0.4879750356050871, "learning_rate": 1.4042503355544686e-07, "loss": 0.2576, "step": 4141 }, { "epoch": 2.274574409665019, "grad_norm": 0.5910931952269277, "learning_rate": 1.4022334001796823e-07, "loss": 0.281, "step": 4142 }, { "epoch": 2.275123558484349, "grad_norm": 0.4879267299939271, "learning_rate": 1.4002176790873118e-07, "loss": 0.2519, "step": 4143 }, { "epoch": 2.2756727073036793, "grad_norm": 0.5600512166164386, "learning_rate": 1.398203172957583e-07, "loss": 0.2039, "step": 4144 }, { "epoch": 2.2762218561230094, "grad_norm": 0.4623371108049366, "learning_rate": 1.3961898824703198e-07, "loss": 0.2435, "step": 4145 }, { "epoch": 2.2767710049423395, "grad_norm": 0.4657295018169696, "learning_rate": 1.3941778083049355e-07, "loss": 0.2361, "step": 4146 }, { "epoch": 2.2773201537616696, "grad_norm": 0.3943734515947322, "learning_rate": 1.3921669511404282e-07, "loss": 0.2495, "step": 4147 }, { "epoch": 2.2778693025809993, "grad_norm": 0.5030704399097244, "learning_rate": 1.3901573116553891e-07, "loss": 0.2137, "step": 4148 }, { "epoch": 2.2784184514003294, "grad_norm": 0.5123039149905788, "learning_rate": 1.3881488905279994e-07, "loss": 0.2701, "step": 4149 }, { "epoch": 2.2789676002196595, "grad_norm": 1.1897659495246475, "learning_rate": 1.3861416884360257e-07, "loss": 0.2341, "step": 4150 }, { "epoch": 2.2795167490389896, "grad_norm": 0.49501193114645126, "learning_rate": 1.3841357060568228e-07, "loss": 0.2319, "step": 4151 }, { "epoch": 2.2800658978583197, "grad_norm": 0.5728811954742753, "learning_rate": 1.382130944067338e-07, "loss": 0.2336, "step": 4152 }, { "epoch": 2.28061504667765, "grad_norm": 0.5020682990748071, "learning_rate": 1.3801274031441057e-07, "loss": 0.2859, "step": 4153 }, { "epoch": 2.28116419549698, "grad_norm": 0.4705747603793455, "learning_rate": 1.378125083963246e-07, "loss": 0.2467, "step": 4154 }, { "epoch": 2.2817133443163096, "grad_norm": 0.4840372994927695, "learning_rate": 1.3761239872004663e-07, "loss": 0.2232, "step": 4155 }, { "epoch": 2.2822624931356397, "grad_norm": 0.4573782643179784, "learning_rate": 1.3741241135310638e-07, "loss": 0.2246, "step": 4156 }, { "epoch": 2.2828116419549698, "grad_norm": 0.48441613925674715, "learning_rate": 1.372125463629924e-07, "loss": 0.2308, "step": 4157 }, { "epoch": 2.2833607907743, "grad_norm": 0.6238382663211087, "learning_rate": 1.3701280381715151e-07, "loss": 0.2608, "step": 4158 }, { "epoch": 2.28390993959363, "grad_norm": 0.4027952861337082, "learning_rate": 1.3681318378298963e-07, "loss": 0.2332, "step": 4159 }, { "epoch": 2.28445908841296, "grad_norm": 0.4280092420858081, "learning_rate": 1.366136863278714e-07, "loss": 0.2535, "step": 4160 }, { "epoch": 2.28500823723229, "grad_norm": 0.4679994580993365, "learning_rate": 1.3641431151911932e-07, "loss": 0.2402, "step": 4161 }, { "epoch": 2.28555738605162, "grad_norm": 0.5013205493799293, "learning_rate": 1.3621505942401523e-07, "loss": 0.2244, "step": 4162 }, { "epoch": 2.28610653487095, "grad_norm": 0.5671218638074996, "learning_rate": 1.3601593010979964e-07, "loss": 0.2505, "step": 4163 }, { "epoch": 2.28665568369028, "grad_norm": 0.5536695446176169, "learning_rate": 1.35816923643671e-07, "loss": 0.2236, "step": 4164 }, { "epoch": 2.28720483250961, "grad_norm": 0.598254009050342, "learning_rate": 1.3561804009278698e-07, "loss": 0.2456, "step": 4165 }, { "epoch": 2.2877539813289403, "grad_norm": 0.5059548541358106, "learning_rate": 1.354192795242632e-07, "loss": 0.2131, "step": 4166 }, { "epoch": 2.2883031301482704, "grad_norm": 0.6420163200377998, "learning_rate": 1.352206420051742e-07, "loss": 0.1971, "step": 4167 }, { "epoch": 2.2888522789676005, "grad_norm": 0.5120859935343264, "learning_rate": 1.3502212760255262e-07, "loss": 0.2105, "step": 4168 }, { "epoch": 2.28940142778693, "grad_norm": 0.4237407187151969, "learning_rate": 1.3482373638338991e-07, "loss": 0.2348, "step": 4169 }, { "epoch": 2.2899505766062602, "grad_norm": 0.5369619556933241, "learning_rate": 1.3462546841463595e-07, "loss": 0.2431, "step": 4170 }, { "epoch": 2.2904997254255903, "grad_norm": 0.4841101372581998, "learning_rate": 1.3442732376319868e-07, "loss": 0.2372, "step": 4171 }, { "epoch": 2.2910488742449204, "grad_norm": 0.4577820813197618, "learning_rate": 1.3422930249594447e-07, "loss": 0.2743, "step": 4172 }, { "epoch": 2.2915980230642505, "grad_norm": 0.47536115409390084, "learning_rate": 1.3403140467969833e-07, "loss": 0.2825, "step": 4173 }, { "epoch": 2.2921471718835806, "grad_norm": 0.46715608398591546, "learning_rate": 1.338336303812438e-07, "loss": 0.2347, "step": 4174 }, { "epoch": 2.2926963207029103, "grad_norm": 0.539137596474045, "learning_rate": 1.336359796673219e-07, "loss": 0.2302, "step": 4175 }, { "epoch": 2.2932454695222404, "grad_norm": 0.5050491248706318, "learning_rate": 1.3343845260463288e-07, "loss": 0.2821, "step": 4176 }, { "epoch": 2.2937946183415705, "grad_norm": 0.5095488283924023, "learning_rate": 1.3324104925983468e-07, "loss": 0.2427, "step": 4177 }, { "epoch": 2.2943437671609006, "grad_norm": 0.45178748063899143, "learning_rate": 1.3304376969954354e-07, "loss": 0.2393, "step": 4178 }, { "epoch": 2.2948929159802307, "grad_norm": 0.4152125291904169, "learning_rate": 1.3284661399033408e-07, "loss": 0.2588, "step": 4179 }, { "epoch": 2.295442064799561, "grad_norm": 0.4989093197992777, "learning_rate": 1.3264958219873937e-07, "loss": 0.2323, "step": 4180 }, { "epoch": 2.2959912136188905, "grad_norm": 0.4718507509496132, "learning_rate": 1.3245267439124998e-07, "loss": 0.2469, "step": 4181 }, { "epoch": 2.2965403624382206, "grad_norm": 0.5042173308827593, "learning_rate": 1.322558906343154e-07, "loss": 0.2469, "step": 4182 }, { "epoch": 2.2970895112575507, "grad_norm": 0.5479451490372311, "learning_rate": 1.3205923099434264e-07, "loss": 0.2674, "step": 4183 }, { "epoch": 2.297638660076881, "grad_norm": 0.48121238253968684, "learning_rate": 1.318626955376973e-07, "loss": 0.2754, "step": 4184 }, { "epoch": 2.298187808896211, "grad_norm": 0.4613830281941604, "learning_rate": 1.3166628433070255e-07, "loss": 0.2644, "step": 4185 }, { "epoch": 2.298736957715541, "grad_norm": 0.4497349512087476, "learning_rate": 1.3146999743964013e-07, "loss": 0.2197, "step": 4186 }, { "epoch": 2.299286106534871, "grad_norm": 0.5615817774770319, "learning_rate": 1.3127383493074981e-07, "loss": 0.2507, "step": 4187 }, { "epoch": 2.2998352553542007, "grad_norm": 0.4706206430873418, "learning_rate": 1.31077796870229e-07, "loss": 0.2641, "step": 4188 }, { "epoch": 2.300384404173531, "grad_norm": 0.46509667714035746, "learning_rate": 1.3088188332423322e-07, "loss": 0.2251, "step": 4189 }, { "epoch": 2.300933552992861, "grad_norm": 0.42678614100547047, "learning_rate": 1.3068609435887611e-07, "loss": 0.2259, "step": 4190 }, { "epoch": 2.301482701812191, "grad_norm": 0.5081430522107518, "learning_rate": 1.3049043004022956e-07, "loss": 0.2709, "step": 4191 }, { "epoch": 2.302031850631521, "grad_norm": 0.4473636918398846, "learning_rate": 1.3029489043432267e-07, "loss": 0.2467, "step": 4192 }, { "epoch": 2.3025809994508513, "grad_norm": 0.44088709699424494, "learning_rate": 1.3009947560714317e-07, "loss": 0.238, "step": 4193 }, { "epoch": 2.3031301482701814, "grad_norm": 0.46544621864065105, "learning_rate": 1.2990418562463617e-07, "loss": 0.2472, "step": 4194 }, { "epoch": 2.303679297089511, "grad_norm": 0.4953664352897843, "learning_rate": 1.297090205527048e-07, "loss": 0.2749, "step": 4195 }, { "epoch": 2.304228445908841, "grad_norm": 0.44324776675655786, "learning_rate": 1.2951398045721013e-07, "loss": 0.2695, "step": 4196 }, { "epoch": 2.3047775947281712, "grad_norm": 0.5218129026698359, "learning_rate": 1.2931906540397115e-07, "loss": 0.2444, "step": 4197 }, { "epoch": 2.3053267435475013, "grad_norm": 0.5470852378369458, "learning_rate": 1.2912427545876448e-07, "loss": 0.2496, "step": 4198 }, { "epoch": 2.3058758923668314, "grad_norm": 0.4819022869823857, "learning_rate": 1.289296106873243e-07, "loss": 0.2459, "step": 4199 }, { "epoch": 2.3064250411861615, "grad_norm": 0.5023444791148095, "learning_rate": 1.2873507115534297e-07, "loss": 0.2213, "step": 4200 }, { "epoch": 2.3064250411861615, "eval_loss": 0.32245030999183655, "eval_runtime": 18.6676, "eval_samples_per_second": 23.731, "eval_steps_per_second": 1.018, "step": 4200 }, { "epoch": 2.3069741900054916, "grad_norm": 0.5028076886002473, "learning_rate": 1.2854065692847057e-07, "loss": 0.215, "step": 4201 }, { "epoch": 2.3075233388248213, "grad_norm": 0.5222683948325819, "learning_rate": 1.2834636807231442e-07, "loss": 0.2789, "step": 4202 }, { "epoch": 2.3080724876441514, "grad_norm": 0.4420205263254049, "learning_rate": 1.2815220465244004e-07, "loss": 0.2355, "step": 4203 }, { "epoch": 2.3086216364634815, "grad_norm": 0.6417395145747327, "learning_rate": 1.279581667343705e-07, "loss": 0.3134, "step": 4204 }, { "epoch": 2.3091707852828116, "grad_norm": 0.4976515160196446, "learning_rate": 1.2776425438358644e-07, "loss": 0.258, "step": 4205 }, { "epoch": 2.3097199341021417, "grad_norm": 0.4119084196110188, "learning_rate": 1.2757046766552583e-07, "loss": 0.2363, "step": 4206 }, { "epoch": 2.310269082921472, "grad_norm": 0.5520819338288431, "learning_rate": 1.2737680664558474e-07, "loss": 0.2691, "step": 4207 }, { "epoch": 2.310818231740802, "grad_norm": 0.5878284488854353, "learning_rate": 1.2718327138911692e-07, "loss": 0.2321, "step": 4208 }, { "epoch": 2.3113673805601316, "grad_norm": 0.4695433374773004, "learning_rate": 1.2698986196143308e-07, "loss": 0.2258, "step": 4209 }, { "epoch": 2.3119165293794617, "grad_norm": 0.4606583882657425, "learning_rate": 1.2679657842780164e-07, "loss": 0.2397, "step": 4210 }, { "epoch": 2.312465678198792, "grad_norm": 0.6718002381521354, "learning_rate": 1.2660342085344904e-07, "loss": 0.288, "step": 4211 }, { "epoch": 2.313014827018122, "grad_norm": 0.599579526968771, "learning_rate": 1.264103893035585e-07, "loss": 0.2629, "step": 4212 }, { "epoch": 2.313563975837452, "grad_norm": 0.662819334016803, "learning_rate": 1.2621748384327125e-07, "loss": 0.1986, "step": 4213 }, { "epoch": 2.314113124656782, "grad_norm": 0.5326928265387822, "learning_rate": 1.2602470453768582e-07, "loss": 0.2403, "step": 4214 }, { "epoch": 2.314662273476112, "grad_norm": 0.5305556106293511, "learning_rate": 1.258320514518581e-07, "loss": 0.2355, "step": 4215 }, { "epoch": 2.315211422295442, "grad_norm": 0.44449672727673184, "learning_rate": 1.256395246508012e-07, "loss": 0.2361, "step": 4216 }, { "epoch": 2.315760571114772, "grad_norm": 0.5026382580085097, "learning_rate": 1.2544712419948597e-07, "loss": 0.2485, "step": 4217 }, { "epoch": 2.316309719934102, "grad_norm": 0.4567895108347193, "learning_rate": 1.2525485016284066e-07, "loss": 0.2275, "step": 4218 }, { "epoch": 2.316858868753432, "grad_norm": 0.49035864016220915, "learning_rate": 1.2506270260575034e-07, "loss": 0.2487, "step": 4219 }, { "epoch": 2.3174080175727623, "grad_norm": 0.4639398044704909, "learning_rate": 1.2487068159305802e-07, "loss": 0.2056, "step": 4220 }, { "epoch": 2.3179571663920924, "grad_norm": 0.49657167218952303, "learning_rate": 1.2467878718956345e-07, "loss": 0.2729, "step": 4221 }, { "epoch": 2.3185063152114225, "grad_norm": 0.5182790923584928, "learning_rate": 1.2448701946002416e-07, "loss": 0.2255, "step": 4222 }, { "epoch": 2.319055464030752, "grad_norm": 0.45015121881861003, "learning_rate": 1.2429537846915446e-07, "loss": 0.2386, "step": 4223 }, { "epoch": 2.3196046128500822, "grad_norm": 1.1108999488654705, "learning_rate": 1.241038642816263e-07, "loss": 0.2267, "step": 4224 }, { "epoch": 2.3201537616694123, "grad_norm": 0.4938133111052092, "learning_rate": 1.2391247696206871e-07, "loss": 0.2223, "step": 4225 }, { "epoch": 2.3207029104887424, "grad_norm": 0.5120856297107359, "learning_rate": 1.237212165750678e-07, "loss": 0.2604, "step": 4226 }, { "epoch": 2.3212520593080725, "grad_norm": 0.4763940134692922, "learning_rate": 1.235300831851667e-07, "loss": 0.2249, "step": 4227 }, { "epoch": 2.3218012081274026, "grad_norm": 0.5368401775453835, "learning_rate": 1.2333907685686626e-07, "loss": 0.2599, "step": 4228 }, { "epoch": 2.3223503569467328, "grad_norm": 0.40448978934655716, "learning_rate": 1.2314819765462365e-07, "loss": 0.2459, "step": 4229 }, { "epoch": 2.3228995057660624, "grad_norm": 0.5502038566417293, "learning_rate": 1.229574456428539e-07, "loss": 0.2701, "step": 4230 }, { "epoch": 2.3234486545853925, "grad_norm": 0.6532108981027288, "learning_rate": 1.2276682088592874e-07, "loss": 0.2563, "step": 4231 }, { "epoch": 2.3239978034047226, "grad_norm": 0.4854354415425811, "learning_rate": 1.2257632344817694e-07, "loss": 0.2718, "step": 4232 }, { "epoch": 2.3245469522240527, "grad_norm": 0.45894294338083286, "learning_rate": 1.2238595339388425e-07, "loss": 0.2459, "step": 4233 }, { "epoch": 2.325096101043383, "grad_norm": 0.5349834782133368, "learning_rate": 1.221957107872937e-07, "loss": 0.2448, "step": 4234 }, { "epoch": 2.325645249862713, "grad_norm": 0.5371325491267095, "learning_rate": 1.2200559569260526e-07, "loss": 0.2224, "step": 4235 }, { "epoch": 2.326194398682043, "grad_norm": 0.7260262052011621, "learning_rate": 1.218156081739755e-07, "loss": 0.2927, "step": 4236 }, { "epoch": 2.3267435475013727, "grad_norm": 0.4847790014182052, "learning_rate": 1.216257482955185e-07, "loss": 0.2366, "step": 4237 }, { "epoch": 2.327292696320703, "grad_norm": 0.4921433509960056, "learning_rate": 1.2143601612130463e-07, "loss": 0.259, "step": 4238 }, { "epoch": 2.327841845140033, "grad_norm": 0.47500745214077006, "learning_rate": 1.2124641171536192e-07, "loss": 0.24, "step": 4239 }, { "epoch": 2.328390993959363, "grad_norm": 0.5335704117403829, "learning_rate": 1.2105693514167447e-07, "loss": 0.213, "step": 4240 }, { "epoch": 2.328940142778693, "grad_norm": 0.4364405302540622, "learning_rate": 1.2086758646418388e-07, "loss": 0.2679, "step": 4241 }, { "epoch": 2.329489291598023, "grad_norm": 0.4609656074571149, "learning_rate": 1.2067836574678852e-07, "loss": 0.2616, "step": 4242 }, { "epoch": 2.3300384404173533, "grad_norm": 0.5191173781151636, "learning_rate": 1.2048927305334293e-07, "loss": 0.2272, "step": 4243 }, { "epoch": 2.330587589236683, "grad_norm": 0.46160447828026485, "learning_rate": 1.203003084476592e-07, "loss": 0.2217, "step": 4244 }, { "epoch": 2.331136738056013, "grad_norm": 0.5488737327224031, "learning_rate": 1.2011147199350604e-07, "loss": 0.2557, "step": 4245 }, { "epoch": 2.331685886875343, "grad_norm": 0.6538087947424733, "learning_rate": 1.1992276375460852e-07, "loss": 0.2326, "step": 4246 }, { "epoch": 2.3322350356946733, "grad_norm": 0.4730574495749438, "learning_rate": 1.1973418379464894e-07, "loss": 0.2372, "step": 4247 }, { "epoch": 2.3327841845140034, "grad_norm": 0.5177884979699207, "learning_rate": 1.1954573217726606e-07, "loss": 0.2656, "step": 4248 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5921690084074351, "learning_rate": 1.1935740896605537e-07, "loss": 0.2662, "step": 4249 }, { "epoch": 2.3338824821526636, "grad_norm": 0.5201907411818603, "learning_rate": 1.1916921422456882e-07, "loss": 0.2207, "step": 4250 }, { "epoch": 2.3344316309719932, "grad_norm": 0.5260692883393193, "learning_rate": 1.1898114801631536e-07, "loss": 0.2384, "step": 4251 }, { "epoch": 2.3349807797913233, "grad_norm": 0.5756611759536572, "learning_rate": 1.1879321040476047e-07, "loss": 0.2512, "step": 4252 }, { "epoch": 2.3355299286106534, "grad_norm": 0.4889136823495688, "learning_rate": 1.1860540145332616e-07, "loss": 0.2206, "step": 4253 }, { "epoch": 2.3360790774299836, "grad_norm": 0.42958622210826153, "learning_rate": 1.1841772122539078e-07, "loss": 0.2953, "step": 4254 }, { "epoch": 2.3366282262493137, "grad_norm": 0.44851739303000837, "learning_rate": 1.1823016978428967e-07, "loss": 0.2457, "step": 4255 }, { "epoch": 2.3371773750686438, "grad_norm": 0.47484907975911395, "learning_rate": 1.1804274719331467e-07, "loss": 0.2373, "step": 4256 }, { "epoch": 2.337726523887974, "grad_norm": 0.4319152563531555, "learning_rate": 1.1785545351571377e-07, "loss": 0.2239, "step": 4257 }, { "epoch": 2.3382756727073035, "grad_norm": 0.41076212400247253, "learning_rate": 1.1766828881469174e-07, "loss": 0.2251, "step": 4258 }, { "epoch": 2.3388248215266336, "grad_norm": 0.5131923747593286, "learning_rate": 1.1748125315341012e-07, "loss": 0.2626, "step": 4259 }, { "epoch": 2.3393739703459637, "grad_norm": 0.4577463645190812, "learning_rate": 1.1729434659498595e-07, "loss": 0.2418, "step": 4260 }, { "epoch": 2.339923119165294, "grad_norm": 0.6556110706574263, "learning_rate": 1.1710756920249362e-07, "loss": 0.2544, "step": 4261 }, { "epoch": 2.340472267984624, "grad_norm": 0.5940060863553352, "learning_rate": 1.1692092103896368e-07, "loss": 0.2513, "step": 4262 }, { "epoch": 2.341021416803954, "grad_norm": 0.46886447583609353, "learning_rate": 1.1673440216738284e-07, "loss": 0.2249, "step": 4263 }, { "epoch": 2.341570565623284, "grad_norm": 0.5204159100522225, "learning_rate": 1.1654801265069461e-07, "loss": 0.2189, "step": 4264 }, { "epoch": 2.342119714442614, "grad_norm": 0.4714101620279995, "learning_rate": 1.1636175255179827e-07, "loss": 0.2488, "step": 4265 }, { "epoch": 2.342668863261944, "grad_norm": 0.46709236563612133, "learning_rate": 1.1617562193354997e-07, "loss": 0.2556, "step": 4266 }, { "epoch": 2.343218012081274, "grad_norm": 0.4485807640582403, "learning_rate": 1.1598962085876179e-07, "loss": 0.2493, "step": 4267 }, { "epoch": 2.343767160900604, "grad_norm": 0.4828530596839069, "learning_rate": 1.1580374939020224e-07, "loss": 0.2736, "step": 4268 }, { "epoch": 2.344316309719934, "grad_norm": 0.5000389152064297, "learning_rate": 1.1561800759059631e-07, "loss": 0.244, "step": 4269 }, { "epoch": 2.3448654585392643, "grad_norm": 0.4523261706748118, "learning_rate": 1.1543239552262491e-07, "loss": 0.2279, "step": 4270 }, { "epoch": 2.3454146073585944, "grad_norm": 0.5014126950733165, "learning_rate": 1.1524691324892504e-07, "loss": 0.2257, "step": 4271 }, { "epoch": 2.345963756177924, "grad_norm": 0.433614731458899, "learning_rate": 1.150615608320903e-07, "loss": 0.2479, "step": 4272 }, { "epoch": 2.346512904997254, "grad_norm": 0.5294204203885148, "learning_rate": 1.148763383346705e-07, "loss": 0.2661, "step": 4273 }, { "epoch": 2.3470620538165843, "grad_norm": 0.6317452629057796, "learning_rate": 1.146912458191711e-07, "loss": 0.2397, "step": 4274 }, { "epoch": 2.3476112026359144, "grad_norm": 0.6496618431978495, "learning_rate": 1.1450628334805424e-07, "loss": 0.3358, "step": 4275 }, { "epoch": 2.3481603514552445, "grad_norm": 0.5387620894767209, "learning_rate": 1.1432145098373784e-07, "loss": 0.2644, "step": 4276 }, { "epoch": 2.3487095002745746, "grad_norm": 0.5069993642692732, "learning_rate": 1.1413674878859586e-07, "loss": 0.2929, "step": 4277 }, { "epoch": 2.3492586490939047, "grad_norm": 0.420093953231744, "learning_rate": 1.1395217682495869e-07, "loss": 0.2621, "step": 4278 }, { "epoch": 2.3498077979132344, "grad_norm": 0.5087446544423593, "learning_rate": 1.1376773515511264e-07, "loss": 0.276, "step": 4279 }, { "epoch": 2.3503569467325645, "grad_norm": 0.5749831515642492, "learning_rate": 1.1358342384129964e-07, "loss": 0.2166, "step": 4280 }, { "epoch": 2.3509060955518946, "grad_norm": 0.4719869175860627, "learning_rate": 1.1339924294571836e-07, "loss": 0.2641, "step": 4281 }, { "epoch": 2.3514552443712247, "grad_norm": 0.400708437347147, "learning_rate": 1.1321519253052279e-07, "loss": 0.2407, "step": 4282 }, { "epoch": 2.3520043931905548, "grad_norm": 0.43711396043717626, "learning_rate": 1.1303127265782336e-07, "loss": 0.2181, "step": 4283 }, { "epoch": 2.352553542009885, "grad_norm": 0.54790020943926, "learning_rate": 1.1284748338968601e-07, "loss": 0.2371, "step": 4284 }, { "epoch": 2.3531026908292145, "grad_norm": 0.5012709329124785, "learning_rate": 1.1266382478813301e-07, "loss": 0.2269, "step": 4285 }, { "epoch": 2.3536518396485446, "grad_norm": 0.38092286863274005, "learning_rate": 1.1248029691514248e-07, "loss": 0.2236, "step": 4286 }, { "epoch": 2.3542009884678747, "grad_norm": 0.47856482859649163, "learning_rate": 1.1229689983264818e-07, "loss": 0.2545, "step": 4287 }, { "epoch": 2.354750137287205, "grad_norm": 0.6555891944072414, "learning_rate": 1.121136336025397e-07, "loss": 0.2283, "step": 4288 }, { "epoch": 2.355299286106535, "grad_norm": 0.6117859112792935, "learning_rate": 1.119304982866629e-07, "loss": 0.2879, "step": 4289 }, { "epoch": 2.355848434925865, "grad_norm": 0.5068529822453939, "learning_rate": 1.117474939468192e-07, "loss": 0.2465, "step": 4290 }, { "epoch": 2.3563975837451947, "grad_norm": 0.4535440561125151, "learning_rate": 1.1156462064476561e-07, "loss": 0.2202, "step": 4291 }, { "epoch": 2.356946732564525, "grad_norm": 0.5119287633543099, "learning_rate": 1.1138187844221538e-07, "loss": 0.2665, "step": 4292 }, { "epoch": 2.357495881383855, "grad_norm": 0.5412579399554837, "learning_rate": 1.1119926740083718e-07, "loss": 0.2119, "step": 4293 }, { "epoch": 2.358045030203185, "grad_norm": 0.38685315237206247, "learning_rate": 1.1101678758225536e-07, "loss": 0.2533, "step": 4294 }, { "epoch": 2.358594179022515, "grad_norm": 0.47099203630115455, "learning_rate": 1.1083443904805026e-07, "loss": 0.2561, "step": 4295 }, { "epoch": 2.359143327841845, "grad_norm": 0.6391279683496444, "learning_rate": 1.1065222185975791e-07, "loss": 0.2463, "step": 4296 }, { "epoch": 2.3596924766611753, "grad_norm": 0.5486280607492743, "learning_rate": 1.1047013607886977e-07, "loss": 0.2534, "step": 4297 }, { "epoch": 2.360241625480505, "grad_norm": 0.4241321222380373, "learning_rate": 1.1028818176683295e-07, "loss": 0.2747, "step": 4298 }, { "epoch": 2.360790774299835, "grad_norm": 0.5938147227404399, "learning_rate": 1.101063589850505e-07, "loss": 0.3289, "step": 4299 }, { "epoch": 2.361339923119165, "grad_norm": 0.5139592711795388, "learning_rate": 1.0992466779488099e-07, "loss": 0.2682, "step": 4300 }, { "epoch": 2.3618890719384953, "grad_norm": 0.44562065599471057, "learning_rate": 1.0974310825763829e-07, "loss": 0.253, "step": 4301 }, { "epoch": 2.3624382207578254, "grad_norm": 0.527512526247087, "learning_rate": 1.0956168043459215e-07, "loss": 0.2395, "step": 4302 }, { "epoch": 2.3629873695771555, "grad_norm": 0.49398659460407146, "learning_rate": 1.093803843869679e-07, "loss": 0.2223, "step": 4303 }, { "epoch": 2.3635365183964856, "grad_norm": 0.5210338927778311, "learning_rate": 1.0919922017594612e-07, "loss": 0.2948, "step": 4304 }, { "epoch": 2.3640856672158153, "grad_norm": 0.49469326074013886, "learning_rate": 1.0901818786266303e-07, "loss": 0.2489, "step": 4305 }, { "epoch": 2.3646348160351454, "grad_norm": 0.3976455802712967, "learning_rate": 1.088372875082104e-07, "loss": 0.2786, "step": 4306 }, { "epoch": 2.3651839648544755, "grad_norm": 0.4604244835348604, "learning_rate": 1.0865651917363561e-07, "loss": 0.2306, "step": 4307 }, { "epoch": 2.3657331136738056, "grad_norm": 0.5098586663669838, "learning_rate": 1.0847588291994118e-07, "loss": 0.2363, "step": 4308 }, { "epoch": 2.3662822624931357, "grad_norm": 0.4770138024860164, "learning_rate": 1.0829537880808503e-07, "loss": 0.234, "step": 4309 }, { "epoch": 2.3668314113124658, "grad_norm": 0.4544513169824785, "learning_rate": 1.0811500689898097e-07, "loss": 0.2674, "step": 4310 }, { "epoch": 2.367380560131796, "grad_norm": 0.41911644776361084, "learning_rate": 1.0793476725349751e-07, "loss": 0.2227, "step": 4311 }, { "epoch": 2.3679297089511255, "grad_norm": 0.40929455911116724, "learning_rate": 1.0775465993245913e-07, "loss": 0.2235, "step": 4312 }, { "epoch": 2.3684788577704556, "grad_norm": 0.4778856964032043, "learning_rate": 1.075746849966455e-07, "loss": 0.2316, "step": 4313 }, { "epoch": 2.3690280065897857, "grad_norm": 0.6766860883109573, "learning_rate": 1.0739484250679135e-07, "loss": 0.3291, "step": 4314 }, { "epoch": 2.369577155409116, "grad_norm": 0.5104888238273152, "learning_rate": 1.0721513252358688e-07, "loss": 0.2885, "step": 4315 }, { "epoch": 2.370126304228446, "grad_norm": 0.5276061311172372, "learning_rate": 1.0703555510767761e-07, "loss": 0.247, "step": 4316 }, { "epoch": 2.370675453047776, "grad_norm": 0.45460748097813075, "learning_rate": 1.068561103196645e-07, "loss": 0.2388, "step": 4317 }, { "epoch": 2.371224601867106, "grad_norm": 0.49649655949071936, "learning_rate": 1.0667679822010326e-07, "loss": 0.248, "step": 4318 }, { "epoch": 2.371773750686436, "grad_norm": 0.5078286189025655, "learning_rate": 1.0649761886950542e-07, "loss": 0.2349, "step": 4319 }, { "epoch": 2.372322899505766, "grad_norm": 0.4909254745863682, "learning_rate": 1.063185723283371e-07, "loss": 0.2359, "step": 4320 }, { "epoch": 2.372872048325096, "grad_norm": 0.5059202788011038, "learning_rate": 1.0613965865702015e-07, "loss": 0.1924, "step": 4321 }, { "epoch": 2.373421197144426, "grad_norm": 0.3974481013083046, "learning_rate": 1.059608779159312e-07, "loss": 0.2288, "step": 4322 }, { "epoch": 2.3739703459637562, "grad_norm": 0.4573274318098116, "learning_rate": 1.0578223016540212e-07, "loss": 0.2637, "step": 4323 }, { "epoch": 2.3745194947830863, "grad_norm": 0.45358793399633246, "learning_rate": 1.0560371546572027e-07, "loss": 0.2269, "step": 4324 }, { "epoch": 2.3750686436024164, "grad_norm": 0.7731302920356223, "learning_rate": 1.0542533387712747e-07, "loss": 0.2488, "step": 4325 }, { "epoch": 2.375617792421746, "grad_norm": 0.588222509849646, "learning_rate": 1.052470854598209e-07, "loss": 0.2218, "step": 4326 }, { "epoch": 2.376166941241076, "grad_norm": 0.4793681619176967, "learning_rate": 1.0506897027395312e-07, "loss": 0.2213, "step": 4327 }, { "epoch": 2.3767160900604063, "grad_norm": 0.5120876375606997, "learning_rate": 1.048909883796311e-07, "loss": 0.2425, "step": 4328 }, { "epoch": 2.3772652388797364, "grad_norm": 0.531751945029049, "learning_rate": 1.047131398369174e-07, "loss": 0.2301, "step": 4329 }, { "epoch": 2.3778143876990665, "grad_norm": 0.43461029557694114, "learning_rate": 1.045354247058294e-07, "loss": 0.2735, "step": 4330 }, { "epoch": 2.3783635365183966, "grad_norm": 0.631674505967079, "learning_rate": 1.0435784304633932e-07, "loss": 0.2589, "step": 4331 }, { "epoch": 2.3789126853377267, "grad_norm": 0.3914681156675498, "learning_rate": 1.0418039491837425e-07, "loss": 0.246, "step": 4332 }, { "epoch": 2.3794618341570564, "grad_norm": 0.4413235226073941, "learning_rate": 1.0400308038181661e-07, "loss": 0.2, "step": 4333 }, { "epoch": 2.3800109829763865, "grad_norm": 0.49201735353071807, "learning_rate": 1.0382589949650357e-07, "loss": 0.2672, "step": 4334 }, { "epoch": 2.3805601317957166, "grad_norm": 0.5373055270318895, "learning_rate": 1.0364885232222695e-07, "loss": 0.2239, "step": 4335 }, { "epoch": 2.3811092806150467, "grad_norm": 0.486141598801213, "learning_rate": 1.0347193891873385e-07, "loss": 0.198, "step": 4336 }, { "epoch": 2.381658429434377, "grad_norm": 0.6010151478710597, "learning_rate": 1.0329515934572584e-07, "loss": 0.2476, "step": 4337 }, { "epoch": 2.382207578253707, "grad_norm": 0.4759674124733145, "learning_rate": 1.0311851366285973e-07, "loss": 0.2297, "step": 4338 }, { "epoch": 2.382756727073037, "grad_norm": 0.6590453929601003, "learning_rate": 1.0294200192974665e-07, "loss": 0.2399, "step": 4339 }, { "epoch": 2.3833058758923666, "grad_norm": 0.44851794214500884, "learning_rate": 1.0276562420595296e-07, "loss": 0.2418, "step": 4340 }, { "epoch": 2.3838550247116967, "grad_norm": 0.512476302877383, "learning_rate": 1.0258938055099996e-07, "loss": 0.2339, "step": 4341 }, { "epoch": 2.384404173531027, "grad_norm": 0.4426795224852948, "learning_rate": 1.0241327102436273e-07, "loss": 0.2931, "step": 4342 }, { "epoch": 2.384953322350357, "grad_norm": 0.4880026757668009, "learning_rate": 1.0223729568547216e-07, "loss": 0.2113, "step": 4343 }, { "epoch": 2.385502471169687, "grad_norm": 0.5599612822022351, "learning_rate": 1.020614545937135e-07, "loss": 0.2274, "step": 4344 }, { "epoch": 2.386051619989017, "grad_norm": 0.5012609443804696, "learning_rate": 1.0188574780842638e-07, "loss": 0.2711, "step": 4345 }, { "epoch": 2.3866007688083473, "grad_norm": 0.5387146725539798, "learning_rate": 1.0171017538890549e-07, "loss": 0.2556, "step": 4346 }, { "epoch": 2.387149917627677, "grad_norm": 0.5983525041283249, "learning_rate": 1.0153473739440018e-07, "loss": 0.2585, "step": 4347 }, { "epoch": 2.387699066447007, "grad_norm": 0.5029789926137569, "learning_rate": 1.0135943388411421e-07, "loss": 0.208, "step": 4348 }, { "epoch": 2.388248215266337, "grad_norm": 0.48430980156286985, "learning_rate": 1.0118426491720595e-07, "loss": 0.2597, "step": 4349 }, { "epoch": 2.3887973640856672, "grad_norm": 0.4665689250511465, "learning_rate": 1.0100923055278854e-07, "loss": 0.2246, "step": 4350 }, { "epoch": 2.3893465129049973, "grad_norm": 0.5592559655570383, "learning_rate": 1.0083433084992975e-07, "loss": 0.2664, "step": 4351 }, { "epoch": 2.3898956617243274, "grad_norm": 0.6540075497942756, "learning_rate": 1.0065956586765175e-07, "loss": 0.2521, "step": 4352 }, { "epoch": 2.3904448105436575, "grad_norm": 0.47202705992815247, "learning_rate": 1.0048493566493109e-07, "loss": 0.2563, "step": 4353 }, { "epoch": 2.390993959362987, "grad_norm": 0.5767527481812633, "learning_rate": 1.0031044030069913e-07, "loss": 0.2849, "step": 4354 }, { "epoch": 2.3915431081823173, "grad_norm": 0.46186204981589096, "learning_rate": 1.0013607983384179e-07, "loss": 0.2412, "step": 4355 }, { "epoch": 2.3920922570016474, "grad_norm": 0.46172287020894826, "learning_rate": 9.996185432319904e-08, "loss": 0.2357, "step": 4356 }, { "epoch": 2.3926414058209775, "grad_norm": 0.543553376912218, "learning_rate": 9.978776382756572e-08, "loss": 0.227, "step": 4357 }, { "epoch": 2.3931905546403076, "grad_norm": 0.41105971152644444, "learning_rate": 9.961380840569121e-08, "loss": 0.2489, "step": 4358 }, { "epoch": 2.3937397034596377, "grad_norm": 0.4974158564396423, "learning_rate": 9.943998811627856e-08, "loss": 0.2483, "step": 4359 }, { "epoch": 2.394288852278968, "grad_norm": 0.5365922929244143, "learning_rate": 9.926630301798592e-08, "loss": 0.2906, "step": 4360 }, { "epoch": 2.3948380010982975, "grad_norm": 0.4947522001725725, "learning_rate": 9.909275316942574e-08, "loss": 0.2398, "step": 4361 }, { "epoch": 2.3953871499176276, "grad_norm": 0.5752732662987095, "learning_rate": 9.891933862916449e-08, "loss": 0.2288, "step": 4362 }, { "epoch": 2.3959362987369577, "grad_norm": 0.3997133026869859, "learning_rate": 9.874605945572346e-08, "loss": 0.2253, "step": 4363 }, { "epoch": 2.396485447556288, "grad_norm": 0.45715600577386994, "learning_rate": 9.857291570757764e-08, "loss": 0.2227, "step": 4364 }, { "epoch": 2.397034596375618, "grad_norm": 0.4213770743715131, "learning_rate": 9.839990744315699e-08, "loss": 0.2402, "step": 4365 }, { "epoch": 2.397583745194948, "grad_norm": 0.5166226279311078, "learning_rate": 9.82270347208452e-08, "loss": 0.2539, "step": 4366 }, { "epoch": 2.398132894014278, "grad_norm": 0.452688891661523, "learning_rate": 9.805429759898045e-08, "loss": 0.2656, "step": 4367 }, { "epoch": 2.3986820428336078, "grad_norm": 0.4586291992283334, "learning_rate": 9.788169613585539e-08, "loss": 0.2349, "step": 4368 }, { "epoch": 2.399231191652938, "grad_norm": 0.4045984015829321, "learning_rate": 9.770923038971654e-08, "loss": 0.2479, "step": 4369 }, { "epoch": 2.399780340472268, "grad_norm": 0.5521762898576388, "learning_rate": 9.75369004187645e-08, "loss": 0.2596, "step": 4370 }, { "epoch": 2.400329489291598, "grad_norm": 0.419191957298065, "learning_rate": 9.736470628115454e-08, "loss": 0.2386, "step": 4371 }, { "epoch": 2.400878638110928, "grad_norm": 0.4252823717825137, "learning_rate": 9.71926480349959e-08, "loss": 0.2214, "step": 4372 }, { "epoch": 2.4014277869302583, "grad_norm": 0.5684931772677159, "learning_rate": 9.70207257383516e-08, "loss": 0.2426, "step": 4373 }, { "epoch": 2.4019769357495884, "grad_norm": 0.4485266747106593, "learning_rate": 9.684893944923945e-08, "loss": 0.2338, "step": 4374 }, { "epoch": 2.402526084568918, "grad_norm": 0.5105006286040609, "learning_rate": 9.667728922563079e-08, "loss": 0.2348, "step": 4375 }, { "epoch": 2.403075233388248, "grad_norm": 0.577179913354242, "learning_rate": 9.650577512545107e-08, "loss": 0.2186, "step": 4376 }, { "epoch": 2.4036243822075782, "grad_norm": 0.5205406358972823, "learning_rate": 9.633439720658025e-08, "loss": 0.2059, "step": 4377 }, { "epoch": 2.4041735310269083, "grad_norm": 0.5416767468489839, "learning_rate": 9.616315552685206e-08, "loss": 0.2527, "step": 4378 }, { "epoch": 2.4047226798462384, "grad_norm": 0.5383829047059119, "learning_rate": 9.599205014405403e-08, "loss": 0.2653, "step": 4379 }, { "epoch": 2.4052718286655685, "grad_norm": 0.5205052997060304, "learning_rate": 9.582108111592828e-08, "loss": 0.2469, "step": 4380 }, { "epoch": 2.4058209774848986, "grad_norm": 0.563233640111495, "learning_rate": 9.565024850017018e-08, "loss": 0.2469, "step": 4381 }, { "epoch": 2.4063701263042283, "grad_norm": 0.4728240975570711, "learning_rate": 9.547955235442973e-08, "loss": 0.2205, "step": 4382 }, { "epoch": 2.4069192751235584, "grad_norm": 0.4895681046957997, "learning_rate": 9.530899273631037e-08, "loss": 0.205, "step": 4383 }, { "epoch": 2.4074684239428885, "grad_norm": 0.44901135203918346, "learning_rate": 9.513856970336978e-08, "loss": 0.2338, "step": 4384 }, { "epoch": 2.4080175727622186, "grad_norm": 0.5643198771423413, "learning_rate": 9.496828331311964e-08, "loss": 0.2368, "step": 4385 }, { "epoch": 2.4085667215815487, "grad_norm": 0.6805435945224101, "learning_rate": 9.479813362302514e-08, "loss": 0.301, "step": 4386 }, { "epoch": 2.409115870400879, "grad_norm": 0.49187417084380314, "learning_rate": 9.462812069050539e-08, "loss": 0.2706, "step": 4387 }, { "epoch": 2.409665019220209, "grad_norm": 0.4268395573369364, "learning_rate": 9.445824457293372e-08, "loss": 0.2374, "step": 4388 }, { "epoch": 2.4102141680395386, "grad_norm": 0.46956514558084034, "learning_rate": 9.428850532763706e-08, "loss": 0.2406, "step": 4389 }, { "epoch": 2.4107633168588687, "grad_norm": 0.5022831073057764, "learning_rate": 9.411890301189598e-08, "loss": 0.234, "step": 4390 }, { "epoch": 2.411312465678199, "grad_norm": 0.3807012479632907, "learning_rate": 9.394943768294525e-08, "loss": 0.2192, "step": 4391 }, { "epoch": 2.411861614497529, "grad_norm": 0.5172628021923034, "learning_rate": 9.378010939797307e-08, "loss": 0.2502, "step": 4392 }, { "epoch": 2.412410763316859, "grad_norm": 0.46136877782017105, "learning_rate": 9.361091821412134e-08, "loss": 0.2591, "step": 4393 }, { "epoch": 2.412959912136189, "grad_norm": 0.48927396447017557, "learning_rate": 9.3441864188486e-08, "loss": 0.2353, "step": 4394 }, { "epoch": 2.4135090609555188, "grad_norm": 0.46291028976332893, "learning_rate": 9.327294737811666e-08, "loss": 0.2353, "step": 4395 }, { "epoch": 2.414058209774849, "grad_norm": 0.4997470876740817, "learning_rate": 9.31041678400164e-08, "loss": 0.2547, "step": 4396 }, { "epoch": 2.414607358594179, "grad_norm": 0.46639621530332454, "learning_rate": 9.293552563114196e-08, "loss": 0.2083, "step": 4397 }, { "epoch": 2.415156507413509, "grad_norm": 0.49796551252809607, "learning_rate": 9.276702080840402e-08, "loss": 0.2186, "step": 4398 }, { "epoch": 2.415705656232839, "grad_norm": 0.45295016261714366, "learning_rate": 9.259865342866681e-08, "loss": 0.2436, "step": 4399 }, { "epoch": 2.4162548050521693, "grad_norm": 0.5644035711929897, "learning_rate": 9.243042354874802e-08, "loss": 0.2199, "step": 4400 }, { "epoch": 2.4162548050521693, "eval_loss": 0.3218434751033783, "eval_runtime": 18.8179, "eval_samples_per_second": 23.541, "eval_steps_per_second": 1.01, "step": 4400 }, { "epoch": 2.416803953871499, "grad_norm": 0.4954053405517015, "learning_rate": 9.226233122541902e-08, "loss": 0.249, "step": 4401 }, { "epoch": 2.417353102690829, "grad_norm": 0.5466308595919291, "learning_rate": 9.209437651540493e-08, "loss": 0.2358, "step": 4402 }, { "epoch": 2.417902251510159, "grad_norm": 0.58704118126291, "learning_rate": 9.19265594753842e-08, "loss": 0.2745, "step": 4403 }, { "epoch": 2.4184514003294892, "grad_norm": 0.5497120333793641, "learning_rate": 9.175888016198873e-08, "loss": 0.2349, "step": 4404 }, { "epoch": 2.4190005491488193, "grad_norm": 0.4464225928039183, "learning_rate": 9.159133863180433e-08, "loss": 0.2645, "step": 4405 }, { "epoch": 2.4195496979681494, "grad_norm": 0.44321208222461583, "learning_rate": 9.142393494136993e-08, "loss": 0.2287, "step": 4406 }, { "epoch": 2.4200988467874796, "grad_norm": 0.6709706945677903, "learning_rate": 9.125666914717822e-08, "loss": 0.2722, "step": 4407 }, { "epoch": 2.420647995606809, "grad_norm": 0.5212296637611338, "learning_rate": 9.10895413056753e-08, "loss": 0.2794, "step": 4408 }, { "epoch": 2.4211971444261393, "grad_norm": 0.46196723960508035, "learning_rate": 9.092255147326056e-08, "loss": 0.2835, "step": 4409 }, { "epoch": 2.4217462932454694, "grad_norm": 0.4501118265794812, "learning_rate": 9.075569970628686e-08, "loss": 0.2357, "step": 4410 }, { "epoch": 2.4222954420647995, "grad_norm": 0.6753508987401065, "learning_rate": 9.058898606106053e-08, "loss": 0.2913, "step": 4411 }, { "epoch": 2.4228445908841296, "grad_norm": 0.6366039990706125, "learning_rate": 9.04224105938415e-08, "loss": 0.2695, "step": 4412 }, { "epoch": 2.4233937397034597, "grad_norm": 0.4195244742532176, "learning_rate": 9.02559733608427e-08, "loss": 0.2555, "step": 4413 }, { "epoch": 2.42394288852279, "grad_norm": 0.524655756756528, "learning_rate": 9.008967441823042e-08, "loss": 0.2124, "step": 4414 }, { "epoch": 2.4244920373421195, "grad_norm": 0.5548239347341882, "learning_rate": 8.992351382212459e-08, "loss": 0.2168, "step": 4415 }, { "epoch": 2.4250411861614496, "grad_norm": 0.5828240840118025, "learning_rate": 8.975749162859838e-08, "loss": 0.2384, "step": 4416 }, { "epoch": 2.4255903349807797, "grad_norm": 0.5240522025761735, "learning_rate": 8.959160789367792e-08, "loss": 0.2792, "step": 4417 }, { "epoch": 2.42613948380011, "grad_norm": 0.5088922393469666, "learning_rate": 8.942586267334307e-08, "loss": 0.2841, "step": 4418 }, { "epoch": 2.42668863261944, "grad_norm": 0.43944622161327423, "learning_rate": 8.926025602352669e-08, "loss": 0.2244, "step": 4419 }, { "epoch": 2.42723778143877, "grad_norm": 0.4738918514059997, "learning_rate": 8.909478800011494e-08, "loss": 0.2439, "step": 4420 }, { "epoch": 2.4277869302581, "grad_norm": 0.5248901979871129, "learning_rate": 8.8929458658947e-08, "loss": 0.2057, "step": 4421 }, { "epoch": 2.4283360790774298, "grad_norm": 0.4695585077638441, "learning_rate": 8.876426805581569e-08, "loss": 0.2308, "step": 4422 }, { "epoch": 2.42888522789676, "grad_norm": 0.4761902535846165, "learning_rate": 8.85992162464665e-08, "loss": 0.2591, "step": 4423 }, { "epoch": 2.42943437671609, "grad_norm": 0.4752856968165827, "learning_rate": 8.843430328659858e-08, "loss": 0.256, "step": 4424 }, { "epoch": 2.42998352553542, "grad_norm": 0.48433398470774713, "learning_rate": 8.826952923186375e-08, "loss": 0.2796, "step": 4425 }, { "epoch": 2.43053267435475, "grad_norm": 0.5097786327147819, "learning_rate": 8.810489413786743e-08, "loss": 0.2361, "step": 4426 }, { "epoch": 2.4310818231740803, "grad_norm": 0.49916075192925374, "learning_rate": 8.794039806016759e-08, "loss": 0.2463, "step": 4427 }, { "epoch": 2.4316309719934104, "grad_norm": 0.4966817761403099, "learning_rate": 8.777604105427581e-08, "loss": 0.2215, "step": 4428 }, { "epoch": 2.43218012081274, "grad_norm": 0.5574000482092913, "learning_rate": 8.761182317565659e-08, "loss": 0.2385, "step": 4429 }, { "epoch": 2.43272926963207, "grad_norm": 0.4475154018773907, "learning_rate": 8.744774447972733e-08, "loss": 0.2652, "step": 4430 }, { "epoch": 2.4332784184514002, "grad_norm": 0.5247334845105581, "learning_rate": 8.728380502185838e-08, "loss": 0.2308, "step": 4431 }, { "epoch": 2.4338275672707304, "grad_norm": 0.5204925237892747, "learning_rate": 8.712000485737344e-08, "loss": 0.2527, "step": 4432 }, { "epoch": 2.4343767160900605, "grad_norm": 0.5186614406128908, "learning_rate": 8.695634404154914e-08, "loss": 0.2481, "step": 4433 }, { "epoch": 2.4349258649093906, "grad_norm": 0.42919351804571243, "learning_rate": 8.679282262961467e-08, "loss": 0.249, "step": 4434 }, { "epoch": 2.4354750137287207, "grad_norm": 0.5287714075267016, "learning_rate": 8.662944067675274e-08, "loss": 0.2586, "step": 4435 }, { "epoch": 2.4360241625480503, "grad_norm": 0.5175462565875877, "learning_rate": 8.646619823809872e-08, "loss": 0.2156, "step": 4436 }, { "epoch": 2.4365733113673804, "grad_norm": 0.4817574722613448, "learning_rate": 8.630309536874068e-08, "loss": 0.2366, "step": 4437 }, { "epoch": 2.4371224601867105, "grad_norm": 0.5957092456174624, "learning_rate": 8.614013212372002e-08, "loss": 0.257, "step": 4438 }, { "epoch": 2.4376716090060406, "grad_norm": 0.5695344567779025, "learning_rate": 8.597730855803093e-08, "loss": 0.2312, "step": 4439 }, { "epoch": 2.4382207578253707, "grad_norm": 0.5328666390892294, "learning_rate": 8.581462472662011e-08, "loss": 0.2982, "step": 4440 }, { "epoch": 2.438769906644701, "grad_norm": 0.5243554604630376, "learning_rate": 8.56520806843876e-08, "loss": 0.2625, "step": 4441 }, { "epoch": 2.439319055464031, "grad_norm": 0.4682395946228307, "learning_rate": 8.548967648618579e-08, "loss": 0.2602, "step": 4442 }, { "epoch": 2.4398682042833606, "grad_norm": 0.5506235333576627, "learning_rate": 8.53274121868204e-08, "loss": 0.2365, "step": 4443 }, { "epoch": 2.4404173531026907, "grad_norm": 0.5556772014264871, "learning_rate": 8.51652878410494e-08, "loss": 0.2424, "step": 4444 }, { "epoch": 2.440966501922021, "grad_norm": 0.4626338906764464, "learning_rate": 8.500330350358385e-08, "loss": 0.2306, "step": 4445 }, { "epoch": 2.441515650741351, "grad_norm": 0.39699238188640135, "learning_rate": 8.484145922908761e-08, "loss": 0.2548, "step": 4446 }, { "epoch": 2.442064799560681, "grad_norm": 0.4829888207451511, "learning_rate": 8.46797550721771e-08, "loss": 0.2334, "step": 4447 }, { "epoch": 2.442613948380011, "grad_norm": 0.5386809218348696, "learning_rate": 8.451819108742143e-08, "loss": 0.2187, "step": 4448 }, { "epoch": 2.443163097199341, "grad_norm": 0.5237659146626669, "learning_rate": 8.435676732934246e-08, "loss": 0.2248, "step": 4449 }, { "epoch": 2.443712246018671, "grad_norm": 0.5034164044930222, "learning_rate": 8.419548385241503e-08, "loss": 0.229, "step": 4450 }, { "epoch": 2.444261394838001, "grad_norm": 0.5076426120158689, "learning_rate": 8.403434071106605e-08, "loss": 0.2304, "step": 4451 }, { "epoch": 2.444810543657331, "grad_norm": 0.48753968449021995, "learning_rate": 8.38733379596757e-08, "loss": 0.2532, "step": 4452 }, { "epoch": 2.445359692476661, "grad_norm": 0.523920882184603, "learning_rate": 8.371247565257629e-08, "loss": 0.2769, "step": 4453 }, { "epoch": 2.4459088412959913, "grad_norm": 0.43749291452132416, "learning_rate": 8.355175384405283e-08, "loss": 0.2764, "step": 4454 }, { "epoch": 2.4464579901153214, "grad_norm": 0.4863054375290364, "learning_rate": 8.339117258834317e-08, "loss": 0.2379, "step": 4455 }, { "epoch": 2.4470071389346515, "grad_norm": 0.5243852634954214, "learning_rate": 8.323073193963758e-08, "loss": 0.2111, "step": 4456 }, { "epoch": 2.447556287753981, "grad_norm": 0.7103775551260478, "learning_rate": 8.307043195207887e-08, "loss": 0.2828, "step": 4457 }, { "epoch": 2.4481054365733113, "grad_norm": 0.5806254059960709, "learning_rate": 8.291027267976216e-08, "loss": 0.192, "step": 4458 }, { "epoch": 2.4486545853926414, "grad_norm": 0.5596410581920414, "learning_rate": 8.275025417673548e-08, "loss": 0.3013, "step": 4459 }, { "epoch": 2.4492037342119715, "grad_norm": 0.4114491355966605, "learning_rate": 8.259037649699932e-08, "loss": 0.2291, "step": 4460 }, { "epoch": 2.4497528830313016, "grad_norm": 2.0832545864842773, "learning_rate": 8.243063969450624e-08, "loss": 0.256, "step": 4461 }, { "epoch": 2.4503020318506317, "grad_norm": 0.44952515865386644, "learning_rate": 8.22710438231616e-08, "loss": 0.2542, "step": 4462 }, { "epoch": 2.4508511806699618, "grad_norm": 0.4124600747021196, "learning_rate": 8.21115889368233e-08, "loss": 0.2271, "step": 4463 }, { "epoch": 2.4514003294892914, "grad_norm": 0.47810416982513626, "learning_rate": 8.195227508930136e-08, "loss": 0.2556, "step": 4464 }, { "epoch": 2.4519494783086215, "grad_norm": 0.4388783779008914, "learning_rate": 8.179310233435819e-08, "loss": 0.2416, "step": 4465 }, { "epoch": 2.4524986271279516, "grad_norm": 0.6032316472019654, "learning_rate": 8.163407072570892e-08, "loss": 0.3062, "step": 4466 }, { "epoch": 2.4530477759472817, "grad_norm": 0.4790239740382794, "learning_rate": 8.147518031702092e-08, "loss": 0.2549, "step": 4467 }, { "epoch": 2.453596924766612, "grad_norm": 0.480721351821681, "learning_rate": 8.131643116191371e-08, "loss": 0.251, "step": 4468 }, { "epoch": 2.454146073585942, "grad_norm": 0.4607013919199864, "learning_rate": 8.115782331395924e-08, "loss": 0.2279, "step": 4469 }, { "epoch": 2.454695222405272, "grad_norm": 0.6073723304130222, "learning_rate": 8.099935682668194e-08, "loss": 0.2358, "step": 4470 }, { "epoch": 2.4552443712246017, "grad_norm": 0.4751717768485519, "learning_rate": 8.084103175355832e-08, "loss": 0.2275, "step": 4471 }, { "epoch": 2.455793520043932, "grad_norm": 0.47306168264036774, "learning_rate": 8.068284814801725e-08, "loss": 0.2093, "step": 4472 }, { "epoch": 2.456342668863262, "grad_norm": 0.5320540279394275, "learning_rate": 8.052480606344001e-08, "loss": 0.2317, "step": 4473 }, { "epoch": 2.456891817682592, "grad_norm": 0.5957291137148731, "learning_rate": 8.036690555315996e-08, "loss": 0.2292, "step": 4474 }, { "epoch": 2.457440966501922, "grad_norm": 0.5102692225089511, "learning_rate": 8.020914667046244e-08, "loss": 0.2226, "step": 4475 }, { "epoch": 2.4579901153212522, "grad_norm": 0.5950801030992386, "learning_rate": 8.00515294685855e-08, "loss": 0.2628, "step": 4476 }, { "epoch": 2.4585392641405823, "grad_norm": 0.49189330529355263, "learning_rate": 7.989405400071921e-08, "loss": 0.2717, "step": 4477 }, { "epoch": 2.459088412959912, "grad_norm": 0.4903939322800323, "learning_rate": 7.97367203200055e-08, "loss": 0.2349, "step": 4478 }, { "epoch": 2.459637561779242, "grad_norm": 0.538097157514057, "learning_rate": 7.957952847953895e-08, "loss": 0.267, "step": 4479 }, { "epoch": 2.460186710598572, "grad_norm": 0.5781107241347448, "learning_rate": 7.94224785323657e-08, "loss": 0.2512, "step": 4480 }, { "epoch": 2.4607358594179023, "grad_norm": 0.48795582583770775, "learning_rate": 7.926557053148471e-08, "loss": 0.2625, "step": 4481 }, { "epoch": 2.4612850082372324, "grad_norm": 0.6214673506142165, "learning_rate": 7.910880452984625e-08, "loss": 0.2736, "step": 4482 }, { "epoch": 2.4618341570565625, "grad_norm": 0.48320027033809915, "learning_rate": 7.895218058035325e-08, "loss": 0.2472, "step": 4483 }, { "epoch": 2.4623833058758926, "grad_norm": 0.47121023996135325, "learning_rate": 7.879569873586071e-08, "loss": 0.2093, "step": 4484 }, { "epoch": 2.4629324546952223, "grad_norm": 0.4884511373491773, "learning_rate": 7.863935904917524e-08, "loss": 0.2336, "step": 4485 }, { "epoch": 2.4634816035145524, "grad_norm": 0.4668186580042982, "learning_rate": 7.848316157305574e-08, "loss": 0.2444, "step": 4486 }, { "epoch": 2.4640307523338825, "grad_norm": 0.6922833591825045, "learning_rate": 7.832710636021325e-08, "loss": 0.2328, "step": 4487 }, { "epoch": 2.4645799011532126, "grad_norm": 0.5211608502495902, "learning_rate": 7.817119346331042e-08, "loss": 0.2208, "step": 4488 }, { "epoch": 2.4651290499725427, "grad_norm": 0.5028986802921136, "learning_rate": 7.801542293496228e-08, "loss": 0.2701, "step": 4489 }, { "epoch": 2.4656781987918728, "grad_norm": 0.49736401459194174, "learning_rate": 7.785979482773573e-08, "loss": 0.2339, "step": 4490 }, { "epoch": 2.466227347611203, "grad_norm": 0.7339926705720224, "learning_rate": 7.770430919414939e-08, "loss": 0.2853, "step": 4491 }, { "epoch": 2.4667764964305325, "grad_norm": 0.46659383366906854, "learning_rate": 7.754896608667385e-08, "loss": 0.257, "step": 4492 }, { "epoch": 2.4673256452498626, "grad_norm": 0.5086141346490957, "learning_rate": 7.739376555773176e-08, "loss": 0.2006, "step": 4493 }, { "epoch": 2.4678747940691927, "grad_norm": 0.5174660375398773, "learning_rate": 7.723870765969778e-08, "loss": 0.2473, "step": 4494 }, { "epoch": 2.468423942888523, "grad_norm": 0.49977753171792233, "learning_rate": 7.708379244489792e-08, "loss": 0.2357, "step": 4495 }, { "epoch": 2.468973091707853, "grad_norm": 0.48136542085855316, "learning_rate": 7.692901996561063e-08, "loss": 0.2814, "step": 4496 }, { "epoch": 2.469522240527183, "grad_norm": 0.5758850227474778, "learning_rate": 7.677439027406562e-08, "loss": 0.2767, "step": 4497 }, { "epoch": 2.4700713893465127, "grad_norm": 0.4671251903692303, "learning_rate": 7.66199034224451e-08, "loss": 0.2701, "step": 4498 }, { "epoch": 2.470620538165843, "grad_norm": 0.5008892223899206, "learning_rate": 7.646555946288228e-08, "loss": 0.2265, "step": 4499 }, { "epoch": 2.471169686985173, "grad_norm": 0.5094438666476996, "learning_rate": 7.63113584474628e-08, "loss": 0.2837, "step": 4500 }, { "epoch": 2.471718835804503, "grad_norm": 0.5499901741684797, "learning_rate": 7.615730042822398e-08, "loss": 0.2597, "step": 4501 }, { "epoch": 2.472267984623833, "grad_norm": 0.5113366532756419, "learning_rate": 7.600338545715435e-08, "loss": 0.227, "step": 4502 }, { "epoch": 2.4728171334431632, "grad_norm": 0.43814711456255245, "learning_rate": 7.584961358619464e-08, "loss": 0.2647, "step": 4503 }, { "epoch": 2.473366282262493, "grad_norm": 0.5650533464833986, "learning_rate": 7.569598486723742e-08, "loss": 0.2449, "step": 4504 }, { "epoch": 2.473915431081823, "grad_norm": 0.5229518966054156, "learning_rate": 7.55424993521265e-08, "loss": 0.2533, "step": 4505 }, { "epoch": 2.474464579901153, "grad_norm": 0.5845494639846216, "learning_rate": 7.538915709265764e-08, "loss": 0.2337, "step": 4506 }, { "epoch": 2.475013728720483, "grad_norm": 0.47235258697002974, "learning_rate": 7.523595814057832e-08, "loss": 0.2312, "step": 4507 }, { "epoch": 2.4755628775398133, "grad_norm": 0.6198207237108116, "learning_rate": 7.508290254758745e-08, "loss": 0.2642, "step": 4508 }, { "epoch": 2.4761120263591434, "grad_norm": 0.5605875853866593, "learning_rate": 7.49299903653355e-08, "loss": 0.247, "step": 4509 }, { "epoch": 2.4766611751784735, "grad_norm": 0.5745619828258272, "learning_rate": 7.477722164542491e-08, "loss": 0.2588, "step": 4510 }, { "epoch": 2.477210323997803, "grad_norm": 0.5176779724039913, "learning_rate": 7.46245964394095e-08, "loss": 0.228, "step": 4511 }, { "epoch": 2.4777594728171333, "grad_norm": 0.4248294792666142, "learning_rate": 7.447211479879459e-08, "loss": 0.2342, "step": 4512 }, { "epoch": 2.4783086216364634, "grad_norm": 0.44726024423781924, "learning_rate": 7.431977677503703e-08, "loss": 0.2133, "step": 4513 }, { "epoch": 2.4788577704557935, "grad_norm": 0.3952718732118721, "learning_rate": 7.416758241954532e-08, "loss": 0.2748, "step": 4514 }, { "epoch": 2.4794069192751236, "grad_norm": 0.4332565378001954, "learning_rate": 7.401553178367965e-08, "loss": 0.2284, "step": 4515 }, { "epoch": 2.4799560680944537, "grad_norm": 0.4947101319545579, "learning_rate": 7.386362491875126e-08, "loss": 0.2641, "step": 4516 }, { "epoch": 2.480505216913784, "grad_norm": 0.49513473950212356, "learning_rate": 7.371186187602318e-08, "loss": 0.2427, "step": 4517 }, { "epoch": 2.4810543657331134, "grad_norm": 0.3983030302307297, "learning_rate": 7.356024270671012e-08, "loss": 0.2258, "step": 4518 }, { "epoch": 2.4816035145524435, "grad_norm": 0.4788093986685573, "learning_rate": 7.34087674619775e-08, "loss": 0.2549, "step": 4519 }, { "epoch": 2.4821526633717736, "grad_norm": 0.5697330974536536, "learning_rate": 7.32574361929429e-08, "loss": 0.2366, "step": 4520 }, { "epoch": 2.4827018121911038, "grad_norm": 0.440195295438078, "learning_rate": 7.310624895067508e-08, "loss": 0.2142, "step": 4521 }, { "epoch": 2.483250961010434, "grad_norm": 0.9897982550390857, "learning_rate": 7.295520578619398e-08, "loss": 0.2756, "step": 4522 }, { "epoch": 2.483800109829764, "grad_norm": 0.46753288558942485, "learning_rate": 7.280430675047138e-08, "loss": 0.2139, "step": 4523 }, { "epoch": 2.484349258649094, "grad_norm": 0.5184665839066971, "learning_rate": 7.265355189442982e-08, "loss": 0.2265, "step": 4524 }, { "epoch": 2.4848984074684237, "grad_norm": 0.4402984143153301, "learning_rate": 7.250294126894384e-08, "loss": 0.2255, "step": 4525 }, { "epoch": 2.485447556287754, "grad_norm": 0.5200456503794312, "learning_rate": 7.235247492483868e-08, "loss": 0.2238, "step": 4526 }, { "epoch": 2.485996705107084, "grad_norm": 0.4625367661801, "learning_rate": 7.220215291289126e-08, "loss": 0.2514, "step": 4527 }, { "epoch": 2.486545853926414, "grad_norm": 0.5023040369718166, "learning_rate": 7.205197528382993e-08, "loss": 0.2306, "step": 4528 }, { "epoch": 2.487095002745744, "grad_norm": 0.5331758263751909, "learning_rate": 7.190194208833401e-08, "loss": 0.2359, "step": 4529 }, { "epoch": 2.4876441515650742, "grad_norm": 0.516711920347689, "learning_rate": 7.175205337703395e-08, "loss": 0.2352, "step": 4530 }, { "epoch": 2.4881933003844043, "grad_norm": 0.5026366629981299, "learning_rate": 7.160230920051183e-08, "loss": 0.2371, "step": 4531 }, { "epoch": 2.488742449203734, "grad_norm": 0.45576839945889447, "learning_rate": 7.145270960930095e-08, "loss": 0.2472, "step": 4532 }, { "epoch": 2.489291598023064, "grad_norm": 0.6569633844071732, "learning_rate": 7.130325465388541e-08, "loss": 0.258, "step": 4533 }, { "epoch": 2.489840746842394, "grad_norm": 0.5096238614233091, "learning_rate": 7.115394438470095e-08, "loss": 0.2183, "step": 4534 }, { "epoch": 2.4903898956617243, "grad_norm": 0.5313299516427737, "learning_rate": 7.100477885213421e-08, "loss": 0.2261, "step": 4535 }, { "epoch": 2.4909390444810544, "grad_norm": 0.474860027829337, "learning_rate": 7.085575810652291e-08, "loss": 0.2645, "step": 4536 }, { "epoch": 2.4914881933003845, "grad_norm": 0.5323962353379724, "learning_rate": 7.070688219815618e-08, "loss": 0.2313, "step": 4537 }, { "epoch": 2.4920373421197146, "grad_norm": 0.4636722831600837, "learning_rate": 7.05581511772743e-08, "loss": 0.3081, "step": 4538 }, { "epoch": 2.4925864909390443, "grad_norm": 0.5509361312060048, "learning_rate": 7.040956509406825e-08, "loss": 0.235, "step": 4539 }, { "epoch": 2.4931356397583744, "grad_norm": 0.471863275186115, "learning_rate": 7.026112399868062e-08, "loss": 0.2668, "step": 4540 }, { "epoch": 2.4936847885777045, "grad_norm": 0.4683989622672536, "learning_rate": 7.011282794120456e-08, "loss": 0.2156, "step": 4541 }, { "epoch": 2.4942339373970346, "grad_norm": 0.5776108222735982, "learning_rate": 6.996467697168477e-08, "loss": 0.2069, "step": 4542 }, { "epoch": 2.4947830862163647, "grad_norm": 0.5306033669221574, "learning_rate": 6.981667114011653e-08, "loss": 0.2751, "step": 4543 }, { "epoch": 2.495332235035695, "grad_norm": 0.5634408895800741, "learning_rate": 6.966881049644639e-08, "loss": 0.2308, "step": 4544 }, { "epoch": 2.495881383855025, "grad_norm": 0.6693938448580908, "learning_rate": 6.952109509057203e-08, "loss": 0.2405, "step": 4545 }, { "epoch": 2.4964305326743546, "grad_norm": 0.5068423685233001, "learning_rate": 6.937352497234187e-08, "loss": 0.2572, "step": 4546 }, { "epoch": 2.4969796814936847, "grad_norm": 0.5496883168462887, "learning_rate": 6.922610019155528e-08, "loss": 0.2522, "step": 4547 }, { "epoch": 2.4975288303130148, "grad_norm": 0.4884693918673925, "learning_rate": 6.90788207979628e-08, "loss": 0.2322, "step": 4548 }, { "epoch": 2.498077979132345, "grad_norm": 0.5531472933888497, "learning_rate": 6.893168684126583e-08, "loss": 0.2279, "step": 4549 }, { "epoch": 2.498627127951675, "grad_norm": 0.5063406033870022, "learning_rate": 6.878469837111653e-08, "loss": 0.2613, "step": 4550 }, { "epoch": 2.499176276771005, "grad_norm": 0.6110378821811832, "learning_rate": 6.863785543711825e-08, "loss": 0.2879, "step": 4551 }, { "epoch": 2.499725425590335, "grad_norm": 0.4579330647645518, "learning_rate": 6.849115808882504e-08, "loss": 0.2259, "step": 4552 }, { "epoch": 2.500274574409665, "grad_norm": 0.46921298050231774, "learning_rate": 6.834460637574168e-08, "loss": 0.2349, "step": 4553 }, { "epoch": 2.500823723228995, "grad_norm": 0.46107669884783736, "learning_rate": 6.819820034732414e-08, "loss": 0.2683, "step": 4554 }, { "epoch": 2.501372872048325, "grad_norm": 0.386687713141255, "learning_rate": 6.805194005297904e-08, "loss": 0.2367, "step": 4555 }, { "epoch": 2.501922020867655, "grad_norm": 0.4647654784828155, "learning_rate": 6.790582554206392e-08, "loss": 0.2556, "step": 4556 }, { "epoch": 2.5024711696869852, "grad_norm": 0.5675260963879627, "learning_rate": 6.775985686388675e-08, "loss": 0.2563, "step": 4557 }, { "epoch": 2.5030203185063153, "grad_norm": 0.47829580131106575, "learning_rate": 6.761403406770683e-08, "loss": 0.244, "step": 4558 }, { "epoch": 2.5035694673256454, "grad_norm": 0.5103853474552995, "learning_rate": 6.746835720273404e-08, "loss": 0.2347, "step": 4559 }, { "epoch": 2.504118616144975, "grad_norm": 0.5431709889759623, "learning_rate": 6.732282631812872e-08, "loss": 0.2478, "step": 4560 }, { "epoch": 2.504667764964305, "grad_norm": 0.4391728531090908, "learning_rate": 6.717744146300231e-08, "loss": 0.2269, "step": 4561 }, { "epoch": 2.5052169137836353, "grad_norm": 0.5637558717523288, "learning_rate": 6.703220268641694e-08, "loss": 0.2337, "step": 4562 }, { "epoch": 2.5057660626029654, "grad_norm": 0.4756145350254178, "learning_rate": 6.688711003738521e-08, "loss": 0.2665, "step": 4563 }, { "epoch": 2.5063152114222955, "grad_norm": 0.5487562631567913, "learning_rate": 6.674216356487053e-08, "loss": 0.2268, "step": 4564 }, { "epoch": 2.5068643602416256, "grad_norm": 0.6230405643413375, "learning_rate": 6.6597363317787e-08, "loss": 0.2716, "step": 4565 }, { "epoch": 2.5074135090609557, "grad_norm": 0.5576654051290235, "learning_rate": 6.645270934499952e-08, "loss": 0.2295, "step": 4566 }, { "epoch": 2.5079626578802854, "grad_norm": 0.572549915909029, "learning_rate": 6.630820169532339e-08, "loss": 0.2414, "step": 4567 }, { "epoch": 2.5085118066996155, "grad_norm": 0.5535643744964316, "learning_rate": 6.616384041752447e-08, "loss": 0.2632, "step": 4568 }, { "epoch": 2.5090609555189456, "grad_norm": 0.44145340255293003, "learning_rate": 6.601962556031963e-08, "loss": 0.2191, "step": 4569 }, { "epoch": 2.5096101043382757, "grad_norm": 0.43349912844193744, "learning_rate": 6.587555717237578e-08, "loss": 0.2384, "step": 4570 }, { "epoch": 2.510159253157606, "grad_norm": 0.42865721939646434, "learning_rate": 6.573163530231091e-08, "loss": 0.2324, "step": 4571 }, { "epoch": 2.510708401976936, "grad_norm": 0.5355449374415615, "learning_rate": 6.55878599986934e-08, "loss": 0.2199, "step": 4572 }, { "epoch": 2.511257550796266, "grad_norm": 0.4551018733552217, "learning_rate": 6.544423131004196e-08, "loss": 0.2192, "step": 4573 }, { "epoch": 2.5118066996155957, "grad_norm": 0.5076103463348527, "learning_rate": 6.530074928482596e-08, "loss": 0.2167, "step": 4574 }, { "epoch": 2.5123558484349258, "grad_norm": 0.48417642715467457, "learning_rate": 6.515741397146532e-08, "loss": 0.2504, "step": 4575 }, { "epoch": 2.512904997254256, "grad_norm": 0.456682360057578, "learning_rate": 6.501422541833066e-08, "loss": 0.2271, "step": 4576 }, { "epoch": 2.513454146073586, "grad_norm": 0.44263984504166337, "learning_rate": 6.487118367374251e-08, "loss": 0.2292, "step": 4577 }, { "epoch": 2.514003294892916, "grad_norm": 0.4579839038792415, "learning_rate": 6.47282887859724e-08, "loss": 0.253, "step": 4578 }, { "epoch": 2.514552443712246, "grad_norm": 0.4514081843179174, "learning_rate": 6.458554080324198e-08, "loss": 0.2496, "step": 4579 }, { "epoch": 2.5151015925315763, "grad_norm": 0.5035294651778898, "learning_rate": 6.444293977372356e-08, "loss": 0.2579, "step": 4580 }, { "epoch": 2.515650741350906, "grad_norm": 0.5353868280447703, "learning_rate": 6.430048574553959e-08, "loss": 0.2311, "step": 4581 }, { "epoch": 2.516199890170236, "grad_norm": 0.4951859746805475, "learning_rate": 6.415817876676311e-08, "loss": 0.2053, "step": 4582 }, { "epoch": 2.516749038989566, "grad_norm": 0.564127433527795, "learning_rate": 6.401601888541754e-08, "loss": 0.2079, "step": 4583 }, { "epoch": 2.5172981878088962, "grad_norm": 0.46829716616200456, "learning_rate": 6.387400614947661e-08, "loss": 0.2189, "step": 4584 }, { "epoch": 2.5178473366282264, "grad_norm": 0.48694046259194307, "learning_rate": 6.373214060686422e-08, "loss": 0.2331, "step": 4585 }, { "epoch": 2.518396485447556, "grad_norm": 0.49583766450853206, "learning_rate": 6.359042230545498e-08, "loss": 0.2606, "step": 4586 }, { "epoch": 2.5189456342668866, "grad_norm": 0.5135448151667028, "learning_rate": 6.344885129307338e-08, "loss": 0.2444, "step": 4587 }, { "epoch": 2.519494783086216, "grad_norm": 0.5133435337744062, "learning_rate": 6.330742761749455e-08, "loss": 0.2577, "step": 4588 }, { "epoch": 2.5200439319055463, "grad_norm": 0.5072700573587068, "learning_rate": 6.31661513264438e-08, "loss": 0.2866, "step": 4589 }, { "epoch": 2.5205930807248764, "grad_norm": 0.5292344438892093, "learning_rate": 6.302502246759667e-08, "loss": 0.2819, "step": 4590 }, { "epoch": 2.5211422295442065, "grad_norm": 0.6788155184018672, "learning_rate": 6.288404108857883e-08, "loss": 0.2721, "step": 4591 }, { "epoch": 2.5216913783635366, "grad_norm": 0.5735647635235057, "learning_rate": 6.274320723696639e-08, "loss": 0.2397, "step": 4592 }, { "epoch": 2.5222405271828663, "grad_norm": 0.5417002077065098, "learning_rate": 6.260252096028565e-08, "loss": 0.2361, "step": 4593 }, { "epoch": 2.522789676002197, "grad_norm": 0.5061394958599946, "learning_rate": 6.246198230601283e-08, "loss": 0.2136, "step": 4594 }, { "epoch": 2.5233388248215265, "grad_norm": 0.41952177618486725, "learning_rate": 6.232159132157487e-08, "loss": 0.2506, "step": 4595 }, { "epoch": 2.5238879736408566, "grad_norm": 0.4056545649175082, "learning_rate": 6.21813480543482e-08, "loss": 0.2027, "step": 4596 }, { "epoch": 2.5244371224601867, "grad_norm": 0.4372681392455932, "learning_rate": 6.20412525516601e-08, "loss": 0.2481, "step": 4597 }, { "epoch": 2.524986271279517, "grad_norm": 0.4966730329918373, "learning_rate": 6.190130486078729e-08, "loss": 0.2302, "step": 4598 }, { "epoch": 2.525535420098847, "grad_norm": 0.44463476211589104, "learning_rate": 6.176150502895718e-08, "loss": 0.2184, "step": 4599 }, { "epoch": 2.5260845689181766, "grad_norm": 0.4848012548769994, "learning_rate": 6.162185310334725e-08, "loss": 0.2305, "step": 4600 }, { "epoch": 2.5260845689181766, "eval_loss": 0.3213394284248352, "eval_runtime": 18.6927, "eval_samples_per_second": 23.699, "eval_steps_per_second": 1.016, "step": 4600 }, { "epoch": 2.526633717737507, "grad_norm": 0.5031891557484789, "learning_rate": 6.148234913108445e-08, "loss": 0.273, "step": 4601 }, { "epoch": 2.5271828665568368, "grad_norm": 0.5066358923837541, "learning_rate": 6.134299315924644e-08, "loss": 0.257, "step": 4602 }, { "epoch": 2.527732015376167, "grad_norm": 0.5450729969251152, "learning_rate": 6.120378523486075e-08, "loss": 0.2321, "step": 4603 }, { "epoch": 2.528281164195497, "grad_norm": 0.5417822891997048, "learning_rate": 6.106472540490486e-08, "loss": 0.2501, "step": 4604 }, { "epoch": 2.528830313014827, "grad_norm": 0.46206438555126045, "learning_rate": 6.092581371630639e-08, "loss": 0.2099, "step": 4605 }, { "epoch": 2.529379461834157, "grad_norm": 0.5361145957176399, "learning_rate": 6.078705021594296e-08, "loss": 0.2152, "step": 4606 }, { "epoch": 2.529928610653487, "grad_norm": 0.5258311917250434, "learning_rate": 6.064843495064211e-08, "loss": 0.2296, "step": 4607 }, { "epoch": 2.5304777594728174, "grad_norm": 0.6302793171276116, "learning_rate": 6.050996796718128e-08, "loss": 0.2523, "step": 4608 }, { "epoch": 2.531026908292147, "grad_norm": 0.5163400990954325, "learning_rate": 6.037164931228799e-08, "loss": 0.2511, "step": 4609 }, { "epoch": 2.531576057111477, "grad_norm": 0.46544326232452143, "learning_rate": 6.023347903263991e-08, "loss": 0.2833, "step": 4610 }, { "epoch": 2.5321252059308073, "grad_norm": 0.5379793448658092, "learning_rate": 6.009545717486425e-08, "loss": 0.2336, "step": 4611 }, { "epoch": 2.5326743547501374, "grad_norm": 0.59740990034428, "learning_rate": 5.995758378553819e-08, "loss": 0.2679, "step": 4612 }, { "epoch": 2.5332235035694675, "grad_norm": 0.47440094686162837, "learning_rate": 5.981985891118909e-08, "loss": 0.2265, "step": 4613 }, { "epoch": 2.533772652388797, "grad_norm": 0.5154349656381697, "learning_rate": 5.968228259829405e-08, "loss": 0.2215, "step": 4614 }, { "epoch": 2.5343218012081277, "grad_norm": 0.6560178788666248, "learning_rate": 5.9544854893279796e-08, "loss": 0.2166, "step": 4615 }, { "epoch": 2.5348709500274573, "grad_norm": 0.5391996347459551, "learning_rate": 5.940757584252331e-08, "loss": 0.2352, "step": 4616 }, { "epoch": 2.5354200988467874, "grad_norm": 0.5210384854916712, "learning_rate": 5.9270445492351334e-08, "loss": 0.2268, "step": 4617 }, { "epoch": 2.5359692476661175, "grad_norm": 0.42573991189010507, "learning_rate": 5.913346388903995e-08, "loss": 0.2274, "step": 4618 }, { "epoch": 2.5365183964854476, "grad_norm": 0.6428833072681643, "learning_rate": 5.899663107881555e-08, "loss": 0.2418, "step": 4619 }, { "epoch": 2.5370675453047777, "grad_norm": 0.5093942390967556, "learning_rate": 5.885994710785435e-08, "loss": 0.2504, "step": 4620 }, { "epoch": 2.5376166941241074, "grad_norm": 0.5011740978857232, "learning_rate": 5.87234120222819e-08, "loss": 0.2806, "step": 4621 }, { "epoch": 2.538165842943438, "grad_norm": 0.47394706730958225, "learning_rate": 5.858702586817389e-08, "loss": 0.2634, "step": 4622 }, { "epoch": 2.5387149917627676, "grad_norm": 0.5010001792865919, "learning_rate": 5.8450788691555573e-08, "loss": 0.2732, "step": 4623 }, { "epoch": 2.5392641405820977, "grad_norm": 0.6443936850229762, "learning_rate": 5.831470053840211e-08, "loss": 0.251, "step": 4624 }, { "epoch": 2.539813289401428, "grad_norm": 0.4062283201118275, "learning_rate": 5.8178761454637974e-08, "loss": 0.2398, "step": 4625 }, { "epoch": 2.540362438220758, "grad_norm": 0.4725602560706316, "learning_rate": 5.80429714861378e-08, "loss": 0.2129, "step": 4626 }, { "epoch": 2.540911587040088, "grad_norm": 0.548604268347584, "learning_rate": 5.790733067872573e-08, "loss": 0.2901, "step": 4627 }, { "epoch": 2.5414607358594177, "grad_norm": 0.645901346883235, "learning_rate": 5.777183907817548e-08, "loss": 0.2554, "step": 4628 }, { "epoch": 2.5420098846787478, "grad_norm": 0.4765275483505522, "learning_rate": 5.7636496730210375e-08, "loss": 0.2305, "step": 4629 }, { "epoch": 2.542559033498078, "grad_norm": 0.4588742452096127, "learning_rate": 5.750130368050355e-08, "loss": 0.2453, "step": 4630 }, { "epoch": 2.543108182317408, "grad_norm": 0.556414516005863, "learning_rate": 5.736625997467776e-08, "loss": 0.2749, "step": 4631 }, { "epoch": 2.543657331136738, "grad_norm": 0.5022395623924697, "learning_rate": 5.723136565830512e-08, "loss": 0.2529, "step": 4632 }, { "epoch": 2.544206479956068, "grad_norm": 0.5592360813869147, "learning_rate": 5.709662077690764e-08, "loss": 0.2516, "step": 4633 }, { "epoch": 2.5447556287753983, "grad_norm": 0.5079009995072159, "learning_rate": 5.696202537595674e-08, "loss": 0.2762, "step": 4634 }, { "epoch": 2.545304777594728, "grad_norm": 0.4681104189612712, "learning_rate": 5.6827579500873206e-08, "loss": 0.2282, "step": 4635 }, { "epoch": 2.545853926414058, "grad_norm": 0.4630116772608012, "learning_rate": 5.669328319702776e-08, "loss": 0.2606, "step": 4636 }, { "epoch": 2.546403075233388, "grad_norm": 0.4641448645647968, "learning_rate": 5.6559136509740504e-08, "loss": 0.2422, "step": 4637 }, { "epoch": 2.5469522240527183, "grad_norm": 0.4312021971699116, "learning_rate": 5.642513948428081e-08, "loss": 0.2585, "step": 4638 }, { "epoch": 2.5475013728720484, "grad_norm": 0.5139453143354953, "learning_rate": 5.629129216586796e-08, "loss": 0.2611, "step": 4639 }, { "epoch": 2.5480505216913785, "grad_norm": 0.5612319951795401, "learning_rate": 5.6157594599670265e-08, "loss": 0.2578, "step": 4640 }, { "epoch": 2.5485996705107086, "grad_norm": 0.48901502797507146, "learning_rate": 5.6024046830805954e-08, "loss": 0.2813, "step": 4641 }, { "epoch": 2.5491488193300382, "grad_norm": 0.4744618146143959, "learning_rate": 5.589064890434224e-08, "loss": 0.2327, "step": 4642 }, { "epoch": 2.5496979681493683, "grad_norm": 0.39310686310390536, "learning_rate": 5.575740086529616e-08, "loss": 0.2264, "step": 4643 }, { "epoch": 2.5502471169686984, "grad_norm": 0.5184786477437552, "learning_rate": 5.562430275863407e-08, "loss": 0.2336, "step": 4644 }, { "epoch": 2.5507962657880285, "grad_norm": 0.5133046584505918, "learning_rate": 5.5491354629271554e-08, "loss": 0.2698, "step": 4645 }, { "epoch": 2.5513454146073586, "grad_norm": 0.6326811499528321, "learning_rate": 5.535855652207374e-08, "loss": 0.2969, "step": 4646 }, { "epoch": 2.5518945634266887, "grad_norm": 0.540072706259425, "learning_rate": 5.5225908481855065e-08, "loss": 0.2561, "step": 4647 }, { "epoch": 2.552443712246019, "grad_norm": 0.6264760229681093, "learning_rate": 5.5093410553379516e-08, "loss": 0.2761, "step": 4648 }, { "epoch": 2.5529928610653485, "grad_norm": 0.5268250905941013, "learning_rate": 5.496106278136001e-08, "loss": 0.2334, "step": 4649 }, { "epoch": 2.5535420098846786, "grad_norm": 0.484754195266986, "learning_rate": 5.4828865210459326e-08, "loss": 0.2151, "step": 4650 }, { "epoch": 2.5540911587040087, "grad_norm": 0.4467375174246866, "learning_rate": 5.469681788528906e-08, "loss": 0.2266, "step": 4651 }, { "epoch": 2.554640307523339, "grad_norm": 0.48980020237373173, "learning_rate": 5.456492085041034e-08, "loss": 0.2452, "step": 4652 }, { "epoch": 2.555189456342669, "grad_norm": 0.5705675482473022, "learning_rate": 5.44331741503336e-08, "loss": 0.2252, "step": 4653 }, { "epoch": 2.555738605161999, "grad_norm": 0.43588192716967844, "learning_rate": 5.4301577829518615e-08, "loss": 0.2514, "step": 4654 }, { "epoch": 2.556287753981329, "grad_norm": 0.5239870541830424, "learning_rate": 5.417013193237414e-08, "loss": 0.2655, "step": 4655 }, { "epoch": 2.556836902800659, "grad_norm": 0.5352064399550495, "learning_rate": 5.4038836503258285e-08, "loss": 0.2694, "step": 4656 }, { "epoch": 2.557386051619989, "grad_norm": 0.5399455872232076, "learning_rate": 5.3907691586478485e-08, "loss": 0.2137, "step": 4657 }, { "epoch": 2.557935200439319, "grad_norm": 0.4376944943348027, "learning_rate": 5.3776697226291435e-08, "loss": 0.2221, "step": 4658 }, { "epoch": 2.558484349258649, "grad_norm": 0.4495307972598976, "learning_rate": 5.364585346690275e-08, "loss": 0.2598, "step": 4659 }, { "epoch": 2.559033498077979, "grad_norm": 0.5656563703152465, "learning_rate": 5.3515160352467354e-08, "loss": 0.2271, "step": 4660 }, { "epoch": 2.5595826468973093, "grad_norm": 0.42037946863955317, "learning_rate": 5.338461792708956e-08, "loss": 0.219, "step": 4661 }, { "epoch": 2.5601317957166394, "grad_norm": 0.4156499751036329, "learning_rate": 5.325422623482255e-08, "loss": 0.2247, "step": 4662 }, { "epoch": 2.560680944535969, "grad_norm": 0.48706999344370866, "learning_rate": 5.31239853196685e-08, "loss": 0.2364, "step": 4663 }, { "epoch": 2.561230093355299, "grad_norm": 0.5788069674855978, "learning_rate": 5.299389522557912e-08, "loss": 0.2436, "step": 4664 }, { "epoch": 2.5617792421746293, "grad_norm": 0.5347028951822241, "learning_rate": 5.2863955996455136e-08, "loss": 0.225, "step": 4665 }, { "epoch": 2.5623283909939594, "grad_norm": 0.6792519261978894, "learning_rate": 5.2734167676146027e-08, "loss": 0.2513, "step": 4666 }, { "epoch": 2.5628775398132895, "grad_norm": 0.46016879879885514, "learning_rate": 5.260453030845064e-08, "loss": 0.2385, "step": 4667 }, { "epoch": 2.5634266886326196, "grad_norm": 0.5288548738347421, "learning_rate": 5.247504393711682e-08, "loss": 0.2986, "step": 4668 }, { "epoch": 2.5639758374519497, "grad_norm": 0.48035546757544517, "learning_rate": 5.234570860584144e-08, "loss": 0.2351, "step": 4669 }, { "epoch": 2.5645249862712793, "grad_norm": 0.650647669868789, "learning_rate": 5.2216524358270344e-08, "loss": 0.257, "step": 4670 }, { "epoch": 2.5650741350906094, "grad_norm": 0.49146587420418614, "learning_rate": 5.208749123799865e-08, "loss": 0.2471, "step": 4671 }, { "epoch": 2.5656232839099395, "grad_norm": 0.4767436034446234, "learning_rate": 5.195860928857022e-08, "loss": 0.2227, "step": 4672 }, { "epoch": 2.5661724327292696, "grad_norm": 0.4915045283337026, "learning_rate": 5.182987855347775e-08, "loss": 0.233, "step": 4673 }, { "epoch": 2.5667215815485998, "grad_norm": 0.45810460134085823, "learning_rate": 5.1701299076163244e-08, "loss": 0.2387, "step": 4674 }, { "epoch": 2.56727073036793, "grad_norm": 0.423622698497717, "learning_rate": 5.157287090001777e-08, "loss": 0.2157, "step": 4675 }, { "epoch": 2.56781987918726, "grad_norm": 0.7069934117936212, "learning_rate": 5.144459406838069e-08, "loss": 0.2594, "step": 4676 }, { "epoch": 2.5683690280065896, "grad_norm": 0.5397225037934974, "learning_rate": 5.131646862454112e-08, "loss": 0.2596, "step": 4677 }, { "epoch": 2.5689181768259197, "grad_norm": 0.45142993986312696, "learning_rate": 5.1188494611736313e-08, "loss": 0.2111, "step": 4678 }, { "epoch": 2.56946732564525, "grad_norm": 0.4386055649080123, "learning_rate": 5.106067207315311e-08, "loss": 0.2494, "step": 4679 }, { "epoch": 2.57001647446458, "grad_norm": 0.5212506057678268, "learning_rate": 5.09330010519266e-08, "loss": 0.2512, "step": 4680 }, { "epoch": 2.57056562328391, "grad_norm": 0.5749257055246332, "learning_rate": 5.080548159114125e-08, "loss": 0.24, "step": 4681 }, { "epoch": 2.57111477210324, "grad_norm": 0.5165471932835463, "learning_rate": 5.0678113733830195e-08, "loss": 0.2049, "step": 4682 }, { "epoch": 2.5716639209225702, "grad_norm": 0.6501241332802173, "learning_rate": 5.0550897522975344e-08, "loss": 0.2429, "step": 4683 }, { "epoch": 2.5722130697419, "grad_norm": 0.5005552885572347, "learning_rate": 5.04238330015074e-08, "loss": 0.2196, "step": 4684 }, { "epoch": 2.57276221856123, "grad_norm": 0.568246674109851, "learning_rate": 5.029692021230605e-08, "loss": 0.2459, "step": 4685 }, { "epoch": 2.57331136738056, "grad_norm": 0.4251384491598092, "learning_rate": 5.017015919819963e-08, "loss": 0.2445, "step": 4686 }, { "epoch": 2.57386051619989, "grad_norm": 0.5083118640870605, "learning_rate": 5.0043550001965305e-08, "loss": 0.2054, "step": 4687 }, { "epoch": 2.5744096650192203, "grad_norm": 0.5184652791660596, "learning_rate": 4.991709266632919e-08, "loss": 0.2327, "step": 4688 }, { "epoch": 2.5749588138385504, "grad_norm": 0.6602559013259608, "learning_rate": 4.979078723396576e-08, "loss": 0.2591, "step": 4689 }, { "epoch": 2.5755079626578805, "grad_norm": 0.4260702856673771, "learning_rate": 4.966463374749848e-08, "loss": 0.2385, "step": 4690 }, { "epoch": 2.57605711147721, "grad_norm": 0.4855338421035337, "learning_rate": 4.953863224949954e-08, "loss": 0.2511, "step": 4691 }, { "epoch": 2.5766062602965403, "grad_norm": 0.5094669445022787, "learning_rate": 4.9412782782489954e-08, "loss": 0.2434, "step": 4692 }, { "epoch": 2.5771554091158704, "grad_norm": 0.4382320243575517, "learning_rate": 4.9287085388938994e-08, "loss": 0.2498, "step": 4693 }, { "epoch": 2.5777045579352005, "grad_norm": 0.7268016991155437, "learning_rate": 4.916154011126514e-08, "loss": 0.2322, "step": 4694 }, { "epoch": 2.5782537067545306, "grad_norm": 0.36841152629993473, "learning_rate": 4.9036146991835066e-08, "loss": 0.232, "step": 4695 }, { "epoch": 2.5788028555738602, "grad_norm": 0.4556278474869672, "learning_rate": 4.8910906072964606e-08, "loss": 0.2111, "step": 4696 }, { "epoch": 2.579352004393191, "grad_norm": 0.3859393500165948, "learning_rate": 4.8785817396917735e-08, "loss": 0.2254, "step": 4697 }, { "epoch": 2.5799011532125204, "grad_norm": 0.5499506639583974, "learning_rate": 4.8660881005907347e-08, "loss": 0.2276, "step": 4698 }, { "epoch": 2.5804503020318506, "grad_norm": 0.4730504136293765, "learning_rate": 4.8536096942095054e-08, "loss": 0.2353, "step": 4699 }, { "epoch": 2.5809994508511807, "grad_norm": 0.4406221867805422, "learning_rate": 4.8411465247590505e-08, "loss": 0.2538, "step": 4700 }, { "epoch": 2.5815485996705108, "grad_norm": 0.5484447179927151, "learning_rate": 4.828698596445252e-08, "loss": 0.2506, "step": 4701 }, { "epoch": 2.582097748489841, "grad_norm": 0.46067474930657737, "learning_rate": 4.816265913468834e-08, "loss": 0.2573, "step": 4702 }, { "epoch": 2.5826468973091705, "grad_norm": 0.6997940313212557, "learning_rate": 4.803848480025355e-08, "loss": 0.2418, "step": 4703 }, { "epoch": 2.583196046128501, "grad_norm": 0.5189259824433953, "learning_rate": 4.7914463003052436e-08, "loss": 0.2464, "step": 4704 }, { "epoch": 2.5837451949478307, "grad_norm": 0.4341819655441293, "learning_rate": 4.7790593784937875e-08, "loss": 0.2612, "step": 4705 }, { "epoch": 2.584294343767161, "grad_norm": 0.4773838832455015, "learning_rate": 4.766687718771114e-08, "loss": 0.2646, "step": 4706 }, { "epoch": 2.584843492586491, "grad_norm": 0.5527088749553231, "learning_rate": 4.754331325312193e-08, "loss": 0.2632, "step": 4707 }, { "epoch": 2.585392641405821, "grad_norm": 0.4594578033926344, "learning_rate": 4.741990202286855e-08, "loss": 0.2148, "step": 4708 }, { "epoch": 2.585941790225151, "grad_norm": 0.5046396883730685, "learning_rate": 4.729664353859786e-08, "loss": 0.2456, "step": 4709 }, { "epoch": 2.586490939044481, "grad_norm": 0.4518190799723593, "learning_rate": 4.7173537841904974e-08, "loss": 0.2155, "step": 4710 }, { "epoch": 2.5870400878638113, "grad_norm": 0.4370217874947079, "learning_rate": 4.7050584974333445e-08, "loss": 0.2427, "step": 4711 }, { "epoch": 2.587589236683141, "grad_norm": 0.4905549012073179, "learning_rate": 4.692778497737542e-08, "loss": 0.2397, "step": 4712 }, { "epoch": 2.588138385502471, "grad_norm": 0.5202777342920942, "learning_rate": 4.6805137892471515e-08, "loss": 0.2467, "step": 4713 }, { "epoch": 2.588687534321801, "grad_norm": 0.46490505046190356, "learning_rate": 4.6682643761010297e-08, "loss": 0.2185, "step": 4714 }, { "epoch": 2.5892366831411313, "grad_norm": 0.4497853482569377, "learning_rate": 4.656030262432923e-08, "loss": 0.264, "step": 4715 }, { "epoch": 2.5897858319604614, "grad_norm": 0.6418503318297589, "learning_rate": 4.6438114523714044e-08, "loss": 0.284, "step": 4716 }, { "epoch": 2.590334980779791, "grad_norm": 0.649968508941332, "learning_rate": 4.631607950039841e-08, "loss": 0.2906, "step": 4717 }, { "epoch": 2.5908841295991216, "grad_norm": 0.7254143065533668, "learning_rate": 4.619419759556482e-08, "loss": 0.3622, "step": 4718 }, { "epoch": 2.5914332784184513, "grad_norm": 0.5590200146163397, "learning_rate": 4.607246885034403e-08, "loss": 0.252, "step": 4719 }, { "epoch": 2.5919824272377814, "grad_norm": 0.5707436091864629, "learning_rate": 4.59508933058148e-08, "loss": 0.2312, "step": 4720 }, { "epoch": 2.5925315760571115, "grad_norm": 0.4954821586055, "learning_rate": 4.5829471003004586e-08, "loss": 0.2565, "step": 4721 }, { "epoch": 2.5930807248764416, "grad_norm": 0.45626611841852777, "learning_rate": 4.570820198288873e-08, "loss": 0.2607, "step": 4722 }, { "epoch": 2.5936298736957717, "grad_norm": 0.6896882682186795, "learning_rate": 4.5587086286391287e-08, "loss": 0.2407, "step": 4723 }, { "epoch": 2.5941790225151014, "grad_norm": 0.5087293344070736, "learning_rate": 4.546612395438416e-08, "loss": 0.218, "step": 4724 }, { "epoch": 2.594728171334432, "grad_norm": 0.4757263257198551, "learning_rate": 4.53453150276878e-08, "loss": 0.2718, "step": 4725 }, { "epoch": 2.5952773201537616, "grad_norm": 0.4343287030943466, "learning_rate": 4.5224659547070764e-08, "loss": 0.2257, "step": 4726 }, { "epoch": 2.5958264689730917, "grad_norm": 0.5822430903768527, "learning_rate": 4.510415755324978e-08, "loss": 0.2655, "step": 4727 }, { "epoch": 2.5963756177924218, "grad_norm": 0.494715238234534, "learning_rate": 4.498380908688981e-08, "loss": 0.2276, "step": 4728 }, { "epoch": 2.596924766611752, "grad_norm": 0.4957113386494103, "learning_rate": 4.486361418860402e-08, "loss": 0.2247, "step": 4729 }, { "epoch": 2.597473915431082, "grad_norm": 0.47788379642563933, "learning_rate": 4.474357289895391e-08, "loss": 0.2196, "step": 4730 }, { "epoch": 2.5980230642504116, "grad_norm": 0.5128436610885865, "learning_rate": 4.46236852584488e-08, "loss": 0.2266, "step": 4731 }, { "epoch": 2.598572213069742, "grad_norm": 0.5149207240709779, "learning_rate": 4.45039513075465e-08, "loss": 0.2008, "step": 4732 }, { "epoch": 2.599121361889072, "grad_norm": 0.49792792860810636, "learning_rate": 4.4384371086652805e-08, "loss": 0.2163, "step": 4733 }, { "epoch": 2.599670510708402, "grad_norm": 0.5099988112118381, "learning_rate": 4.42649446361214e-08, "loss": 0.2804, "step": 4734 }, { "epoch": 2.600219659527732, "grad_norm": 0.5140093471043612, "learning_rate": 4.414567199625458e-08, "loss": 0.2403, "step": 4735 }, { "epoch": 2.600768808347062, "grad_norm": 0.4783320401528292, "learning_rate": 4.402655320730243e-08, "loss": 0.2408, "step": 4736 }, { "epoch": 2.6013179571663922, "grad_norm": 0.5582262996127252, "learning_rate": 4.3907588309462944e-08, "loss": 0.2571, "step": 4737 }, { "epoch": 2.601867105985722, "grad_norm": 0.43664375354722884, "learning_rate": 4.378877734288272e-08, "loss": 0.2281, "step": 4738 }, { "epoch": 2.602416254805052, "grad_norm": 0.43676963304185085, "learning_rate": 4.367012034765573e-08, "loss": 0.2338, "step": 4739 }, { "epoch": 2.602965403624382, "grad_norm": 0.3930096124133771, "learning_rate": 4.355161736382471e-08, "loss": 0.2659, "step": 4740 }, { "epoch": 2.603514552443712, "grad_norm": 0.48572630598438443, "learning_rate": 4.343326843137966e-08, "loss": 0.2497, "step": 4741 }, { "epoch": 2.6040637012630423, "grad_norm": 0.5787132558029076, "learning_rate": 4.3315073590259265e-08, "loss": 0.2879, "step": 4742 }, { "epoch": 2.6046128500823724, "grad_norm": 0.4619929764939395, "learning_rate": 4.3197032880349886e-08, "loss": 0.2735, "step": 4743 }, { "epoch": 2.6051619989017025, "grad_norm": 0.6160254005259093, "learning_rate": 4.3079146341485904e-08, "loss": 0.2964, "step": 4744 }, { "epoch": 2.605711147721032, "grad_norm": 0.43864630226171125, "learning_rate": 4.2961414013449516e-08, "loss": 0.2438, "step": 4745 }, { "epoch": 2.6062602965403623, "grad_norm": 0.4938834227245852, "learning_rate": 4.284383593597123e-08, "loss": 0.2499, "step": 4746 }, { "epoch": 2.6068094453596924, "grad_norm": 0.47847280336475584, "learning_rate": 4.2726412148729344e-08, "loss": 0.2195, "step": 4747 }, { "epoch": 2.6073585941790225, "grad_norm": 0.5337392924298261, "learning_rate": 4.2609142691349867e-08, "loss": 0.2505, "step": 4748 }, { "epoch": 2.6079077429983526, "grad_norm": 0.4751169068365289, "learning_rate": 4.249202760340717e-08, "loss": 0.2691, "step": 4749 }, { "epoch": 2.6084568918176827, "grad_norm": 0.4896171720270328, "learning_rate": 4.237506692442308e-08, "loss": 0.2549, "step": 4750 }, { "epoch": 2.609006040637013, "grad_norm": 0.42036685652941597, "learning_rate": 4.225826069386756e-08, "loss": 0.2641, "step": 4751 }, { "epoch": 2.6095551894563425, "grad_norm": 0.5197911504008706, "learning_rate": 4.2141608951158385e-08, "loss": 0.2571, "step": 4752 }, { "epoch": 2.6101043382756726, "grad_norm": 0.4530523762933118, "learning_rate": 4.2025111735661376e-08, "loss": 0.2398, "step": 4753 }, { "epoch": 2.6106534870950027, "grad_norm": 0.5978128843061661, "learning_rate": 4.1908769086689935e-08, "loss": 0.2974, "step": 4754 }, { "epoch": 2.6112026359143328, "grad_norm": 0.5174108726268848, "learning_rate": 4.1792581043505403e-08, "loss": 0.2148, "step": 4755 }, { "epoch": 2.611751784733663, "grad_norm": 0.4386146138526397, "learning_rate": 4.167654764531692e-08, "loss": 0.2327, "step": 4756 }, { "epoch": 2.612300933552993, "grad_norm": 0.5184461066340873, "learning_rate": 4.156066893128165e-08, "loss": 0.255, "step": 4757 }, { "epoch": 2.612850082372323, "grad_norm": 0.43807923241645813, "learning_rate": 4.144494494050422e-08, "loss": 0.2169, "step": 4758 }, { "epoch": 2.6133992311916527, "grad_norm": 0.4776469085295862, "learning_rate": 4.132937571203732e-08, "loss": 0.2354, "step": 4759 }, { "epoch": 2.613948380010983, "grad_norm": 0.3983504999517481, "learning_rate": 4.121396128488129e-08, "loss": 0.2149, "step": 4760 }, { "epoch": 2.614497528830313, "grad_norm": 0.47691603814494976, "learning_rate": 4.1098701697984256e-08, "loss": 0.2291, "step": 4761 }, { "epoch": 2.615046677649643, "grad_norm": 0.8825558310205681, "learning_rate": 4.098359699024197e-08, "loss": 0.2106, "step": 4762 }, { "epoch": 2.615595826468973, "grad_norm": 0.5530698801960924, "learning_rate": 4.0868647200498155e-08, "loss": 0.2109, "step": 4763 }, { "epoch": 2.6161449752883033, "grad_norm": 0.43491063384156153, "learning_rate": 4.075385236754416e-08, "loss": 0.2567, "step": 4764 }, { "epoch": 2.6166941241076334, "grad_norm": 0.46522762582385974, "learning_rate": 4.063921253011891e-08, "loss": 0.2291, "step": 4765 }, { "epoch": 2.617243272926963, "grad_norm": 0.41670362278975864, "learning_rate": 4.0524727726909093e-08, "loss": 0.2355, "step": 4766 }, { "epoch": 2.617792421746293, "grad_norm": 0.6314867618161334, "learning_rate": 4.041039799654926e-08, "loss": 0.2786, "step": 4767 }, { "epoch": 2.618341570565623, "grad_norm": 0.4983037001286197, "learning_rate": 4.029622337762135e-08, "loss": 0.2376, "step": 4768 }, { "epoch": 2.6188907193849533, "grad_norm": 0.4913778373908841, "learning_rate": 4.0182203908655134e-08, "loss": 0.2642, "step": 4769 }, { "epoch": 2.6194398682042834, "grad_norm": 0.49607649863674436, "learning_rate": 4.0068339628128046e-08, "loss": 0.2653, "step": 4770 }, { "epoch": 2.6199890170236135, "grad_norm": 0.5196374994707973, "learning_rate": 3.9954630574465054e-08, "loss": 0.2773, "step": 4771 }, { "epoch": 2.6205381658429436, "grad_norm": 0.5866207636399032, "learning_rate": 3.984107678603867e-08, "loss": 0.2545, "step": 4772 }, { "epoch": 2.6210873146622733, "grad_norm": 0.5520044519370948, "learning_rate": 3.9727678301169195e-08, "loss": 0.1979, "step": 4773 }, { "epoch": 2.6216364634816034, "grad_norm": 0.6095799394588446, "learning_rate": 3.961443515812452e-08, "loss": 0.2776, "step": 4774 }, { "epoch": 2.6221856123009335, "grad_norm": 0.5071296505031105, "learning_rate": 3.9501347395119845e-08, "loss": 0.261, "step": 4775 }, { "epoch": 2.6227347611202636, "grad_norm": 0.4315683716276093, "learning_rate": 3.938841505031834e-08, "loss": 0.2052, "step": 4776 }, { "epoch": 2.6232839099395937, "grad_norm": 0.4441442619975704, "learning_rate": 3.927563816183032e-08, "loss": 0.2209, "step": 4777 }, { "epoch": 2.623833058758924, "grad_norm": 0.47724451620799896, "learning_rate": 3.916301676771402e-08, "loss": 0.222, "step": 4778 }, { "epoch": 2.624382207578254, "grad_norm": 0.5163301849893983, "learning_rate": 3.905055090597479e-08, "loss": 0.2557, "step": 4779 }, { "epoch": 2.6249313563975836, "grad_norm": 0.48395737176672, "learning_rate": 3.8938240614565865e-08, "loss": 0.2508, "step": 4780 }, { "epoch": 2.6254805052169137, "grad_norm": 0.6187439087642276, "learning_rate": 3.882608593138787e-08, "loss": 0.2993, "step": 4781 }, { "epoch": 2.6260296540362438, "grad_norm": 0.48989836631521816, "learning_rate": 3.8714086894288776e-08, "loss": 0.2502, "step": 4782 }, { "epoch": 2.626578802855574, "grad_norm": 0.5692960320177147, "learning_rate": 3.860224354106408e-08, "loss": 0.2318, "step": 4783 }, { "epoch": 2.627127951674904, "grad_norm": 0.5825133502871823, "learning_rate": 3.8490555909456963e-08, "loss": 0.2688, "step": 4784 }, { "epoch": 2.627677100494234, "grad_norm": 0.6675933587545213, "learning_rate": 3.8379024037157744e-08, "loss": 0.248, "step": 4785 }, { "epoch": 2.628226249313564, "grad_norm": 0.4833019451073019, "learning_rate": 3.8267647961804316e-08, "loss": 0.2178, "step": 4786 }, { "epoch": 2.628775398132894, "grad_norm": 0.48857225437207225, "learning_rate": 3.81564277209822e-08, "loss": 0.2697, "step": 4787 }, { "epoch": 2.629324546952224, "grad_norm": 0.43774263443358197, "learning_rate": 3.804536335222398e-08, "loss": 0.2222, "step": 4788 }, { "epoch": 2.629873695771554, "grad_norm": 0.5054812119234828, "learning_rate": 3.7934454893009723e-08, "loss": 0.2537, "step": 4789 }, { "epoch": 2.630422844590884, "grad_norm": 0.41473094609152666, "learning_rate": 3.782370238076696e-08, "loss": 0.2233, "step": 4790 }, { "epoch": 2.6309719934102143, "grad_norm": 0.5206399353226098, "learning_rate": 3.771310585287077e-08, "loss": 0.2208, "step": 4791 }, { "epoch": 2.6315211422295444, "grad_norm": 0.694570190374752, "learning_rate": 3.7602665346643236e-08, "loss": 0.2797, "step": 4792 }, { "epoch": 2.6320702910488745, "grad_norm": 0.5165322160565938, "learning_rate": 3.749238089935403e-08, "loss": 0.2581, "step": 4793 }, { "epoch": 2.632619439868204, "grad_norm": 0.5109998318786616, "learning_rate": 3.738225254822e-08, "loss": 0.223, "step": 4794 }, { "epoch": 2.6331685886875342, "grad_norm": 1.0135762538631525, "learning_rate": 3.7272280330405584e-08, "loss": 0.4277, "step": 4795 }, { "epoch": 2.6337177375068643, "grad_norm": 0.5786264229954179, "learning_rate": 3.716246428302215e-08, "loss": 0.2366, "step": 4796 }, { "epoch": 2.6342668863261944, "grad_norm": 0.5173131435774923, "learning_rate": 3.705280444312863e-08, "loss": 0.2482, "step": 4797 }, { "epoch": 2.6348160351455245, "grad_norm": 0.5550825757669052, "learning_rate": 3.6943300847731443e-08, "loss": 0.2704, "step": 4798 }, { "epoch": 2.6353651839648546, "grad_norm": 0.4867997213383429, "learning_rate": 3.6833953533783546e-08, "loss": 0.2413, "step": 4799 }, { "epoch": 2.6359143327841847, "grad_norm": 0.4490853152675713, "learning_rate": 3.6724762538185905e-08, "loss": 0.2229, "step": 4800 }, { "epoch": 2.6359143327841847, "eval_loss": 0.32113179564476013, "eval_runtime": 21.4684, "eval_samples_per_second": 20.635, "eval_steps_per_second": 0.885, "step": 4800 }, { "epoch": 2.6364634816035144, "grad_norm": 0.5456981746640015, "learning_rate": 3.6615727897786504e-08, "loss": 0.2099, "step": 4801 }, { "epoch": 2.6370126304228445, "grad_norm": 0.46428354700875457, "learning_rate": 3.650684964938036e-08, "loss": 0.2438, "step": 4802 }, { "epoch": 2.6375617792421746, "grad_norm": 1.2796978972485438, "learning_rate": 3.639812782970995e-08, "loss": 0.2926, "step": 4803 }, { "epoch": 2.6381109280615047, "grad_norm": 0.4842927387322822, "learning_rate": 3.628956247546497e-08, "loss": 0.2348, "step": 4804 }, { "epoch": 2.638660076880835, "grad_norm": 0.48045938593014503, "learning_rate": 3.6181153623282124e-08, "loss": 0.2403, "step": 4805 }, { "epoch": 2.6392092257001645, "grad_norm": 0.41626390713212097, "learning_rate": 3.607290130974532e-08, "loss": 0.2301, "step": 4806 }, { "epoch": 2.639758374519495, "grad_norm": 0.48325979992824303, "learning_rate": 3.5964805571385886e-08, "loss": 0.2259, "step": 4807 }, { "epoch": 2.6403075233388247, "grad_norm": 0.4874169459482029, "learning_rate": 3.585686644468212e-08, "loss": 0.2245, "step": 4808 }, { "epoch": 2.640856672158155, "grad_norm": 0.5433786319980606, "learning_rate": 3.574908396605945e-08, "loss": 0.2628, "step": 4809 }, { "epoch": 2.641405820977485, "grad_norm": 0.4962202430289521, "learning_rate": 3.564145817189048e-08, "loss": 0.2429, "step": 4810 }, { "epoch": 2.641954969796815, "grad_norm": 0.5702342301314084, "learning_rate": 3.553398909849496e-08, "loss": 0.2413, "step": 4811 }, { "epoch": 2.642504118616145, "grad_norm": 0.46093850208134857, "learning_rate": 3.5426676782139867e-08, "loss": 0.223, "step": 4812 }, { "epoch": 2.6430532674354748, "grad_norm": 0.6184806961237568, "learning_rate": 3.5319521259038945e-08, "loss": 0.2505, "step": 4813 }, { "epoch": 2.6436024162548053, "grad_norm": 0.4372682975351609, "learning_rate": 3.5212522565353364e-08, "loss": 0.2279, "step": 4814 }, { "epoch": 2.644151565074135, "grad_norm": 0.504706124104358, "learning_rate": 3.510568073719135e-08, "loss": 0.2451, "step": 4815 }, { "epoch": 2.644700713893465, "grad_norm": 0.47670269759905914, "learning_rate": 3.4998995810607834e-08, "loss": 0.2259, "step": 4816 }, { "epoch": 2.645249862712795, "grad_norm": 0.4785142498105993, "learning_rate": 3.4892467821605156e-08, "loss": 0.2458, "step": 4817 }, { "epoch": 2.6457990115321253, "grad_norm": 0.47663264737018324, "learning_rate": 3.478609680613266e-08, "loss": 0.2237, "step": 4818 }, { "epoch": 2.6463481603514554, "grad_norm": 0.4318924356824676, "learning_rate": 3.4679882800086555e-08, "loss": 0.2414, "step": 4819 }, { "epoch": 2.646897309170785, "grad_norm": 0.6796083376274291, "learning_rate": 3.4573825839310193e-08, "loss": 0.2208, "step": 4820 }, { "epoch": 2.6474464579901156, "grad_norm": 0.5073959478250678, "learning_rate": 3.446792595959385e-08, "loss": 0.2268, "step": 4821 }, { "epoch": 2.6479956068094452, "grad_norm": 0.46543262243015693, "learning_rate": 3.436218319667493e-08, "loss": 0.2228, "step": 4822 }, { "epoch": 2.6485447556287753, "grad_norm": 0.7280188301601501, "learning_rate": 3.425659758623759e-08, "loss": 0.2425, "step": 4823 }, { "epoch": 2.6490939044481054, "grad_norm": 0.41028964483969926, "learning_rate": 3.4151169163913154e-08, "loss": 0.23, "step": 4824 }, { "epoch": 2.6496430532674355, "grad_norm": 0.4559522883297716, "learning_rate": 3.404589796527993e-08, "loss": 0.2095, "step": 4825 }, { "epoch": 2.6501922020867656, "grad_norm": 0.5478090947613791, "learning_rate": 3.3940784025862943e-08, "loss": 0.2021, "step": 4826 }, { "epoch": 2.6507413509060953, "grad_norm": 0.501476545180347, "learning_rate": 3.383582738113422e-08, "loss": 0.2649, "step": 4827 }, { "epoch": 2.651290499725426, "grad_norm": 0.5006936297336, "learning_rate": 3.373102806651282e-08, "loss": 0.2482, "step": 4828 }, { "epoch": 2.6518396485447555, "grad_norm": 0.4403504834932586, "learning_rate": 3.3626386117364756e-08, "loss": 0.2193, "step": 4829 }, { "epoch": 2.6523887973640856, "grad_norm": 0.49348525094876605, "learning_rate": 3.3521901569002686e-08, "loss": 0.24, "step": 4830 }, { "epoch": 2.6529379461834157, "grad_norm": 0.7051309449637, "learning_rate": 3.341757445668642e-08, "loss": 0.2546, "step": 4831 }, { "epoch": 2.653487095002746, "grad_norm": 0.44109932531266094, "learning_rate": 3.331340481562246e-08, "loss": 0.2193, "step": 4832 }, { "epoch": 2.654036243822076, "grad_norm": 0.4419866694168216, "learning_rate": 3.32093926809641e-08, "loss": 0.2367, "step": 4833 }, { "epoch": 2.6545853926414056, "grad_norm": 0.4858334143656286, "learning_rate": 3.310553808781175e-08, "loss": 0.2854, "step": 4834 }, { "epoch": 2.655134541460736, "grad_norm": 0.5694831049555559, "learning_rate": 3.300184107121254e-08, "loss": 0.2443, "step": 4835 }, { "epoch": 2.655683690280066, "grad_norm": 0.47315185803673554, "learning_rate": 3.28983016661603e-08, "loss": 0.2211, "step": 4836 }, { "epoch": 2.656232839099396, "grad_norm": 1.0256090943503928, "learning_rate": 3.279491990759581e-08, "loss": 0.4211, "step": 4837 }, { "epoch": 2.656781987918726, "grad_norm": 0.5645629369708329, "learning_rate": 3.2691695830406496e-08, "loss": 0.2585, "step": 4838 }, { "epoch": 2.657331136738056, "grad_norm": 0.5431007264724083, "learning_rate": 3.2588629469426916e-08, "loss": 0.2752, "step": 4839 }, { "epoch": 2.657880285557386, "grad_norm": 0.5093189502130503, "learning_rate": 3.248572085943791e-08, "loss": 0.2291, "step": 4840 }, { "epoch": 2.658429434376716, "grad_norm": 0.5146057830957038, "learning_rate": 3.238297003516746e-08, "loss": 0.2413, "step": 4841 }, { "epoch": 2.658978583196046, "grad_norm": 0.4449937411261722, "learning_rate": 3.228037703129024e-08, "loss": 0.2294, "step": 4842 }, { "epoch": 2.659527732015376, "grad_norm": 0.41961856315516494, "learning_rate": 3.21779418824276e-08, "loss": 0.2106, "step": 4843 }, { "epoch": 2.660076880834706, "grad_norm": 0.4333694538975023, "learning_rate": 3.2075664623147476e-08, "loss": 0.2692, "step": 4844 }, { "epoch": 2.6606260296540363, "grad_norm": 0.4838558120526579, "learning_rate": 3.197354528796474e-08, "loss": 0.2453, "step": 4845 }, { "epoch": 2.6611751784733664, "grad_norm": 0.5347644049041683, "learning_rate": 3.1871583911341064e-08, "loss": 0.2707, "step": 4846 }, { "epoch": 2.6617243272926965, "grad_norm": 0.49431354929411075, "learning_rate": 3.176978052768447e-08, "loss": 0.2468, "step": 4847 }, { "epoch": 2.662273476112026, "grad_norm": 0.5847404281839407, "learning_rate": 3.166813517134998e-08, "loss": 0.228, "step": 4848 }, { "epoch": 2.6628226249313562, "grad_norm": 0.5193248938427552, "learning_rate": 3.156664787663911e-08, "loss": 0.2421, "step": 4849 }, { "epoch": 2.6633717737506863, "grad_norm": 0.4686096420199456, "learning_rate": 3.1465318677799986e-08, "loss": 0.2318, "step": 4850 }, { "epoch": 2.6639209225700164, "grad_norm": 0.6297040089024432, "learning_rate": 3.136414760902756e-08, "loss": 0.2359, "step": 4851 }, { "epoch": 2.6644700713893466, "grad_norm": 0.676232011286027, "learning_rate": 3.126313470446337e-08, "loss": 0.2683, "step": 4852 }, { "epoch": 2.6650192202086767, "grad_norm": 0.567241017164346, "learning_rate": 3.1162279998195576e-08, "loss": 0.2536, "step": 4853 }, { "epoch": 2.6655683690280068, "grad_norm": 0.44046380969904236, "learning_rate": 3.106158352425877e-08, "loss": 0.231, "step": 4854 }, { "epoch": 2.6661175178473364, "grad_norm": 0.5205575706978108, "learning_rate": 3.0961045316634415e-08, "loss": 0.2605, "step": 4855 }, { "epoch": 2.6666666666666665, "grad_norm": 0.3995389622633099, "learning_rate": 3.086066540925049e-08, "loss": 0.2491, "step": 4856 }, { "epoch": 2.6672158154859966, "grad_norm": 0.4517377287767716, "learning_rate": 3.076044383598139e-08, "loss": 0.2612, "step": 4857 }, { "epoch": 2.6677649643053267, "grad_norm": 0.5033442555925016, "learning_rate": 3.0660380630648336e-08, "loss": 0.2545, "step": 4858 }, { "epoch": 2.668314113124657, "grad_norm": 0.4426893257620993, "learning_rate": 3.056047582701894e-08, "loss": 0.2334, "step": 4859 }, { "epoch": 2.668863261943987, "grad_norm": 0.5027233849534848, "learning_rate": 3.046072945880737e-08, "loss": 0.2412, "step": 4860 }, { "epoch": 2.669412410763317, "grad_norm": 0.49120870198862665, "learning_rate": 3.036114155967426e-08, "loss": 0.2185, "step": 4861 }, { "epoch": 2.6699615595826467, "grad_norm": 0.5424228261614846, "learning_rate": 3.0261712163227045e-08, "loss": 0.2388, "step": 4862 }, { "epoch": 2.670510708401977, "grad_norm": 0.43651380995353, "learning_rate": 3.016244130301935e-08, "loss": 0.2186, "step": 4863 }, { "epoch": 2.671059857221307, "grad_norm": 0.46963173674274244, "learning_rate": 3.0063329012551465e-08, "loss": 0.2778, "step": 4864 }, { "epoch": 2.671609006040637, "grad_norm": 0.5121753666209871, "learning_rate": 2.996437532527013e-08, "loss": 0.2586, "step": 4865 }, { "epoch": 2.672158154859967, "grad_norm": 0.4614380929876249, "learning_rate": 2.986558027456862e-08, "loss": 0.2412, "step": 4866 }, { "epoch": 2.672707303679297, "grad_norm": 0.45120111555524817, "learning_rate": 2.9766943893786485e-08, "loss": 0.2378, "step": 4867 }, { "epoch": 2.6732564524986273, "grad_norm": 0.6231189870386167, "learning_rate": 2.966846621620999e-08, "loss": 0.2519, "step": 4868 }, { "epoch": 2.673805601317957, "grad_norm": 0.48047031557212383, "learning_rate": 2.9570147275071782e-08, "loss": 0.2457, "step": 4869 }, { "epoch": 2.674354750137287, "grad_norm": 0.5351004762584878, "learning_rate": 2.9471987103550827e-08, "loss": 0.2748, "step": 4870 }, { "epoch": 2.674903898956617, "grad_norm": 0.5651518470370255, "learning_rate": 2.9373985734772436e-08, "loss": 0.2877, "step": 4871 }, { "epoch": 2.6754530477759473, "grad_norm": 0.42705072077220757, "learning_rate": 2.927614320180855e-08, "loss": 0.2229, "step": 4872 }, { "epoch": 2.6760021965952774, "grad_norm": 0.6181988755090085, "learning_rate": 2.9178459537677522e-08, "loss": 0.2876, "step": 4873 }, { "epoch": 2.6765513454146075, "grad_norm": 0.44169036427918057, "learning_rate": 2.9080934775343844e-08, "loss": 0.2758, "step": 4874 }, { "epoch": 2.6771004942339376, "grad_norm": 0.5315306275174523, "learning_rate": 2.8983568947718617e-08, "loss": 0.2345, "step": 4875 }, { "epoch": 2.6776496430532672, "grad_norm": 0.4628996329856816, "learning_rate": 2.888636208765916e-08, "loss": 0.2921, "step": 4876 }, { "epoch": 2.6781987918725974, "grad_norm": 0.5411055501001494, "learning_rate": 2.878931422796928e-08, "loss": 0.2415, "step": 4877 }, { "epoch": 2.6787479406919275, "grad_norm": 0.4915468666057728, "learning_rate": 2.8692425401398947e-08, "loss": 0.2584, "step": 4878 }, { "epoch": 2.6792970895112576, "grad_norm": 0.5244792845127695, "learning_rate": 2.859569564064474e-08, "loss": 0.2556, "step": 4879 }, { "epoch": 2.6798462383305877, "grad_norm": 0.5590238983774226, "learning_rate": 2.8499124978349162e-08, "loss": 0.2437, "step": 4880 }, { "epoch": 2.6803953871499178, "grad_norm": 0.6340945545160989, "learning_rate": 2.8402713447101498e-08, "loss": 0.2718, "step": 4881 }, { "epoch": 2.680944535969248, "grad_norm": 0.5530680597235798, "learning_rate": 2.830646107943685e-08, "loss": 0.2145, "step": 4882 }, { "epoch": 2.6814936847885775, "grad_norm": 0.48490918468783145, "learning_rate": 2.82103679078371e-08, "loss": 0.2512, "step": 4883 }, { "epoch": 2.6820428336079076, "grad_norm": 0.6962739720227539, "learning_rate": 2.8114433964729894e-08, "loss": 0.2938, "step": 4884 }, { "epoch": 2.6825919824272377, "grad_norm": 0.5233997463606875, "learning_rate": 2.801865928248954e-08, "loss": 0.2273, "step": 4885 }, { "epoch": 2.683141131246568, "grad_norm": 0.4171421413165741, "learning_rate": 2.7923043893436504e-08, "loss": 0.215, "step": 4886 }, { "epoch": 2.683690280065898, "grad_norm": 0.6892123477886988, "learning_rate": 2.7827587829837414e-08, "loss": 0.2849, "step": 4887 }, { "epoch": 2.684239428885228, "grad_norm": 0.5174142156914096, "learning_rate": 2.7732291123905114e-08, "loss": 0.2453, "step": 4888 }, { "epoch": 2.684788577704558, "grad_norm": 0.4563016924596952, "learning_rate": 2.7637153807798762e-08, "loss": 0.2537, "step": 4889 }, { "epoch": 2.685337726523888, "grad_norm": 0.4861223305876166, "learning_rate": 2.7542175913623805e-08, "loss": 0.2223, "step": 4890 }, { "epoch": 2.685886875343218, "grad_norm": 0.40081623983261666, "learning_rate": 2.744735747343156e-08, "loss": 0.2559, "step": 4891 }, { "epoch": 2.686436024162548, "grad_norm": 0.5721874703656299, "learning_rate": 2.7352698519220066e-08, "loss": 0.3036, "step": 4892 }, { "epoch": 2.686985172981878, "grad_norm": 0.4667411153445306, "learning_rate": 2.7258199082933015e-08, "loss": 0.2723, "step": 4893 }, { "epoch": 2.687534321801208, "grad_norm": 0.5192952045162714, "learning_rate": 2.7163859196460424e-08, "loss": 0.2507, "step": 4894 }, { "epoch": 2.6880834706205383, "grad_norm": 0.4527469898357065, "learning_rate": 2.7069678891638645e-08, "loss": 0.2601, "step": 4895 }, { "epoch": 2.6886326194398684, "grad_norm": 0.4309425637236867, "learning_rate": 2.6975658200250185e-08, "loss": 0.2461, "step": 4896 }, { "epoch": 2.689181768259198, "grad_norm": 0.5844077457521643, "learning_rate": 2.6881797154023376e-08, "loss": 0.2267, "step": 4897 }, { "epoch": 2.689730917078528, "grad_norm": 0.47405942808257356, "learning_rate": 2.6788095784632816e-08, "loss": 0.2216, "step": 4898 }, { "epoch": 2.6902800658978583, "grad_norm": 0.4239939799608646, "learning_rate": 2.6694554123699443e-08, "loss": 0.2195, "step": 4899 }, { "epoch": 2.6908292147171884, "grad_norm": 0.5087841237892199, "learning_rate": 2.6601172202790067e-08, "loss": 0.3188, "step": 4900 }, { "epoch": 2.6913783635365185, "grad_norm": 0.548148178196635, "learning_rate": 2.650795005341755e-08, "loss": 0.2592, "step": 4901 }, { "epoch": 2.6919275123558486, "grad_norm": 0.5109474338816282, "learning_rate": 2.6414887707041015e-08, "loss": 0.236, "step": 4902 }, { "epoch": 2.6924766611751787, "grad_norm": 0.5446294601714476, "learning_rate": 2.632198519506565e-08, "loss": 0.2517, "step": 4903 }, { "epoch": 2.6930258099945084, "grad_norm": 0.5458928161896962, "learning_rate": 2.6229242548842566e-08, "loss": 0.2276, "step": 4904 }, { "epoch": 2.6935749588138385, "grad_norm": 0.4023448717764117, "learning_rate": 2.6136659799668878e-08, "loss": 0.2252, "step": 4905 }, { "epoch": 2.6941241076331686, "grad_norm": 0.5578741475607514, "learning_rate": 2.604423697878791e-08, "loss": 0.2382, "step": 4906 }, { "epoch": 2.6946732564524987, "grad_norm": 0.6101061316383598, "learning_rate": 2.595197411738909e-08, "loss": 0.2423, "step": 4907 }, { "epoch": 2.6952224052718288, "grad_norm": 0.4884589627863502, "learning_rate": 2.5859871246607674e-08, "loss": 0.2716, "step": 4908 }, { "epoch": 2.6957715540911584, "grad_norm": 0.5182588558553695, "learning_rate": 2.5767928397524798e-08, "loss": 0.2734, "step": 4909 }, { "epoch": 2.696320702910489, "grad_norm": 0.5245660927892314, "learning_rate": 2.56761456011681e-08, "loss": 0.2588, "step": 4910 }, { "epoch": 2.6968698517298186, "grad_norm": 0.5914038936050346, "learning_rate": 2.5584522888510642e-08, "loss": 0.2631, "step": 4911 }, { "epoch": 2.6974190005491487, "grad_norm": 0.529872640793922, "learning_rate": 2.5493060290471763e-08, "loss": 0.26, "step": 4912 }, { "epoch": 2.697968149368479, "grad_norm": 0.4488507125725789, "learning_rate": 2.540175783791685e-08, "loss": 0.2762, "step": 4913 }, { "epoch": 2.698517298187809, "grad_norm": 0.4845562250305764, "learning_rate": 2.5310615561657066e-08, "loss": 0.235, "step": 4914 }, { "epoch": 2.699066447007139, "grad_norm": 0.5874808642021006, "learning_rate": 2.521963349244939e-08, "loss": 0.2441, "step": 4915 }, { "epoch": 2.6996155958264687, "grad_norm": 0.4473712901296477, "learning_rate": 2.5128811660997138e-08, "loss": 0.2413, "step": 4916 }, { "epoch": 2.7001647446457993, "grad_norm": 0.4902113364196196, "learning_rate": 2.5038150097949286e-08, "loss": 0.2331, "step": 4917 }, { "epoch": 2.700713893465129, "grad_norm": 0.6036341792134454, "learning_rate": 2.4947648833900677e-08, "loss": 0.2622, "step": 4918 }, { "epoch": 2.701263042284459, "grad_norm": 0.6077157233582943, "learning_rate": 2.4857307899392326e-08, "loss": 0.2476, "step": 4919 }, { "epoch": 2.701812191103789, "grad_norm": 0.5190692657498618, "learning_rate": 2.4767127324910748e-08, "loss": 0.223, "step": 4920 }, { "epoch": 2.702361339923119, "grad_norm": 0.442205025721488, "learning_rate": 2.467710714088876e-08, "loss": 0.2442, "step": 4921 }, { "epoch": 2.7029104887424493, "grad_norm": 0.5356300374663129, "learning_rate": 2.4587247377704706e-08, "loss": 0.2741, "step": 4922 }, { "epoch": 2.703459637561779, "grad_norm": 0.5836134635015584, "learning_rate": 2.449754806568306e-08, "loss": 0.2241, "step": 4923 }, { "epoch": 2.7040087863811095, "grad_norm": 0.5271917207323396, "learning_rate": 2.4408009235094023e-08, "loss": 0.2686, "step": 4924 }, { "epoch": 2.704557935200439, "grad_norm": 0.48575130148369294, "learning_rate": 2.4318630916153627e-08, "loss": 0.1987, "step": 4925 }, { "epoch": 2.7051070840197693, "grad_norm": 0.4439269282609546, "learning_rate": 2.4229413139023667e-08, "loss": 0.2472, "step": 4926 }, { "epoch": 2.7056562328390994, "grad_norm": 0.5748669959447291, "learning_rate": 2.4140355933812052e-08, "loss": 0.2413, "step": 4927 }, { "epoch": 2.7062053816584295, "grad_norm": 0.4646401593129316, "learning_rate": 2.4051459330572113e-08, "loss": 0.2271, "step": 4928 }, { "epoch": 2.7067545304777596, "grad_norm": 0.4906102769161399, "learning_rate": 2.3962723359303252e-08, "loss": 0.2642, "step": 4929 }, { "epoch": 2.7073036792970893, "grad_norm": 0.5596044901439898, "learning_rate": 2.3874148049950668e-08, "loss": 0.2356, "step": 4930 }, { "epoch": 2.70785282811642, "grad_norm": 0.467085806425759, "learning_rate": 2.3785733432405254e-08, "loss": 0.2588, "step": 4931 }, { "epoch": 2.7084019769357495, "grad_norm": 0.45690554314413107, "learning_rate": 2.3697479536503533e-08, "loss": 0.224, "step": 4932 }, { "epoch": 2.7089511257550796, "grad_norm": 0.4031258936131543, "learning_rate": 2.3609386392028038e-08, "loss": 0.2644, "step": 4933 }, { "epoch": 2.7095002745744097, "grad_norm": 0.4883419823489305, "learning_rate": 2.3521454028707017e-08, "loss": 0.2629, "step": 4934 }, { "epoch": 2.7100494233937398, "grad_norm": 0.5625389695208602, "learning_rate": 2.3433682476214315e-08, "loss": 0.2644, "step": 4935 }, { "epoch": 2.71059857221307, "grad_norm": 0.45619597404894086, "learning_rate": 2.334607176416966e-08, "loss": 0.2285, "step": 4936 }, { "epoch": 2.7111477210323995, "grad_norm": 0.4852952628418778, "learning_rate": 2.3258621922138397e-08, "loss": 0.2503, "step": 4937 }, { "epoch": 2.71169686985173, "grad_norm": 0.5175605498385517, "learning_rate": 2.317133297963168e-08, "loss": 0.2651, "step": 4938 }, { "epoch": 2.7122460186710597, "grad_norm": 0.5748179756578118, "learning_rate": 2.3084204966106164e-08, "loss": 0.2494, "step": 4939 }, { "epoch": 2.71279516749039, "grad_norm": 0.37584208995286544, "learning_rate": 2.2997237910964503e-08, "loss": 0.2639, "step": 4940 }, { "epoch": 2.71334431630972, "grad_norm": 0.5816300527133113, "learning_rate": 2.291043184355489e-08, "loss": 0.2698, "step": 4941 }, { "epoch": 2.71389346512905, "grad_norm": 0.5689256369093204, "learning_rate": 2.282378679317113e-08, "loss": 0.2273, "step": 4942 }, { "epoch": 2.71444261394838, "grad_norm": 0.5090513664812023, "learning_rate": 2.273730278905263e-08, "loss": 0.2519, "step": 4943 }, { "epoch": 2.71499176276771, "grad_norm": 0.4261794224335437, "learning_rate": 2.2650979860384673e-08, "loss": 0.2425, "step": 4944 }, { "epoch": 2.7155409115870404, "grad_norm": 0.375358571443588, "learning_rate": 2.256481803629799e-08, "loss": 0.2646, "step": 4945 }, { "epoch": 2.71609006040637, "grad_norm": 0.5481259155015067, "learning_rate": 2.247881734586908e-08, "loss": 0.2128, "step": 4946 }, { "epoch": 2.7166392092257, "grad_norm": 0.5224270666361082, "learning_rate": 2.239297781812004e-08, "loss": 0.2209, "step": 4947 }, { "epoch": 2.7171883580450302, "grad_norm": 0.5589637620447399, "learning_rate": 2.2307299482018527e-08, "loss": 0.2426, "step": 4948 }, { "epoch": 2.7177375068643603, "grad_norm": 0.5079024072759784, "learning_rate": 2.2221782366477737e-08, "loss": 0.2334, "step": 4949 }, { "epoch": 2.7182866556836904, "grad_norm": 0.4648135145274041, "learning_rate": 2.213642650035659e-08, "loss": 0.2594, "step": 4950 }, { "epoch": 2.71883580450302, "grad_norm": 0.4253684948590923, "learning_rate": 2.20512319124596e-08, "loss": 0.2234, "step": 4951 }, { "epoch": 2.71938495332235, "grad_norm": 0.4754023150802178, "learning_rate": 2.196619863153673e-08, "loss": 0.2489, "step": 4952 }, { "epoch": 2.7199341021416803, "grad_norm": 0.5029429333093721, "learning_rate": 2.1881326686283707e-08, "loss": 0.2471, "step": 4953 }, { "epoch": 2.7204832509610104, "grad_norm": 0.40527744255725423, "learning_rate": 2.1796616105341474e-08, "loss": 0.2563, "step": 4954 }, { "epoch": 2.7210323997803405, "grad_norm": 0.46073782796084034, "learning_rate": 2.1712066917296925e-08, "loss": 0.2453, "step": 4955 }, { "epoch": 2.7215815485996706, "grad_norm": 0.5350719592123236, "learning_rate": 2.1627679150682195e-08, "loss": 0.2333, "step": 4956 }, { "epoch": 2.7221306974190007, "grad_norm": 0.4119552767031087, "learning_rate": 2.1543452833975003e-08, "loss": 0.2655, "step": 4957 }, { "epoch": 2.7226798462383304, "grad_norm": 0.48166289941841856, "learning_rate": 2.1459387995598875e-08, "loss": 0.2892, "step": 4958 }, { "epoch": 2.7232289950576605, "grad_norm": 0.44100757522445644, "learning_rate": 2.1375484663922278e-08, "loss": 0.2186, "step": 4959 }, { "epoch": 2.7237781438769906, "grad_norm": 0.4993312172052009, "learning_rate": 2.1291742867259667e-08, "loss": 0.2528, "step": 4960 }, { "epoch": 2.7243272926963207, "grad_norm": 0.461729328950306, "learning_rate": 2.1208162633870842e-08, "loss": 0.2527, "step": 4961 }, { "epoch": 2.724876441515651, "grad_norm": 0.46348331423329175, "learning_rate": 2.112474399196091e-08, "loss": 0.258, "step": 4962 }, { "epoch": 2.725425590334981, "grad_norm": 0.5620340865649814, "learning_rate": 2.104148696968065e-08, "loss": 0.2215, "step": 4963 }, { "epoch": 2.725974739154311, "grad_norm": 0.5338029847880069, "learning_rate": 2.095839159512633e-08, "loss": 0.2399, "step": 4964 }, { "epoch": 2.7265238879736406, "grad_norm": 0.48888011503359435, "learning_rate": 2.087545789633954e-08, "loss": 0.2088, "step": 4965 }, { "epoch": 2.7270730367929708, "grad_norm": 0.5174026137824582, "learning_rate": 2.0792685901307267e-08, "loss": 0.2575, "step": 4966 }, { "epoch": 2.727622185612301, "grad_norm": 0.45864951542109644, "learning_rate": 2.0710075637962034e-08, "loss": 0.237, "step": 4967 }, { "epoch": 2.728171334431631, "grad_norm": 0.4079523939350482, "learning_rate": 2.0627627134181862e-08, "loss": 0.271, "step": 4968 }, { "epoch": 2.728720483250961, "grad_norm": 0.437259590008257, "learning_rate": 2.0545340417789987e-08, "loss": 0.2538, "step": 4969 }, { "epoch": 2.729269632070291, "grad_norm": 0.569050611168137, "learning_rate": 2.046321551655514e-08, "loss": 0.2743, "step": 4970 }, { "epoch": 2.7298187808896213, "grad_norm": 0.5257257757668972, "learning_rate": 2.0381252458191435e-08, "loss": 0.2359, "step": 4971 }, { "epoch": 2.730367929708951, "grad_norm": 0.5759081388039314, "learning_rate": 2.0299451270358534e-08, "loss": 0.2544, "step": 4972 }, { "epoch": 2.730917078528281, "grad_norm": 0.4225202393414164, "learning_rate": 2.0217811980661085e-08, "loss": 0.2451, "step": 4973 }, { "epoch": 2.731466227347611, "grad_norm": 0.5070898998372633, "learning_rate": 2.0136334616649524e-08, "loss": 0.255, "step": 4974 }, { "epoch": 2.7320153761669412, "grad_norm": 0.42131656723878, "learning_rate": 2.0055019205819546e-08, "loss": 0.232, "step": 4975 }, { "epoch": 2.7325645249862713, "grad_norm": 0.48378272458071003, "learning_rate": 1.997386577561184e-08, "loss": 0.2282, "step": 4976 }, { "epoch": 2.7331136738056014, "grad_norm": 0.5315784443276939, "learning_rate": 1.9892874353412822e-08, "loss": 0.2408, "step": 4977 }, { "epoch": 2.7336628226249315, "grad_norm": 0.564072666372069, "learning_rate": 1.981204496655416e-08, "loss": 0.2162, "step": 4978 }, { "epoch": 2.734211971444261, "grad_norm": 0.4742593676266815, "learning_rate": 1.97313776423127e-08, "loss": 0.2427, "step": 4979 }, { "epoch": 2.7347611202635913, "grad_norm": 0.5115095606261856, "learning_rate": 1.9650872407910772e-08, "loss": 0.1961, "step": 4980 }, { "epoch": 2.7353102690829214, "grad_norm": 0.5004548845267979, "learning_rate": 1.9570529290515876e-08, "loss": 0.2737, "step": 4981 }, { "epoch": 2.7358594179022515, "grad_norm": 0.4687813168646375, "learning_rate": 1.9490348317240937e-08, "loss": 0.2439, "step": 4982 }, { "epoch": 2.7364085667215816, "grad_norm": 0.5635970057686996, "learning_rate": 1.941032951514399e-08, "loss": 0.2278, "step": 4983 }, { "epoch": 2.7369577155409117, "grad_norm": 0.5951728546258948, "learning_rate": 1.9330472911228403e-08, "loss": 0.2456, "step": 4984 }, { "epoch": 2.737506864360242, "grad_norm": 0.45844129921794513, "learning_rate": 1.925077853244298e-08, "loss": 0.2126, "step": 4985 }, { "epoch": 2.7380560131795715, "grad_norm": 0.38123684682173714, "learning_rate": 1.9171246405681562e-08, "loss": 0.2287, "step": 4986 }, { "epoch": 2.7386051619989016, "grad_norm": 0.49315016689118535, "learning_rate": 1.909187655778323e-08, "loss": 0.2378, "step": 4987 }, { "epoch": 2.7391543108182317, "grad_norm": 0.5101497171317687, "learning_rate": 1.9012669015532473e-08, "loss": 0.2391, "step": 4988 }, { "epoch": 2.739703459637562, "grad_norm": 0.45965694559687714, "learning_rate": 1.893362380565897e-08, "loss": 0.2212, "step": 4989 }, { "epoch": 2.740252608456892, "grad_norm": 0.5542807583698355, "learning_rate": 1.8854740954837427e-08, "loss": 0.2477, "step": 4990 }, { "epoch": 2.740801757276222, "grad_norm": 0.4971634743216813, "learning_rate": 1.8776020489688053e-08, "loss": 0.2571, "step": 4991 }, { "epoch": 2.741350906095552, "grad_norm": 0.4486212172025292, "learning_rate": 1.8697462436776e-08, "loss": 0.2677, "step": 4992 }, { "epoch": 2.7419000549148818, "grad_norm": 0.4892775376051474, "learning_rate": 1.8619066822611688e-08, "loss": 0.294, "step": 4993 }, { "epoch": 2.742449203734212, "grad_norm": 0.4729746022527434, "learning_rate": 1.854083367365085e-08, "loss": 0.2679, "step": 4994 }, { "epoch": 2.742998352553542, "grad_norm": 0.4216127257532748, "learning_rate": 1.8462763016294236e-08, "loss": 0.2423, "step": 4995 }, { "epoch": 2.743547501372872, "grad_norm": 1.1914398622962583, "learning_rate": 1.8384854876887797e-08, "loss": 0.2078, "step": 4996 }, { "epoch": 2.744096650192202, "grad_norm": 0.4367380337872933, "learning_rate": 1.8307109281722756e-08, "loss": 0.2565, "step": 4997 }, { "epoch": 2.7446457990115323, "grad_norm": 0.5649037433721597, "learning_rate": 1.8229526257035234e-08, "loss": 0.2576, "step": 4998 }, { "epoch": 2.7451949478308624, "grad_norm": 0.4180890148063339, "learning_rate": 1.8152105829006822e-08, "loss": 0.2282, "step": 4999 }, { "epoch": 2.745744096650192, "grad_norm": 0.4385310182183909, "learning_rate": 1.8074848023763906e-08, "loss": 0.2877, "step": 5000 }, { "epoch": 2.745744096650192, "eval_loss": 0.3212934732437134, "eval_runtime": 21.8566, "eval_samples_per_second": 20.268, "eval_steps_per_second": 0.869, "step": 5000 }, { "epoch": 2.746293245469522, "grad_norm": 1.0854706808411037, "learning_rate": 1.7997752867378173e-08, "loss": 0.3939, "step": 5001 }, { "epoch": 2.7468423942888522, "grad_norm": 0.5488600894277761, "learning_rate": 1.79208203858665e-08, "loss": 0.2223, "step": 5002 }, { "epoch": 2.7473915431081823, "grad_norm": 1.052559729427076, "learning_rate": 1.7844050605190734e-08, "loss": 0.2392, "step": 5003 }, { "epoch": 2.7479406919275124, "grad_norm": 0.4639835887427518, "learning_rate": 1.776744355125773e-08, "loss": 0.2398, "step": 5004 }, { "epoch": 2.7484898407468425, "grad_norm": 0.4203915465420762, "learning_rate": 1.7690999249919668e-08, "loss": 0.2091, "step": 5005 }, { "epoch": 2.7490389895661727, "grad_norm": 0.5421252102410677, "learning_rate": 1.761471772697371e-08, "loss": 0.2653, "step": 5006 }, { "epoch": 2.7495881383855023, "grad_norm": 0.48594127185207453, "learning_rate": 1.753859900816192e-08, "loss": 0.2903, "step": 5007 }, { "epoch": 2.7501372872048324, "grad_norm": 0.3801758684200115, "learning_rate": 1.746264311917167e-08, "loss": 0.2398, "step": 5008 }, { "epoch": 2.7506864360241625, "grad_norm": 0.5051549780740687, "learning_rate": 1.738685008563529e-08, "loss": 0.2307, "step": 5009 }, { "epoch": 2.7512355848434926, "grad_norm": 0.4860628714145241, "learning_rate": 1.731121993312997e-08, "loss": 0.2573, "step": 5010 }, { "epoch": 2.7517847336628227, "grad_norm": 0.49585270929986675, "learning_rate": 1.7235752687178252e-08, "loss": 0.2199, "step": 5011 }, { "epoch": 2.752333882482153, "grad_norm": 0.4387766748218117, "learning_rate": 1.7160448373247596e-08, "loss": 0.1984, "step": 5012 }, { "epoch": 2.752883031301483, "grad_norm": 0.5224956328166734, "learning_rate": 1.7085307016750357e-08, "loss": 0.2633, "step": 5013 }, { "epoch": 2.7534321801208126, "grad_norm": 0.5967674768119109, "learning_rate": 1.7010328643043875e-08, "loss": 0.244, "step": 5014 }, { "epoch": 2.7539813289401427, "grad_norm": 0.5569663904705111, "learning_rate": 1.693551327743072e-08, "loss": 0.321, "step": 5015 }, { "epoch": 2.754530477759473, "grad_norm": 0.3880751461877746, "learning_rate": 1.686086094515838e-08, "loss": 0.2734, "step": 5016 }, { "epoch": 2.755079626578803, "grad_norm": 0.3654994366709466, "learning_rate": 1.6786371671419077e-08, "loss": 0.286, "step": 5017 }, { "epoch": 2.755628775398133, "grad_norm": 0.48424280020099336, "learning_rate": 1.6712045481350297e-08, "loss": 0.2587, "step": 5018 }, { "epoch": 2.7561779242174627, "grad_norm": 0.42123712627422644, "learning_rate": 1.6637882400034413e-08, "loss": 0.2101, "step": 5019 }, { "epoch": 2.756727073036793, "grad_norm": 0.6383529619197001, "learning_rate": 1.6563882452498785e-08, "loss": 0.2529, "step": 5020 }, { "epoch": 2.757276221856123, "grad_norm": 0.4531460010661179, "learning_rate": 1.6490045663715498e-08, "loss": 0.261, "step": 5021 }, { "epoch": 2.757825370675453, "grad_norm": 0.5546910098671959, "learning_rate": 1.641637205860184e-08, "loss": 0.2683, "step": 5022 }, { "epoch": 2.758374519494783, "grad_norm": 0.488864237138569, "learning_rate": 1.634286166202e-08, "loss": 0.2401, "step": 5023 }, { "epoch": 2.758923668314113, "grad_norm": 0.6878684562227984, "learning_rate": 1.6269514498776976e-08, "loss": 0.3071, "step": 5024 }, { "epoch": 2.7594728171334433, "grad_norm": 0.5127496862945731, "learning_rate": 1.619633059362467e-08, "loss": 0.2248, "step": 5025 }, { "epoch": 2.760021965952773, "grad_norm": 0.5857746240738712, "learning_rate": 1.612330997126006e-08, "loss": 0.258, "step": 5026 }, { "epoch": 2.7605711147721035, "grad_norm": 0.5218581751604049, "learning_rate": 1.6050452656324816e-08, "loss": 0.2876, "step": 5027 }, { "epoch": 2.761120263591433, "grad_norm": 0.910198175727525, "learning_rate": 1.5977758673405637e-08, "loss": 0.2745, "step": 5028 }, { "epoch": 2.7616694124107632, "grad_norm": 0.44545785180866987, "learning_rate": 1.5905228047034117e-08, "loss": 0.2216, "step": 5029 }, { "epoch": 2.7622185612300933, "grad_norm": 0.5083825130211291, "learning_rate": 1.583286080168666e-08, "loss": 0.2254, "step": 5030 }, { "epoch": 2.7627677100494235, "grad_norm": 0.554591639047804, "learning_rate": 1.5760656961784462e-08, "loss": 0.2495, "step": 5031 }, { "epoch": 2.7633168588687536, "grad_norm": 0.6383204547398789, "learning_rate": 1.5688616551693707e-08, "loss": 0.2612, "step": 5032 }, { "epoch": 2.763866007688083, "grad_norm": 0.5003762491750082, "learning_rate": 1.5616739595725448e-08, "loss": 0.2817, "step": 5033 }, { "epoch": 2.7644151565074138, "grad_norm": 0.47604032925092815, "learning_rate": 1.5545026118135364e-08, "loss": 0.2335, "step": 5034 }, { "epoch": 2.7649643053267434, "grad_norm": 0.476299110323979, "learning_rate": 1.5473476143124284e-08, "loss": 0.2419, "step": 5035 }, { "epoch": 2.7655134541460735, "grad_norm": 0.5789921181569709, "learning_rate": 1.540208969483753e-08, "loss": 0.2427, "step": 5036 }, { "epoch": 2.7660626029654036, "grad_norm": 0.5456376708863133, "learning_rate": 1.5330866797365533e-08, "loss": 0.2181, "step": 5037 }, { "epoch": 2.7666117517847337, "grad_norm": 0.42173693807413404, "learning_rate": 1.525980747474327e-08, "loss": 0.2777, "step": 5038 }, { "epoch": 2.767160900604064, "grad_norm": 0.5867067008902789, "learning_rate": 1.5188911750950664e-08, "loss": 0.2342, "step": 5039 }, { "epoch": 2.7677100494233935, "grad_norm": 0.5239161317036305, "learning_rate": 1.511817964991257e-08, "loss": 0.2431, "step": 5040 }, { "epoch": 2.768259198242724, "grad_norm": 0.489620707462445, "learning_rate": 1.5047611195498277e-08, "loss": 0.2316, "step": 5041 }, { "epoch": 2.7688083470620537, "grad_norm": 0.5457035567455386, "learning_rate": 1.4977206411522082e-08, "loss": 0.2894, "step": 5042 }, { "epoch": 2.769357495881384, "grad_norm": 0.5056465850379799, "learning_rate": 1.4906965321743097e-08, "loss": 0.2522, "step": 5043 }, { "epoch": 2.769906644700714, "grad_norm": 0.6277435963066549, "learning_rate": 1.4836887949864992e-08, "loss": 0.2401, "step": 5044 }, { "epoch": 2.770455793520044, "grad_norm": 0.44527141389848307, "learning_rate": 1.4766974319536313e-08, "loss": 0.2396, "step": 5045 }, { "epoch": 2.771004942339374, "grad_norm": 0.5034651401467697, "learning_rate": 1.469722445435044e-08, "loss": 0.2367, "step": 5046 }, { "epoch": 2.7715540911587038, "grad_norm": 0.4805708168187247, "learning_rate": 1.4627638377845351e-08, "loss": 0.2524, "step": 5047 }, { "epoch": 2.7721032399780343, "grad_norm": 0.4440815384680824, "learning_rate": 1.4558216113503639e-08, "loss": 0.2286, "step": 5048 }, { "epoch": 2.772652388797364, "grad_norm": 0.9774905785471373, "learning_rate": 1.4488957684752826e-08, "loss": 0.4553, "step": 5049 }, { "epoch": 2.773201537616694, "grad_norm": 0.6693809056863416, "learning_rate": 1.4419863114965211e-08, "loss": 0.2633, "step": 5050 }, { "epoch": 2.773750686436024, "grad_norm": 0.5476047607260525, "learning_rate": 1.4350932427457535e-08, "loss": 0.2579, "step": 5051 }, { "epoch": 2.7742998352553543, "grad_norm": 0.5221304728367935, "learning_rate": 1.4282165645491413e-08, "loss": 0.2331, "step": 5052 }, { "epoch": 2.7748489840746844, "grad_norm": 0.5849446900620262, "learning_rate": 1.4213562792273071e-08, "loss": 0.2477, "step": 5053 }, { "epoch": 2.775398132894014, "grad_norm": 0.501502879559359, "learning_rate": 1.414512389095351e-08, "loss": 0.2284, "step": 5054 }, { "epoch": 2.775947281713344, "grad_norm": 0.4901784157951659, "learning_rate": 1.4076848964628214e-08, "loss": 0.2268, "step": 5055 }, { "epoch": 2.7764964305326743, "grad_norm": 0.5226259491762343, "learning_rate": 1.4008738036337567e-08, "loss": 0.2195, "step": 5056 }, { "epoch": 2.7770455793520044, "grad_norm": 0.408452025808108, "learning_rate": 1.3940791129066543e-08, "loss": 0.2003, "step": 5057 }, { "epoch": 2.7775947281713345, "grad_norm": 0.5412003027581265, "learning_rate": 1.3873008265744506e-08, "loss": 0.2617, "step": 5058 }, { "epoch": 2.7781438769906646, "grad_norm": 0.4275557754611918, "learning_rate": 1.3805389469245817e-08, "loss": 0.2794, "step": 5059 }, { "epoch": 2.7786930258099947, "grad_norm": 0.5232457286044238, "learning_rate": 1.3737934762389434e-08, "loss": 0.2611, "step": 5060 }, { "epoch": 2.7792421746293243, "grad_norm": 0.6042942464641426, "learning_rate": 1.3670644167938595e-08, "loss": 0.2724, "step": 5061 }, { "epoch": 2.7797913234486544, "grad_norm": 0.4607015355966124, "learning_rate": 1.3603517708601584e-08, "loss": 0.2388, "step": 5062 }, { "epoch": 2.7803404722679845, "grad_norm": 0.5167660139283935, "learning_rate": 1.3536555407031068e-08, "loss": 0.2689, "step": 5063 }, { "epoch": 2.7808896210873146, "grad_norm": 0.5933247017065723, "learning_rate": 1.3469757285824323e-08, "loss": 0.2287, "step": 5064 }, { "epoch": 2.7814387699066447, "grad_norm": 0.5464373060696974, "learning_rate": 1.3403123367523224e-08, "loss": 0.2273, "step": 5065 }, { "epoch": 2.781987918725975, "grad_norm": 0.36790815428673795, "learning_rate": 1.3336653674614314e-08, "loss": 0.2871, "step": 5066 }, { "epoch": 2.782537067545305, "grad_norm": 0.5215757250714017, "learning_rate": 1.3270348229528736e-08, "loss": 0.2195, "step": 5067 }, { "epoch": 2.7830862163646346, "grad_norm": 0.4562082004135086, "learning_rate": 1.320420705464202e-08, "loss": 0.245, "step": 5068 }, { "epoch": 2.7836353651839647, "grad_norm": 0.4427567277389462, "learning_rate": 1.3138230172274354e-08, "loss": 0.2299, "step": 5069 }, { "epoch": 2.784184514003295, "grad_norm": 0.576166295044275, "learning_rate": 1.307241760469059e-08, "loss": 0.2532, "step": 5070 }, { "epoch": 2.784733662822625, "grad_norm": 0.5043721559039088, "learning_rate": 1.3006769374100073e-08, "loss": 0.2661, "step": 5071 }, { "epoch": 2.785282811641955, "grad_norm": 0.5380318967403359, "learning_rate": 1.2941285502656528e-08, "loss": 0.2409, "step": 5072 }, { "epoch": 2.785831960461285, "grad_norm": 0.4770436405958683, "learning_rate": 1.287596601245846e-08, "loss": 0.2173, "step": 5073 }, { "epoch": 2.786381109280615, "grad_norm": 0.48774650881276055, "learning_rate": 1.281081092554886e-08, "loss": 0.1917, "step": 5074 }, { "epoch": 2.786930258099945, "grad_norm": 0.5119294617907558, "learning_rate": 1.274582026391494e-08, "loss": 0.2479, "step": 5075 }, { "epoch": 2.787479406919275, "grad_norm": 0.6040755403567626, "learning_rate": 1.2680994049488794e-08, "loss": 0.2503, "step": 5076 }, { "epoch": 2.788028555738605, "grad_norm": 0.5739021318312408, "learning_rate": 1.2616332304146955e-08, "loss": 0.2829, "step": 5077 }, { "epoch": 2.788577704557935, "grad_norm": 0.541707980208926, "learning_rate": 1.2551835049710174e-08, "loss": 0.2568, "step": 5078 }, { "epoch": 2.7891268533772653, "grad_norm": 0.5952349983750594, "learning_rate": 1.248750230794414e-08, "loss": 0.2701, "step": 5079 }, { "epoch": 2.7896760021965954, "grad_norm": 0.4487889813853214, "learning_rate": 1.242333410055859e-08, "loss": 0.2399, "step": 5080 }, { "epoch": 2.7902251510159255, "grad_norm": 0.6232405043855543, "learning_rate": 1.2359330449208034e-08, "loss": 0.2254, "step": 5081 }, { "epoch": 2.790774299835255, "grad_norm": 0.41773198087942504, "learning_rate": 1.229549137549126e-08, "loss": 0.213, "step": 5082 }, { "epoch": 2.7913234486545853, "grad_norm": 0.5638604884188275, "learning_rate": 1.2231816900951653e-08, "loss": 0.2837, "step": 5083 }, { "epoch": 2.7918725974739154, "grad_norm": 0.509037426183908, "learning_rate": 1.2168307047077094e-08, "loss": 0.2397, "step": 5084 }, { "epoch": 2.7924217462932455, "grad_norm": 0.5549802363272108, "learning_rate": 1.2104961835299689e-08, "loss": 0.2278, "step": 5085 }, { "epoch": 2.7929708951125756, "grad_norm": 0.5385808016168128, "learning_rate": 1.2041781286996138e-08, "loss": 0.2289, "step": 5086 }, { "epoch": 2.7935200439319057, "grad_norm": 0.7038234325910342, "learning_rate": 1.1978765423487586e-08, "loss": 0.2425, "step": 5087 }, { "epoch": 2.7940691927512358, "grad_norm": 0.5143360461942912, "learning_rate": 1.191591426603956e-08, "loss": 0.2363, "step": 5088 }, { "epoch": 2.7946183415705654, "grad_norm": 0.5331821026711184, "learning_rate": 1.1853227835861976e-08, "loss": 0.2439, "step": 5089 }, { "epoch": 2.7951674903898955, "grad_norm": 0.5170135375498504, "learning_rate": 1.1790706154109234e-08, "loss": 0.2341, "step": 5090 }, { "epoch": 2.7957166392092256, "grad_norm": 0.4862284155175943, "learning_rate": 1.1728349241880123e-08, "loss": 0.2743, "step": 5091 }, { "epoch": 2.7962657880285557, "grad_norm": 0.5751915011904329, "learning_rate": 1.1666157120217651e-08, "loss": 0.2706, "step": 5092 }, { "epoch": 2.796814936847886, "grad_norm": 0.5345444367044516, "learning_rate": 1.1604129810109481e-08, "loss": 0.2102, "step": 5093 }, { "epoch": 2.797364085667216, "grad_norm": 0.5799472044951631, "learning_rate": 1.1542267332487668e-08, "loss": 0.2369, "step": 5094 }, { "epoch": 2.797913234486546, "grad_norm": 0.49714572662451373, "learning_rate": 1.1480569708228254e-08, "loss": 0.2141, "step": 5095 }, { "epoch": 2.7984623833058757, "grad_norm": 0.5109252523184267, "learning_rate": 1.1419036958152165e-08, "loss": 0.2874, "step": 5096 }, { "epoch": 2.799011532125206, "grad_norm": 0.45288324118364204, "learning_rate": 1.1357669103024216e-08, "loss": 0.2588, "step": 5097 }, { "epoch": 2.799560680944536, "grad_norm": 0.5092594797856386, "learning_rate": 1.1296466163553988e-08, "loss": 0.2456, "step": 5098 }, { "epoch": 2.800109829763866, "grad_norm": 0.4782421323555091, "learning_rate": 1.1235428160395114e-08, "loss": 0.2413, "step": 5099 }, { "epoch": 2.800658978583196, "grad_norm": 0.44850993785107024, "learning_rate": 1.117455511414567e-08, "loss": 0.2488, "step": 5100 }, { "epoch": 2.8012081274025262, "grad_norm": 0.5974547932396546, "learning_rate": 1.1113847045348108e-08, "loss": 0.2592, "step": 5101 }, { "epoch": 2.8017572762218563, "grad_norm": 0.5366719466201852, "learning_rate": 1.1053303974489162e-08, "loss": 0.2407, "step": 5102 }, { "epoch": 2.802306425041186, "grad_norm": 0.4650713852780395, "learning_rate": 1.0992925921999825e-08, "loss": 0.2102, "step": 5103 }, { "epoch": 2.802855573860516, "grad_norm": 0.5356028625999021, "learning_rate": 1.0932712908255544e-08, "loss": 0.2376, "step": 5104 }, { "epoch": 2.803404722679846, "grad_norm": 0.4684839282275153, "learning_rate": 1.0872664953575974e-08, "loss": 0.2205, "step": 5105 }, { "epoch": 2.8039538714991763, "grad_norm": 0.568385380648378, "learning_rate": 1.081278207822504e-08, "loss": 0.234, "step": 5106 }, { "epoch": 2.8045030203185064, "grad_norm": 0.513683362551124, "learning_rate": 1.0753064302411115e-08, "loss": 0.2525, "step": 5107 }, { "epoch": 2.8050521691378365, "grad_norm": 0.45882880464069214, "learning_rate": 1.0693511646286613e-08, "loss": 0.2002, "step": 5108 }, { "epoch": 2.8056013179571666, "grad_norm": 0.6073397496848427, "learning_rate": 1.0634124129948396e-08, "loss": 0.2695, "step": 5109 }, { "epoch": 2.8061504667764963, "grad_norm": 0.6560840743103814, "learning_rate": 1.0574901773437643e-08, "loss": 0.2688, "step": 5110 }, { "epoch": 2.8066996155958264, "grad_norm": 0.5966242545045192, "learning_rate": 1.0515844596739651e-08, "loss": 0.2585, "step": 5111 }, { "epoch": 2.8072487644151565, "grad_norm": 0.48852742749094524, "learning_rate": 1.0456952619784088e-08, "loss": 0.2221, "step": 5112 }, { "epoch": 2.8077979132344866, "grad_norm": 0.44052206580350833, "learning_rate": 1.0398225862444739e-08, "loss": 0.2456, "step": 5113 }, { "epoch": 2.8083470620538167, "grad_norm": 0.5049084401514934, "learning_rate": 1.0339664344539766e-08, "loss": 0.2245, "step": 5114 }, { "epoch": 2.808896210873147, "grad_norm": 0.4220144410224526, "learning_rate": 1.028126808583155e-08, "loss": 0.2492, "step": 5115 }, { "epoch": 2.809445359692477, "grad_norm": 0.4793281934307426, "learning_rate": 1.0223037106026686e-08, "loss": 0.2518, "step": 5116 }, { "epoch": 2.8099945085118065, "grad_norm": 0.425684564299101, "learning_rate": 1.0164971424775879e-08, "loss": 0.246, "step": 5117 }, { "epoch": 2.8105436573311366, "grad_norm": 0.6096867001019597, "learning_rate": 1.0107071061674325e-08, "loss": 0.2541, "step": 5118 }, { "epoch": 2.8110928061504667, "grad_norm": 0.6863964964000587, "learning_rate": 1.0049336036261162e-08, "loss": 0.2042, "step": 5119 }, { "epoch": 2.811641954969797, "grad_norm": 0.5684445230142705, "learning_rate": 9.991766368019859e-09, "loss": 0.2201, "step": 5120 }, { "epoch": 2.812191103789127, "grad_norm": 0.5427930601044975, "learning_rate": 9.934362076377976e-09, "loss": 0.2834, "step": 5121 }, { "epoch": 2.8127402526084566, "grad_norm": 0.4504687041744458, "learning_rate": 9.877123180707477e-09, "loss": 0.2201, "step": 5122 }, { "epoch": 2.813289401427787, "grad_norm": 0.5794480512805189, "learning_rate": 9.820049700324308e-09, "loss": 0.2797, "step": 5123 }, { "epoch": 2.813838550247117, "grad_norm": 0.6344775579376868, "learning_rate": 9.763141654488688e-09, "loss": 0.2744, "step": 5124 }, { "epoch": 2.814387699066447, "grad_norm": 0.45111611782949923, "learning_rate": 9.706399062404945e-09, "loss": 0.2302, "step": 5125 }, { "epoch": 2.814936847885777, "grad_norm": 0.5022222076626245, "learning_rate": 9.64982194322168e-09, "loss": 0.2261, "step": 5126 }, { "epoch": 2.815485996705107, "grad_norm": 0.38561816912725905, "learning_rate": 9.593410316031485e-09, "loss": 0.2407, "step": 5127 }, { "epoch": 2.8160351455244372, "grad_norm": 0.42179197608128055, "learning_rate": 9.537164199871341e-09, "loss": 0.2216, "step": 5128 }, { "epoch": 2.816584294343767, "grad_norm": 0.49903212646268086, "learning_rate": 9.481083613722161e-09, "loss": 0.22, "step": 5129 }, { "epoch": 2.8171334431630974, "grad_norm": 0.5064329215527926, "learning_rate": 9.42516857650908e-09, "loss": 0.1969, "step": 5130 }, { "epoch": 2.817682591982427, "grad_norm": 0.4645411896184064, "learning_rate": 9.36941910710134e-09, "loss": 0.2273, "step": 5131 }, { "epoch": 2.818231740801757, "grad_norm": 0.5125041272620647, "learning_rate": 9.313835224312397e-09, "loss": 0.2264, "step": 5132 }, { "epoch": 2.8187808896210873, "grad_norm": 0.5487711188668993, "learning_rate": 9.258416946899701e-09, "loss": 0.2339, "step": 5133 }, { "epoch": 2.8193300384404174, "grad_norm": 0.5111112345675954, "learning_rate": 9.203164293564921e-09, "loss": 0.235, "step": 5134 }, { "epoch": 2.8198791872597475, "grad_norm": 1.0373292015721507, "learning_rate": 9.148077282953776e-09, "loss": 0.3876, "step": 5135 }, { "epoch": 2.820428336079077, "grad_norm": 0.44961380427179864, "learning_rate": 9.093155933656145e-09, "loss": 0.2364, "step": 5136 }, { "epoch": 2.8209774848984077, "grad_norm": 0.5491565368016308, "learning_rate": 9.03840026420585e-09, "loss": 0.243, "step": 5137 }, { "epoch": 2.8215266337177374, "grad_norm": 0.5864151393028695, "learning_rate": 8.98381029308098e-09, "loss": 0.2186, "step": 5138 }, { "epoch": 2.8220757825370675, "grad_norm": 0.4546646981435038, "learning_rate": 8.929386038703735e-09, "loss": 0.2529, "step": 5139 }, { "epoch": 2.8226249313563976, "grad_norm": 0.4695773663748892, "learning_rate": 8.875127519440138e-09, "loss": 0.2486, "step": 5140 }, { "epoch": 2.8231740801757277, "grad_norm": 0.4959900955842116, "learning_rate": 8.821034753600491e-09, "loss": 0.2158, "step": 5141 }, { "epoch": 2.823723228995058, "grad_norm": 0.4701457465721218, "learning_rate": 8.767107759439197e-09, "loss": 0.2252, "step": 5142 }, { "epoch": 2.8242723778143874, "grad_norm": 0.6477452503977166, "learning_rate": 8.713346555154487e-09, "loss": 0.2757, "step": 5143 }, { "epoch": 2.824821526633718, "grad_norm": 0.41619768764836373, "learning_rate": 8.65975115888881e-09, "loss": 0.264, "step": 5144 }, { "epoch": 2.8253706754530477, "grad_norm": 0.6052383289254648, "learning_rate": 8.606321588728778e-09, "loss": 0.251, "step": 5145 }, { "epoch": 2.8259198242723778, "grad_norm": 0.44323423460706884, "learning_rate": 8.553057862704778e-09, "loss": 0.248, "step": 5146 }, { "epoch": 2.826468973091708, "grad_norm": 0.47973459658217676, "learning_rate": 8.49995999879135e-09, "loss": 0.249, "step": 5147 }, { "epoch": 2.827018121911038, "grad_norm": 0.5730193448824772, "learning_rate": 8.447028014907146e-09, "loss": 0.2459, "step": 5148 }, { "epoch": 2.827567270730368, "grad_norm": 0.5590022429117438, "learning_rate": 8.394261928914704e-09, "loss": 0.2612, "step": 5149 }, { "epoch": 2.8281164195496977, "grad_norm": 0.4392625661269473, "learning_rate": 8.341661758620605e-09, "loss": 0.2654, "step": 5150 }, { "epoch": 2.8286655683690283, "grad_norm": 0.43699389023299984, "learning_rate": 8.289227521775598e-09, "loss": 0.2323, "step": 5151 }, { "epoch": 2.829214717188358, "grad_norm": 0.5271568542571415, "learning_rate": 8.236959236074144e-09, "loss": 0.2578, "step": 5152 }, { "epoch": 2.829763866007688, "grad_norm": 0.4365781226150285, "learning_rate": 8.184856919154981e-09, "loss": 0.2124, "step": 5153 }, { "epoch": 2.830313014827018, "grad_norm": 0.6021411368425027, "learning_rate": 8.13292058860062e-09, "loss": 0.2532, "step": 5154 }, { "epoch": 2.8308621636463482, "grad_norm": 0.6312329693230005, "learning_rate": 8.081150261937727e-09, "loss": 0.2678, "step": 5155 }, { "epoch": 2.8314113124656783, "grad_norm": 0.5529506375957167, "learning_rate": 8.029545956636968e-09, "loss": 0.2134, "step": 5156 }, { "epoch": 2.831960461285008, "grad_norm": 0.3854981662451646, "learning_rate": 7.978107690112728e-09, "loss": 0.219, "step": 5157 }, { "epoch": 2.8325096101043385, "grad_norm": 0.5161062666307481, "learning_rate": 7.926835479723551e-09, "loss": 0.2371, "step": 5158 }, { "epoch": 2.833058758923668, "grad_norm": 0.6265592085402263, "learning_rate": 7.875729342772029e-09, "loss": 0.2865, "step": 5159 }, { "epoch": 2.8336079077429983, "grad_norm": 0.4370056793815818, "learning_rate": 7.824789296504473e-09, "loss": 0.2469, "step": 5160 }, { "epoch": 2.8341570565623284, "grad_norm": 0.4343710910615937, "learning_rate": 7.77401535811136e-09, "loss": 0.2178, "step": 5161 }, { "epoch": 2.8347062053816585, "grad_norm": 0.4462909038215947, "learning_rate": 7.723407544727042e-09, "loss": 0.248, "step": 5162 }, { "epoch": 2.8352553542009886, "grad_norm": 0.43241709322446226, "learning_rate": 7.672965873429762e-09, "loss": 0.2569, "step": 5163 }, { "epoch": 2.8358045030203183, "grad_norm": 0.48607888310258496, "learning_rate": 7.622690361241644e-09, "loss": 0.2353, "step": 5164 }, { "epoch": 2.8363536518396484, "grad_norm": 0.4679786023983939, "learning_rate": 7.572581025128918e-09, "loss": 0.2598, "step": 5165 }, { "epoch": 2.8369028006589785, "grad_norm": 0.47306260662503785, "learning_rate": 7.522637882001586e-09, "loss": 0.1903, "step": 5166 }, { "epoch": 2.8374519494783086, "grad_norm": 0.5209212113475357, "learning_rate": 7.472860948713706e-09, "loss": 0.2371, "step": 5167 }, { "epoch": 2.8380010982976387, "grad_norm": 0.5137039714508882, "learning_rate": 7.423250242063043e-09, "loss": 0.2726, "step": 5168 }, { "epoch": 2.838550247116969, "grad_norm": 0.5080395898226963, "learning_rate": 7.373805778791421e-09, "loss": 0.2178, "step": 5169 }, { "epoch": 2.839099395936299, "grad_norm": 0.5986375283067724, "learning_rate": 7.324527575584545e-09, "loss": 0.2125, "step": 5170 }, { "epoch": 2.8396485447556286, "grad_norm": 0.44142299629293513, "learning_rate": 7.275415649072003e-09, "loss": 0.2792, "step": 5171 }, { "epoch": 2.8401976935749587, "grad_norm": 0.5074345653459532, "learning_rate": 7.226470015827213e-09, "loss": 0.2225, "step": 5172 }, { "epoch": 2.8407468423942888, "grad_norm": 0.41965187581038155, "learning_rate": 7.177690692367642e-09, "loss": 0.2467, "step": 5173 }, { "epoch": 2.841295991213619, "grad_norm": 0.5594912581937511, "learning_rate": 7.129077695154363e-09, "loss": 0.2053, "step": 5174 }, { "epoch": 2.841845140032949, "grad_norm": 0.4893265077401131, "learning_rate": 7.080631040592558e-09, "loss": 0.2622, "step": 5175 }, { "epoch": 2.842394288852279, "grad_norm": 0.44796188098679735, "learning_rate": 7.032350745031178e-09, "loss": 0.2637, "step": 5176 }, { "epoch": 2.842943437671609, "grad_norm": 0.4946980795451846, "learning_rate": 6.984236824762947e-09, "loss": 0.2512, "step": 5177 }, { "epoch": 2.843492586490939, "grad_norm": 0.5015533849238657, "learning_rate": 6.936289296024699e-09, "loss": 0.2311, "step": 5178 }, { "epoch": 2.844041735310269, "grad_norm": 0.4574381704209699, "learning_rate": 6.888508174996869e-09, "loss": 0.2662, "step": 5179 }, { "epoch": 2.844590884129599, "grad_norm": 0.47537805319925774, "learning_rate": 6.84089347780389e-09, "loss": 0.194, "step": 5180 }, { "epoch": 2.845140032948929, "grad_norm": 0.47433976554770146, "learning_rate": 6.793445220513857e-09, "loss": 0.2446, "step": 5181 }, { "epoch": 2.8456891817682592, "grad_norm": 0.5038632496012994, "learning_rate": 6.746163419138859e-09, "loss": 0.2279, "step": 5182 }, { "epoch": 2.8462383305875893, "grad_norm": 0.5248906413486438, "learning_rate": 6.699048089634812e-09, "loss": 0.2114, "step": 5183 }, { "epoch": 2.8467874794069195, "grad_norm": 0.5556392854844175, "learning_rate": 6.652099247901408e-09, "loss": 0.2398, "step": 5184 }, { "epoch": 2.847336628226249, "grad_norm": 0.4166224075120004, "learning_rate": 6.6053169097819976e-09, "loss": 0.241, "step": 5185 }, { "epoch": 2.847885777045579, "grad_norm": 0.4531971961694838, "learning_rate": 6.5587010910640944e-09, "loss": 0.26, "step": 5186 }, { "epoch": 2.8484349258649093, "grad_norm": 0.6555291808433016, "learning_rate": 6.512251807478707e-09, "loss": 0.2331, "step": 5187 }, { "epoch": 2.8489840746842394, "grad_norm": 0.47184881092750675, "learning_rate": 6.465969074700783e-09, "loss": 0.2368, "step": 5188 }, { "epoch": 2.8495332235035695, "grad_norm": 0.5672029845419879, "learning_rate": 6.419852908349045e-09, "loss": 0.227, "step": 5189 }, { "epoch": 2.8500823723228996, "grad_norm": 0.7479417978816865, "learning_rate": 6.373903323986041e-09, "loss": 0.3025, "step": 5190 }, { "epoch": 2.8506315211422297, "grad_norm": 0.5762568137503958, "learning_rate": 6.328120337117926e-09, "loss": 0.2682, "step": 5191 }, { "epoch": 2.8511806699615594, "grad_norm": 0.42865316317321245, "learning_rate": 6.282503963194905e-09, "loss": 0.2351, "step": 5192 }, { "epoch": 2.8517298187808895, "grad_norm": 0.46576659606969445, "learning_rate": 6.237054217610793e-09, "loss": 0.2239, "step": 5193 }, { "epoch": 2.8522789676002196, "grad_norm": 0.48579155495994175, "learning_rate": 6.191771115703173e-09, "loss": 0.2353, "step": 5194 }, { "epoch": 2.8528281164195497, "grad_norm": 0.5154199884907745, "learning_rate": 6.1466546727534584e-09, "loss": 0.1927, "step": 5195 }, { "epoch": 2.85337726523888, "grad_norm": 0.5854245460626881, "learning_rate": 6.10170490398678e-09, "loss": 0.2426, "step": 5196 }, { "epoch": 2.85392641405821, "grad_norm": 0.48522677530422514, "learning_rate": 6.056921824572041e-09, "loss": 0.2515, "step": 5197 }, { "epoch": 2.85447556287754, "grad_norm": 0.5063666991587294, "learning_rate": 6.012305449621807e-09, "loss": 0.2549, "step": 5198 }, { "epoch": 2.8550247116968697, "grad_norm": 0.44931415321483326, "learning_rate": 5.967855794192527e-09, "loss": 0.226, "step": 5199 }, { "epoch": 2.8555738605161998, "grad_norm": 0.5187289047383389, "learning_rate": 5.923572873284312e-09, "loss": 0.2443, "step": 5200 }, { "epoch": 2.8555738605161998, "eval_loss": 0.3211120069026947, "eval_runtime": 18.6592, "eval_samples_per_second": 23.742, "eval_steps_per_second": 1.018, "step": 5200 }, { "epoch": 2.85612300933553, "grad_norm": 0.4356780313848142, "learning_rate": 5.879456701841045e-09, "loss": 0.2338, "step": 5201 }, { "epoch": 2.85667215815486, "grad_norm": 0.4729804241065775, "learning_rate": 5.835507294750273e-09, "loss": 0.2306, "step": 5202 }, { "epoch": 2.85722130697419, "grad_norm": 0.5913903408707895, "learning_rate": 5.791724666843257e-09, "loss": 0.2687, "step": 5203 }, { "epoch": 2.85777045579352, "grad_norm": 0.5222692190776277, "learning_rate": 5.748108832895146e-09, "loss": 0.2295, "step": 5204 }, { "epoch": 2.8583196046128503, "grad_norm": 0.4548972054394463, "learning_rate": 5.7046598076245244e-09, "loss": 0.2636, "step": 5205 }, { "epoch": 2.85886875343218, "grad_norm": 0.4720671987178587, "learning_rate": 5.6613776056939745e-09, "loss": 0.2416, "step": 5206 }, { "epoch": 2.85941790225151, "grad_norm": 0.48273738052174386, "learning_rate": 5.618262241709573e-09, "loss": 0.228, "step": 5207 }, { "epoch": 2.85996705107084, "grad_norm": 0.4910771773545165, "learning_rate": 5.5753137302211125e-09, "loss": 0.2379, "step": 5208 }, { "epoch": 2.8605161998901703, "grad_norm": 0.6368279043837103, "learning_rate": 5.532532085722273e-09, "loss": 0.2521, "step": 5209 }, { "epoch": 2.8610653487095004, "grad_norm": 0.5468498494618178, "learning_rate": 5.489917322650171e-09, "loss": 0.3134, "step": 5210 }, { "epoch": 2.8616144975288305, "grad_norm": 0.4652726132785175, "learning_rate": 5.447469455385808e-09, "loss": 0.2261, "step": 5211 }, { "epoch": 2.8621636463481606, "grad_norm": 0.5331307806031966, "learning_rate": 5.405188498253681e-09, "loss": 0.219, "step": 5212 }, { "epoch": 2.86271279516749, "grad_norm": 0.434683907900199, "learning_rate": 5.363074465522114e-09, "loss": 0.2381, "step": 5213 }, { "epoch": 2.8632619439868203, "grad_norm": 0.4553219692348751, "learning_rate": 5.3211273714030405e-09, "loss": 0.2248, "step": 5214 }, { "epoch": 2.8638110928061504, "grad_norm": 0.655680698395531, "learning_rate": 5.279347230051996e-09, "loss": 0.29, "step": 5215 }, { "epoch": 2.8643602416254805, "grad_norm": 0.49243342600511175, "learning_rate": 5.237734055568348e-09, "loss": 0.2578, "step": 5216 }, { "epoch": 2.8649093904448106, "grad_norm": 0.48390289176616463, "learning_rate": 5.196287861994901e-09, "loss": 0.2237, "step": 5217 }, { "epoch": 2.8654585392641407, "grad_norm": 0.4683730805540779, "learning_rate": 5.1550086633183465e-09, "loss": 0.2286, "step": 5218 }, { "epoch": 2.866007688083471, "grad_norm": 0.5621852104697815, "learning_rate": 5.113896473468756e-09, "loss": 0.2229, "step": 5219 }, { "epoch": 2.8665568369028005, "grad_norm": 0.5331983882056304, "learning_rate": 5.072951306320034e-09, "loss": 0.24, "step": 5220 }, { "epoch": 2.8671059857221306, "grad_norm": 0.6198025759490249, "learning_rate": 5.0321731756896885e-09, "loss": 0.2228, "step": 5221 }, { "epoch": 2.8676551345414607, "grad_norm": 0.7360590524626794, "learning_rate": 4.9915620953388334e-09, "loss": 0.3034, "step": 5222 }, { "epoch": 2.868204283360791, "grad_norm": 0.492364856135592, "learning_rate": 4.951118078972136e-09, "loss": 0.2484, "step": 5223 }, { "epoch": 2.868753432180121, "grad_norm": 0.4368884719871482, "learning_rate": 4.910841140238088e-09, "loss": 0.2185, "step": 5224 }, { "epoch": 2.869302580999451, "grad_norm": 0.43826539849700014, "learning_rate": 4.87073129272857e-09, "loss": 0.2332, "step": 5225 }, { "epoch": 2.869851729818781, "grad_norm": 0.4752569116872085, "learning_rate": 4.830788549979233e-09, "loss": 0.2132, "step": 5226 }, { "epoch": 2.8704008786381108, "grad_norm": 0.545272461513866, "learning_rate": 4.791012925469222e-09, "loss": 0.2975, "step": 5227 }, { "epoch": 2.870950027457441, "grad_norm": 0.4536068451358327, "learning_rate": 4.751404432621458e-09, "loss": 0.2453, "step": 5228 }, { "epoch": 2.871499176276771, "grad_norm": 0.4674250392154084, "learning_rate": 4.711963084802242e-09, "loss": 0.2436, "step": 5229 }, { "epoch": 2.872048325096101, "grad_norm": 0.4300388940438271, "learning_rate": 4.672688895321542e-09, "loss": 0.2482, "step": 5230 }, { "epoch": 2.872597473915431, "grad_norm": 0.5174537296424483, "learning_rate": 4.6335818774330945e-09, "loss": 0.2727, "step": 5231 }, { "epoch": 2.873146622734761, "grad_norm": 0.4495897095752114, "learning_rate": 4.594642044333968e-09, "loss": 0.2072, "step": 5232 }, { "epoch": 2.8736957715540914, "grad_norm": 0.4201167333912245, "learning_rate": 4.555869409165001e-09, "loss": 0.2301, "step": 5233 }, { "epoch": 2.874244920373421, "grad_norm": 0.4939895954420247, "learning_rate": 4.517263985010474e-09, "loss": 0.2516, "step": 5234 }, { "epoch": 2.874794069192751, "grad_norm": 0.46910303554956784, "learning_rate": 4.478825784898272e-09, "loss": 0.2701, "step": 5235 }, { "epoch": 2.8753432180120813, "grad_norm": 0.5231717538910857, "learning_rate": 4.440554821799888e-09, "loss": 0.2188, "step": 5236 }, { "epoch": 2.8758923668314114, "grad_norm": 0.5638130487731225, "learning_rate": 4.4024511086304205e-09, "loss": 0.2512, "step": 5237 }, { "epoch": 2.8764415156507415, "grad_norm": 0.5738573389283852, "learning_rate": 4.364514658248407e-09, "loss": 0.2369, "step": 5238 }, { "epoch": 2.876990664470071, "grad_norm": 0.5430747641736359, "learning_rate": 4.326745483456049e-09, "loss": 0.2439, "step": 5239 }, { "epoch": 2.8775398132894017, "grad_norm": 0.4179974217392526, "learning_rate": 4.289143596998986e-09, "loss": 0.2554, "step": 5240 }, { "epoch": 2.8780889621087313, "grad_norm": 0.625009120713705, "learning_rate": 4.251709011566518e-09, "loss": 0.2426, "step": 5241 }, { "epoch": 2.8786381109280614, "grad_norm": 0.42479498454573944, "learning_rate": 4.21444173979139e-09, "loss": 0.2449, "step": 5242 }, { "epoch": 2.8791872597473915, "grad_norm": 0.4408017173415403, "learning_rate": 4.1773417942500026e-09, "loss": 0.2601, "step": 5243 }, { "epoch": 2.8797364085667216, "grad_norm": 0.4641781400012835, "learning_rate": 4.140409187462199e-09, "loss": 0.243, "step": 5244 }, { "epoch": 2.8802855573860517, "grad_norm": 0.5404804876410245, "learning_rate": 4.103643931891375e-09, "loss": 0.2539, "step": 5245 }, { "epoch": 2.8808347062053814, "grad_norm": 0.4398911418933329, "learning_rate": 4.0670460399444185e-09, "loss": 0.2414, "step": 5246 }, { "epoch": 2.881383855024712, "grad_norm": 0.6007207135260024, "learning_rate": 4.03061552397177e-09, "loss": 0.2614, "step": 5247 }, { "epoch": 2.8819330038440416, "grad_norm": 0.49386818141118705, "learning_rate": 3.99435239626742e-09, "loss": 0.254, "step": 5248 }, { "epoch": 2.8824821526633717, "grad_norm": 0.5382579653456148, "learning_rate": 3.9582566690688e-09, "loss": 0.2482, "step": 5249 }, { "epoch": 2.883031301482702, "grad_norm": 0.508703902057793, "learning_rate": 3.922328354556945e-09, "loss": 0.295, "step": 5250 }, { "epoch": 2.883580450302032, "grad_norm": 0.48735695810931234, "learning_rate": 3.8865674648562785e-09, "loss": 0.2378, "step": 5251 }, { "epoch": 2.884129599121362, "grad_norm": 0.45761388527643787, "learning_rate": 3.850974012034828e-09, "loss": 0.2377, "step": 5252 }, { "epoch": 2.8846787479406917, "grad_norm": 0.43958557401283843, "learning_rate": 3.815548008104059e-09, "loss": 0.2174, "step": 5253 }, { "epoch": 2.8852278967600222, "grad_norm": 0.6071425380859866, "learning_rate": 3.780289465018936e-09, "loss": 0.2486, "step": 5254 }, { "epoch": 2.885777045579352, "grad_norm": 0.5656931004916198, "learning_rate": 3.745198394677916e-09, "loss": 0.2536, "step": 5255 }, { "epoch": 2.886326194398682, "grad_norm": 0.4685328657362597, "learning_rate": 3.710274808922953e-09, "loss": 0.2456, "step": 5256 }, { "epoch": 2.886875343218012, "grad_norm": 0.5569862188447046, "learning_rate": 3.675518719539439e-09, "loss": 0.2704, "step": 5257 }, { "epoch": 2.887424492037342, "grad_norm": 0.79886758059532, "learning_rate": 3.640930138256318e-09, "loss": 0.2211, "step": 5258 }, { "epoch": 2.8879736408566723, "grad_norm": 0.47210806019191215, "learning_rate": 3.6065090767459197e-09, "loss": 0.2271, "step": 5259 }, { "epoch": 2.888522789676002, "grad_norm": 0.46928450115782855, "learning_rate": 3.5722555466241213e-09, "loss": 0.2901, "step": 5260 }, { "epoch": 2.8890719384953325, "grad_norm": 0.5182909785627313, "learning_rate": 3.538169559450187e-09, "loss": 0.2143, "step": 5261 }, { "epoch": 2.889621087314662, "grad_norm": 0.6880657165622727, "learning_rate": 3.5042511267269857e-09, "loss": 0.2961, "step": 5262 }, { "epoch": 2.8901702361339923, "grad_norm": 0.4684435778749125, "learning_rate": 3.4705002599005496e-09, "loss": 0.2606, "step": 5263 }, { "epoch": 2.8907193849533224, "grad_norm": 0.3767475174721522, "learning_rate": 3.4369169703607394e-09, "loss": 0.2466, "step": 5264 }, { "epoch": 2.8912685337726525, "grad_norm": 0.48682173607596296, "learning_rate": 3.4035012694405767e-09, "loss": 0.2294, "step": 5265 }, { "epoch": 2.8918176825919826, "grad_norm": 0.4782334731625126, "learning_rate": 3.37025316841669e-09, "loss": 0.2215, "step": 5266 }, { "epoch": 2.8923668314113122, "grad_norm": 0.5218409631403885, "learning_rate": 3.3371726785089825e-09, "loss": 0.3074, "step": 5267 }, { "epoch": 2.892915980230643, "grad_norm": 0.5223572433231088, "learning_rate": 3.3042598108810174e-09, "loss": 0.2396, "step": 5268 }, { "epoch": 2.8934651290499724, "grad_norm": 0.4511075987528436, "learning_rate": 3.2715145766396314e-09, "loss": 0.2688, "step": 5269 }, { "epoch": 2.8940142778693025, "grad_norm": 0.49306569493730634, "learning_rate": 3.238936986835102e-09, "loss": 0.2477, "step": 5270 }, { "epoch": 2.8945634266886326, "grad_norm": 0.854825782672018, "learning_rate": 3.206527052461256e-09, "loss": 0.2889, "step": 5271 }, { "epoch": 2.8951125755079627, "grad_norm": 0.5040888931704502, "learning_rate": 3.17428478445514e-09, "loss": 0.2456, "step": 5272 }, { "epoch": 2.895661724327293, "grad_norm": 0.4161732614453351, "learning_rate": 3.142210193697405e-09, "loss": 0.2215, "step": 5273 }, { "epoch": 2.8962108731466225, "grad_norm": 0.45647722085267894, "learning_rate": 3.110303291012032e-09, "loss": 0.2411, "step": 5274 }, { "epoch": 2.8967600219659526, "grad_norm": 0.4040992265484478, "learning_rate": 3.0785640871663857e-09, "loss": 0.2398, "step": 5275 }, { "epoch": 2.8973091707852827, "grad_norm": 0.49017555006118085, "learning_rate": 3.046992592871326e-09, "loss": 0.244, "step": 5276 }, { "epoch": 2.897858319604613, "grad_norm": 0.41436795929762377, "learning_rate": 3.0155888187810423e-09, "loss": 0.2488, "step": 5277 }, { "epoch": 2.898407468423943, "grad_norm": 0.5219745909152449, "learning_rate": 2.984352775493163e-09, "loss": 0.2186, "step": 5278 }, { "epoch": 2.898956617243273, "grad_norm": 0.5995917479438265, "learning_rate": 2.9532844735487014e-09, "loss": 0.2542, "step": 5279 }, { "epoch": 2.899505766062603, "grad_norm": 0.4648572986862078, "learning_rate": 2.9223839234320543e-09, "loss": 0.2378, "step": 5280 }, { "epoch": 2.900054914881933, "grad_norm": 0.5131382993261625, "learning_rate": 2.8916511355710593e-09, "loss": 0.2543, "step": 5281 }, { "epoch": 2.900604063701263, "grad_norm": 0.4087016748140403, "learning_rate": 2.8610861203368265e-09, "loss": 0.2228, "step": 5282 }, { "epoch": 2.901153212520593, "grad_norm": 0.5752035960309544, "learning_rate": 2.830688888044017e-09, "loss": 0.2354, "step": 5283 }, { "epoch": 2.901702361339923, "grad_norm": 0.5201602121251301, "learning_rate": 2.800459448950454e-09, "loss": 0.2732, "step": 5284 }, { "epoch": 2.902251510159253, "grad_norm": 0.515008303074687, "learning_rate": 2.7703978132575686e-09, "loss": 0.2281, "step": 5285 }, { "epoch": 2.9028006589785833, "grad_norm": 0.6286910979688483, "learning_rate": 2.7405039911100623e-09, "loss": 0.2515, "step": 5286 }, { "epoch": 2.9033498077979134, "grad_norm": 0.42100068784436384, "learning_rate": 2.7107779925958565e-09, "loss": 0.2528, "step": 5287 }, { "epoch": 2.903898956617243, "grad_norm": 0.5000170402301612, "learning_rate": 2.681219827746534e-09, "loss": 0.2467, "step": 5288 }, { "epoch": 2.904448105436573, "grad_norm": 0.5015005286663535, "learning_rate": 2.6518295065368392e-09, "loss": 0.2214, "step": 5289 }, { "epoch": 2.9049972542559033, "grad_norm": 0.5516268805332989, "learning_rate": 2.6226070388849018e-09, "loss": 0.2841, "step": 5290 }, { "epoch": 2.9055464030752334, "grad_norm": 0.4799330343518949, "learning_rate": 2.5935524346522344e-09, "loss": 0.2156, "step": 5291 }, { "epoch": 2.9060955518945635, "grad_norm": 0.564539473009308, "learning_rate": 2.5646657036436804e-09, "loss": 0.2218, "step": 5292 }, { "epoch": 2.9066447007138936, "grad_norm": 0.45759523777299965, "learning_rate": 2.535946855607465e-09, "loss": 0.2541, "step": 5293 }, { "epoch": 2.9071938495332237, "grad_norm": 0.47557954608166375, "learning_rate": 2.5073959002351453e-09, "loss": 0.2108, "step": 5294 }, { "epoch": 2.9077429983525533, "grad_norm": 0.600252301057463, "learning_rate": 2.4790128471616594e-09, "loss": 0.2275, "step": 5295 }, { "epoch": 2.9082921471718834, "grad_norm": 0.539653506121455, "learning_rate": 2.450797705965165e-09, "loss": 0.2452, "step": 5296 }, { "epoch": 2.9088412959912135, "grad_norm": 0.506845474128641, "learning_rate": 2.422750486167259e-09, "loss": 0.2324, "step": 5297 }, { "epoch": 2.9093904448105437, "grad_norm": 0.5778777368689596, "learning_rate": 2.3948711972328683e-09, "loss": 0.225, "step": 5298 }, { "epoch": 2.9099395936298738, "grad_norm": 0.6547914944740025, "learning_rate": 2.3671598485701913e-09, "loss": 0.262, "step": 5299 }, { "epoch": 2.910488742449204, "grad_norm": 0.4033743365202709, "learning_rate": 2.339616449530812e-09, "loss": 0.2658, "step": 5300 }, { "epoch": 2.911037891268534, "grad_norm": 0.5500268912798821, "learning_rate": 2.3122410094095866e-09, "loss": 0.2284, "step": 5301 }, { "epoch": 2.9115870400878636, "grad_norm": 0.47516686863573443, "learning_rate": 2.2850335374447568e-09, "loss": 0.262, "step": 5302 }, { "epoch": 2.9121361889071937, "grad_norm": 0.4006868376595554, "learning_rate": 2.2579940428178914e-09, "loss": 0.2529, "step": 5303 }, { "epoch": 2.912685337726524, "grad_norm": 0.4182770021858877, "learning_rate": 2.231122534653669e-09, "loss": 0.2517, "step": 5304 }, { "epoch": 2.913234486545854, "grad_norm": 0.5726155030716732, "learning_rate": 2.204419022020315e-09, "loss": 0.2315, "step": 5305 }, { "epoch": 2.913783635365184, "grad_norm": 0.5175496325072361, "learning_rate": 2.177883513929275e-09, "loss": 0.2644, "step": 5306 }, { "epoch": 2.914332784184514, "grad_norm": 0.5520635462430347, "learning_rate": 2.1515160193353795e-09, "loss": 0.2209, "step": 5307 }, { "epoch": 2.9148819330038442, "grad_norm": 0.4582595150714232, "learning_rate": 2.1253165471366196e-09, "loss": 0.2216, "step": 5308 }, { "epoch": 2.915431081823174, "grad_norm": 0.4609734252941549, "learning_rate": 2.09928510617426e-09, "loss": 0.232, "step": 5309 }, { "epoch": 2.915980230642504, "grad_norm": 0.5773175947309279, "learning_rate": 2.0734217052331174e-09, "loss": 0.2552, "step": 5310 }, { "epoch": 2.916529379461834, "grad_norm": 0.553018146901683, "learning_rate": 2.047726353041058e-09, "loss": 0.2463, "step": 5311 }, { "epoch": 2.917078528281164, "grad_norm": 0.6332939218859734, "learning_rate": 2.022199058269333e-09, "loss": 0.2438, "step": 5312 }, { "epoch": 2.9176276771004943, "grad_norm": 0.5307046797534954, "learning_rate": 1.996839829532468e-09, "loss": 0.2615, "step": 5313 }, { "epoch": 2.9181768259198244, "grad_norm": 0.4597024076512655, "learning_rate": 1.971648675388205e-09, "loss": 0.2437, "step": 5314 }, { "epoch": 2.9187259747391545, "grad_norm": 0.9276273131298172, "learning_rate": 1.9466256043377262e-09, "loss": 0.4039, "step": 5315 }, { "epoch": 2.919275123558484, "grad_norm": 0.41942749930038187, "learning_rate": 1.9217706248253204e-09, "loss": 0.2713, "step": 5316 }, { "epoch": 2.9198242723778143, "grad_norm": 0.49392360761391607, "learning_rate": 1.8970837452387163e-09, "loss": 0.198, "step": 5317 }, { "epoch": 2.9203734211971444, "grad_norm": 0.42301468682106363, "learning_rate": 1.8725649739086943e-09, "loss": 0.2352, "step": 5318 }, { "epoch": 2.9209225700164745, "grad_norm": 0.705122697236137, "learning_rate": 1.848214319109585e-09, "loss": 0.2526, "step": 5319 }, { "epoch": 2.9214717188358046, "grad_norm": 0.4710410879886511, "learning_rate": 1.8240317890587149e-09, "loss": 0.2419, "step": 5320 }, { "epoch": 2.9220208676551347, "grad_norm": 0.5144669771139706, "learning_rate": 1.8000173919168504e-09, "loss": 0.2751, "step": 5321 }, { "epoch": 2.922570016474465, "grad_norm": 1.1071583960987983, "learning_rate": 1.7761711357879758e-09, "loss": 0.2273, "step": 5322 }, { "epoch": 2.9231191652937945, "grad_norm": 0.647335194357053, "learning_rate": 1.7524930287192929e-09, "loss": 0.2837, "step": 5323 }, { "epoch": 2.9236683141131246, "grad_norm": 0.47598229370191525, "learning_rate": 1.7289830787013325e-09, "loss": 0.2265, "step": 5324 }, { "epoch": 2.9242174629324547, "grad_norm": 0.45097958659181114, "learning_rate": 1.7056412936677872e-09, "loss": 0.1854, "step": 5325 }, { "epoch": 2.9247666117517848, "grad_norm": 0.4765231236317673, "learning_rate": 1.6824676814956791e-09, "loss": 0.2178, "step": 5326 }, { "epoch": 2.925315760571115, "grad_norm": 0.49280710773740866, "learning_rate": 1.6594622500052475e-09, "loss": 0.2813, "step": 5327 }, { "epoch": 2.925864909390445, "grad_norm": 0.5348249205068328, "learning_rate": 1.6366250069599498e-09, "loss": 0.2464, "step": 5328 }, { "epoch": 2.926414058209775, "grad_norm": 0.5824599596592727, "learning_rate": 1.613955960066572e-09, "loss": 0.2457, "step": 5329 }, { "epoch": 2.9269632070291047, "grad_norm": 0.4996542857281626, "learning_rate": 1.5914551169750625e-09, "loss": 0.2104, "step": 5330 }, { "epoch": 2.927512355848435, "grad_norm": 0.6019038115414923, "learning_rate": 1.5691224852785316e-09, "loss": 0.2946, "step": 5331 }, { "epoch": 2.928061504667765, "grad_norm": 0.5368071410699573, "learning_rate": 1.5469580725135855e-09, "loss": 0.2063, "step": 5332 }, { "epoch": 2.928610653487095, "grad_norm": 0.4104528413960223, "learning_rate": 1.5249618861597704e-09, "loss": 0.2092, "step": 5333 }, { "epoch": 2.929159802306425, "grad_norm": 0.5481307128206689, "learning_rate": 1.5031339336400719e-09, "loss": 0.2481, "step": 5334 }, { "epoch": 2.9297089511257552, "grad_norm": 0.6195316128156438, "learning_rate": 1.4814742223205277e-09, "loss": 0.2454, "step": 5335 }, { "epoch": 2.9302580999450853, "grad_norm": 0.4388701773887708, "learning_rate": 1.4599827595105583e-09, "loss": 0.2154, "step": 5336 }, { "epoch": 2.930807248764415, "grad_norm": 0.6195599802824301, "learning_rate": 1.4386595524627486e-09, "loss": 0.262, "step": 5337 }, { "epoch": 2.931356397583745, "grad_norm": 0.5677726356063226, "learning_rate": 1.4175046083727883e-09, "loss": 0.3016, "step": 5338 }, { "epoch": 2.931905546403075, "grad_norm": 0.711785780606575, "learning_rate": 1.3965179343798634e-09, "loss": 0.2849, "step": 5339 }, { "epoch": 2.9324546952224053, "grad_norm": 0.4883250974044079, "learning_rate": 1.3756995375661e-09, "loss": 0.2707, "step": 5340 }, { "epoch": 2.9330038440417354, "grad_norm": 0.7298872247144134, "learning_rate": 1.3550494249569531e-09, "loss": 0.301, "step": 5341 }, { "epoch": 2.933552992861065, "grad_norm": 0.39579149600023295, "learning_rate": 1.3345676035210394e-09, "loss": 0.2489, "step": 5342 }, { "epoch": 2.9341021416803956, "grad_norm": 0.5112549009565359, "learning_rate": 1.314254080170305e-09, "loss": 0.2474, "step": 5343 }, { "epoch": 2.9346512904997253, "grad_norm": 0.5988858834115693, "learning_rate": 1.2941088617597468e-09, "loss": 0.2832, "step": 5344 }, { "epoch": 2.9352004393190554, "grad_norm": 0.44626959377511644, "learning_rate": 1.2741319550876356e-09, "loss": 0.2406, "step": 5345 }, { "epoch": 2.9357495881383855, "grad_norm": 0.5456852666544563, "learning_rate": 1.2543233668954592e-09, "loss": 0.2234, "step": 5346 }, { "epoch": 2.9362987369577156, "grad_norm": 0.5285905083209635, "learning_rate": 1.2346831038679237e-09, "loss": 0.2182, "step": 5347 }, { "epoch": 2.9368478857770457, "grad_norm": 0.5516886804108452, "learning_rate": 1.2152111726328417e-09, "loss": 0.2238, "step": 5348 }, { "epoch": 2.9373970345963754, "grad_norm": 0.44982741627536693, "learning_rate": 1.1959075797612995e-09, "loss": 0.231, "step": 5349 }, { "epoch": 2.937946183415706, "grad_norm": 0.4385594919487259, "learning_rate": 1.1767723317675449e-09, "loss": 0.2459, "step": 5350 }, { "epoch": 2.9384953322350356, "grad_norm": 0.4824086952091381, "learning_rate": 1.1578054351089883e-09, "loss": 0.24, "step": 5351 }, { "epoch": 2.9390444810543657, "grad_norm": 0.49371922829875875, "learning_rate": 1.139006896186314e-09, "loss": 0.2177, "step": 5352 }, { "epoch": 2.9395936298736958, "grad_norm": 0.5908334223515511, "learning_rate": 1.120376721343257e-09, "loss": 0.261, "step": 5353 }, { "epoch": 2.940142778693026, "grad_norm": 0.5430466896372764, "learning_rate": 1.1019149168668805e-09, "loss": 0.2551, "step": 5354 }, { "epoch": 2.940691927512356, "grad_norm": 0.46858179857506943, "learning_rate": 1.0836214889873558e-09, "loss": 0.2213, "step": 5355 }, { "epoch": 2.9412410763316856, "grad_norm": 0.531236223816229, "learning_rate": 1.0654964438780154e-09, "loss": 0.2956, "step": 5356 }, { "epoch": 2.941790225151016, "grad_norm": 0.5043324015511236, "learning_rate": 1.0475397876554099e-09, "loss": 0.2337, "step": 5357 }, { "epoch": 2.942339373970346, "grad_norm": 0.7289246305700283, "learning_rate": 1.0297515263792525e-09, "loss": 0.3336, "step": 5358 }, { "epoch": 2.942888522789676, "grad_norm": 0.5025708339484115, "learning_rate": 1.012131666052418e-09, "loss": 0.2392, "step": 5359 }, { "epoch": 2.943437671609006, "grad_norm": 0.4126505758546958, "learning_rate": 9.946802126209991e-10, "loss": 0.2592, "step": 5360 }, { "epoch": 2.943986820428336, "grad_norm": 1.0160954561595026, "learning_rate": 9.773971719741405e-10, "loss": 0.3708, "step": 5361 }, { "epoch": 2.9445359692476663, "grad_norm": 0.4190700814338857, "learning_rate": 9.60282549944315e-10, "loss": 0.2281, "step": 5362 }, { "epoch": 2.945085118066996, "grad_norm": 0.5413465484879393, "learning_rate": 9.433363523069913e-10, "loss": 0.2217, "step": 5363 }, { "epoch": 2.9456342668863265, "grad_norm": 0.4508947023507633, "learning_rate": 9.265585847809667e-10, "loss": 0.2756, "step": 5364 }, { "epoch": 2.946183415705656, "grad_norm": 0.4116309943780954, "learning_rate": 9.099492530280901e-10, "loss": 0.2338, "step": 5365 }, { "epoch": 2.946732564524986, "grad_norm": 0.43662711559505313, "learning_rate": 8.935083626533724e-10, "loss": 0.2256, "step": 5366 }, { "epoch": 2.9472817133443163, "grad_norm": 0.433677854986825, "learning_rate": 8.772359192050425e-10, "loss": 0.2379, "step": 5367 }, { "epoch": 2.9478308621636464, "grad_norm": 0.5270756740004231, "learning_rate": 8.61131928174436e-10, "loss": 0.2605, "step": 5368 }, { "epoch": 2.9483800109829765, "grad_norm": 0.41128946656622806, "learning_rate": 8.451963949960503e-10, "loss": 0.2445, "step": 5369 }, { "epoch": 2.948929159802306, "grad_norm": 0.549588929045845, "learning_rate": 8.29429325047546e-10, "loss": 0.2407, "step": 5370 }, { "epoch": 2.9494783086216367, "grad_norm": 0.5311475453128123, "learning_rate": 8.138307236497451e-10, "loss": 0.2704, "step": 5371 }, { "epoch": 2.9500274574409664, "grad_norm": 0.43109602672986197, "learning_rate": 7.984005960665768e-10, "loss": 0.2389, "step": 5372 }, { "epoch": 2.9505766062602965, "grad_norm": 0.45978128551749675, "learning_rate": 7.831389475051324e-10, "loss": 0.2389, "step": 5373 }, { "epoch": 2.9511257550796266, "grad_norm": 0.6105337029030464, "learning_rate": 7.680457831157212e-10, "loss": 0.2266, "step": 5374 }, { "epoch": 2.9516749038989567, "grad_norm": 0.5054672779083297, "learning_rate": 7.531211079916478e-10, "loss": 0.3019, "step": 5375 }, { "epoch": 2.952224052718287, "grad_norm": 0.5098243123439421, "learning_rate": 7.383649271694906e-10, "loss": 0.2555, "step": 5376 }, { "epoch": 2.9527732015376165, "grad_norm": 0.6067122886391104, "learning_rate": 7.237772456289344e-10, "loss": 0.2438, "step": 5377 }, { "epoch": 2.9533223503569466, "grad_norm": 0.4820703844854332, "learning_rate": 7.093580682927153e-10, "loss": 0.287, "step": 5378 }, { "epoch": 2.9538714991762767, "grad_norm": 0.4872872216892676, "learning_rate": 6.951074000268429e-10, "loss": 0.2279, "step": 5379 }, { "epoch": 2.9544206479956068, "grad_norm": 0.4419070346717495, "learning_rate": 6.810252456402668e-10, "loss": 0.232, "step": 5380 }, { "epoch": 2.954969796814937, "grad_norm": 0.5222928239887484, "learning_rate": 6.671116098853765e-10, "loss": 0.2712, "step": 5381 }, { "epoch": 2.955518945634267, "grad_norm": 0.4310593677149048, "learning_rate": 6.533664974574463e-10, "loss": 0.2354, "step": 5382 }, { "epoch": 2.956068094453597, "grad_norm": 0.6106742740490751, "learning_rate": 6.39789912994857e-10, "loss": 0.2462, "step": 5383 }, { "epoch": 2.9566172432729267, "grad_norm": 0.5288878997167294, "learning_rate": 6.263818610793185e-10, "loss": 0.2288, "step": 5384 }, { "epoch": 2.957166392092257, "grad_norm": 0.5188054735539555, "learning_rate": 6.131423462354807e-10, "loss": 0.2496, "step": 5385 }, { "epoch": 2.957715540911587, "grad_norm": 0.6016465096683022, "learning_rate": 6.000713729312669e-10, "loss": 0.2408, "step": 5386 }, { "epoch": 2.958264689730917, "grad_norm": 0.45833913941640814, "learning_rate": 5.871689455775407e-10, "loss": 0.2286, "step": 5387 }, { "epoch": 2.958813838550247, "grad_norm": 0.5788060494446572, "learning_rate": 5.744350685284945e-10, "loss": 0.2309, "step": 5388 }, { "epoch": 2.9593629873695773, "grad_norm": 0.5773948669030117, "learning_rate": 5.618697460812607e-10, "loss": 0.2024, "step": 5389 }, { "epoch": 2.9599121361889074, "grad_norm": 0.5058151656907947, "learning_rate": 5.49472982476245e-10, "loss": 0.2331, "step": 5390 }, { "epoch": 2.960461285008237, "grad_norm": 0.48565391987867707, "learning_rate": 5.372447818969048e-10, "loss": 0.2125, "step": 5391 }, { "epoch": 2.961010433827567, "grad_norm": 0.4691528494583722, "learning_rate": 5.251851484697482e-10, "loss": 0.2675, "step": 5392 }, { "epoch": 2.9615595826468972, "grad_norm": 0.47768103784053717, "learning_rate": 5.132940862645014e-10, "loss": 0.2371, "step": 5393 }, { "epoch": 2.9621087314662273, "grad_norm": 0.5011758210488948, "learning_rate": 5.015715992938862e-10, "loss": 0.2815, "step": 5394 }, { "epoch": 2.9626578802855574, "grad_norm": 0.46673237808343554, "learning_rate": 4.900176915138978e-10, "loss": 0.2251, "step": 5395 }, { "epoch": 2.9632070291048875, "grad_norm": 0.4869314221382234, "learning_rate": 4.78632366823527e-10, "loss": 0.2385, "step": 5396 }, { "epoch": 2.9637561779242176, "grad_norm": 0.4594908547625924, "learning_rate": 4.674156290648715e-10, "loss": 0.2872, "step": 5397 }, { "epoch": 2.9643053267435473, "grad_norm": 0.7221207710622318, "learning_rate": 4.5636748202324665e-10, "loss": 0.271, "step": 5398 }, { "epoch": 2.9648544755628774, "grad_norm": 0.5245786423910327, "learning_rate": 4.454879294269082e-10, "loss": 0.2346, "step": 5399 }, { "epoch": 2.9654036243822075, "grad_norm": 0.5321972370728846, "learning_rate": 4.347769749473295e-10, "loss": 0.2411, "step": 5400 }, { "epoch": 2.9654036243822075, "eval_loss": 0.32108238339424133, "eval_runtime": 18.6592, "eval_samples_per_second": 23.742, "eval_steps_per_second": 1.018, "step": 5400 }, { "epoch": 2.9659527732015376, "grad_norm": 0.5372673578850995, "learning_rate": 4.242346221990907e-10, "loss": 0.2522, "step": 5401 }, { "epoch": 2.9665019220208677, "grad_norm": 0.5118229484855749, "learning_rate": 4.138608747397679e-10, "loss": 0.2382, "step": 5402 }, { "epoch": 2.967051070840198, "grad_norm": 0.39504372632116935, "learning_rate": 4.036557360702101e-10, "loss": 0.261, "step": 5403 }, { "epoch": 2.967600219659528, "grad_norm": 0.6050279800274624, "learning_rate": 3.936192096342625e-10, "loss": 0.2334, "step": 5404 }, { "epoch": 2.9681493684788576, "grad_norm": 0.503231039382592, "learning_rate": 3.8375129881887655e-10, "loss": 0.3018, "step": 5405 }, { "epoch": 2.9686985172981877, "grad_norm": 0.5061476020056557, "learning_rate": 3.7405200695405543e-10, "loss": 0.265, "step": 5406 }, { "epoch": 2.969247666117518, "grad_norm": 0.41268696690069995, "learning_rate": 3.6452133731296424e-10, "loss": 0.2574, "step": 5407 }, { "epoch": 2.969796814936848, "grad_norm": 0.5559459249027656, "learning_rate": 3.551592931119306e-10, "loss": 0.2364, "step": 5408 }, { "epoch": 2.970345963756178, "grad_norm": 0.4708951284931678, "learning_rate": 3.459658775102777e-10, "loss": 0.2235, "step": 5409 }, { "epoch": 2.970895112575508, "grad_norm": 0.43836193133485246, "learning_rate": 3.3694109361038e-10, "loss": 0.2226, "step": 5410 }, { "epoch": 2.971444261394838, "grad_norm": 0.45345598934573245, "learning_rate": 3.2808494445777445e-10, "loss": 0.2423, "step": 5411 }, { "epoch": 2.971993410214168, "grad_norm": 0.5636412706721464, "learning_rate": 3.1939743304115985e-10, "loss": 0.2559, "step": 5412 }, { "epoch": 2.972542559033498, "grad_norm": 0.5566678569225637, "learning_rate": 3.108785622922312e-10, "loss": 0.2434, "step": 5413 }, { "epoch": 2.973091707852828, "grad_norm": 0.5277425997918919, "learning_rate": 3.025283350857343e-10, "loss": 0.2488, "step": 5414 }, { "epoch": 2.973640856672158, "grad_norm": 0.6297224928796673, "learning_rate": 2.943467542395777e-10, "loss": 0.2976, "step": 5415 }, { "epoch": 2.9741900054914883, "grad_norm": 0.47669669313177326, "learning_rate": 2.863338225147764e-10, "loss": 0.2329, "step": 5416 }, { "epoch": 2.9747391543108184, "grad_norm": 0.377701911436147, "learning_rate": 2.784895426154521e-10, "loss": 0.2584, "step": 5417 }, { "epoch": 2.9752883031301485, "grad_norm": 0.4869183662491141, "learning_rate": 2.708139171886115e-10, "loss": 0.2677, "step": 5418 }, { "epoch": 2.975837451949478, "grad_norm": 0.48160842403637133, "learning_rate": 2.633069488246455e-10, "loss": 0.2625, "step": 5419 }, { "epoch": 2.9763866007688082, "grad_norm": 0.5093281624528752, "learning_rate": 2.55968640056774e-10, "loss": 0.2349, "step": 5420 }, { "epoch": 2.9769357495881383, "grad_norm": 0.47985234535343113, "learning_rate": 2.4879899336149043e-10, "loss": 0.2292, "step": 5421 }, { "epoch": 2.9774848984074684, "grad_norm": 0.5538210297161699, "learning_rate": 2.41798011158173e-10, "loss": 0.2728, "step": 5422 }, { "epoch": 2.9780340472267985, "grad_norm": 0.5006779000318572, "learning_rate": 2.3496569580952844e-10, "loss": 0.2183, "step": 5423 }, { "epoch": 2.9785831960461286, "grad_norm": 0.4906868336720107, "learning_rate": 2.2830204962109292e-10, "loss": 0.2279, "step": 5424 }, { "epoch": 2.9791323448654587, "grad_norm": 0.52380493762853, "learning_rate": 2.2180707484173138e-10, "loss": 0.2103, "step": 5425 }, { "epoch": 2.9796814936847884, "grad_norm": 0.576734213999862, "learning_rate": 2.1548077366313787e-10, "loss": 0.2219, "step": 5426 }, { "epoch": 2.9802306425041185, "grad_norm": 0.4097445894891241, "learning_rate": 2.0932314822022437e-10, "loss": 0.2575, "step": 5427 }, { "epoch": 2.9807797913234486, "grad_norm": 0.4435487158984122, "learning_rate": 2.0333420059100946e-10, "loss": 0.2221, "step": 5428 }, { "epoch": 2.9813289401427787, "grad_norm": 0.4513000854850718, "learning_rate": 1.9751393279656308e-10, "loss": 0.2174, "step": 5429 }, { "epoch": 2.981878088962109, "grad_norm": 0.4882059343892691, "learning_rate": 1.9186234680095092e-10, "loss": 0.2244, "step": 5430 }, { "epoch": 2.982427237781439, "grad_norm": 0.4302162963690885, "learning_rate": 1.8637944451134534e-10, "loss": 0.2715, "step": 5431 }, { "epoch": 2.982976386600769, "grad_norm": 0.4127305885423929, "learning_rate": 1.8106522777813655e-10, "loss": 0.2175, "step": 5432 }, { "epoch": 2.9835255354200987, "grad_norm": 0.5042552345075726, "learning_rate": 1.759196983945995e-10, "loss": 0.2755, "step": 5433 }, { "epoch": 2.984074684239429, "grad_norm": 0.4309896134196142, "learning_rate": 1.709428580971714e-10, "loss": 0.2271, "step": 5434 }, { "epoch": 2.984623833058759, "grad_norm": 0.501523243959343, "learning_rate": 1.6613470856534062e-10, "loss": 0.2139, "step": 5435 }, { "epoch": 2.985172981878089, "grad_norm": 0.45430856558279076, "learning_rate": 1.614952514217025e-10, "loss": 0.2179, "step": 5436 }, { "epoch": 2.985722130697419, "grad_norm": 0.448038516367569, "learning_rate": 1.570244882319034e-10, "loss": 0.2204, "step": 5437 }, { "epoch": 2.986271279516749, "grad_norm": 0.46684865429880856, "learning_rate": 1.5272242050469654e-10, "loss": 0.215, "step": 5438 }, { "epoch": 2.9868204283360793, "grad_norm": 0.40657311519974626, "learning_rate": 1.4858904969177543e-10, "loss": 0.2358, "step": 5439 }, { "epoch": 2.987369577155409, "grad_norm": 0.5485519256556726, "learning_rate": 1.4462437718805127e-10, "loss": 0.2383, "step": 5440 }, { "epoch": 2.987918725974739, "grad_norm": 0.44300657576882274, "learning_rate": 1.4082840433148658e-10, "loss": 0.2133, "step": 5441 }, { "epoch": 2.988467874794069, "grad_norm": 0.5494934140873944, "learning_rate": 1.3720113240303957e-10, "loss": 0.2677, "step": 5442 }, { "epoch": 2.9890170236133993, "grad_norm": 0.5026753476498617, "learning_rate": 1.3374256262677531e-10, "loss": 0.2858, "step": 5443 }, { "epoch": 2.9895661724327294, "grad_norm": 0.48019756442962847, "learning_rate": 1.3045269616986547e-10, "loss": 0.2692, "step": 5444 }, { "epoch": 2.990115321252059, "grad_norm": 0.5169277919369355, "learning_rate": 1.273315341425331e-10, "loss": 0.2319, "step": 5445 }, { "epoch": 2.9906644700713896, "grad_norm": 0.5128898974397927, "learning_rate": 1.2437907759799695e-10, "loss": 0.2847, "step": 5446 }, { "epoch": 2.9912136188907192, "grad_norm": 0.5158609434391226, "learning_rate": 1.2159532753263808e-10, "loss": 0.2099, "step": 5447 }, { "epoch": 2.9917627677100493, "grad_norm": 0.4850771477341244, "learning_rate": 1.1898028488588877e-10, "loss": 0.2401, "step": 5448 }, { "epoch": 2.9923119165293794, "grad_norm": 0.5003383483499431, "learning_rate": 1.1653395054017702e-10, "loss": 0.2297, "step": 5449 }, { "epoch": 2.9928610653487095, "grad_norm": 0.47327459402472116, "learning_rate": 1.1425632532109314e-10, "loss": 0.2774, "step": 5450 }, { "epoch": 2.9934102141680397, "grad_norm": 0.4863256277213203, "learning_rate": 1.1214740999727867e-10, "loss": 0.227, "step": 5451 }, { "epoch": 2.9939593629873693, "grad_norm": 0.557741307639221, "learning_rate": 1.1020720528031541e-10, "loss": 0.2357, "step": 5452 }, { "epoch": 2.9945085118067, "grad_norm": 0.5276162873231401, "learning_rate": 1.0843571182505844e-10, "loss": 0.2546, "step": 5453 }, { "epoch": 2.9950576606260295, "grad_norm": 0.5619183504200205, "learning_rate": 1.0683293022924757e-10, "loss": 0.2548, "step": 5454 }, { "epoch": 2.9956068094453596, "grad_norm": 0.5550248691562255, "learning_rate": 1.0539886103378486e-10, "loss": 0.2579, "step": 5455 }, { "epoch": 2.9961559582646897, "grad_norm": 0.5795932679671043, "learning_rate": 1.0413350472262365e-10, "loss": 0.24, "step": 5456 }, { "epoch": 2.99670510708402, "grad_norm": 0.5454017885384628, "learning_rate": 1.030368617227685e-10, "loss": 0.256, "step": 5457 }, { "epoch": 2.99725425590335, "grad_norm": 0.5076033763426103, "learning_rate": 1.0210893240427526e-10, "loss": 0.2691, "step": 5458 }, { "epoch": 2.9978034047226796, "grad_norm": 0.5388251427263286, "learning_rate": 1.0134971708036203e-10, "loss": 0.2499, "step": 5459 }, { "epoch": 2.99835255354201, "grad_norm": 0.5294517063931046, "learning_rate": 1.0075921600713165e-10, "loss": 0.2446, "step": 5460 }, { "epoch": 2.99890170236134, "grad_norm": 0.45327376502590566, "learning_rate": 1.0033742938396025e-10, "loss": 0.2124, "step": 5461 }, { "epoch": 2.99945085118067, "grad_norm": 0.5478266139464804, "learning_rate": 1.0008435735310866e-10, "loss": 0.2811, "step": 5462 }, { "epoch": 3.0, "grad_norm": 0.5357517534829501, "learning_rate": 1e-10, "loss": 0.2212, "step": 5463 }, { "epoch": 3.0, "step": 5463, "total_flos": 1421688221401088.0, "train_loss": 0.2704946962895246, "train_runtime": 20433.0885, "train_samples_per_second": 6.416, "train_steps_per_second": 0.267 } ], "logging_steps": 1.0, "max_steps": 5463, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "total_flos": 1421688221401088.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }