{ "best_global_step": 31000, "best_metric": 0.7226839661598206, "best_model_checkpoint": "./ar-diffusion-checkpoints-fixed/checkpoint-31000", "epoch": 2.999769248519345, "eval_steps": 250, "global_step": 39000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003845858010922237, "grad_norm": 11.687983512878418, "learning_rate": 1.84e-05, "loss": 8.7035, "step": 50 }, { "epoch": 0.007691716021844474, "grad_norm": 2.6404802799224854, "learning_rate": 3.8400000000000005e-05, "loss": 2.7335, "step": 100 }, { "epoch": 0.01153757403276671, "grad_norm": 2.068481683731079, "learning_rate": 5.8399999999999997e-05, "loss": 2.0457, "step": 150 }, { "epoch": 0.015383432043688947, "grad_norm": 2.60369610786438, "learning_rate": 7.840000000000001e-05, "loss": 1.8505, "step": 200 }, { "epoch": 0.019229290054611183, "grad_norm": 1.6515765190124512, "learning_rate": 9.84e-05, "loss": 1.8158, "step": 250 }, { "epoch": 0.019229290054611183, "eval_loss": 1.8082822561264038, "eval_runtime": 18.1351, "eval_samples_per_second": 55.142, "eval_steps_per_second": 13.785, "step": 250 }, { "epoch": 0.02307514806553342, "grad_norm": 2.1014769077301025, "learning_rate": 0.0001184, "loss": 1.65, "step": 300 }, { "epoch": 0.02692100607645566, "grad_norm": 1.5384572744369507, "learning_rate": 0.0001384, "loss": 1.5594, "step": 350 }, { "epoch": 0.030766864087377895, "grad_norm": 1.4865778684616089, "learning_rate": 0.00015840000000000003, "loss": 1.6723, "step": 400 }, { "epoch": 0.03461272209830013, "grad_norm": 1.0966717004776, "learning_rate": 0.0001784, "loss": 1.602, "step": 450 }, { "epoch": 0.038458580109222366, "grad_norm": 1.6298224925994873, "learning_rate": 0.0001984, "loss": 1.5659, "step": 500 }, { "epoch": 0.038458580109222366, "eval_loss": 1.6280817985534668, "eval_runtime": 18.1181, "eval_samples_per_second": 55.194, "eval_steps_per_second": 13.798, "step": 500 }, { "epoch": 0.0423044381201446, "grad_norm": 1.2955571413040161, "learning_rate": 0.00019976105757992885, "loss": 1.6388, "step": 550 }, { "epoch": 0.04615029613106684, "grad_norm": 1.2672921419143677, "learning_rate": 0.00019950133755811236, "loss": 1.6029, "step": 600 }, { "epoch": 0.04999615414198908, "grad_norm": 1.6246057748794556, "learning_rate": 0.00019924161753629587, "loss": 1.4845, "step": 650 }, { "epoch": 0.05384201215291132, "grad_norm": 1.0235854387283325, "learning_rate": 0.0001989818975144794, "loss": 1.5184, "step": 700 }, { "epoch": 0.05768787016383355, "grad_norm": 1.5333527326583862, "learning_rate": 0.0001987221774926629, "loss": 1.6136, "step": 750 }, { "epoch": 0.05768787016383355, "eval_loss": 1.5622245073318481, "eval_runtime": 18.2497, "eval_samples_per_second": 54.795, "eval_steps_per_second": 13.699, "step": 750 }, { "epoch": 0.06153372817475579, "grad_norm": 1.1980785131454468, "learning_rate": 0.00019846245747084644, "loss": 1.571, "step": 800 }, { "epoch": 0.06537958618567802, "grad_norm": 1.6081124544143677, "learning_rate": 0.00019820273744902995, "loss": 1.5542, "step": 850 }, { "epoch": 0.06922544419660026, "grad_norm": 1.2620774507522583, "learning_rate": 0.0001979430174272135, "loss": 1.5513, "step": 900 }, { "epoch": 0.0730713022075225, "grad_norm": 0.8911245465278625, "learning_rate": 0.000197683297405397, "loss": 1.4516, "step": 950 }, { "epoch": 0.07691716021844473, "grad_norm": 1.2976778745651245, "learning_rate": 0.0001974235773835805, "loss": 1.5757, "step": 1000 }, { "epoch": 0.07691716021844473, "eval_loss": 1.5385061502456665, "eval_runtime": 18.1057, "eval_samples_per_second": 55.231, "eval_steps_per_second": 13.808, "step": 1000 }, { "epoch": 0.08076301822936698, "grad_norm": 1.451479196548462, "learning_rate": 0.00019716385736176403, "loss": 1.4287, "step": 1050 }, { "epoch": 0.0846088762402892, "grad_norm": 1.0982426404953003, "learning_rate": 0.00019690413733994754, "loss": 1.4962, "step": 1100 }, { "epoch": 0.08845473425121145, "grad_norm": 0.8899670839309692, "learning_rate": 0.00019664441731813106, "loss": 1.4675, "step": 1150 }, { "epoch": 0.09230059226213368, "grad_norm": 1.5194021463394165, "learning_rate": 0.0001963846972963146, "loss": 1.4573, "step": 1200 }, { "epoch": 0.09614645027305592, "grad_norm": 1.716470718383789, "learning_rate": 0.0001961249772744981, "loss": 1.5143, "step": 1250 }, { "epoch": 0.09614645027305592, "eval_loss": 1.5197770595550537, "eval_runtime": 18.2416, "eval_samples_per_second": 54.82, "eval_steps_per_second": 13.705, "step": 1250 }, { "epoch": 0.09999230828397816, "grad_norm": 1.986771583557129, "learning_rate": 0.00019586525725268162, "loss": 1.5265, "step": 1300 }, { "epoch": 0.10383816629490039, "grad_norm": 0.86269211769104, "learning_rate": 0.00019560553723086514, "loss": 1.4842, "step": 1350 }, { "epoch": 0.10768402430582263, "grad_norm": 1.187501072883606, "learning_rate": 0.00019534581720904865, "loss": 1.4054, "step": 1400 }, { "epoch": 0.11152988231674486, "grad_norm": 1.5051347017288208, "learning_rate": 0.00019508609718723216, "loss": 1.4141, "step": 1450 }, { "epoch": 0.1153757403276671, "grad_norm": 1.99917471408844, "learning_rate": 0.0001948263771654157, "loss": 1.4767, "step": 1500 }, { "epoch": 0.1153757403276671, "eval_loss": 1.4949736595153809, "eval_runtime": 18.0402, "eval_samples_per_second": 55.432, "eval_steps_per_second": 13.858, "step": 1500 }, { "epoch": 0.11922159833858934, "grad_norm": 1.6421241760253906, "learning_rate": 0.00019456665714359921, "loss": 1.5029, "step": 1550 }, { "epoch": 0.12306745634951158, "grad_norm": 1.8251460790634155, "learning_rate": 0.00019430693712178273, "loss": 1.4666, "step": 1600 }, { "epoch": 0.12691331436043382, "grad_norm": 1.2284319400787354, "learning_rate": 0.00019404721709996624, "loss": 1.4785, "step": 1650 }, { "epoch": 0.13075917237135604, "grad_norm": 3.7399282455444336, "learning_rate": 0.00019378749707814975, "loss": 1.5027, "step": 1700 }, { "epoch": 0.13460503038227828, "grad_norm": 1.2193188667297363, "learning_rate": 0.0001935277770563333, "loss": 1.4944, "step": 1750 }, { "epoch": 0.13460503038227828, "eval_loss": 1.4724599123001099, "eval_runtime": 18.0061, "eval_samples_per_second": 55.537, "eval_steps_per_second": 13.884, "step": 1750 }, { "epoch": 0.13845088839320052, "grad_norm": 0.5916198492050171, "learning_rate": 0.0001932680570345168, "loss": 1.5119, "step": 1800 }, { "epoch": 0.14229674640412276, "grad_norm": 1.4087570905685425, "learning_rate": 0.00019300833701270032, "loss": 1.3608, "step": 1850 }, { "epoch": 0.146142604415045, "grad_norm": 1.2559338808059692, "learning_rate": 0.00019274861699088386, "loss": 1.4772, "step": 1900 }, { "epoch": 0.14998846242596722, "grad_norm": 0.9022719860076904, "learning_rate": 0.00019248889696906734, "loss": 1.4339, "step": 1950 }, { "epoch": 0.15383432043688947, "grad_norm": 1.2900218963623047, "learning_rate": 0.00019222917694725086, "loss": 1.4612, "step": 2000 }, { "epoch": 0.15383432043688947, "eval_loss": 1.4609016180038452, "eval_runtime": 18.1146, "eval_samples_per_second": 55.204, "eval_steps_per_second": 13.801, "step": 2000 }, { "epoch": 0.1576801784478117, "grad_norm": 0.8418329358100891, "learning_rate": 0.0001919694569254344, "loss": 1.4944, "step": 2050 }, { "epoch": 0.16152603645873395, "grad_norm": 1.538751482963562, "learning_rate": 0.0001917097369036179, "loss": 1.4135, "step": 2100 }, { "epoch": 0.16537189446965617, "grad_norm": 1.3898651599884033, "learning_rate": 0.00019145001688180142, "loss": 1.3683, "step": 2150 }, { "epoch": 0.1692177524805784, "grad_norm": 0.7671115398406982, "learning_rate": 0.00019119029685998496, "loss": 1.365, "step": 2200 }, { "epoch": 0.17306361049150065, "grad_norm": 0.732802152633667, "learning_rate": 0.00019093057683816848, "loss": 1.3213, "step": 2250 }, { "epoch": 0.17306361049150065, "eval_loss": 1.4241567850112915, "eval_runtime": 17.9984, "eval_samples_per_second": 55.56, "eval_steps_per_second": 13.89, "step": 2250 }, { "epoch": 0.1769094685024229, "grad_norm": 1.6236932277679443, "learning_rate": 0.00019067085681635196, "loss": 1.4366, "step": 2300 }, { "epoch": 0.18075532651334514, "grad_norm": 1.3093007802963257, "learning_rate": 0.0001904111367945355, "loss": 1.3468, "step": 2350 }, { "epoch": 0.18460118452426735, "grad_norm": 1.409177303314209, "learning_rate": 0.00019015141677271901, "loss": 1.4066, "step": 2400 }, { "epoch": 0.1884470425351896, "grad_norm": 1.0054073333740234, "learning_rate": 0.00018989169675090253, "loss": 1.4608, "step": 2450 }, { "epoch": 0.19229290054611184, "grad_norm": 1.0325884819030762, "learning_rate": 0.00018963197672908607, "loss": 1.3857, "step": 2500 }, { "epoch": 0.19229290054611184, "eval_loss": 1.4193872213363647, "eval_runtime": 18.115, "eval_samples_per_second": 55.203, "eval_steps_per_second": 13.801, "step": 2500 }, { "epoch": 0.19613875855703408, "grad_norm": 0.7152838110923767, "learning_rate": 0.00018937225670726958, "loss": 1.3343, "step": 2550 }, { "epoch": 0.19998461656795632, "grad_norm": 0.9736573100090027, "learning_rate": 0.0001891125366854531, "loss": 1.3832, "step": 2600 }, { "epoch": 0.20383047457887854, "grad_norm": 0.9278397560119629, "learning_rate": 0.0001888528166636366, "loss": 1.3807, "step": 2650 }, { "epoch": 0.20767633258980078, "grad_norm": 1.8133916854858398, "learning_rate": 0.00018859309664182012, "loss": 1.3844, "step": 2700 }, { "epoch": 0.21152219060072303, "grad_norm": 1.1289211511611938, "learning_rate": 0.00018833337662000366, "loss": 1.2984, "step": 2750 }, { "epoch": 0.21152219060072303, "eval_loss": 1.4020246267318726, "eval_runtime": 18.086, "eval_samples_per_second": 55.291, "eval_steps_per_second": 13.823, "step": 2750 }, { "epoch": 0.21536804861164527, "grad_norm": 1.9358755350112915, "learning_rate": 0.00018807365659818717, "loss": 1.3231, "step": 2800 }, { "epoch": 0.21921390662256748, "grad_norm": 1.453515887260437, "learning_rate": 0.00018781393657637068, "loss": 1.3395, "step": 2850 }, { "epoch": 0.22305976463348973, "grad_norm": 1.423431396484375, "learning_rate": 0.0001875542165545542, "loss": 1.435, "step": 2900 }, { "epoch": 0.22690562264441197, "grad_norm": 0.9964897632598877, "learning_rate": 0.0001872944965327377, "loss": 1.3356, "step": 2950 }, { "epoch": 0.2307514806553342, "grad_norm": 1.5574508905410767, "learning_rate": 0.00018703477651092122, "loss": 1.4102, "step": 3000 }, { "epoch": 0.2307514806553342, "eval_loss": 1.3945672512054443, "eval_runtime": 18.0752, "eval_samples_per_second": 55.324, "eval_steps_per_second": 13.831, "step": 3000 }, { "epoch": 0.23459733866625646, "grad_norm": 1.7693700790405273, "learning_rate": 0.00018677505648910476, "loss": 1.4095, "step": 3050 }, { "epoch": 0.23844319667717867, "grad_norm": 1.0146111249923706, "learning_rate": 0.00018651533646728827, "loss": 1.3416, "step": 3100 }, { "epoch": 0.2422890546881009, "grad_norm": 1.228946566581726, "learning_rate": 0.0001862556164454718, "loss": 1.3326, "step": 3150 }, { "epoch": 0.24613491269902316, "grad_norm": 0.9278371930122375, "learning_rate": 0.0001859958964236553, "loss": 1.4023, "step": 3200 }, { "epoch": 0.2499807707099454, "grad_norm": 1.858821988105774, "learning_rate": 0.0001857361764018388, "loss": 1.3583, "step": 3250 }, { "epoch": 0.2499807707099454, "eval_loss": 1.3779631853103638, "eval_runtime": 18.0487, "eval_samples_per_second": 55.406, "eval_steps_per_second": 13.851, "step": 3250 }, { "epoch": 0.25382662872086764, "grad_norm": 1.0133246183395386, "learning_rate": 0.00018547645638002233, "loss": 1.3258, "step": 3300 }, { "epoch": 0.2576724867317899, "grad_norm": 1.142626166343689, "learning_rate": 0.00018521673635820587, "loss": 1.3315, "step": 3350 }, { "epoch": 0.26151834474271207, "grad_norm": 0.9573944211006165, "learning_rate": 0.00018495701633638938, "loss": 1.2949, "step": 3400 }, { "epoch": 0.2653642027536343, "grad_norm": 0.8417842984199524, "learning_rate": 0.00018469729631457292, "loss": 1.3291, "step": 3450 }, { "epoch": 0.26921006076455656, "grad_norm": 0.8505682945251465, "learning_rate": 0.00018443757629275643, "loss": 1.2611, "step": 3500 }, { "epoch": 0.26921006076455656, "eval_loss": 1.3726494312286377, "eval_runtime": 18.0195, "eval_samples_per_second": 55.495, "eval_steps_per_second": 13.874, "step": 3500 }, { "epoch": 0.2730559187754788, "grad_norm": 1.0631035566329956, "learning_rate": 0.00018417785627093992, "loss": 1.3384, "step": 3550 }, { "epoch": 0.27690177678640104, "grad_norm": 1.1145228147506714, "learning_rate": 0.00018391813624912346, "loss": 1.4159, "step": 3600 }, { "epoch": 0.2807476347973233, "grad_norm": 1.286778450012207, "learning_rate": 0.00018365841622730697, "loss": 1.3372, "step": 3650 }, { "epoch": 0.28459349280824553, "grad_norm": 1.1863288879394531, "learning_rate": 0.00018339869620549048, "loss": 1.2993, "step": 3700 }, { "epoch": 0.2884393508191678, "grad_norm": 1.6189292669296265, "learning_rate": 0.00018313897618367402, "loss": 1.3464, "step": 3750 }, { "epoch": 0.2884393508191678, "eval_loss": 1.3553545475006104, "eval_runtime": 18.0591, "eval_samples_per_second": 55.374, "eval_steps_per_second": 13.843, "step": 3750 }, { "epoch": 0.29228520883009, "grad_norm": 1.4823222160339355, "learning_rate": 0.00018287925616185754, "loss": 1.3392, "step": 3800 }, { "epoch": 0.2961310668410122, "grad_norm": 1.4085184335708618, "learning_rate": 0.00018261953614004105, "loss": 1.2989, "step": 3850 }, { "epoch": 0.29997692485193445, "grad_norm": 1.7249082326889038, "learning_rate": 0.00018235981611822456, "loss": 1.3866, "step": 3900 }, { "epoch": 0.3038227828628567, "grad_norm": 0.9753608107566833, "learning_rate": 0.00018210009609640807, "loss": 1.2916, "step": 3950 }, { "epoch": 0.30766864087377893, "grad_norm": 0.6619511246681213, "learning_rate": 0.0001818403760745916, "loss": 1.2768, "step": 4000 }, { "epoch": 0.30766864087377893, "eval_loss": 1.337461233139038, "eval_runtime": 18.006, "eval_samples_per_second": 55.537, "eval_steps_per_second": 13.884, "step": 4000 }, { "epoch": 0.3115144988847012, "grad_norm": 0.9473676085472107, "learning_rate": 0.00018158065605277513, "loss": 1.3116, "step": 4050 }, { "epoch": 0.3153603568956234, "grad_norm": 1.2772737741470337, "learning_rate": 0.00018132093603095864, "loss": 1.3058, "step": 4100 }, { "epoch": 0.31920621490654566, "grad_norm": 1.7045694589614868, "learning_rate": 0.0001810664104095785, "loss": 1.3398, "step": 4150 }, { "epoch": 0.3230520729174679, "grad_norm": 1.498179316520691, "learning_rate": 0.000180806690387762, "loss": 1.3434, "step": 4200 }, { "epoch": 0.32689793092839015, "grad_norm": 1.5777134895324707, "learning_rate": 0.00018054697036594552, "loss": 1.3437, "step": 4250 }, { "epoch": 0.32689793092839015, "eval_loss": 1.334365963935852, "eval_runtime": 18.026, "eval_samples_per_second": 55.475, "eval_steps_per_second": 13.869, "step": 4250 }, { "epoch": 0.33074378893931233, "grad_norm": 0.7399800419807434, "learning_rate": 0.00018028725034412903, "loss": 1.2932, "step": 4300 }, { "epoch": 0.3345896469502346, "grad_norm": 0.7411991953849792, "learning_rate": 0.00018002753032231257, "loss": 1.2928, "step": 4350 }, { "epoch": 0.3384355049611568, "grad_norm": 1.308003544807434, "learning_rate": 0.00017976781030049608, "loss": 1.3192, "step": 4400 }, { "epoch": 0.34228136297207906, "grad_norm": 1.1857889890670776, "learning_rate": 0.0001795080902786796, "loss": 1.2718, "step": 4450 }, { "epoch": 0.3461272209830013, "grad_norm": 0.5179012417793274, "learning_rate": 0.0001792483702568631, "loss": 1.2513, "step": 4500 }, { "epoch": 0.3461272209830013, "eval_loss": 1.346890926361084, "eval_runtime": 18.086, "eval_samples_per_second": 55.291, "eval_steps_per_second": 13.823, "step": 4500 }, { "epoch": 0.34997307899392355, "grad_norm": 1.2267632484436035, "learning_rate": 0.00017898865023504662, "loss": 1.349, "step": 4550 }, { "epoch": 0.3538189370048458, "grad_norm": 0.9660719037055969, "learning_rate": 0.00017872893021323013, "loss": 1.3194, "step": 4600 }, { "epoch": 0.35766479501576803, "grad_norm": 1.4557528495788574, "learning_rate": 0.00017846921019141367, "loss": 1.3132, "step": 4650 }, { "epoch": 0.3615106530266903, "grad_norm": 0.9239174723625183, "learning_rate": 0.0001782094901695972, "loss": 1.2598, "step": 4700 }, { "epoch": 0.36535651103761246, "grad_norm": 1.1237714290618896, "learning_rate": 0.0001779497701477807, "loss": 1.2506, "step": 4750 }, { "epoch": 0.36535651103761246, "eval_loss": 1.3218908309936523, "eval_runtime": 18.1211, "eval_samples_per_second": 55.184, "eval_steps_per_second": 13.796, "step": 4750 }, { "epoch": 0.3692023690485347, "grad_norm": 1.0127383470535278, "learning_rate": 0.0001776900501259642, "loss": 1.3277, "step": 4800 }, { "epoch": 0.37304822705945695, "grad_norm": 1.1309473514556885, "learning_rate": 0.00017743033010414773, "loss": 1.2991, "step": 4850 }, { "epoch": 0.3768940850703792, "grad_norm": 1.321747899055481, "learning_rate": 0.00017717061008233124, "loss": 1.3213, "step": 4900 }, { "epoch": 0.38073994308130144, "grad_norm": 1.1251367330551147, "learning_rate": 0.00017691089006051478, "loss": 1.2742, "step": 4950 }, { "epoch": 0.3845858010922237, "grad_norm": 1.1043410301208496, "learning_rate": 0.0001766511700386983, "loss": 1.2755, "step": 5000 }, { "epoch": 0.3845858010922237, "eval_loss": 1.3034751415252686, "eval_runtime": 18.0006, "eval_samples_per_second": 55.554, "eval_steps_per_second": 13.888, "step": 5000 }, { "epoch": 0.3884316591031459, "grad_norm": 0.8127657175064087, "learning_rate": 0.00017639145001688183, "loss": 1.3019, "step": 5050 }, { "epoch": 0.39227751711406816, "grad_norm": 0.56494140625, "learning_rate": 0.00017613172999506534, "loss": 1.3148, "step": 5100 }, { "epoch": 0.3961233751249904, "grad_norm": 2.181711435317993, "learning_rate": 0.00017587200997324883, "loss": 1.2761, "step": 5150 }, { "epoch": 0.39996923313591265, "grad_norm": 0.6779603362083435, "learning_rate": 0.00017561228995143237, "loss": 1.3149, "step": 5200 }, { "epoch": 0.40381509114683484, "grad_norm": 0.5844702124595642, "learning_rate": 0.00017535256992961588, "loss": 1.3437, "step": 5250 }, { "epoch": 0.40381509114683484, "eval_loss": 1.2923167943954468, "eval_runtime": 17.9871, "eval_samples_per_second": 55.595, "eval_steps_per_second": 13.899, "step": 5250 }, { "epoch": 0.4076609491577571, "grad_norm": 0.9879493117332458, "learning_rate": 0.0001750928499077994, "loss": 1.2764, "step": 5300 }, { "epoch": 0.4115068071686793, "grad_norm": 1.443860650062561, "learning_rate": 0.00017483312988598293, "loss": 1.3204, "step": 5350 }, { "epoch": 0.41535266517960157, "grad_norm": 0.8753446340560913, "learning_rate": 0.00017457340986416645, "loss": 1.2762, "step": 5400 }, { "epoch": 0.4191985231905238, "grad_norm": 1.2027819156646729, "learning_rate": 0.00017431368984234996, "loss": 1.2097, "step": 5450 }, { "epoch": 0.42304438120144605, "grad_norm": 1.1534991264343262, "learning_rate": 0.00017405396982053347, "loss": 1.2723, "step": 5500 }, { "epoch": 0.42304438120144605, "eval_loss": 1.2931731939315796, "eval_runtime": 17.9258, "eval_samples_per_second": 55.785, "eval_steps_per_second": 13.946, "step": 5500 }, { "epoch": 0.4268902392123683, "grad_norm": 1.0256164073944092, "learning_rate": 0.00017379424979871699, "loss": 1.2313, "step": 5550 }, { "epoch": 0.43073609722329054, "grad_norm": 1.276945948600769, "learning_rate": 0.0001735345297769005, "loss": 1.1982, "step": 5600 }, { "epoch": 0.4345819552342128, "grad_norm": 0.9002663493156433, "learning_rate": 0.00017327480975508404, "loss": 1.2726, "step": 5650 }, { "epoch": 0.43842781324513497, "grad_norm": 1.1424119472503662, "learning_rate": 0.00017301508973326755, "loss": 1.3473, "step": 5700 }, { "epoch": 0.4422736712560572, "grad_norm": 0.6811870336532593, "learning_rate": 0.00017275536971145106, "loss": 1.228, "step": 5750 }, { "epoch": 0.4422736712560572, "eval_loss": 1.2861703634262085, "eval_runtime": 18.0823, "eval_samples_per_second": 55.303, "eval_steps_per_second": 13.826, "step": 5750 }, { "epoch": 0.44611952926697945, "grad_norm": 1.0646696090698242, "learning_rate": 0.00017249564968963458, "loss": 1.2946, "step": 5800 }, { "epoch": 0.4499653872779017, "grad_norm": 1.436909556388855, "learning_rate": 0.0001722359296678181, "loss": 1.3121, "step": 5850 }, { "epoch": 0.45381124528882394, "grad_norm": 0.937135636806488, "learning_rate": 0.00017197620964600163, "loss": 1.2248, "step": 5900 }, { "epoch": 0.4576571032997462, "grad_norm": 0.908935010433197, "learning_rate": 0.00017171648962418514, "loss": 1.308, "step": 5950 }, { "epoch": 0.4615029613106684, "grad_norm": 1.3925087451934814, "learning_rate": 0.00017145676960236866, "loss": 1.3007, "step": 6000 }, { "epoch": 0.4615029613106684, "eval_loss": 1.27406644821167, "eval_runtime": 18.0211, "eval_samples_per_second": 55.49, "eval_steps_per_second": 13.873, "step": 6000 }, { "epoch": 0.46534881932159067, "grad_norm": 1.2292288541793823, "learning_rate": 0.00017119704958055217, "loss": 1.2707, "step": 6050 }, { "epoch": 0.4691946773325129, "grad_norm": 0.8948924541473389, "learning_rate": 0.00017093732955873568, "loss": 1.2481, "step": 6100 }, { "epoch": 0.4730405353434351, "grad_norm": 0.7155699133872986, "learning_rate": 0.0001706776095369192, "loss": 1.2663, "step": 6150 }, { "epoch": 0.47688639335435734, "grad_norm": 0.7100064158439636, "learning_rate": 0.00017041788951510273, "loss": 1.2976, "step": 6200 }, { "epoch": 0.4807322513652796, "grad_norm": 1.3250987529754639, "learning_rate": 0.00017015816949328625, "loss": 1.2368, "step": 6250 }, { "epoch": 0.4807322513652796, "eval_loss": 1.2630703449249268, "eval_runtime": 18.0054, "eval_samples_per_second": 55.539, "eval_steps_per_second": 13.885, "step": 6250 }, { "epoch": 0.4845781093762018, "grad_norm": 0.9060600996017456, "learning_rate": 0.00016989844947146976, "loss": 1.2344, "step": 6300 }, { "epoch": 0.48842396738712407, "grad_norm": 0.8371444940567017, "learning_rate": 0.0001696387294496533, "loss": 1.277, "step": 6350 }, { "epoch": 0.4922698253980463, "grad_norm": 1.2833727598190308, "learning_rate": 0.00016937900942783679, "loss": 1.2941, "step": 6400 }, { "epoch": 0.49611568340896856, "grad_norm": 1.5922775268554688, "learning_rate": 0.0001691192894060203, "loss": 1.2448, "step": 6450 }, { "epoch": 0.4999615414198908, "grad_norm": 0.9083874225616455, "learning_rate": 0.00016885956938420384, "loss": 1.2187, "step": 6500 }, { "epoch": 0.4999615414198908, "eval_loss": 1.2612597942352295, "eval_runtime": 17.9521, "eval_samples_per_second": 55.704, "eval_steps_per_second": 13.926, "step": 6500 }, { "epoch": 0.503807399430813, "grad_norm": 1.3177634477615356, "learning_rate": 0.00016859984936238735, "loss": 1.305, "step": 6550 }, { "epoch": 0.5076532574417353, "grad_norm": 1.8331613540649414, "learning_rate": 0.00016834012934057086, "loss": 1.2468, "step": 6600 }, { "epoch": 0.5114991154526575, "grad_norm": 0.8823532462120056, "learning_rate": 0.0001680804093187544, "loss": 1.265, "step": 6650 }, { "epoch": 0.5153449734635798, "grad_norm": 1.1489806175231934, "learning_rate": 0.00016782068929693792, "loss": 1.1942, "step": 6700 }, { "epoch": 0.519190831474502, "grad_norm": 4.0805816650390625, "learning_rate": 0.00016756096927512143, "loss": 1.2906, "step": 6750 }, { "epoch": 0.519190831474502, "eval_loss": 1.252502202987671, "eval_runtime": 17.9221, "eval_samples_per_second": 55.797, "eval_steps_per_second": 13.949, "step": 6750 }, { "epoch": 0.5230366894854241, "grad_norm": 0.9559470415115356, "learning_rate": 0.00016730124925330494, "loss": 1.1792, "step": 6800 }, { "epoch": 0.5268825474963464, "grad_norm": 2.268700361251831, "learning_rate": 0.00016704152923148846, "loss": 1.2541, "step": 6850 }, { "epoch": 0.5307284055072686, "grad_norm": 1.0873395204544067, "learning_rate": 0.000166781809209672, "loss": 1.2089, "step": 6900 }, { "epoch": 0.5345742635181909, "grad_norm": 0.877153217792511, "learning_rate": 0.0001665220891878555, "loss": 1.2655, "step": 6950 }, { "epoch": 0.5384201215291131, "grad_norm": 1.1317107677459717, "learning_rate": 0.00016626236916603902, "loss": 1.2115, "step": 7000 }, { "epoch": 0.5384201215291131, "eval_loss": 1.247037649154663, "eval_runtime": 17.9248, "eval_samples_per_second": 55.789, "eval_steps_per_second": 13.947, "step": 7000 }, { "epoch": 0.5422659795400354, "grad_norm": 1.1601048707962036, "learning_rate": 0.00016600264914422253, "loss": 1.2494, "step": 7050 }, { "epoch": 0.5461118375509576, "grad_norm": 0.7940592765808105, "learning_rate": 0.00016574292912240605, "loss": 1.2906, "step": 7100 }, { "epoch": 0.5499576955618799, "grad_norm": 0.6271395087242126, "learning_rate": 0.00016548320910058956, "loss": 1.2684, "step": 7150 }, { "epoch": 0.5538035535728021, "grad_norm": 1.3025091886520386, "learning_rate": 0.0001652234890787731, "loss": 1.2694, "step": 7200 }, { "epoch": 0.5576494115837243, "grad_norm": 1.3218464851379395, "learning_rate": 0.0001649637690569566, "loss": 1.3098, "step": 7250 }, { "epoch": 0.5576494115837243, "eval_loss": 1.2370234727859497, "eval_runtime": 18.0496, "eval_samples_per_second": 55.403, "eval_steps_per_second": 13.851, "step": 7250 }, { "epoch": 0.5614952695946466, "grad_norm": 1.1432136297225952, "learning_rate": 0.00016470404903514013, "loss": 1.2567, "step": 7300 }, { "epoch": 0.5653411276055688, "grad_norm": 0.9530320763587952, "learning_rate": 0.00016444432901332364, "loss": 1.1878, "step": 7350 }, { "epoch": 0.5691869856164911, "grad_norm": 1.1852946281433105, "learning_rate": 0.00016418460899150715, "loss": 1.2153, "step": 7400 }, { "epoch": 0.5730328436274132, "grad_norm": 0.7916271686553955, "learning_rate": 0.00016392488896969066, "loss": 1.2574, "step": 7450 }, { "epoch": 0.5768787016383355, "grad_norm": 0.8115867972373962, "learning_rate": 0.0001636651689478742, "loss": 1.2777, "step": 7500 }, { "epoch": 0.5768787016383355, "eval_loss": 1.2334003448486328, "eval_runtime": 18.0145, "eval_samples_per_second": 55.511, "eval_steps_per_second": 13.878, "step": 7500 }, { "epoch": 0.5807245596492577, "grad_norm": 0.9350728988647461, "learning_rate": 0.00016340544892605772, "loss": 1.2261, "step": 7550 }, { "epoch": 0.58457041766018, "grad_norm": 0.7061731815338135, "learning_rate": 0.00016314572890424126, "loss": 1.172, "step": 7600 }, { "epoch": 0.5884162756711022, "grad_norm": 1.091739296913147, "learning_rate": 0.00016288600888242477, "loss": 1.272, "step": 7650 }, { "epoch": 0.5922621336820244, "grad_norm": 0.8880358338356018, "learning_rate": 0.00016262628886060826, "loss": 1.1847, "step": 7700 }, { "epoch": 0.5961079916929467, "grad_norm": 2.7609329223632812, "learning_rate": 0.0001623665688387918, "loss": 1.2636, "step": 7750 }, { "epoch": 0.5961079916929467, "eval_loss": 1.2325551509857178, "eval_runtime": 17.8663, "eval_samples_per_second": 55.971, "eval_steps_per_second": 13.993, "step": 7750 }, { "epoch": 0.5999538497038689, "grad_norm": 0.8872610926628113, "learning_rate": 0.0001621068488169753, "loss": 1.245, "step": 7800 }, { "epoch": 0.6037997077147912, "grad_norm": 0.7548871040344238, "learning_rate": 0.00016184712879515882, "loss": 1.2349, "step": 7850 }, { "epoch": 0.6076455657257134, "grad_norm": 1.104351282119751, "learning_rate": 0.00016158740877334236, "loss": 1.2397, "step": 7900 }, { "epoch": 0.6114914237366357, "grad_norm": 0.8331647515296936, "learning_rate": 0.00016132768875152587, "loss": 1.2634, "step": 7950 }, { "epoch": 0.6153372817475579, "grad_norm": 1.0910013914108276, "learning_rate": 0.00016106796872970939, "loss": 1.2897, "step": 8000 }, { "epoch": 0.6153372817475579, "eval_loss": 1.2241965532302856, "eval_runtime": 18.04, "eval_samples_per_second": 55.432, "eval_steps_per_second": 13.858, "step": 8000 }, { "epoch": 0.6191831397584802, "grad_norm": 1.4128022193908691, "learning_rate": 0.0001608082487078929, "loss": 1.1912, "step": 8050 }, { "epoch": 0.6230289977694023, "grad_norm": 1.5363566875457764, "learning_rate": 0.0001605485286860764, "loss": 1.2622, "step": 8100 }, { "epoch": 0.6268748557803245, "grad_norm": 1.334889531135559, "learning_rate": 0.00016028880866425992, "loss": 1.2462, "step": 8150 }, { "epoch": 0.6307207137912468, "grad_norm": 1.63850998878479, "learning_rate": 0.00016002908864244346, "loss": 1.2099, "step": 8200 }, { "epoch": 0.634566571802169, "grad_norm": 1.2087870836257935, "learning_rate": 0.00015976936862062698, "loss": 1.1669, "step": 8250 }, { "epoch": 0.634566571802169, "eval_loss": 1.2158918380737305, "eval_runtime": 17.8293, "eval_samples_per_second": 56.087, "eval_steps_per_second": 14.022, "step": 8250 }, { "epoch": 0.6384124298130913, "grad_norm": 2.0238049030303955, "learning_rate": 0.0001595096485988105, "loss": 1.1893, "step": 8300 }, { "epoch": 0.6422582878240135, "grad_norm": 0.7206680178642273, "learning_rate": 0.000159249928576994, "loss": 1.2155, "step": 8350 }, { "epoch": 0.6461041458349358, "grad_norm": 1.0200512409210205, "learning_rate": 0.00015899020855517752, "loss": 1.2049, "step": 8400 }, { "epoch": 0.649950003845858, "grad_norm": 0.7880833745002747, "learning_rate": 0.00015873048853336106, "loss": 1.2564, "step": 8450 }, { "epoch": 0.6537958618567803, "grad_norm": 1.0986734628677368, "learning_rate": 0.00015847076851154457, "loss": 1.2989, "step": 8500 }, { "epoch": 0.6537958618567803, "eval_loss": 1.2085261344909668, "eval_runtime": 17.887, "eval_samples_per_second": 55.907, "eval_steps_per_second": 13.977, "step": 8500 }, { "epoch": 0.6576417198677025, "grad_norm": 0.5527728796005249, "learning_rate": 0.00015821104848972808, "loss": 1.1798, "step": 8550 }, { "epoch": 0.6614875778786247, "grad_norm": 1.0168190002441406, "learning_rate": 0.0001579513284679116, "loss": 1.211, "step": 8600 }, { "epoch": 0.665333435889547, "grad_norm": 0.8436816334724426, "learning_rate": 0.0001576916084460951, "loss": 1.2018, "step": 8650 }, { "epoch": 0.6691792939004692, "grad_norm": 0.967677891254425, "learning_rate": 0.00015743188842427862, "loss": 1.2134, "step": 8700 }, { "epoch": 0.6730251519113915, "grad_norm": 0.9716609120368958, "learning_rate": 0.00015717216840246216, "loss": 1.2206, "step": 8750 }, { "epoch": 0.6730251519113915, "eval_loss": 1.2069470882415771, "eval_runtime": 17.9386, "eval_samples_per_second": 55.746, "eval_steps_per_second": 13.936, "step": 8750 }, { "epoch": 0.6768710099223136, "grad_norm": 1.1798300743103027, "learning_rate": 0.00015691244838064567, "loss": 1.2258, "step": 8800 }, { "epoch": 0.6807168679332359, "grad_norm": 1.2064564228057861, "learning_rate": 0.00015665272835882919, "loss": 1.217, "step": 8850 }, { "epoch": 0.6845627259441581, "grad_norm": 1.2753881216049194, "learning_rate": 0.00015639300833701273, "loss": 1.1915, "step": 8900 }, { "epoch": 0.6884085839550804, "grad_norm": 1.2899794578552246, "learning_rate": 0.00015613848271563255, "loss": 1.1967, "step": 8950 }, { "epoch": 0.6922544419660026, "grad_norm": 0.5771601796150208, "learning_rate": 0.00015587876269381606, "loss": 1.1958, "step": 9000 }, { "epoch": 0.6922544419660026, "eval_loss": 1.19791841506958, "eval_runtime": 18.0075, "eval_samples_per_second": 55.532, "eval_steps_per_second": 13.883, "step": 9000 }, { "epoch": 0.6961002999769248, "grad_norm": 1.467191457748413, "learning_rate": 0.00015561904267199958, "loss": 1.1938, "step": 9050 }, { "epoch": 0.6999461579878471, "grad_norm": 0.7669786214828491, "learning_rate": 0.00015535932265018312, "loss": 1.2228, "step": 9100 }, { "epoch": 0.7037920159987693, "grad_norm": 0.843961238861084, "learning_rate": 0.00015509960262836663, "loss": 1.1543, "step": 9150 }, { "epoch": 0.7076378740096916, "grad_norm": 1.2265573740005493, "learning_rate": 0.00015483988260655017, "loss": 1.1518, "step": 9200 }, { "epoch": 0.7114837320206138, "grad_norm": 1.1644186973571777, "learning_rate": 0.00015458016258473365, "loss": 1.23, "step": 9250 }, { "epoch": 0.7114837320206138, "eval_loss": 1.1922409534454346, "eval_runtime": 17.9384, "eval_samples_per_second": 55.746, "eval_steps_per_second": 13.937, "step": 9250 }, { "epoch": 0.7153295900315361, "grad_norm": 1.3787301778793335, "learning_rate": 0.00015432044256291717, "loss": 1.2347, "step": 9300 }, { "epoch": 0.7191754480424583, "grad_norm": 1.4755727052688599, "learning_rate": 0.0001540607225411007, "loss": 1.1596, "step": 9350 }, { "epoch": 0.7230213060533806, "grad_norm": 1.031275749206543, "learning_rate": 0.00015380100251928422, "loss": 1.2045, "step": 9400 }, { "epoch": 0.7268671640643027, "grad_norm": 1.2802574634552002, "learning_rate": 0.00015354128249746773, "loss": 1.205, "step": 9450 }, { "epoch": 0.7307130220752249, "grad_norm": 0.5222998857498169, "learning_rate": 0.00015328156247565127, "loss": 1.1678, "step": 9500 }, { "epoch": 0.7307130220752249, "eval_loss": 1.1886347532272339, "eval_runtime": 17.9243, "eval_samples_per_second": 55.79, "eval_steps_per_second": 13.948, "step": 9500 }, { "epoch": 0.7345588800861472, "grad_norm": 0.8676270842552185, "learning_rate": 0.00015302184245383479, "loss": 1.1388, "step": 9550 }, { "epoch": 0.7384047380970694, "grad_norm": 1.198843240737915, "learning_rate": 0.00015276212243201827, "loss": 1.1995, "step": 9600 }, { "epoch": 0.7422505961079917, "grad_norm": 0.6684653162956238, "learning_rate": 0.0001525024024102018, "loss": 1.1567, "step": 9650 }, { "epoch": 0.7460964541189139, "grad_norm": 0.931119441986084, "learning_rate": 0.00015224268238838532, "loss": 1.1375, "step": 9700 }, { "epoch": 0.7499423121298362, "grad_norm": 0.7734692096710205, "learning_rate": 0.00015198296236656884, "loss": 1.1836, "step": 9750 }, { "epoch": 0.7499423121298362, "eval_loss": 1.1750942468643188, "eval_runtime": 18.0078, "eval_samples_per_second": 55.532, "eval_steps_per_second": 13.883, "step": 9750 }, { "epoch": 0.7537881701407584, "grad_norm": 1.0753988027572632, "learning_rate": 0.00015172324234475238, "loss": 1.1666, "step": 9800 }, { "epoch": 0.7576340281516807, "grad_norm": 0.45949143171310425, "learning_rate": 0.0001514635223229359, "loss": 1.1498, "step": 9850 }, { "epoch": 0.7614798861626029, "grad_norm": 1.0716335773468018, "learning_rate": 0.0001512038023011194, "loss": 1.1669, "step": 9900 }, { "epoch": 0.7653257441735252, "grad_norm": 1.043646216392517, "learning_rate": 0.00015094408227930292, "loss": 1.1689, "step": 9950 }, { "epoch": 0.7691716021844474, "grad_norm": 1.0494813919067383, "learning_rate": 0.00015068436225748643, "loss": 1.2343, "step": 10000 }, { "epoch": 0.7691716021844474, "eval_loss": 1.1786631345748901, "eval_runtime": 17.9206, "eval_samples_per_second": 55.802, "eval_steps_per_second": 13.95, "step": 10000 }, { "epoch": 0.7730174601953695, "grad_norm": 0.7689708471298218, "learning_rate": 0.00015042983663610628, "loss": 1.2341, "step": 10050 }, { "epoch": 0.7768633182062918, "grad_norm": 1.0559266805648804, "learning_rate": 0.00015017011661428982, "loss": 1.0868, "step": 10100 }, { "epoch": 0.780709176217214, "grad_norm": 1.01194429397583, "learning_rate": 0.00014991039659247333, "loss": 1.2187, "step": 10150 }, { "epoch": 0.7845550342281363, "grad_norm": 0.9095450043678284, "learning_rate": 0.00014965067657065684, "loss": 1.1432, "step": 10200 }, { "epoch": 0.7884008922390585, "grad_norm": 1.1280279159545898, "learning_rate": 0.00014939095654884036, "loss": 1.1631, "step": 10250 }, { "epoch": 0.7884008922390585, "eval_loss": 1.1772924661636353, "eval_runtime": 18.038, "eval_samples_per_second": 55.438, "eval_steps_per_second": 13.86, "step": 10250 }, { "epoch": 0.7922467502499808, "grad_norm": 1.1410025358200073, "learning_rate": 0.00014913123652702387, "loss": 1.2058, "step": 10300 }, { "epoch": 0.796092608260903, "grad_norm": 0.7516416311264038, "learning_rate": 0.00014887151650520738, "loss": 1.2702, "step": 10350 }, { "epoch": 0.7999384662718253, "grad_norm": 1.9470982551574707, "learning_rate": 0.00014861179648339092, "loss": 1.1322, "step": 10400 }, { "epoch": 0.8037843242827475, "grad_norm": 1.1969455480575562, "learning_rate": 0.00014835207646157444, "loss": 1.1882, "step": 10450 }, { "epoch": 0.8076301822936697, "grad_norm": 1.2365367412567139, "learning_rate": 0.00014809235643975795, "loss": 1.1929, "step": 10500 }, { "epoch": 0.8076301822936697, "eval_loss": 1.168535590171814, "eval_runtime": 17.931, "eval_samples_per_second": 55.769, "eval_steps_per_second": 13.942, "step": 10500 }, { "epoch": 0.811476040304592, "grad_norm": 1.2798963785171509, "learning_rate": 0.00014783263641794146, "loss": 1.2693, "step": 10550 }, { "epoch": 0.8153218983155142, "grad_norm": 0.8195398449897766, "learning_rate": 0.00014757291639612497, "loss": 1.1933, "step": 10600 }, { "epoch": 0.8191677563264365, "grad_norm": 1.039031744003296, "learning_rate": 0.0001473131963743085, "loss": 1.1257, "step": 10650 }, { "epoch": 0.8230136143373586, "grad_norm": 1.0875959396362305, "learning_rate": 0.00014705347635249203, "loss": 1.2578, "step": 10700 }, { "epoch": 0.8268594723482809, "grad_norm": 1.5674726963043213, "learning_rate": 0.00014679375633067554, "loss": 1.1895, "step": 10750 }, { "epoch": 0.8268594723482809, "eval_loss": 1.1696391105651855, "eval_runtime": 17.8658, "eval_samples_per_second": 55.973, "eval_steps_per_second": 13.993, "step": 10750 }, { "epoch": 0.8307053303592031, "grad_norm": 0.7315701842308044, "learning_rate": 0.00014653403630885905, "loss": 1.1379, "step": 10800 }, { "epoch": 0.8345511883701254, "grad_norm": 1.0033215284347534, "learning_rate": 0.00014627431628704257, "loss": 1.198, "step": 10850 }, { "epoch": 0.8383970463810476, "grad_norm": 1.0194263458251953, "learning_rate": 0.00014601459626522608, "loss": 1.2186, "step": 10900 }, { "epoch": 0.8422429043919698, "grad_norm": 0.9829340577125549, "learning_rate": 0.00014575487624340962, "loss": 1.1458, "step": 10950 }, { "epoch": 0.8460887624028921, "grad_norm": 1.3082759380340576, "learning_rate": 0.00014549515622159313, "loss": 1.1545, "step": 11000 }, { "epoch": 0.8460887624028921, "eval_loss": 1.1625275611877441, "eval_runtime": 17.8346, "eval_samples_per_second": 56.071, "eval_steps_per_second": 14.018, "step": 11000 }, { "epoch": 0.8499346204138143, "grad_norm": 1.2006999254226685, "learning_rate": 0.00014523543619977664, "loss": 1.1187, "step": 11050 }, { "epoch": 0.8537804784247366, "grad_norm": 1.0500705242156982, "learning_rate": 0.00014497571617796018, "loss": 1.1911, "step": 11100 }, { "epoch": 0.8576263364356588, "grad_norm": 0.8597742915153503, "learning_rate": 0.0001447159961561437, "loss": 1.1176, "step": 11150 }, { "epoch": 0.8614721944465811, "grad_norm": 1.338990569114685, "learning_rate": 0.00014445627613432718, "loss": 1.1699, "step": 11200 }, { "epoch": 0.8653180524575033, "grad_norm": 0.903128445148468, "learning_rate": 0.00014419655611251072, "loss": 1.1504, "step": 11250 }, { "epoch": 0.8653180524575033, "eval_loss": 1.161923885345459, "eval_runtime": 17.9304, "eval_samples_per_second": 55.771, "eval_steps_per_second": 13.943, "step": 11250 }, { "epoch": 0.8691639104684256, "grad_norm": 0.8747849464416504, "learning_rate": 0.00014393683609069424, "loss": 1.1305, "step": 11300 }, { "epoch": 0.8730097684793477, "grad_norm": 1.1505181789398193, "learning_rate": 0.00014367711606887775, "loss": 1.1515, "step": 11350 }, { "epoch": 0.8768556264902699, "grad_norm": 0.6178755164146423, "learning_rate": 0.0001434173960470613, "loss": 1.1785, "step": 11400 }, { "epoch": 0.8807014845011922, "grad_norm": 0.7437123656272888, "learning_rate": 0.0001431576760252448, "loss": 1.1567, "step": 11450 }, { "epoch": 0.8845473425121144, "grad_norm": 1.574104905128479, "learning_rate": 0.00014289795600342831, "loss": 1.1371, "step": 11500 }, { "epoch": 0.8845473425121144, "eval_loss": 1.154138445854187, "eval_runtime": 17.9772, "eval_samples_per_second": 55.626, "eval_steps_per_second": 13.906, "step": 11500 }, { "epoch": 0.8883932005230367, "grad_norm": 2.203948497772217, "learning_rate": 0.00014264343038204814, "loss": 1.2118, "step": 11550 }, { "epoch": 0.8922390585339589, "grad_norm": 1.1410473585128784, "learning_rate": 0.00014238371036023168, "loss": 1.2239, "step": 11600 }, { "epoch": 0.8960849165448812, "grad_norm": 1.0226402282714844, "learning_rate": 0.0001421239903384152, "loss": 1.1686, "step": 11650 }, { "epoch": 0.8999307745558034, "grad_norm": 1.0350555181503296, "learning_rate": 0.00014186427031659873, "loss": 1.1584, "step": 11700 }, { "epoch": 0.9037766325667257, "grad_norm": 1.6758803129196167, "learning_rate": 0.00014160455029478224, "loss": 1.1638, "step": 11750 }, { "epoch": 0.9037766325667257, "eval_loss": 1.1433300971984863, "eval_runtime": 17.8796, "eval_samples_per_second": 55.93, "eval_steps_per_second": 13.982, "step": 11750 }, { "epoch": 0.9076224905776479, "grad_norm": 1.2579885721206665, "learning_rate": 0.00014134483027296576, "loss": 1.2198, "step": 11800 }, { "epoch": 0.9114683485885701, "grad_norm": 0.886987030506134, "learning_rate": 0.00014108511025114927, "loss": 1.1477, "step": 11850 }, { "epoch": 0.9153142065994924, "grad_norm": 0.9828807711601257, "learning_rate": 0.00014082539022933278, "loss": 1.1115, "step": 11900 }, { "epoch": 0.9191600646104146, "grad_norm": 1.2362446784973145, "learning_rate": 0.0001405656702075163, "loss": 1.161, "step": 11950 }, { "epoch": 0.9230059226213368, "grad_norm": 1.1353052854537964, "learning_rate": 0.00014030595018569984, "loss": 1.1369, "step": 12000 }, { "epoch": 0.9230059226213368, "eval_loss": 1.144094467163086, "eval_runtime": 17.8499, "eval_samples_per_second": 56.023, "eval_steps_per_second": 14.006, "step": 12000 }, { "epoch": 0.926851780632259, "grad_norm": 1.0390766859054565, "learning_rate": 0.00014004623016388335, "loss": 1.1763, "step": 12050 }, { "epoch": 0.9306976386431813, "grad_norm": 1.1437292098999023, "learning_rate": 0.00013978651014206686, "loss": 1.153, "step": 12100 }, { "epoch": 0.9345434966541035, "grad_norm": 0.7012118697166443, "learning_rate": 0.00013952679012025037, "loss": 1.1209, "step": 12150 }, { "epoch": 0.9383893546650258, "grad_norm": 0.558203399181366, "learning_rate": 0.0001392670700984339, "loss": 1.0998, "step": 12200 }, { "epoch": 0.942235212675948, "grad_norm": 1.031898021697998, "learning_rate": 0.0001390073500766174, "loss": 1.1308, "step": 12250 }, { "epoch": 0.942235212675948, "eval_loss": 1.1427006721496582, "eval_runtime": 17.9677, "eval_samples_per_second": 55.656, "eval_steps_per_second": 13.914, "step": 12250 }, { "epoch": 0.9460810706868702, "grad_norm": 0.9146320223808289, "learning_rate": 0.00013874763005480094, "loss": 1.1441, "step": 12300 }, { "epoch": 0.9499269286977925, "grad_norm": 1.7698357105255127, "learning_rate": 0.00013848791003298445, "loss": 1.1278, "step": 12350 }, { "epoch": 0.9537727867087147, "grad_norm": 1.7621064186096191, "learning_rate": 0.00013822819001116797, "loss": 1.2191, "step": 12400 }, { "epoch": 0.957618644719637, "grad_norm": 1.2093744277954102, "learning_rate": 0.00013796846998935148, "loss": 1.1541, "step": 12450 }, { "epoch": 0.9614645027305592, "grad_norm": 1.0879639387130737, "learning_rate": 0.000137708749967535, "loss": 1.1278, "step": 12500 }, { "epoch": 0.9614645027305592, "eval_loss": 1.1357394456863403, "eval_runtime": 17.9695, "eval_samples_per_second": 55.65, "eval_steps_per_second": 13.912, "step": 12500 }, { "epoch": 0.9653103607414815, "grad_norm": 1.415139079093933, "learning_rate": 0.00013744902994571853, "loss": 1.1986, "step": 12550 }, { "epoch": 0.9691562187524037, "grad_norm": 1.0320454835891724, "learning_rate": 0.00013718930992390204, "loss": 1.153, "step": 12600 }, { "epoch": 0.973002076763326, "grad_norm": 1.0736747980117798, "learning_rate": 0.00013692958990208556, "loss": 1.1931, "step": 12650 }, { "epoch": 0.9768479347742481, "grad_norm": 0.8954864740371704, "learning_rate": 0.0001366698698802691, "loss": 1.1908, "step": 12700 }, { "epoch": 0.9806937927851703, "grad_norm": 1.3287911415100098, "learning_rate": 0.0001364101498584526, "loss": 1.1502, "step": 12750 }, { "epoch": 0.9806937927851703, "eval_loss": 1.133840799331665, "eval_runtime": 18.0959, "eval_samples_per_second": 55.261, "eval_steps_per_second": 13.815, "step": 12750 }, { "epoch": 0.9845396507960926, "grad_norm": 1.000588059425354, "learning_rate": 0.0001361504298366361, "loss": 1.1001, "step": 12800 }, { "epoch": 0.9883855088070148, "grad_norm": 0.9359833598136902, "learning_rate": 0.00013589070981481963, "loss": 1.2289, "step": 12850 }, { "epoch": 0.9922313668179371, "grad_norm": 1.5241230726242065, "learning_rate": 0.00013563098979300315, "loss": 1.1726, "step": 12900 }, { "epoch": 0.9960772248288593, "grad_norm": 1.0804429054260254, "learning_rate": 0.00013537126977118666, "loss": 1.1755, "step": 12950 }, { "epoch": 0.9999230828397816, "grad_norm": 0.8376865983009338, "learning_rate": 0.0001351115497493702, "loss": 1.158, "step": 13000 }, { "epoch": 0.9999230828397816, "eval_loss": 1.1301764249801636, "eval_runtime": 18.0468, "eval_samples_per_second": 55.411, "eval_steps_per_second": 13.853, "step": 13000 }, { "epoch": 1.0037689408507038, "grad_norm": 0.8493902087211609, "learning_rate": 0.0001348518297275537, "loss": 1.0935, "step": 13050 }, { "epoch": 1.007614798861626, "grad_norm": 0.9508585929870605, "learning_rate": 0.00013459210970573723, "loss": 1.0773, "step": 13100 }, { "epoch": 1.0114606568725482, "grad_norm": 1.047767162322998, "learning_rate": 0.00013433238968392074, "loss": 1.1722, "step": 13150 }, { "epoch": 1.0153065148834706, "grad_norm": 1.0310213565826416, "learning_rate": 0.00013407266966210425, "loss": 1.059, "step": 13200 }, { "epoch": 1.0191523728943928, "grad_norm": 0.7563040852546692, "learning_rate": 0.00013381294964028776, "loss": 1.0438, "step": 13250 }, { "epoch": 1.0191523728943928, "eval_loss": 1.121547818183899, "eval_runtime": 17.9914, "eval_samples_per_second": 55.582, "eval_steps_per_second": 13.895, "step": 13250 }, { "epoch": 1.022998230905315, "grad_norm": 0.8817610144615173, "learning_rate": 0.0001335532296184713, "loss": 1.1176, "step": 13300 }, { "epoch": 1.0268440889162371, "grad_norm": 0.8703081011772156, "learning_rate": 0.00013329350959665482, "loss": 1.0568, "step": 13350 }, { "epoch": 1.0306899469271595, "grad_norm": 1.0551347732543945, "learning_rate": 0.00013303378957483833, "loss": 1.1746, "step": 13400 }, { "epoch": 1.0345358049380817, "grad_norm": 1.2630723714828491, "learning_rate": 0.00013277406955302184, "loss": 1.0232, "step": 13450 }, { "epoch": 1.038381662949004, "grad_norm": 1.2565157413482666, "learning_rate": 0.00013251434953120536, "loss": 1.0523, "step": 13500 }, { "epoch": 1.038381662949004, "eval_loss": 1.1202689409255981, "eval_runtime": 17.8981, "eval_samples_per_second": 55.872, "eval_steps_per_second": 13.968, "step": 13500 }, { "epoch": 1.042227520959926, "grad_norm": 1.9204115867614746, "learning_rate": 0.0001322546295093889, "loss": 1.1115, "step": 13550 }, { "epoch": 1.0460733789708483, "grad_norm": 1.1753497123718262, "learning_rate": 0.0001319949094875724, "loss": 1.0463, "step": 13600 }, { "epoch": 1.0499192369817707, "grad_norm": 1.0144574642181396, "learning_rate": 0.00013173518946575592, "loss": 1.0914, "step": 13650 }, { "epoch": 1.0537650949926929, "grad_norm": 2.0906662940979004, "learning_rate": 0.00013147546944393943, "loss": 1.1635, "step": 13700 }, { "epoch": 1.057610953003615, "grad_norm": 1.0127108097076416, "learning_rate": 0.00013121574942212295, "loss": 1.1266, "step": 13750 }, { "epoch": 1.057610953003615, "eval_loss": 1.115894079208374, "eval_runtime": 17.8891, "eval_samples_per_second": 55.9, "eval_steps_per_second": 13.975, "step": 13750 }, { "epoch": 1.0614568110145373, "grad_norm": 1.2559298276901245, "learning_rate": 0.00013095602940030646, "loss": 1.0924, "step": 13800 }, { "epoch": 1.0653026690254597, "grad_norm": 0.7502859234809875, "learning_rate": 0.00013069630937849, "loss": 1.0756, "step": 13850 }, { "epoch": 1.0691485270363819, "grad_norm": 0.6954963207244873, "learning_rate": 0.0001304365893566735, "loss": 1.1284, "step": 13900 }, { "epoch": 1.072994385047304, "grad_norm": 1.3833235502243042, "learning_rate": 0.00013017686933485703, "loss": 1.1175, "step": 13950 }, { "epoch": 1.0768402430582262, "grad_norm": 0.9848393201828003, "learning_rate": 0.00012991714931304057, "loss": 1.1295, "step": 14000 }, { "epoch": 1.0768402430582262, "eval_loss": 1.1086758375167847, "eval_runtime": 17.8007, "eval_samples_per_second": 56.177, "eval_steps_per_second": 14.044, "step": 14000 }, { "epoch": 1.0806861010691486, "grad_norm": 1.1354585886001587, "learning_rate": 0.00012965742929122405, "loss": 1.0351, "step": 14050 }, { "epoch": 1.0845319590800708, "grad_norm": 0.760317325592041, "learning_rate": 0.00012939770926940756, "loss": 1.0614, "step": 14100 }, { "epoch": 1.088377817090993, "grad_norm": 0.9277663230895996, "learning_rate": 0.0001291379892475911, "loss": 1.0689, "step": 14150 }, { "epoch": 1.0922236751019152, "grad_norm": 1.0846219062805176, "learning_rate": 0.00012887826922577462, "loss": 1.1679, "step": 14200 }, { "epoch": 1.0960695331128374, "grad_norm": 1.204969048500061, "learning_rate": 0.00012861854920395816, "loss": 1.1198, "step": 14250 }, { "epoch": 1.0960695331128374, "eval_loss": 1.1072660684585571, "eval_runtime": 17.9757, "eval_samples_per_second": 55.631, "eval_steps_per_second": 13.908, "step": 14250 }, { "epoch": 1.0999153911237598, "grad_norm": 1.556897759437561, "learning_rate": 0.00012835882918214167, "loss": 1.1511, "step": 14300 }, { "epoch": 1.103761249134682, "grad_norm": 1.4192557334899902, "learning_rate": 0.00012809910916032518, "loss": 1.1074, "step": 14350 }, { "epoch": 1.1076071071456042, "grad_norm": 0.5456421971321106, "learning_rate": 0.0001278393891385087, "loss": 1.0818, "step": 14400 }, { "epoch": 1.1114529651565264, "grad_norm": 1.282106876373291, "learning_rate": 0.0001275796691166922, "loss": 1.0431, "step": 14450 }, { "epoch": 1.1152988231674485, "grad_norm": 0.7997551560401917, "learning_rate": 0.00012731994909487572, "loss": 1.0882, "step": 14500 }, { "epoch": 1.1152988231674485, "eval_loss": 1.1057496070861816, "eval_runtime": 17.7977, "eval_samples_per_second": 56.187, "eval_steps_per_second": 14.047, "step": 14500 }, { "epoch": 1.119144681178371, "grad_norm": 1.2388238906860352, "learning_rate": 0.00012706022907305926, "loss": 1.1321, "step": 14550 }, { "epoch": 1.1229905391892931, "grad_norm": 0.5141006708145142, "learning_rate": 0.00012680050905124277, "loss": 1.0453, "step": 14600 }, { "epoch": 1.1268363972002153, "grad_norm": 1.1240845918655396, "learning_rate": 0.0001265407890294263, "loss": 1.0742, "step": 14650 }, { "epoch": 1.1306822552111375, "grad_norm": 1.433976650238037, "learning_rate": 0.0001262810690076098, "loss": 1.1468, "step": 14700 }, { "epoch": 1.13452811322206, "grad_norm": 1.077966332435608, "learning_rate": 0.0001260213489857933, "loss": 1.1143, "step": 14750 }, { "epoch": 1.13452811322206, "eval_loss": 1.1030505895614624, "eval_runtime": 17.8195, "eval_samples_per_second": 56.118, "eval_steps_per_second": 14.03, "step": 14750 }, { "epoch": 1.1383739712329821, "grad_norm": 1.1456421613693237, "learning_rate": 0.00012576162896397683, "loss": 1.1173, "step": 14800 }, { "epoch": 1.1422198292439043, "grad_norm": 0.9305130243301392, "learning_rate": 0.00012550190894216037, "loss": 1.1158, "step": 14850 }, { "epoch": 1.1460656872548265, "grad_norm": 1.3796011209487915, "learning_rate": 0.00012524218892034388, "loss": 1.0785, "step": 14900 }, { "epoch": 1.149911545265749, "grad_norm": 0.9901970028877258, "learning_rate": 0.0001249824688985274, "loss": 1.1383, "step": 14950 }, { "epoch": 1.153757403276671, "grad_norm": 0.8250207304954529, "learning_rate": 0.00012472794327714724, "loss": 1.1363, "step": 15000 }, { "epoch": 1.153757403276671, "eval_loss": 1.103262186050415, "eval_runtime": 17.8852, "eval_samples_per_second": 55.912, "eval_steps_per_second": 13.978, "step": 15000 }, { "epoch": 1.1576032612875933, "grad_norm": 0.7949115037918091, "learning_rate": 0.00012446822325533076, "loss": 1.0542, "step": 15050 }, { "epoch": 1.1614491192985155, "grad_norm": 0.8414244055747986, "learning_rate": 0.00012420850323351427, "loss": 1.133, "step": 15100 }, { "epoch": 1.1652949773094377, "grad_norm": 0.7031393647193909, "learning_rate": 0.0001239487832116978, "loss": 1.0804, "step": 15150 }, { "epoch": 1.16914083532036, "grad_norm": 0.8476413488388062, "learning_rate": 0.00012368906318988132, "loss": 1.1252, "step": 15200 }, { "epoch": 1.1729866933312822, "grad_norm": 1.7877459526062012, "learning_rate": 0.00012342934316806483, "loss": 1.1491, "step": 15250 }, { "epoch": 1.1729866933312822, "eval_loss": 1.0958014726638794, "eval_runtime": 17.9511, "eval_samples_per_second": 55.707, "eval_steps_per_second": 13.927, "step": 15250 }, { "epoch": 1.1768325513422044, "grad_norm": 0.8797541856765747, "learning_rate": 0.00012316962314624835, "loss": 1.0446, "step": 15300 }, { "epoch": 1.1806784093531266, "grad_norm": 0.9383549690246582, "learning_rate": 0.00012290990312443186, "loss": 1.0703, "step": 15350 }, { "epoch": 1.1845242673640488, "grad_norm": 1.0788028240203857, "learning_rate": 0.00012265018310261537, "loss": 1.0976, "step": 15400 }, { "epoch": 1.1883701253749712, "grad_norm": 0.8661052584648132, "learning_rate": 0.0001223904630807989, "loss": 1.0872, "step": 15450 }, { "epoch": 1.1922159833858934, "grad_norm": 0.9346690773963928, "learning_rate": 0.00012213074305898242, "loss": 1.1296, "step": 15500 }, { "epoch": 1.1922159833858934, "eval_loss": 1.089566946029663, "eval_runtime": 17.8688, "eval_samples_per_second": 55.963, "eval_steps_per_second": 13.991, "step": 15500 }, { "epoch": 1.1960618413968156, "grad_norm": 1.229148030281067, "learning_rate": 0.00012187102303716594, "loss": 1.049, "step": 15550 }, { "epoch": 1.1999076994077378, "grad_norm": 0.9896694421768188, "learning_rate": 0.00012161130301534946, "loss": 1.0519, "step": 15600 }, { "epoch": 1.2037535574186602, "grad_norm": 0.7709591388702393, "learning_rate": 0.00012135158299353298, "loss": 0.8004, "step": 15650 }, { "epoch": 1.2075994154295824, "grad_norm": 0.8033544421195984, "learning_rate": 0.00012109186297171649, "loss": 0.8073, "step": 15700 }, { "epoch": 1.2114452734405046, "grad_norm": 0.955243706703186, "learning_rate": 0.00012083214294990002, "loss": 0.8575, "step": 15750 }, { "epoch": 1.2114452734405046, "eval_loss": 0.8102548718452454, "eval_runtime": 18.0003, "eval_samples_per_second": 55.555, "eval_steps_per_second": 13.889, "step": 15750 }, { "epoch": 1.2152911314514268, "grad_norm": 1.1329740285873413, "learning_rate": 0.00012057242292808353, "loss": 0.8421, "step": 15800 }, { "epoch": 1.2191369894623492, "grad_norm": 0.6005277633666992, "learning_rate": 0.00012031270290626706, "loss": 0.8149, "step": 15850 }, { "epoch": 1.2229828474732714, "grad_norm": 0.6579030156135559, "learning_rate": 0.00012005298288445057, "loss": 0.7783, "step": 15900 }, { "epoch": 1.2268287054841935, "grad_norm": 1.1820577383041382, "learning_rate": 0.00011979326286263408, "loss": 0.8133, "step": 15950 }, { "epoch": 1.2306745634951157, "grad_norm": 0.7112457156181335, "learning_rate": 0.00011953354284081761, "loss": 0.8004, "step": 16000 }, { "epoch": 1.2306745634951157, "eval_loss": 0.8041366338729858, "eval_runtime": 17.8867, "eval_samples_per_second": 55.907, "eval_steps_per_second": 13.977, "step": 16000 }, { "epoch": 1.234520421506038, "grad_norm": 0.7006672024726868, "learning_rate": 0.00011927382281900112, "loss": 0.7673, "step": 16050 }, { "epoch": 1.2383662795169603, "grad_norm": 1.0331398248672485, "learning_rate": 0.00011901410279718463, "loss": 0.7992, "step": 16100 }, { "epoch": 1.2422121375278825, "grad_norm": 0.8857033848762512, "learning_rate": 0.00011875438277536817, "loss": 0.8189, "step": 16150 }, { "epoch": 1.2460579955388047, "grad_norm": 0.6674500107765198, "learning_rate": 0.00011849466275355167, "loss": 0.7849, "step": 16200 }, { "epoch": 1.2499038535497269, "grad_norm": 0.8113058805465698, "learning_rate": 0.00011823494273173519, "loss": 0.7628, "step": 16250 }, { "epoch": 1.2499038535497269, "eval_loss": 0.8041785955429077, "eval_runtime": 18.0307, "eval_samples_per_second": 55.461, "eval_steps_per_second": 13.865, "step": 16250 }, { "epoch": 1.253749711560649, "grad_norm": 1.1236894130706787, "learning_rate": 0.00011797522270991873, "loss": 0.8078, "step": 16300 }, { "epoch": 1.2575955695715715, "grad_norm": 0.8019891977310181, "learning_rate": 0.00011771550268810222, "loss": 0.7999, "step": 16350 }, { "epoch": 1.2614414275824937, "grad_norm": 0.6394712924957275, "learning_rate": 0.00011745578266628574, "loss": 0.8132, "step": 16400 }, { "epoch": 1.2652872855934159, "grad_norm": 0.8335860371589661, "learning_rate": 0.00011719606264446928, "loss": 0.7494, "step": 16450 }, { "epoch": 1.2691331436043383, "grad_norm": 0.5847200155258179, "learning_rate": 0.00011693634262265279, "loss": 0.7966, "step": 16500 }, { "epoch": 1.2691331436043383, "eval_loss": 0.8012632131576538, "eval_runtime": 17.8773, "eval_samples_per_second": 55.937, "eval_steps_per_second": 13.984, "step": 16500 }, { "epoch": 1.2729790016152602, "grad_norm": 0.7161915302276611, "learning_rate": 0.00011667662260083629, "loss": 0.7969, "step": 16550 }, { "epoch": 1.2768248596261826, "grad_norm": 0.7693639397621155, "learning_rate": 0.00011641690257901983, "loss": 0.7909, "step": 16600 }, { "epoch": 1.2806707176371048, "grad_norm": 0.6645983457565308, "learning_rate": 0.00011615718255720334, "loss": 0.7816, "step": 16650 }, { "epoch": 1.284516575648027, "grad_norm": 0.7736928462982178, "learning_rate": 0.00011589746253538687, "loss": 0.745, "step": 16700 }, { "epoch": 1.2883624336589494, "grad_norm": 0.6743229627609253, "learning_rate": 0.00011563774251357038, "loss": 0.7856, "step": 16750 }, { "epoch": 1.2883624336589494, "eval_loss": 0.8006194233894348, "eval_runtime": 17.9046, "eval_samples_per_second": 55.852, "eval_steps_per_second": 13.963, "step": 16750 }, { "epoch": 1.2922082916698716, "grad_norm": 0.8146863579750061, "learning_rate": 0.0001153780224917539, "loss": 0.7559, "step": 16800 }, { "epoch": 1.2960541496807938, "grad_norm": 0.9186645746231079, "learning_rate": 0.00011511830246993742, "loss": 0.7902, "step": 16850 }, { "epoch": 1.299900007691716, "grad_norm": 0.7634202241897583, "learning_rate": 0.00011485858244812093, "loss": 0.749, "step": 16900 }, { "epoch": 1.3037458657026382, "grad_norm": 0.890457272529602, "learning_rate": 0.00011459886242630445, "loss": 0.7539, "step": 16950 }, { "epoch": 1.3075917237135606, "grad_norm": 0.8466306328773499, "learning_rate": 0.00011433914240448797, "loss": 0.7997, "step": 17000 }, { "epoch": 1.3075917237135606, "eval_loss": 0.8002509474754333, "eval_runtime": 17.9496, "eval_samples_per_second": 55.712, "eval_steps_per_second": 13.928, "step": 17000 }, { "epoch": 1.3114375817244828, "grad_norm": 1.403998851776123, "learning_rate": 0.00011407942238267149, "loss": 0.8369, "step": 17050 }, { "epoch": 1.315283439735405, "grad_norm": 0.8802525401115417, "learning_rate": 0.000113819702360855, "loss": 0.7818, "step": 17100 }, { "epoch": 1.3191292977463271, "grad_norm": 0.7463178634643555, "learning_rate": 0.00011355998233903852, "loss": 0.7661, "step": 17150 }, { "epoch": 1.3229751557572493, "grad_norm": 0.47771725058555603, "learning_rate": 0.00011330026231722204, "loss": 0.8141, "step": 17200 }, { "epoch": 1.3268210137681717, "grad_norm": 0.8294114470481873, "learning_rate": 0.00011304054229540555, "loss": 0.7979, "step": 17250 }, { "epoch": 1.3268210137681717, "eval_loss": 0.7951701879501343, "eval_runtime": 17.958, "eval_samples_per_second": 55.686, "eval_steps_per_second": 13.921, "step": 17250 }, { "epoch": 1.330666871779094, "grad_norm": 1.0813144445419312, "learning_rate": 0.00011278082227358908, "loss": 0.824, "step": 17300 }, { "epoch": 1.3345127297900161, "grad_norm": 0.5647908449172974, "learning_rate": 0.00011252110225177259, "loss": 0.7718, "step": 17350 }, { "epoch": 1.3383585878009385, "grad_norm": 0.7901347279548645, "learning_rate": 0.0001122613822299561, "loss": 0.7883, "step": 17400 }, { "epoch": 1.3422044458118605, "grad_norm": 1.0769431591033936, "learning_rate": 0.00011200166220813963, "loss": 0.7625, "step": 17450 }, { "epoch": 1.346050303822783, "grad_norm": 0.7366082072257996, "learning_rate": 0.00011174194218632314, "loss": 0.8167, "step": 17500 }, { "epoch": 1.346050303822783, "eval_loss": 0.7991219758987427, "eval_runtime": 17.9932, "eval_samples_per_second": 55.577, "eval_steps_per_second": 13.894, "step": 17500 }, { "epoch": 1.349896161833705, "grad_norm": 0.8688609600067139, "learning_rate": 0.00011148222216450668, "loss": 0.7882, "step": 17550 }, { "epoch": 1.3537420198446273, "grad_norm": 0.7067499160766602, "learning_rate": 0.0001112225021426902, "loss": 0.8193, "step": 17600 }, { "epoch": 1.3575878778555497, "grad_norm": 0.8119627833366394, "learning_rate": 0.0001109627821208737, "loss": 0.7902, "step": 17650 }, { "epoch": 1.3614337358664719, "grad_norm": 0.4667348265647888, "learning_rate": 0.00011070306209905723, "loss": 0.7595, "step": 17700 }, { "epoch": 1.365279593877394, "grad_norm": 1.1219276189804077, "learning_rate": 0.00011044334207724075, "loss": 0.8049, "step": 17750 }, { "epoch": 1.365279593877394, "eval_loss": 0.7947649955749512, "eval_runtime": 17.9375, "eval_samples_per_second": 55.749, "eval_steps_per_second": 13.937, "step": 17750 }, { "epoch": 1.3691254518883162, "grad_norm": 0.8262600302696228, "learning_rate": 0.00011018362205542425, "loss": 0.794, "step": 17800 }, { "epoch": 1.3729713098992384, "grad_norm": 0.6738994121551514, "learning_rate": 0.00010992390203360779, "loss": 0.7749, "step": 17850 }, { "epoch": 1.3768171679101608, "grad_norm": 0.40902426838874817, "learning_rate": 0.0001096641820117913, "loss": 0.8111, "step": 17900 }, { "epoch": 1.380663025921083, "grad_norm": 0.7617566585540771, "learning_rate": 0.00010940446198997481, "loss": 0.7663, "step": 17950 }, { "epoch": 1.3845088839320052, "grad_norm": 0.5269647836685181, "learning_rate": 0.00010914474196815834, "loss": 0.7913, "step": 18000 }, { "epoch": 1.3845088839320052, "eval_loss": 0.7882509231567383, "eval_runtime": 17.938, "eval_samples_per_second": 55.748, "eval_steps_per_second": 13.937, "step": 18000 }, { "epoch": 1.3883547419429274, "grad_norm": 0.9016252160072327, "learning_rate": 0.00010888502194634185, "loss": 0.8222, "step": 18050 }, { "epoch": 1.3922005999538496, "grad_norm": 0.6058124899864197, "learning_rate": 0.00010862530192452536, "loss": 0.7657, "step": 18100 }, { "epoch": 1.396046457964772, "grad_norm": 0.8505234122276306, "learning_rate": 0.00010836558190270889, "loss": 0.7962, "step": 18150 }, { "epoch": 1.3998923159756942, "grad_norm": 0.7518420815467834, "learning_rate": 0.0001081058618808924, "loss": 0.8364, "step": 18200 }, { "epoch": 1.4037381739866164, "grad_norm": 0.7778449058532715, "learning_rate": 0.00010784614185907592, "loss": 0.7836, "step": 18250 }, { "epoch": 1.4037381739866164, "eval_loss": 0.7889594435691833, "eval_runtime": 17.8326, "eval_samples_per_second": 56.077, "eval_steps_per_second": 14.019, "step": 18250 }, { "epoch": 1.4075840319975388, "grad_norm": 1.2029508352279663, "learning_rate": 0.00010758642183725944, "loss": 0.7859, "step": 18300 }, { "epoch": 1.4114298900084608, "grad_norm": 0.7625166773796082, "learning_rate": 0.00010732670181544295, "loss": 0.8018, "step": 18350 }, { "epoch": 1.4152757480193832, "grad_norm": 0.6327937245368958, "learning_rate": 0.00010706698179362648, "loss": 0.808, "step": 18400 }, { "epoch": 1.4191216060303053, "grad_norm": 0.7097195386886597, "learning_rate": 0.00010680726177181, "loss": 0.7928, "step": 18450 }, { "epoch": 1.4229674640412275, "grad_norm": 0.5188928246498108, "learning_rate": 0.00010654754174999351, "loss": 0.7934, "step": 18500 }, { "epoch": 1.4229674640412275, "eval_loss": 0.7918493151664734, "eval_runtime": 17.9735, "eval_samples_per_second": 55.638, "eval_steps_per_second": 13.909, "step": 18500 }, { "epoch": 1.42681332205215, "grad_norm": 0.6486705541610718, "learning_rate": 0.00010628782172817703, "loss": 0.7317, "step": 18550 }, { "epoch": 1.4306591800630721, "grad_norm": 0.6570118069648743, "learning_rate": 0.00010602810170636055, "loss": 0.8149, "step": 18600 }, { "epoch": 1.4345050380739943, "grad_norm": 0.8024285435676575, "learning_rate": 0.00010576838168454406, "loss": 0.8392, "step": 18650 }, { "epoch": 1.4383508960849165, "grad_norm": 0.5735141038894653, "learning_rate": 0.0001055086616627276, "loss": 0.7932, "step": 18700 }, { "epoch": 1.4421967540958387, "grad_norm": 0.5087122917175293, "learning_rate": 0.0001052489416409111, "loss": 0.7508, "step": 18750 }, { "epoch": 1.4421967540958387, "eval_loss": 0.7895762920379639, "eval_runtime": 17.9534, "eval_samples_per_second": 55.7, "eval_steps_per_second": 13.925, "step": 18750 }, { "epoch": 1.446042612106761, "grad_norm": 0.7478468418121338, "learning_rate": 0.00010498922161909461, "loss": 0.8533, "step": 18800 }, { "epoch": 1.4498884701176833, "grad_norm": 1.2165393829345703, "learning_rate": 0.00010472950159727815, "loss": 0.8025, "step": 18850 }, { "epoch": 1.4537343281286055, "grad_norm": 0.8682180643081665, "learning_rate": 0.00010446978157546165, "loss": 0.8086, "step": 18900 }, { "epoch": 1.4575801861395277, "grad_norm": 0.9063705205917358, "learning_rate": 0.00010421006155364516, "loss": 0.7583, "step": 18950 }, { "epoch": 1.4614260441504499, "grad_norm": 0.7133361101150513, "learning_rate": 0.0001039503415318287, "loss": 0.7381, "step": 19000 }, { "epoch": 1.4614260441504499, "eval_loss": 0.7869898080825806, "eval_runtime": 17.9874, "eval_samples_per_second": 55.595, "eval_steps_per_second": 13.899, "step": 19000 }, { "epoch": 1.4652719021613723, "grad_norm": 0.6205143928527832, "learning_rate": 0.00010369062151001222, "loss": 0.8141, "step": 19050 }, { "epoch": 1.4691177601722945, "grad_norm": 1.1060974597930908, "learning_rate": 0.00010343090148819572, "loss": 0.7771, "step": 19100 }, { "epoch": 1.4729636181832166, "grad_norm": 0.7808921933174133, "learning_rate": 0.00010317118146637926, "loss": 0.8006, "step": 19150 }, { "epoch": 1.476809476194139, "grad_norm": 0.5509454011917114, "learning_rate": 0.00010291146144456277, "loss": 0.7922, "step": 19200 }, { "epoch": 1.480655334205061, "grad_norm": 1.2427464723587036, "learning_rate": 0.0001026517414227463, "loss": 0.7801, "step": 19250 }, { "epoch": 1.480655334205061, "eval_loss": 0.7850660085678101, "eval_runtime": 17.8458, "eval_samples_per_second": 56.036, "eval_steps_per_second": 14.009, "step": 19250 }, { "epoch": 1.4845011922159834, "grad_norm": 0.8217543959617615, "learning_rate": 0.00010239202140092981, "loss": 0.782, "step": 19300 }, { "epoch": 1.4883470502269056, "grad_norm": 0.6753976941108704, "learning_rate": 0.00010213230137911332, "loss": 0.7787, "step": 19350 }, { "epoch": 1.4921929082378278, "grad_norm": 1.0264923572540283, "learning_rate": 0.00010187258135729685, "loss": 0.7713, "step": 19400 }, { "epoch": 1.4960387662487502, "grad_norm": 0.7317076325416565, "learning_rate": 0.00010161286133548036, "loss": 0.7634, "step": 19450 }, { "epoch": 1.4998846242596724, "grad_norm": 0.6011013388633728, "learning_rate": 0.00010135314131366387, "loss": 0.7715, "step": 19500 }, { "epoch": 1.4998846242596724, "eval_loss": 0.7843549847602844, "eval_runtime": 17.919, "eval_samples_per_second": 55.807, "eval_steps_per_second": 13.952, "step": 19500 }, { "epoch": 1.5037304822705946, "grad_norm": 0.5136408805847168, "learning_rate": 0.0001010934212918474, "loss": 0.7533, "step": 19550 }, { "epoch": 1.5075763402815168, "grad_norm": 0.8944385647773743, "learning_rate": 0.00010083370127003091, "loss": 0.8483, "step": 19600 }, { "epoch": 1.511422198292439, "grad_norm": 0.9759209752082825, "learning_rate": 0.00010057398124821442, "loss": 0.8338, "step": 19650 }, { "epoch": 1.5152680563033614, "grad_norm": 0.7823068499565125, "learning_rate": 0.00010031426122639795, "loss": 0.7739, "step": 19700 }, { "epoch": 1.5191139143142836, "grad_norm": 0.9423583745956421, "learning_rate": 0.00010005454120458146, "loss": 0.8092, "step": 19750 }, { "epoch": 1.5191139143142836, "eval_loss": 0.782052218914032, "eval_runtime": 17.8818, "eval_samples_per_second": 55.923, "eval_steps_per_second": 13.981, "step": 19750 }, { "epoch": 1.5229597723252057, "grad_norm": 1.067384123802185, "learning_rate": 9.979482118276499e-05, "loss": 0.7581, "step": 19800 }, { "epoch": 1.5268056303361282, "grad_norm": 1.0136195421218872, "learning_rate": 9.95351011609485e-05, "loss": 0.7923, "step": 19850 }, { "epoch": 1.5306514883470501, "grad_norm": 0.8290442824363708, "learning_rate": 9.927538113913202e-05, "loss": 0.7991, "step": 19900 }, { "epoch": 1.5344973463579725, "grad_norm": 0.8200196623802185, "learning_rate": 9.901566111731554e-05, "loss": 0.7382, "step": 19950 }, { "epoch": 1.5383432043688947, "grad_norm": 0.7905208468437195, "learning_rate": 9.875594109549905e-05, "loss": 0.7615, "step": 20000 }, { "epoch": 1.5383432043688947, "eval_loss": 0.7793118953704834, "eval_runtime": 17.8391, "eval_samples_per_second": 56.057, "eval_steps_per_second": 14.014, "step": 20000 }, { "epoch": 1.542189062379817, "grad_norm": 0.8522188067436218, "learning_rate": 9.849622107368257e-05, "loss": 0.8041, "step": 20050 }, { "epoch": 1.5460349203907393, "grad_norm": 1.0029702186584473, "learning_rate": 9.82365010518661e-05, "loss": 0.7873, "step": 20100 }, { "epoch": 1.5498807784016613, "grad_norm": 1.007730484008789, "learning_rate": 9.797678103004962e-05, "loss": 0.7685, "step": 20150 }, { "epoch": 1.5537266364125837, "grad_norm": 0.6971302032470703, "learning_rate": 9.771706100823312e-05, "loss": 0.805, "step": 20200 }, { "epoch": 1.5575724944235059, "grad_norm": 0.9766409993171692, "learning_rate": 9.745734098641665e-05, "loss": 0.765, "step": 20250 }, { "epoch": 1.5575724944235059, "eval_loss": 0.7780515551567078, "eval_runtime": 17.9362, "eval_samples_per_second": 55.753, "eval_steps_per_second": 13.938, "step": 20250 }, { "epoch": 1.561418352434428, "grad_norm": 0.8611739873886108, "learning_rate": 9.719762096460017e-05, "loss": 0.7768, "step": 20300 }, { "epoch": 1.5652642104453505, "grad_norm": 0.9571949243545532, "learning_rate": 9.693790094278369e-05, "loss": 0.8106, "step": 20350 }, { "epoch": 1.5691100684562724, "grad_norm": 0.9941520094871521, "learning_rate": 9.66781809209672e-05, "loss": 0.7952, "step": 20400 }, { "epoch": 1.5729559264671948, "grad_norm": 0.7494879364967346, "learning_rate": 9.641846089915072e-05, "loss": 0.7882, "step": 20450 }, { "epoch": 1.576801784478117, "grad_norm": 0.25608131289482117, "learning_rate": 9.615874087733424e-05, "loss": 0.7424, "step": 20500 }, { "epoch": 1.576801784478117, "eval_loss": 0.7784542441368103, "eval_runtime": 17.9827, "eval_samples_per_second": 55.609, "eval_steps_per_second": 13.902, "step": 20500 }, { "epoch": 1.5806476424890392, "grad_norm": 0.781563401222229, "learning_rate": 9.589902085551775e-05, "loss": 0.7251, "step": 20550 }, { "epoch": 1.5844935004999616, "grad_norm": 0.8129003047943115, "learning_rate": 9.563930083370128e-05, "loss": 0.7783, "step": 20600 }, { "epoch": 1.5883393585108838, "grad_norm": 0.7955138087272644, "learning_rate": 9.537958081188479e-05, "loss": 0.7692, "step": 20650 }, { "epoch": 1.592185216521806, "grad_norm": 0.7752518653869629, "learning_rate": 9.511986079006832e-05, "loss": 0.7578, "step": 20700 }, { "epoch": 1.5960310745327284, "grad_norm": 0.7433210611343384, "learning_rate": 9.486014076825183e-05, "loss": 0.7885, "step": 20750 }, { "epoch": 1.5960310745327284, "eval_loss": 0.7757794260978699, "eval_runtime": 17.9593, "eval_samples_per_second": 55.682, "eval_steps_per_second": 13.92, "step": 20750 }, { "epoch": 1.5998769325436504, "grad_norm": 0.7218450903892517, "learning_rate": 9.460042074643534e-05, "loss": 0.8179, "step": 20800 }, { "epoch": 1.6037227905545728, "grad_norm": 0.8611409664154053, "learning_rate": 9.434070072461887e-05, "loss": 0.8166, "step": 20850 }, { "epoch": 1.607568648565495, "grad_norm": 0.8060470223426819, "learning_rate": 9.408098070280238e-05, "loss": 0.7831, "step": 20900 }, { "epoch": 1.6114145065764172, "grad_norm": 0.9832955002784729, "learning_rate": 9.382126068098591e-05, "loss": 0.7679, "step": 20950 }, { "epoch": 1.6152603645873396, "grad_norm": 0.7749195098876953, "learning_rate": 9.356154065916942e-05, "loss": 0.8001, "step": 21000 }, { "epoch": 1.6152603645873396, "eval_loss": 0.7739425301551819, "eval_runtime": 17.9752, "eval_samples_per_second": 55.632, "eval_steps_per_second": 13.908, "step": 21000 }, { "epoch": 1.6191062225982615, "grad_norm": 0.8098833560943604, "learning_rate": 9.330182063735295e-05, "loss": 0.7673, "step": 21050 }, { "epoch": 1.622952080609184, "grad_norm": 0.7867510318756104, "learning_rate": 9.304210061553646e-05, "loss": 0.7619, "step": 21100 }, { "epoch": 1.6267979386201061, "grad_norm": 1.2399013042449951, "learning_rate": 9.278238059371997e-05, "loss": 0.8664, "step": 21150 }, { "epoch": 1.6306437966310283, "grad_norm": 0.6171821355819702, "learning_rate": 9.25226605719035e-05, "loss": 0.8543, "step": 21200 }, { "epoch": 1.6344896546419507, "grad_norm": 0.7197456955909729, "learning_rate": 9.226294055008701e-05, "loss": 0.7989, "step": 21250 }, { "epoch": 1.6344896546419507, "eval_loss": 0.7736220955848694, "eval_runtime": 18.0274, "eval_samples_per_second": 55.471, "eval_steps_per_second": 13.868, "step": 21250 }, { "epoch": 1.6383355126528727, "grad_norm": 0.6752798557281494, "learning_rate": 9.200322052827052e-05, "loss": 0.7925, "step": 21300 }, { "epoch": 1.642181370663795, "grad_norm": 0.7389090061187744, "learning_rate": 9.174350050645405e-05, "loss": 0.762, "step": 21350 }, { "epoch": 1.6460272286747173, "grad_norm": 0.7688984870910645, "learning_rate": 9.148378048463756e-05, "loss": 0.771, "step": 21400 }, { "epoch": 1.6498730866856395, "grad_norm": 0.7231914401054382, "learning_rate": 9.122406046282108e-05, "loss": 0.8244, "step": 21450 }, { "epoch": 1.6537189446965619, "grad_norm": 0.786527156829834, "learning_rate": 9.09643404410046e-05, "loss": 0.7738, "step": 21500 }, { "epoch": 1.6537189446965619, "eval_loss": 0.77164626121521, "eval_runtime": 17.9322, "eval_samples_per_second": 55.765, "eval_steps_per_second": 13.941, "step": 21500 }, { "epoch": 1.657564802707484, "grad_norm": 0.8676180243492126, "learning_rate": 9.070462041918813e-05, "loss": 0.8113, "step": 21550 }, { "epoch": 1.6614106607184063, "grad_norm": 0.6232962012290955, "learning_rate": 9.044490039737163e-05, "loss": 0.7847, "step": 21600 }, { "epoch": 1.6652565187293287, "grad_norm": 1.151957631111145, "learning_rate": 9.018518037555515e-05, "loss": 0.7924, "step": 21650 }, { "epoch": 1.6691023767402506, "grad_norm": 0.8149247169494629, "learning_rate": 8.992546035373868e-05, "loss": 0.7395, "step": 21700 }, { "epoch": 1.672948234751173, "grad_norm": 1.0983916521072388, "learning_rate": 8.96657403319222e-05, "loss": 0.7715, "step": 21750 }, { "epoch": 1.672948234751173, "eval_loss": 0.7681905031204224, "eval_runtime": 17.9279, "eval_samples_per_second": 55.779, "eval_steps_per_second": 13.945, "step": 21750 }, { "epoch": 1.6767940927620952, "grad_norm": 0.7059574723243713, "learning_rate": 8.94060203101057e-05, "loss": 0.784, "step": 21800 }, { "epoch": 1.6806399507730174, "grad_norm": 1.0587022304534912, "learning_rate": 8.914630028828923e-05, "loss": 0.7608, "step": 21850 }, { "epoch": 1.6844858087839398, "grad_norm": 0.7638582587242126, "learning_rate": 8.888658026647275e-05, "loss": 0.8159, "step": 21900 }, { "epoch": 1.6883316667948618, "grad_norm": 0.5783549547195435, "learning_rate": 8.862686024465626e-05, "loss": 0.7959, "step": 21950 }, { "epoch": 1.6921775248057842, "grad_norm": 1.2192896604537964, "learning_rate": 8.836714022283978e-05, "loss": 0.8272, "step": 22000 }, { "epoch": 1.6921775248057842, "eval_loss": 0.7701475024223328, "eval_runtime": 17.8145, "eval_samples_per_second": 56.134, "eval_steps_per_second": 14.034, "step": 22000 }, { "epoch": 1.6960233828167064, "grad_norm": 0.5081881880760193, "learning_rate": 8.810742020102331e-05, "loss": 0.7883, "step": 22050 }, { "epoch": 1.6998692408276286, "grad_norm": 0.5658268332481384, "learning_rate": 8.784770017920681e-05, "loss": 0.77, "step": 22100 }, { "epoch": 1.703715098838551, "grad_norm": 0.6888287663459778, "learning_rate": 8.758798015739034e-05, "loss": 0.8013, "step": 22150 }, { "epoch": 1.707560956849473, "grad_norm": 1.0599181652069092, "learning_rate": 8.732826013557386e-05, "loss": 0.7649, "step": 22200 }, { "epoch": 1.7114068148603954, "grad_norm": 0.7679368257522583, "learning_rate": 8.706854011375736e-05, "loss": 0.7338, "step": 22250 }, { "epoch": 1.7114068148603954, "eval_loss": 0.7665286660194397, "eval_runtime": 17.8822, "eval_samples_per_second": 55.921, "eval_steps_per_second": 13.98, "step": 22250 }, { "epoch": 1.7152526728713176, "grad_norm": 0.6636808514595032, "learning_rate": 8.680882009194089e-05, "loss": 0.7949, "step": 22300 }, { "epoch": 1.7190985308822397, "grad_norm": 0.7327821254730225, "learning_rate": 8.654910007012442e-05, "loss": 0.7484, "step": 22350 }, { "epoch": 1.7229443888931621, "grad_norm": 0.8187472224235535, "learning_rate": 8.628938004830793e-05, "loss": 0.7482, "step": 22400 }, { "epoch": 1.7267902469040843, "grad_norm": 0.4527030289173126, "learning_rate": 8.602966002649144e-05, "loss": 0.7902, "step": 22450 }, { "epoch": 1.7306361049150065, "grad_norm": 0.6475220918655396, "learning_rate": 8.576994000467497e-05, "loss": 0.7998, "step": 22500 }, { "epoch": 1.7306361049150065, "eval_loss": 0.7646552920341492, "eval_runtime": 17.8108, "eval_samples_per_second": 56.146, "eval_steps_per_second": 14.036, "step": 22500 }, { "epoch": 1.734481962925929, "grad_norm": 0.538769543170929, "learning_rate": 8.551021998285848e-05, "loss": 0.7658, "step": 22550 }, { "epoch": 1.738327820936851, "grad_norm": 0.629510223865509, "learning_rate": 8.5250499961042e-05, "loss": 0.7885, "step": 22600 }, { "epoch": 1.7421736789477733, "grad_norm": 0.6914022564888, "learning_rate": 8.499077993922552e-05, "loss": 0.7648, "step": 22650 }, { "epoch": 1.7460195369586955, "grad_norm": 0.5563036799430847, "learning_rate": 8.473105991740903e-05, "loss": 0.7558, "step": 22700 }, { "epoch": 1.7498653949696177, "grad_norm": 0.7851826548576355, "learning_rate": 8.447133989559256e-05, "loss": 0.7961, "step": 22750 }, { "epoch": 1.7498653949696177, "eval_loss": 0.7664644718170166, "eval_runtime": 17.9733, "eval_samples_per_second": 55.638, "eval_steps_per_second": 13.91, "step": 22750 }, { "epoch": 1.75371125298054, "grad_norm": 0.48695698380470276, "learning_rate": 8.421161987377607e-05, "loss": 0.8264, "step": 22800 }, { "epoch": 1.757557110991462, "grad_norm": 0.8053486347198486, "learning_rate": 8.39518998519596e-05, "loss": 0.8084, "step": 22850 }, { "epoch": 1.7614029690023845, "grad_norm": 1.1373741626739502, "learning_rate": 8.369217983014311e-05, "loss": 0.7842, "step": 22900 }, { "epoch": 1.7652488270133067, "grad_norm": 1.1318634748458862, "learning_rate": 8.343245980832662e-05, "loss": 0.7727, "step": 22950 }, { "epoch": 1.7690946850242288, "grad_norm": 0.8140521049499512, "learning_rate": 8.317273978651015e-05, "loss": 0.7945, "step": 23000 }, { "epoch": 1.7690946850242288, "eval_loss": 0.7635026574134827, "eval_runtime": 17.8969, "eval_samples_per_second": 55.875, "eval_steps_per_second": 13.969, "step": 23000 }, { "epoch": 1.7729405430351513, "grad_norm": 0.7365099787712097, "learning_rate": 8.291301976469366e-05, "loss": 0.7915, "step": 23050 }, { "epoch": 1.7767864010460732, "grad_norm": 0.7158268690109253, "learning_rate": 8.265329974287718e-05, "loss": 0.8089, "step": 23100 }, { "epoch": 1.7806322590569956, "grad_norm": 0.7917172312736511, "learning_rate": 8.23935797210607e-05, "loss": 0.7881, "step": 23150 }, { "epoch": 1.7844781170679178, "grad_norm": 0.9002280831336975, "learning_rate": 8.213385969924422e-05, "loss": 0.7861, "step": 23200 }, { "epoch": 1.78832397507884, "grad_norm": 1.118498682975769, "learning_rate": 8.187413967742774e-05, "loss": 0.7787, "step": 23250 }, { "epoch": 1.78832397507884, "eval_loss": 0.7597912549972534, "eval_runtime": 18.0126, "eval_samples_per_second": 55.517, "eval_steps_per_second": 13.879, "step": 23250 }, { "epoch": 1.7921698330897624, "grad_norm": 0.555014967918396, "learning_rate": 8.161441965561125e-05, "loss": 0.7698, "step": 23300 }, { "epoch": 1.7960156911006846, "grad_norm": 0.7749983072280884, "learning_rate": 8.135469963379477e-05, "loss": 0.7976, "step": 23350 }, { "epoch": 1.7998615491116068, "grad_norm": 0.8833787441253662, "learning_rate": 8.10949796119783e-05, "loss": 0.7901, "step": 23400 }, { "epoch": 1.8037074071225292, "grad_norm": 1.099992036819458, "learning_rate": 8.08352595901618e-05, "loss": 0.7843, "step": 23450 }, { "epoch": 1.8075532651334512, "grad_norm": 0.7118529677391052, "learning_rate": 8.057553956834533e-05, "loss": 0.7818, "step": 23500 }, { "epoch": 1.8075532651334512, "eval_loss": 0.7594859600067139, "eval_runtime": 17.8501, "eval_samples_per_second": 56.022, "eval_steps_per_second": 14.005, "step": 23500 }, { "epoch": 1.8113991231443736, "grad_norm": 0.8289865851402283, "learning_rate": 8.031581954652885e-05, "loss": 0.7899, "step": 23550 }, { "epoch": 1.8152449811552958, "grad_norm": 1.1237398386001587, "learning_rate": 8.005609952471237e-05, "loss": 0.8005, "step": 23600 }, { "epoch": 1.819090839166218, "grad_norm": 0.8594374060630798, "learning_rate": 7.979637950289588e-05, "loss": 0.7765, "step": 23650 }, { "epoch": 1.8229366971771404, "grad_norm": 0.754634439945221, "learning_rate": 7.95366594810794e-05, "loss": 0.7794, "step": 23700 }, { "epoch": 1.8267825551880623, "grad_norm": 1.0647647380828857, "learning_rate": 7.927693945926292e-05, "loss": 0.7681, "step": 23750 }, { "epoch": 1.8267825551880623, "eval_loss": 0.7575324773788452, "eval_runtime": 17.8657, "eval_samples_per_second": 55.973, "eval_steps_per_second": 13.993, "step": 23750 }, { "epoch": 1.8306284131989847, "grad_norm": 1.1255161762237549, "learning_rate": 7.901721943744644e-05, "loss": 0.7551, "step": 23800 }, { "epoch": 1.834474271209907, "grad_norm": 0.8209452629089355, "learning_rate": 7.875749941562995e-05, "loss": 0.8091, "step": 23850 }, { "epoch": 1.838320129220829, "grad_norm": 0.40779542922973633, "learning_rate": 7.849777939381348e-05, "loss": 0.7572, "step": 23900 }, { "epoch": 1.8421659872317515, "grad_norm": 0.9186558127403259, "learning_rate": 7.823805937199699e-05, "loss": 0.7681, "step": 23950 }, { "epoch": 1.8460118452426735, "grad_norm": 0.4896409213542938, "learning_rate": 7.79783393501805e-05, "loss": 0.7989, "step": 24000 }, { "epoch": 1.8460118452426735, "eval_loss": 0.7566245198249817, "eval_runtime": 17.9055, "eval_samples_per_second": 55.849, "eval_steps_per_second": 13.962, "step": 24000 }, { "epoch": 1.8498577032535959, "grad_norm": 1.1524229049682617, "learning_rate": 7.771861932836403e-05, "loss": 0.7732, "step": 24050 }, { "epoch": 1.853703561264518, "grad_norm": 0.4653956890106201, "learning_rate": 7.745889930654755e-05, "loss": 0.7961, "step": 24100 }, { "epoch": 1.8575494192754403, "grad_norm": 0.8423280119895935, "learning_rate": 7.719917928473105e-05, "loss": 0.7963, "step": 24150 }, { "epoch": 1.8613952772863627, "grad_norm": 0.6979435086250305, "learning_rate": 7.693945926291458e-05, "loss": 0.7816, "step": 24200 }, { "epoch": 1.8652411352972849, "grad_norm": 0.7800914645195007, "learning_rate": 7.66797392410981e-05, "loss": 0.7563, "step": 24250 }, { "epoch": 1.8652411352972849, "eval_loss": 0.7574715614318848, "eval_runtime": 17.9043, "eval_samples_per_second": 55.852, "eval_steps_per_second": 13.963, "step": 24250 }, { "epoch": 1.869086993308207, "grad_norm": 0.9678017497062683, "learning_rate": 7.642001921928162e-05, "loss": 0.7689, "step": 24300 }, { "epoch": 1.8729328513191295, "grad_norm": 0.454647421836853, "learning_rate": 7.616029919746513e-05, "loss": 0.7231, "step": 24350 }, { "epoch": 1.8767787093300514, "grad_norm": 0.7899460792541504, "learning_rate": 7.590057917564866e-05, "loss": 0.7471, "step": 24400 }, { "epoch": 1.8806245673409738, "grad_norm": 1.1373926401138306, "learning_rate": 7.564085915383217e-05, "loss": 0.8342, "step": 24450 }, { "epoch": 1.884470425351896, "grad_norm": 1.4272133111953735, "learning_rate": 7.538113913201568e-05, "loss": 0.7585, "step": 24500 }, { "epoch": 1.884470425351896, "eval_loss": 0.7536496520042419, "eval_runtime": 17.9401, "eval_samples_per_second": 55.741, "eval_steps_per_second": 13.935, "step": 24500 }, { "epoch": 1.8883162833628182, "grad_norm": 0.8080185055732727, "learning_rate": 7.512141911019921e-05, "loss": 0.8067, "step": 24550 }, { "epoch": 1.8921621413737406, "grad_norm": 0.5850221514701843, "learning_rate": 7.486169908838274e-05, "loss": 0.7455, "step": 24600 }, { "epoch": 1.8960079993846626, "grad_norm": 0.7521384954452515, "learning_rate": 7.460197906656624e-05, "loss": 0.7604, "step": 24650 }, { "epoch": 1.899853857395585, "grad_norm": 0.6376401782035828, "learning_rate": 7.434225904474976e-05, "loss": 0.7485, "step": 24700 }, { "epoch": 1.9036997154065072, "grad_norm": 0.6305235624313354, "learning_rate": 7.408253902293329e-05, "loss": 0.7567, "step": 24750 }, { "epoch": 1.9036997154065072, "eval_loss": 0.7527515888214111, "eval_runtime": 17.967, "eval_samples_per_second": 55.658, "eval_steps_per_second": 13.914, "step": 24750 }, { "epoch": 1.9075455734174294, "grad_norm": 0.5975210666656494, "learning_rate": 7.382281900111679e-05, "loss": 0.7855, "step": 24800 }, { "epoch": 1.9113914314283518, "grad_norm": 0.41196370124816895, "learning_rate": 7.356309897930031e-05, "loss": 0.7711, "step": 24850 }, { "epoch": 1.9152372894392737, "grad_norm": 1.1755207777023315, "learning_rate": 7.330337895748384e-05, "loss": 0.8114, "step": 24900 }, { "epoch": 1.9190831474501961, "grad_norm": 0.37193945050239563, "learning_rate": 7.304365893566735e-05, "loss": 0.7437, "step": 24950 }, { "epoch": 1.9229290054611183, "grad_norm": 0.6753848195075989, "learning_rate": 7.278393891385087e-05, "loss": 0.8289, "step": 25000 }, { "epoch": 1.9229290054611183, "eval_loss": 0.7534742951393127, "eval_runtime": 17.8978, "eval_samples_per_second": 55.873, "eval_steps_per_second": 13.968, "step": 25000 }, { "epoch": 1.9267748634720405, "grad_norm": 0.6314563751220703, "learning_rate": 7.25242188920344e-05, "loss": 0.8278, "step": 25050 }, { "epoch": 1.930620721482963, "grad_norm": 0.38249602913856506, "learning_rate": 7.22644988702179e-05, "loss": 0.7948, "step": 25100 }, { "epoch": 1.9344665794938851, "grad_norm": 0.8241211771965027, "learning_rate": 7.200477884840142e-05, "loss": 0.8152, "step": 25150 }, { "epoch": 1.9383124375048073, "grad_norm": 0.4248273968696594, "learning_rate": 7.175025322702127e-05, "loss": 0.8083, "step": 25200 }, { "epoch": 1.9421582955157297, "grad_norm": 1.0574986934661865, "learning_rate": 7.14905332052048e-05, "loss": 0.7644, "step": 25250 }, { "epoch": 1.9421582955157297, "eval_loss": 0.7520478963851929, "eval_runtime": 17.9125, "eval_samples_per_second": 55.827, "eval_steps_per_second": 13.957, "step": 25250 }, { "epoch": 1.9460041535266517, "grad_norm": 0.957831084728241, "learning_rate": 7.123081318338831e-05, "loss": 0.8145, "step": 25300 }, { "epoch": 1.949850011537574, "grad_norm": 0.7300383448600769, "learning_rate": 7.097109316157184e-05, "loss": 0.8021, "step": 25350 }, { "epoch": 1.9536958695484963, "grad_norm": 0.6103696227073669, "learning_rate": 7.071137313975535e-05, "loss": 0.7561, "step": 25400 }, { "epoch": 1.9575417275594185, "grad_norm": 0.9353188276290894, "learning_rate": 7.045165311793886e-05, "loss": 0.76, "step": 25450 }, { "epoch": 1.9613875855703409, "grad_norm": 0.7097103595733643, "learning_rate": 7.019193309612239e-05, "loss": 0.8125, "step": 25500 }, { "epoch": 1.9613875855703409, "eval_loss": 0.7521655559539795, "eval_runtime": 17.8845, "eval_samples_per_second": 55.914, "eval_steps_per_second": 13.979, "step": 25500 }, { "epoch": 1.9652334435812628, "grad_norm": 1.2154541015625, "learning_rate": 6.99322130743059e-05, "loss": 0.7644, "step": 25550 }, { "epoch": 1.9690793015921852, "grad_norm": 0.715004026889801, "learning_rate": 6.967249305248941e-05, "loss": 0.7235, "step": 25600 }, { "epoch": 1.9729251596031074, "grad_norm": 0.44530218839645386, "learning_rate": 6.941277303067294e-05, "loss": 0.7591, "step": 25650 }, { "epoch": 1.9767710176140296, "grad_norm": 0.7103247046470642, "learning_rate": 6.915305300885647e-05, "loss": 0.7203, "step": 25700 }, { "epoch": 1.980616875624952, "grad_norm": 0.6260993480682373, "learning_rate": 6.88985273874763e-05, "loss": 0.8008, "step": 25750 }, { "epoch": 1.980616875624952, "eval_loss": 0.7489978075027466, "eval_runtime": 17.8489, "eval_samples_per_second": 56.026, "eval_steps_per_second": 14.006, "step": 25750 }, { "epoch": 1.984462733635874, "grad_norm": 0.8690526485443115, "learning_rate": 6.863880736565982e-05, "loss": 0.7445, "step": 25800 }, { "epoch": 1.9883085916467964, "grad_norm": 0.8287826776504517, "learning_rate": 6.837908734384334e-05, "loss": 0.7606, "step": 25850 }, { "epoch": 1.9921544496577186, "grad_norm": 0.9105169773101807, "learning_rate": 6.811936732202686e-05, "loss": 0.7742, "step": 25900 }, { "epoch": 1.9960003076686408, "grad_norm": 0.6223366856575012, "learning_rate": 6.785964730021037e-05, "loss": 0.8069, "step": 25950 }, { "epoch": 1.9998461656795632, "grad_norm": 0.848816454410553, "learning_rate": 6.75999272783939e-05, "loss": 0.8126, "step": 26000 }, { "epoch": 1.9998461656795632, "eval_loss": 0.7482460737228394, "eval_runtime": 17.8823, "eval_samples_per_second": 55.921, "eval_steps_per_second": 13.98, "step": 26000 }, { "epoch": 2.003692023690485, "grad_norm": 0.706822395324707, "learning_rate": 6.734020725657741e-05, "loss": 0.7, "step": 26050 }, { "epoch": 2.0075378817014076, "grad_norm": 1.503631830215454, "learning_rate": 6.708048723476093e-05, "loss": 0.7347, "step": 26100 }, { "epoch": 2.01138373971233, "grad_norm": 0.8511216044425964, "learning_rate": 6.682076721294445e-05, "loss": 0.6917, "step": 26150 }, { "epoch": 2.015229597723252, "grad_norm": 0.7063366174697876, "learning_rate": 6.656104719112797e-05, "loss": 0.7188, "step": 26200 }, { "epoch": 2.0190754557341744, "grad_norm": 0.7650218605995178, "learning_rate": 6.630132716931149e-05, "loss": 0.701, "step": 26250 }, { "epoch": 2.0190754557341744, "eval_loss": 0.7444872856140137, "eval_runtime": 17.8222, "eval_samples_per_second": 56.11, "eval_steps_per_second": 14.027, "step": 26250 }, { "epoch": 2.0229213137450963, "grad_norm": 0.7015202045440674, "learning_rate": 6.6041607147495e-05, "loss": 0.7442, "step": 26300 }, { "epoch": 2.0267671717560187, "grad_norm": 0.826304018497467, "learning_rate": 6.578188712567853e-05, "loss": 0.7003, "step": 26350 }, { "epoch": 2.030613029766941, "grad_norm": 0.5597676038742065, "learning_rate": 6.552216710386204e-05, "loss": 0.7108, "step": 26400 }, { "epoch": 2.034458887777863, "grad_norm": 0.738636314868927, "learning_rate": 6.526244708204557e-05, "loss": 0.7407, "step": 26450 }, { "epoch": 2.0383047457887855, "grad_norm": 0.6629013419151306, "learning_rate": 6.500272706022908e-05, "loss": 0.7227, "step": 26500 }, { "epoch": 2.0383047457887855, "eval_loss": 0.7432363033294678, "eval_runtime": 17.8543, "eval_samples_per_second": 56.009, "eval_steps_per_second": 14.002, "step": 26500 }, { "epoch": 2.042150603799708, "grad_norm": 0.5691978931427002, "learning_rate": 6.474300703841259e-05, "loss": 0.6752, "step": 26550 }, { "epoch": 2.04599646181063, "grad_norm": 0.5654874444007874, "learning_rate": 6.448328701659612e-05, "loss": 0.7282, "step": 26600 }, { "epoch": 2.0498423198215523, "grad_norm": 1.0112574100494385, "learning_rate": 6.422356699477963e-05, "loss": 0.7343, "step": 26650 }, { "epoch": 2.0536881778324743, "grad_norm": 0.2508319318294525, "learning_rate": 6.396384697296314e-05, "loss": 0.6796, "step": 26700 }, { "epoch": 2.0575340358433967, "grad_norm": 1.014090895652771, "learning_rate": 6.370412695114667e-05, "loss": 0.7276, "step": 26750 }, { "epoch": 2.0575340358433967, "eval_loss": 0.7409418821334839, "eval_runtime": 18.0076, "eval_samples_per_second": 55.532, "eval_steps_per_second": 13.883, "step": 26750 }, { "epoch": 2.061379893854319, "grad_norm": 0.6783422827720642, "learning_rate": 6.344440692933018e-05, "loss": 0.7233, "step": 26800 }, { "epoch": 2.065225751865241, "grad_norm": 0.9281295537948608, "learning_rate": 6.31846869075137e-05, "loss": 0.7463, "step": 26850 }, { "epoch": 2.0690716098761635, "grad_norm": 0.9024075269699097, "learning_rate": 6.293016128613355e-05, "loss": 0.7499, "step": 26900 }, { "epoch": 2.0729174678870854, "grad_norm": 0.9747761487960815, "learning_rate": 6.267044126431707e-05, "loss": 0.7736, "step": 26950 }, { "epoch": 2.076763325898008, "grad_norm": 0.9101582169532776, "learning_rate": 6.241072124250059e-05, "loss": 0.7343, "step": 27000 }, { "epoch": 2.076763325898008, "eval_loss": 0.7426913380622864, "eval_runtime": 18.0363, "eval_samples_per_second": 55.444, "eval_steps_per_second": 13.861, "step": 27000 }, { "epoch": 2.0806091839089302, "grad_norm": 0.4836456775665283, "learning_rate": 6.21510012206841e-05, "loss": 0.7725, "step": 27050 }, { "epoch": 2.084455041919852, "grad_norm": 0.9285927414894104, "learning_rate": 6.189128119886762e-05, "loss": 0.7167, "step": 27100 }, { "epoch": 2.0883008999307746, "grad_norm": 0.8826911449432373, "learning_rate": 6.163156117705115e-05, "loss": 0.7204, "step": 27150 }, { "epoch": 2.0921467579416966, "grad_norm": 1.0057801008224487, "learning_rate": 6.137184115523465e-05, "loss": 0.713, "step": 27200 }, { "epoch": 2.095992615952619, "grad_norm": 1.2974227666854858, "learning_rate": 6.111212113341818e-05, "loss": 0.6974, "step": 27250 }, { "epoch": 2.095992615952619, "eval_loss": 0.7402239441871643, "eval_runtime": 17.8723, "eval_samples_per_second": 55.953, "eval_steps_per_second": 13.988, "step": 27250 }, { "epoch": 2.0998384739635414, "grad_norm": 0.7047484517097473, "learning_rate": 6.0852401111601696e-05, "loss": 0.7377, "step": 27300 }, { "epoch": 2.1036843319744634, "grad_norm": 1.0780186653137207, "learning_rate": 6.0592681089785216e-05, "loss": 0.7153, "step": 27350 }, { "epoch": 2.1075301899853858, "grad_norm": 0.6844059228897095, "learning_rate": 6.033296106796873e-05, "loss": 0.7077, "step": 27400 }, { "epoch": 2.111376047996308, "grad_norm": 0.7109357118606567, "learning_rate": 6.007324104615225e-05, "loss": 0.6574, "step": 27450 }, { "epoch": 2.11522190600723, "grad_norm": 1.0174553394317627, "learning_rate": 5.9813521024335775e-05, "loss": 0.7269, "step": 27500 }, { "epoch": 2.11522190600723, "eval_loss": 0.7388279438018799, "eval_runtime": 17.9018, "eval_samples_per_second": 55.86, "eval_steps_per_second": 13.965, "step": 27500 }, { "epoch": 2.1190677640181526, "grad_norm": 0.8702675104141235, "learning_rate": 5.955380100251928e-05, "loss": 0.7034, "step": 27550 }, { "epoch": 2.1229136220290745, "grad_norm": 0.9476292729377747, "learning_rate": 5.929408098070281e-05, "loss": 0.7075, "step": 27600 }, { "epoch": 2.126759480039997, "grad_norm": 0.7763323783874512, "learning_rate": 5.903436095888633e-05, "loss": 0.7401, "step": 27650 }, { "epoch": 2.1306053380509193, "grad_norm": 0.9647789001464844, "learning_rate": 5.8774640937069847e-05, "loss": 0.7308, "step": 27700 }, { "epoch": 2.1344511960618413, "grad_norm": 0.7125420570373535, "learning_rate": 5.851492091525336e-05, "loss": 0.7337, "step": 27750 }, { "epoch": 2.1344511960618413, "eval_loss": 0.736056387424469, "eval_runtime": 18.0862, "eval_samples_per_second": 55.291, "eval_steps_per_second": 13.823, "step": 27750 }, { "epoch": 2.1382970540727637, "grad_norm": 1.1053630113601685, "learning_rate": 5.825520089343688e-05, "loss": 0.7434, "step": 27800 }, { "epoch": 2.1421429120836857, "grad_norm": 0.751206636428833, "learning_rate": 5.79954808716204e-05, "loss": 0.7208, "step": 27850 }, { "epoch": 2.145988770094608, "grad_norm": 1.0049408674240112, "learning_rate": 5.773576084980391e-05, "loss": 0.7265, "step": 27900 }, { "epoch": 2.1498346281055305, "grad_norm": 0.9738804697990417, "learning_rate": 5.747604082798743e-05, "loss": 0.7515, "step": 27950 }, { "epoch": 2.1536804861164525, "grad_norm": 0.7807592153549194, "learning_rate": 5.721632080617095e-05, "loss": 0.7309, "step": 28000 }, { "epoch": 2.1536804861164525, "eval_loss": 0.736193835735321, "eval_runtime": 17.9436, "eval_samples_per_second": 55.73, "eval_steps_per_second": 13.933, "step": 28000 }, { "epoch": 2.157526344127375, "grad_norm": 0.9337176084518433, "learning_rate": 5.6956600784354464e-05, "loss": 0.7844, "step": 28050 }, { "epoch": 2.1613722021382973, "grad_norm": 0.7867174744606018, "learning_rate": 5.669688076253798e-05, "loss": 0.7634, "step": 28100 }, { "epoch": 2.1652180601492192, "grad_norm": 0.6526890397071838, "learning_rate": 5.643716074072151e-05, "loss": 0.743, "step": 28150 }, { "epoch": 2.1690639181601417, "grad_norm": 1.1720079183578491, "learning_rate": 5.617744071890503e-05, "loss": 0.7165, "step": 28200 }, { "epoch": 2.1729097761710636, "grad_norm": 1.029062032699585, "learning_rate": 5.5917720697088535e-05, "loss": 0.6854, "step": 28250 }, { "epoch": 2.1729097761710636, "eval_loss": 0.7313055396080017, "eval_runtime": 18.0992, "eval_samples_per_second": 55.251, "eval_steps_per_second": 13.813, "step": 28250 }, { "epoch": 2.176755634181986, "grad_norm": 0.821443498134613, "learning_rate": 5.565800067527206e-05, "loss": 0.7209, "step": 28300 }, { "epoch": 2.1806014921929084, "grad_norm": 0.6121924519538879, "learning_rate": 5.539828065345558e-05, "loss": 0.7371, "step": 28350 }, { "epoch": 2.1844473502038304, "grad_norm": 0.8862821459770203, "learning_rate": 5.5138560631639094e-05, "loss": 0.684, "step": 28400 }, { "epoch": 2.188293208214753, "grad_norm": 0.8110325336456299, "learning_rate": 5.4878840609822614e-05, "loss": 0.6843, "step": 28450 }, { "epoch": 2.192139066225675, "grad_norm": 0.732751727104187, "learning_rate": 5.461912058800613e-05, "loss": 0.7229, "step": 28500 }, { "epoch": 2.192139066225675, "eval_loss": 0.7322584986686707, "eval_runtime": 18.025, "eval_samples_per_second": 55.478, "eval_steps_per_second": 13.87, "step": 28500 }, { "epoch": 2.195984924236597, "grad_norm": 0.8648149371147156, "learning_rate": 5.435940056618965e-05, "loss": 0.6874, "step": 28550 }, { "epoch": 2.1998307822475196, "grad_norm": 0.6547895073890686, "learning_rate": 5.4099680544373166e-05, "loss": 0.7115, "step": 28600 }, { "epoch": 2.2036766402584416, "grad_norm": 0.7705133557319641, "learning_rate": 5.3839960522556685e-05, "loss": 0.7498, "step": 28650 }, { "epoch": 2.207522498269364, "grad_norm": 1.2679988145828247, "learning_rate": 5.358024050074021e-05, "loss": 0.6956, "step": 28700 }, { "epoch": 2.211368356280286, "grad_norm": 0.4992118775844574, "learning_rate": 5.332052047892372e-05, "loss": 0.7124, "step": 28750 }, { "epoch": 2.211368356280286, "eval_loss": 0.7316113710403442, "eval_runtime": 17.8507, "eval_samples_per_second": 56.02, "eval_steps_per_second": 14.005, "step": 28750 }, { "epoch": 2.2152142142912084, "grad_norm": 0.8134187459945679, "learning_rate": 5.306080045710724e-05, "loss": 0.733, "step": 28800 }, { "epoch": 2.2190600723021308, "grad_norm": 0.5038111805915833, "learning_rate": 5.2801080435290764e-05, "loss": 0.7239, "step": 28850 }, { "epoch": 2.2229059303130527, "grad_norm": 1.0252164602279663, "learning_rate": 5.254136041347427e-05, "loss": 0.7102, "step": 28900 }, { "epoch": 2.226751788323975, "grad_norm": 0.823451817035675, "learning_rate": 5.2281640391657796e-05, "loss": 0.709, "step": 28950 }, { "epoch": 2.230597646334897, "grad_norm": 0.7642868161201477, "learning_rate": 5.2021920369841316e-05, "loss": 0.7326, "step": 29000 }, { "epoch": 2.230597646334897, "eval_loss": 0.7284311652183533, "eval_runtime": 17.8949, "eval_samples_per_second": 55.882, "eval_steps_per_second": 13.97, "step": 29000 }, { "epoch": 2.2344435043458195, "grad_norm": 0.45148393511772156, "learning_rate": 5.1762200348024836e-05, "loss": 0.7068, "step": 29050 }, { "epoch": 2.238289362356742, "grad_norm": 0.4047794044017792, "learning_rate": 5.150248032620835e-05, "loss": 0.7015, "step": 29100 }, { "epoch": 2.242135220367664, "grad_norm": 0.8585315942764282, "learning_rate": 5.124276030439187e-05, "loss": 0.7028, "step": 29150 }, { "epoch": 2.2459810783785863, "grad_norm": 0.5665230751037598, "learning_rate": 5.098304028257539e-05, "loss": 0.658, "step": 29200 }, { "epoch": 2.2498269363895087, "grad_norm": 0.7042600512504578, "learning_rate": 5.07233202607589e-05, "loss": 0.6938, "step": 29250 }, { "epoch": 2.2498269363895087, "eval_loss": 0.7279431223869324, "eval_runtime": 17.8253, "eval_samples_per_second": 56.1, "eval_steps_per_second": 14.025, "step": 29250 }, { "epoch": 2.2536727944004307, "grad_norm": 0.7422949075698853, "learning_rate": 5.046360023894242e-05, "loss": 0.6996, "step": 29300 }, { "epoch": 2.257518652411353, "grad_norm": 1.0139210224151611, "learning_rate": 5.020388021712594e-05, "loss": 0.7311, "step": 29350 }, { "epoch": 2.261364510422275, "grad_norm": 0.5937057137489319, "learning_rate": 4.994416019530946e-05, "loss": 0.704, "step": 29400 }, { "epoch": 2.2652103684331975, "grad_norm": 1.064329981803894, "learning_rate": 4.968444017349297e-05, "loss": 0.6792, "step": 29450 }, { "epoch": 2.26905622644412, "grad_norm": 0.9638292789459229, "learning_rate": 4.94247201516765e-05, "loss": 0.74, "step": 29500 }, { "epoch": 2.26905622644412, "eval_loss": 0.7287396192550659, "eval_runtime": 17.8735, "eval_samples_per_second": 55.949, "eval_steps_per_second": 13.987, "step": 29500 }, { "epoch": 2.272902084455042, "grad_norm": 0.7172055840492249, "learning_rate": 4.916500012986001e-05, "loss": 0.6984, "step": 29550 }, { "epoch": 2.2767479424659642, "grad_norm": 0.6817266345024109, "learning_rate": 4.890528010804353e-05, "loss": 0.7059, "step": 29600 }, { "epoch": 2.280593800476886, "grad_norm": 1.0110056400299072, "learning_rate": 4.864556008622705e-05, "loss": 0.7304, "step": 29650 }, { "epoch": 2.2844396584878086, "grad_norm": 0.6823923587799072, "learning_rate": 4.838584006441057e-05, "loss": 0.7462, "step": 29700 }, { "epoch": 2.288285516498731, "grad_norm": 0.7316113710403442, "learning_rate": 4.812612004259408e-05, "loss": 0.7068, "step": 29750 }, { "epoch": 2.288285516498731, "eval_loss": 0.7273894548416138, "eval_runtime": 17.8859, "eval_samples_per_second": 55.91, "eval_steps_per_second": 13.978, "step": 29750 }, { "epoch": 2.292131374509653, "grad_norm": 1.200492024421692, "learning_rate": 4.78664000207776e-05, "loss": 0.6887, "step": 29800 }, { "epoch": 2.2959772325205754, "grad_norm": 1.2268471717834473, "learning_rate": 4.760667999896112e-05, "loss": 0.7259, "step": 29850 }, { "epoch": 2.299823090531498, "grad_norm": 0.7251473069190979, "learning_rate": 4.734695997714464e-05, "loss": 0.6931, "step": 29900 }, { "epoch": 2.3036689485424198, "grad_norm": 0.5327921509742737, "learning_rate": 4.708723995532816e-05, "loss": 0.7447, "step": 29950 }, { "epoch": 2.307514806553342, "grad_norm": 0.7111462950706482, "learning_rate": 4.6827519933511674e-05, "loss": 0.7221, "step": 30000 }, { "epoch": 2.307514806553342, "eval_loss": 0.7264938354492188, "eval_runtime": 17.7847, "eval_samples_per_second": 56.228, "eval_steps_per_second": 14.057, "step": 30000 }, { "epoch": 2.311360664564264, "grad_norm": 0.861571729183197, "learning_rate": 4.6567799911695194e-05, "loss": 0.7052, "step": 30050 }, { "epoch": 2.3152065225751866, "grad_norm": 0.9279738068580627, "learning_rate": 4.6308079889878714e-05, "loss": 0.715, "step": 30100 }, { "epoch": 2.3190523805861085, "grad_norm": 0.8576169013977051, "learning_rate": 4.6048359868062226e-05, "loss": 0.7125, "step": 30150 }, { "epoch": 2.322898238597031, "grad_norm": 1.3127994537353516, "learning_rate": 4.578863984624575e-05, "loss": 0.6703, "step": 30200 }, { "epoch": 2.3267440966079533, "grad_norm": 0.5930036306381226, "learning_rate": 4.5528919824429266e-05, "loss": 0.6927, "step": 30250 }, { "epoch": 2.3267440966079533, "eval_loss": 0.7252874970436096, "eval_runtime": 17.8506, "eval_samples_per_second": 56.021, "eval_steps_per_second": 14.005, "step": 30250 }, { "epoch": 2.3305899546188753, "grad_norm": 0.6445633769035339, "learning_rate": 4.5269199802612785e-05, "loss": 0.6804, "step": 30300 }, { "epoch": 2.3344358126297977, "grad_norm": 0.9251648783683777, "learning_rate": 4.5009479780796305e-05, "loss": 0.7226, "step": 30350 }, { "epoch": 2.33828167064072, "grad_norm": 1.2103322744369507, "learning_rate": 4.474975975897982e-05, "loss": 0.7179, "step": 30400 }, { "epoch": 2.342127528651642, "grad_norm": 1.0718954801559448, "learning_rate": 4.4490039737163344e-05, "loss": 0.7638, "step": 30450 }, { "epoch": 2.3459733866625645, "grad_norm": 0.8156006336212158, "learning_rate": 4.423031971534686e-05, "loss": 0.6733, "step": 30500 }, { "epoch": 2.3459733866625645, "eval_loss": 0.7244414687156677, "eval_runtime": 17.9373, "eval_samples_per_second": 55.75, "eval_steps_per_second": 13.937, "step": 30500 }, { "epoch": 2.3498192446734865, "grad_norm": 0.9593235850334167, "learning_rate": 4.3970599693530377e-05, "loss": 0.7266, "step": 30550 }, { "epoch": 2.353665102684409, "grad_norm": 0.4570913016796112, "learning_rate": 4.3710879671713896e-05, "loss": 0.7026, "step": 30600 }, { "epoch": 2.3575109606953313, "grad_norm": 0.8020208477973938, "learning_rate": 4.345115964989741e-05, "loss": 0.7345, "step": 30650 }, { "epoch": 2.3613568187062532, "grad_norm": 0.7795267701148987, "learning_rate": 4.319143962808093e-05, "loss": 0.7351, "step": 30700 }, { "epoch": 2.3652026767171757, "grad_norm": 0.6240664720535278, "learning_rate": 4.293171960626445e-05, "loss": 0.7029, "step": 30750 }, { "epoch": 2.3652026767171757, "eval_loss": 0.7251197099685669, "eval_runtime": 17.8892, "eval_samples_per_second": 55.9, "eval_steps_per_second": 13.975, "step": 30750 }, { "epoch": 2.3690485347280976, "grad_norm": 0.773654043674469, "learning_rate": 4.267199958444797e-05, "loss": 0.7105, "step": 30800 }, { "epoch": 2.37289439273902, "grad_norm": 1.1365927457809448, "learning_rate": 4.241227956263149e-05, "loss": 0.7019, "step": 30850 }, { "epoch": 2.3767402507499424, "grad_norm": 0.6990851759910583, "learning_rate": 4.2152559540815e-05, "loss": 0.6914, "step": 30900 }, { "epoch": 2.3805861087608644, "grad_norm": 0.8598945140838623, "learning_rate": 4.189283951899852e-05, "loss": 0.7087, "step": 30950 }, { "epoch": 2.384431966771787, "grad_norm": 0.9121548533439636, "learning_rate": 4.163311949718204e-05, "loss": 0.7212, "step": 31000 }, { "epoch": 2.384431966771787, "eval_loss": 0.7226839661598206, "eval_runtime": 17.8913, "eval_samples_per_second": 55.893, "eval_steps_per_second": 13.973, "step": 31000 }, { "epoch": 2.3882778247827092, "grad_norm": 0.6950593590736389, "learning_rate": 4.137339947536556e-05, "loss": 0.7201, "step": 31050 }, { "epoch": 2.392123682793631, "grad_norm": 0.7376019358634949, "learning_rate": 4.111367945354908e-05, "loss": 0.7157, "step": 31100 }, { "epoch": 2.3959695408045536, "grad_norm": 1.286970853805542, "learning_rate": 4.08539594317326e-05, "loss": 0.7325, "step": 31150 }, { "epoch": 2.3998153988154756, "grad_norm": 0.5061975121498108, "learning_rate": 4.059423940991611e-05, "loss": 0.7206, "step": 31200 }, { "epoch": 2.403661256826398, "grad_norm": 0.7503495216369629, "learning_rate": 4.033451938809963e-05, "loss": 0.7197, "step": 31250 }, { "epoch": 2.403661256826398, "eval_loss": 0.7893036007881165, "eval_runtime": 17.9552, "eval_samples_per_second": 55.694, "eval_steps_per_second": 13.924, "step": 31250 }, { "epoch": 2.4075071148373204, "grad_norm": 0.6520366668701172, "learning_rate": 4.007479936628315e-05, "loss": 0.7496, "step": 31300 }, { "epoch": 2.4113529728482423, "grad_norm": 0.7475297451019287, "learning_rate": 3.981507934446666e-05, "loss": 0.7283, "step": 31350 }, { "epoch": 2.4151988308591648, "grad_norm": 1.0714281797409058, "learning_rate": 3.955535932265019e-05, "loss": 0.7553, "step": 31400 }, { "epoch": 2.4190446888700867, "grad_norm": 0.6734263300895691, "learning_rate": 3.92956393008337e-05, "loss": 0.698, "step": 31450 }, { "epoch": 2.422890546881009, "grad_norm": 0.7820257544517517, "learning_rate": 3.903591927901722e-05, "loss": 0.7346, "step": 31500 }, { "epoch": 2.422890546881009, "eval_loss": 0.7877212762832642, "eval_runtime": 17.9806, "eval_samples_per_second": 55.616, "eval_steps_per_second": 13.904, "step": 31500 }, { "epoch": 2.4267364048919315, "grad_norm": 0.9221381545066833, "learning_rate": 3.877619925720074e-05, "loss": 0.7077, "step": 31550 }, { "epoch": 2.4305822629028535, "grad_norm": 1.1155864000320435, "learning_rate": 3.8516479235384255e-05, "loss": 0.7358, "step": 31600 }, { "epoch": 2.434428120913776, "grad_norm": 0.949946939945221, "learning_rate": 3.825675921356778e-05, "loss": 0.7314, "step": 31650 }, { "epoch": 2.4382739789246983, "grad_norm": 0.7200281023979187, "learning_rate": 3.7997039191751294e-05, "loss": 0.6781, "step": 31700 }, { "epoch": 2.4421198369356203, "grad_norm": 1.1924189329147339, "learning_rate": 3.773731916993481e-05, "loss": 0.7116, "step": 31750 }, { "epoch": 2.4421198369356203, "eval_loss": 0.7858553528785706, "eval_runtime": 18.0342, "eval_samples_per_second": 55.45, "eval_steps_per_second": 13.863, "step": 31750 }, { "epoch": 2.4459656949465427, "grad_norm": 0.7993971109390259, "learning_rate": 3.747759914811833e-05, "loss": 0.7068, "step": 31800 }, { "epoch": 2.4498115529574647, "grad_norm": 0.6277671456336975, "learning_rate": 3.7217879126301846e-05, "loss": 0.7273, "step": 31850 }, { "epoch": 2.453657410968387, "grad_norm": 0.8524878621101379, "learning_rate": 3.6958159104485365e-05, "loss": 0.7537, "step": 31900 }, { "epoch": 2.457503268979309, "grad_norm": 0.9068925976753235, "learning_rate": 3.6698439082668885e-05, "loss": 0.7192, "step": 31950 }, { "epoch": 2.4613491269902315, "grad_norm": 0.866385817527771, "learning_rate": 3.6438719060852405e-05, "loss": 0.7145, "step": 32000 }, { "epoch": 2.4613491269902315, "eval_loss": 0.7852405905723572, "eval_runtime": 18.0021, "eval_samples_per_second": 55.549, "eval_steps_per_second": 13.887, "step": 32000 }, { "epoch": 2.465194985001154, "grad_norm": 0.8729520440101624, "learning_rate": 3.6178999039035924e-05, "loss": 0.7121, "step": 32050 }, { "epoch": 2.469040843012076, "grad_norm": 1.2588157653808594, "learning_rate": 3.591927901721944e-05, "loss": 0.7145, "step": 32100 }, { "epoch": 2.4728867010229982, "grad_norm": 1.0234293937683105, "learning_rate": 3.565955899540296e-05, "loss": 0.7173, "step": 32150 }, { "epoch": 2.4767325590339206, "grad_norm": 0.6210401058197021, "learning_rate": 3.5399838973586476e-05, "loss": 0.7404, "step": 32200 }, { "epoch": 2.4805784170448426, "grad_norm": 1.0649775266647339, "learning_rate": 3.5140118951769996e-05, "loss": 0.713, "step": 32250 }, { "epoch": 2.4805784170448426, "eval_loss": 0.7857936024665833, "eval_runtime": 17.9415, "eval_samples_per_second": 55.737, "eval_steps_per_second": 13.934, "step": 32250 }, { "epoch": 2.484424275055765, "grad_norm": 0.6743142604827881, "learning_rate": 3.488039892995351e-05, "loss": 0.7213, "step": 32300 }, { "epoch": 2.488270133066687, "grad_norm": 0.7584249377250671, "learning_rate": 3.462067890813703e-05, "loss": 0.7175, "step": 32350 }, { "epoch": 2.4921159910776094, "grad_norm": 1.7324374914169312, "learning_rate": 3.436095888632055e-05, "loss": 0.7255, "step": 32400 }, { "epoch": 2.495961849088532, "grad_norm": 1.0071933269500732, "learning_rate": 3.410123886450407e-05, "loss": 0.6905, "step": 32450 }, { "epoch": 2.4998077070994538, "grad_norm": 0.8606531023979187, "learning_rate": 3.384151884268759e-05, "loss": 0.7235, "step": 32500 }, { "epoch": 2.4998077070994538, "eval_loss": 0.7832362651824951, "eval_runtime": 17.9726, "eval_samples_per_second": 55.64, "eval_steps_per_second": 13.91, "step": 32500 }, { "epoch": 2.503653565110376, "grad_norm": 0.7658631205558777, "learning_rate": 3.35817988208711e-05, "loss": 0.7258, "step": 32550 }, { "epoch": 2.507499423121298, "grad_norm": 1.3583028316497803, "learning_rate": 3.3322078799054627e-05, "loss": 0.7414, "step": 32600 }, { "epoch": 2.5113452811322206, "grad_norm": 0.7680505514144897, "learning_rate": 3.306235877723814e-05, "loss": 0.7126, "step": 32650 }, { "epoch": 2.515191139143143, "grad_norm": 0.9117040634155273, "learning_rate": 3.280263875542165e-05, "loss": 0.7019, "step": 32700 }, { "epoch": 2.519036997154065, "grad_norm": 0.995895504951477, "learning_rate": 3.254291873360518e-05, "loss": 0.7272, "step": 32750 }, { "epoch": 2.519036997154065, "eval_loss": 0.7824276089668274, "eval_runtime": 17.8488, "eval_samples_per_second": 56.026, "eval_steps_per_second": 14.007, "step": 32750 }, { "epoch": 2.5228828551649873, "grad_norm": 0.6426506042480469, "learning_rate": 3.228319871178869e-05, "loss": 0.7309, "step": 32800 }, { "epoch": 2.5267287131759097, "grad_norm": 1.327599048614502, "learning_rate": 3.202347868997221e-05, "loss": 0.7354, "step": 32850 }, { "epoch": 2.5305745711868317, "grad_norm": 0.9184108972549438, "learning_rate": 3.176375866815573e-05, "loss": 0.7053, "step": 32900 }, { "epoch": 2.534420429197754, "grad_norm": 0.9938299059867859, "learning_rate": 3.1504038646339244e-05, "loss": 0.7374, "step": 32950 }, { "epoch": 2.5382662872086765, "grad_norm": 1.230980396270752, "learning_rate": 3.124431862452277e-05, "loss": 0.7261, "step": 33000 }, { "epoch": 2.5382662872086765, "eval_loss": 0.7817492485046387, "eval_runtime": 17.9221, "eval_samples_per_second": 55.797, "eval_steps_per_second": 13.949, "step": 33000 }, { "epoch": 2.5421121452195985, "grad_norm": 0.7762789726257324, "learning_rate": 3.098459860270628e-05, "loss": 0.7279, "step": 33050 }, { "epoch": 2.5459580032305205, "grad_norm": 0.8807786703109741, "learning_rate": 3.07248785808898e-05, "loss": 0.6957, "step": 33100 }, { "epoch": 2.549803861241443, "grad_norm": 0.8823468089103699, "learning_rate": 3.046515855907332e-05, "loss": 0.7584, "step": 33150 }, { "epoch": 2.5536497192523653, "grad_norm": 0.6461008191108704, "learning_rate": 3.0205438537256835e-05, "loss": 0.7231, "step": 33200 }, { "epoch": 2.5574955772632872, "grad_norm": 0.9959568977355957, "learning_rate": 2.9945718515440358e-05, "loss": 0.7097, "step": 33250 }, { "epoch": 2.5574955772632872, "eval_loss": 0.7804549336433411, "eval_runtime": 17.9765, "eval_samples_per_second": 55.628, "eval_steps_per_second": 13.907, "step": 33250 }, { "epoch": 2.5613414352742097, "grad_norm": 1.0902256965637207, "learning_rate": 2.9685998493623874e-05, "loss": 0.7259, "step": 33300 }, { "epoch": 2.565187293285132, "grad_norm": 0.8527780771255493, "learning_rate": 2.9426278471807394e-05, "loss": 0.7138, "step": 33350 }, { "epoch": 2.569033151296054, "grad_norm": 0.7497609257698059, "learning_rate": 2.916655844999091e-05, "loss": 0.6676, "step": 33400 }, { "epoch": 2.5728790093069764, "grad_norm": 1.252274751663208, "learning_rate": 2.8906838428174433e-05, "loss": 0.7545, "step": 33450 }, { "epoch": 2.576724867317899, "grad_norm": 0.8742374777793884, "learning_rate": 2.864711840635795e-05, "loss": 0.6727, "step": 33500 }, { "epoch": 2.576724867317899, "eval_loss": 0.7791668176651001, "eval_runtime": 17.8738, "eval_samples_per_second": 55.948, "eval_steps_per_second": 13.987, "step": 33500 }, { "epoch": 2.580570725328821, "grad_norm": 0.5088424682617188, "learning_rate": 2.8387398384541465e-05, "loss": 0.7113, "step": 33550 }, { "epoch": 2.5844165833397432, "grad_norm": 0.6116564273834229, "learning_rate": 2.8127678362724985e-05, "loss": 0.7523, "step": 33600 }, { "epoch": 2.588262441350665, "grad_norm": 0.6378856301307678, "learning_rate": 2.78679583409085e-05, "loss": 0.6924, "step": 33650 }, { "epoch": 2.5921082993615876, "grad_norm": 1.1341512203216553, "learning_rate": 2.760823831909202e-05, "loss": 0.7089, "step": 33700 }, { "epoch": 2.5959541573725096, "grad_norm": 0.8231232762336731, "learning_rate": 2.7348518297275537e-05, "loss": 0.7309, "step": 33750 }, { "epoch": 2.5959541573725096, "eval_loss": 0.7776284217834473, "eval_runtime": 17.9143, "eval_samples_per_second": 55.821, "eval_steps_per_second": 13.955, "step": 33750 }, { "epoch": 2.599800015383432, "grad_norm": 0.7154203653335571, "learning_rate": 2.7088798275459053e-05, "loss": 0.7273, "step": 33800 }, { "epoch": 2.6036458733943544, "grad_norm": 0.9213638305664062, "learning_rate": 2.6829078253642576e-05, "loss": 0.7628, "step": 33850 }, { "epoch": 2.6074917314052763, "grad_norm": 1.260438084602356, "learning_rate": 2.6569358231826092e-05, "loss": 0.7004, "step": 33900 }, { "epoch": 2.6113375894161988, "grad_norm": 0.9463502764701843, "learning_rate": 2.6309638210009612e-05, "loss": 0.6771, "step": 33950 }, { "epoch": 2.615183447427121, "grad_norm": 0.7610837817192078, "learning_rate": 2.604991818819313e-05, "loss": 0.7217, "step": 34000 }, { "epoch": 2.615183447427121, "eval_loss": 0.7775171995162964, "eval_runtime": 17.9057, "eval_samples_per_second": 55.848, "eval_steps_per_second": 13.962, "step": 34000 }, { "epoch": 2.619029305438043, "grad_norm": 0.4978080093860626, "learning_rate": 2.5790198166376645e-05, "loss": 0.7309, "step": 34050 }, { "epoch": 2.6228751634489655, "grad_norm": 0.779080331325531, "learning_rate": 2.5530478144560168e-05, "loss": 0.7105, "step": 34100 }, { "epoch": 2.626721021459888, "grad_norm": 0.5153629779815674, "learning_rate": 2.5275952523180012e-05, "loss": 0.7377, "step": 34150 }, { "epoch": 2.63056687947081, "grad_norm": 0.8356613516807556, "learning_rate": 2.5016232501363528e-05, "loss": 0.7135, "step": 34200 }, { "epoch": 2.6344127374817323, "grad_norm": 0.5202348232269287, "learning_rate": 2.475651247954705e-05, "loss": 0.7333, "step": 34250 }, { "epoch": 2.6344127374817323, "eval_loss": 0.7767261862754822, "eval_runtime": 17.8651, "eval_samples_per_second": 55.975, "eval_steps_per_second": 13.994, "step": 34250 }, { "epoch": 2.6382585954926543, "grad_norm": 0.9579488039016724, "learning_rate": 2.4496792457730567e-05, "loss": 0.7823, "step": 34300 }, { "epoch": 2.6421044535035767, "grad_norm": 0.7704477906227112, "learning_rate": 2.4237072435914084e-05, "loss": 0.6979, "step": 34350 }, { "epoch": 2.6459503115144987, "grad_norm": 0.8563690781593323, "learning_rate": 2.3977352414097603e-05, "loss": 0.7049, "step": 34400 }, { "epoch": 2.649796169525421, "grad_norm": 0.663038432598114, "learning_rate": 2.3717632392281123e-05, "loss": 0.751, "step": 34450 }, { "epoch": 2.6536420275363435, "grad_norm": 0.8598125576972961, "learning_rate": 2.345791237046464e-05, "loss": 0.6982, "step": 34500 }, { "epoch": 2.6536420275363435, "eval_loss": 0.7752255201339722, "eval_runtime": 17.9612, "eval_samples_per_second": 55.675, "eval_steps_per_second": 13.919, "step": 34500 }, { "epoch": 2.6574878855472654, "grad_norm": 1.2697360515594482, "learning_rate": 2.319819234864816e-05, "loss": 0.7289, "step": 34550 }, { "epoch": 2.661333743558188, "grad_norm": 0.6098369359970093, "learning_rate": 2.2938472326831675e-05, "loss": 0.6948, "step": 34600 }, { "epoch": 2.6651796015691103, "grad_norm": 0.48443081974983215, "learning_rate": 2.2678752305015195e-05, "loss": 0.7159, "step": 34650 }, { "epoch": 2.6690254595800322, "grad_norm": 0.7432298064231873, "learning_rate": 2.2419032283198714e-05, "loss": 0.7089, "step": 34700 }, { "epoch": 2.6728713175909546, "grad_norm": 0.7649087309837341, "learning_rate": 2.215931226138223e-05, "loss": 0.7107, "step": 34750 }, { "epoch": 2.6728713175909546, "eval_loss": 0.774026095867157, "eval_runtime": 18.0048, "eval_samples_per_second": 55.541, "eval_steps_per_second": 13.885, "step": 34750 }, { "epoch": 2.676717175601877, "grad_norm": 0.984624445438385, "learning_rate": 2.189959223956575e-05, "loss": 0.749, "step": 34800 }, { "epoch": 2.680563033612799, "grad_norm": 0.7625775933265686, "learning_rate": 2.163987221774927e-05, "loss": 0.6645, "step": 34850 }, { "epoch": 2.684408891623721, "grad_norm": 0.846238374710083, "learning_rate": 2.1380152195932786e-05, "loss": 0.7468, "step": 34900 }, { "epoch": 2.6882547496346434, "grad_norm": 1.1688568592071533, "learning_rate": 2.1120432174116302e-05, "loss": 0.7105, "step": 34950 }, { "epoch": 2.692100607645566, "grad_norm": 0.9417968392372131, "learning_rate": 2.0860712152299822e-05, "loss": 0.6826, "step": 35000 }, { "epoch": 2.692100607645566, "eval_loss": 0.7743579149246216, "eval_runtime": 18.0822, "eval_samples_per_second": 55.303, "eval_steps_per_second": 13.826, "step": 35000 }, { "epoch": 2.6959464656564878, "grad_norm": 1.1616791486740112, "learning_rate": 2.060099213048334e-05, "loss": 0.7242, "step": 35050 }, { "epoch": 2.69979232366741, "grad_norm": 0.9195474982261658, "learning_rate": 2.0341272108666858e-05, "loss": 0.7112, "step": 35100 }, { "epoch": 2.7036381816783326, "grad_norm": 1.168445110321045, "learning_rate": 2.0081552086850377e-05, "loss": 0.7331, "step": 35150 }, { "epoch": 2.7074840396892546, "grad_norm": 1.3413971662521362, "learning_rate": 1.9821832065033893e-05, "loss": 0.7627, "step": 35200 }, { "epoch": 2.711329897700177, "grad_norm": 0.9387266039848328, "learning_rate": 1.9562112043217413e-05, "loss": 0.6968, "step": 35250 }, { "epoch": 2.711329897700177, "eval_loss": 0.7742106914520264, "eval_runtime": 18.076, "eval_samples_per_second": 55.322, "eval_steps_per_second": 13.83, "step": 35250 }, { "epoch": 2.7151757557110994, "grad_norm": 0.8906998634338379, "learning_rate": 1.930239202140093e-05, "loss": 0.7206, "step": 35300 }, { "epoch": 2.7190216137220213, "grad_norm": 0.8276380896568298, "learning_rate": 1.904267199958445e-05, "loss": 0.7101, "step": 35350 }, { "epoch": 2.7228674717329437, "grad_norm": 0.8341213464736938, "learning_rate": 1.878295197776797e-05, "loss": 0.7631, "step": 35400 }, { "epoch": 2.7267133297438657, "grad_norm": 0.9501305222511292, "learning_rate": 1.8523231955951485e-05, "loss": 0.7138, "step": 35450 }, { "epoch": 2.730559187754788, "grad_norm": 0.9375068545341492, "learning_rate": 1.8263511934135e-05, "loss": 0.7556, "step": 35500 }, { "epoch": 2.730559187754788, "eval_loss": 0.7727531790733337, "eval_runtime": 18.0286, "eval_samples_per_second": 55.467, "eval_steps_per_second": 13.867, "step": 35500 }, { "epoch": 2.73440504576571, "grad_norm": 0.5093711018562317, "learning_rate": 1.800379191231852e-05, "loss": 0.6425, "step": 35550 }, { "epoch": 2.7382509037766325, "grad_norm": 1.0487879514694214, "learning_rate": 1.774407189050204e-05, "loss": 0.6846, "step": 35600 }, { "epoch": 2.742096761787555, "grad_norm": 0.6705742478370667, "learning_rate": 1.748435186868556e-05, "loss": 0.7464, "step": 35650 }, { "epoch": 2.745942619798477, "grad_norm": 0.43706628680229187, "learning_rate": 1.7224631846869076e-05, "loss": 0.7325, "step": 35700 }, { "epoch": 2.7497884778093993, "grad_norm": 1.1549192667007446, "learning_rate": 1.6964911825052592e-05, "loss": 0.7083, "step": 35750 }, { "epoch": 2.7497884778093993, "eval_loss": 0.7717772126197815, "eval_runtime": 18.085, "eval_samples_per_second": 55.295, "eval_steps_per_second": 13.824, "step": 35750 }, { "epoch": 2.7536343358203217, "grad_norm": 0.5367007255554199, "learning_rate": 1.6705191803236112e-05, "loss": 0.7054, "step": 35800 }, { "epoch": 2.7574801938312437, "grad_norm": 0.8213953971862793, "learning_rate": 1.644547178141963e-05, "loss": 0.6764, "step": 35850 }, { "epoch": 2.761326051842166, "grad_norm": 0.9012633562088013, "learning_rate": 1.6185751759603148e-05, "loss": 0.725, "step": 35900 }, { "epoch": 2.7651719098530885, "grad_norm": 0.656104326248169, "learning_rate": 1.5926031737786667e-05, "loss": 0.7279, "step": 35950 }, { "epoch": 2.7690177678640104, "grad_norm": 0.901136040687561, "learning_rate": 1.5666311715970187e-05, "loss": 0.7354, "step": 36000 }, { "epoch": 2.7690177678640104, "eval_loss": 0.7717016935348511, "eval_runtime": 17.9512, "eval_samples_per_second": 55.707, "eval_steps_per_second": 13.927, "step": 36000 }, { "epoch": 2.772863625874933, "grad_norm": 0.6917023658752441, "learning_rate": 1.5406591694153703e-05, "loss": 0.6798, "step": 36050 }, { "epoch": 2.776709483885855, "grad_norm": 0.9695160388946533, "learning_rate": 1.5146871672337221e-05, "loss": 0.7063, "step": 36100 }, { "epoch": 2.780555341896777, "grad_norm": 1.0134507417678833, "learning_rate": 1.4887151650520739e-05, "loss": 0.7325, "step": 36150 }, { "epoch": 2.784401199907699, "grad_norm": 0.8022010922431946, "learning_rate": 1.4627431628704259e-05, "loss": 0.7095, "step": 36200 }, { "epoch": 2.7882470579186216, "grad_norm": 0.8629682660102844, "learning_rate": 1.4367711606887777e-05, "loss": 0.6793, "step": 36250 }, { "epoch": 2.7882470579186216, "eval_loss": 0.7713639736175537, "eval_runtime": 17.9977, "eval_samples_per_second": 55.563, "eval_steps_per_second": 13.891, "step": 36250 }, { "epoch": 2.792092915929544, "grad_norm": 0.8491897583007812, "learning_rate": 1.4107991585071294e-05, "loss": 0.6899, "step": 36300 }, { "epoch": 2.795938773940466, "grad_norm": 1.0382113456726074, "learning_rate": 1.384827156325481e-05, "loss": 0.7362, "step": 36350 }, { "epoch": 2.7997846319513884, "grad_norm": 0.7207579016685486, "learning_rate": 1.358855154143833e-05, "loss": 0.6984, "step": 36400 }, { "epoch": 2.803630489962311, "grad_norm": 0.9483594298362732, "learning_rate": 1.3328831519621848e-05, "loss": 0.713, "step": 36450 }, { "epoch": 2.8074763479732328, "grad_norm": 1.0805621147155762, "learning_rate": 1.3069111497805366e-05, "loss": 0.7235, "step": 36500 }, { "epoch": 2.8074763479732328, "eval_loss": 0.7706654667854309, "eval_runtime": 17.9083, "eval_samples_per_second": 55.84, "eval_steps_per_second": 13.96, "step": 36500 }, { "epoch": 2.811322205984155, "grad_norm": 0.8592945337295532, "learning_rate": 1.2809391475988886e-05, "loss": 0.7173, "step": 36550 }, { "epoch": 2.8151680639950776, "grad_norm": 1.0562350749969482, "learning_rate": 1.2549671454172402e-05, "loss": 0.6661, "step": 36600 }, { "epoch": 2.8190139220059995, "grad_norm": 1.0829477310180664, "learning_rate": 1.2289951432355922e-05, "loss": 0.7105, "step": 36650 }, { "epoch": 2.8228597800169215, "grad_norm": 1.2846815586090088, "learning_rate": 1.203542581097577e-05, "loss": 0.7218, "step": 36700 }, { "epoch": 2.826705638027844, "grad_norm": 1.3996707201004028, "learning_rate": 1.1775705789159287e-05, "loss": 0.7348, "step": 36750 }, { "epoch": 2.826705638027844, "eval_loss": 0.7698732018470764, "eval_runtime": 17.9822, "eval_samples_per_second": 55.611, "eval_steps_per_second": 13.903, "step": 36750 }, { "epoch": 2.8305514960387663, "grad_norm": 1.040479302406311, "learning_rate": 1.1515985767342805e-05, "loss": 0.7103, "step": 36800 }, { "epoch": 2.8343973540496883, "grad_norm": 0.8566408753395081, "learning_rate": 1.1256265745526323e-05, "loss": 0.7087, "step": 36850 }, { "epoch": 2.8382432120606107, "grad_norm": 1.0727367401123047, "learning_rate": 1.0996545723709841e-05, "loss": 0.6972, "step": 36900 }, { "epoch": 2.842089070071533, "grad_norm": 0.8675785064697266, "learning_rate": 1.0736825701893359e-05, "loss": 0.7318, "step": 36950 }, { "epoch": 2.845934928082455, "grad_norm": 1.2655267715454102, "learning_rate": 1.0477105680076877e-05, "loss": 0.7333, "step": 37000 }, { "epoch": 2.845934928082455, "eval_loss": 0.7695651650428772, "eval_runtime": 17.7976, "eval_samples_per_second": 56.187, "eval_steps_per_second": 14.047, "step": 37000 }, { "epoch": 2.8497807860933775, "grad_norm": 1.1233916282653809, "learning_rate": 1.0217385658260397e-05, "loss": 0.7128, "step": 37050 }, { "epoch": 2.8536266441043, "grad_norm": 0.917649507522583, "learning_rate": 9.957665636443913e-06, "loss": 0.7402, "step": 37100 }, { "epoch": 2.857472502115222, "grad_norm": 0.8935102820396423, "learning_rate": 9.697945614627432e-06, "loss": 0.731, "step": 37150 }, { "epoch": 2.8613183601261443, "grad_norm": 0.6891331076622009, "learning_rate": 9.43822559281095e-06, "loss": 0.7331, "step": 37200 }, { "epoch": 2.8651642181370662, "grad_norm": 0.7505995631217957, "learning_rate": 9.178505570994468e-06, "loss": 0.6744, "step": 37250 }, { "epoch": 2.8651642181370662, "eval_loss": 0.7693511247634888, "eval_runtime": 17.8693, "eval_samples_per_second": 55.962, "eval_steps_per_second": 13.99, "step": 37250 }, { "epoch": 2.8690100761479886, "grad_norm": 1.2373569011688232, "learning_rate": 8.918785549177986e-06, "loss": 0.6981, "step": 37300 }, { "epoch": 2.8728559341589106, "grad_norm": 0.9159016013145447, "learning_rate": 8.659065527361506e-06, "loss": 0.7601, "step": 37350 }, { "epoch": 2.876701792169833, "grad_norm": 0.3170250952243805, "learning_rate": 8.399345505545022e-06, "loss": 0.7008, "step": 37400 }, { "epoch": 2.8805476501807554, "grad_norm": 0.7592608332633972, "learning_rate": 8.139625483728542e-06, "loss": 0.6966, "step": 37450 }, { "epoch": 2.8843935081916774, "grad_norm": 0.7826717495918274, "learning_rate": 7.879905461912058e-06, "loss": 0.7398, "step": 37500 }, { "epoch": 2.8843935081916774, "eval_loss": 0.7694031596183777, "eval_runtime": 17.9358, "eval_samples_per_second": 55.754, "eval_steps_per_second": 13.939, "step": 37500 }, { "epoch": 2.8882393662026, "grad_norm": 0.6858485341072083, "learning_rate": 7.6201854400955775e-06, "loss": 0.7132, "step": 37550 }, { "epoch": 2.892085224213522, "grad_norm": 0.7138088345527649, "learning_rate": 7.3604654182790955e-06, "loss": 0.7082, "step": 37600 }, { "epoch": 2.895931082224444, "grad_norm": 0.4927150309085846, "learning_rate": 7.100745396462613e-06, "loss": 0.7551, "step": 37650 }, { "epoch": 2.8997769402353666, "grad_norm": 0.879112720489502, "learning_rate": 6.841025374646131e-06, "loss": 0.7228, "step": 37700 }, { "epoch": 2.903622798246289, "grad_norm": 1.2699699401855469, "learning_rate": 6.58130535282965e-06, "loss": 0.7208, "step": 37750 }, { "epoch": 2.903622798246289, "eval_loss": 0.7685362696647644, "eval_runtime": 17.9674, "eval_samples_per_second": 55.656, "eval_steps_per_second": 13.914, "step": 37750 }, { "epoch": 2.907468656257211, "grad_norm": 0.7341476082801819, "learning_rate": 6.321585331013168e-06, "loss": 0.761, "step": 37800 }, { "epoch": 2.9113145142681334, "grad_norm": 0.8890082836151123, "learning_rate": 6.061865309196686e-06, "loss": 0.6837, "step": 37850 }, { "epoch": 2.9151603722790553, "grad_norm": 0.5546180009841919, "learning_rate": 5.802145287380204e-06, "loss": 0.7126, "step": 37900 }, { "epoch": 2.9190062302899777, "grad_norm": 0.7684674263000488, "learning_rate": 5.542425265563723e-06, "loss": 0.6765, "step": 37950 }, { "epoch": 2.9228520883008997, "grad_norm": 0.8968291282653809, "learning_rate": 5.2827052437472405e-06, "loss": 0.6839, "step": 38000 }, { "epoch": 2.9228520883008997, "eval_loss": 0.7687397003173828, "eval_runtime": 17.8165, "eval_samples_per_second": 56.128, "eval_steps_per_second": 14.032, "step": 38000 }, { "epoch": 2.926697946311822, "grad_norm": 0.8798107504844666, "learning_rate": 5.0229852219307584e-06, "loss": 0.7084, "step": 38050 }, { "epoch": 2.9305438043227445, "grad_norm": 0.3845706284046173, "learning_rate": 4.763265200114277e-06, "loss": 0.6764, "step": 38100 }, { "epoch": 2.9343896623336665, "grad_norm": 0.6847463846206665, "learning_rate": 4.503545178297795e-06, "loss": 0.7165, "step": 38150 }, { "epoch": 2.938235520344589, "grad_norm": 0.7632951736450195, "learning_rate": 4.243825156481313e-06, "loss": 0.7311, "step": 38200 }, { "epoch": 2.9420813783555113, "grad_norm": 1.3314287662506104, "learning_rate": 3.984105134664832e-06, "loss": 0.6852, "step": 38250 }, { "epoch": 2.9420813783555113, "eval_loss": 0.7683274149894714, "eval_runtime": 17.9706, "eval_samples_per_second": 55.646, "eval_steps_per_second": 13.912, "step": 38250 }, { "epoch": 2.9459272363664333, "grad_norm": 1.0179448127746582, "learning_rate": 3.7243851128483497e-06, "loss": 0.759, "step": 38300 }, { "epoch": 2.9497730943773557, "grad_norm": 1.4116487503051758, "learning_rate": 3.4646650910318677e-06, "loss": 0.7773, "step": 38350 }, { "epoch": 2.953618952388278, "grad_norm": 0.6251114010810852, "learning_rate": 3.2049450692153856e-06, "loss": 0.7016, "step": 38400 }, { "epoch": 2.9574648103992, "grad_norm": 0.9810579419136047, "learning_rate": 2.945225047398904e-06, "loss": 0.6909, "step": 38450 }, { "epoch": 2.961310668410122, "grad_norm": 0.7243860363960266, "learning_rate": 2.6855050255824223e-06, "loss": 0.7305, "step": 38500 }, { "epoch": 2.961310668410122, "eval_loss": 0.7680486440658569, "eval_runtime": 17.9145, "eval_samples_per_second": 55.821, "eval_steps_per_second": 13.955, "step": 38500 }, { "epoch": 2.9651565264210444, "grad_norm": 0.7657055854797363, "learning_rate": 2.4257850037659406e-06, "loss": 0.7114, "step": 38550 }, { "epoch": 2.969002384431967, "grad_norm": 0.7305043339729309, "learning_rate": 2.1660649819494585e-06, "loss": 0.7135, "step": 38600 }, { "epoch": 2.972848242442889, "grad_norm": 0.7981142401695251, "learning_rate": 1.9063449601329769e-06, "loss": 0.7328, "step": 38650 }, { "epoch": 2.976694100453811, "grad_norm": 0.7305875420570374, "learning_rate": 1.6466249383164948e-06, "loss": 0.7103, "step": 38700 }, { "epoch": 2.9805399584647336, "grad_norm": 1.197097659111023, "learning_rate": 1.386904916500013e-06, "loss": 0.7148, "step": 38750 }, { "epoch": 2.9805399584647336, "eval_loss": 0.7678167819976807, "eval_runtime": 17.8827, "eval_samples_per_second": 55.92, "eval_steps_per_second": 13.98, "step": 38750 }, { "epoch": 2.9843858164756556, "grad_norm": 0.8533993363380432, "learning_rate": 1.127184894683531e-06, "loss": 0.753, "step": 38800 }, { "epoch": 2.988231674486578, "grad_norm": 0.7372131943702698, "learning_rate": 8.674648728670494e-07, "loss": 0.7082, "step": 38850 }, { "epoch": 2.9920775324975004, "grad_norm": 1.499084234237671, "learning_rate": 6.077448510505675e-07, "loss": 0.6937, "step": 38900 }, { "epoch": 2.9959233905084224, "grad_norm": 0.5895427465438843, "learning_rate": 3.4802482923408566e-07, "loss": 0.701, "step": 38950 }, { "epoch": 2.999769248519345, "grad_norm": 0.9201724529266357, "learning_rate": 8.830480741760382e-08, "loss": 0.7665, "step": 39000 }, { "epoch": 2.999769248519345, "eval_loss": 0.767805814743042, "eval_runtime": 17.9509, "eval_samples_per_second": 55.708, "eval_steps_per_second": 13.927, "step": 39000 } ], "logging_steps": 50, "max_steps": 39003, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }