{
  "best_global_step": 31000,
  "best_metric": 0.7226839661598206,
  "best_model_checkpoint": "./ar-diffusion-checkpoints-fixed/checkpoint-31000",
  "epoch": 2.999769248519345,
  "eval_steps": 250,
  "global_step": 39000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003845858010922237,
      "grad_norm": 11.687983512878418,
      "learning_rate": 1.84e-05,
      "loss": 8.7035,
      "step": 50
    },
    {
      "epoch": 0.007691716021844474,
      "grad_norm": 2.6404802799224854,
      "learning_rate": 3.8400000000000005e-05,
      "loss": 2.7335,
      "step": 100
    },
    {
      "epoch": 0.01153757403276671,
      "grad_norm": 2.068481683731079,
      "learning_rate": 5.8399999999999997e-05,
      "loss": 2.0457,
      "step": 150
    },
    {
      "epoch": 0.015383432043688947,
      "grad_norm": 2.60369610786438,
      "learning_rate": 7.840000000000001e-05,
      "loss": 1.8505,
      "step": 200
    },
    {
      "epoch": 0.019229290054611183,
      "grad_norm": 1.6515765190124512,
      "learning_rate": 9.84e-05,
      "loss": 1.8158,
      "step": 250
    },
    {
      "epoch": 0.019229290054611183,
      "eval_loss": 1.8082822561264038,
      "eval_runtime": 18.1351,
      "eval_samples_per_second": 55.142,
      "eval_steps_per_second": 13.785,
      "step": 250
    },
    {
      "epoch": 0.02307514806553342,
      "grad_norm": 2.1014769077301025,
      "learning_rate": 0.0001184,
      "loss": 1.65,
      "step": 300
    },
    {
      "epoch": 0.02692100607645566,
      "grad_norm": 1.5384572744369507,
      "learning_rate": 0.0001384,
      "loss": 1.5594,
      "step": 350
    },
    {
      "epoch": 0.030766864087377895,
      "grad_norm": 1.4865778684616089,
      "learning_rate": 0.00015840000000000003,
      "loss": 1.6723,
      "step": 400
    },
    {
      "epoch": 0.03461272209830013,
      "grad_norm": 1.0966717004776,
      "learning_rate": 0.0001784,
      "loss": 1.602,
      "step": 450
    },
    {
      "epoch": 0.038458580109222366,
      "grad_norm": 1.6298224925994873,
      "learning_rate": 0.0001984,
      "loss": 1.5659,
      "step": 500
    },
    {
      "epoch": 0.038458580109222366,
      "eval_loss": 1.6280817985534668,
      "eval_runtime": 18.1181,
      "eval_samples_per_second": 55.194,
      "eval_steps_per_second": 13.798,
      "step": 500
    },
    {
      "epoch": 0.0423044381201446,
      "grad_norm": 1.2955571413040161,
      "learning_rate": 0.00019976105757992885,
      "loss": 1.6388,
      "step": 550
    },
    {
      "epoch": 0.04615029613106684,
      "grad_norm": 1.2672921419143677,
      "learning_rate": 0.00019950133755811236,
      "loss": 1.6029,
      "step": 600
    },
    {
      "epoch": 0.04999615414198908,
      "grad_norm": 1.6246057748794556,
      "learning_rate": 0.00019924161753629587,
      "loss": 1.4845,
      "step": 650
    },
    {
      "epoch": 0.05384201215291132,
      "grad_norm": 1.0235854387283325,
      "learning_rate": 0.0001989818975144794,
      "loss": 1.5184,
      "step": 700
    },
    {
      "epoch": 0.05768787016383355,
      "grad_norm": 1.5333527326583862,
      "learning_rate": 0.0001987221774926629,
      "loss": 1.6136,
      "step": 750
    },
    {
      "epoch": 0.05768787016383355,
      "eval_loss": 1.5622245073318481,
      "eval_runtime": 18.2497,
      "eval_samples_per_second": 54.795,
      "eval_steps_per_second": 13.699,
      "step": 750
    },
    {
      "epoch": 0.06153372817475579,
      "grad_norm": 1.1980785131454468,
      "learning_rate": 0.00019846245747084644,
      "loss": 1.571,
      "step": 800
    },
    {
      "epoch": 0.06537958618567802,
      "grad_norm": 1.6081124544143677,
      "learning_rate": 0.00019820273744902995,
      "loss": 1.5542,
      "step": 850
    },
    {
      "epoch": 0.06922544419660026,
      "grad_norm": 1.2620774507522583,
      "learning_rate": 0.0001979430174272135,
      "loss": 1.5513,
      "step": 900
    },
    {
      "epoch": 0.0730713022075225,
      "grad_norm": 0.8911245465278625,
      "learning_rate": 0.000197683297405397,
      "loss": 1.4516,
      "step": 950
    },
    {
      "epoch": 0.07691716021844473,
      "grad_norm": 1.2976778745651245,
      "learning_rate": 0.0001974235773835805,
      "loss": 1.5757,
      "step": 1000
    },
    {
      "epoch": 0.07691716021844473,
      "eval_loss": 1.5385061502456665,
      "eval_runtime": 18.1057,
      "eval_samples_per_second": 55.231,
      "eval_steps_per_second": 13.808,
      "step": 1000
    },
    {
      "epoch": 0.08076301822936698,
      "grad_norm": 1.451479196548462,
      "learning_rate": 0.00019716385736176403,
      "loss": 1.4287,
      "step": 1050
    },
    {
      "epoch": 0.0846088762402892,
      "grad_norm": 1.0982426404953003,
      "learning_rate": 0.00019690413733994754,
      "loss": 1.4962,
      "step": 1100
    },
    {
      "epoch": 0.08845473425121145,
      "grad_norm": 0.8899670839309692,
      "learning_rate": 0.00019664441731813106,
      "loss": 1.4675,
      "step": 1150
    },
    {
      "epoch": 0.09230059226213368,
      "grad_norm": 1.5194021463394165,
      "learning_rate": 0.0001963846972963146,
      "loss": 1.4573,
      "step": 1200
    },
    {
      "epoch": 0.09614645027305592,
      "grad_norm": 1.716470718383789,
      "learning_rate": 0.0001961249772744981,
      "loss": 1.5143,
      "step": 1250
    },
    {
      "epoch": 0.09614645027305592,
      "eval_loss": 1.5197770595550537,
      "eval_runtime": 18.2416,
      "eval_samples_per_second": 54.82,
      "eval_steps_per_second": 13.705,
      "step": 1250
    },
    {
      "epoch": 0.09999230828397816,
      "grad_norm": 1.986771583557129,
      "learning_rate": 0.00019586525725268162,
      "loss": 1.5265,
      "step": 1300
    },
    {
      "epoch": 0.10383816629490039,
      "grad_norm": 0.86269211769104,
      "learning_rate": 0.00019560553723086514,
      "loss": 1.4842,
      "step": 1350
    },
    {
      "epoch": 0.10768402430582263,
      "grad_norm": 1.187501072883606,
      "learning_rate": 0.00019534581720904865,
      "loss": 1.4054,
      "step": 1400
    },
    {
      "epoch": 0.11152988231674486,
      "grad_norm": 1.5051347017288208,
      "learning_rate": 0.00019508609718723216,
      "loss": 1.4141,
      "step": 1450
    },
    {
      "epoch": 0.1153757403276671,
      "grad_norm": 1.99917471408844,
      "learning_rate": 0.0001948263771654157,
      "loss": 1.4767,
      "step": 1500
    },
    {
      "epoch": 0.1153757403276671,
      "eval_loss": 1.4949736595153809,
      "eval_runtime": 18.0402,
      "eval_samples_per_second": 55.432,
      "eval_steps_per_second": 13.858,
      "step": 1500
    },
    {
      "epoch": 0.11922159833858934,
      "grad_norm": 1.6421241760253906,
      "learning_rate": 0.00019456665714359921,
      "loss": 1.5029,
      "step": 1550
    },
    {
      "epoch": 0.12306745634951158,
      "grad_norm": 1.8251460790634155,
      "learning_rate": 0.00019430693712178273,
      "loss": 1.4666,
      "step": 1600
    },
    {
      "epoch": 0.12691331436043382,
      "grad_norm": 1.2284319400787354,
      "learning_rate": 0.00019404721709996624,
      "loss": 1.4785,
      "step": 1650
    },
    {
      "epoch": 0.13075917237135604,
      "grad_norm": 3.7399282455444336,
      "learning_rate": 0.00019378749707814975,
      "loss": 1.5027,
      "step": 1700
    },
    {
      "epoch": 0.13460503038227828,
      "grad_norm": 1.2193188667297363,
      "learning_rate": 0.0001935277770563333,
      "loss": 1.4944,
      "step": 1750
    },
    {
      "epoch": 0.13460503038227828,
      "eval_loss": 1.4724599123001099,
      "eval_runtime": 18.0061,
      "eval_samples_per_second": 55.537,
      "eval_steps_per_second": 13.884,
      "step": 1750
    },
    {
      "epoch": 0.13845088839320052,
      "grad_norm": 0.5916198492050171,
      "learning_rate": 0.0001932680570345168,
      "loss": 1.5119,
      "step": 1800
    },
    {
      "epoch": 0.14229674640412276,
      "grad_norm": 1.4087570905685425,
      "learning_rate": 0.00019300833701270032,
      "loss": 1.3608,
      "step": 1850
    },
    {
      "epoch": 0.146142604415045,
      "grad_norm": 1.2559338808059692,
      "learning_rate": 0.00019274861699088386,
      "loss": 1.4772,
      "step": 1900
    },
    {
      "epoch": 0.14998846242596722,
      "grad_norm": 0.9022719860076904,
      "learning_rate": 0.00019248889696906734,
      "loss": 1.4339,
      "step": 1950
    },
    {
      "epoch": 0.15383432043688947,
      "grad_norm": 1.2900218963623047,
      "learning_rate": 0.00019222917694725086,
      "loss": 1.4612,
      "step": 2000
    },
    {
      "epoch": 0.15383432043688947,
      "eval_loss": 1.4609016180038452,
      "eval_runtime": 18.1146,
      "eval_samples_per_second": 55.204,
      "eval_steps_per_second": 13.801,
      "step": 2000
    },
    {
      "epoch": 0.1576801784478117,
      "grad_norm": 0.8418329358100891,
      "learning_rate": 0.0001919694569254344,
      "loss": 1.4944,
      "step": 2050
    },
    {
      "epoch": 0.16152603645873395,
      "grad_norm": 1.538751482963562,
      "learning_rate": 0.0001917097369036179,
      "loss": 1.4135,
      "step": 2100
    },
    {
      "epoch": 0.16537189446965617,
      "grad_norm": 1.3898651599884033,
      "learning_rate": 0.00019145001688180142,
      "loss": 1.3683,
      "step": 2150
    },
    {
      "epoch": 0.1692177524805784,
      "grad_norm": 0.7671115398406982,
      "learning_rate": 0.00019119029685998496,
      "loss": 1.365,
      "step": 2200
    },
    {
      "epoch": 0.17306361049150065,
      "grad_norm": 0.732802152633667,
      "learning_rate": 0.00019093057683816848,
      "loss": 1.3213,
      "step": 2250
    },
    {
      "epoch": 0.17306361049150065,
      "eval_loss": 1.4241567850112915,
      "eval_runtime": 17.9984,
      "eval_samples_per_second": 55.56,
      "eval_steps_per_second": 13.89,
      "step": 2250
    },
    {
      "epoch": 0.1769094685024229,
      "grad_norm": 1.6236932277679443,
      "learning_rate": 0.00019067085681635196,
      "loss": 1.4366,
      "step": 2300
    },
    {
      "epoch": 0.18075532651334514,
      "grad_norm": 1.3093007802963257,
      "learning_rate": 0.0001904111367945355,
      "loss": 1.3468,
      "step": 2350
    },
    {
      "epoch": 0.18460118452426735,
      "grad_norm": 1.409177303314209,
      "learning_rate": 0.00019015141677271901,
      "loss": 1.4066,
      "step": 2400
    },
    {
      "epoch": 0.1884470425351896,
      "grad_norm": 1.0054073333740234,
      "learning_rate": 0.00018989169675090253,
      "loss": 1.4608,
      "step": 2450
    },
    {
      "epoch": 0.19229290054611184,
      "grad_norm": 1.0325884819030762,
      "learning_rate": 0.00018963197672908607,
      "loss": 1.3857,
      "step": 2500
    },
    {
      "epoch": 0.19229290054611184,
      "eval_loss": 1.4193872213363647,
      "eval_runtime": 18.115,
      "eval_samples_per_second": 55.203,
      "eval_steps_per_second": 13.801,
      "step": 2500
    },
    {
      "epoch": 0.19613875855703408,
      "grad_norm": 0.7152838110923767,
      "learning_rate": 0.00018937225670726958,
      "loss": 1.3343,
      "step": 2550
    },
    {
      "epoch": 0.19998461656795632,
      "grad_norm": 0.9736573100090027,
      "learning_rate": 0.0001891125366854531,
      "loss": 1.3832,
      "step": 2600
    },
    {
      "epoch": 0.20383047457887854,
      "grad_norm": 0.9278397560119629,
      "learning_rate": 0.0001888528166636366,
      "loss": 1.3807,
      "step": 2650
    },
    {
      "epoch": 0.20767633258980078,
      "grad_norm": 1.8133916854858398,
      "learning_rate": 0.00018859309664182012,
      "loss": 1.3844,
      "step": 2700
    },
    {
      "epoch": 0.21152219060072303,
      "grad_norm": 1.1289211511611938,
      "learning_rate": 0.00018833337662000366,
      "loss": 1.2984,
      "step": 2750
    },
    {
      "epoch": 0.21152219060072303,
      "eval_loss": 1.4020246267318726,
      "eval_runtime": 18.086,
      "eval_samples_per_second": 55.291,
      "eval_steps_per_second": 13.823,
      "step": 2750
    },
    {
      "epoch": 0.21536804861164527,
      "grad_norm": 1.9358755350112915,
      "learning_rate": 0.00018807365659818717,
      "loss": 1.3231,
      "step": 2800
    },
    {
      "epoch": 0.21921390662256748,
      "grad_norm": 1.453515887260437,
      "learning_rate": 0.00018781393657637068,
      "loss": 1.3395,
      "step": 2850
    },
    {
      "epoch": 0.22305976463348973,
      "grad_norm": 1.423431396484375,
      "learning_rate": 0.0001875542165545542,
      "loss": 1.435,
      "step": 2900
    },
    {
      "epoch": 0.22690562264441197,
      "grad_norm": 0.9964897632598877,
      "learning_rate": 0.0001872944965327377,
      "loss": 1.3356,
      "step": 2950
    },
    {
      "epoch": 0.2307514806553342,
      "grad_norm": 1.5574508905410767,
      "learning_rate": 0.00018703477651092122,
      "loss": 1.4102,
      "step": 3000
    },
    {
      "epoch": 0.2307514806553342,
      "eval_loss": 1.3945672512054443,
      "eval_runtime": 18.0752,
      "eval_samples_per_second": 55.324,
      "eval_steps_per_second": 13.831,
      "step": 3000
    },
    {
      "epoch": 0.23459733866625646,
      "grad_norm": 1.7693700790405273,
      "learning_rate": 0.00018677505648910476,
      "loss": 1.4095,
      "step": 3050
    },
    {
      "epoch": 0.23844319667717867,
      "grad_norm": 1.0146111249923706,
      "learning_rate": 0.00018651533646728827,
      "loss": 1.3416,
      "step": 3100
    },
    {
      "epoch": 0.2422890546881009,
      "grad_norm": 1.228946566581726,
      "learning_rate": 0.0001862556164454718,
      "loss": 1.3326,
      "step": 3150
    },
    {
      "epoch": 0.24613491269902316,
      "grad_norm": 0.9278371930122375,
      "learning_rate": 0.0001859958964236553,
      "loss": 1.4023,
      "step": 3200
    },
    {
      "epoch": 0.2499807707099454,
      "grad_norm": 1.858821988105774,
      "learning_rate": 0.0001857361764018388,
      "loss": 1.3583,
      "step": 3250
    },
    {
      "epoch": 0.2499807707099454,
      "eval_loss": 1.3779631853103638,
      "eval_runtime": 18.0487,
      "eval_samples_per_second": 55.406,
      "eval_steps_per_second": 13.851,
      "step": 3250
    },
    {
      "epoch": 0.25382662872086764,
      "grad_norm": 1.0133246183395386,
      "learning_rate": 0.00018547645638002233,
      "loss": 1.3258,
      "step": 3300
    },
    {
      "epoch": 0.2576724867317899,
      "grad_norm": 1.142626166343689,
      "learning_rate": 0.00018521673635820587,
      "loss": 1.3315,
      "step": 3350
    },
    {
      "epoch": 0.26151834474271207,
      "grad_norm": 0.9573944211006165,
      "learning_rate": 0.00018495701633638938,
      "loss": 1.2949,
      "step": 3400
    },
    {
      "epoch": 0.2653642027536343,
      "grad_norm": 0.8417842984199524,
      "learning_rate": 0.00018469729631457292,
      "loss": 1.3291,
      "step": 3450
    },
    {
      "epoch": 0.26921006076455656,
      "grad_norm": 0.8505682945251465,
      "learning_rate": 0.00018443757629275643,
      "loss": 1.2611,
      "step": 3500
    },
    {
      "epoch": 0.26921006076455656,
      "eval_loss": 1.3726494312286377,
      "eval_runtime": 18.0195,
      "eval_samples_per_second": 55.495,
      "eval_steps_per_second": 13.874,
      "step": 3500
    },
    {
      "epoch": 0.2730559187754788,
      "grad_norm": 1.0631035566329956,
      "learning_rate": 0.00018417785627093992,
      "loss": 1.3384,
      "step": 3550
    },
    {
      "epoch": 0.27690177678640104,
      "grad_norm": 1.1145228147506714,
      "learning_rate": 0.00018391813624912346,
      "loss": 1.4159,
      "step": 3600
    },
    {
      "epoch": 0.2807476347973233,
      "grad_norm": 1.286778450012207,
      "learning_rate": 0.00018365841622730697,
      "loss": 1.3372,
      "step": 3650
    },
    {
      "epoch": 0.28459349280824553,
      "grad_norm": 1.1863288879394531,
      "learning_rate": 0.00018339869620549048,
      "loss": 1.2993,
      "step": 3700
    },
    {
      "epoch": 0.2884393508191678,
      "grad_norm": 1.6189292669296265,
      "learning_rate": 0.00018313897618367402,
      "loss": 1.3464,
      "step": 3750
    },
    {
      "epoch": 0.2884393508191678,
      "eval_loss": 1.3553545475006104,
      "eval_runtime": 18.0591,
      "eval_samples_per_second": 55.374,
      "eval_steps_per_second": 13.843,
      "step": 3750
    },
    {
      "epoch": 0.29228520883009,
      "grad_norm": 1.4823222160339355,
      "learning_rate": 0.00018287925616185754,
      "loss": 1.3392,
      "step": 3800
    },
    {
      "epoch": 0.2961310668410122,
      "grad_norm": 1.4085184335708618,
      "learning_rate": 0.00018261953614004105,
      "loss": 1.2989,
      "step": 3850
    },
    {
      "epoch": 0.29997692485193445,
      "grad_norm": 1.7249082326889038,
      "learning_rate": 0.00018235981611822456,
      "loss": 1.3866,
      "step": 3900
    },
    {
      "epoch": 0.3038227828628567,
      "grad_norm": 0.9753608107566833,
      "learning_rate": 0.00018210009609640807,
      "loss": 1.2916,
      "step": 3950
    },
    {
      "epoch": 0.30766864087377893,
      "grad_norm": 0.6619511246681213,
      "learning_rate": 0.0001818403760745916,
      "loss": 1.2768,
      "step": 4000
    },
    {
      "epoch": 0.30766864087377893,
      "eval_loss": 1.337461233139038,
      "eval_runtime": 18.006,
      "eval_samples_per_second": 55.537,
      "eval_steps_per_second": 13.884,
      "step": 4000
    },
    {
      "epoch": 0.3115144988847012,
      "grad_norm": 0.9473676085472107,
      "learning_rate": 0.00018158065605277513,
      "loss": 1.3116,
      "step": 4050
    },
    {
      "epoch": 0.3153603568956234,
      "grad_norm": 1.2772737741470337,
      "learning_rate": 0.00018132093603095864,
      "loss": 1.3058,
      "step": 4100
    },
    {
      "epoch": 0.31920621490654566,
      "grad_norm": 1.7045694589614868,
      "learning_rate": 0.0001810664104095785,
      "loss": 1.3398,
      "step": 4150
    },
    {
      "epoch": 0.3230520729174679,
      "grad_norm": 1.498179316520691,
      "learning_rate": 0.000180806690387762,
      "loss": 1.3434,
      "step": 4200
    },
    {
      "epoch": 0.32689793092839015,
      "grad_norm": 1.5777134895324707,
      "learning_rate": 0.00018054697036594552,
      "loss": 1.3437,
      "step": 4250
    },
    {
      "epoch": 0.32689793092839015,
      "eval_loss": 1.334365963935852,
      "eval_runtime": 18.026,
      "eval_samples_per_second": 55.475,
      "eval_steps_per_second": 13.869,
      "step": 4250
    },
    {
      "epoch": 0.33074378893931233,
      "grad_norm": 0.7399800419807434,
      "learning_rate": 0.00018028725034412903,
      "loss": 1.2932,
      "step": 4300
    },
    {
      "epoch": 0.3345896469502346,
      "grad_norm": 0.7411991953849792,
      "learning_rate": 0.00018002753032231257,
      "loss": 1.2928,
      "step": 4350
    },
    {
      "epoch": 0.3384355049611568,
      "grad_norm": 1.308003544807434,
      "learning_rate": 0.00017976781030049608,
      "loss": 1.3192,
      "step": 4400
    },
    {
      "epoch": 0.34228136297207906,
      "grad_norm": 1.1857889890670776,
      "learning_rate": 0.0001795080902786796,
      "loss": 1.2718,
      "step": 4450
    },
    {
      "epoch": 0.3461272209830013,
      "grad_norm": 0.5179012417793274,
      "learning_rate": 0.0001792483702568631,
      "loss": 1.2513,
      "step": 4500
    },
    {
      "epoch": 0.3461272209830013,
      "eval_loss": 1.346890926361084,
      "eval_runtime": 18.086,
      "eval_samples_per_second": 55.291,
      "eval_steps_per_second": 13.823,
      "step": 4500
    },
    {
      "epoch": 0.34997307899392355,
      "grad_norm": 1.2267632484436035,
      "learning_rate": 0.00017898865023504662,
      "loss": 1.349,
      "step": 4550
    },
    {
      "epoch": 0.3538189370048458,
      "grad_norm": 0.9660719037055969,
      "learning_rate": 0.00017872893021323013,
      "loss": 1.3194,
      "step": 4600
    },
    {
      "epoch": 0.35766479501576803,
      "grad_norm": 1.4557528495788574,
      "learning_rate": 0.00017846921019141367,
      "loss": 1.3132,
      "step": 4650
    },
    {
      "epoch": 0.3615106530266903,
      "grad_norm": 0.9239174723625183,
      "learning_rate": 0.0001782094901695972,
      "loss": 1.2598,
      "step": 4700
    },
    {
      "epoch": 0.36535651103761246,
      "grad_norm": 1.1237714290618896,
      "learning_rate": 0.0001779497701477807,
      "loss": 1.2506,
      "step": 4750
    },
    {
      "epoch": 0.36535651103761246,
      "eval_loss": 1.3218908309936523,
      "eval_runtime": 18.1211,
      "eval_samples_per_second": 55.184,
      "eval_steps_per_second": 13.796,
      "step": 4750
    },
    {
      "epoch": 0.3692023690485347,
      "grad_norm": 1.0127383470535278,
      "learning_rate": 0.0001776900501259642,
      "loss": 1.3277,
      "step": 4800
    },
    {
      "epoch": 0.37304822705945695,
      "grad_norm": 1.1309473514556885,
      "learning_rate": 0.00017743033010414773,
      "loss": 1.2991,
      "step": 4850
    },
    {
      "epoch": 0.3768940850703792,
      "grad_norm": 1.321747899055481,
      "learning_rate": 0.00017717061008233124,
      "loss": 1.3213,
      "step": 4900
    },
    {
      "epoch": 0.38073994308130144,
      "grad_norm": 1.1251367330551147,
      "learning_rate": 0.00017691089006051478,
      "loss": 1.2742,
      "step": 4950
    },
    {
      "epoch": 0.3845858010922237,
      "grad_norm": 1.1043410301208496,
      "learning_rate": 0.0001766511700386983,
      "loss": 1.2755,
      "step": 5000
    },
    {
      "epoch": 0.3845858010922237,
      "eval_loss": 1.3034751415252686,
      "eval_runtime": 18.0006,
      "eval_samples_per_second": 55.554,
      "eval_steps_per_second": 13.888,
      "step": 5000
    },
    {
      "epoch": 0.3884316591031459,
      "grad_norm": 0.8127657175064087,
      "learning_rate": 0.00017639145001688183,
      "loss": 1.3019,
      "step": 5050
    },
    {
      "epoch": 0.39227751711406816,
      "grad_norm": 0.56494140625,
      "learning_rate": 0.00017613172999506534,
      "loss": 1.3148,
      "step": 5100
    },
    {
      "epoch": 0.3961233751249904,
      "grad_norm": 2.181711435317993,
      "learning_rate": 0.00017587200997324883,
      "loss": 1.2761,
      "step": 5150
    },
    {
      "epoch": 0.39996923313591265,
      "grad_norm": 0.6779603362083435,
      "learning_rate": 0.00017561228995143237,
      "loss": 1.3149,
      "step": 5200
    },
    {
      "epoch": 0.40381509114683484,
      "grad_norm": 0.5844702124595642,
      "learning_rate": 0.00017535256992961588,
      "loss": 1.3437,
      "step": 5250
    },
    {
      "epoch": 0.40381509114683484,
      "eval_loss": 1.2923167943954468,
      "eval_runtime": 17.9871,
      "eval_samples_per_second": 55.595,
      "eval_steps_per_second": 13.899,
      "step": 5250
    },
    {
      "epoch": 0.4076609491577571,
      "grad_norm": 0.9879493117332458,
      "learning_rate": 0.0001750928499077994,
      "loss": 1.2764,
      "step": 5300
    },
    {
      "epoch": 0.4115068071686793,
      "grad_norm": 1.443860650062561,
      "learning_rate": 0.00017483312988598293,
      "loss": 1.3204,
      "step": 5350
    },
    {
      "epoch": 0.41535266517960157,
      "grad_norm": 0.8753446340560913,
      "learning_rate": 0.00017457340986416645,
      "loss": 1.2762,
      "step": 5400
    },
    {
      "epoch": 0.4191985231905238,
      "grad_norm": 1.2027819156646729,
      "learning_rate": 0.00017431368984234996,
      "loss": 1.2097,
      "step": 5450
    },
    {
      "epoch": 0.42304438120144605,
      "grad_norm": 1.1534991264343262,
      "learning_rate": 0.00017405396982053347,
      "loss": 1.2723,
      "step": 5500
    },
    {
      "epoch": 0.42304438120144605,
      "eval_loss": 1.2931731939315796,
      "eval_runtime": 17.9258,
      "eval_samples_per_second": 55.785,
      "eval_steps_per_second": 13.946,
      "step": 5500
    },
    {
      "epoch": 0.4268902392123683,
      "grad_norm": 1.0256164073944092,
      "learning_rate": 0.00017379424979871699,
      "loss": 1.2313,
      "step": 5550
    },
    {
      "epoch": 0.43073609722329054,
      "grad_norm": 1.276945948600769,
      "learning_rate": 0.0001735345297769005,
      "loss": 1.1982,
      "step": 5600
    },
    {
      "epoch": 0.4345819552342128,
      "grad_norm": 0.9002663493156433,
      "learning_rate": 0.00017327480975508404,
      "loss": 1.2726,
      "step": 5650
    },
    {
      "epoch": 0.43842781324513497,
      "grad_norm": 1.1424119472503662,
      "learning_rate": 0.00017301508973326755,
      "loss": 1.3473,
      "step": 5700
    },
    {
      "epoch": 0.4422736712560572,
      "grad_norm": 0.6811870336532593,
      "learning_rate": 0.00017275536971145106,
      "loss": 1.228,
      "step": 5750
    },
    {
      "epoch": 0.4422736712560572,
      "eval_loss": 1.2861703634262085,
      "eval_runtime": 18.0823,
      "eval_samples_per_second": 55.303,
      "eval_steps_per_second": 13.826,
      "step": 5750
    },
    {
      "epoch": 0.44611952926697945,
      "grad_norm": 1.0646696090698242,
      "learning_rate": 0.00017249564968963458,
      "loss": 1.2946,
      "step": 5800
    },
    {
      "epoch": 0.4499653872779017,
      "grad_norm": 1.436909556388855,
      "learning_rate": 0.0001722359296678181,
      "loss": 1.3121,
      "step": 5850
    },
    {
      "epoch": 0.45381124528882394,
      "grad_norm": 0.937135636806488,
      "learning_rate": 0.00017197620964600163,
      "loss": 1.2248,
      "step": 5900
    },
    {
      "epoch": 0.4576571032997462,
      "grad_norm": 0.908935010433197,
      "learning_rate": 0.00017171648962418514,
      "loss": 1.308,
      "step": 5950
    },
    {
      "epoch": 0.4615029613106684,
      "grad_norm": 1.3925087451934814,
      "learning_rate": 0.00017145676960236866,
      "loss": 1.3007,
      "step": 6000
    },
    {
      "epoch": 0.4615029613106684,
      "eval_loss": 1.27406644821167,
      "eval_runtime": 18.0211,
      "eval_samples_per_second": 55.49,
      "eval_steps_per_second": 13.873,
      "step": 6000
    },
    {
      "epoch": 0.46534881932159067,
      "grad_norm": 1.2292288541793823,
      "learning_rate": 0.00017119704958055217,
      "loss": 1.2707,
      "step": 6050
    },
    {
      "epoch": 0.4691946773325129,
      "grad_norm": 0.8948924541473389,
      "learning_rate": 0.00017093732955873568,
      "loss": 1.2481,
      "step": 6100
    },
    {
      "epoch": 0.4730405353434351,
      "grad_norm": 0.7155699133872986,
      "learning_rate": 0.0001706776095369192,
      "loss": 1.2663,
      "step": 6150
    },
    {
      "epoch": 0.47688639335435734,
      "grad_norm": 0.7100064158439636,
      "learning_rate": 0.00017041788951510273,
      "loss": 1.2976,
      "step": 6200
    },
    {
      "epoch": 0.4807322513652796,
      "grad_norm": 1.3250987529754639,
      "learning_rate": 0.00017015816949328625,
      "loss": 1.2368,
      "step": 6250
    },
    {
      "epoch": 0.4807322513652796,
      "eval_loss": 1.2630703449249268,
      "eval_runtime": 18.0054,
      "eval_samples_per_second": 55.539,
      "eval_steps_per_second": 13.885,
      "step": 6250
    },
    {
      "epoch": 0.4845781093762018,
      "grad_norm": 0.9060600996017456,
      "learning_rate": 0.00016989844947146976,
      "loss": 1.2344,
      "step": 6300
    },
    {
      "epoch": 0.48842396738712407,
      "grad_norm": 0.8371444940567017,
      "learning_rate": 0.0001696387294496533,
      "loss": 1.277,
      "step": 6350
    },
    {
      "epoch": 0.4922698253980463,
      "grad_norm": 1.2833727598190308,
      "learning_rate": 0.00016937900942783679,
      "loss": 1.2941,
      "step": 6400
    },
    {
      "epoch": 0.49611568340896856,
      "grad_norm": 1.5922775268554688,
      "learning_rate": 0.0001691192894060203,
      "loss": 1.2448,
      "step": 6450
    },
    {
      "epoch": 0.4999615414198908,
      "grad_norm": 0.9083874225616455,
      "learning_rate": 0.00016885956938420384,
      "loss": 1.2187,
      "step": 6500
    },
    {
      "epoch": 0.4999615414198908,
      "eval_loss": 1.2612597942352295,
      "eval_runtime": 17.9521,
      "eval_samples_per_second": 55.704,
      "eval_steps_per_second": 13.926,
      "step": 6500
    },
    {
      "epoch": 0.503807399430813,
      "grad_norm": 1.3177634477615356,
      "learning_rate": 0.00016859984936238735,
      "loss": 1.305,
      "step": 6550
    },
    {
      "epoch": 0.5076532574417353,
      "grad_norm": 1.8331613540649414,
      "learning_rate": 0.00016834012934057086,
      "loss": 1.2468,
      "step": 6600
    },
    {
      "epoch": 0.5114991154526575,
      "grad_norm": 0.8823532462120056,
      "learning_rate": 0.0001680804093187544,
      "loss": 1.265,
      "step": 6650
    },
    {
      "epoch": 0.5153449734635798,
      "grad_norm": 1.1489806175231934,
      "learning_rate": 0.00016782068929693792,
      "loss": 1.1942,
      "step": 6700
    },
    {
      "epoch": 0.519190831474502,
      "grad_norm": 4.0805816650390625,
      "learning_rate": 0.00016756096927512143,
      "loss": 1.2906,
      "step": 6750
    },
    {
      "epoch": 0.519190831474502,
      "eval_loss": 1.252502202987671,
      "eval_runtime": 17.9221,
      "eval_samples_per_second": 55.797,
      "eval_steps_per_second": 13.949,
      "step": 6750
    },
    {
      "epoch": 0.5230366894854241,
      "grad_norm": 0.9559470415115356,
      "learning_rate": 0.00016730124925330494,
      "loss": 1.1792,
      "step": 6800
    },
    {
      "epoch": 0.5268825474963464,
      "grad_norm": 2.268700361251831,
      "learning_rate": 0.00016704152923148846,
      "loss": 1.2541,
      "step": 6850
    },
    {
      "epoch": 0.5307284055072686,
      "grad_norm": 1.0873395204544067,
      "learning_rate": 0.000166781809209672,
      "loss": 1.2089,
      "step": 6900
    },
    {
      "epoch": 0.5345742635181909,
      "grad_norm": 0.877153217792511,
      "learning_rate": 0.0001665220891878555,
      "loss": 1.2655,
      "step": 6950
    },
    {
      "epoch": 0.5384201215291131,
      "grad_norm": 1.1317107677459717,
      "learning_rate": 0.00016626236916603902,
      "loss": 1.2115,
      "step": 7000
    },
    {
      "epoch": 0.5384201215291131,
      "eval_loss": 1.247037649154663,
      "eval_runtime": 17.9248,
      "eval_samples_per_second": 55.789,
      "eval_steps_per_second": 13.947,
      "step": 7000
    },
    {
      "epoch": 0.5422659795400354,
      "grad_norm": 1.1601048707962036,
      "learning_rate": 0.00016600264914422253,
      "loss": 1.2494,
      "step": 7050
    },
    {
      "epoch": 0.5461118375509576,
      "grad_norm": 0.7940592765808105,
      "learning_rate": 0.00016574292912240605,
      "loss": 1.2906,
      "step": 7100
    },
    {
      "epoch": 0.5499576955618799,
      "grad_norm": 0.6271395087242126,
      "learning_rate": 0.00016548320910058956,
      "loss": 1.2684,
      "step": 7150
    },
    {
      "epoch": 0.5538035535728021,
      "grad_norm": 1.3025091886520386,
      "learning_rate": 0.0001652234890787731,
      "loss": 1.2694,
      "step": 7200
    },
    {
      "epoch": 0.5576494115837243,
      "grad_norm": 1.3218464851379395,
      "learning_rate": 0.0001649637690569566,
      "loss": 1.3098,
      "step": 7250
    },
    {
      "epoch": 0.5576494115837243,
      "eval_loss": 1.2370234727859497,
      "eval_runtime": 18.0496,
      "eval_samples_per_second": 55.403,
      "eval_steps_per_second": 13.851,
      "step": 7250
    },
    {
      "epoch": 0.5614952695946466,
      "grad_norm": 1.1432136297225952,
      "learning_rate": 0.00016470404903514013,
      "loss": 1.2567,
      "step": 7300
    },
    {
      "epoch": 0.5653411276055688,
      "grad_norm": 0.9530320763587952,
      "learning_rate": 0.00016444432901332364,
      "loss": 1.1878,
      "step": 7350
    },
    {
      "epoch": 0.5691869856164911,
      "grad_norm": 1.1852946281433105,
      "learning_rate": 0.00016418460899150715,
      "loss": 1.2153,
      "step": 7400
    },
    {
      "epoch": 0.5730328436274132,
      "grad_norm": 0.7916271686553955,
      "learning_rate": 0.00016392488896969066,
      "loss": 1.2574,
      "step": 7450
    },
    {
      "epoch": 0.5768787016383355,
      "grad_norm": 0.8115867972373962,
      "learning_rate": 0.0001636651689478742,
      "loss": 1.2777,
      "step": 7500
    },
    {
      "epoch": 0.5768787016383355,
      "eval_loss": 1.2334003448486328,
      "eval_runtime": 18.0145,
      "eval_samples_per_second": 55.511,
      "eval_steps_per_second": 13.878,
      "step": 7500
    },
    {
      "epoch": 0.5807245596492577,
      "grad_norm": 0.9350728988647461,
      "learning_rate": 0.00016340544892605772,
      "loss": 1.2261,
      "step": 7550
    },
    {
      "epoch": 0.58457041766018,
      "grad_norm": 0.7061731815338135,
      "learning_rate": 0.00016314572890424126,
      "loss": 1.172,
      "step": 7600
    },
    {
      "epoch": 0.5884162756711022,
      "grad_norm": 1.091739296913147,
      "learning_rate": 0.00016288600888242477,
      "loss": 1.272,
      "step": 7650
    },
    {
      "epoch": 0.5922621336820244,
      "grad_norm": 0.8880358338356018,
      "learning_rate": 0.00016262628886060826,
      "loss": 1.1847,
      "step": 7700
    },
    {
      "epoch": 0.5961079916929467,
      "grad_norm": 2.7609329223632812,
      "learning_rate": 0.0001623665688387918,
      "loss": 1.2636,
      "step": 7750
    },
    {
      "epoch": 0.5961079916929467,
      "eval_loss": 1.2325551509857178,
      "eval_runtime": 17.8663,
      "eval_samples_per_second": 55.971,
      "eval_steps_per_second": 13.993,
      "step": 7750
    },
    {
      "epoch": 0.5999538497038689,
      "grad_norm": 0.8872610926628113,
      "learning_rate": 0.0001621068488169753,
      "loss": 1.245,
      "step": 7800
    },
    {
      "epoch": 0.6037997077147912,
      "grad_norm": 0.7548871040344238,
      "learning_rate": 0.00016184712879515882,
      "loss": 1.2349,
      "step": 7850
    },
    {
      "epoch": 0.6076455657257134,
      "grad_norm": 1.104351282119751,
      "learning_rate": 0.00016158740877334236,
      "loss": 1.2397,
      "step": 7900
    },
    {
      "epoch": 0.6114914237366357,
      "grad_norm": 0.8331647515296936,
      "learning_rate": 0.00016132768875152587,
      "loss": 1.2634,
      "step": 7950
    },
    {
      "epoch": 0.6153372817475579,
      "grad_norm": 1.0910013914108276,
      "learning_rate": 0.00016106796872970939,
      "loss": 1.2897,
      "step": 8000
    },
    {
      "epoch": 0.6153372817475579,
      "eval_loss": 1.2241965532302856,
      "eval_runtime": 18.04,
      "eval_samples_per_second": 55.432,
      "eval_steps_per_second": 13.858,
      "step": 8000
    },
    {
      "epoch": 0.6191831397584802,
      "grad_norm": 1.4128022193908691,
      "learning_rate": 0.0001608082487078929,
      "loss": 1.1912,
      "step": 8050
    },
    {
      "epoch": 0.6230289977694023,
      "grad_norm": 1.5363566875457764,
      "learning_rate": 0.0001605485286860764,
      "loss": 1.2622,
      "step": 8100
    },
    {
      "epoch": 0.6268748557803245,
      "grad_norm": 1.334889531135559,
      "learning_rate": 0.00016028880866425992,
      "loss": 1.2462,
      "step": 8150
    },
    {
      "epoch": 0.6307207137912468,
      "grad_norm": 1.63850998878479,
      "learning_rate": 0.00016002908864244346,
      "loss": 1.2099,
      "step": 8200
    },
    {
      "epoch": 0.634566571802169,
      "grad_norm": 1.2087870836257935,
      "learning_rate": 0.00015976936862062698,
      "loss": 1.1669,
      "step": 8250
    },
    {
      "epoch": 0.634566571802169,
      "eval_loss": 1.2158918380737305,
      "eval_runtime": 17.8293,
      "eval_samples_per_second": 56.087,
      "eval_steps_per_second": 14.022,
      "step": 8250
    },
    {
      "epoch": 0.6384124298130913,
      "grad_norm": 2.0238049030303955,
      "learning_rate": 0.0001595096485988105,
      "loss": 1.1893,
      "step": 8300
    },
    {
      "epoch": 0.6422582878240135,
      "grad_norm": 0.7206680178642273,
      "learning_rate": 0.000159249928576994,
      "loss": 1.2155,
      "step": 8350
    },
    {
      "epoch": 0.6461041458349358,
      "grad_norm": 1.0200512409210205,
      "learning_rate": 0.00015899020855517752,
      "loss": 1.2049,
      "step": 8400
    },
    {
      "epoch": 0.649950003845858,
      "grad_norm": 0.7880833745002747,
      "learning_rate": 0.00015873048853336106,
      "loss": 1.2564,
      "step": 8450
    },
    {
      "epoch": 0.6537958618567803,
      "grad_norm": 1.0986734628677368,
      "learning_rate": 0.00015847076851154457,
      "loss": 1.2989,
      "step": 8500
    },
    {
      "epoch": 0.6537958618567803,
      "eval_loss": 1.2085261344909668,
      "eval_runtime": 17.887,
      "eval_samples_per_second": 55.907,
      "eval_steps_per_second": 13.977,
      "step": 8500
    },
    {
      "epoch": 0.6576417198677025,
      "grad_norm": 0.5527728796005249,
      "learning_rate": 0.00015821104848972808,
      "loss": 1.1798,
      "step": 8550
    },
    {
      "epoch": 0.6614875778786247,
      "grad_norm": 1.0168190002441406,
      "learning_rate": 0.0001579513284679116,
      "loss": 1.211,
      "step": 8600
    },
    {
      "epoch": 0.665333435889547,
      "grad_norm": 0.8436816334724426,
      "learning_rate": 0.0001576916084460951,
      "loss": 1.2018,
      "step": 8650
    },
    {
      "epoch": 0.6691792939004692,
      "grad_norm": 0.967677891254425,
      "learning_rate": 0.00015743188842427862,
      "loss": 1.2134,
      "step": 8700
    },
    {
      "epoch": 0.6730251519113915,
      "grad_norm": 0.9716609120368958,
      "learning_rate": 0.00015717216840246216,
      "loss": 1.2206,
      "step": 8750
    },
    {
      "epoch": 0.6730251519113915,
      "eval_loss": 1.2069470882415771,
      "eval_runtime": 17.9386,
      "eval_samples_per_second": 55.746,
      "eval_steps_per_second": 13.936,
      "step": 8750
    },
    {
      "epoch": 0.6768710099223136,
      "grad_norm": 1.1798300743103027,
      "learning_rate": 0.00015691244838064567,
      "loss": 1.2258,
      "step": 8800
    },
    {
      "epoch": 0.6807168679332359,
      "grad_norm": 1.2064564228057861,
      "learning_rate": 0.00015665272835882919,
      "loss": 1.217,
      "step": 8850
    },
    {
      "epoch": 0.6845627259441581,
      "grad_norm": 1.2753881216049194,
      "learning_rate": 0.00015639300833701273,
      "loss": 1.1915,
      "step": 8900
    },
    {
      "epoch": 0.6884085839550804,
      "grad_norm": 1.2899794578552246,
      "learning_rate": 0.00015613848271563255,
      "loss": 1.1967,
      "step": 8950
    },
    {
      "epoch": 0.6922544419660026,
      "grad_norm": 0.5771601796150208,
      "learning_rate": 0.00015587876269381606,
      "loss": 1.1958,
      "step": 9000
    },
    {
      "epoch": 0.6922544419660026,
      "eval_loss": 1.19791841506958,
      "eval_runtime": 18.0075,
      "eval_samples_per_second": 55.532,
      "eval_steps_per_second": 13.883,
      "step": 9000
    },
    {
      "epoch": 0.6961002999769248,
      "grad_norm": 1.467191457748413,
      "learning_rate": 0.00015561904267199958,
      "loss": 1.1938,
      "step": 9050
    },
    {
      "epoch": 0.6999461579878471,
      "grad_norm": 0.7669786214828491,
      "learning_rate": 0.00015535932265018312,
      "loss": 1.2228,
      "step": 9100
    },
    {
      "epoch": 0.7037920159987693,
      "grad_norm": 0.843961238861084,
      "learning_rate": 0.00015509960262836663,
      "loss": 1.1543,
      "step": 9150
    },
    {
      "epoch": 0.7076378740096916,
      "grad_norm": 1.2265573740005493,
      "learning_rate": 0.00015483988260655017,
      "loss": 1.1518,
      "step": 9200
    },
    {
      "epoch": 0.7114837320206138,
      "grad_norm": 1.1644186973571777,
      "learning_rate": 0.00015458016258473365,
      "loss": 1.23,
      "step": 9250
    },
    {
      "epoch": 0.7114837320206138,
      "eval_loss": 1.1922409534454346,
      "eval_runtime": 17.9384,
      "eval_samples_per_second": 55.746,
      "eval_steps_per_second": 13.937,
      "step": 9250
    },
    {
      "epoch": 0.7153295900315361,
      "grad_norm": 1.3787301778793335,
      "learning_rate": 0.00015432044256291717,
      "loss": 1.2347,
      "step": 9300
    },
    {
      "epoch": 0.7191754480424583,
      "grad_norm": 1.4755727052688599,
      "learning_rate": 0.0001540607225411007,
      "loss": 1.1596,
      "step": 9350
    },
    {
      "epoch": 0.7230213060533806,
      "grad_norm": 1.031275749206543,
      "learning_rate": 0.00015380100251928422,
      "loss": 1.2045,
      "step": 9400
    },
    {
      "epoch": 0.7268671640643027,
      "grad_norm": 1.2802574634552002,
      "learning_rate": 0.00015354128249746773,
      "loss": 1.205,
      "step": 9450
    },
    {
      "epoch": 0.7307130220752249,
      "grad_norm": 0.5222998857498169,
      "learning_rate": 0.00015328156247565127,
      "loss": 1.1678,
      "step": 9500
    },
    {
      "epoch": 0.7307130220752249,
      "eval_loss": 1.1886347532272339,
      "eval_runtime": 17.9243,
      "eval_samples_per_second": 55.79,
      "eval_steps_per_second": 13.948,
      "step": 9500
    },
    {
      "epoch": 0.7345588800861472,
      "grad_norm": 0.8676270842552185,
      "learning_rate": 0.00015302184245383479,
      "loss": 1.1388,
      "step": 9550
    },
    {
      "epoch": 0.7384047380970694,
      "grad_norm": 1.198843240737915,
      "learning_rate": 0.00015276212243201827,
      "loss": 1.1995,
      "step": 9600
    },
    {
      "epoch": 0.7422505961079917,
      "grad_norm": 0.6684653162956238,
      "learning_rate": 0.0001525024024102018,
      "loss": 1.1567,
      "step": 9650
    },
    {
      "epoch": 0.7460964541189139,
      "grad_norm": 0.931119441986084,
      "learning_rate": 0.00015224268238838532,
      "loss": 1.1375,
      "step": 9700
    },
    {
      "epoch": 0.7499423121298362,
      "grad_norm": 0.7734692096710205,
      "learning_rate": 0.00015198296236656884,
      "loss": 1.1836,
      "step": 9750
    },
    {
      "epoch": 0.7499423121298362,
      "eval_loss": 1.1750942468643188,
      "eval_runtime": 18.0078,
      "eval_samples_per_second": 55.532,
      "eval_steps_per_second": 13.883,
      "step": 9750
    },
    {
      "epoch": 0.7537881701407584,
      "grad_norm": 1.0753988027572632,
      "learning_rate": 0.00015172324234475238,
      "loss": 1.1666,
      "step": 9800
    },
    {
      "epoch": 0.7576340281516807,
      "grad_norm": 0.45949143171310425,
      "learning_rate": 0.0001514635223229359,
      "loss": 1.1498,
      "step": 9850
    },
    {
      "epoch": 0.7614798861626029,
      "grad_norm": 1.0716335773468018,
      "learning_rate": 0.0001512038023011194,
      "loss": 1.1669,
      "step": 9900
    },
    {
      "epoch": 0.7653257441735252,
      "grad_norm": 1.043646216392517,
      "learning_rate": 0.00015094408227930292,
      "loss": 1.1689,
      "step": 9950
    },
    {
      "epoch": 0.7691716021844474,
      "grad_norm": 1.0494813919067383,
      "learning_rate": 0.00015068436225748643,
      "loss": 1.2343,
      "step": 10000
    },
    {
      "epoch": 0.7691716021844474,
      "eval_loss": 1.1786631345748901,
      "eval_runtime": 17.9206,
      "eval_samples_per_second": 55.802,
      "eval_steps_per_second": 13.95,
      "step": 10000
    },
    {
      "epoch": 0.7730174601953695,
      "grad_norm": 0.7689708471298218,
      "learning_rate": 0.00015042983663610628,
      "loss": 1.2341,
      "step": 10050
    },
    {
      "epoch": 0.7768633182062918,
      "grad_norm": 1.0559266805648804,
      "learning_rate": 0.00015017011661428982,
      "loss": 1.0868,
      "step": 10100
    },
    {
      "epoch": 0.780709176217214,
      "grad_norm": 1.01194429397583,
      "learning_rate": 0.00014991039659247333,
      "loss": 1.2187,
      "step": 10150
    },
    {
      "epoch": 0.7845550342281363,
      "grad_norm": 0.9095450043678284,
      "learning_rate": 0.00014965067657065684,
      "loss": 1.1432,
      "step": 10200
    },
    {
      "epoch": 0.7884008922390585,
      "grad_norm": 1.1280279159545898,
      "learning_rate": 0.00014939095654884036,
      "loss": 1.1631,
      "step": 10250
    },
    {
      "epoch": 0.7884008922390585,
      "eval_loss": 1.1772924661636353,
      "eval_runtime": 18.038,
      "eval_samples_per_second": 55.438,
      "eval_steps_per_second": 13.86,
      "step": 10250
    },
    {
      "epoch": 0.7922467502499808,
      "grad_norm": 1.1410025358200073,
      "learning_rate": 0.00014913123652702387,
      "loss": 1.2058,
      "step": 10300
    },
    {
      "epoch": 0.796092608260903,
      "grad_norm": 0.7516416311264038,
      "learning_rate": 0.00014887151650520738,
      "loss": 1.2702,
      "step": 10350
    },
    {
      "epoch": 0.7999384662718253,
      "grad_norm": 1.9470982551574707,
      "learning_rate": 0.00014861179648339092,
      "loss": 1.1322,
      "step": 10400
    },
    {
      "epoch": 0.8037843242827475,
      "grad_norm": 1.1969455480575562,
      "learning_rate": 0.00014835207646157444,
      "loss": 1.1882,
      "step": 10450
    },
    {
      "epoch": 0.8076301822936697,
      "grad_norm": 1.2365367412567139,
      "learning_rate": 0.00014809235643975795,
      "loss": 1.1929,
      "step": 10500
    },
    {
      "epoch": 0.8076301822936697,
      "eval_loss": 1.168535590171814,
      "eval_runtime": 17.931,
      "eval_samples_per_second": 55.769,
      "eval_steps_per_second": 13.942,
      "step": 10500
    },
    {
      "epoch": 0.811476040304592,
      "grad_norm": 1.2798963785171509,
      "learning_rate": 0.00014783263641794146,
      "loss": 1.2693,
      "step": 10550
    },
    {
      "epoch": 0.8153218983155142,
      "grad_norm": 0.8195398449897766,
      "learning_rate": 0.00014757291639612497,
      "loss": 1.1933,
      "step": 10600
    },
    {
      "epoch": 0.8191677563264365,
      "grad_norm": 1.039031744003296,
      "learning_rate": 0.0001473131963743085,
      "loss": 1.1257,
      "step": 10650
    },
    {
      "epoch": 0.8230136143373586,
      "grad_norm": 1.0875959396362305,
      "learning_rate": 0.00014705347635249203,
      "loss": 1.2578,
      "step": 10700
    },
    {
      "epoch": 0.8268594723482809,
      "grad_norm": 1.5674726963043213,
      "learning_rate": 0.00014679375633067554,
      "loss": 1.1895,
      "step": 10750
    },
    {
      "epoch": 0.8268594723482809,
      "eval_loss": 1.1696391105651855,
      "eval_runtime": 17.8658,
      "eval_samples_per_second": 55.973,
      "eval_steps_per_second": 13.993,
      "step": 10750
    },
    {
      "epoch": 0.8307053303592031,
      "grad_norm": 0.7315701842308044,
      "learning_rate": 0.00014653403630885905,
      "loss": 1.1379,
      "step": 10800
    },
    {
      "epoch": 0.8345511883701254,
      "grad_norm": 1.0033215284347534,
      "learning_rate": 0.00014627431628704257,
      "loss": 1.198,
      "step": 10850
    },
    {
      "epoch": 0.8383970463810476,
      "grad_norm": 1.0194263458251953,
      "learning_rate": 0.00014601459626522608,
      "loss": 1.2186,
      "step": 10900
    },
    {
      "epoch": 0.8422429043919698,
      "grad_norm": 0.9829340577125549,
      "learning_rate": 0.00014575487624340962,
      "loss": 1.1458,
      "step": 10950
    },
    {
      "epoch": 0.8460887624028921,
      "grad_norm": 1.3082759380340576,
      "learning_rate": 0.00014549515622159313,
      "loss": 1.1545,
      "step": 11000
    },
    {
      "epoch": 0.8460887624028921,
      "eval_loss": 1.1625275611877441,
      "eval_runtime": 17.8346,
      "eval_samples_per_second": 56.071,
      "eval_steps_per_second": 14.018,
      "step": 11000
    },
    {
      "epoch": 0.8499346204138143,
      "grad_norm": 1.2006999254226685,
      "learning_rate": 0.00014523543619977664,
      "loss": 1.1187,
      "step": 11050
    },
    {
      "epoch": 0.8537804784247366,
      "grad_norm": 1.0500705242156982,
      "learning_rate": 0.00014497571617796018,
      "loss": 1.1911,
      "step": 11100
    },
    {
      "epoch": 0.8576263364356588,
      "grad_norm": 0.8597742915153503,
      "learning_rate": 0.0001447159961561437,
      "loss": 1.1176,
      "step": 11150
    },
    {
      "epoch": 0.8614721944465811,
      "grad_norm": 1.338990569114685,
      "learning_rate": 0.00014445627613432718,
      "loss": 1.1699,
      "step": 11200
    },
    {
      "epoch": 0.8653180524575033,
      "grad_norm": 0.903128445148468,
      "learning_rate": 0.00014419655611251072,
      "loss": 1.1504,
      "step": 11250
    },
    {
      "epoch": 0.8653180524575033,
      "eval_loss": 1.161923885345459,
      "eval_runtime": 17.9304,
      "eval_samples_per_second": 55.771,
      "eval_steps_per_second": 13.943,
      "step": 11250
    },
    {
      "epoch": 0.8691639104684256,
      "grad_norm": 0.8747849464416504,
      "learning_rate": 0.00014393683609069424,
      "loss": 1.1305,
      "step": 11300
    },
    {
      "epoch": 0.8730097684793477,
      "grad_norm": 1.1505181789398193,
      "learning_rate": 0.00014367711606887775,
      "loss": 1.1515,
      "step": 11350
    },
    {
      "epoch": 0.8768556264902699,
      "grad_norm": 0.6178755164146423,
      "learning_rate": 0.0001434173960470613,
      "loss": 1.1785,
      "step": 11400
    },
    {
      "epoch": 0.8807014845011922,
      "grad_norm": 0.7437123656272888,
      "learning_rate": 0.0001431576760252448,
      "loss": 1.1567,
      "step": 11450
    },
    {
      "epoch": 0.8845473425121144,
      "grad_norm": 1.574104905128479,
      "learning_rate": 0.00014289795600342831,
      "loss": 1.1371,
      "step": 11500
    },
    {
      "epoch": 0.8845473425121144,
      "eval_loss": 1.154138445854187,
      "eval_runtime": 17.9772,
      "eval_samples_per_second": 55.626,
      "eval_steps_per_second": 13.906,
      "step": 11500
    },
    {
      "epoch": 0.8883932005230367,
      "grad_norm": 2.203948497772217,
      "learning_rate": 0.00014264343038204814,
      "loss": 1.2118,
      "step": 11550
    },
    {
      "epoch": 0.8922390585339589,
      "grad_norm": 1.1410473585128784,
      "learning_rate": 0.00014238371036023168,
      "loss": 1.2239,
      "step": 11600
    },
    {
      "epoch": 0.8960849165448812,
      "grad_norm": 1.0226402282714844,
      "learning_rate": 0.0001421239903384152,
      "loss": 1.1686,
      "step": 11650
    },
    {
      "epoch": 0.8999307745558034,
      "grad_norm": 1.0350555181503296,
      "learning_rate": 0.00014186427031659873,
      "loss": 1.1584,
      "step": 11700
    },
    {
      "epoch": 0.9037766325667257,
      "grad_norm": 1.6758803129196167,
      "learning_rate": 0.00014160455029478224,
      "loss": 1.1638,
      "step": 11750
    },
    {
      "epoch": 0.9037766325667257,
      "eval_loss": 1.1433300971984863,
      "eval_runtime": 17.8796,
      "eval_samples_per_second": 55.93,
      "eval_steps_per_second": 13.982,
      "step": 11750
    },
    {
      "epoch": 0.9076224905776479,
      "grad_norm": 1.2579885721206665,
      "learning_rate": 0.00014134483027296576,
      "loss": 1.2198,
      "step": 11800
    },
    {
      "epoch": 0.9114683485885701,
      "grad_norm": 0.886987030506134,
      "learning_rate": 0.00014108511025114927,
      "loss": 1.1477,
      "step": 11850
    },
    {
      "epoch": 0.9153142065994924,
      "grad_norm": 0.9828807711601257,
      "learning_rate": 0.00014082539022933278,
      "loss": 1.1115,
      "step": 11900
    },
    {
      "epoch": 0.9191600646104146,
      "grad_norm": 1.2362446784973145,
      "learning_rate": 0.0001405656702075163,
      "loss": 1.161,
      "step": 11950
    },
    {
      "epoch": 0.9230059226213368,
      "grad_norm": 1.1353052854537964,
      "learning_rate": 0.00014030595018569984,
      "loss": 1.1369,
      "step": 12000
    },
    {
      "epoch": 0.9230059226213368,
      "eval_loss": 1.144094467163086,
      "eval_runtime": 17.8499,
      "eval_samples_per_second": 56.023,
      "eval_steps_per_second": 14.006,
      "step": 12000
    },
    {
      "epoch": 0.926851780632259,
      "grad_norm": 1.0390766859054565,
      "learning_rate": 0.00014004623016388335,
      "loss": 1.1763,
      "step": 12050
    },
    {
      "epoch": 0.9306976386431813,
      "grad_norm": 1.1437292098999023,
      "learning_rate": 0.00013978651014206686,
      "loss": 1.153,
      "step": 12100
    },
    {
      "epoch": 0.9345434966541035,
      "grad_norm": 0.7012118697166443,
      "learning_rate": 0.00013952679012025037,
      "loss": 1.1209,
      "step": 12150
    },
    {
      "epoch": 0.9383893546650258,
      "grad_norm": 0.558203399181366,
      "learning_rate": 0.0001392670700984339,
      "loss": 1.0998,
      "step": 12200
    },
    {
      "epoch": 0.942235212675948,
      "grad_norm": 1.031898021697998,
      "learning_rate": 0.0001390073500766174,
      "loss": 1.1308,
      "step": 12250
    },
    {
      "epoch": 0.942235212675948,
      "eval_loss": 1.1427006721496582,
      "eval_runtime": 17.9677,
      "eval_samples_per_second": 55.656,
      "eval_steps_per_second": 13.914,
      "step": 12250
    },
    {
      "epoch": 0.9460810706868702,
      "grad_norm": 0.9146320223808289,
      "learning_rate": 0.00013874763005480094,
      "loss": 1.1441,
      "step": 12300
    },
    {
      "epoch": 0.9499269286977925,
      "grad_norm": 1.7698357105255127,
      "learning_rate": 0.00013848791003298445,
      "loss": 1.1278,
      "step": 12350
    },
    {
      "epoch": 0.9537727867087147,
      "grad_norm": 1.7621064186096191,
      "learning_rate": 0.00013822819001116797,
      "loss": 1.2191,
      "step": 12400
    },
    {
      "epoch": 0.957618644719637,
      "grad_norm": 1.2093744277954102,
      "learning_rate": 0.00013796846998935148,
      "loss": 1.1541,
      "step": 12450
    },
    {
      "epoch": 0.9614645027305592,
      "grad_norm": 1.0879639387130737,
      "learning_rate": 0.000137708749967535,
      "loss": 1.1278,
      "step": 12500
    },
    {
      "epoch": 0.9614645027305592,
      "eval_loss": 1.1357394456863403,
      "eval_runtime": 17.9695,
      "eval_samples_per_second": 55.65,
      "eval_steps_per_second": 13.912,
      "step": 12500
    },
    {
      "epoch": 0.9653103607414815,
      "grad_norm": 1.415139079093933,
      "learning_rate": 0.00013744902994571853,
      "loss": 1.1986,
      "step": 12550
    },
    {
      "epoch": 0.9691562187524037,
      "grad_norm": 1.0320454835891724,
      "learning_rate": 0.00013718930992390204,
      "loss": 1.153,
      "step": 12600
    },
    {
      "epoch": 0.973002076763326,
      "grad_norm": 1.0736747980117798,
      "learning_rate": 0.00013692958990208556,
      "loss": 1.1931,
      "step": 12650
    },
    {
      "epoch": 0.9768479347742481,
      "grad_norm": 0.8954864740371704,
      "learning_rate": 0.0001366698698802691,
      "loss": 1.1908,
      "step": 12700
    },
    {
      "epoch": 0.9806937927851703,
      "grad_norm": 1.3287911415100098,
      "learning_rate": 0.0001364101498584526,
      "loss": 1.1502,
      "step": 12750
    },
    {
      "epoch": 0.9806937927851703,
      "eval_loss": 1.133840799331665,
      "eval_runtime": 18.0959,
      "eval_samples_per_second": 55.261,
      "eval_steps_per_second": 13.815,
      "step": 12750
    },
    {
      "epoch": 0.9845396507960926,
      "grad_norm": 1.000588059425354,
      "learning_rate": 0.0001361504298366361,
      "loss": 1.1001,
      "step": 12800
    },
    {
      "epoch": 0.9883855088070148,
      "grad_norm": 0.9359833598136902,
      "learning_rate": 0.00013589070981481963,
      "loss": 1.2289,
      "step": 12850
    },
    {
      "epoch": 0.9922313668179371,
      "grad_norm": 1.5241230726242065,
      "learning_rate": 0.00013563098979300315,
      "loss": 1.1726,
      "step": 12900
    },
    {
      "epoch": 0.9960772248288593,
      "grad_norm": 1.0804429054260254,
      "learning_rate": 0.00013537126977118666,
      "loss": 1.1755,
      "step": 12950
    },
    {
      "epoch": 0.9999230828397816,
      "grad_norm": 0.8376865983009338,
      "learning_rate": 0.0001351115497493702,
      "loss": 1.158,
      "step": 13000
    },
    {
      "epoch": 0.9999230828397816,
      "eval_loss": 1.1301764249801636,
      "eval_runtime": 18.0468,
      "eval_samples_per_second": 55.411,
      "eval_steps_per_second": 13.853,
      "step": 13000
    },
    {
      "epoch": 1.0037689408507038,
      "grad_norm": 0.8493902087211609,
      "learning_rate": 0.0001348518297275537,
      "loss": 1.0935,
      "step": 13050
    },
    {
      "epoch": 1.007614798861626,
      "grad_norm": 0.9508585929870605,
      "learning_rate": 0.00013459210970573723,
      "loss": 1.0773,
      "step": 13100
    },
    {
      "epoch": 1.0114606568725482,
      "grad_norm": 1.047767162322998,
      "learning_rate": 0.00013433238968392074,
      "loss": 1.1722,
      "step": 13150
    },
    {
      "epoch": 1.0153065148834706,
      "grad_norm": 1.0310213565826416,
      "learning_rate": 0.00013407266966210425,
      "loss": 1.059,
      "step": 13200
    },
    {
      "epoch": 1.0191523728943928,
      "grad_norm": 0.7563040852546692,
      "learning_rate": 0.00013381294964028776,
      "loss": 1.0438,
      "step": 13250
    },
    {
      "epoch": 1.0191523728943928,
      "eval_loss": 1.121547818183899,
      "eval_runtime": 17.9914,
      "eval_samples_per_second": 55.582,
      "eval_steps_per_second": 13.895,
      "step": 13250
    },
    {
      "epoch": 1.022998230905315,
      "grad_norm": 0.8817610144615173,
      "learning_rate": 0.0001335532296184713,
      "loss": 1.1176,
      "step": 13300
    },
    {
      "epoch": 1.0268440889162371,
      "grad_norm": 0.8703081011772156,
      "learning_rate": 0.00013329350959665482,
      "loss": 1.0568,
      "step": 13350
    },
    {
      "epoch": 1.0306899469271595,
      "grad_norm": 1.0551347732543945,
      "learning_rate": 0.00013303378957483833,
      "loss": 1.1746,
      "step": 13400
    },
    {
      "epoch": 1.0345358049380817,
      "grad_norm": 1.2630723714828491,
      "learning_rate": 0.00013277406955302184,
      "loss": 1.0232,
      "step": 13450
    },
    {
      "epoch": 1.038381662949004,
      "grad_norm": 1.2565157413482666,
      "learning_rate": 0.00013251434953120536,
      "loss": 1.0523,
      "step": 13500
    },
    {
      "epoch": 1.038381662949004,
      "eval_loss": 1.1202689409255981,
      "eval_runtime": 17.8981,
      "eval_samples_per_second": 55.872,
| "eval_steps_per_second": 13.968, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.042227520959926, | |
| "grad_norm": 1.9204115867614746, | |
| "learning_rate": 0.0001322546295093889, | |
| "loss": 1.1115, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.0460733789708483, | |
| "grad_norm": 1.1753497123718262, | |
| "learning_rate": 0.0001319949094875724, | |
| "loss": 1.0463, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.0499192369817707, | |
| "grad_norm": 1.0144574642181396, | |
| "learning_rate": 0.00013173518946575592, | |
| "loss": 1.0914, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.0537650949926929, | |
| "grad_norm": 2.0906662940979004, | |
| "learning_rate": 0.00013147546944393943, | |
| "loss": 1.1635, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.057610953003615, | |
| "grad_norm": 1.0127108097076416, | |
| "learning_rate": 0.00013121574942212295, | |
| "loss": 1.1266, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.057610953003615, | |
| "eval_loss": 1.115894079208374, | |
| "eval_runtime": 17.8891, | |
| "eval_samples_per_second": 55.9, | |
| "eval_steps_per_second": 13.975, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.0614568110145373, | |
| "grad_norm": 1.2559298276901245, | |
| "learning_rate": 0.00013095602940030646, | |
| "loss": 1.0924, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.0653026690254597, | |
| "grad_norm": 0.7502859234809875, | |
| "learning_rate": 0.00013069630937849, | |
| "loss": 1.0756, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.0691485270363819, | |
| "grad_norm": 0.6954963207244873, | |
| "learning_rate": 0.0001304365893566735, | |
| "loss": 1.1284, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.072994385047304, | |
| "grad_norm": 1.3833235502243042, | |
| "learning_rate": 0.00013017686933485703, | |
| "loss": 1.1175, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.0768402430582262, | |
| "grad_norm": 0.9848393201828003, | |
| "learning_rate": 0.00012991714931304057, | |
| "loss": 1.1295, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.0768402430582262, | |
| "eval_loss": 1.1086758375167847, | |
| "eval_runtime": 17.8007, | |
| "eval_samples_per_second": 56.177, | |
| "eval_steps_per_second": 14.044, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.0806861010691486, | |
| "grad_norm": 1.1354585886001587, | |
| "learning_rate": 0.00012965742929122405, | |
| "loss": 1.0351, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.0845319590800708, | |
| "grad_norm": 0.760317325592041, | |
| "learning_rate": 0.00012939770926940756, | |
| "loss": 1.0614, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.088377817090993, | |
| "grad_norm": 0.9277663230895996, | |
| "learning_rate": 0.0001291379892475911, | |
| "loss": 1.0689, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.0922236751019152, | |
| "grad_norm": 1.0846219062805176, | |
| "learning_rate": 0.00012887826922577462, | |
| "loss": 1.1679, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.0960695331128374, | |
| "grad_norm": 1.204969048500061, | |
| "learning_rate": 0.00012861854920395816, | |
| "loss": 1.1198, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.0960695331128374, | |
| "eval_loss": 1.1072660684585571, | |
| "eval_runtime": 17.9757, | |
| "eval_samples_per_second": 55.631, | |
| "eval_steps_per_second": 13.908, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.0999153911237598, | |
| "grad_norm": 1.556897759437561, | |
| "learning_rate": 0.00012835882918214167, | |
| "loss": 1.1511, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.103761249134682, | |
| "grad_norm": 1.4192557334899902, | |
| "learning_rate": 0.00012809910916032518, | |
| "loss": 1.1074, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.1076071071456042, | |
| "grad_norm": 0.5456421971321106, | |
| "learning_rate": 0.0001278393891385087, | |
| "loss": 1.0818, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.1114529651565264, | |
| "grad_norm": 1.282106876373291, | |
| "learning_rate": 0.0001275796691166922, | |
| "loss": 1.0431, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.1152988231674485, | |
| "grad_norm": 0.7997551560401917, | |
| "learning_rate": 0.00012731994909487572, | |
| "loss": 1.0882, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.1152988231674485, | |
| "eval_loss": 1.1057496070861816, | |
| "eval_runtime": 17.7977, | |
| "eval_samples_per_second": 56.187, | |
| "eval_steps_per_second": 14.047, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.119144681178371, | |
| "grad_norm": 1.2388238906860352, | |
| "learning_rate": 0.00012706022907305926, | |
| "loss": 1.1321, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.1229905391892931, | |
| "grad_norm": 0.5141006708145142, | |
| "learning_rate": 0.00012680050905124277, | |
| "loss": 1.0453, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.1268363972002153, | |
| "grad_norm": 1.1240845918655396, | |
| "learning_rate": 0.0001265407890294263, | |
| "loss": 1.0742, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.1306822552111375, | |
| "grad_norm": 1.433976650238037, | |
| "learning_rate": 0.0001262810690076098, | |
| "loss": 1.1468, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.13452811322206, | |
| "grad_norm": 1.077966332435608, | |
| "learning_rate": 0.0001260213489857933, | |
| "loss": 1.1143, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.13452811322206, | |
| "eval_loss": 1.1030505895614624, | |
| "eval_runtime": 17.8195, | |
| "eval_samples_per_second": 56.118, | |
| "eval_steps_per_second": 14.03, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.1383739712329821, | |
| "grad_norm": 1.1456421613693237, | |
| "learning_rate": 0.00012576162896397683, | |
| "loss": 1.1173, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.1422198292439043, | |
| "grad_norm": 0.9305130243301392, | |
| "learning_rate": 0.00012550190894216037, | |
| "loss": 1.1158, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.1460656872548265, | |
| "grad_norm": 1.3796011209487915, | |
| "learning_rate": 0.00012524218892034388, | |
| "loss": 1.0785, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.149911545265749, | |
| "grad_norm": 0.9901970028877258, | |
| "learning_rate": 0.0001249824688985274, | |
| "loss": 1.1383, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.153757403276671, | |
| "grad_norm": 0.8250207304954529, | |
| "learning_rate": 0.00012472794327714724, | |
| "loss": 1.1363, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.153757403276671, | |
| "eval_loss": 1.103262186050415, | |
| "eval_runtime": 17.8852, | |
| "eval_samples_per_second": 55.912, | |
| "eval_steps_per_second": 13.978, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.1576032612875933, | |
| "grad_norm": 0.7949115037918091, | |
| "learning_rate": 0.00012446822325533076, | |
| "loss": 1.0542, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.1614491192985155, | |
| "grad_norm": 0.8414244055747986, | |
| "learning_rate": 0.00012420850323351427, | |
| "loss": 1.133, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.1652949773094377, | |
| "grad_norm": 0.7031393647193909, | |
| "learning_rate": 0.0001239487832116978, | |
| "loss": 1.0804, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.16914083532036, | |
| "grad_norm": 0.8476413488388062, | |
| "learning_rate": 0.00012368906318988132, | |
| "loss": 1.1252, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.1729866933312822, | |
| "grad_norm": 1.7877459526062012, | |
| "learning_rate": 0.00012342934316806483, | |
| "loss": 1.1491, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.1729866933312822, | |
| "eval_loss": 1.0958014726638794, | |
| "eval_runtime": 17.9511, | |
| "eval_samples_per_second": 55.707, | |
| "eval_steps_per_second": 13.927, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.1768325513422044, | |
| "grad_norm": 0.8797541856765747, | |
| "learning_rate": 0.00012316962314624835, | |
| "loss": 1.0446, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.1806784093531266, | |
| "grad_norm": 0.9383549690246582, | |
| "learning_rate": 0.00012290990312443186, | |
| "loss": 1.0703, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.1845242673640488, | |
| "grad_norm": 1.0788028240203857, | |
| "learning_rate": 0.00012265018310261537, | |
| "loss": 1.0976, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.1883701253749712, | |
| "grad_norm": 0.8661052584648132, | |
| "learning_rate": 0.0001223904630807989, | |
| "loss": 1.0872, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.1922159833858934, | |
| "grad_norm": 0.9346690773963928, | |
| "learning_rate": 0.00012213074305898242, | |
| "loss": 1.1296, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.1922159833858934, | |
| "eval_loss": 1.089566946029663, | |
| "eval_runtime": 17.8688, | |
| "eval_samples_per_second": 55.963, | |
| "eval_steps_per_second": 13.991, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.1960618413968156, | |
| "grad_norm": 1.229148030281067, | |
| "learning_rate": 0.00012187102303716594, | |
| "loss": 1.049, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.1999076994077378, | |
| "grad_norm": 0.9896694421768188, | |
| "learning_rate": 0.00012161130301534946, | |
| "loss": 1.0519, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.2037535574186602, | |
| "grad_norm": 0.7709591388702393, | |
| "learning_rate": 0.00012135158299353298, | |
| "loss": 0.8004, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.2075994154295824, | |
| "grad_norm": 0.8033544421195984, | |
| "learning_rate": 0.00012109186297171649, | |
| "loss": 0.8073, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.2114452734405046, | |
| "grad_norm": 0.955243706703186, | |
| "learning_rate": 0.00012083214294990002, | |
| "loss": 0.8575, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.2114452734405046, | |
| "eval_loss": 0.8102548718452454, | |
| "eval_runtime": 18.0003, | |
| "eval_samples_per_second": 55.555, | |
| "eval_steps_per_second": 13.889, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.2152911314514268, | |
| "grad_norm": 1.1329740285873413, | |
| "learning_rate": 0.00012057242292808353, | |
| "loss": 0.8421, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.2191369894623492, | |
| "grad_norm": 0.6005277633666992, | |
| "learning_rate": 0.00012031270290626706, | |
| "loss": 0.8149, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.2229828474732714, | |
| "grad_norm": 0.6579030156135559, | |
| "learning_rate": 0.00012005298288445057, | |
| "loss": 0.7783, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.2268287054841935, | |
| "grad_norm": 1.1820577383041382, | |
| "learning_rate": 0.00011979326286263408, | |
| "loss": 0.8133, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.2306745634951157, | |
| "grad_norm": 0.7112457156181335, | |
| "learning_rate": 0.00011953354284081761, | |
| "loss": 0.8004, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.2306745634951157, | |
| "eval_loss": 0.8041366338729858, | |
| "eval_runtime": 17.8867, | |
| "eval_samples_per_second": 55.907, | |
| "eval_steps_per_second": 13.977, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.234520421506038, | |
| "grad_norm": 0.7006672024726868, | |
| "learning_rate": 0.00011927382281900112, | |
| "loss": 0.7673, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.2383662795169603, | |
| "grad_norm": 1.0331398248672485, | |
| "learning_rate": 0.00011901410279718463, | |
| "loss": 0.7992, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.2422121375278825, | |
| "grad_norm": 0.8857033848762512, | |
| "learning_rate": 0.00011875438277536817, | |
| "loss": 0.8189, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.2460579955388047, | |
| "grad_norm": 0.6674500107765198, | |
| "learning_rate": 0.00011849466275355167, | |
| "loss": 0.7849, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.2499038535497269, | |
| "grad_norm": 0.8113058805465698, | |
| "learning_rate": 0.00011823494273173519, | |
| "loss": 0.7628, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.2499038535497269, | |
| "eval_loss": 0.8041785955429077, | |
| "eval_runtime": 18.0307, | |
| "eval_samples_per_second": 55.461, | |
| "eval_steps_per_second": 13.865, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.253749711560649, | |
| "grad_norm": 1.1236894130706787, | |
| "learning_rate": 0.00011797522270991873, | |
| "loss": 0.8078, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.2575955695715715, | |
| "grad_norm": 0.8019891977310181, | |
| "learning_rate": 0.00011771550268810222, | |
| "loss": 0.7999, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.2614414275824937, | |
| "grad_norm": 0.6394712924957275, | |
| "learning_rate": 0.00011745578266628574, | |
| "loss": 0.8132, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.2652872855934159, | |
| "grad_norm": 0.8335860371589661, | |
| "learning_rate": 0.00011719606264446928, | |
| "loss": 0.7494, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.2691331436043383, | |
| "grad_norm": 0.5847200155258179, | |
| "learning_rate": 0.00011693634262265279, | |
| "loss": 0.7966, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.2691331436043383, | |
| "eval_loss": 0.8012632131576538, | |
| "eval_runtime": 17.8773, | |
| "eval_samples_per_second": 55.937, | |
| "eval_steps_per_second": 13.984, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.2729790016152602, | |
| "grad_norm": 0.7161915302276611, | |
| "learning_rate": 0.00011667662260083629, | |
| "loss": 0.7969, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.2768248596261826, | |
| "grad_norm": 0.7693639397621155, | |
| "learning_rate": 0.00011641690257901983, | |
| "loss": 0.7909, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.2806707176371048, | |
| "grad_norm": 0.6645983457565308, | |
| "learning_rate": 0.00011615718255720334, | |
| "loss": 0.7816, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.284516575648027, | |
| "grad_norm": 0.7736928462982178, | |
| "learning_rate": 0.00011589746253538687, | |
| "loss": 0.745, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.2883624336589494, | |
| "grad_norm": 0.6743229627609253, | |
| "learning_rate": 0.00011563774251357038, | |
| "loss": 0.7856, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.2883624336589494, | |
| "eval_loss": 0.8006194233894348, | |
| "eval_runtime": 17.9046, | |
| "eval_samples_per_second": 55.852, | |
| "eval_steps_per_second": 13.963, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.2922082916698716, | |
| "grad_norm": 0.8146863579750061, | |
| "learning_rate": 0.0001153780224917539, | |
| "loss": 0.7559, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.2960541496807938, | |
| "grad_norm": 0.9186645746231079, | |
| "learning_rate": 0.00011511830246993742, | |
| "loss": 0.7902, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.299900007691716, | |
| "grad_norm": 0.7634202241897583, | |
| "learning_rate": 0.00011485858244812093, | |
| "loss": 0.749, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.3037458657026382, | |
| "grad_norm": 0.890457272529602, | |
| "learning_rate": 0.00011459886242630445, | |
| "loss": 0.7539, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.3075917237135606, | |
| "grad_norm": 0.8466306328773499, | |
| "learning_rate": 0.00011433914240448797, | |
| "loss": 0.7997, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3075917237135606, | |
| "eval_loss": 0.8002509474754333, | |
| "eval_runtime": 17.9496, | |
| "eval_samples_per_second": 55.712, | |
| "eval_steps_per_second": 13.928, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.3114375817244828, | |
| "grad_norm": 1.403998851776123, | |
| "learning_rate": 0.00011407942238267149, | |
| "loss": 0.8369, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.315283439735405, | |
| "grad_norm": 0.8802525401115417, | |
| "learning_rate": 0.000113819702360855, | |
| "loss": 0.7818, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.3191292977463271, | |
| "grad_norm": 0.7463178634643555, | |
| "learning_rate": 0.00011355998233903852, | |
| "loss": 0.7661, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.3229751557572493, | |
| "grad_norm": 0.47771725058555603, | |
| "learning_rate": 0.00011330026231722204, | |
| "loss": 0.8141, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.3268210137681717, | |
| "grad_norm": 0.8294114470481873, | |
| "learning_rate": 0.00011304054229540555, | |
| "loss": 0.7979, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.3268210137681717, | |
| "eval_loss": 0.7951701879501343, | |
| "eval_runtime": 17.958, | |
| "eval_samples_per_second": 55.686, | |
| "eval_steps_per_second": 13.921, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.330666871779094, | |
| "grad_norm": 1.0813144445419312, | |
| "learning_rate": 0.00011278082227358908, | |
| "loss": 0.824, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.3345127297900161, | |
| "grad_norm": 0.5647908449172974, | |
| "learning_rate": 0.00011252110225177259, | |
| "loss": 0.7718, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.3383585878009385, | |
| "grad_norm": 0.7901347279548645, | |
| "learning_rate": 0.0001122613822299561, | |
| "loss": 0.7883, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.3422044458118605, | |
| "grad_norm": 1.0769431591033936, | |
| "learning_rate": 0.00011200166220813963, | |
| "loss": 0.7625, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.346050303822783, | |
| "grad_norm": 0.7366082072257996, | |
| "learning_rate": 0.00011174194218632314, | |
| "loss": 0.8167, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.346050303822783, | |
| "eval_loss": 0.7991219758987427, | |
| "eval_runtime": 17.9932, | |
| "eval_samples_per_second": 55.577, | |
| "eval_steps_per_second": 13.894, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.349896161833705, | |
| "grad_norm": 0.8688609600067139, | |
| "learning_rate": 0.00011148222216450668, | |
| "loss": 0.7882, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.3537420198446273, | |
| "grad_norm": 0.7067499160766602, | |
| "learning_rate": 0.0001112225021426902, | |
| "loss": 0.8193, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.3575878778555497, | |
| "grad_norm": 0.8119627833366394, | |
| "learning_rate": 0.0001109627821208737, | |
| "loss": 0.7902, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.3614337358664719, | |
| "grad_norm": 0.4667348265647888, | |
| "learning_rate": 0.00011070306209905723, | |
| "loss": 0.7595, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.365279593877394, | |
| "grad_norm": 1.1219276189804077, | |
| "learning_rate": 0.00011044334207724075, | |
| "loss": 0.8049, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.365279593877394, | |
| "eval_loss": 0.7947649955749512, | |
| "eval_runtime": 17.9375, | |
| "eval_samples_per_second": 55.749, | |
| "eval_steps_per_second": 13.937, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.3691254518883162, | |
| "grad_norm": 0.8262600302696228, | |
| "learning_rate": 0.00011018362205542425, | |
| "loss": 0.794, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.3729713098992384, | |
| "grad_norm": 0.6738994121551514, | |
| "learning_rate": 0.00010992390203360779, | |
| "loss": 0.7749, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.3768171679101608, | |
| "grad_norm": 0.40902426838874817, | |
| "learning_rate": 0.0001096641820117913, | |
| "loss": 0.8111, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.380663025921083, | |
| "grad_norm": 0.7617566585540771, | |
| "learning_rate": 0.00010940446198997481, | |
| "loss": 0.7663, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.3845088839320052, | |
| "grad_norm": 0.5269647836685181, | |
| "learning_rate": 0.00010914474196815834, | |
| "loss": 0.7913, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.3845088839320052, | |
| "eval_loss": 0.7882509231567383, | |
| "eval_runtime": 17.938, | |
| "eval_samples_per_second": 55.748, | |
| "eval_steps_per_second": 13.937, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.3883547419429274, | |
| "grad_norm": 0.9016252160072327, | |
| "learning_rate": 0.00010888502194634185, | |
| "loss": 0.8222, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.3922005999538496, | |
| "grad_norm": 0.6058124899864197, | |
| "learning_rate": 0.00010862530192452536, | |
| "loss": 0.7657, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.396046457964772, | |
| "grad_norm": 0.8505234122276306, | |
| "learning_rate": 0.00010836558190270889, | |
| "loss": 0.7962, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.3998923159756942, | |
| "grad_norm": 0.7518420815467834, | |
| "learning_rate": 0.0001081058618808924, | |
| "loss": 0.8364, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.4037381739866164, | |
| "grad_norm": 0.7778449058532715, | |
| "learning_rate": 0.00010784614185907592, | |
| "loss": 0.7836, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.4037381739866164, | |
| "eval_loss": 0.7889594435691833, | |
| "eval_runtime": 17.8326, | |
| "eval_samples_per_second": 56.077, | |
| "eval_steps_per_second": 14.019, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.4075840319975388, | |
| "grad_norm": 1.2029508352279663, | |
| "learning_rate": 0.00010758642183725944, | |
| "loss": 0.7859, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.4114298900084608, | |
| "grad_norm": 0.7625166773796082, | |
| "learning_rate": 0.00010732670181544295, | |
| "loss": 0.8018, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.4152757480193832, | |
| "grad_norm": 0.6327937245368958, | |
| "learning_rate": 0.00010706698179362648, | |
| "loss": 0.808, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.4191216060303053, | |
| "grad_norm": 0.7097195386886597, | |
| "learning_rate": 0.00010680726177181, | |
| "loss": 0.7928, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.4229674640412275, | |
| "grad_norm": 0.5188928246498108, | |
| "learning_rate": 0.00010654754174999351, | |
| "loss": 0.7934, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.4229674640412275, | |
| "eval_loss": 0.7918493151664734, | |
| "eval_runtime": 17.9735, | |
| "eval_samples_per_second": 55.638, | |
| "eval_steps_per_second": 13.909, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.42681332205215, | |
| "grad_norm": 0.6486705541610718, | |
| "learning_rate": 0.00010628782172817703, | |
| "loss": 0.7317, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 1.4306591800630721, | |
| "grad_norm": 0.6570118069648743, | |
| "learning_rate": 0.00010602810170636055, | |
| "loss": 0.8149, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.4345050380739943, | |
| "grad_norm": 0.8024285435676575, | |
| "learning_rate": 0.00010576838168454406, | |
| "loss": 0.8392, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 1.4383508960849165, | |
| "grad_norm": 0.5735141038894653, | |
| "learning_rate": 0.0001055086616627276, | |
| "loss": 0.7932, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.4421967540958387, | |
| "grad_norm": 0.5087122917175293, | |
| "learning_rate": 0.0001052489416409111, | |
| "loss": 0.7508, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.4421967540958387, | |
| "eval_loss": 0.7895762920379639, | |
| "eval_runtime": 17.9534, | |
| "eval_samples_per_second": 55.7, | |
| "eval_steps_per_second": 13.925, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.446042612106761, | |
| "grad_norm": 0.7478468418121338, | |
| "learning_rate": 0.00010498922161909461, | |
| "loss": 0.8533, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.4498884701176833, | |
| "grad_norm": 1.2165393829345703, | |
| "learning_rate": 0.00010472950159727815, | |
| "loss": 0.8025, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 1.4537343281286055, | |
| "grad_norm": 0.8682180643081665, | |
| "learning_rate": 0.00010446978157546165, | |
| "loss": 0.8086, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.4575801861395277, | |
| "grad_norm": 0.9063705205917358, | |
| "learning_rate": 0.00010421006155364516, | |
| "loss": 0.7583, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 1.4614260441504499, | |
| "grad_norm": 0.7133361101150513, | |
| "learning_rate": 0.0001039503415318287, | |
| "loss": 0.7381, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.4614260441504499, | |
| "eval_loss": 0.7869898080825806, | |
| "eval_runtime": 17.9874, | |
| "eval_samples_per_second": 55.595, | |
| "eval_steps_per_second": 13.899, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.4652719021613723, | |
| "grad_norm": 0.6205143928527832, | |
| "learning_rate": 0.00010369062151001222, | |
| "loss": 0.8141, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 1.4691177601722945, | |
| "grad_norm": 1.1060974597930908, | |
| "learning_rate": 0.00010343090148819572, | |
| "loss": 0.7771, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.4729636181832166, | |
| "grad_norm": 0.7808921933174133, | |
| "learning_rate": 0.00010317118146637926, | |
| "loss": 0.8006, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 1.476809476194139, | |
| "grad_norm": 0.5509454011917114, | |
| "learning_rate": 0.00010291146144456277, | |
| "loss": 0.7922, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.480655334205061, | |
| "grad_norm": 1.2427464723587036, | |
| "learning_rate": 0.0001026517414227463, | |
| "loss": 0.7801, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.480655334205061, | |
| "eval_loss": 0.7850660085678101, | |
| "eval_runtime": 17.8458, | |
| "eval_samples_per_second": 56.036, | |
| "eval_steps_per_second": 14.009, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.4845011922159834, | |
| "grad_norm": 0.8217543959617615, | |
| "learning_rate": 0.00010239202140092981, | |
| "loss": 0.782, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.4883470502269056, | |
| "grad_norm": 0.6753976941108704, | |
| "learning_rate": 0.00010213230137911332, | |
| "loss": 0.7787, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 1.4921929082378278, | |
| "grad_norm": 1.0264923572540283, | |
| "learning_rate": 0.00010187258135729685, | |
| "loss": 0.7713, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.4960387662487502, | |
| "grad_norm": 0.7317076325416565, | |
| "learning_rate": 0.00010161286133548036, | |
| "loss": 0.7634, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 1.4998846242596724, | |
| "grad_norm": 0.6011013388633728, | |
| "learning_rate": 0.00010135314131366387, | |
| "loss": 0.7715, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.4998846242596724, | |
| "eval_loss": 0.7843549847602844, | |
| "eval_runtime": 17.919, | |
| "eval_samples_per_second": 55.807, | |
| "eval_steps_per_second": 13.952, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.5037304822705946, | |
| "grad_norm": 0.5136408805847168, | |
| "learning_rate": 0.0001010934212918474, | |
| "loss": 0.7533, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 1.5075763402815168, | |
| "grad_norm": 0.8944385647773743, | |
| "learning_rate": 0.00010083370127003091, | |
| "loss": 0.8483, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.511422198292439, | |
| "grad_norm": 0.9759209752082825, | |
| "learning_rate": 0.00010057398124821442, | |
| "loss": 0.8338, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 1.5152680563033614, | |
| "grad_norm": 0.7823068499565125, | |
| "learning_rate": 0.00010031426122639795, | |
| "loss": 0.7739, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.5191139143142836, | |
| "grad_norm": 0.9423583745956421, | |
| "learning_rate": 0.00010005454120458146, | |
| "loss": 0.8092, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.5191139143142836, | |
| "eval_loss": 0.782052218914032, | |
| "eval_runtime": 17.8818, | |
| "eval_samples_per_second": 55.923, | |
| "eval_steps_per_second": 13.981, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 1.5229597723252057, | |
| "grad_norm": 1.067384123802185, | |
| "learning_rate": 9.979482118276499e-05, | |
| "loss": 0.7581, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.5268056303361282, | |
| "grad_norm": 1.0136195421218872, | |
| "learning_rate": 9.95351011609485e-05, | |
| "loss": 0.7923, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 1.5306514883470501, | |
| "grad_norm": 0.8290442824363708, | |
| "learning_rate": 9.927538113913202e-05, | |
| "loss": 0.7991, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.5344973463579725, | |
| "grad_norm": 0.8200196623802185, | |
| "learning_rate": 9.901566111731554e-05, | |
| "loss": 0.7382, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 1.5383432043688947, | |
| "grad_norm": 0.7905208468437195, | |
| "learning_rate": 9.875594109549905e-05, | |
| "loss": 0.7615, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.5383432043688947, | |
| "eval_loss": 0.7793118953704834, | |
| "eval_runtime": 17.8391, | |
| "eval_samples_per_second": 56.057, | |
| "eval_steps_per_second": 14.014, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.542189062379817, | |
| "grad_norm": 0.8522188067436218, | |
| "learning_rate": 9.849622107368257e-05, | |
| "loss": 0.8041, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 1.5460349203907393, | |
| "grad_norm": 1.0029702186584473, | |
| "learning_rate": 9.82365010518661e-05, | |
| "loss": 0.7873, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.5498807784016613, | |
| "grad_norm": 1.007730484008789, | |
| "learning_rate": 9.797678103004962e-05, | |
| "loss": 0.7685, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 1.5537266364125837, | |
| "grad_norm": 0.6971302032470703, | |
| "learning_rate": 9.771706100823312e-05, | |
| "loss": 0.805, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.5575724944235059, | |
| "grad_norm": 0.9766409993171692, | |
| "learning_rate": 9.745734098641665e-05, | |
| "loss": 0.765, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.5575724944235059, | |
| "eval_loss": 0.7780515551567078, | |
| "eval_runtime": 17.9362, | |
| "eval_samples_per_second": 55.753, | |
| "eval_steps_per_second": 13.938, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 1.561418352434428, | |
| "grad_norm": 0.8611739873886108, | |
| "learning_rate": 9.719762096460017e-05, | |
| "loss": 0.7768, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.5652642104453505, | |
| "grad_norm": 0.9571949243545532, | |
| "learning_rate": 9.693790094278369e-05, | |
| "loss": 0.8106, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 1.5691100684562724, | |
| "grad_norm": 0.9941520094871521, | |
| "learning_rate": 9.66781809209672e-05, | |
| "loss": 0.7952, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.5729559264671948, | |
| "grad_norm": 0.7494879364967346, | |
| "learning_rate": 9.641846089915072e-05, | |
| "loss": 0.7882, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 1.576801784478117, | |
| "grad_norm": 0.25608131289482117, | |
| "learning_rate": 9.615874087733424e-05, | |
| "loss": 0.7424, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.576801784478117, | |
| "eval_loss": 0.7784542441368103, | |
| "eval_runtime": 17.9827, | |
| "eval_samples_per_second": 55.609, | |
| "eval_steps_per_second": 13.902, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.5806476424890392, | |
| "grad_norm": 0.781563401222229, | |
| "learning_rate": 9.589902085551775e-05, | |
| "loss": 0.7251, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 1.5844935004999616, | |
| "grad_norm": 0.8129003047943115, | |
| "learning_rate": 9.563930083370128e-05, | |
| "loss": 0.7783, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.5883393585108838, | |
| "grad_norm": 0.7955138087272644, | |
| "learning_rate": 9.537958081188479e-05, | |
| "loss": 0.7692, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 1.592185216521806, | |
| "grad_norm": 0.7752518653869629, | |
| "learning_rate": 9.511986079006832e-05, | |
| "loss": 0.7578, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.5960310745327284, | |
| "grad_norm": 0.7433210611343384, | |
| "learning_rate": 9.486014076825183e-05, | |
| "loss": 0.7885, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.5960310745327284, | |
| "eval_loss": 0.7757794260978699, | |
| "eval_runtime": 17.9593, | |
| "eval_samples_per_second": 55.682, | |
| "eval_steps_per_second": 13.92, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 1.5998769325436504, | |
| "grad_norm": 0.7218450903892517, | |
| "learning_rate": 9.460042074643534e-05, | |
| "loss": 0.8179, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.6037227905545728, | |
| "grad_norm": 0.8611409664154053, | |
| "learning_rate": 9.434070072461887e-05, | |
| "loss": 0.8166, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 1.607568648565495, | |
| "grad_norm": 0.8060470223426819, | |
| "learning_rate": 9.408098070280238e-05, | |
| "loss": 0.7831, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.6114145065764172, | |
| "grad_norm": 0.9832955002784729, | |
| "learning_rate": 9.382126068098591e-05, | |
| "loss": 0.7679, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 1.6152603645873396, | |
| "grad_norm": 0.7749195098876953, | |
| "learning_rate": 9.356154065916942e-05, | |
| "loss": 0.8001, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6152603645873396, | |
| "eval_loss": 0.7739425301551819, | |
| "eval_runtime": 17.9752, | |
| "eval_samples_per_second": 55.632, | |
| "eval_steps_per_second": 13.908, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.6191062225982615, | |
| "grad_norm": 0.8098833560943604, | |
| "learning_rate": 9.330182063735295e-05, | |
| "loss": 0.7673, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 1.622952080609184, | |
| "grad_norm": 0.7867510318756104, | |
| "learning_rate": 9.304210061553646e-05, | |
| "loss": 0.7619, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.6267979386201061, | |
| "grad_norm": 1.2399013042449951, | |
| "learning_rate": 9.278238059371997e-05, | |
| "loss": 0.8664, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 1.6306437966310283, | |
| "grad_norm": 0.6171821355819702, | |
| "learning_rate": 9.25226605719035e-05, | |
| "loss": 0.8543, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.6344896546419507, | |
| "grad_norm": 0.7197456955909729, | |
| "learning_rate": 9.226294055008701e-05, | |
| "loss": 0.7989, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.6344896546419507, | |
| "eval_loss": 0.7736220955848694, | |
| "eval_runtime": 18.0274, | |
| "eval_samples_per_second": 55.471, | |
| "eval_steps_per_second": 13.868, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 1.6383355126528727, | |
| "grad_norm": 0.6752798557281494, | |
| "learning_rate": 9.200322052827052e-05, | |
| "loss": 0.7925, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.642181370663795, | |
| "grad_norm": 0.7389090061187744, | |
| "learning_rate": 9.174350050645405e-05, | |
| "loss": 0.762, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 1.6460272286747173, | |
| "grad_norm": 0.7688984870910645, | |
| "learning_rate": 9.148378048463756e-05, | |
| "loss": 0.771, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.6498730866856395, | |
| "grad_norm": 0.7231914401054382, | |
| "learning_rate": 9.122406046282108e-05, | |
| "loss": 0.8244, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 1.6537189446965619, | |
| "grad_norm": 0.786527156829834, | |
| "learning_rate": 9.09643404410046e-05, | |
| "loss": 0.7738, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.6537189446965619, | |
| "eval_loss": 0.77164626121521, | |
| "eval_runtime": 17.9322, | |
| "eval_samples_per_second": 55.765, | |
| "eval_steps_per_second": 13.941, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.657564802707484, | |
| "grad_norm": 0.8676180243492126, | |
| "learning_rate": 9.070462041918813e-05, | |
| "loss": 0.8113, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 1.6614106607184063, | |
| "grad_norm": 0.6232962012290955, | |
| "learning_rate": 9.044490039737163e-05, | |
| "loss": 0.7847, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.6652565187293287, | |
| "grad_norm": 1.151957631111145, | |
| "learning_rate": 9.018518037555515e-05, | |
| "loss": 0.7924, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 1.6691023767402506, | |
| "grad_norm": 0.8149247169494629, | |
| "learning_rate": 8.992546035373868e-05, | |
| "loss": 0.7395, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.672948234751173, | |
| "grad_norm": 1.0983916521072388, | |
| "learning_rate": 8.96657403319222e-05, | |
| "loss": 0.7715, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.672948234751173, | |
| "eval_loss": 0.7681905031204224, | |
| "eval_runtime": 17.9279, | |
| "eval_samples_per_second": 55.779, | |
| "eval_steps_per_second": 13.945, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 1.6767940927620952, | |
| "grad_norm": 0.7059574723243713, | |
| "learning_rate": 8.94060203101057e-05, | |
| "loss": 0.784, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.6806399507730174, | |
| "grad_norm": 1.0587022304534912, | |
| "learning_rate": 8.914630028828923e-05, | |
| "loss": 0.7608, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 1.6844858087839398, | |
| "grad_norm": 0.7638582587242126, | |
| "learning_rate": 8.888658026647275e-05, | |
| "loss": 0.8159, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.6883316667948618, | |
| "grad_norm": 0.5783549547195435, | |
| "learning_rate": 8.862686024465626e-05, | |
| "loss": 0.7959, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 1.6921775248057842, | |
| "grad_norm": 1.2192896604537964, | |
| "learning_rate": 8.836714022283978e-05, | |
| "loss": 0.8272, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.6921775248057842, | |
| "eval_loss": 0.7701475024223328, | |
| "eval_runtime": 17.8145, | |
| "eval_samples_per_second": 56.134, | |
| "eval_steps_per_second": 14.034, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.6960233828167064, | |
| "grad_norm": 0.5081881880760193, | |
| "learning_rate": 8.810742020102331e-05, | |
| "loss": 0.7883, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 1.6998692408276286, | |
| "grad_norm": 0.5658268332481384, | |
| "learning_rate": 8.784770017920681e-05, | |
| "loss": 0.77, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.703715098838551, | |
| "grad_norm": 0.6888287663459778, | |
| "learning_rate": 8.758798015739034e-05, | |
| "loss": 0.8013, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 1.707560956849473, | |
| "grad_norm": 1.0599181652069092, | |
| "learning_rate": 8.732826013557386e-05, | |
| "loss": 0.7649, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.7114068148603954, | |
| "grad_norm": 0.7679368257522583, | |
| "learning_rate": 8.706854011375736e-05, | |
| "loss": 0.7338, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.7114068148603954, | |
| "eval_loss": 0.7665286660194397, | |
| "eval_runtime": 17.8822, | |
| "eval_samples_per_second": 55.921, | |
| "eval_steps_per_second": 13.98, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 1.7152526728713176, | |
| "grad_norm": 0.6636808514595032, | |
| "learning_rate": 8.680882009194089e-05, | |
| "loss": 0.7949, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.7190985308822397, | |
| "grad_norm": 0.7327821254730225, | |
| "learning_rate": 8.654910007012442e-05, | |
| "loss": 0.7484, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 1.7229443888931621, | |
| "grad_norm": 0.8187472224235535, | |
| "learning_rate": 8.628938004830793e-05, | |
| "loss": 0.7482, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.7267902469040843, | |
| "grad_norm": 0.4527030289173126, | |
| "learning_rate": 8.602966002649144e-05, | |
| "loss": 0.7902, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 1.7306361049150065, | |
| "grad_norm": 0.6475220918655396, | |
| "learning_rate": 8.576994000467497e-05, | |
| "loss": 0.7998, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.7306361049150065, | |
| "eval_loss": 0.7646552920341492, | |
| "eval_runtime": 17.8108, | |
| "eval_samples_per_second": 56.146, | |
| "eval_steps_per_second": 14.036, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.734481962925929, | |
| "grad_norm": 0.538769543170929, | |
| "learning_rate": 8.551021998285848e-05, | |
| "loss": 0.7658, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 1.738327820936851, | |
| "grad_norm": 0.629510223865509, | |
| "learning_rate": 8.5250499961042e-05, | |
| "loss": 0.7885, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.7421736789477733, | |
| "grad_norm": 0.6914022564888, | |
| "learning_rate": 8.499077993922552e-05, | |
| "loss": 0.7648, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 1.7460195369586955, | |
| "grad_norm": 0.5563036799430847, | |
| "learning_rate": 8.473105991740903e-05, | |
| "loss": 0.7558, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.7498653949696177, | |
| "grad_norm": 0.7851826548576355, | |
| "learning_rate": 8.447133989559256e-05, | |
| "loss": 0.7961, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.7498653949696177, | |
| "eval_loss": 0.7664644718170166, | |
| "eval_runtime": 17.9733, | |
| "eval_samples_per_second": 55.638, | |
| "eval_steps_per_second": 13.91, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 1.75371125298054, | |
| "grad_norm": 0.48695698380470276, | |
| "learning_rate": 8.421161987377607e-05, | |
| "loss": 0.8264, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.757557110991462, | |
| "grad_norm": 0.8053486347198486, | |
| "learning_rate": 8.39518998519596e-05, | |
| "loss": 0.8084, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 1.7614029690023845, | |
| "grad_norm": 1.1373741626739502, | |
| "learning_rate": 8.369217983014311e-05, | |
| "loss": 0.7842, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.7652488270133067, | |
| "grad_norm": 1.1318634748458862, | |
| "learning_rate": 8.343245980832662e-05, | |
| "loss": 0.7727, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 1.7690946850242288, | |
| "grad_norm": 0.8140521049499512, | |
| "learning_rate": 8.317273978651015e-05, | |
| "loss": 0.7945, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.7690946850242288, | |
| "eval_loss": 0.7635026574134827, | |
| "eval_runtime": 17.8969, | |
| "eval_samples_per_second": 55.875, | |
| "eval_steps_per_second": 13.969, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.7729405430351513, | |
| "grad_norm": 0.7365099787712097, | |
| "learning_rate": 8.291301976469366e-05, | |
| "loss": 0.7915, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 1.7767864010460732, | |
| "grad_norm": 0.7158268690109253, | |
| "learning_rate": 8.265329974287718e-05, | |
| "loss": 0.8089, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.7806322590569956, | |
| "grad_norm": 0.7917172312736511, | |
| "learning_rate": 8.23935797210607e-05, | |
| "loss": 0.7881, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 1.7844781170679178, | |
| "grad_norm": 0.9002280831336975, | |
| "learning_rate": 8.213385969924422e-05, | |
| "loss": 0.7861, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.78832397507884, | |
| "grad_norm": 1.118498682975769, | |
| "learning_rate": 8.187413967742774e-05, | |
| "loss": 0.7787, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.78832397507884, | |
| "eval_loss": 0.7597912549972534, | |
| "eval_runtime": 18.0126, | |
| "eval_samples_per_second": 55.517, | |
| "eval_steps_per_second": 13.879, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 1.7921698330897624, | |
| "grad_norm": 0.555014967918396, | |
| "learning_rate": 8.161441965561125e-05, | |
| "loss": 0.7698, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.7960156911006846, | |
| "grad_norm": 0.7749983072280884, | |
| "learning_rate": 8.135469963379477e-05, | |
| "loss": 0.7976, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 1.7998615491116068, | |
| "grad_norm": 0.8833787441253662, | |
| "learning_rate": 8.10949796119783e-05, | |
| "loss": 0.7901, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.8037074071225292, | |
| "grad_norm": 1.099992036819458, | |
| "learning_rate": 8.08352595901618e-05, | |
| "loss": 0.7843, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 1.8075532651334512, | |
| "grad_norm": 0.7118529677391052, | |
| "learning_rate": 8.057553956834533e-05, | |
| "loss": 0.7818, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.8075532651334512, | |
| "eval_loss": 0.7594859600067139, | |
| "eval_runtime": 17.8501, | |
| "eval_samples_per_second": 56.022, | |
| "eval_steps_per_second": 14.005, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.8113991231443736, | |
| "grad_norm": 0.8289865851402283, | |
| "learning_rate": 8.031581954652885e-05, | |
| "loss": 0.7899, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 1.8152449811552958, | |
| "grad_norm": 1.1237398386001587, | |
| "learning_rate": 8.005609952471237e-05, | |
| "loss": 0.8005, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.819090839166218, | |
| "grad_norm": 0.8594374060630798, | |
| "learning_rate": 7.979637950289588e-05, | |
| "loss": 0.7765, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 1.8229366971771404, | |
| "grad_norm": 0.754634439945221, | |
| "learning_rate": 7.95366594810794e-05, | |
| "loss": 0.7794, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.8267825551880623, | |
| "grad_norm": 1.0647647380828857, | |
| "learning_rate": 7.927693945926292e-05, | |
| "loss": 0.7681, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.8267825551880623, | |
| "eval_loss": 0.7575324773788452, | |
| "eval_runtime": 17.8657, | |
| "eval_samples_per_second": 55.973, | |
| "eval_steps_per_second": 13.993, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 1.8306284131989847, | |
| "grad_norm": 1.1255161762237549, | |
| "learning_rate": 7.901721943744644e-05, | |
| "loss": 0.7551, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.834474271209907, | |
| "grad_norm": 0.8209452629089355, | |
| "learning_rate": 7.875749941562995e-05, | |
| "loss": 0.8091, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 1.838320129220829, | |
| "grad_norm": 0.40779542922973633, | |
| "learning_rate": 7.849777939381348e-05, | |
| "loss": 0.7572, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.8421659872317515, | |
| "grad_norm": 0.9186558127403259, | |
| "learning_rate": 7.823805937199699e-05, | |
| "loss": 0.7681, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 1.8460118452426735, | |
| "grad_norm": 0.4896409213542938, | |
| "learning_rate": 7.79783393501805e-05, | |
| "loss": 0.7989, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8460118452426735, | |
| "eval_loss": 0.7566245198249817, | |
| "eval_runtime": 17.9055, | |
| "eval_samples_per_second": 55.849, | |
| "eval_steps_per_second": 13.962, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.8498577032535959, | |
| "grad_norm": 1.1524229049682617, | |
| "learning_rate": 7.771861932836403e-05, | |
| "loss": 0.7732, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 1.853703561264518, | |
| "grad_norm": 0.4653956890106201, | |
| "learning_rate": 7.745889930654755e-05, | |
| "loss": 0.7961, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.8575494192754403, | |
| "grad_norm": 0.8423280119895935, | |
| "learning_rate": 7.719917928473105e-05, | |
| "loss": 0.7963, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 1.8613952772863627, | |
| "grad_norm": 0.6979435086250305, | |
| "learning_rate": 7.693945926291458e-05, | |
| "loss": 0.7816, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.8652411352972849, | |
| "grad_norm": 0.7800914645195007, | |
| "learning_rate": 7.66797392410981e-05, | |
| "loss": 0.7563, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.8652411352972849, | |
| "eval_loss": 0.7574715614318848, | |
| "eval_runtime": 17.9043, | |
| "eval_samples_per_second": 55.852, | |
| "eval_steps_per_second": 13.963, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 1.869086993308207, | |
| "grad_norm": 0.9678017497062683, | |
| "learning_rate": 7.642001921928162e-05, | |
| "loss": 0.7689, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.8729328513191295, | |
| "grad_norm": 0.454647421836853, | |
| "learning_rate": 7.616029919746513e-05, | |
| "loss": 0.7231, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 1.8767787093300514, | |
| "grad_norm": 0.7899460792541504, | |
| "learning_rate": 7.590057917564866e-05, | |
| "loss": 0.7471, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.8806245673409738, | |
| "grad_norm": 1.1373926401138306, | |
| "learning_rate": 7.564085915383217e-05, | |
| "loss": 0.8342, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 1.884470425351896, | |
| "grad_norm": 1.4272133111953735, | |
| "learning_rate": 7.538113913201568e-05, | |
| "loss": 0.7585, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.884470425351896, | |
| "eval_loss": 0.7536496520042419, | |
| "eval_runtime": 17.9401, | |
| "eval_samples_per_second": 55.741, | |
| "eval_steps_per_second": 13.935, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.8883162833628182, | |
| "grad_norm": 0.8080185055732727, | |
| "learning_rate": 7.512141911019921e-05, | |
| "loss": 0.8067, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 1.8921621413737406, | |
| "grad_norm": 0.5850221514701843, | |
| "learning_rate": 7.486169908838274e-05, | |
| "loss": 0.7455, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.8960079993846626, | |
| "grad_norm": 0.7521384954452515, | |
| "learning_rate": 7.460197906656624e-05, | |
| "loss": 0.7604, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 1.899853857395585, | |
| "grad_norm": 0.6376401782035828, | |
| "learning_rate": 7.434225904474976e-05, | |
| "loss": 0.7485, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.9036997154065072, | |
| "grad_norm": 0.6305235624313354, | |
| "learning_rate": 7.408253902293329e-05, | |
| "loss": 0.7567, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.9036997154065072, | |
| "eval_loss": 0.7527515888214111, | |
| "eval_runtime": 17.967, | |
| "eval_samples_per_second": 55.658, | |
| "eval_steps_per_second": 13.914, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 1.9075455734174294, | |
| "grad_norm": 0.5975210666656494, | |
| "learning_rate": 7.382281900111679e-05, | |
| "loss": 0.7855, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.9113914314283518, | |
| "grad_norm": 0.41196370124816895, | |
| "learning_rate": 7.356309897930031e-05, | |
| "loss": 0.7711, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 1.9152372894392737, | |
| "grad_norm": 1.1755207777023315, | |
| "learning_rate": 7.330337895748384e-05, | |
| "loss": 0.8114, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.9190831474501961, | |
| "grad_norm": 0.37193945050239563, | |
| "learning_rate": 7.304365893566735e-05, | |
| "loss": 0.7437, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 1.9229290054611183, | |
| "grad_norm": 0.6753848195075989, | |
| "learning_rate": 7.278393891385087e-05, | |
| "loss": 0.8289, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.9229290054611183, | |
| "eval_loss": 0.7534742951393127, | |
| "eval_runtime": 17.8978, | |
| "eval_samples_per_second": 55.873, | |
| "eval_steps_per_second": 13.968, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.9267748634720405, | |
| "grad_norm": 0.6314563751220703, | |
| "learning_rate": 7.25242188920344e-05, | |
| "loss": 0.8278, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 1.930620721482963, | |
| "grad_norm": 0.38249602913856506, | |
| "learning_rate": 7.22644988702179e-05, | |
| "loss": 0.7948, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.9344665794938851, | |
| "grad_norm": 0.8241211771965027, | |
| "learning_rate": 7.200477884840142e-05, | |
| "loss": 0.8152, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 1.9383124375048073, | |
| "grad_norm": 0.4248273968696594, | |
| "learning_rate": 7.175025322702127e-05, | |
| "loss": 0.8083, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.9421582955157297, | |
| "grad_norm": 1.0574986934661865, | |
| "learning_rate": 7.14905332052048e-05, | |
| "loss": 0.7644, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 1.9421582955157297, | |
| "eval_loss": 0.7520478963851929, | |
| "eval_runtime": 17.9125, | |
| "eval_samples_per_second": 55.827, | |
| "eval_steps_per_second": 13.957, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 1.9460041535266517, | |
| "grad_norm": 0.957831084728241, | |
| "learning_rate": 7.123081318338831e-05, | |
| "loss": 0.8145, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.949850011537574, | |
| "grad_norm": 0.7300383448600769, | |
| "learning_rate": 7.097109316157184e-05, | |
| "loss": 0.8021, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 1.9536958695484963, | |
| "grad_norm": 0.6103696227073669, | |
| "learning_rate": 7.071137313975535e-05, | |
| "loss": 0.7561, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.9575417275594185, | |
| "grad_norm": 0.9353188276290894, | |
| "learning_rate": 7.045165311793886e-05, | |
| "loss": 0.76, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 1.9613875855703409, | |
| "grad_norm": 0.7097103595733643, | |
| "learning_rate": 7.019193309612239e-05, | |
| "loss": 0.8125, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.9613875855703409, | |
| "eval_loss": 0.7521655559539795, | |
| "eval_runtime": 17.8845, | |
| "eval_samples_per_second": 55.914, | |
| "eval_steps_per_second": 13.979, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.9652334435812628, | |
| "grad_norm": 1.2154541015625, | |
| "learning_rate": 6.99322130743059e-05, | |
| "loss": 0.7644, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 1.9690793015921852, | |
| "grad_norm": 0.715004026889801, | |
| "learning_rate": 6.967249305248941e-05, | |
| "loss": 0.7235, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.9729251596031074, | |
| "grad_norm": 0.44530218839645386, | |
| "learning_rate": 6.941277303067294e-05, | |
| "loss": 0.7591, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 1.9767710176140296, | |
| "grad_norm": 0.7103247046470642, | |
| "learning_rate": 6.915305300885647e-05, | |
| "loss": 0.7203, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.980616875624952, | |
| "grad_norm": 0.6260993480682373, | |
| "learning_rate": 6.88985273874763e-05, | |
| "loss": 0.8008, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 1.980616875624952, | |
| "eval_loss": 0.7489978075027466, | |
| "eval_runtime": 17.8489, | |
| "eval_samples_per_second": 56.026, | |
| "eval_steps_per_second": 14.006, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 1.984462733635874, | |
| "grad_norm": 0.8690526485443115, | |
| "learning_rate": 6.863880736565982e-05, | |
| "loss": 0.7445, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.9883085916467964, | |
| "grad_norm": 0.8287826776504517, | |
| "learning_rate": 6.837908734384334e-05, | |
| "loss": 0.7606, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 1.9921544496577186, | |
| "grad_norm": 0.9105169773101807, | |
| "learning_rate": 6.811936732202686e-05, | |
| "loss": 0.7742, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.9960003076686408, | |
| "grad_norm": 0.6223366856575012, | |
| "learning_rate": 6.785964730021037e-05, | |
| "loss": 0.8069, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 1.9998461656795632, | |
| "grad_norm": 0.848816454410553, | |
| "learning_rate": 6.75999272783939e-05, | |
| "loss": 0.8126, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.9998461656795632, | |
| "eval_loss": 0.7482460737228394, | |
| "eval_runtime": 17.8823, | |
| "eval_samples_per_second": 55.921, | |
| "eval_steps_per_second": 13.98, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.003692023690485, | |
| "grad_norm": 0.706822395324707, | |
| "learning_rate": 6.734020725657741e-05, | |
| "loss": 0.7, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 2.0075378817014076, | |
| "grad_norm": 1.503631830215454, | |
| "learning_rate": 6.708048723476093e-05, | |
| "loss": 0.7347, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.01138373971233, | |
| "grad_norm": 0.8511216044425964, | |
| "learning_rate": 6.682076721294445e-05, | |
| "loss": 0.6917, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 2.015229597723252, | |
| "grad_norm": 0.7063366174697876, | |
| "learning_rate": 6.656104719112797e-05, | |
| "loss": 0.7188, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.0190754557341744, | |
| "grad_norm": 0.7650218605995178, | |
| "learning_rate": 6.630132716931149e-05, | |
| "loss": 0.701, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.0190754557341744, | |
| "eval_loss": 0.7444872856140137, | |
| "eval_runtime": 17.8222, | |
| "eval_samples_per_second": 56.11, | |
| "eval_steps_per_second": 14.027, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.0229213137450963, | |
| "grad_norm": 0.7015202045440674, | |
| "learning_rate": 6.6041607147495e-05, | |
| "loss": 0.7442, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.0267671717560187, | |
| "grad_norm": 0.826304018497467, | |
| "learning_rate": 6.578188712567853e-05, | |
| "loss": 0.7003, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 2.030613029766941, | |
| "grad_norm": 0.5597676038742065, | |
| "learning_rate": 6.552216710386204e-05, | |
| "loss": 0.7108, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.034458887777863, | |
| "grad_norm": 0.738636314868927, | |
| "learning_rate": 6.526244708204557e-05, | |
| "loss": 0.7407, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 2.0383047457887855, | |
| "grad_norm": 0.6629013419151306, | |
| "learning_rate": 6.500272706022908e-05, | |
| "loss": 0.7227, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.0383047457887855, | |
| "eval_loss": 0.7432363033294678, | |
| "eval_runtime": 17.8543, | |
| "eval_samples_per_second": 56.009, | |
| "eval_steps_per_second": 14.002, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.042150603799708, | |
| "grad_norm": 0.5691978931427002, | |
| "learning_rate": 6.474300703841259e-05, | |
| "loss": 0.6752, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 2.04599646181063, | |
| "grad_norm": 0.5654874444007874, | |
| "learning_rate": 6.448328701659612e-05, | |
| "loss": 0.7282, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.0498423198215523, | |
| "grad_norm": 1.0112574100494385, | |
| "learning_rate": 6.422356699477963e-05, | |
| "loss": 0.7343, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 2.0536881778324743, | |
| "grad_norm": 0.2508319318294525, | |
| "learning_rate": 6.396384697296314e-05, | |
| "loss": 0.6796, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.0575340358433967, | |
| "grad_norm": 1.014090895652771, | |
| "learning_rate": 6.370412695114667e-05, | |
| "loss": 0.7276, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.0575340358433967, | |
| "eval_loss": 0.7409418821334839, | |
| "eval_runtime": 18.0076, | |
| "eval_samples_per_second": 55.532, | |
| "eval_steps_per_second": 13.883, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.061379893854319, | |
| "grad_norm": 0.6783422827720642, | |
| "learning_rate": 6.344440692933018e-05, | |
| "loss": 0.7233, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.065225751865241, | |
| "grad_norm": 0.9281295537948608, | |
| "learning_rate": 6.31846869075137e-05, | |
| "loss": 0.7463, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 2.0690716098761635, | |
| "grad_norm": 0.9024075269699097, | |
| "learning_rate": 6.293016128613355e-05, | |
| "loss": 0.7499, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.0729174678870854, | |
| "grad_norm": 0.9747761487960815, | |
| "learning_rate": 6.267044126431707e-05, | |
| "loss": 0.7736, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 2.076763325898008, | |
| "grad_norm": 0.9101582169532776, | |
| "learning_rate": 6.241072124250059e-05, | |
| "loss": 0.7343, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.076763325898008, | |
| "eval_loss": 0.7426913380622864, | |
| "eval_runtime": 18.0363, | |
| "eval_samples_per_second": 55.444, | |
| "eval_steps_per_second": 13.861, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.0806091839089302, | |
| "grad_norm": 0.4836456775665283, | |
| "learning_rate": 6.21510012206841e-05, | |
| "loss": 0.7725, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 2.084455041919852, | |
| "grad_norm": 0.9285927414894104, | |
| "learning_rate": 6.189128119886762e-05, | |
| "loss": 0.7167, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.0883008999307746, | |
| "grad_norm": 0.8826911449432373, | |
| "learning_rate": 6.163156117705115e-05, | |
| "loss": 0.7204, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 2.0921467579416966, | |
| "grad_norm": 1.0057801008224487, | |
| "learning_rate": 6.137184115523465e-05, | |
| "loss": 0.713, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.095992615952619, | |
| "grad_norm": 1.2974227666854858, | |
| "learning_rate": 6.111212113341818e-05, | |
| "loss": 0.6974, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.095992615952619, | |
| "eval_loss": 0.7402239441871643, | |
| "eval_runtime": 17.8723, | |
| "eval_samples_per_second": 55.953, | |
| "eval_steps_per_second": 13.988, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.0998384739635414, | |
| "grad_norm": 0.7047484517097473, | |
| "learning_rate": 6.0852401111601696e-05, | |
| "loss": 0.7377, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.1036843319744634, | |
| "grad_norm": 1.0780186653137207, | |
| "learning_rate": 6.0592681089785216e-05, | |
| "loss": 0.7153, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 2.1075301899853858, | |
| "grad_norm": 0.6844059228897095, | |
| "learning_rate": 6.033296106796873e-05, | |
| "loss": 0.7077, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.111376047996308, | |
| "grad_norm": 0.7109357118606567, | |
| "learning_rate": 6.007324104615225e-05, | |
| "loss": 0.6574, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 2.11522190600723, | |
| "grad_norm": 1.0174553394317627, | |
| "learning_rate": 5.9813521024335775e-05, | |
| "loss": 0.7269, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.11522190600723, | |
| "eval_loss": 0.7388279438018799, | |
| "eval_runtime": 17.9018, | |
| "eval_samples_per_second": 55.86, | |
| "eval_steps_per_second": 13.965, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.1190677640181526, | |
| "grad_norm": 0.8702675104141235, | |
| "learning_rate": 5.955380100251928e-05, | |
| "loss": 0.7034, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 2.1229136220290745, | |
| "grad_norm": 0.9476292729377747, | |
| "learning_rate": 5.929408098070281e-05, | |
| "loss": 0.7075, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.126759480039997, | |
| "grad_norm": 0.7763323783874512, | |
| "learning_rate": 5.903436095888633e-05, | |
| "loss": 0.7401, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 2.1306053380509193, | |
| "grad_norm": 0.9647789001464844, | |
| "learning_rate": 5.8774640937069847e-05, | |
| "loss": 0.7308, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.1344511960618413, | |
| "grad_norm": 0.7125420570373535, | |
| "learning_rate": 5.851492091525336e-05, | |
| "loss": 0.7337, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.1344511960618413, | |
| "eval_loss": 0.736056387424469, | |
| "eval_runtime": 18.0862, | |
| "eval_samples_per_second": 55.291, | |
| "eval_steps_per_second": 13.823, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.1382970540727637, | |
| "grad_norm": 1.1053630113601685, | |
| "learning_rate": 5.825520089343688e-05, | |
| "loss": 0.7434, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.1421429120836857, | |
| "grad_norm": 0.751206636428833, | |
| "learning_rate": 5.79954808716204e-05, | |
| "loss": 0.7208, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 2.145988770094608, | |
| "grad_norm": 1.0049408674240112, | |
| "learning_rate": 5.773576084980391e-05, | |
| "loss": 0.7265, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 2.1498346281055305, | |
| "grad_norm": 0.9738804697990417, | |
| "learning_rate": 5.747604082798743e-05, | |
| "loss": 0.7515, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 2.1536804861164525, | |
| "grad_norm": 0.7807592153549194, | |
| "learning_rate": 5.721632080617095e-05, | |
| "loss": 0.7309, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.1536804861164525, | |
| "eval_loss": 0.736193835735321, | |
| "eval_runtime": 17.9436, | |
| "eval_samples_per_second": 55.73, | |
| "eval_steps_per_second": 13.933, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 2.157526344127375, | |
| "grad_norm": 0.9337176084518433, | |
| "learning_rate": 5.6956600784354464e-05, | |
| "loss": 0.7844, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 2.1613722021382973, | |
| "grad_norm": 0.7867174744606018, | |
| "learning_rate": 5.669688076253798e-05, | |
| "loss": 0.7634, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 2.1652180601492192, | |
| "grad_norm": 0.6526890397071838, | |
| "learning_rate": 5.643716074072151e-05, | |
| "loss": 0.743, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 2.1690639181601417, | |
| "grad_norm": 1.1720079183578491, | |
| "learning_rate": 5.617744071890503e-05, | |
| "loss": 0.7165, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 2.1729097761710636, | |
| "grad_norm": 1.029062032699585, | |
| "learning_rate": 5.5917720697088535e-05, | |
| "loss": 0.6854, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 2.1729097761710636, | |
| "eval_loss": 0.7313055396080017, | |
| "eval_runtime": 18.0992, | |
| "eval_samples_per_second": 55.251, | |
| "eval_steps_per_second": 13.813, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 2.176755634181986, | |
| "grad_norm": 0.821443498134613, | |
| "learning_rate": 5.565800067527206e-05, | |
| "loss": 0.7209, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 2.1806014921929084, | |
| "grad_norm": 0.6121924519538879, | |
| "learning_rate": 5.539828065345558e-05, | |
| "loss": 0.7371, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 2.1844473502038304, | |
| "grad_norm": 0.8862821459770203, | |
| "learning_rate": 5.5138560631639094e-05, | |
| "loss": 0.684, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 2.188293208214753, | |
| "grad_norm": 0.8110325336456299, | |
| "learning_rate": 5.4878840609822614e-05, | |
| "loss": 0.6843, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 2.192139066225675, | |
| "grad_norm": 0.732751727104187, | |
| "learning_rate": 5.461912058800613e-05, | |
| "loss": 0.7229, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.192139066225675, | |
| "eval_loss": 0.7322584986686707, | |
| "eval_runtime": 18.025, | |
| "eval_samples_per_second": 55.478, | |
| "eval_steps_per_second": 13.87, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.195984924236597, | |
| "grad_norm": 0.8648149371147156, | |
| "learning_rate": 5.435940056618965e-05, | |
| "loss": 0.6874, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 2.1998307822475196, | |
| "grad_norm": 0.6547895073890686, | |
| "learning_rate": 5.4099680544373166e-05, | |
| "loss": 0.7115, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 2.2036766402584416, | |
| "grad_norm": 0.7705133557319641, | |
| "learning_rate": 5.3839960522556685e-05, | |
| "loss": 0.7498, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 2.207522498269364, | |
| "grad_norm": 1.2679988145828247, | |
| "learning_rate": 5.358024050074021e-05, | |
| "loss": 0.6956, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 2.211368356280286, | |
| "grad_norm": 0.4992118775844574, | |
| "learning_rate": 5.332052047892372e-05, | |
| "loss": 0.7124, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 2.211368356280286, | |
| "eval_loss": 0.7316113710403442, | |
| "eval_runtime": 17.8507, | |
| "eval_samples_per_second": 56.02, | |
| "eval_steps_per_second": 14.005, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 2.2152142142912084, | |
| "grad_norm": 0.8134187459945679, | |
| "learning_rate": 5.306080045710724e-05, | |
| "loss": 0.733, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.2190600723021308, | |
| "grad_norm": 0.5038111805915833, | |
| "learning_rate": 5.2801080435290764e-05, | |
| "loss": 0.7239, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 2.2229059303130527, | |
| "grad_norm": 1.0252164602279663, | |
| "learning_rate": 5.254136041347427e-05, | |
| "loss": 0.7102, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 2.226751788323975, | |
| "grad_norm": 0.823451817035675, | |
| "learning_rate": 5.2281640391657796e-05, | |
| "loss": 0.709, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 2.230597646334897, | |
| "grad_norm": 0.7642868161201477, | |
| "learning_rate": 5.2021920369841316e-05, | |
| "loss": 0.7326, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.230597646334897, | |
| "eval_loss": 0.7284311652183533, | |
| "eval_runtime": 17.8949, | |
| "eval_samples_per_second": 55.882, | |
| "eval_steps_per_second": 13.97, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.2344435043458195, | |
| "grad_norm": 0.45148393511772156, | |
| "learning_rate": 5.1762200348024836e-05, | |
| "loss": 0.7068, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 2.238289362356742, | |
| "grad_norm": 0.4047794044017792, | |
| "learning_rate": 5.150248032620835e-05, | |
| "loss": 0.7015, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 2.242135220367664, | |
| "grad_norm": 0.8585315942764282, | |
| "learning_rate": 5.124276030439187e-05, | |
| "loss": 0.7028, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 2.2459810783785863, | |
| "grad_norm": 0.5665230751037598, | |
| "learning_rate": 5.098304028257539e-05, | |
| "loss": 0.658, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 2.2498269363895087, | |
| "grad_norm": 0.7042600512504578, | |
| "learning_rate": 5.07233202607589e-05, | |
| "loss": 0.6938, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 2.2498269363895087, | |
| "eval_loss": 0.7279431223869324, | |
| "eval_runtime": 17.8253, | |
| "eval_samples_per_second": 56.1, | |
| "eval_steps_per_second": 14.025, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 2.2536727944004307, | |
| "grad_norm": 0.7422949075698853, | |
| "learning_rate": 5.046360023894242e-05, | |
| "loss": 0.6996, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 2.257518652411353, | |
| "grad_norm": 1.0139210224151611, | |
| "learning_rate": 5.020388021712594e-05, | |
| "loss": 0.7311, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 2.261364510422275, | |
| "grad_norm": 0.5937057137489319, | |
| "learning_rate": 4.994416019530946e-05, | |
| "loss": 0.704, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 2.2652103684331975, | |
| "grad_norm": 1.064329981803894, | |
| "learning_rate": 4.968444017349297e-05, | |
| "loss": 0.6792, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 2.26905622644412, | |
| "grad_norm": 0.9638292789459229, | |
| "learning_rate": 4.94247201516765e-05, | |
| "loss": 0.74, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.26905622644412, | |
| "eval_loss": 0.7287396192550659, | |
| "eval_runtime": 17.8735, | |
| "eval_samples_per_second": 55.949, | |
| "eval_steps_per_second": 13.987, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.272902084455042, | |
| "grad_norm": 0.7172055840492249, | |
| "learning_rate": 4.916500012986001e-05, | |
| "loss": 0.6984, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 2.2767479424659642, | |
| "grad_norm": 0.6817266345024109, | |
| "learning_rate": 4.890528010804353e-05, | |
| "loss": 0.7059, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 2.280593800476886, | |
| "grad_norm": 1.0110056400299072, | |
| "learning_rate": 4.864556008622705e-05, | |
| "loss": 0.7304, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 2.2844396584878086, | |
| "grad_norm": 0.6823923587799072, | |
| "learning_rate": 4.838584006441057e-05, | |
| "loss": 0.7462, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 2.288285516498731, | |
| "grad_norm": 0.7316113710403442, | |
| "learning_rate": 4.812612004259408e-05, | |
| "loss": 0.7068, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 2.288285516498731, | |
| "eval_loss": 0.7273894548416138, | |
| "eval_runtime": 17.8859, | |
| "eval_samples_per_second": 55.91, | |
| "eval_steps_per_second": 13.978, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 2.292131374509653, | |
| "grad_norm": 1.200492024421692, | |
| "learning_rate": 4.78664000207776e-05, | |
| "loss": 0.6887, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 2.2959772325205754, | |
| "grad_norm": 1.2268471717834473, | |
| "learning_rate": 4.760667999896112e-05, | |
| "loss": 0.7259, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 2.299823090531498, | |
| "grad_norm": 0.7251473069190979, | |
| "learning_rate": 4.734695997714464e-05, | |
| "loss": 0.6931, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 2.3036689485424198, | |
| "grad_norm": 0.5327921509742737, | |
| "learning_rate": 4.708723995532816e-05, | |
| "loss": 0.7447, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 2.307514806553342, | |
| "grad_norm": 0.7111462950706482, | |
| "learning_rate": 4.6827519933511674e-05, | |
| "loss": 0.7221, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.307514806553342, | |
| "eval_loss": 0.7264938354492188, | |
| "eval_runtime": 17.7847, | |
| "eval_samples_per_second": 56.228, | |
| "eval_steps_per_second": 14.057, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.311360664564264, | |
| "grad_norm": 0.861571729183197, | |
| "learning_rate": 4.6567799911695194e-05, | |
| "loss": 0.7052, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 2.3152065225751866, | |
| "grad_norm": 0.9279738068580627, | |
| "learning_rate": 4.6308079889878714e-05, | |
| "loss": 0.715, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 2.3190523805861085, | |
| "grad_norm": 0.8576169013977051, | |
| "learning_rate": 4.6048359868062226e-05, | |
| "loss": 0.7125, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 2.322898238597031, | |
| "grad_norm": 1.3127994537353516, | |
| "learning_rate": 4.578863984624575e-05, | |
| "loss": 0.6703, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 2.3267440966079533, | |
| "grad_norm": 0.5930036306381226, | |
| "learning_rate": 4.5528919824429266e-05, | |
| "loss": 0.6927, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 2.3267440966079533, | |
| "eval_loss": 0.7252874970436096, | |
| "eval_runtime": 17.8506, | |
| "eval_samples_per_second": 56.021, | |
| "eval_steps_per_second": 14.005, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 2.3305899546188753, | |
| "grad_norm": 0.6445633769035339, | |
| "learning_rate": 4.5269199802612785e-05, | |
| "loss": 0.6804, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 2.3344358126297977, | |
| "grad_norm": 0.9251648783683777, | |
| "learning_rate": 4.5009479780796305e-05, | |
| "loss": 0.7226, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 2.33828167064072, | |
| "grad_norm": 1.2103322744369507, | |
| "learning_rate": 4.474975975897982e-05, | |
| "loss": 0.7179, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 2.342127528651642, | |
| "grad_norm": 1.0718954801559448, | |
| "learning_rate": 4.4490039737163344e-05, | |
| "loss": 0.7638, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 2.3459733866625645, | |
| "grad_norm": 0.8156006336212158, | |
| "learning_rate": 4.423031971534686e-05, | |
| "loss": 0.6733, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.3459733866625645, | |
| "eval_loss": 0.7244414687156677, | |
| "eval_runtime": 17.9373, | |
| "eval_samples_per_second": 55.75, | |
| "eval_steps_per_second": 13.937, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.3498192446734865, | |
| "grad_norm": 0.9593235850334167, | |
| "learning_rate": 4.3970599693530377e-05, | |
| "loss": 0.7266, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 2.353665102684409, | |
| "grad_norm": 0.4570913016796112, | |
| "learning_rate": 4.3710879671713896e-05, | |
| "loss": 0.7026, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 2.3575109606953313, | |
| "grad_norm": 0.8020208477973938, | |
| "learning_rate": 4.345115964989741e-05, | |
| "loss": 0.7345, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 2.3613568187062532, | |
| "grad_norm": 0.7795267701148987, | |
| "learning_rate": 4.319143962808093e-05, | |
| "loss": 0.7351, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 2.3652026767171757, | |
| "grad_norm": 0.6240664720535278, | |
| "learning_rate": 4.293171960626445e-05, | |
| "loss": 0.7029, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 2.3652026767171757, | |
| "eval_loss": 0.7251197099685669, | |
| "eval_runtime": 17.8892, | |
| "eval_samples_per_second": 55.9, | |
| "eval_steps_per_second": 13.975, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 2.3690485347280976, | |
| "grad_norm": 0.773654043674469, | |
| "learning_rate": 4.267199958444797e-05, | |
| "loss": 0.7105, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 2.37289439273902, | |
| "grad_norm": 1.1365927457809448, | |
| "learning_rate": 4.241227956263149e-05, | |
| "loss": 0.7019, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 2.3767402507499424, | |
| "grad_norm": 0.6990851759910583, | |
| "learning_rate": 4.2152559540815e-05, | |
| "loss": 0.6914, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 2.3805861087608644, | |
| "grad_norm": 0.8598945140838623, | |
| "learning_rate": 4.189283951899852e-05, | |
| "loss": 0.7087, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 2.384431966771787, | |
| "grad_norm": 0.9121548533439636, | |
| "learning_rate": 4.163311949718204e-05, | |
| "loss": 0.7212, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.384431966771787, | |
| "eval_loss": 0.7226839661598206, | |
| "eval_runtime": 17.8913, | |
| "eval_samples_per_second": 55.893, | |
| "eval_steps_per_second": 13.973, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.3882778247827092, | |
| "grad_norm": 0.6950593590736389, | |
| "learning_rate": 4.137339947536556e-05, | |
| "loss": 0.7201, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 2.392123682793631, | |
| "grad_norm": 0.7376019358634949, | |
| "learning_rate": 4.111367945354908e-05, | |
| "loss": 0.7157, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 2.3959695408045536, | |
| "grad_norm": 1.286970853805542, | |
| "learning_rate": 4.08539594317326e-05, | |
| "loss": 0.7325, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 2.3998153988154756, | |
| "grad_norm": 0.5061975121498108, | |
| "learning_rate": 4.059423940991611e-05, | |
| "loss": 0.7206, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 2.403661256826398, | |
| "grad_norm": 0.7503495216369629, | |
| "learning_rate": 4.033451938809963e-05, | |
| "loss": 0.7197, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 2.403661256826398, | |
| "eval_loss": 0.7893036007881165, | |
| "eval_runtime": 17.9552, | |
| "eval_samples_per_second": 55.694, | |
| "eval_steps_per_second": 13.924, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 2.4075071148373204, | |
| "grad_norm": 0.6520366668701172, | |
| "learning_rate": 4.007479936628315e-05, | |
| "loss": 0.7496, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 2.4113529728482423, | |
| "grad_norm": 0.7475297451019287, | |
| "learning_rate": 3.981507934446666e-05, | |
| "loss": 0.7283, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 2.4151988308591648, | |
| "grad_norm": 1.0714281797409058, | |
| "learning_rate": 3.955535932265019e-05, | |
| "loss": 0.7553, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 2.4190446888700867, | |
| "grad_norm": 0.6734263300895691, | |
| "learning_rate": 3.92956393008337e-05, | |
| "loss": 0.698, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 2.422890546881009, | |
| "grad_norm": 0.7820257544517517, | |
| "learning_rate": 3.903591927901722e-05, | |
| "loss": 0.7346, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.422890546881009, | |
| "eval_loss": 0.7877212762832642, | |
| "eval_runtime": 17.9806, | |
| "eval_samples_per_second": 55.616, | |
| "eval_steps_per_second": 13.904, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.4267364048919315, | |
| "grad_norm": 0.9221381545066833, | |
| "learning_rate": 3.877619925720074e-05, | |
| "loss": 0.7077, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 2.4305822629028535, | |
| "grad_norm": 1.1155864000320435, | |
| "learning_rate": 3.8516479235384255e-05, | |
| "loss": 0.7358, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 2.434428120913776, | |
| "grad_norm": 0.949946939945221, | |
| "learning_rate": 3.825675921356778e-05, | |
| "loss": 0.7314, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 2.4382739789246983, | |
| "grad_norm": 0.7200281023979187, | |
| "learning_rate": 3.7997039191751294e-05, | |
| "loss": 0.6781, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 2.4421198369356203, | |
| "grad_norm": 1.1924189329147339, | |
| "learning_rate": 3.773731916993481e-05, | |
| "loss": 0.7116, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 2.4421198369356203, | |
| "eval_loss": 0.7858553528785706, | |
| "eval_runtime": 18.0342, | |
| "eval_samples_per_second": 55.45, | |
| "eval_steps_per_second": 13.863, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 2.4459656949465427, | |
| "grad_norm": 0.7993971109390259, | |
| "learning_rate": 3.747759914811833e-05, | |
| "loss": 0.7068, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 2.4498115529574647, | |
| "grad_norm": 0.6277671456336975, | |
| "learning_rate": 3.7217879126301846e-05, | |
| "loss": 0.7273, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 2.453657410968387, | |
| "grad_norm": 0.8524878621101379, | |
| "learning_rate": 3.6958159104485365e-05, | |
| "loss": 0.7537, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 2.457503268979309, | |
| "grad_norm": 0.9068925976753235, | |
| "learning_rate": 3.6698439082668885e-05, | |
| "loss": 0.7192, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 2.4613491269902315, | |
| "grad_norm": 0.866385817527771, | |
| "learning_rate": 3.6438719060852405e-05, | |
| "loss": 0.7145, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.4613491269902315, | |
| "eval_loss": 0.7852405905723572, | |
| "eval_runtime": 18.0021, | |
| "eval_samples_per_second": 55.549, | |
| "eval_steps_per_second": 13.887, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.465194985001154, | |
| "grad_norm": 0.8729520440101624, | |
| "learning_rate": 3.6178999039035924e-05, | |
| "loss": 0.7121, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 2.469040843012076, | |
| "grad_norm": 1.2588157653808594, | |
| "learning_rate": 3.591927901721944e-05, | |
| "loss": 0.7145, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 2.4728867010229982, | |
| "grad_norm": 1.0234293937683105, | |
| "learning_rate": 3.565955899540296e-05, | |
| "loss": 0.7173, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 2.4767325590339206, | |
| "grad_norm": 0.6210401058197021, | |
| "learning_rate": 3.5399838973586476e-05, | |
| "loss": 0.7404, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 2.4805784170448426, | |
| "grad_norm": 1.0649775266647339, | |
| "learning_rate": 3.5140118951769996e-05, | |
| "loss": 0.713, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 2.4805784170448426, | |
| "eval_loss": 0.7857936024665833, | |
| "eval_runtime": 17.9415, | |
| "eval_samples_per_second": 55.737, | |
| "eval_steps_per_second": 13.934, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 2.484424275055765, | |
| "grad_norm": 0.6743142604827881, | |
| "learning_rate": 3.488039892995351e-05, | |
| "loss": 0.7213, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 2.488270133066687, | |
| "grad_norm": 0.7584249377250671, | |
| "learning_rate": 3.462067890813703e-05, | |
| "loss": 0.7175, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 2.4921159910776094, | |
| "grad_norm": 1.7324374914169312, | |
| "learning_rate": 3.436095888632055e-05, | |
| "loss": 0.7255, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 2.495961849088532, | |
| "grad_norm": 1.0071933269500732, | |
| "learning_rate": 3.410123886450407e-05, | |
| "loss": 0.6905, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 2.4998077070994538, | |
| "grad_norm": 0.8606531023979187, | |
| "learning_rate": 3.384151884268759e-05, | |
| "loss": 0.7235, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.4998077070994538, | |
| "eval_loss": 0.7832362651824951, | |
| "eval_runtime": 17.9726, | |
| "eval_samples_per_second": 55.64, | |
| "eval_steps_per_second": 13.91, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.503653565110376, | |
| "grad_norm": 0.7658631205558777, | |
| "learning_rate": 3.35817988208711e-05, | |
| "loss": 0.7258, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 2.507499423121298, | |
| "grad_norm": 1.3583028316497803, | |
| "learning_rate": 3.3322078799054627e-05, | |
| "loss": 0.7414, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 2.5113452811322206, | |
| "grad_norm": 0.7680505514144897, | |
| "learning_rate": 3.306235877723814e-05, | |
| "loss": 0.7126, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 2.515191139143143, | |
| "grad_norm": 0.9117040634155273, | |
| "learning_rate": 3.280263875542165e-05, | |
| "loss": 0.7019, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 2.519036997154065, | |
| "grad_norm": 0.995895504951477, | |
| "learning_rate": 3.254291873360518e-05, | |
| "loss": 0.7272, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 2.519036997154065, | |
| "eval_loss": 0.7824276089668274, | |
| "eval_runtime": 17.8488, | |
| "eval_samples_per_second": 56.026, | |
| "eval_steps_per_second": 14.007, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 2.5228828551649873, | |
| "grad_norm": 0.6426506042480469, | |
| "learning_rate": 3.228319871178869e-05, | |
| "loss": 0.7309, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 2.5267287131759097, | |
| "grad_norm": 1.327599048614502, | |
| "learning_rate": 3.202347868997221e-05, | |
| "loss": 0.7354, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 2.5305745711868317, | |
| "grad_norm": 0.9184108972549438, | |
| "learning_rate": 3.176375866815573e-05, | |
| "loss": 0.7053, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 2.534420429197754, | |
| "grad_norm": 0.9938299059867859, | |
| "learning_rate": 3.1504038646339244e-05, | |
| "loss": 0.7374, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 2.5382662872086765, | |
| "grad_norm": 1.230980396270752, | |
| "learning_rate": 3.124431862452277e-05, | |
| "loss": 0.7261, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.5382662872086765, | |
| "eval_loss": 0.7817492485046387, | |
| "eval_runtime": 17.9221, | |
| "eval_samples_per_second": 55.797, | |
| "eval_steps_per_second": 13.949, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.5421121452195985, | |
| "grad_norm": 0.7762789726257324, | |
| "learning_rate": 3.098459860270628e-05, | |
| "loss": 0.7279, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 2.5459580032305205, | |
| "grad_norm": 0.8807786703109741, | |
| "learning_rate": 3.07248785808898e-05, | |
| "loss": 0.6957, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 2.549803861241443, | |
| "grad_norm": 0.8823468089103699, | |
| "learning_rate": 3.046515855907332e-05, | |
| "loss": 0.7584, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 2.5536497192523653, | |
| "grad_norm": 0.6461008191108704, | |
| "learning_rate": 3.0205438537256835e-05, | |
| "loss": 0.7231, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 2.5574955772632872, | |
| "grad_norm": 0.9959568977355957, | |
| "learning_rate": 2.9945718515440358e-05, | |
| "loss": 0.7097, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 2.5574955772632872, | |
| "eval_loss": 0.7804549336433411, | |
| "eval_runtime": 17.9765, | |
| "eval_samples_per_second": 55.628, | |
| "eval_steps_per_second": 13.907, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 2.5613414352742097, | |
| "grad_norm": 1.0902256965637207, | |
| "learning_rate": 2.9685998493623874e-05, | |
| "loss": 0.7259, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 2.565187293285132, | |
| "grad_norm": 0.8527780771255493, | |
| "learning_rate": 2.9426278471807394e-05, | |
| "loss": 0.7138, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 2.569033151296054, | |
| "grad_norm": 0.7497609257698059, | |
| "learning_rate": 2.916655844999091e-05, | |
| "loss": 0.6676, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 2.5728790093069764, | |
| "grad_norm": 1.252274751663208, | |
| "learning_rate": 2.8906838428174433e-05, | |
| "loss": 0.7545, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 2.576724867317899, | |
| "grad_norm": 0.8742374777793884, | |
| "learning_rate": 2.864711840635795e-05, | |
| "loss": 0.6727, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.576724867317899, | |
| "eval_loss": 0.7791668176651001, | |
| "eval_runtime": 17.8738, | |
| "eval_samples_per_second": 55.948, | |
| "eval_steps_per_second": 13.987, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.580570725328821, | |
| "grad_norm": 0.5088424682617188, | |
| "learning_rate": 2.8387398384541465e-05, | |
| "loss": 0.7113, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 2.5844165833397432, | |
| "grad_norm": 0.6116564273834229, | |
| "learning_rate": 2.8127678362724985e-05, | |
| "loss": 0.7523, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 2.588262441350665, | |
| "grad_norm": 0.6378856301307678, | |
| "learning_rate": 2.78679583409085e-05, | |
| "loss": 0.6924, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 2.5921082993615876, | |
| "grad_norm": 1.1341512203216553, | |
| "learning_rate": 2.760823831909202e-05, | |
| "loss": 0.7089, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 2.5959541573725096, | |
| "grad_norm": 0.8231232762336731, | |
| "learning_rate": 2.7348518297275537e-05, | |
| "loss": 0.7309, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 2.5959541573725096, | |
| "eval_loss": 0.7776284217834473, | |
| "eval_runtime": 17.9143, | |
| "eval_samples_per_second": 55.821, | |
| "eval_steps_per_second": 13.955, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 2.599800015383432, | |
| "grad_norm": 0.7154203653335571, | |
| "learning_rate": 2.7088798275459053e-05, | |
| "loss": 0.7273, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 2.6036458733943544, | |
| "grad_norm": 0.9213638305664062, | |
| "learning_rate": 2.6829078253642576e-05, | |
| "loss": 0.7628, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 2.6074917314052763, | |
| "grad_norm": 1.260438084602356, | |
| "learning_rate": 2.6569358231826092e-05, | |
| "loss": 0.7004, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 2.6113375894161988, | |
| "grad_norm": 0.9463502764701843, | |
| "learning_rate": 2.6309638210009612e-05, | |
| "loss": 0.6771, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 2.615183447427121, | |
| "grad_norm": 0.7610837817192078, | |
| "learning_rate": 2.604991818819313e-05, | |
| "loss": 0.7217, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.615183447427121, | |
| "eval_loss": 0.7775171995162964, | |
| "eval_runtime": 17.9057, | |
| "eval_samples_per_second": 55.848, | |
| "eval_steps_per_second": 13.962, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.619029305438043, | |
| "grad_norm": 0.4978080093860626, | |
| "learning_rate": 2.5790198166376645e-05, | |
| "loss": 0.7309, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 2.6228751634489655, | |
| "grad_norm": 0.779080331325531, | |
| "learning_rate": 2.5530478144560168e-05, | |
| "loss": 0.7105, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 2.626721021459888, | |
| "grad_norm": 0.5153629779815674, | |
| "learning_rate": 2.5275952523180012e-05, | |
| "loss": 0.7377, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 2.63056687947081, | |
| "grad_norm": 0.8356613516807556, | |
| "learning_rate": 2.5016232501363528e-05, | |
| "loss": 0.7135, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 2.6344127374817323, | |
| "grad_norm": 0.5202348232269287, | |
| "learning_rate": 2.475651247954705e-05, | |
| "loss": 0.7333, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 2.6344127374817323, | |
| "eval_loss": 0.7767261862754822, | |
| "eval_runtime": 17.8651, | |
| "eval_samples_per_second": 55.975, | |
| "eval_steps_per_second": 13.994, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 2.6382585954926543, | |
| "grad_norm": 0.9579488039016724, | |
| "learning_rate": 2.4496792457730567e-05, | |
| "loss": 0.7823, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 2.6421044535035767, | |
| "grad_norm": 0.7704477906227112, | |
| "learning_rate": 2.4237072435914084e-05, | |
| "loss": 0.6979, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 2.6459503115144987, | |
| "grad_norm": 0.8563690781593323, | |
| "learning_rate": 2.3977352414097603e-05, | |
| "loss": 0.7049, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 2.649796169525421, | |
| "grad_norm": 0.663038432598114, | |
| "learning_rate": 2.3717632392281123e-05, | |
| "loss": 0.751, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 2.6536420275363435, | |
| "grad_norm": 0.8598125576972961, | |
| "learning_rate": 2.345791237046464e-05, | |
| "loss": 0.6982, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.6536420275363435, | |
| "eval_loss": 0.7752255201339722, | |
| "eval_runtime": 17.9612, | |
| "eval_samples_per_second": 55.675, | |
| "eval_steps_per_second": 13.919, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.6574878855472654, | |
| "grad_norm": 1.2697360515594482, | |
| "learning_rate": 2.319819234864816e-05, | |
| "loss": 0.7289, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 2.661333743558188, | |
| "grad_norm": 0.6098369359970093, | |
| "learning_rate": 2.2938472326831675e-05, | |
| "loss": 0.6948, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 2.6651796015691103, | |
| "grad_norm": 0.48443081974983215, | |
| "learning_rate": 2.2678752305015195e-05, | |
| "loss": 0.7159, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 2.6690254595800322, | |
| "grad_norm": 0.7432298064231873, | |
| "learning_rate": 2.2419032283198714e-05, | |
| "loss": 0.7089, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 2.6728713175909546, | |
| "grad_norm": 0.7649087309837341, | |
| "learning_rate": 2.215931226138223e-05, | |
| "loss": 0.7107, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 2.6728713175909546, | |
| "eval_loss": 0.774026095867157, | |
| "eval_runtime": 18.0048, | |
| "eval_samples_per_second": 55.541, | |
| "eval_steps_per_second": 13.885, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 2.676717175601877, | |
| "grad_norm": 0.984624445438385, | |
| "learning_rate": 2.189959223956575e-05, | |
| "loss": 0.749, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 2.680563033612799, | |
| "grad_norm": 0.7625775933265686, | |
| "learning_rate": 2.163987221774927e-05, | |
| "loss": 0.6645, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 2.684408891623721, | |
| "grad_norm": 0.846238374710083, | |
| "learning_rate": 2.1380152195932786e-05, | |
| "loss": 0.7468, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 2.6882547496346434, | |
| "grad_norm": 1.1688568592071533, | |
| "learning_rate": 2.1120432174116302e-05, | |
| "loss": 0.7105, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 2.692100607645566, | |
| "grad_norm": 0.9417968392372131, | |
| "learning_rate": 2.0860712152299822e-05, | |
| "loss": 0.6826, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.692100607645566, | |
| "eval_loss": 0.7743579149246216, | |
| "eval_runtime": 18.0822, | |
| "eval_samples_per_second": 55.303, | |
| "eval_steps_per_second": 13.826, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.6959464656564878, | |
| "grad_norm": 1.1616791486740112, | |
| "learning_rate": 2.060099213048334e-05, | |
| "loss": 0.7242, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 2.69979232366741, | |
| "grad_norm": 0.9195474982261658, | |
| "learning_rate": 2.0341272108666858e-05, | |
| "loss": 0.7112, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 2.7036381816783326, | |
| "grad_norm": 1.168445110321045, | |
| "learning_rate": 2.0081552086850377e-05, | |
| "loss": 0.7331, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 2.7074840396892546, | |
| "grad_norm": 1.3413971662521362, | |
| "learning_rate": 1.9821832065033893e-05, | |
| "loss": 0.7627, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 2.711329897700177, | |
| "grad_norm": 0.9387266039848328, | |
| "learning_rate": 1.9562112043217413e-05, | |
| "loss": 0.6968, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 2.711329897700177, | |
| "eval_loss": 0.7742106914520264, | |
| "eval_runtime": 18.076, | |
| "eval_samples_per_second": 55.322, | |
| "eval_steps_per_second": 13.83, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 2.7151757557110994, | |
| "grad_norm": 0.8906998634338379, | |
| "learning_rate": 1.930239202140093e-05, | |
| "loss": 0.7206, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 2.7190216137220213, | |
| "grad_norm": 0.8276380896568298, | |
| "learning_rate": 1.904267199958445e-05, | |
| "loss": 0.7101, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 2.7228674717329437, | |
| "grad_norm": 0.8341213464736938, | |
| "learning_rate": 1.878295197776797e-05, | |
| "loss": 0.7631, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 2.7267133297438657, | |
| "grad_norm": 0.9501305222511292, | |
| "learning_rate": 1.8523231955951485e-05, | |
| "loss": 0.7138, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 2.730559187754788, | |
| "grad_norm": 0.9375068545341492, | |
| "learning_rate": 1.8263511934135e-05, | |
| "loss": 0.7556, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.730559187754788, | |
| "eval_loss": 0.7727531790733337, | |
| "eval_runtime": 18.0286, | |
| "eval_samples_per_second": 55.467, | |
| "eval_steps_per_second": 13.867, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.73440504576571, | |
| "grad_norm": 0.5093711018562317, | |
| "learning_rate": 1.800379191231852e-05, | |
| "loss": 0.6425, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 2.7382509037766325, | |
| "grad_norm": 1.0487879514694214, | |
| "learning_rate": 1.774407189050204e-05, | |
| "loss": 0.6846, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 2.742096761787555, | |
| "grad_norm": 0.6705742478370667, | |
| "learning_rate": 1.748435186868556e-05, | |
| "loss": 0.7464, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 2.745942619798477, | |
| "grad_norm": 0.43706628680229187, | |
| "learning_rate": 1.7224631846869076e-05, | |
| "loss": 0.7325, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 2.7497884778093993, | |
| "grad_norm": 1.1549192667007446, | |
| "learning_rate": 1.6964911825052592e-05, | |
| "loss": 0.7083, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 2.7497884778093993, | |
| "eval_loss": 0.7717772126197815, | |
| "eval_runtime": 18.085, | |
| "eval_samples_per_second": 55.295, | |
| "eval_steps_per_second": 13.824, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 2.7536343358203217, | |
| "grad_norm": 0.5367007255554199, | |
| "learning_rate": 1.6705191803236112e-05, | |
| "loss": 0.7054, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 2.7574801938312437, | |
| "grad_norm": 0.8213953971862793, | |
| "learning_rate": 1.644547178141963e-05, | |
| "loss": 0.6764, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 2.761326051842166, | |
| "grad_norm": 0.9012633562088013, | |
| "learning_rate": 1.6185751759603148e-05, | |
| "loss": 0.725, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 2.7651719098530885, | |
| "grad_norm": 0.656104326248169, | |
| "learning_rate": 1.5926031737786667e-05, | |
| "loss": 0.7279, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 2.7690177678640104, | |
| "grad_norm": 0.901136040687561, | |
| "learning_rate": 1.5666311715970187e-05, | |
| "loss": 0.7354, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.7690177678640104, | |
| "eval_loss": 0.7717016935348511, | |
| "eval_runtime": 17.9512, | |
| "eval_samples_per_second": 55.707, | |
| "eval_steps_per_second": 13.927, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.772863625874933, | |
| "grad_norm": 0.6917023658752441, | |
| "learning_rate": 1.5406591694153703e-05, | |
| "loss": 0.6798, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 2.776709483885855, | |
| "grad_norm": 0.9695160388946533, | |
| "learning_rate": 1.5146871672337221e-05, | |
| "loss": 0.7063, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 2.780555341896777, | |
| "grad_norm": 1.0134507417678833, | |
| "learning_rate": 1.4887151650520739e-05, | |
| "loss": 0.7325, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 2.784401199907699, | |
| "grad_norm": 0.8022010922431946, | |
| "learning_rate": 1.4627431628704259e-05, | |
| "loss": 0.7095, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 2.7882470579186216, | |
| "grad_norm": 0.8629682660102844, | |
| "learning_rate": 1.4367711606887777e-05, | |
| "loss": 0.6793, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 2.7882470579186216, | |
| "eval_loss": 0.7713639736175537, | |
| "eval_runtime": 17.9977, | |
| "eval_samples_per_second": 55.563, | |
| "eval_steps_per_second": 13.891, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 2.792092915929544, | |
| "grad_norm": 0.8491897583007812, | |
| "learning_rate": 1.4107991585071294e-05, | |
| "loss": 0.6899, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 2.795938773940466, | |
| "grad_norm": 1.0382113456726074, | |
| "learning_rate": 1.384827156325481e-05, | |
| "loss": 0.7362, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 2.7997846319513884, | |
| "grad_norm": 0.7207579016685486, | |
| "learning_rate": 1.358855154143833e-05, | |
| "loss": 0.6984, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 2.803630489962311, | |
| "grad_norm": 0.9483594298362732, | |
| "learning_rate": 1.3328831519621848e-05, | |
| "loss": 0.713, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 2.8074763479732328, | |
| "grad_norm": 1.0805621147155762, | |
| "learning_rate": 1.3069111497805366e-05, | |
| "loss": 0.7235, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.8074763479732328, | |
| "eval_loss": 0.7706654667854309, | |
| "eval_runtime": 17.9083, | |
| "eval_samples_per_second": 55.84, | |
| "eval_steps_per_second": 13.96, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.811322205984155, | |
| "grad_norm": 0.8592945337295532, | |
| "learning_rate": 1.2809391475988886e-05, | |
| "loss": 0.7173, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 2.8151680639950776, | |
| "grad_norm": 1.0562350749969482, | |
| "learning_rate": 1.2549671454172402e-05, | |
| "loss": 0.6661, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 2.8190139220059995, | |
| "grad_norm": 1.0829477310180664, | |
| "learning_rate": 1.2289951432355922e-05, | |
| "loss": 0.7105, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 2.8228597800169215, | |
| "grad_norm": 1.2846815586090088, | |
| "learning_rate": 1.203542581097577e-05, | |
| "loss": 0.7218, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 2.826705638027844, | |
| "grad_norm": 1.3996707201004028, | |
| "learning_rate": 1.1775705789159287e-05, | |
| "loss": 0.7348, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 2.826705638027844, | |
| "eval_loss": 0.7698732018470764, | |
| "eval_runtime": 17.9822, | |
| "eval_samples_per_second": 55.611, | |
| "eval_steps_per_second": 13.903, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 2.8305514960387663, | |
| "grad_norm": 1.040479302406311, | |
| "learning_rate": 1.1515985767342805e-05, | |
| "loss": 0.7103, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 2.8343973540496883, | |
| "grad_norm": 0.8566408753395081, | |
| "learning_rate": 1.1256265745526323e-05, | |
| "loss": 0.7087, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 2.8382432120606107, | |
| "grad_norm": 1.0727367401123047, | |
| "learning_rate": 1.0996545723709841e-05, | |
| "loss": 0.6972, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 2.842089070071533, | |
| "grad_norm": 0.8675785064697266, | |
| "learning_rate": 1.0736825701893359e-05, | |
| "loss": 0.7318, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 2.845934928082455, | |
| "grad_norm": 1.2655267715454102, | |
| "learning_rate": 1.0477105680076877e-05, | |
| "loss": 0.7333, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.845934928082455, | |
| "eval_loss": 0.7695651650428772, | |
| "eval_runtime": 17.7976, | |
| "eval_samples_per_second": 56.187, | |
| "eval_steps_per_second": 14.047, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.8497807860933775, | |
| "grad_norm": 1.1233916282653809, | |
| "learning_rate": 1.0217385658260397e-05, | |
| "loss": 0.7128, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 2.8536266441043, | |
| "grad_norm": 0.917649507522583, | |
| "learning_rate": 9.957665636443913e-06, | |
| "loss": 0.7402, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 2.857472502115222, | |
| "grad_norm": 0.8935102820396423, | |
| "learning_rate": 9.697945614627432e-06, | |
| "loss": 0.731, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 2.8613183601261443, | |
| "grad_norm": 0.6891331076622009, | |
| "learning_rate": 9.43822559281095e-06, | |
| "loss": 0.7331, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 2.8651642181370662, | |
| "grad_norm": 0.7505995631217957, | |
| "learning_rate": 9.178505570994468e-06, | |
| "loss": 0.6744, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 2.8651642181370662, | |
| "eval_loss": 0.7693511247634888, | |
| "eval_runtime": 17.8693, | |
| "eval_samples_per_second": 55.962, | |
| "eval_steps_per_second": 13.99, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 2.8690100761479886, | |
| "grad_norm": 1.2373569011688232, | |
| "learning_rate": 8.918785549177986e-06, | |
| "loss": 0.6981, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 2.8728559341589106, | |
| "grad_norm": 0.9159016013145447, | |
| "learning_rate": 8.659065527361506e-06, | |
| "loss": 0.7601, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 2.876701792169833, | |
| "grad_norm": 0.3170250952243805, | |
| "learning_rate": 8.399345505545022e-06, | |
| "loss": 0.7008, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 2.8805476501807554, | |
| "grad_norm": 0.7592608332633972, | |
| "learning_rate": 8.139625483728542e-06, | |
| "loss": 0.6966, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 2.8843935081916774, | |
| "grad_norm": 0.7826717495918274, | |
| "learning_rate": 7.879905461912058e-06, | |
| "loss": 0.7398, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.8843935081916774, | |
| "eval_loss": 0.7694031596183777, | |
| "eval_runtime": 17.9358, | |
| "eval_samples_per_second": 55.754, | |
| "eval_steps_per_second": 13.939, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.8882393662026, | |
| "grad_norm": 0.6858485341072083, | |
| "learning_rate": 7.6201854400955775e-06, | |
| "loss": 0.7132, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 2.892085224213522, | |
| "grad_norm": 0.7138088345527649, | |
| "learning_rate": 7.3604654182790955e-06, | |
| "loss": 0.7082, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 2.895931082224444, | |
| "grad_norm": 0.4927150309085846, | |
| "learning_rate": 7.100745396462613e-06, | |
| "loss": 0.7551, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 2.8997769402353666, | |
| "grad_norm": 0.879112720489502, | |
| "learning_rate": 6.841025374646131e-06, | |
| "loss": 0.7228, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 2.903622798246289, | |
| "grad_norm": 1.2699699401855469, | |
| "learning_rate": 6.58130535282965e-06, | |
| "loss": 0.7208, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 2.903622798246289, | |
| "eval_loss": 0.7685362696647644, | |
| "eval_runtime": 17.9674, | |
| "eval_samples_per_second": 55.656, | |
| "eval_steps_per_second": 13.914, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 2.907468656257211, | |
| "grad_norm": 0.7341476082801819, | |
| "learning_rate": 6.321585331013168e-06, | |
| "loss": 0.761, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 2.9113145142681334, | |
| "grad_norm": 0.8890082836151123, | |
| "learning_rate": 6.061865309196686e-06, | |
| "loss": 0.6837, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 2.9151603722790553, | |
| "grad_norm": 0.5546180009841919, | |
| "learning_rate": 5.802145287380204e-06, | |
| "loss": 0.7126, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 2.9190062302899777, | |
| "grad_norm": 0.7684674263000488, | |
| "learning_rate": 5.542425265563723e-06, | |
| "loss": 0.6765, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 2.9228520883008997, | |
| "grad_norm": 0.8968291282653809, | |
| "learning_rate": 5.2827052437472405e-06, | |
| "loss": 0.6839, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.9228520883008997, | |
| "eval_loss": 0.7687397003173828, | |
| "eval_runtime": 17.8165, | |
| "eval_samples_per_second": 56.128, | |
| "eval_steps_per_second": 14.032, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.926697946311822, | |
| "grad_norm": 0.8798107504844666, | |
| "learning_rate": 5.0229852219307584e-06, | |
| "loss": 0.7084, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 2.9305438043227445, | |
| "grad_norm": 0.3845706284046173, | |
| "learning_rate": 4.763265200114277e-06, | |
| "loss": 0.6764, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 2.9343896623336665, | |
| "grad_norm": 0.6847463846206665, | |
| "learning_rate": 4.503545178297795e-06, | |
| "loss": 0.7165, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 2.938235520344589, | |
| "grad_norm": 0.7632951736450195, | |
| "learning_rate": 4.243825156481313e-06, | |
| "loss": 0.7311, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 2.9420813783555113, | |
| "grad_norm": 1.3314287662506104, | |
| "learning_rate": 3.984105134664832e-06, | |
| "loss": 0.6852, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 2.9420813783555113, | |
| "eval_loss": 0.7683274149894714, | |
| "eval_runtime": 17.9706, | |
| "eval_samples_per_second": 55.646, | |
| "eval_steps_per_second": 13.912, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 2.9459272363664333, | |
| "grad_norm": 1.0179448127746582, | |
| "learning_rate": 3.7243851128483497e-06, | |
| "loss": 0.759, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 2.9497730943773557, | |
| "grad_norm": 1.4116487503051758, | |
| "learning_rate": 3.4646650910318677e-06, | |
| "loss": 0.7773, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 2.953618952388278, | |
| "grad_norm": 0.6251114010810852, | |
| "learning_rate": 3.2049450692153856e-06, | |
| "loss": 0.7016, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 2.9574648103992, | |
| "grad_norm": 0.9810579419136047, | |
| "learning_rate": 2.945225047398904e-06, | |
| "loss": 0.6909, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 2.961310668410122, | |
| "grad_norm": 0.7243860363960266, | |
| "learning_rate": 2.6855050255824223e-06, | |
| "loss": 0.7305, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.961310668410122, | |
| "eval_loss": 0.7680486440658569, | |
| "eval_runtime": 17.9145, | |
| "eval_samples_per_second": 55.821, | |
| "eval_steps_per_second": 13.955, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.9651565264210444, | |
| "grad_norm": 0.7657055854797363, | |
| "learning_rate": 2.4257850037659406e-06, | |
| "loss": 0.7114, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 2.969002384431967, | |
| "grad_norm": 0.7305043339729309, | |
| "learning_rate": 2.1660649819494585e-06, | |
| "loss": 0.7135, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 2.972848242442889, | |
| "grad_norm": 0.7981142401695251, | |
| "learning_rate": 1.9063449601329769e-06, | |
| "loss": 0.7328, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 2.976694100453811, | |
| "grad_norm": 0.7305875420570374, | |
| "learning_rate": 1.6466249383164948e-06, | |
| "loss": 0.7103, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 2.9805399584647336, | |
| "grad_norm": 1.197097659111023, | |
| "learning_rate": 1.386904916500013e-06, | |
| "loss": 0.7148, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 2.9805399584647336, | |
| "eval_loss": 0.7678167819976807, | |
| "eval_runtime": 17.8827, | |
| "eval_samples_per_second": 55.92, | |
| "eval_steps_per_second": 13.98, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 2.9843858164756556, | |
| "grad_norm": 0.8533993363380432, | |
| "learning_rate": 1.127184894683531e-06, | |
| "loss": 0.753, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 2.988231674486578, | |
| "grad_norm": 0.7372131943702698, | |
| "learning_rate": 8.674648728670494e-07, | |
| "loss": 0.7082, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 2.9920775324975004, | |
| "grad_norm": 1.499084234237671, | |
| "learning_rate": 6.077448510505675e-07, | |
| "loss": 0.6937, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 2.9959233905084224, | |
| "grad_norm": 0.5895427465438843, | |
| "learning_rate": 3.4802482923408566e-07, | |
| "loss": 0.701, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 2.999769248519345, | |
| "grad_norm": 0.9201724529266357, | |
| "learning_rate": 8.830480741760382e-08, | |
| "loss": 0.7665, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.999769248519345, | |
| "eval_loss": 0.767805814743042, | |
| "eval_runtime": 17.9509, | |
| "eval_samples_per_second": 55.708, | |
| "eval_steps_per_second": 13.927, | |
| "step": 39000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 39003, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |