{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 8734, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002862837859599274, "grad_norm": 19.151697158813477, "learning_rate": 4.0000000000000003e-07, "loss": 516.7791, "step": 5 }, { "epoch": 0.0005725675719198549, "grad_norm": 16.986482620239258, "learning_rate": 9e-07, "loss": 116.6236, "step": 10 }, { "epoch": 0.0008588513578797823, "grad_norm": 11.296030044555664, "learning_rate": 1.4000000000000001e-06, "loss": 176.0494, "step": 15 }, { "epoch": 0.0011451351438397097, "grad_norm": 6.591270923614502, "learning_rate": 1.9e-06, "loss": 34.2581, "step": 20 }, { "epoch": 0.001431418929799637, "grad_norm": 3.7718400955200195, "learning_rate": 2.4000000000000003e-06, "loss": 97.8785, "step": 25 }, { "epoch": 0.0017177027157595647, "grad_norm": 2.6732568740844727, "learning_rate": 2.9e-06, "loss": 56.9053, "step": 30 }, { "epoch": 0.002003986501719492, "grad_norm": 1.7744698524475098, "learning_rate": 3.4000000000000005e-06, "loss": 112.3654, "step": 35 }, { "epoch": 0.0022902702876794194, "grad_norm": 1.9627138376235962, "learning_rate": 3.9e-06, "loss": 319.8193, "step": 40 }, { "epoch": 0.002576554073639347, "grad_norm": 3.067491054534912, "learning_rate": 4.4e-06, "loss": 271.8014, "step": 45 }, { "epoch": 0.002862837859599274, "grad_norm": 2.7929775714874268, "learning_rate": 4.9000000000000005e-06, "loss": 51.7764, "step": 50 }, { "epoch": 0.0031491216455592015, "grad_norm": 7.4539971351623535, "learning_rate": 5.4e-06, "loss": 86.3225, "step": 55 }, { "epoch": 0.0034354054315191293, "grad_norm": 6.343064308166504, "learning_rate": 5.9e-06, "loss": 55.7213, "step": 60 }, { "epoch": 0.0037216892174790567, "grad_norm": 12.474454879760742, "learning_rate": 6.4000000000000006e-06, "loss": 184.1954, "step": 65 }, { "epoch": 0.004007973003438984, "grad_norm": 12.984466552734375, "learning_rate": 6.900000000000001e-06, "loss": 169.4968, "step": 70 }, { "epoch": 0.0042942567893989115, "grad_norm": 7.857387542724609, "learning_rate": 7.4e-06, "loss": 261.5822, "step": 75 }, { "epoch": 0.004580540575358839, "grad_norm": 9.837823867797852, "learning_rate": 7.9e-06, "loss": 74.4514, "step": 80 }, { "epoch": 0.004866824361318766, "grad_norm": 8.058916091918945, "learning_rate": 8.400000000000001e-06, "loss": 124.711, "step": 85 }, { "epoch": 0.005153108147278694, "grad_norm": 9.69068431854248, "learning_rate": 8.9e-06, "loss": 119.5081, "step": 90 }, { "epoch": 0.005439391933238621, "grad_norm": 8.338385581970215, "learning_rate": 9.4e-06, "loss": 163.0948, "step": 95 }, { "epoch": 0.005725675719198548, "grad_norm": 12.555005073547363, "learning_rate": 9.900000000000002e-06, "loss": 223.8742, "step": 100 }, { "epoch": 0.006011959505158476, "grad_norm": 7.753326892852783, "learning_rate": 1.04e-05, "loss": 162.0142, "step": 105 }, { "epoch": 0.006298243291118403, "grad_norm": 7.663868427276611, "learning_rate": 1.09e-05, "loss": 44.9881, "step": 110 }, { "epoch": 0.0065845270770783304, "grad_norm": 7.713599681854248, "learning_rate": 1.1400000000000001e-05, "loss": 59.8376, "step": 115 }, { "epoch": 0.006870810863038259, "grad_norm": 6.707058906555176, "learning_rate": 1.19e-05, "loss": 147.0191, "step": 120 }, { "epoch": 0.007157094648998186, "grad_norm": 3.721087694168091, "learning_rate": 1.24e-05, "loss": 54.5157, "step": 125 }, { "epoch": 0.007443378434958113, "grad_norm": 5.96558952331543, "learning_rate": 1.29e-05, "loss": 114.7495, "step": 130 }, { "epoch": 0.007729662220918041, "grad_norm": 3.8625457286834717, "learning_rate": 1.3400000000000002e-05, "loss": 90.1746, "step": 135 }, { "epoch": 0.008015946006877968, "grad_norm": 6.123764991760254, "learning_rate": 1.3900000000000002e-05, "loss": 41.6194, "step": 140 }, { "epoch": 0.008302229792837896, "grad_norm": 5.954704284667969, "learning_rate": 1.44e-05, "loss": 151.4757, "step": 145 }, { "epoch": 0.008588513578797823, "grad_norm": 3.687113046646118, "learning_rate": 1.49e-05, "loss": 68.2852, "step": 150 }, { "epoch": 0.00887479736475775, "grad_norm": 3.733111619949341, "learning_rate": 1.54e-05, "loss": 44.1476, "step": 155 }, { "epoch": 0.03664261994732623, "grad_norm": 4.615285873413086, "learning_rate": 1.59e-05, "loss": 0.9683, "step": 160 }, { "epoch": 0.037787701820680177, "grad_norm": 7.621520519256592, "learning_rate": 1.6400000000000002e-05, "loss": 0.9631, "step": 165 }, { "epoch": 0.03893278369403412, "grad_norm": 12.825485229492188, "learning_rate": 1.69e-05, "loss": 0.9485, "step": 170 }, { "epoch": 0.04007786556738807, "grad_norm": 9.484743118286133, "learning_rate": 1.74e-05, "loss": 0.9404, "step": 175 }, { "epoch": 0.041222947440742015, "grad_norm": 5.518902778625488, "learning_rate": 1.79e-05, "loss": 0.9197, "step": 180 }, { "epoch": 0.04236802931409596, "grad_norm": 3.3146023750305176, "learning_rate": 1.84e-05, "loss": 0.9093, "step": 185 }, { "epoch": 0.0435131111874499, "grad_norm": 2.7806248664855957, "learning_rate": 1.8900000000000002e-05, "loss": 0.9032, "step": 190 }, { "epoch": 0.04465819306080385, "grad_norm": 1.950194239616394, "learning_rate": 1.94e-05, "loss": 0.8957, "step": 195 }, { "epoch": 0.04580327493415779, "grad_norm": 2.105093479156494, "learning_rate": 1.9900000000000003e-05, "loss": 0.8921, "step": 200 }, { "epoch": 0.046948356807511735, "grad_norm": 2.0000460147857666, "learning_rate": 2.04e-05, "loss": 0.8877, "step": 205 }, { "epoch": 0.04809343868086568, "grad_norm": 1.6276265382766724, "learning_rate": 2.09e-05, "loss": 0.8862, "step": 210 }, { "epoch": 0.04923852055421963, "grad_norm": 5.631216049194336, "learning_rate": 2.1400000000000002e-05, "loss": 0.8805, "step": 215 }, { "epoch": 0.05038360242757357, "grad_norm": 2.450659990310669, "learning_rate": 2.19e-05, "loss": 0.881, "step": 220 }, { "epoch": 0.05152868430092752, "grad_norm": 2.5550310611724854, "learning_rate": 2.2400000000000002e-05, "loss": 0.8737, "step": 225 }, { "epoch": 0.05267376617428146, "grad_norm": 2.168761730194092, "learning_rate": 2.29e-05, "loss": 0.8656, "step": 230 }, { "epoch": 0.053818848047635405, "grad_norm": 2.1524722576141357, "learning_rate": 2.3400000000000003e-05, "loss": 0.8583, "step": 235 }, { "epoch": 0.05496392992098935, "grad_norm": 2.131282091140747, "learning_rate": 2.39e-05, "loss": 0.8554, "step": 240 }, { "epoch": 0.05610901179434329, "grad_norm": 2.4136056900024414, "learning_rate": 2.44e-05, "loss": 0.8504, "step": 245 }, { "epoch": 0.057254093667697244, "grad_norm": 2.4667770862579346, "learning_rate": 2.4900000000000002e-05, "loss": 0.8459, "step": 250 }, { "epoch": 0.05839917554105119, "grad_norm": 1.7110178470611572, "learning_rate": 2.54e-05, "loss": 0.842, "step": 255 }, { "epoch": 0.05954425741440513, "grad_norm": 1.4909886121749878, "learning_rate": 2.5900000000000003e-05, "loss": 0.8408, "step": 260 }, { "epoch": 0.060689339287759075, "grad_norm": 1.4522689580917358, "learning_rate": 2.64e-05, "loss": 0.8371, "step": 265 }, { "epoch": 0.06183442116111302, "grad_norm": 1.8193100690841675, "learning_rate": 2.6900000000000003e-05, "loss": 0.831, "step": 270 }, { "epoch": 0.06297950303446696, "grad_norm": 1.8061556816101074, "learning_rate": 2.7400000000000002e-05, "loss": 0.8288, "step": 275 }, { "epoch": 0.06412458490782091, "grad_norm": 2.727900743484497, "learning_rate": 2.7900000000000004e-05, "loss": 0.829, "step": 280 }, { "epoch": 0.06526966678117485, "grad_norm": 2.1262223720550537, "learning_rate": 2.84e-05, "loss": 0.8282, "step": 285 }, { "epoch": 0.0664147486545288, "grad_norm": 1.948807716369629, "learning_rate": 2.8899999999999998e-05, "loss": 0.8253, "step": 290 }, { "epoch": 0.06755983052788274, "grad_norm": 2.3872830867767334, "learning_rate": 2.94e-05, "loss": 0.8235, "step": 295 }, { "epoch": 0.06870491240123669, "grad_norm": 1.0653102397918701, "learning_rate": 2.9900000000000002e-05, "loss": 0.8204, "step": 300 }, { "epoch": 0.06984999427459063, "grad_norm": 1.2086213827133179, "learning_rate": 3.04e-05, "loss": 0.8216, "step": 305 }, { "epoch": 0.07099507614794458, "grad_norm": 1.2478430271148682, "learning_rate": 3.09e-05, "loss": 0.8218, "step": 310 }, { "epoch": 0.07214015802129853, "grad_norm": 1.4686928987503052, "learning_rate": 3.1400000000000004e-05, "loss": 0.8175, "step": 315 }, { "epoch": 0.07328523989465247, "grad_norm": 1.691828966140747, "learning_rate": 3.19e-05, "loss": 0.8194, "step": 320 }, { "epoch": 0.07443032176800642, "grad_norm": 2.0422606468200684, "learning_rate": 3.24e-05, "loss": 0.8181, "step": 325 }, { "epoch": 0.07557540364136035, "grad_norm": 1.3948687314987183, "learning_rate": 3.29e-05, "loss": 0.8212, "step": 330 }, { "epoch": 0.0767204855147143, "grad_norm": 1.1443554162979126, "learning_rate": 3.3400000000000005e-05, "loss": 0.8169, "step": 335 }, { "epoch": 0.07786556738806824, "grad_norm": 1.0854233503341675, "learning_rate": 3.3900000000000004e-05, "loss": 0.8134, "step": 340 }, { "epoch": 0.07901064926142219, "grad_norm": 1.8612085580825806, "learning_rate": 3.4399999999999996e-05, "loss": 0.8133, "step": 345 }, { "epoch": 0.08015573113477614, "grad_norm": 0.5761221647262573, "learning_rate": 3.49e-05, "loss": 0.8101, "step": 350 }, { "epoch": 0.08130081300813008, "grad_norm": 0.7849775552749634, "learning_rate": 3.54e-05, "loss": 0.8096, "step": 355 }, { "epoch": 0.08244589488148403, "grad_norm": 0.47798988223075867, "learning_rate": 3.59e-05, "loss": 0.8089, "step": 360 }, { "epoch": 0.08359097675483797, "grad_norm": 0.9662396311759949, "learning_rate": 3.6400000000000004e-05, "loss": 0.8073, "step": 365 }, { "epoch": 0.08473605862819192, "grad_norm": 0.5217769742012024, "learning_rate": 3.69e-05, "loss": 0.8079, "step": 370 }, { "epoch": 0.08588114050154586, "grad_norm": 1.0022109746932983, "learning_rate": 3.74e-05, "loss": 0.8041, "step": 375 }, { "epoch": 0.0870262223748998, "grad_norm": 1.0508103370666504, "learning_rate": 3.79e-05, "loss": 0.8045, "step": 380 }, { "epoch": 0.08817130424825376, "grad_norm": 0.9267056584358215, "learning_rate": 3.8400000000000005e-05, "loss": 0.8071, "step": 385 }, { "epoch": 0.0893163861216077, "grad_norm": 1.0763800144195557, "learning_rate": 3.8900000000000004e-05, "loss": 0.8022, "step": 390 }, { "epoch": 0.09046146799496164, "grad_norm": 0.9646713733673096, "learning_rate": 3.94e-05, "loss": 0.8037, "step": 395 }, { "epoch": 0.09160654986831558, "grad_norm": 0.7930561900138855, "learning_rate": 3.99e-05, "loss": 0.8033, "step": 400 }, { "epoch": 0.09275163174166953, "grad_norm": 0.7772406339645386, "learning_rate": 4.0400000000000006e-05, "loss": 0.8014, "step": 405 }, { "epoch": 0.09389671361502347, "grad_norm": 0.7040679454803467, "learning_rate": 4.09e-05, "loss": 0.8016, "step": 410 }, { "epoch": 0.09504179548837742, "grad_norm": 0.6085620522499084, "learning_rate": 4.14e-05, "loss": 0.8009, "step": 415 }, { "epoch": 0.09618687736173136, "grad_norm": 0.9182414412498474, "learning_rate": 4.19e-05, "loss": 0.8023, "step": 420 }, { "epoch": 0.09733195923508531, "grad_norm": 0.9242359399795532, "learning_rate": 4.24e-05, "loss": 0.8014, "step": 425 }, { "epoch": 0.09847704110843926, "grad_norm": 1.0517164468765259, "learning_rate": 4.29e-05, "loss": 0.7987, "step": 430 }, { "epoch": 0.0996221229817932, "grad_norm": 0.9291065335273743, "learning_rate": 4.3400000000000005e-05, "loss": 0.8005, "step": 435 }, { "epoch": 0.10076720485514715, "grad_norm": 0.7035262584686279, "learning_rate": 4.39e-05, "loss": 0.7961, "step": 440 }, { "epoch": 0.10191228672850108, "grad_norm": 0.947799801826477, "learning_rate": 4.44e-05, "loss": 0.7991, "step": 445 }, { "epoch": 0.10305736860185503, "grad_norm": 1.345821738243103, "learning_rate": 4.49e-05, "loss": 0.7948, "step": 450 }, { "epoch": 0.10420245047520897, "grad_norm": 0.511234700679779, "learning_rate": 4.5400000000000006e-05, "loss": 0.7959, "step": 455 }, { "epoch": 0.10534753234856292, "grad_norm": 0.8376046419143677, "learning_rate": 4.5900000000000004e-05, "loss": 0.796, "step": 460 }, { "epoch": 0.10649261422191687, "grad_norm": 0.9656692147254944, "learning_rate": 4.64e-05, "loss": 0.7961, "step": 465 }, { "epoch": 0.10763769609527081, "grad_norm": 0.7987990379333496, "learning_rate": 4.69e-05, "loss": 0.794, "step": 470 }, { "epoch": 0.10878277796862476, "grad_norm": 0.9489181637763977, "learning_rate": 4.74e-05, "loss": 0.7928, "step": 475 }, { "epoch": 0.1099278598419787, "grad_norm": 1.0666885375976562, "learning_rate": 4.79e-05, "loss": 0.796, "step": 480 }, { "epoch": 0.11107294171533265, "grad_norm": 0.8832483887672424, "learning_rate": 4.8400000000000004e-05, "loss": 0.7921, "step": 485 }, { "epoch": 0.11221802358868659, "grad_norm": 0.5236397385597229, "learning_rate": 4.89e-05, "loss": 0.7967, "step": 490 }, { "epoch": 0.11336310546204054, "grad_norm": 0.6618067026138306, "learning_rate": 4.94e-05, "loss": 0.7922, "step": 495 }, { "epoch": 0.11450818733539449, "grad_norm": 0.5967982411384583, "learning_rate": 4.99e-05, "loss": 0.7907, "step": 500 }, { "epoch": 0.11565326920874842, "grad_norm": 0.9039576053619385, "learning_rate": 5.0400000000000005e-05, "loss": 0.7916, "step": 505 }, { "epoch": 0.11679835108210238, "grad_norm": 0.7755367159843445, "learning_rate": 5.0900000000000004e-05, "loss": 0.7916, "step": 510 }, { "epoch": 0.11794343295545631, "grad_norm": 0.6333810091018677, "learning_rate": 5.14e-05, "loss": 0.7917, "step": 515 }, { "epoch": 0.11908851482881026, "grad_norm": 0.5068004131317139, "learning_rate": 5.19e-05, "loss": 0.7936, "step": 520 }, { "epoch": 0.1202335967021642, "grad_norm": 0.8217163681983948, "learning_rate": 5.2400000000000007e-05, "loss": 0.7946, "step": 525 }, { "epoch": 0.12137867857551815, "grad_norm": 0.433635413646698, "learning_rate": 5.2900000000000005e-05, "loss": 0.7947, "step": 530 }, { "epoch": 0.12252376044887209, "grad_norm": 0.5382624864578247, "learning_rate": 5.3400000000000004e-05, "loss": 0.7937, "step": 535 }, { "epoch": 0.12366884232222604, "grad_norm": 0.43913567066192627, "learning_rate": 5.390000000000001e-05, "loss": 0.7929, "step": 540 }, { "epoch": 0.12481392419557999, "grad_norm": 0.6911773085594177, "learning_rate": 5.440000000000001e-05, "loss": 0.7916, "step": 545 }, { "epoch": 0.12595900606893393, "grad_norm": 0.4433671236038208, "learning_rate": 5.4900000000000006e-05, "loss": 0.7912, "step": 550 }, { "epoch": 0.12710408794228786, "grad_norm": 0.5388156175613403, "learning_rate": 5.5400000000000005e-05, "loss": 0.7867, "step": 555 }, { "epoch": 0.12824916981564183, "grad_norm": 0.6417338848114014, "learning_rate": 5.590000000000001e-05, "loss": 0.7916, "step": 560 }, { "epoch": 0.12939425168899577, "grad_norm": 0.7721633911132812, "learning_rate": 5.6399999999999995e-05, "loss": 0.7931, "step": 565 }, { "epoch": 0.1305393335623497, "grad_norm": 0.47057852149009705, "learning_rate": 5.69e-05, "loss": 0.7926, "step": 570 }, { "epoch": 0.13168441543570364, "grad_norm": 0.7483288645744324, "learning_rate": 5.74e-05, "loss": 0.7907, "step": 575 }, { "epoch": 0.1328294973090576, "grad_norm": 0.6444791555404663, "learning_rate": 5.79e-05, "loss": 0.7923, "step": 580 }, { "epoch": 0.13397457918241154, "grad_norm": 0.6855381727218628, "learning_rate": 5.8399999999999997e-05, "loss": 0.7898, "step": 585 }, { "epoch": 0.13511966105576548, "grad_norm": 0.4442330300807953, "learning_rate": 5.89e-05, "loss": 0.7905, "step": 590 }, { "epoch": 0.13626474292911944, "grad_norm": 0.6380087733268738, "learning_rate": 5.94e-05, "loss": 0.7908, "step": 595 }, { "epoch": 0.13740982480247338, "grad_norm": 0.6280926465988159, "learning_rate": 5.99e-05, "loss": 0.7912, "step": 600 }, { "epoch": 0.13855490667582732, "grad_norm": 0.3598725497722626, "learning_rate": 6.04e-05, "loss": 0.7891, "step": 605 }, { "epoch": 0.13969998854918125, "grad_norm": 0.5610477924346924, "learning_rate": 6.09e-05, "loss": 0.7904, "step": 610 }, { "epoch": 0.14084507042253522, "grad_norm": 0.5610749125480652, "learning_rate": 6.14e-05, "loss": 0.7889, "step": 615 }, { "epoch": 0.14199015229588915, "grad_norm": 0.5461752414703369, "learning_rate": 6.19e-05, "loss": 0.7901, "step": 620 }, { "epoch": 0.1431352341692431, "grad_norm": 0.4764970541000366, "learning_rate": 6.24e-05, "loss": 0.7878, "step": 625 }, { "epoch": 0.14428031604259706, "grad_norm": 0.40810781717300415, "learning_rate": 6.29e-05, "loss": 0.789, "step": 630 }, { "epoch": 0.145425397915951, "grad_norm": 0.818898618221283, "learning_rate": 6.340000000000001e-05, "loss": 0.7877, "step": 635 }, { "epoch": 0.14657047978930493, "grad_norm": 0.6349887847900391, "learning_rate": 6.390000000000001e-05, "loss": 0.7902, "step": 640 }, { "epoch": 0.14771556166265887, "grad_norm": 0.34319865703582764, "learning_rate": 6.440000000000001e-05, "loss": 0.7899, "step": 645 }, { "epoch": 0.14886064353601283, "grad_norm": 0.6926677823066711, "learning_rate": 6.49e-05, "loss": 0.7863, "step": 650 }, { "epoch": 0.15000572540936677, "grad_norm": 0.6303408145904541, "learning_rate": 6.54e-05, "loss": 0.7854, "step": 655 }, { "epoch": 0.1511508072827207, "grad_norm": 0.5530775189399719, "learning_rate": 6.59e-05, "loss": 0.7894, "step": 660 }, { "epoch": 0.15229588915607467, "grad_norm": 0.5497018098831177, "learning_rate": 6.64e-05, "loss": 0.7873, "step": 665 }, { "epoch": 0.1534409710294286, "grad_norm": 0.6323860883712769, "learning_rate": 6.690000000000001e-05, "loss": 0.7886, "step": 670 }, { "epoch": 0.15458605290278254, "grad_norm": 0.40749162435531616, "learning_rate": 6.740000000000001e-05, "loss": 0.7887, "step": 675 }, { "epoch": 0.15573113477613648, "grad_norm": 0.4536402225494385, "learning_rate": 6.790000000000001e-05, "loss": 0.7867, "step": 680 }, { "epoch": 0.15687621664949045, "grad_norm": 0.5276408195495605, "learning_rate": 6.840000000000001e-05, "loss": 0.7871, "step": 685 }, { "epoch": 0.15802129852284438, "grad_norm": 0.4989128112792969, "learning_rate": 6.89e-05, "loss": 0.7895, "step": 690 }, { "epoch": 0.15916638039619832, "grad_norm": 0.7024186253547668, "learning_rate": 6.939999999999999e-05, "loss": 0.7847, "step": 695 }, { "epoch": 0.16031146226955229, "grad_norm": 0.37541908025741577, "learning_rate": 6.99e-05, "loss": 0.7833, "step": 700 }, { "epoch": 0.16145654414290622, "grad_norm": 0.5957688689231873, "learning_rate": 7.04e-05, "loss": 0.7864, "step": 705 }, { "epoch": 0.16260162601626016, "grad_norm": 0.8745490908622742, "learning_rate": 7.09e-05, "loss": 0.7851, "step": 710 }, { "epoch": 0.1637467078896141, "grad_norm": 0.6420118808746338, "learning_rate": 7.14e-05, "loss": 0.7879, "step": 715 }, { "epoch": 0.16489178976296806, "grad_norm": 0.5130624175071716, "learning_rate": 7.19e-05, "loss": 0.7864, "step": 720 }, { "epoch": 0.166036871636322, "grad_norm": 0.3838486671447754, "learning_rate": 7.24e-05, "loss": 0.7884, "step": 725 }, { "epoch": 0.16718195350967593, "grad_norm": 0.34363555908203125, "learning_rate": 7.29e-05, "loss": 0.7812, "step": 730 }, { "epoch": 0.1683270353830299, "grad_norm": 0.4423808455467224, "learning_rate": 7.340000000000001e-05, "loss": 0.7881, "step": 735 }, { "epoch": 0.16947211725638384, "grad_norm": 0.4986163377761841, "learning_rate": 7.390000000000001e-05, "loss": 0.7877, "step": 740 }, { "epoch": 0.17061719912973777, "grad_norm": 0.651222288608551, "learning_rate": 7.44e-05, "loss": 0.7871, "step": 745 }, { "epoch": 0.1717622810030917, "grad_norm": 0.6739994883537292, "learning_rate": 7.49e-05, "loss": 0.7854, "step": 750 }, { "epoch": 0.17290736287644567, "grad_norm": 0.6694870591163635, "learning_rate": 7.54e-05, "loss": 0.7858, "step": 755 }, { "epoch": 0.1740524447497996, "grad_norm": 0.4669223725795746, "learning_rate": 7.59e-05, "loss": 0.791, "step": 760 }, { "epoch": 0.17519752662315355, "grad_norm": 0.4475642144680023, "learning_rate": 7.64e-05, "loss": 0.7862, "step": 765 }, { "epoch": 0.1763426084965075, "grad_norm": 0.41131269931793213, "learning_rate": 7.69e-05, "loss": 0.7846, "step": 770 }, { "epoch": 0.17748769036986145, "grad_norm": 0.46661683917045593, "learning_rate": 7.740000000000001e-05, "loss": 0.7849, "step": 775 }, { "epoch": 0.1786327722432154, "grad_norm": 0.6970285773277283, "learning_rate": 7.790000000000001e-05, "loss": 0.7856, "step": 780 }, { "epoch": 0.17977785411656932, "grad_norm": 0.4303015470504761, "learning_rate": 7.840000000000001e-05, "loss": 0.7848, "step": 785 }, { "epoch": 0.1809229359899233, "grad_norm": 0.6247970461845398, "learning_rate": 7.890000000000001e-05, "loss": 0.7871, "step": 790 }, { "epoch": 0.18206801786327723, "grad_norm": 0.6847257614135742, "learning_rate": 7.94e-05, "loss": 0.7898, "step": 795 }, { "epoch": 0.18321309973663116, "grad_norm": 0.5924108624458313, "learning_rate": 7.99e-05, "loss": 0.7809, "step": 800 }, { "epoch": 0.1843581816099851, "grad_norm": 0.44637331366539, "learning_rate": 8.04e-05, "loss": 0.7858, "step": 805 }, { "epoch": 0.18550326348333906, "grad_norm": 0.4461766183376312, "learning_rate": 8.090000000000001e-05, "loss": 0.7844, "step": 810 }, { "epoch": 0.186648345356693, "grad_norm": 0.6761696934700012, "learning_rate": 8.14e-05, "loss": 0.7885, "step": 815 }, { "epoch": 0.18779342723004694, "grad_norm": 0.4983709752559662, "learning_rate": 8.19e-05, "loss": 0.7838, "step": 820 }, { "epoch": 0.1889385091034009, "grad_norm": 0.4978698790073395, "learning_rate": 8.24e-05, "loss": 0.7891, "step": 825 }, { "epoch": 0.19008359097675484, "grad_norm": 0.7057047486305237, "learning_rate": 8.29e-05, "loss": 0.7847, "step": 830 }, { "epoch": 0.19122867285010878, "grad_norm": 0.4118855595588684, "learning_rate": 8.34e-05, "loss": 0.7838, "step": 835 }, { "epoch": 0.19237375472346271, "grad_norm": 0.7234144806861877, "learning_rate": 8.39e-05, "loss": 0.7865, "step": 840 }, { "epoch": 0.19351883659681668, "grad_norm": 0.3011474907398224, "learning_rate": 8.44e-05, "loss": 0.7842, "step": 845 }, { "epoch": 0.19466391847017062, "grad_norm": 0.4353237450122833, "learning_rate": 8.49e-05, "loss": 0.7856, "step": 850 }, { "epoch": 0.19580900034352455, "grad_norm": 0.3795527219772339, "learning_rate": 8.54e-05, "loss": 0.7849, "step": 855 }, { "epoch": 0.19695408221687852, "grad_norm": 0.5458865165710449, "learning_rate": 8.59e-05, "loss": 0.7878, "step": 860 }, { "epoch": 0.19809916409023245, "grad_norm": 0.4245418310165405, "learning_rate": 8.64e-05, "loss": 0.7842, "step": 865 }, { "epoch": 0.1992442459635864, "grad_norm": 0.5682182908058167, "learning_rate": 8.69e-05, "loss": 0.7836, "step": 870 }, { "epoch": 0.20038932783694033, "grad_norm": 0.27713704109191895, "learning_rate": 8.740000000000001e-05, "loss": 0.7856, "step": 875 }, { "epoch": 0.2015344097102943, "grad_norm": 0.48992612957954407, "learning_rate": 8.790000000000001e-05, "loss": 0.7806, "step": 880 }, { "epoch": 0.20267949158364823, "grad_norm": 0.7062950134277344, "learning_rate": 8.840000000000001e-05, "loss": 0.784, "step": 885 }, { "epoch": 0.20382457345700217, "grad_norm": 0.5864018797874451, "learning_rate": 8.89e-05, "loss": 0.7823, "step": 890 }, { "epoch": 0.20496965533035613, "grad_norm": 0.5565068125724792, "learning_rate": 8.94e-05, "loss": 0.7835, "step": 895 }, { "epoch": 0.20611473720371007, "grad_norm": 0.3518352806568146, "learning_rate": 8.99e-05, "loss": 0.7833, "step": 900 }, { "epoch": 0.207259819077064, "grad_norm": 0.31621456146240234, "learning_rate": 9.04e-05, "loss": 0.7837, "step": 905 }, { "epoch": 0.20840490095041794, "grad_norm": 0.4907382130622864, "learning_rate": 9.090000000000001e-05, "loss": 0.7854, "step": 910 }, { "epoch": 0.2095499828237719, "grad_norm": 0.3738730549812317, "learning_rate": 9.140000000000001e-05, "loss": 0.7816, "step": 915 }, { "epoch": 0.21069506469712584, "grad_norm": 0.589777946472168, "learning_rate": 9.190000000000001e-05, "loss": 0.7826, "step": 920 }, { "epoch": 0.21184014657047978, "grad_norm": 0.3376914858818054, "learning_rate": 9.240000000000001e-05, "loss": 0.7851, "step": 925 }, { "epoch": 0.21298522844383375, "grad_norm": 0.34315842390060425, "learning_rate": 9.290000000000001e-05, "loss": 0.7853, "step": 930 }, { "epoch": 0.21413031031718768, "grad_norm": 0.34507983922958374, "learning_rate": 9.340000000000001e-05, "loss": 0.7882, "step": 935 }, { "epoch": 0.21527539219054162, "grad_norm": 0.33713966608047485, "learning_rate": 9.39e-05, "loss": 0.7824, "step": 940 }, { "epoch": 0.21642047406389556, "grad_norm": 0.4617432653903961, "learning_rate": 9.44e-05, "loss": 0.7837, "step": 945 }, { "epoch": 0.21756555593724952, "grad_norm": 0.3514646589756012, "learning_rate": 9.49e-05, "loss": 0.7816, "step": 950 }, { "epoch": 0.21871063781060346, "grad_norm": 0.3907168209552765, "learning_rate": 9.54e-05, "loss": 0.781, "step": 955 }, { "epoch": 0.2198557196839574, "grad_norm": 0.4069294333457947, "learning_rate": 9.59e-05, "loss": 0.7819, "step": 960 }, { "epoch": 0.22100080155731136, "grad_norm": 0.4836900532245636, "learning_rate": 9.64e-05, "loss": 0.7826, "step": 965 }, { "epoch": 0.2221458834306653, "grad_norm": 0.387287437915802, "learning_rate": 9.69e-05, "loss": 0.7819, "step": 970 }, { "epoch": 0.22329096530401923, "grad_norm": 0.3079049289226532, "learning_rate": 9.74e-05, "loss": 0.7808, "step": 975 }, { "epoch": 0.22443604717737317, "grad_norm": 0.3615763187408447, "learning_rate": 9.790000000000001e-05, "loss": 0.7844, "step": 980 }, { "epoch": 0.22558112905072714, "grad_norm": 0.4044090509414673, "learning_rate": 9.84e-05, "loss": 0.7804, "step": 985 }, { "epoch": 0.22672621092408107, "grad_norm": 0.2885280251502991, "learning_rate": 9.89e-05, "loss": 0.7827, "step": 990 }, { "epoch": 0.227871292797435, "grad_norm": 0.41335442662239075, "learning_rate": 9.94e-05, "loss": 0.7798, "step": 995 }, { "epoch": 0.22901637467078897, "grad_norm": 0.35099464654922485, "learning_rate": 9.99e-05, "loss": 0.781, "step": 1000 }, { "epoch": 0.2301614565441429, "grad_norm": 0.3341158330440521, "learning_rate": 9.994828032066201e-05, "loss": 0.7832, "step": 1005 }, { "epoch": 0.23130653841749685, "grad_norm": 0.5810456275939941, "learning_rate": 9.988363072148953e-05, "loss": 0.7852, "step": 1010 }, { "epoch": 0.23245162029085079, "grad_norm": 0.487460732460022, "learning_rate": 9.981898112231704e-05, "loss": 0.7856, "step": 1015 }, { "epoch": 0.23359670216420475, "grad_norm": 0.6533329486846924, "learning_rate": 9.975433152314456e-05, "loss": 0.7842, "step": 1020 }, { "epoch": 0.2347417840375587, "grad_norm": 0.2424434870481491, "learning_rate": 9.968968192397208e-05, "loss": 0.7829, "step": 1025 }, { "epoch": 0.23588686591091262, "grad_norm": 0.4848748743534088, "learning_rate": 9.96250323247996e-05, "loss": 0.7802, "step": 1030 }, { "epoch": 0.23703194778426656, "grad_norm": 0.38208287954330444, "learning_rate": 9.956038272562711e-05, "loss": 0.7855, "step": 1035 }, { "epoch": 0.23817702965762053, "grad_norm": 0.5554670095443726, "learning_rate": 9.949573312645461e-05, "loss": 0.7829, "step": 1040 }, { "epoch": 0.23932211153097446, "grad_norm": 0.3737400770187378, "learning_rate": 9.943108352728213e-05, "loss": 0.7815, "step": 1045 }, { "epoch": 0.2404671934043284, "grad_norm": 0.287233829498291, "learning_rate": 9.936643392810964e-05, "loss": 0.7868, "step": 1050 }, { "epoch": 0.24161227527768236, "grad_norm": 0.2772092819213867, "learning_rate": 9.930178432893717e-05, "loss": 0.7774, "step": 1055 }, { "epoch": 0.2427573571510363, "grad_norm": 0.35240355134010315, "learning_rate": 9.923713472976468e-05, "loss": 0.781, "step": 1060 }, { "epoch": 0.24390243902439024, "grad_norm": 0.21047982573509216, "learning_rate": 9.91724851305922e-05, "loss": 0.7837, "step": 1065 }, { "epoch": 0.24504752089774418, "grad_norm": 0.37012019753456116, "learning_rate": 9.910783553141971e-05, "loss": 0.7809, "step": 1070 }, { "epoch": 0.24619260277109814, "grad_norm": 0.5411164164543152, "learning_rate": 9.904318593224723e-05, "loss": 0.7828, "step": 1075 }, { "epoch": 0.24733768464445208, "grad_norm": 0.5926540493965149, "learning_rate": 9.897853633307474e-05, "loss": 0.7856, "step": 1080 }, { "epoch": 0.24848276651780601, "grad_norm": 0.31834328174591064, "learning_rate": 9.891388673390225e-05, "loss": 0.7848, "step": 1085 }, { "epoch": 0.24962784839115998, "grad_norm": 0.9762104153633118, "learning_rate": 9.884923713472977e-05, "loss": 0.7852, "step": 1090 }, { "epoch": 0.2507729302645139, "grad_norm": 0.5515075325965881, "learning_rate": 9.878458753555728e-05, "loss": 0.7801, "step": 1095 }, { "epoch": 0.25191801213786785, "grad_norm": 0.5665395855903625, "learning_rate": 9.87199379363848e-05, "loss": 0.7808, "step": 1100 }, { "epoch": 0.2530630940112218, "grad_norm": 0.2942984104156494, "learning_rate": 9.865528833721231e-05, "loss": 0.78, "step": 1105 }, { "epoch": 0.2542081758845757, "grad_norm": 0.2915126085281372, "learning_rate": 9.859063873803983e-05, "loss": 0.7799, "step": 1110 }, { "epoch": 0.2553532577579297, "grad_norm": 0.39388832449913025, "learning_rate": 9.852598913886735e-05, "loss": 0.7801, "step": 1115 }, { "epoch": 0.25649833963128366, "grad_norm": 0.3185393810272217, "learning_rate": 9.846133953969487e-05, "loss": 0.7802, "step": 1120 }, { "epoch": 0.2576434215046376, "grad_norm": 0.4271395802497864, "learning_rate": 9.839668994052237e-05, "loss": 0.7819, "step": 1125 }, { "epoch": 0.25878850337799153, "grad_norm": 0.3662160336971283, "learning_rate": 9.833204034134988e-05, "loss": 0.7821, "step": 1130 }, { "epoch": 0.25993358525134547, "grad_norm": 0.38755324482917786, "learning_rate": 9.82673907421774e-05, "loss": 0.7803, "step": 1135 }, { "epoch": 0.2610786671246994, "grad_norm": 0.2804664671421051, "learning_rate": 9.820274114300491e-05, "loss": 0.778, "step": 1140 }, { "epoch": 0.26222374899805334, "grad_norm": 0.5177549719810486, "learning_rate": 9.813809154383244e-05, "loss": 0.7789, "step": 1145 }, { "epoch": 0.2633688308714073, "grad_norm": 0.3463110029697418, "learning_rate": 9.807344194465995e-05, "loss": 0.7794, "step": 1150 }, { "epoch": 0.26451391274476127, "grad_norm": 0.6302537322044373, "learning_rate": 9.800879234548747e-05, "loss": 0.7809, "step": 1155 }, { "epoch": 0.2656589946181152, "grad_norm": 0.3459393382072449, "learning_rate": 9.794414274631498e-05, "loss": 0.7789, "step": 1160 }, { "epoch": 0.26680407649146914, "grad_norm": 0.2957208454608917, "learning_rate": 9.787949314714248e-05, "loss": 0.7818, "step": 1165 }, { "epoch": 0.2679491583648231, "grad_norm": 0.3508155941963196, "learning_rate": 9.781484354797001e-05, "loss": 0.7812, "step": 1170 }, { "epoch": 0.269094240238177, "grad_norm": 0.21779291331768036, "learning_rate": 9.775019394879752e-05, "loss": 0.7805, "step": 1175 }, { "epoch": 0.27023932211153096, "grad_norm": 0.42935284972190857, "learning_rate": 9.768554434962504e-05, "loss": 0.7779, "step": 1180 }, { "epoch": 0.2713844039848849, "grad_norm": 0.37435927987098694, "learning_rate": 9.762089475045255e-05, "loss": 0.7796, "step": 1185 }, { "epoch": 0.2725294858582389, "grad_norm": 0.3577134311199188, "learning_rate": 9.755624515128007e-05, "loss": 0.7779, "step": 1190 }, { "epoch": 0.2736745677315928, "grad_norm": 0.36118873953819275, "learning_rate": 9.749159555210758e-05, "loss": 0.7775, "step": 1195 }, { "epoch": 0.27481964960494676, "grad_norm": 0.38497358560562134, "learning_rate": 9.74269459529351e-05, "loss": 0.776, "step": 1200 }, { "epoch": 0.2759647314783007, "grad_norm": 0.3275431990623474, "learning_rate": 9.736229635376262e-05, "loss": 0.7828, "step": 1205 }, { "epoch": 0.27710981335165463, "grad_norm": 0.3107474148273468, "learning_rate": 9.729764675459012e-05, "loss": 0.7777, "step": 1210 }, { "epoch": 0.27825489522500857, "grad_norm": 0.3608294129371643, "learning_rate": 9.723299715541764e-05, "loss": 0.7791, "step": 1215 }, { "epoch": 0.2793999770983625, "grad_norm": 0.3778846859931946, "learning_rate": 9.716834755624515e-05, "loss": 0.7844, "step": 1220 }, { "epoch": 0.2805450589717165, "grad_norm": 0.3609561622142792, "learning_rate": 9.710369795707267e-05, "loss": 0.7776, "step": 1225 }, { "epoch": 0.28169014084507044, "grad_norm": 0.2806609272956848, "learning_rate": 9.703904835790018e-05, "loss": 0.7797, "step": 1230 }, { "epoch": 0.2828352227184244, "grad_norm": 0.3381900191307068, "learning_rate": 9.697439875872771e-05, "loss": 0.7813, "step": 1235 }, { "epoch": 0.2839803045917783, "grad_norm": 0.25639185309410095, "learning_rate": 9.690974915955522e-05, "loss": 0.7802, "step": 1240 }, { "epoch": 0.28512538646513225, "grad_norm": 0.3533225357532501, "learning_rate": 9.684509956038274e-05, "loss": 0.7773, "step": 1245 }, { "epoch": 0.2862704683384862, "grad_norm": 0.4826221466064453, "learning_rate": 9.678044996121024e-05, "loss": 0.7789, "step": 1250 }, { "epoch": 0.2874155502118401, "grad_norm": 0.20925547182559967, "learning_rate": 9.671580036203775e-05, "loss": 0.7864, "step": 1255 }, { "epoch": 0.2885606320851941, "grad_norm": 0.3238098919391632, "learning_rate": 9.665115076286527e-05, "loss": 0.7782, "step": 1260 }, { "epoch": 0.28970571395854805, "grad_norm": 0.451593279838562, "learning_rate": 9.65865011636928e-05, "loss": 0.7817, "step": 1265 }, { "epoch": 0.290850795831902, "grad_norm": 0.3764170706272125, "learning_rate": 9.652185156452031e-05, "loss": 0.7807, "step": 1270 }, { "epoch": 0.2919958777052559, "grad_norm": 0.40236976742744446, "learning_rate": 9.645720196534782e-05, "loss": 0.7804, "step": 1275 }, { "epoch": 0.29314095957860986, "grad_norm": 0.4926491677761078, "learning_rate": 9.639255236617534e-05, "loss": 0.776, "step": 1280 }, { "epoch": 0.2942860414519638, "grad_norm": 0.4249913692474365, "learning_rate": 9.632790276700285e-05, "loss": 0.7793, "step": 1285 }, { "epoch": 0.29543112332531773, "grad_norm": 0.45594412088394165, "learning_rate": 9.626325316783036e-05, "loss": 0.7791, "step": 1290 }, { "epoch": 0.2965762051986717, "grad_norm": 0.2865663170814514, "learning_rate": 9.619860356865788e-05, "loss": 0.7801, "step": 1295 }, { "epoch": 0.29772128707202566, "grad_norm": 0.4077535569667816, "learning_rate": 9.613395396948539e-05, "loss": 0.7756, "step": 1300 }, { "epoch": 0.2988663689453796, "grad_norm": 0.43570780754089355, "learning_rate": 9.606930437031291e-05, "loss": 0.7796, "step": 1305 }, { "epoch": 0.30001145081873354, "grad_norm": 0.304936945438385, "learning_rate": 9.600465477114042e-05, "loss": 0.7785, "step": 1310 }, { "epoch": 0.3011565326920875, "grad_norm": 0.39299318194389343, "learning_rate": 9.594000517196794e-05, "loss": 0.7809, "step": 1315 }, { "epoch": 0.3023016145654414, "grad_norm": 0.4470611810684204, "learning_rate": 9.587535557279545e-05, "loss": 0.78, "step": 1320 }, { "epoch": 0.30344669643879535, "grad_norm": 0.9563407301902771, "learning_rate": 9.581070597362298e-05, "loss": 0.7793, "step": 1325 }, { "epoch": 0.30459177831214934, "grad_norm": 0.6523124575614929, "learning_rate": 9.574605637445049e-05, "loss": 0.7848, "step": 1330 }, { "epoch": 0.3057368601855033, "grad_norm": 0.4043123424053192, "learning_rate": 9.568140677527799e-05, "loss": 0.783, "step": 1335 }, { "epoch": 0.3068819420588572, "grad_norm": 0.2784610986709595, "learning_rate": 9.561675717610551e-05, "loss": 0.7784, "step": 1340 }, { "epoch": 0.30802702393221115, "grad_norm": 0.34219038486480713, "learning_rate": 9.555210757693302e-05, "loss": 0.7752, "step": 1345 }, { "epoch": 0.3091721058055651, "grad_norm": 0.25641149282455444, "learning_rate": 9.548745797776054e-05, "loss": 0.7767, "step": 1350 }, { "epoch": 0.310317187678919, "grad_norm": 0.2744545340538025, "learning_rate": 9.542280837858806e-05, "loss": 0.779, "step": 1355 }, { "epoch": 0.31146226955227296, "grad_norm": 0.3130733072757721, "learning_rate": 9.535815877941558e-05, "loss": 0.7789, "step": 1360 }, { "epoch": 0.31260735142562696, "grad_norm": 0.27104058861732483, "learning_rate": 9.529350918024309e-05, "loss": 0.7764, "step": 1365 }, { "epoch": 0.3137524332989809, "grad_norm": 0.33596792817115784, "learning_rate": 9.52288595810706e-05, "loss": 0.7742, "step": 1370 }, { "epoch": 0.31489751517233483, "grad_norm": 0.23721632361412048, "learning_rate": 9.51642099818981e-05, "loss": 0.776, "step": 1375 }, { "epoch": 0.31604259704568877, "grad_norm": 0.29659345746040344, "learning_rate": 9.509956038272563e-05, "loss": 0.7764, "step": 1380 }, { "epoch": 0.3171876789190427, "grad_norm": 0.27600282430648804, "learning_rate": 9.503491078355315e-05, "loss": 0.7756, "step": 1385 }, { "epoch": 0.31833276079239664, "grad_norm": 0.38308611512184143, "learning_rate": 9.497026118438066e-05, "loss": 0.7783, "step": 1390 }, { "epoch": 0.3194778426657506, "grad_norm": 0.22261874377727509, "learning_rate": 9.490561158520818e-05, "loss": 0.7762, "step": 1395 }, { "epoch": 0.32062292453910457, "grad_norm": 0.3498833179473877, "learning_rate": 9.484096198603569e-05, "loss": 0.7771, "step": 1400 }, { "epoch": 0.3217680064124585, "grad_norm": 0.33357107639312744, "learning_rate": 9.47763123868632e-05, "loss": 0.7781, "step": 1405 }, { "epoch": 0.32291308828581244, "grad_norm": 0.2920812666416168, "learning_rate": 9.471166278769072e-05, "loss": 0.776, "step": 1410 }, { "epoch": 0.3240581701591664, "grad_norm": 0.48737823963165283, "learning_rate": 9.464701318851825e-05, "loss": 0.7775, "step": 1415 }, { "epoch": 0.3252032520325203, "grad_norm": 0.3747755289077759, "learning_rate": 9.458236358934575e-05, "loss": 0.7749, "step": 1420 }, { "epoch": 0.32634833390587425, "grad_norm": 0.2682911455631256, "learning_rate": 9.451771399017326e-05, "loss": 0.7768, "step": 1425 }, { "epoch": 0.3274934157792282, "grad_norm": 0.21982483565807343, "learning_rate": 9.445306439100078e-05, "loss": 0.7795, "step": 1430 }, { "epoch": 0.3286384976525822, "grad_norm": 0.39821624755859375, "learning_rate": 9.438841479182829e-05, "loss": 0.7797, "step": 1435 }, { "epoch": 0.3297835795259361, "grad_norm": 0.3070865571498871, "learning_rate": 9.43237651926558e-05, "loss": 0.7743, "step": 1440 }, { "epoch": 0.33092866139929006, "grad_norm": 0.46173593401908875, "learning_rate": 9.425911559348333e-05, "loss": 0.7776, "step": 1445 }, { "epoch": 0.332073743272644, "grad_norm": 0.66759192943573, "learning_rate": 9.419446599431085e-05, "loss": 0.7743, "step": 1450 }, { "epoch": 0.33321882514599793, "grad_norm": 0.30952057242393494, "learning_rate": 9.412981639513836e-05, "loss": 0.7757, "step": 1455 }, { "epoch": 0.33436390701935187, "grad_norm": 0.433117538690567, "learning_rate": 9.406516679596586e-05, "loss": 0.7757, "step": 1460 }, { "epoch": 0.3355089888927058, "grad_norm": 0.4374425709247589, "learning_rate": 9.400051719679338e-05, "loss": 0.7765, "step": 1465 }, { "epoch": 0.3366540707660598, "grad_norm": 0.3277287781238556, "learning_rate": 9.393586759762089e-05, "loss": 0.7772, "step": 1470 }, { "epoch": 0.33779915263941374, "grad_norm": 0.3934798240661621, "learning_rate": 9.387121799844842e-05, "loss": 0.7723, "step": 1475 }, { "epoch": 0.3389442345127677, "grad_norm": 0.23350226879119873, "learning_rate": 9.380656839927593e-05, "loss": 0.7771, "step": 1480 }, { "epoch": 0.3400893163861216, "grad_norm": 0.3555000126361847, "learning_rate": 9.374191880010345e-05, "loss": 0.7791, "step": 1485 }, { "epoch": 0.34123439825947555, "grad_norm": 0.26215437054634094, "learning_rate": 9.367726920093096e-05, "loss": 0.7726, "step": 1490 }, { "epoch": 0.3423794801328295, "grad_norm": 0.3617369532585144, "learning_rate": 9.361261960175847e-05, "loss": 0.7724, "step": 1495 }, { "epoch": 0.3435245620061834, "grad_norm": 0.2677263617515564, "learning_rate": 9.354797000258599e-05, "loss": 0.7762, "step": 1500 }, { "epoch": 0.3446696438795374, "grad_norm": 0.38829365372657776, "learning_rate": 9.34833204034135e-05, "loss": 0.7782, "step": 1505 }, { "epoch": 0.34581472575289135, "grad_norm": 0.30776435136795044, "learning_rate": 9.341867080424102e-05, "loss": 0.7762, "step": 1510 }, { "epoch": 0.3469598076262453, "grad_norm": 0.4308106005191803, "learning_rate": 9.335402120506853e-05, "loss": 0.7716, "step": 1515 }, { "epoch": 0.3481048894995992, "grad_norm": 0.358279287815094, "learning_rate": 9.328937160589605e-05, "loss": 0.7774, "step": 1520 }, { "epoch": 0.34924997137295316, "grad_norm": 0.32716208696365356, "learning_rate": 9.322472200672356e-05, "loss": 0.7793, "step": 1525 }, { "epoch": 0.3503950532463071, "grad_norm": 0.35342416167259216, "learning_rate": 9.316007240755107e-05, "loss": 0.775, "step": 1530 }, { "epoch": 0.35154013511966103, "grad_norm": 0.4312511682510376, "learning_rate": 9.30954228083786e-05, "loss": 0.7707, "step": 1535 }, { "epoch": 0.352685216993015, "grad_norm": 0.38912200927734375, "learning_rate": 9.303077320920612e-05, "loss": 0.7769, "step": 1540 }, { "epoch": 0.35383029886636896, "grad_norm": 0.3942466974258423, "learning_rate": 9.296612361003362e-05, "loss": 0.7788, "step": 1545 }, { "epoch": 0.3549753807397229, "grad_norm": 0.28408968448638916, "learning_rate": 9.290147401086113e-05, "loss": 0.7762, "step": 1550 }, { "epoch": 0.35612046261307684, "grad_norm": 0.2877650260925293, "learning_rate": 9.283682441168865e-05, "loss": 0.776, "step": 1555 }, { "epoch": 0.3572655444864308, "grad_norm": 0.22159777581691742, "learning_rate": 9.277217481251616e-05, "loss": 0.7717, "step": 1560 }, { "epoch": 0.3584106263597847, "grad_norm": 0.2480212152004242, "learning_rate": 9.270752521334369e-05, "loss": 0.7755, "step": 1565 }, { "epoch": 0.35955570823313865, "grad_norm": 0.3367151916027069, "learning_rate": 9.26428756141712e-05, "loss": 0.7744, "step": 1570 }, { "epoch": 0.36070079010649264, "grad_norm": 0.38849833607673645, "learning_rate": 9.257822601499872e-05, "loss": 0.7741, "step": 1575 }, { "epoch": 0.3618458719798466, "grad_norm": 0.43399858474731445, "learning_rate": 9.251357641582623e-05, "loss": 0.7716, "step": 1580 }, { "epoch": 0.3629909538532005, "grad_norm": 0.4505549371242523, "learning_rate": 9.244892681665373e-05, "loss": 0.7744, "step": 1585 }, { "epoch": 0.36413603572655445, "grad_norm": 0.4170837104320526, "learning_rate": 9.238427721748126e-05, "loss": 0.7746, "step": 1590 }, { "epoch": 0.3652811175999084, "grad_norm": 0.3157612085342407, "learning_rate": 9.231962761830877e-05, "loss": 0.7772, "step": 1595 }, { "epoch": 0.3664261994732623, "grad_norm": 0.2665364742279053, "learning_rate": 9.225497801913629e-05, "loss": 0.7769, "step": 1600 }, { "epoch": 0.36757128134661626, "grad_norm": 0.32196542620658875, "learning_rate": 9.21903284199638e-05, "loss": 0.7757, "step": 1605 }, { "epoch": 0.3687163632199702, "grad_norm": 0.3406280279159546, "learning_rate": 9.212567882079132e-05, "loss": 0.777, "step": 1610 }, { "epoch": 0.3698614450933242, "grad_norm": 0.22886425256729126, "learning_rate": 9.206102922161883e-05, "loss": 0.7706, "step": 1615 }, { "epoch": 0.37100652696667813, "grad_norm": 0.3251831829547882, "learning_rate": 9.199637962244634e-05, "loss": 0.7775, "step": 1620 }, { "epoch": 0.37215160884003207, "grad_norm": 0.5250852704048157, "learning_rate": 9.193173002327387e-05, "loss": 0.7769, "step": 1625 }, { "epoch": 0.373296690713386, "grad_norm": 0.28842002153396606, "learning_rate": 9.186708042410137e-05, "loss": 0.7739, "step": 1630 }, { "epoch": 0.37444177258673994, "grad_norm": 0.35593077540397644, "learning_rate": 9.180243082492889e-05, "loss": 0.7739, "step": 1635 }, { "epoch": 0.3755868544600939, "grad_norm": 0.23034408688545227, "learning_rate": 9.17377812257564e-05, "loss": 0.7711, "step": 1640 }, { "epoch": 0.3767319363334478, "grad_norm": 0.4300076365470886, "learning_rate": 9.167313162658391e-05, "loss": 0.7723, "step": 1645 }, { "epoch": 0.3778770182068018, "grad_norm": 0.805209219455719, "learning_rate": 9.160848202741143e-05, "loss": 0.7742, "step": 1650 }, { "epoch": 0.37902210008015574, "grad_norm": 0.30290111899375916, "learning_rate": 9.154383242823896e-05, "loss": 0.7715, "step": 1655 }, { "epoch": 0.3801671819535097, "grad_norm": 0.3359617590904236, "learning_rate": 9.147918282906647e-05, "loss": 0.7764, "step": 1660 }, { "epoch": 0.3813122638268636, "grad_norm": 0.23703722655773163, "learning_rate": 9.141453322989399e-05, "loss": 0.7745, "step": 1665 }, { "epoch": 0.38245734570021755, "grad_norm": 0.4906832277774811, "learning_rate": 9.134988363072149e-05, "loss": 0.7743, "step": 1670 }, { "epoch": 0.3836024275735715, "grad_norm": 0.4271215498447418, "learning_rate": 9.1285234031549e-05, "loss": 0.7748, "step": 1675 }, { "epoch": 0.38474750944692543, "grad_norm": 0.3383249044418335, "learning_rate": 9.122058443237653e-05, "loss": 0.7763, "step": 1680 }, { "epoch": 0.3858925913202794, "grad_norm": 0.45870041847229004, "learning_rate": 9.115593483320404e-05, "loss": 0.7738, "step": 1685 }, { "epoch": 0.38703767319363336, "grad_norm": 0.3419291079044342, "learning_rate": 9.109128523403156e-05, "loss": 0.7727, "step": 1690 }, { "epoch": 0.3881827550669873, "grad_norm": 0.27392908930778503, "learning_rate": 9.102663563485907e-05, "loss": 0.7749, "step": 1695 }, { "epoch": 0.38932783694034123, "grad_norm": 0.36585626006126404, "learning_rate": 9.096198603568658e-05, "loss": 0.7731, "step": 1700 }, { "epoch": 0.39047291881369517, "grad_norm": 0.31979432702064514, "learning_rate": 9.08973364365141e-05, "loss": 0.7744, "step": 1705 }, { "epoch": 0.3916180006870491, "grad_norm": 0.6172612309455872, "learning_rate": 9.083268683734161e-05, "loss": 0.7718, "step": 1710 }, { "epoch": 0.39276308256040304, "grad_norm": 0.5099327564239502, "learning_rate": 9.076803723816913e-05, "loss": 0.7745, "step": 1715 }, { "epoch": 0.39390816443375704, "grad_norm": 0.2649715840816498, "learning_rate": 9.070338763899664e-05, "loss": 0.7723, "step": 1720 }, { "epoch": 0.395053246307111, "grad_norm": 0.3346767723560333, "learning_rate": 9.063873803982416e-05, "loss": 0.7754, "step": 1725 }, { "epoch": 0.3961983281804649, "grad_norm": 0.36208733916282654, "learning_rate": 9.057408844065167e-05, "loss": 0.7716, "step": 1730 }, { "epoch": 0.39734341005381885, "grad_norm": 0.5012935996055603, "learning_rate": 9.050943884147918e-05, "loss": 0.7741, "step": 1735 }, { "epoch": 0.3984884919271728, "grad_norm": 0.25386548042297363, "learning_rate": 9.04447892423067e-05, "loss": 0.7722, "step": 1740 }, { "epoch": 0.3996335738005267, "grad_norm": 0.5945486426353455, "learning_rate": 9.038013964313423e-05, "loss": 0.7735, "step": 1745 }, { "epoch": 0.40077865567388066, "grad_norm": 0.3223184049129486, "learning_rate": 9.031549004396174e-05, "loss": 0.7729, "step": 1750 }, { "epoch": 0.40192373754723465, "grad_norm": 0.30719512701034546, "learning_rate": 9.025084044478924e-05, "loss": 0.7776, "step": 1755 }, { "epoch": 0.4030688194205886, "grad_norm": 0.2536080777645111, "learning_rate": 9.018619084561676e-05, "loss": 0.7723, "step": 1760 }, { "epoch": 0.4042139012939425, "grad_norm": 0.31377112865448, "learning_rate": 9.012154124644427e-05, "loss": 0.7722, "step": 1765 }, { "epoch": 0.40535898316729646, "grad_norm": 0.5046892762184143, "learning_rate": 9.005689164727178e-05, "loss": 0.77, "step": 1770 }, { "epoch": 0.4065040650406504, "grad_norm": 0.4742714464664459, "learning_rate": 8.999224204809931e-05, "loss": 0.7738, "step": 1775 }, { "epoch": 0.40764914691400433, "grad_norm": 0.39765727519989014, "learning_rate": 8.992759244892683e-05, "loss": 0.7733, "step": 1780 }, { "epoch": 0.40879422878735827, "grad_norm": 0.28733646869659424, "learning_rate": 8.986294284975434e-05, "loss": 0.773, "step": 1785 }, { "epoch": 0.40993931066071226, "grad_norm": 0.38625991344451904, "learning_rate": 8.979829325058185e-05, "loss": 0.772, "step": 1790 }, { "epoch": 0.4110843925340662, "grad_norm": 0.23286648094654083, "learning_rate": 8.973364365140936e-05, "loss": 0.7687, "step": 1795 }, { "epoch": 0.41222947440742014, "grad_norm": 0.36121290922164917, "learning_rate": 8.966899405223688e-05, "loss": 0.7728, "step": 1800 }, { "epoch": 0.4133745562807741, "grad_norm": 0.4618297219276428, "learning_rate": 8.96043444530644e-05, "loss": 0.7719, "step": 1805 }, { "epoch": 0.414519638154128, "grad_norm": 0.3385610282421112, "learning_rate": 8.953969485389191e-05, "loss": 0.7791, "step": 1810 }, { "epoch": 0.41566472002748195, "grad_norm": 0.45700961351394653, "learning_rate": 8.947504525471943e-05, "loss": 0.7751, "step": 1815 }, { "epoch": 0.4168098019008359, "grad_norm": 0.2775494158267975, "learning_rate": 8.941039565554694e-05, "loss": 0.7705, "step": 1820 }, { "epoch": 0.4179548837741899, "grad_norm": 0.32360318303108215, "learning_rate": 8.934574605637445e-05, "loss": 0.7743, "step": 1825 }, { "epoch": 0.4190999656475438, "grad_norm": 0.2116226702928543, "learning_rate": 8.928109645720197e-05, "loss": 0.7702, "step": 1830 }, { "epoch": 0.42024504752089775, "grad_norm": 0.1969475895166397, "learning_rate": 8.92164468580295e-05, "loss": 0.7767, "step": 1835 }, { "epoch": 0.4213901293942517, "grad_norm": 0.2624218165874481, "learning_rate": 8.9151797258857e-05, "loss": 0.7718, "step": 1840 }, { "epoch": 0.4225352112676056, "grad_norm": 0.29143357276916504, "learning_rate": 8.908714765968451e-05, "loss": 0.7695, "step": 1845 }, { "epoch": 0.42368029314095956, "grad_norm": 0.3152911067008972, "learning_rate": 8.902249806051203e-05, "loss": 0.774, "step": 1850 }, { "epoch": 0.4248253750143135, "grad_norm": 0.4107597768306732, "learning_rate": 8.895784846133954e-05, "loss": 0.7754, "step": 1855 }, { "epoch": 0.4259704568876675, "grad_norm": 0.39836370944976807, "learning_rate": 8.889319886216705e-05, "loss": 0.7706, "step": 1860 }, { "epoch": 0.42711553876102143, "grad_norm": 0.44928601384162903, "learning_rate": 8.882854926299458e-05, "loss": 0.7735, "step": 1865 }, { "epoch": 0.42826062063437537, "grad_norm": 0.2496773898601532, "learning_rate": 8.87638996638221e-05, "loss": 0.7725, "step": 1870 }, { "epoch": 0.4294057025077293, "grad_norm": 0.4287888705730438, "learning_rate": 8.869925006464961e-05, "loss": 0.7733, "step": 1875 }, { "epoch": 0.43055078438108324, "grad_norm": 0.18999287486076355, "learning_rate": 8.863460046547711e-05, "loss": 0.7728, "step": 1880 }, { "epoch": 0.4316958662544372, "grad_norm": 0.4098029136657715, "learning_rate": 8.856995086630462e-05, "loss": 0.7731, "step": 1885 }, { "epoch": 0.4328409481277911, "grad_norm": 0.3076467216014862, "learning_rate": 8.850530126713215e-05, "loss": 0.7762, "step": 1890 }, { "epoch": 0.4339860300011451, "grad_norm": 0.3346705436706543, "learning_rate": 8.844065166795967e-05, "loss": 0.7738, "step": 1895 }, { "epoch": 0.43513111187449904, "grad_norm": 0.4677494764328003, "learning_rate": 8.837600206878718e-05, "loss": 0.7704, "step": 1900 }, { "epoch": 0.436276193747853, "grad_norm": 0.3557366132736206, "learning_rate": 8.83113524696147e-05, "loss": 0.7727, "step": 1905 }, { "epoch": 0.4374212756212069, "grad_norm": 0.29481977224349976, "learning_rate": 8.824670287044221e-05, "loss": 0.7718, "step": 1910 }, { "epoch": 0.43856635749456085, "grad_norm": 0.48542648553848267, "learning_rate": 8.818205327126972e-05, "loss": 0.7716, "step": 1915 }, { "epoch": 0.4397114393679148, "grad_norm": 0.29684439301490784, "learning_rate": 8.811740367209724e-05, "loss": 0.771, "step": 1920 }, { "epoch": 0.44085652124126873, "grad_norm": 0.2121712863445282, "learning_rate": 8.805275407292475e-05, "loss": 0.7709, "step": 1925 }, { "epoch": 0.4420016031146227, "grad_norm": 0.221273273229599, "learning_rate": 8.798810447375227e-05, "loss": 0.7705, "step": 1930 }, { "epoch": 0.44314668498797666, "grad_norm": 0.21169264614582062, "learning_rate": 8.792345487457978e-05, "loss": 0.7696, "step": 1935 }, { "epoch": 0.4442917668613306, "grad_norm": 0.33791452646255493, "learning_rate": 8.78588052754073e-05, "loss": 0.773, "step": 1940 }, { "epoch": 0.44543684873468453, "grad_norm": 0.2366342693567276, "learning_rate": 8.779415567623481e-05, "loss": 0.77, "step": 1945 }, { "epoch": 0.44658193060803847, "grad_norm": 0.3381491005420685, "learning_rate": 8.772950607706232e-05, "loss": 0.7736, "step": 1950 }, { "epoch": 0.4477270124813924, "grad_norm": 0.2792139947414398, "learning_rate": 8.766485647788985e-05, "loss": 0.7698, "step": 1955 }, { "epoch": 0.44887209435474634, "grad_norm": 0.2960784435272217, "learning_rate": 8.760020687871736e-05, "loss": 0.7697, "step": 1960 }, { "epoch": 0.45001717622810034, "grad_norm": 0.3192628026008606, "learning_rate": 8.753555727954487e-05, "loss": 0.7711, "step": 1965 }, { "epoch": 0.45116225810145427, "grad_norm": 0.33469468355178833, "learning_rate": 8.747090768037238e-05, "loss": 0.7679, "step": 1970 }, { "epoch": 0.4523073399748082, "grad_norm": 0.8246955871582031, "learning_rate": 8.74062580811999e-05, "loss": 0.7693, "step": 1975 }, { "epoch": 0.45345242184816215, "grad_norm": 0.4532000720500946, "learning_rate": 8.734160848202741e-05, "loss": 0.7736, "step": 1980 }, { "epoch": 0.4545975037215161, "grad_norm": 0.26574060320854187, "learning_rate": 8.727695888285494e-05, "loss": 0.7727, "step": 1985 }, { "epoch": 0.45574258559487, "grad_norm": 0.33447420597076416, "learning_rate": 8.721230928368245e-05, "loss": 0.772, "step": 1990 }, { "epoch": 0.45688766746822396, "grad_norm": 0.2761179804801941, "learning_rate": 8.714765968450996e-05, "loss": 0.773, "step": 1995 }, { "epoch": 0.45803274934157795, "grad_norm": 0.24412183463573456, "learning_rate": 8.708301008533748e-05, "loss": 0.7695, "step": 2000 }, { "epoch": 0.4591778312149319, "grad_norm": 0.20122234523296356, "learning_rate": 8.701836048616498e-05, "loss": 0.7721, "step": 2005 }, { "epoch": 0.4603229130882858, "grad_norm": 0.3519915044307709, "learning_rate": 8.695371088699251e-05, "loss": 0.7676, "step": 2010 }, { "epoch": 0.46146799496163976, "grad_norm": 0.34082722663879395, "learning_rate": 8.688906128782002e-05, "loss": 0.7682, "step": 2015 }, { "epoch": 0.4626130768349937, "grad_norm": 0.4101792573928833, "learning_rate": 8.682441168864754e-05, "loss": 0.772, "step": 2020 }, { "epoch": 0.46375815870834763, "grad_norm": 0.3329888880252838, "learning_rate": 8.675976208947505e-05, "loss": 0.77, "step": 2025 }, { "epoch": 0.46490324058170157, "grad_norm": 0.4946768283843994, "learning_rate": 8.669511249030256e-05, "loss": 0.7693, "step": 2030 }, { "epoch": 0.46604832245505556, "grad_norm": 0.26925185322761536, "learning_rate": 8.663046289113008e-05, "loss": 0.7708, "step": 2035 }, { "epoch": 0.4671934043284095, "grad_norm": 0.23793742060661316, "learning_rate": 8.656581329195759e-05, "loss": 0.7718, "step": 2040 }, { "epoch": 0.46833848620176344, "grad_norm": 0.1948954164981842, "learning_rate": 8.650116369278512e-05, "loss": 0.7702, "step": 2045 }, { "epoch": 0.4694835680751174, "grad_norm": 0.32897040247917175, "learning_rate": 8.643651409361262e-05, "loss": 0.7686, "step": 2050 }, { "epoch": 0.4706286499484713, "grad_norm": 0.2682906687259674, "learning_rate": 8.637186449444014e-05, "loss": 0.7716, "step": 2055 }, { "epoch": 0.47177373182182525, "grad_norm": 0.3296032249927521, "learning_rate": 8.630721489526765e-05, "loss": 0.7699, "step": 2060 }, { "epoch": 0.4729188136951792, "grad_norm": 0.32693296670913696, "learning_rate": 8.624256529609516e-05, "loss": 0.771, "step": 2065 }, { "epoch": 0.4740638955685331, "grad_norm": 0.36149802803993225, "learning_rate": 8.617791569692268e-05, "loss": 0.7709, "step": 2070 }, { "epoch": 0.4752089774418871, "grad_norm": 0.28551197052001953, "learning_rate": 8.61132660977502e-05, "loss": 0.7676, "step": 2075 }, { "epoch": 0.47635405931524105, "grad_norm": 0.19539989531040192, "learning_rate": 8.604861649857772e-05, "loss": 0.7724, "step": 2080 }, { "epoch": 0.477499141188595, "grad_norm": 0.28068146109580994, "learning_rate": 8.598396689940523e-05, "loss": 0.7717, "step": 2085 }, { "epoch": 0.4786442230619489, "grad_norm": 0.40226539969444275, "learning_rate": 8.591931730023273e-05, "loss": 0.7749, "step": 2090 }, { "epoch": 0.47978930493530286, "grad_norm": 0.3484259843826294, "learning_rate": 8.585466770106025e-05, "loss": 0.7701, "step": 2095 }, { "epoch": 0.4809343868086568, "grad_norm": 0.3907545208930969, "learning_rate": 8.579001810188778e-05, "loss": 0.7713, "step": 2100 }, { "epoch": 0.48207946868201074, "grad_norm": 0.25463706254959106, "learning_rate": 8.572536850271529e-05, "loss": 0.7706, "step": 2105 }, { "epoch": 0.48322455055536473, "grad_norm": 0.3591324985027313, "learning_rate": 8.56607189035428e-05, "loss": 0.7699, "step": 2110 }, { "epoch": 0.48436963242871867, "grad_norm": 0.38342562317848206, "learning_rate": 8.559606930437032e-05, "loss": 0.7679, "step": 2115 }, { "epoch": 0.4855147143020726, "grad_norm": 0.2707025408744812, "learning_rate": 8.553141970519783e-05, "loss": 0.7663, "step": 2120 }, { "epoch": 0.48665979617542654, "grad_norm": 0.27740100026130676, "learning_rate": 8.546677010602535e-05, "loss": 0.7702, "step": 2125 }, { "epoch": 0.4878048780487805, "grad_norm": 0.38322174549102783, "learning_rate": 8.540212050685286e-05, "loss": 0.771, "step": 2130 }, { "epoch": 0.4889499599221344, "grad_norm": 0.25964078307151794, "learning_rate": 8.533747090768038e-05, "loss": 0.77, "step": 2135 }, { "epoch": 0.49009504179548835, "grad_norm": 0.3346869647502899, "learning_rate": 8.527282130850789e-05, "loss": 0.7695, "step": 2140 }, { "epoch": 0.49124012366884234, "grad_norm": 0.2086460143327713, "learning_rate": 8.52081717093354e-05, "loss": 0.7703, "step": 2145 }, { "epoch": 0.4923852055421963, "grad_norm": 0.18562249839305878, "learning_rate": 8.514352211016292e-05, "loss": 0.7691, "step": 2150 }, { "epoch": 0.4935302874155502, "grad_norm": 0.27760863304138184, "learning_rate": 8.507887251099043e-05, "loss": 0.7736, "step": 2155 }, { "epoch": 0.49467536928890415, "grad_norm": 0.33482494950294495, "learning_rate": 8.501422291181795e-05, "loss": 0.7681, "step": 2160 }, { "epoch": 0.4958204511622581, "grad_norm": 0.2479313164949417, "learning_rate": 8.494957331264548e-05, "loss": 0.7714, "step": 2165 }, { "epoch": 0.49696553303561203, "grad_norm": 0.31260478496551514, "learning_rate": 8.488492371347299e-05, "loss": 0.7683, "step": 2170 }, { "epoch": 0.49811061490896597, "grad_norm": 0.25391116738319397, "learning_rate": 8.482027411430049e-05, "loss": 0.7666, "step": 2175 }, { "epoch": 0.49925569678231996, "grad_norm": 0.27383050322532654, "learning_rate": 8.4755624515128e-05, "loss": 0.7678, "step": 2180 }, { "epoch": 0.5004007786556739, "grad_norm": 0.4296813905239105, "learning_rate": 8.469097491595552e-05, "loss": 0.7699, "step": 2185 }, { "epoch": 0.5015458605290278, "grad_norm": 0.26294124126434326, "learning_rate": 8.462632531678305e-05, "loss": 0.7698, "step": 2190 }, { "epoch": 0.5026909424023818, "grad_norm": 0.34626004099845886, "learning_rate": 8.456167571761056e-05, "loss": 0.7709, "step": 2195 }, { "epoch": 0.5038360242757357, "grad_norm": 0.44097933173179626, "learning_rate": 8.449702611843807e-05, "loss": 0.7644, "step": 2200 }, { "epoch": 0.5049811061490896, "grad_norm": 0.2886388897895813, "learning_rate": 8.443237651926559e-05, "loss": 0.7667, "step": 2205 }, { "epoch": 0.5061261880224436, "grad_norm": 0.20610256493091583, "learning_rate": 8.43677269200931e-05, "loss": 0.7723, "step": 2210 }, { "epoch": 0.5072712698957975, "grad_norm": 0.17685441672801971, "learning_rate": 8.43030773209206e-05, "loss": 0.769, "step": 2215 }, { "epoch": 0.5084163517691515, "grad_norm": 0.2608635723590851, "learning_rate": 8.423842772174813e-05, "loss": 0.7676, "step": 2220 }, { "epoch": 0.5095614336425054, "grad_norm": 0.19678382575511932, "learning_rate": 8.417377812257565e-05, "loss": 0.7656, "step": 2225 }, { "epoch": 0.5107065155158594, "grad_norm": 0.2524316906929016, "learning_rate": 8.410912852340316e-05, "loss": 0.7693, "step": 2230 }, { "epoch": 0.5118515973892134, "grad_norm": 0.1823377162218094, "learning_rate": 8.404447892423067e-05, "loss": 0.7675, "step": 2235 }, { "epoch": 0.5129966792625673, "grad_norm": 0.3295440375804901, "learning_rate": 8.397982932505819e-05, "loss": 0.7702, "step": 2240 }, { "epoch": 0.5141417611359212, "grad_norm": 0.2945457398891449, "learning_rate": 8.39151797258857e-05, "loss": 0.7735, "step": 2245 }, { "epoch": 0.5152868430092752, "grad_norm": 0.2396729290485382, "learning_rate": 8.385053012671322e-05, "loss": 0.7695, "step": 2250 }, { "epoch": 0.5164319248826291, "grad_norm": 0.4043848514556885, "learning_rate": 8.378588052754074e-05, "loss": 0.7689, "step": 2255 }, { "epoch": 0.5175770067559831, "grad_norm": 0.4132694602012634, "learning_rate": 8.372123092836825e-05, "loss": 0.7709, "step": 2260 }, { "epoch": 0.518722088629337, "grad_norm": 0.3979206681251526, "learning_rate": 8.365658132919576e-05, "loss": 0.7683, "step": 2265 }, { "epoch": 0.5198671705026909, "grad_norm": 0.23621103167533875, "learning_rate": 8.359193173002327e-05, "loss": 0.7675, "step": 2270 }, { "epoch": 0.5210122523760449, "grad_norm": 0.22756105661392212, "learning_rate": 8.352728213085079e-05, "loss": 0.7729, "step": 2275 }, { "epoch": 0.5221573342493988, "grad_norm": 0.2720366418361664, "learning_rate": 8.34626325316783e-05, "loss": 0.7705, "step": 2280 }, { "epoch": 0.5233024161227527, "grad_norm": 0.34250500798225403, "learning_rate": 8.339798293250583e-05, "loss": 0.7696, "step": 2285 }, { "epoch": 0.5244474979961067, "grad_norm": 0.38787877559661865, "learning_rate": 8.333333333333334e-05, "loss": 0.7664, "step": 2290 }, { "epoch": 0.5255925798694606, "grad_norm": 0.23861998319625854, "learning_rate": 8.326868373416086e-05, "loss": 0.7715, "step": 2295 }, { "epoch": 0.5267376617428146, "grad_norm": 0.3382185995578766, "learning_rate": 8.320403413498836e-05, "loss": 0.7689, "step": 2300 }, { "epoch": 0.5278827436161686, "grad_norm": 0.29420629143714905, "learning_rate": 8.313938453581587e-05, "loss": 0.7649, "step": 2305 }, { "epoch": 0.5290278254895225, "grad_norm": 0.26455843448638916, "learning_rate": 8.30747349366434e-05, "loss": 0.7704, "step": 2310 }, { "epoch": 0.5301729073628765, "grad_norm": 0.3979088366031647, "learning_rate": 8.301008533747092e-05, "loss": 0.7679, "step": 2315 }, { "epoch": 0.5313179892362304, "grad_norm": 0.24187666177749634, "learning_rate": 8.294543573829843e-05, "loss": 0.7648, "step": 2320 }, { "epoch": 0.5324630711095844, "grad_norm": 0.21819165349006653, "learning_rate": 8.288078613912594e-05, "loss": 0.7666, "step": 2325 }, { "epoch": 0.5336081529829383, "grad_norm": 0.44462597370147705, "learning_rate": 8.281613653995346e-05, "loss": 0.7653, "step": 2330 }, { "epoch": 0.5347532348562922, "grad_norm": 0.2796482443809509, "learning_rate": 8.275148694078097e-05, "loss": 0.7733, "step": 2335 }, { "epoch": 0.5358983167296462, "grad_norm": 0.28863418102264404, "learning_rate": 8.268683734160849e-05, "loss": 0.7672, "step": 2340 }, { "epoch": 0.5370433986030001, "grad_norm": 0.38829681277275085, "learning_rate": 8.2622187742436e-05, "loss": 0.7717, "step": 2345 }, { "epoch": 0.538188480476354, "grad_norm": 0.26142099499702454, "learning_rate": 8.255753814326351e-05, "loss": 0.7703, "step": 2350 }, { "epoch": 0.539333562349708, "grad_norm": 0.32155346870422363, "learning_rate": 8.249288854409103e-05, "loss": 0.77, "step": 2355 }, { "epoch": 0.5404786442230619, "grad_norm": 0.25289323925971985, "learning_rate": 8.242823894491854e-05, "loss": 0.7712, "step": 2360 }, { "epoch": 0.5416237260964158, "grad_norm": 0.4167221188545227, "learning_rate": 8.236358934574606e-05, "loss": 0.7683, "step": 2365 }, { "epoch": 0.5427688079697698, "grad_norm": 0.3492532968521118, "learning_rate": 8.229893974657357e-05, "loss": 0.7678, "step": 2370 }, { "epoch": 0.5439138898431238, "grad_norm": 0.27711644768714905, "learning_rate": 8.22342901474011e-05, "loss": 0.7659, "step": 2375 }, { "epoch": 0.5450589717164778, "grad_norm": 0.2010398656129837, "learning_rate": 8.216964054822861e-05, "loss": 0.768, "step": 2380 }, { "epoch": 0.5462040535898317, "grad_norm": 0.33034494519233704, "learning_rate": 8.210499094905611e-05, "loss": 0.7723, "step": 2385 }, { "epoch": 0.5473491354631856, "grad_norm": 0.251040518283844, "learning_rate": 8.204034134988363e-05, "loss": 0.7636, "step": 2390 }, { "epoch": 0.5484942173365396, "grad_norm": 0.20830363035202026, "learning_rate": 8.197569175071114e-05, "loss": 0.7698, "step": 2395 }, { "epoch": 0.5496392992098935, "grad_norm": 0.21820737421512604, "learning_rate": 8.191104215153867e-05, "loss": 0.7663, "step": 2400 }, { "epoch": 0.5507843810832475, "grad_norm": 0.24281048774719238, "learning_rate": 8.184639255236618e-05, "loss": 0.7667, "step": 2405 }, { "epoch": 0.5519294629566014, "grad_norm": 0.2047927975654602, "learning_rate": 8.17817429531937e-05, "loss": 0.7688, "step": 2410 }, { "epoch": 0.5530745448299553, "grad_norm": 0.3286665081977844, "learning_rate": 8.171709335402121e-05, "loss": 0.7657, "step": 2415 }, { "epoch": 0.5542196267033093, "grad_norm": 0.234399676322937, "learning_rate": 8.165244375484873e-05, "loss": 0.7672, "step": 2420 }, { "epoch": 0.5553647085766632, "grad_norm": 0.27283400297164917, "learning_rate": 8.158779415567623e-05, "loss": 0.767, "step": 2425 }, { "epoch": 0.5565097904500171, "grad_norm": 0.2366446703672409, "learning_rate": 8.152314455650376e-05, "loss": 0.7654, "step": 2430 }, { "epoch": 0.5576548723233711, "grad_norm": 0.37923479080200195, "learning_rate": 8.145849495733127e-05, "loss": 0.7687, "step": 2435 }, { "epoch": 0.558799954196725, "grad_norm": 0.2888481020927429, "learning_rate": 8.139384535815878e-05, "loss": 0.7667, "step": 2440 }, { "epoch": 0.5599450360700791, "grad_norm": 0.2126360982656479, "learning_rate": 8.13291957589863e-05, "loss": 0.7707, "step": 2445 }, { "epoch": 0.561090117943433, "grad_norm": 0.13646523654460907, "learning_rate": 8.126454615981381e-05, "loss": 0.7697, "step": 2450 }, { "epoch": 0.5622351998167869, "grad_norm": 0.18953712284564972, "learning_rate": 8.119989656064133e-05, "loss": 0.7601, "step": 2455 }, { "epoch": 0.5633802816901409, "grad_norm": 0.661413311958313, "learning_rate": 8.113524696146884e-05, "loss": 0.7687, "step": 2460 }, { "epoch": 0.5645253635634948, "grad_norm": 0.39190343022346497, "learning_rate": 8.107059736229636e-05, "loss": 0.7708, "step": 2465 }, { "epoch": 0.5656704454368487, "grad_norm": 0.1970270425081253, "learning_rate": 8.100594776312387e-05, "loss": 0.7708, "step": 2470 }, { "epoch": 0.5668155273102027, "grad_norm": 0.3270746171474457, "learning_rate": 8.094129816395138e-05, "loss": 0.7685, "step": 2475 }, { "epoch": 0.5679606091835566, "grad_norm": 0.19802868366241455, "learning_rate": 8.08766485647789e-05, "loss": 0.7671, "step": 2480 }, { "epoch": 0.5691056910569106, "grad_norm": 0.2073948085308075, "learning_rate": 8.081199896560641e-05, "loss": 0.7704, "step": 2485 }, { "epoch": 0.5702507729302645, "grad_norm": 0.292240172624588, "learning_rate": 8.074734936643393e-05, "loss": 0.768, "step": 2490 }, { "epoch": 0.5713958548036184, "grad_norm": 0.2657301723957062, "learning_rate": 8.068269976726145e-05, "loss": 0.7699, "step": 2495 }, { "epoch": 0.5725409366769724, "grad_norm": 0.30449584126472473, "learning_rate": 8.061805016808897e-05, "loss": 0.7678, "step": 2500 }, { "epoch": 0.5736860185503263, "grad_norm": 0.3515103757381439, "learning_rate": 8.055340056891648e-05, "loss": 0.7662, "step": 2505 }, { "epoch": 0.5748311004236802, "grad_norm": 0.4272383153438568, "learning_rate": 8.048875096974398e-05, "loss": 0.7664, "step": 2510 }, { "epoch": 0.5759761822970343, "grad_norm": 0.2849995791912079, "learning_rate": 8.04241013705715e-05, "loss": 0.7648, "step": 2515 }, { "epoch": 0.5771212641703882, "grad_norm": 0.26403772830963135, "learning_rate": 8.035945177139903e-05, "loss": 0.7634, "step": 2520 }, { "epoch": 0.5782663460437422, "grad_norm": 0.28480133414268494, "learning_rate": 8.029480217222654e-05, "loss": 0.7665, "step": 2525 }, { "epoch": 0.5794114279170961, "grad_norm": 0.3533156216144562, "learning_rate": 8.023015257305405e-05, "loss": 0.7692, "step": 2530 }, { "epoch": 0.58055650979045, "grad_norm": 0.35114097595214844, "learning_rate": 8.016550297388157e-05, "loss": 0.7657, "step": 2535 }, { "epoch": 0.581701591663804, "grad_norm": 0.3216884434223175, "learning_rate": 8.010085337470908e-05, "loss": 0.7667, "step": 2540 }, { "epoch": 0.5828466735371579, "grad_norm": 0.2756839990615845, "learning_rate": 8.00362037755366e-05, "loss": 0.7683, "step": 2545 }, { "epoch": 0.5839917554105118, "grad_norm": 0.24576738476753235, "learning_rate": 7.997155417636411e-05, "loss": 0.7654, "step": 2550 }, { "epoch": 0.5851368372838658, "grad_norm": 0.25817248225212097, "learning_rate": 7.990690457719162e-05, "loss": 0.7616, "step": 2555 }, { "epoch": 0.5862819191572197, "grad_norm": 0.3255434036254883, "learning_rate": 7.984225497801914e-05, "loss": 0.7684, "step": 2560 }, { "epoch": 0.5874270010305737, "grad_norm": 0.3148951232433319, "learning_rate": 7.977760537884665e-05, "loss": 0.7661, "step": 2565 }, { "epoch": 0.5885720829039276, "grad_norm": 0.3079426884651184, "learning_rate": 7.971295577967417e-05, "loss": 0.7623, "step": 2570 }, { "epoch": 0.5897171647772815, "grad_norm": 0.39346790313720703, "learning_rate": 7.964830618050168e-05, "loss": 0.7656, "step": 2575 }, { "epoch": 0.5908622466506355, "grad_norm": 0.22193998098373413, "learning_rate": 7.95836565813292e-05, "loss": 0.7695, "step": 2580 }, { "epoch": 0.5920073285239895, "grad_norm": 0.25602176785469055, "learning_rate": 7.951900698215672e-05, "loss": 0.7639, "step": 2585 }, { "epoch": 0.5931524103973435, "grad_norm": 0.3182067275047302, "learning_rate": 7.945435738298424e-05, "loss": 0.7667, "step": 2590 }, { "epoch": 0.5942974922706974, "grad_norm": 0.2605539858341217, "learning_rate": 7.938970778381174e-05, "loss": 0.7668, "step": 2595 }, { "epoch": 0.5954425741440513, "grad_norm": 0.39896488189697266, "learning_rate": 7.932505818463925e-05, "loss": 0.767, "step": 2600 }, { "epoch": 0.5965876560174053, "grad_norm": 0.32872942090034485, "learning_rate": 7.926040858546677e-05, "loss": 0.7638, "step": 2605 }, { "epoch": 0.5977327378907592, "grad_norm": 0.3192463219165802, "learning_rate": 7.91957589862943e-05, "loss": 0.7708, "step": 2610 }, { "epoch": 0.5988778197641131, "grad_norm": 0.4065992534160614, "learning_rate": 7.913110938712181e-05, "loss": 0.7669, "step": 2615 }, { "epoch": 0.6000229016374671, "grad_norm": 0.24374541640281677, "learning_rate": 7.906645978794932e-05, "loss": 0.7668, "step": 2620 }, { "epoch": 0.601167983510821, "grad_norm": 0.1961938738822937, "learning_rate": 7.900181018877684e-05, "loss": 0.7628, "step": 2625 }, { "epoch": 0.602313065384175, "grad_norm": 0.32636597752571106, "learning_rate": 7.893716058960435e-05, "loss": 0.7671, "step": 2630 }, { "epoch": 0.6034581472575289, "grad_norm": 0.5156142711639404, "learning_rate": 7.887251099043185e-05, "loss": 0.7643, "step": 2635 }, { "epoch": 0.6046032291308828, "grad_norm": 0.4189930856227875, "learning_rate": 7.880786139125938e-05, "loss": 0.7699, "step": 2640 }, { "epoch": 0.6057483110042368, "grad_norm": 0.26843327283859253, "learning_rate": 7.87432117920869e-05, "loss": 0.7655, "step": 2645 }, { "epoch": 0.6068933928775907, "grad_norm": 0.2084437906742096, "learning_rate": 7.867856219291441e-05, "loss": 0.7642, "step": 2650 }, { "epoch": 0.6080384747509447, "grad_norm": 0.23720037937164307, "learning_rate": 7.861391259374192e-05, "loss": 0.7652, "step": 2655 }, { "epoch": 0.6091835566242987, "grad_norm": 0.2192964255809784, "learning_rate": 7.854926299456944e-05, "loss": 0.7658, "step": 2660 }, { "epoch": 0.6103286384976526, "grad_norm": 0.23451314866542816, "learning_rate": 7.848461339539695e-05, "loss": 0.7622, "step": 2665 }, { "epoch": 0.6114737203710066, "grad_norm": 0.2981168031692505, "learning_rate": 7.841996379622447e-05, "loss": 0.7648, "step": 2670 }, { "epoch": 0.6126188022443605, "grad_norm": 0.2158365696668625, "learning_rate": 7.835531419705198e-05, "loss": 0.7635, "step": 2675 }, { "epoch": 0.6137638841177144, "grad_norm": 0.20274029672145844, "learning_rate": 7.82906645978795e-05, "loss": 0.7642, "step": 2680 }, { "epoch": 0.6149089659910684, "grad_norm": 0.35747575759887695, "learning_rate": 7.822601499870701e-05, "loss": 0.7652, "step": 2685 }, { "epoch": 0.6160540478644223, "grad_norm": 0.23654219508171082, "learning_rate": 7.816136539953452e-05, "loss": 0.7636, "step": 2690 }, { "epoch": 0.6171991297377762, "grad_norm": 0.22896522283554077, "learning_rate": 7.809671580036204e-05, "loss": 0.7629, "step": 2695 }, { "epoch": 0.6183442116111302, "grad_norm": 0.2906895577907562, "learning_rate": 7.803206620118956e-05, "loss": 0.762, "step": 2700 }, { "epoch": 0.6194892934844841, "grad_norm": 0.1701405942440033, "learning_rate": 7.796741660201708e-05, "loss": 0.7646, "step": 2705 }, { "epoch": 0.620634375357838, "grad_norm": 0.44711700081825256, "learning_rate": 7.790276700284459e-05, "loss": 0.7684, "step": 2710 }, { "epoch": 0.621779457231192, "grad_norm": 0.2111397385597229, "learning_rate": 7.783811740367211e-05, "loss": 0.7655, "step": 2715 }, { "epoch": 0.6229245391045459, "grad_norm": 0.23328475654125214, "learning_rate": 7.777346780449961e-05, "loss": 0.7666, "step": 2720 }, { "epoch": 0.6240696209779, "grad_norm": 0.2624497711658478, "learning_rate": 7.770881820532712e-05, "loss": 0.7604, "step": 2725 }, { "epoch": 0.6252147028512539, "grad_norm": 0.3210087716579437, "learning_rate": 7.764416860615465e-05, "loss": 0.7615, "step": 2730 }, { "epoch": 0.6263597847246078, "grad_norm": 0.21020106971263885, "learning_rate": 7.757951900698216e-05, "loss": 0.7629, "step": 2735 }, { "epoch": 0.6275048665979618, "grad_norm": 0.23509849607944489, "learning_rate": 7.751486940780968e-05, "loss": 0.7668, "step": 2740 }, { "epoch": 0.6286499484713157, "grad_norm": 0.42521828413009644, "learning_rate": 7.745021980863719e-05, "loss": 0.7662, "step": 2745 }, { "epoch": 0.6297950303446697, "grad_norm": 0.2406284660100937, "learning_rate": 7.73855702094647e-05, "loss": 0.7631, "step": 2750 }, { "epoch": 0.6309401122180236, "grad_norm": 0.25343960523605347, "learning_rate": 7.732092061029222e-05, "loss": 0.7645, "step": 2755 }, { "epoch": 0.6320851940913775, "grad_norm": 0.2309281826019287, "learning_rate": 7.725627101111973e-05, "loss": 0.7609, "step": 2760 }, { "epoch": 0.6332302759647315, "grad_norm": 0.34525343775749207, "learning_rate": 7.719162141194725e-05, "loss": 0.7657, "step": 2765 }, { "epoch": 0.6343753578380854, "grad_norm": 0.42397892475128174, "learning_rate": 7.712697181277476e-05, "loss": 0.7618, "step": 2770 }, { "epoch": 0.6355204397114393, "grad_norm": 0.3545205295085907, "learning_rate": 7.706232221360228e-05, "loss": 0.7632, "step": 2775 }, { "epoch": 0.6366655215847933, "grad_norm": 0.2760995626449585, "learning_rate": 7.699767261442979e-05, "loss": 0.762, "step": 2780 }, { "epoch": 0.6378106034581472, "grad_norm": 0.252374529838562, "learning_rate": 7.69330230152573e-05, "loss": 0.7627, "step": 2785 }, { "epoch": 0.6389556853315012, "grad_norm": 0.24971815943717957, "learning_rate": 7.686837341608482e-05, "loss": 0.7647, "step": 2790 }, { "epoch": 0.6401007672048551, "grad_norm": 0.39055272936820984, "learning_rate": 7.680372381691235e-05, "loss": 0.7648, "step": 2795 }, { "epoch": 0.6412458490782091, "grad_norm": 0.5789263844490051, "learning_rate": 7.673907421773986e-05, "loss": 0.7631, "step": 2800 }, { "epoch": 0.6423909309515631, "grad_norm": 0.34116217494010925, "learning_rate": 7.667442461856736e-05, "loss": 0.7627, "step": 2805 }, { "epoch": 0.643536012824917, "grad_norm": 0.17528630793094635, "learning_rate": 7.660977501939488e-05, "loss": 0.7662, "step": 2810 }, { "epoch": 0.644681094698271, "grad_norm": 0.34103506803512573, "learning_rate": 7.654512542022239e-05, "loss": 0.7675, "step": 2815 }, { "epoch": 0.6458261765716249, "grad_norm": 0.38298407196998596, "learning_rate": 7.648047582104992e-05, "loss": 0.7608, "step": 2820 }, { "epoch": 0.6469712584449788, "grad_norm": 0.34761834144592285, "learning_rate": 7.641582622187743e-05, "loss": 0.7671, "step": 2825 }, { "epoch": 0.6481163403183328, "grad_norm": 0.21851806342601776, "learning_rate": 7.635117662270495e-05, "loss": 0.7595, "step": 2830 }, { "epoch": 0.6492614221916867, "grad_norm": 0.19289985299110413, "learning_rate": 7.628652702353246e-05, "loss": 0.7606, "step": 2835 }, { "epoch": 0.6504065040650406, "grad_norm": 0.17358209192752838, "learning_rate": 7.622187742435998e-05, "loss": 0.7599, "step": 2840 }, { "epoch": 0.6515515859383946, "grad_norm": 0.3359616696834564, "learning_rate": 7.615722782518748e-05, "loss": 0.7665, "step": 2845 }, { "epoch": 0.6526966678117485, "grad_norm": 0.4290224313735962, "learning_rate": 7.6092578226015e-05, "loss": 0.7643, "step": 2850 }, { "epoch": 0.6538417496851024, "grad_norm": 0.24706508219242096, "learning_rate": 7.602792862684252e-05, "loss": 0.7652, "step": 2855 }, { "epoch": 0.6549868315584564, "grad_norm": 0.23143522441387177, "learning_rate": 7.596327902767003e-05, "loss": 0.7618, "step": 2860 }, { "epoch": 0.6561319134318103, "grad_norm": 0.2622555196285248, "learning_rate": 7.589862942849755e-05, "loss": 0.7599, "step": 2865 }, { "epoch": 0.6572769953051644, "grad_norm": 0.30878838896751404, "learning_rate": 7.583397982932506e-05, "loss": 0.7608, "step": 2870 }, { "epoch": 0.6584220771785183, "grad_norm": 0.2817178964614868, "learning_rate": 7.576933023015258e-05, "loss": 0.764, "step": 2875 }, { "epoch": 0.6595671590518722, "grad_norm": 0.23512059450149536, "learning_rate": 7.570468063098009e-05, "loss": 0.7662, "step": 2880 }, { "epoch": 0.6607122409252262, "grad_norm": 0.2094719558954239, "learning_rate": 7.56400310318076e-05, "loss": 0.7669, "step": 2885 }, { "epoch": 0.6618573227985801, "grad_norm": 0.26047632098197937, "learning_rate": 7.557538143263512e-05, "loss": 0.7613, "step": 2890 }, { "epoch": 0.663002404671934, "grad_norm": 0.2346109300851822, "learning_rate": 7.551073183346263e-05, "loss": 0.7627, "step": 2895 }, { "epoch": 0.664147486545288, "grad_norm": 0.2940877676010132, "learning_rate": 7.544608223429015e-05, "loss": 0.7625, "step": 2900 }, { "epoch": 0.6652925684186419, "grad_norm": 0.5073947906494141, "learning_rate": 7.538143263511766e-05, "loss": 0.761, "step": 2905 }, { "epoch": 0.6664376502919959, "grad_norm": 0.28100213408470154, "learning_rate": 7.531678303594519e-05, "loss": 0.7677, "step": 2910 }, { "epoch": 0.6675827321653498, "grad_norm": 0.18574713170528412, "learning_rate": 7.52521334367727e-05, "loss": 0.7636, "step": 2915 }, { "epoch": 0.6687278140387037, "grad_norm": 0.2935812473297119, "learning_rate": 7.518748383760022e-05, "loss": 0.7658, "step": 2920 }, { "epoch": 0.6698728959120577, "grad_norm": 0.1969953179359436, "learning_rate": 7.512283423842773e-05, "loss": 0.7634, "step": 2925 }, { "epoch": 0.6710179777854116, "grad_norm": 0.17374487221240997, "learning_rate": 7.505818463925523e-05, "loss": 0.7593, "step": 2930 }, { "epoch": 0.6721630596587655, "grad_norm": 0.28035348653793335, "learning_rate": 7.499353504008275e-05, "loss": 0.7637, "step": 2935 }, { "epoch": 0.6733081415321196, "grad_norm": 0.23470938205718994, "learning_rate": 7.492888544091027e-05, "loss": 0.7615, "step": 2940 }, { "epoch": 0.6744532234054735, "grad_norm": 0.27259859442710876, "learning_rate": 7.486423584173779e-05, "loss": 0.7582, "step": 2945 }, { "epoch": 0.6755983052788275, "grad_norm": 0.3640976846218109, "learning_rate": 7.47995862425653e-05, "loss": 0.7662, "step": 2950 }, { "epoch": 0.6767433871521814, "grad_norm": 0.1931801736354828, "learning_rate": 7.473493664339282e-05, "loss": 0.759, "step": 2955 }, { "epoch": 0.6778884690255353, "grad_norm": 0.47389793395996094, "learning_rate": 7.467028704422033e-05, "loss": 0.7636, "step": 2960 }, { "epoch": 0.6790335508988893, "grad_norm": 0.34375831484794617, "learning_rate": 7.460563744504785e-05, "loss": 0.7625, "step": 2965 }, { "epoch": 0.6801786327722432, "grad_norm": 0.19540467858314514, "learning_rate": 7.454098784587536e-05, "loss": 0.759, "step": 2970 }, { "epoch": 0.6813237146455972, "grad_norm": 0.33635178208351135, "learning_rate": 7.447633824670287e-05, "loss": 0.7596, "step": 2975 }, { "epoch": 0.6824687965189511, "grad_norm": 0.2803053557872772, "learning_rate": 7.441168864753039e-05, "loss": 0.7613, "step": 2980 }, { "epoch": 0.683613878392305, "grad_norm": 0.21707181632518768, "learning_rate": 7.43470390483579e-05, "loss": 0.7584, "step": 2985 }, { "epoch": 0.684758960265659, "grad_norm": 0.25607773661613464, "learning_rate": 7.428238944918542e-05, "loss": 0.7617, "step": 2990 }, { "epoch": 0.6859040421390129, "grad_norm": 0.3066530227661133, "learning_rate": 7.421773985001293e-05, "loss": 0.7619, "step": 2995 }, { "epoch": 0.6870491240123668, "grad_norm": 0.2540838122367859, "learning_rate": 7.415309025084044e-05, "loss": 0.7616, "step": 3000 }, { "epoch": 0.6881942058857208, "grad_norm": 0.2732105553150177, "learning_rate": 7.408844065166797e-05, "loss": 0.7638, "step": 3005 }, { "epoch": 0.6893392877590748, "grad_norm": 0.23296260833740234, "learning_rate": 7.402379105249547e-05, "loss": 0.7639, "step": 3010 }, { "epoch": 0.6904843696324288, "grad_norm": 0.2918176054954529, "learning_rate": 7.395914145332299e-05, "loss": 0.7631, "step": 3015 }, { "epoch": 0.6916294515057827, "grad_norm": 0.20227240025997162, "learning_rate": 7.38944918541505e-05, "loss": 0.7624, "step": 3020 }, { "epoch": 0.6927745333791366, "grad_norm": 0.2289716601371765, "learning_rate": 7.382984225497802e-05, "loss": 0.76, "step": 3025 }, { "epoch": 0.6939196152524906, "grad_norm": 0.4011261761188507, "learning_rate": 7.376519265580554e-05, "loss": 0.7639, "step": 3030 }, { "epoch": 0.6950646971258445, "grad_norm": 0.16025088727474213, "learning_rate": 7.370054305663306e-05, "loss": 0.7602, "step": 3035 }, { "epoch": 0.6962097789991984, "grad_norm": 0.21587808430194855, "learning_rate": 7.363589345746057e-05, "loss": 0.7647, "step": 3040 }, { "epoch": 0.6973548608725524, "grad_norm": 0.2196759283542633, "learning_rate": 7.357124385828809e-05, "loss": 0.7657, "step": 3045 }, { "epoch": 0.6984999427459063, "grad_norm": 0.34964117407798767, "learning_rate": 7.35065942591156e-05, "loss": 0.7582, "step": 3050 }, { "epoch": 0.6996450246192603, "grad_norm": 0.34376025199890137, "learning_rate": 7.34419446599431e-05, "loss": 0.7627, "step": 3055 }, { "epoch": 0.7007901064926142, "grad_norm": 0.26398709416389465, "learning_rate": 7.337729506077063e-05, "loss": 0.7624, "step": 3060 }, { "epoch": 0.7019351883659681, "grad_norm": 0.24627701938152313, "learning_rate": 7.331264546159814e-05, "loss": 0.7619, "step": 3065 }, { "epoch": 0.7030802702393221, "grad_norm": 0.3965471386909485, "learning_rate": 7.324799586242566e-05, "loss": 0.7635, "step": 3070 }, { "epoch": 0.704225352112676, "grad_norm": 0.2562287747859955, "learning_rate": 7.318334626325317e-05, "loss": 0.7624, "step": 3075 }, { "epoch": 0.70537043398603, "grad_norm": 0.20183491706848145, "learning_rate": 7.311869666408069e-05, "loss": 0.7617, "step": 3080 }, { "epoch": 0.706515515859384, "grad_norm": 0.23248891532421112, "learning_rate": 7.30540470649082e-05, "loss": 0.7627, "step": 3085 }, { "epoch": 0.7076605977327379, "grad_norm": 0.4082355797290802, "learning_rate": 7.298939746573571e-05, "loss": 0.7649, "step": 3090 }, { "epoch": 0.7088056796060919, "grad_norm": 0.3321089744567871, "learning_rate": 7.292474786656323e-05, "loss": 0.7615, "step": 3095 }, { "epoch": 0.7099507614794458, "grad_norm": 0.3166889250278473, "learning_rate": 7.286009826739074e-05, "loss": 0.7639, "step": 3100 }, { "epoch": 0.7110958433527997, "grad_norm": 0.3378044068813324, "learning_rate": 7.279544866821826e-05, "loss": 0.7617, "step": 3105 }, { "epoch": 0.7122409252261537, "grad_norm": 0.24754200875759125, "learning_rate": 7.273079906904577e-05, "loss": 0.7603, "step": 3110 }, { "epoch": 0.7133860070995076, "grad_norm": 0.20655082166194916, "learning_rate": 7.266614946987329e-05, "loss": 0.7652, "step": 3115 }, { "epoch": 0.7145310889728615, "grad_norm": 0.1733807623386383, "learning_rate": 7.260149987070081e-05, "loss": 0.7638, "step": 3120 }, { "epoch": 0.7156761708462155, "grad_norm": 0.20328563451766968, "learning_rate": 7.253685027152833e-05, "loss": 0.7632, "step": 3125 }, { "epoch": 0.7168212527195694, "grad_norm": 0.28387951850891113, "learning_rate": 7.247220067235584e-05, "loss": 0.7607, "step": 3130 }, { "epoch": 0.7179663345929234, "grad_norm": 0.21539588272571564, "learning_rate": 7.240755107318336e-05, "loss": 0.7624, "step": 3135 }, { "epoch": 0.7191114164662773, "grad_norm": 0.20758353173732758, "learning_rate": 7.234290147401086e-05, "loss": 0.7636, "step": 3140 }, { "epoch": 0.7202564983396312, "grad_norm": 0.2486894130706787, "learning_rate": 7.227825187483837e-05, "loss": 0.766, "step": 3145 }, { "epoch": 0.7214015802129853, "grad_norm": 0.2833589017391205, "learning_rate": 7.22136022756659e-05, "loss": 0.7606, "step": 3150 }, { "epoch": 0.7225466620863392, "grad_norm": 0.2237931787967682, "learning_rate": 7.214895267649341e-05, "loss": 0.7631, "step": 3155 }, { "epoch": 0.7236917439596932, "grad_norm": 0.4406035244464874, "learning_rate": 7.208430307732093e-05, "loss": 0.7629, "step": 3160 }, { "epoch": 0.7248368258330471, "grad_norm": 0.32733210921287537, "learning_rate": 7.201965347814844e-05, "loss": 0.7581, "step": 3165 }, { "epoch": 0.725981907706401, "grad_norm": 0.1655232459306717, "learning_rate": 7.195500387897596e-05, "loss": 0.7621, "step": 3170 }, { "epoch": 0.727126989579755, "grad_norm": 0.17573896050453186, "learning_rate": 7.189035427980347e-05, "loss": 0.7606, "step": 3175 }, { "epoch": 0.7282720714531089, "grad_norm": 0.364986777305603, "learning_rate": 7.182570468063098e-05, "loss": 0.7621, "step": 3180 }, { "epoch": 0.7294171533264628, "grad_norm": 0.3647826611995697, "learning_rate": 7.17610550814585e-05, "loss": 0.7611, "step": 3185 }, { "epoch": 0.7305622351998168, "grad_norm": 0.23403432965278625, "learning_rate": 7.169640548228601e-05, "loss": 0.759, "step": 3190 }, { "epoch": 0.7317073170731707, "grad_norm": 0.33898410201072693, "learning_rate": 7.163175588311353e-05, "loss": 0.7621, "step": 3195 }, { "epoch": 0.7328523989465247, "grad_norm": 0.23586343228816986, "learning_rate": 7.156710628394104e-05, "loss": 0.7608, "step": 3200 }, { "epoch": 0.7339974808198786, "grad_norm": 0.2086014598608017, "learning_rate": 7.150245668476855e-05, "loss": 0.7611, "step": 3205 }, { "epoch": 0.7351425626932325, "grad_norm": 0.23983906209468842, "learning_rate": 7.143780708559607e-05, "loss": 0.7606, "step": 3210 }, { "epoch": 0.7362876445665865, "grad_norm": 0.19357101619243622, "learning_rate": 7.13731574864236e-05, "loss": 0.7623, "step": 3215 }, { "epoch": 0.7374327264399404, "grad_norm": 0.24940311908721924, "learning_rate": 7.13085078872511e-05, "loss": 0.7607, "step": 3220 }, { "epoch": 0.7385778083132944, "grad_norm": 0.19958077371120453, "learning_rate": 7.124385828807861e-05, "loss": 0.7639, "step": 3225 }, { "epoch": 0.7397228901866484, "grad_norm": 0.1807526797056198, "learning_rate": 7.117920868890613e-05, "loss": 0.7597, "step": 3230 }, { "epoch": 0.7408679720600023, "grad_norm": 0.3189755082130432, "learning_rate": 7.111455908973364e-05, "loss": 0.7584, "step": 3235 }, { "epoch": 0.7420130539333563, "grad_norm": 0.2481878697872162, "learning_rate": 7.104990949056117e-05, "loss": 0.7593, "step": 3240 }, { "epoch": 0.7431581358067102, "grad_norm": 0.2721337676048279, "learning_rate": 7.098525989138868e-05, "loss": 0.7606, "step": 3245 }, { "epoch": 0.7443032176800641, "grad_norm": 0.23429130017757416, "learning_rate": 7.09206102922162e-05, "loss": 0.7587, "step": 3250 }, { "epoch": 0.7454482995534181, "grad_norm": 0.30629605054855347, "learning_rate": 7.085596069304371e-05, "loss": 0.7631, "step": 3255 }, { "epoch": 0.746593381426772, "grad_norm": 0.3316456973552704, "learning_rate": 7.079131109387122e-05, "loss": 0.759, "step": 3260 }, { "epoch": 0.7477384633001259, "grad_norm": 0.41559523344039917, "learning_rate": 7.072666149469873e-05, "loss": 0.7592, "step": 3265 }, { "epoch": 0.7488835451734799, "grad_norm": 0.28401699662208557, "learning_rate": 7.066201189552625e-05, "loss": 0.7615, "step": 3270 }, { "epoch": 0.7500286270468338, "grad_norm": 0.25793886184692383, "learning_rate": 7.059736229635377e-05, "loss": 0.763, "step": 3275 }, { "epoch": 0.7511737089201878, "grad_norm": 0.19388380646705627, "learning_rate": 7.053271269718128e-05, "loss": 0.7578, "step": 3280 }, { "epoch": 0.7523187907935417, "grad_norm": 0.2703215479850769, "learning_rate": 7.04680630980088e-05, "loss": 0.7602, "step": 3285 }, { "epoch": 0.7534638726668956, "grad_norm": 0.21396110951900482, "learning_rate": 7.040341349883631e-05, "loss": 0.7593, "step": 3290 }, { "epoch": 0.7546089545402497, "grad_norm": 0.2268245965242386, "learning_rate": 7.033876389966382e-05, "loss": 0.7567, "step": 3295 }, { "epoch": 0.7557540364136036, "grad_norm": 0.17969952523708344, "learning_rate": 7.027411430049134e-05, "loss": 0.7612, "step": 3300 }, { "epoch": 0.7568991182869576, "grad_norm": 0.27984392642974854, "learning_rate": 7.020946470131885e-05, "loss": 0.7593, "step": 3305 }, { "epoch": 0.7580442001603115, "grad_norm": 0.22135871648788452, "learning_rate": 7.014481510214637e-05, "loss": 0.759, "step": 3310 }, { "epoch": 0.7591892820336654, "grad_norm": 0.26906076073646545, "learning_rate": 7.008016550297388e-05, "loss": 0.7618, "step": 3315 }, { "epoch": 0.7603343639070194, "grad_norm": 0.26624733209609985, "learning_rate": 7.00155159038014e-05, "loss": 0.7609, "step": 3320 }, { "epoch": 0.7614794457803733, "grad_norm": 0.4906958043575287, "learning_rate": 6.995086630462891e-05, "loss": 0.7602, "step": 3325 }, { "epoch": 0.7626245276537272, "grad_norm": 0.3430747985839844, "learning_rate": 6.988621670545644e-05, "loss": 0.7596, "step": 3330 }, { "epoch": 0.7637696095270812, "grad_norm": 0.2605244219303131, "learning_rate": 6.982156710628395e-05, "loss": 0.762, "step": 3335 }, { "epoch": 0.7649146914004351, "grad_norm": 0.21872663497924805, "learning_rate": 6.975691750711147e-05, "loss": 0.7603, "step": 3340 }, { "epoch": 0.766059773273789, "grad_norm": 0.3047766089439392, "learning_rate": 6.969226790793898e-05, "loss": 0.7592, "step": 3345 }, { "epoch": 0.767204855147143, "grad_norm": 0.317523717880249, "learning_rate": 6.962761830876648e-05, "loss": 0.7595, "step": 3350 }, { "epoch": 0.7683499370204969, "grad_norm": 0.19987279176712036, "learning_rate": 6.9562968709594e-05, "loss": 0.7557, "step": 3355 }, { "epoch": 0.7694950188938509, "grad_norm": 0.17979387938976288, "learning_rate": 6.949831911042152e-05, "loss": 0.7642, "step": 3360 }, { "epoch": 0.7706401007672049, "grad_norm": 0.18759754300117493, "learning_rate": 6.943366951124904e-05, "loss": 0.76, "step": 3365 }, { "epoch": 0.7717851826405588, "grad_norm": 0.25630974769592285, "learning_rate": 6.936901991207655e-05, "loss": 0.7558, "step": 3370 }, { "epoch": 0.7729302645139128, "grad_norm": 0.19636793434619904, "learning_rate": 6.930437031290407e-05, "loss": 0.7592, "step": 3375 }, { "epoch": 0.7740753463872667, "grad_norm": 0.1815565526485443, "learning_rate": 6.923972071373158e-05, "loss": 0.7553, "step": 3380 }, { "epoch": 0.7752204282606207, "grad_norm": 0.28360649943351746, "learning_rate": 6.91750711145591e-05, "loss": 0.7593, "step": 3385 }, { "epoch": 0.7763655101339746, "grad_norm": 0.2512878477573395, "learning_rate": 6.911042151538661e-05, "loss": 0.7566, "step": 3390 }, { "epoch": 0.7775105920073285, "grad_norm": 0.23916715383529663, "learning_rate": 6.904577191621412e-05, "loss": 0.7604, "step": 3395 }, { "epoch": 0.7786556738806825, "grad_norm": 0.2430179864168167, "learning_rate": 6.898112231704164e-05, "loss": 0.7618, "step": 3400 }, { "epoch": 0.7798007557540364, "grad_norm": 0.32661014795303345, "learning_rate": 6.891647271786915e-05, "loss": 0.7611, "step": 3405 }, { "epoch": 0.7809458376273903, "grad_norm": 0.26453500986099243, "learning_rate": 6.885182311869666e-05, "loss": 0.7612, "step": 3410 }, { "epoch": 0.7820909195007443, "grad_norm": 0.19719307124614716, "learning_rate": 6.878717351952418e-05, "loss": 0.7596, "step": 3415 }, { "epoch": 0.7832360013740982, "grad_norm": 0.1745481789112091, "learning_rate": 6.872252392035171e-05, "loss": 0.7562, "step": 3420 }, { "epoch": 0.7843810832474521, "grad_norm": 0.3306420147418976, "learning_rate": 6.865787432117922e-05, "loss": 0.7589, "step": 3425 }, { "epoch": 0.7855261651208061, "grad_norm": 0.3108626902103424, "learning_rate": 6.859322472200672e-05, "loss": 0.7598, "step": 3430 }, { "epoch": 0.7866712469941601, "grad_norm": 0.24207472801208496, "learning_rate": 6.852857512283424e-05, "loss": 0.7623, "step": 3435 }, { "epoch": 0.7878163288675141, "grad_norm": 0.31846627593040466, "learning_rate": 6.846392552366175e-05, "loss": 0.7595, "step": 3440 }, { "epoch": 0.788961410740868, "grad_norm": 0.34010037779808044, "learning_rate": 6.839927592448926e-05, "loss": 0.7598, "step": 3445 }, { "epoch": 0.790106492614222, "grad_norm": 0.2276770919561386, "learning_rate": 6.833462632531679e-05, "loss": 0.7612, "step": 3450 }, { "epoch": 0.7912515744875759, "grad_norm": 0.20850631594657898, "learning_rate": 6.82699767261443e-05, "loss": 0.7643, "step": 3455 }, { "epoch": 0.7923966563609298, "grad_norm": 0.2672816216945648, "learning_rate": 6.820532712697182e-05, "loss": 0.7585, "step": 3460 }, { "epoch": 0.7935417382342838, "grad_norm": 0.22202163934707642, "learning_rate": 6.814067752779933e-05, "loss": 0.7554, "step": 3465 }, { "epoch": 0.7946868201076377, "grad_norm": 0.2591758072376251, "learning_rate": 6.807602792862685e-05, "loss": 0.7597, "step": 3470 }, { "epoch": 0.7958319019809916, "grad_norm": 0.23392333090305328, "learning_rate": 6.801137832945435e-05, "loss": 0.7648, "step": 3475 }, { "epoch": 0.7969769838543456, "grad_norm": 0.2861672639846802, "learning_rate": 6.794672873028188e-05, "loss": 0.7598, "step": 3480 }, { "epoch": 0.7981220657276995, "grad_norm": 0.2735273241996765, "learning_rate": 6.788207913110939e-05, "loss": 0.759, "step": 3485 }, { "epoch": 0.7992671476010534, "grad_norm": 0.2500370442867279, "learning_rate": 6.78174295319369e-05, "loss": 0.7634, "step": 3490 }, { "epoch": 0.8004122294744074, "grad_norm": 0.21133625507354736, "learning_rate": 6.775277993276442e-05, "loss": 0.7558, "step": 3495 }, { "epoch": 0.8015573113477613, "grad_norm": 0.29856279492378235, "learning_rate": 6.768813033359193e-05, "loss": 0.7598, "step": 3500 }, { "epoch": 0.8027023932211154, "grad_norm": 0.29944127798080444, "learning_rate": 6.762348073441945e-05, "loss": 0.7556, "step": 3505 }, { "epoch": 0.8038474750944693, "grad_norm": 0.2628041207790375, "learning_rate": 6.755883113524696e-05, "loss": 0.7551, "step": 3510 }, { "epoch": 0.8049925569678232, "grad_norm": 0.30071425437927246, "learning_rate": 6.749418153607448e-05, "loss": 0.7613, "step": 3515 }, { "epoch": 0.8061376388411772, "grad_norm": 0.19781585037708282, "learning_rate": 6.742953193690199e-05, "loss": 0.7584, "step": 3520 }, { "epoch": 0.8072827207145311, "grad_norm": 0.22515876591205597, "learning_rate": 6.73648823377295e-05, "loss": 0.7602, "step": 3525 }, { "epoch": 0.808427802587885, "grad_norm": 0.18484292924404144, "learning_rate": 6.730023273855702e-05, "loss": 0.7587, "step": 3530 }, { "epoch": 0.809572884461239, "grad_norm": 0.18081125617027283, "learning_rate": 6.723558313938453e-05, "loss": 0.7571, "step": 3535 }, { "epoch": 0.8107179663345929, "grad_norm": 0.27940741181373596, "learning_rate": 6.717093354021206e-05, "loss": 0.763, "step": 3540 }, { "epoch": 0.8118630482079469, "grad_norm": 0.26946401596069336, "learning_rate": 6.710628394103958e-05, "loss": 0.7608, "step": 3545 }, { "epoch": 0.8130081300813008, "grad_norm": 0.2804325222969055, "learning_rate": 6.704163434186709e-05, "loss": 0.758, "step": 3550 }, { "epoch": 0.8141532119546547, "grad_norm": 0.3489839732646942, "learning_rate": 6.69769847426946e-05, "loss": 0.758, "step": 3555 }, { "epoch": 0.8152982938280087, "grad_norm": 0.17796777188777924, "learning_rate": 6.69123351435221e-05, "loss": 0.7592, "step": 3560 }, { "epoch": 0.8164433757013626, "grad_norm": 0.1994534283876419, "learning_rate": 6.684768554434962e-05, "loss": 0.7555, "step": 3565 }, { "epoch": 0.8175884575747165, "grad_norm": 0.23157624900341034, "learning_rate": 6.678303594517715e-05, "loss": 0.7592, "step": 3570 }, { "epoch": 0.8187335394480706, "grad_norm": 0.26892754435539246, "learning_rate": 6.671838634600466e-05, "loss": 0.7558, "step": 3575 }, { "epoch": 0.8198786213214245, "grad_norm": 0.2596917152404785, "learning_rate": 6.665373674683218e-05, "loss": 0.7561, "step": 3580 }, { "epoch": 0.8210237031947785, "grad_norm": 0.20809470117092133, "learning_rate": 6.658908714765969e-05, "loss": 0.7588, "step": 3585 }, { "epoch": 0.8221687850681324, "grad_norm": 0.19905616343021393, "learning_rate": 6.65244375484872e-05, "loss": 0.7615, "step": 3590 }, { "epoch": 0.8233138669414863, "grad_norm": 0.33176565170288086, "learning_rate": 6.645978794931472e-05, "loss": 0.7557, "step": 3595 }, { "epoch": 0.8244589488148403, "grad_norm": 0.3151164948940277, "learning_rate": 6.639513835014223e-05, "loss": 0.7541, "step": 3600 }, { "epoch": 0.8256040306881942, "grad_norm": 0.21330596506595612, "learning_rate": 6.633048875096975e-05, "loss": 0.7628, "step": 3605 }, { "epoch": 0.8267491125615481, "grad_norm": 0.2487279772758484, "learning_rate": 6.626583915179726e-05, "loss": 0.7591, "step": 3610 }, { "epoch": 0.8278941944349021, "grad_norm": 0.37388402223587036, "learning_rate": 6.620118955262477e-05, "loss": 0.758, "step": 3615 }, { "epoch": 0.829039276308256, "grad_norm": 0.32117441296577454, "learning_rate": 6.613653995345229e-05, "loss": 0.7577, "step": 3620 }, { "epoch": 0.83018435818161, "grad_norm": 0.22614158689975739, "learning_rate": 6.60718903542798e-05, "loss": 0.7582, "step": 3625 }, { "epoch": 0.8313294400549639, "grad_norm": 0.3191037178039551, "learning_rate": 6.600724075510733e-05, "loss": 0.7567, "step": 3630 }, { "epoch": 0.8324745219283178, "grad_norm": 0.22228999435901642, "learning_rate": 6.594259115593485e-05, "loss": 0.7574, "step": 3635 }, { "epoch": 0.8336196038016718, "grad_norm": 0.21925920248031616, "learning_rate": 6.587794155676235e-05, "loss": 0.7561, "step": 3640 }, { "epoch": 0.8347646856750258, "grad_norm": 0.18947115540504456, "learning_rate": 6.581329195758986e-05, "loss": 0.7517, "step": 3645 }, { "epoch": 0.8359097675483798, "grad_norm": 0.25208985805511475, "learning_rate": 6.574864235841737e-05, "loss": 0.7576, "step": 3650 }, { "epoch": 0.8370548494217337, "grad_norm": 0.3698408901691437, "learning_rate": 6.568399275924489e-05, "loss": 0.7538, "step": 3655 }, { "epoch": 0.8381999312950876, "grad_norm": 0.3591291308403015, "learning_rate": 6.561934316007242e-05, "loss": 0.7572, "step": 3660 }, { "epoch": 0.8393450131684416, "grad_norm": 0.268759548664093, "learning_rate": 6.555469356089993e-05, "loss": 0.7577, "step": 3665 }, { "epoch": 0.8404900950417955, "grad_norm": 0.2655838131904602, "learning_rate": 6.549004396172744e-05, "loss": 0.7597, "step": 3670 }, { "epoch": 0.8416351769151494, "grad_norm": 0.29894131422042847, "learning_rate": 6.542539436255496e-05, "loss": 0.7547, "step": 3675 }, { "epoch": 0.8427802587885034, "grad_norm": 0.2238704115152359, "learning_rate": 6.536074476338247e-05, "loss": 0.757, "step": 3680 }, { "epoch": 0.8439253406618573, "grad_norm": 0.14427435398101807, "learning_rate": 6.529609516420997e-05, "loss": 0.7577, "step": 3685 }, { "epoch": 0.8450704225352113, "grad_norm": 0.16628648340702057, "learning_rate": 6.52314455650375e-05, "loss": 0.7559, "step": 3690 }, { "epoch": 0.8462155044085652, "grad_norm": 0.36011502146720886, "learning_rate": 6.516679596586502e-05, "loss": 0.7559, "step": 3695 }, { "epoch": 0.8473605862819191, "grad_norm": 0.3687536418437958, "learning_rate": 6.510214636669253e-05, "loss": 0.7574, "step": 3700 }, { "epoch": 0.8485056681552731, "grad_norm": 0.2624550461769104, "learning_rate": 6.503749676752004e-05, "loss": 0.7528, "step": 3705 }, { "epoch": 0.849650750028627, "grad_norm": 0.30368492007255554, "learning_rate": 6.497284716834756e-05, "loss": 0.7614, "step": 3710 }, { "epoch": 0.8507958319019809, "grad_norm": 0.2152595818042755, "learning_rate": 6.490819756917507e-05, "loss": 0.7563, "step": 3715 }, { "epoch": 0.851940913775335, "grad_norm": 0.1912301778793335, "learning_rate": 6.484354797000259e-05, "loss": 0.7528, "step": 3720 }, { "epoch": 0.8530859956486889, "grad_norm": 0.2414664626121521, "learning_rate": 6.47788983708301e-05, "loss": 0.7565, "step": 3725 }, { "epoch": 0.8542310775220429, "grad_norm": 0.2542330026626587, "learning_rate": 6.471424877165762e-05, "loss": 0.7581, "step": 3730 }, { "epoch": 0.8553761593953968, "grad_norm": 0.21856382489204407, "learning_rate": 6.464959917248513e-05, "loss": 0.7586, "step": 3735 }, { "epoch": 0.8565212412687507, "grad_norm": 0.35254383087158203, "learning_rate": 6.458494957331264e-05, "loss": 0.7572, "step": 3740 }, { "epoch": 0.8576663231421047, "grad_norm": 0.21001994609832764, "learning_rate": 6.452029997414016e-05, "loss": 0.7616, "step": 3745 }, { "epoch": 0.8588114050154586, "grad_norm": 0.4330880045890808, "learning_rate": 6.445565037496769e-05, "loss": 0.7568, "step": 3750 }, { "epoch": 0.8599564868888125, "grad_norm": 0.2603987455368042, "learning_rate": 6.43910007757952e-05, "loss": 0.7542, "step": 3755 }, { "epoch": 0.8611015687621665, "grad_norm": 0.2131146788597107, "learning_rate": 6.432635117662271e-05, "loss": 0.7556, "step": 3760 }, { "epoch": 0.8622466506355204, "grad_norm": 0.296593576669693, "learning_rate": 6.426170157745022e-05, "loss": 0.7627, "step": 3765 }, { "epoch": 0.8633917325088744, "grad_norm": 0.20938166975975037, "learning_rate": 6.419705197827773e-05, "loss": 0.7571, "step": 3770 }, { "epoch": 0.8645368143822283, "grad_norm": 0.31437405943870544, "learning_rate": 6.413240237910524e-05, "loss": 0.7561, "step": 3775 }, { "epoch": 0.8656818962555822, "grad_norm": 0.31622976064682007, "learning_rate": 6.406775277993277e-05, "loss": 0.7546, "step": 3780 }, { "epoch": 0.8668269781289362, "grad_norm": 0.24524053931236267, "learning_rate": 6.400310318076029e-05, "loss": 0.7599, "step": 3785 }, { "epoch": 0.8679720600022902, "grad_norm": 0.17051318287849426, "learning_rate": 6.39384535815878e-05, "loss": 0.7583, "step": 3790 }, { "epoch": 0.8691171418756442, "grad_norm": 0.1901961863040924, "learning_rate": 6.387380398241531e-05, "loss": 0.7569, "step": 3795 }, { "epoch": 0.8702622237489981, "grad_norm": 0.19765377044677734, "learning_rate": 6.380915438324283e-05, "loss": 0.7561, "step": 3800 }, { "epoch": 0.871407305622352, "grad_norm": 0.2318921536207199, "learning_rate": 6.374450478407034e-05, "loss": 0.7574, "step": 3805 }, { "epoch": 0.872552387495706, "grad_norm": 0.23888637125492096, "learning_rate": 6.367985518489786e-05, "loss": 0.7559, "step": 3810 }, { "epoch": 0.8736974693690599, "grad_norm": 0.2582821547985077, "learning_rate": 6.361520558572537e-05, "loss": 0.759, "step": 3815 }, { "epoch": 0.8748425512424138, "grad_norm": 0.2514699697494507, "learning_rate": 6.355055598655289e-05, "loss": 0.7546, "step": 3820 }, { "epoch": 0.8759876331157678, "grad_norm": 0.31897538900375366, "learning_rate": 6.34859063873804e-05, "loss": 0.758, "step": 3825 }, { "epoch": 0.8771327149891217, "grad_norm": 0.18458561599254608, "learning_rate": 6.342125678820791e-05, "loss": 0.7569, "step": 3830 }, { "epoch": 0.8782777968624756, "grad_norm": 0.30276724696159363, "learning_rate": 6.335660718903543e-05, "loss": 0.7588, "step": 3835 }, { "epoch": 0.8794228787358296, "grad_norm": 0.20559334754943848, "learning_rate": 6.329195758986296e-05, "loss": 0.7593, "step": 3840 }, { "epoch": 0.8805679606091835, "grad_norm": 0.23002290725708008, "learning_rate": 6.322730799069047e-05, "loss": 0.7565, "step": 3845 }, { "epoch": 0.8817130424825375, "grad_norm": 0.21782149374485016, "learning_rate": 6.316265839151797e-05, "loss": 0.7536, "step": 3850 }, { "epoch": 0.8828581243558914, "grad_norm": 0.22648191452026367, "learning_rate": 6.309800879234548e-05, "loss": 0.756, "step": 3855 }, { "epoch": 0.8840032062292454, "grad_norm": 0.25743627548217773, "learning_rate": 6.3033359193173e-05, "loss": 0.7574, "step": 3860 }, { "epoch": 0.8851482881025994, "grad_norm": 0.23849058151245117, "learning_rate": 6.296870959400051e-05, "loss": 0.7532, "step": 3865 }, { "epoch": 0.8862933699759533, "grad_norm": 0.24444012343883514, "learning_rate": 6.290405999482804e-05, "loss": 0.7552, "step": 3870 }, { "epoch": 0.8874384518493073, "grad_norm": 0.21026170253753662, "learning_rate": 6.283941039565556e-05, "loss": 0.7551, "step": 3875 }, { "epoch": 0.8885835337226612, "grad_norm": 0.25571170449256897, "learning_rate": 6.277476079648307e-05, "loss": 0.754, "step": 3880 }, { "epoch": 0.8897286155960151, "grad_norm": 0.24130992591381073, "learning_rate": 6.271011119731058e-05, "loss": 0.7608, "step": 3885 }, { "epoch": 0.8908736974693691, "grad_norm": 0.21595409512519836, "learning_rate": 6.26454615981381e-05, "loss": 0.7563, "step": 3890 }, { "epoch": 0.892018779342723, "grad_norm": 0.2847621738910675, "learning_rate": 6.25808119989656e-05, "loss": 0.7569, "step": 3895 }, { "epoch": 0.8931638612160769, "grad_norm": 0.2793238162994385, "learning_rate": 6.251616239979313e-05, "loss": 0.7572, "step": 3900 }, { "epoch": 0.8943089430894309, "grad_norm": 0.19144894182682037, "learning_rate": 6.245151280062064e-05, "loss": 0.7541, "step": 3905 }, { "epoch": 0.8954540249627848, "grad_norm": 0.18905645608901978, "learning_rate": 6.238686320144815e-05, "loss": 0.7584, "step": 3910 }, { "epoch": 0.8965991068361387, "grad_norm": 0.24144595861434937, "learning_rate": 6.232221360227567e-05, "loss": 0.7566, "step": 3915 }, { "epoch": 0.8977441887094927, "grad_norm": 0.22137823700904846, "learning_rate": 6.225756400310318e-05, "loss": 0.7578, "step": 3920 }, { "epoch": 0.8988892705828466, "grad_norm": 0.247211754322052, "learning_rate": 6.21929144039307e-05, "loss": 0.7566, "step": 3925 }, { "epoch": 0.9000343524562007, "grad_norm": 0.14881117641925812, "learning_rate": 6.212826480475823e-05, "loss": 0.7558, "step": 3930 }, { "epoch": 0.9011794343295546, "grad_norm": 0.2065708339214325, "learning_rate": 6.206361520558573e-05, "loss": 0.759, "step": 3935 }, { "epoch": 0.9023245162029085, "grad_norm": 0.21069486439228058, "learning_rate": 6.199896560641324e-05, "loss": 0.7569, "step": 3940 }, { "epoch": 0.9034695980762625, "grad_norm": 0.21884271502494812, "learning_rate": 6.193431600724075e-05, "loss": 0.7575, "step": 3945 }, { "epoch": 0.9046146799496164, "grad_norm": 0.3551616966724396, "learning_rate": 6.186966640806827e-05, "loss": 0.7536, "step": 3950 }, { "epoch": 0.9057597618229704, "grad_norm": 0.28585392236709595, "learning_rate": 6.180501680889578e-05, "loss": 0.7561, "step": 3955 }, { "epoch": 0.9069048436963243, "grad_norm": 0.21221600472927094, "learning_rate": 6.174036720972331e-05, "loss": 0.753, "step": 3960 }, { "epoch": 0.9080499255696782, "grad_norm": 0.24336685240268707, "learning_rate": 6.167571761055082e-05, "loss": 0.7576, "step": 3965 }, { "epoch": 0.9091950074430322, "grad_norm": 0.22866609692573547, "learning_rate": 6.161106801137834e-05, "loss": 0.7549, "step": 3970 }, { "epoch": 0.9103400893163861, "grad_norm": 0.21172703802585602, "learning_rate": 6.154641841220584e-05, "loss": 0.757, "step": 3975 }, { "epoch": 0.91148517118974, "grad_norm": 0.2627602815628052, "learning_rate": 6.148176881303335e-05, "loss": 0.7549, "step": 3980 }, { "epoch": 0.912630253063094, "grad_norm": 0.3001517355442047, "learning_rate": 6.141711921386087e-05, "loss": 0.7576, "step": 3985 }, { "epoch": 0.9137753349364479, "grad_norm": 0.18586446344852448, "learning_rate": 6.13524696146884e-05, "loss": 0.7527, "step": 3990 }, { "epoch": 0.9149204168098019, "grad_norm": 0.2953189015388489, "learning_rate": 6.128782001551591e-05, "loss": 0.753, "step": 3995 }, { "epoch": 0.9160654986831559, "grad_norm": 0.1657901555299759, "learning_rate": 6.122317041634342e-05, "loss": 0.7552, "step": 4000 }, { "epoch": 0.9172105805565098, "grad_norm": 0.26477518677711487, "learning_rate": 6.115852081717094e-05, "loss": 0.7542, "step": 4005 }, { "epoch": 0.9183556624298638, "grad_norm": 0.1784099042415619, "learning_rate": 6.109387121799845e-05, "loss": 0.7548, "step": 4010 }, { "epoch": 0.9195007443032177, "grad_norm": 0.27934473752975464, "learning_rate": 6.1029221618825974e-05, "loss": 0.7548, "step": 4015 }, { "epoch": 0.9206458261765716, "grad_norm": 0.18960314989089966, "learning_rate": 6.0964572019653474e-05, "loss": 0.7552, "step": 4020 }, { "epoch": 0.9217909080499256, "grad_norm": 0.26606252789497375, "learning_rate": 6.0899922420480995e-05, "loss": 0.757, "step": 4025 }, { "epoch": 0.9229359899232795, "grad_norm": 0.19727392494678497, "learning_rate": 6.083527282130851e-05, "loss": 0.752, "step": 4030 }, { "epoch": 0.9240810717966335, "grad_norm": 0.1997370570898056, "learning_rate": 6.0770623222136024e-05, "loss": 0.7556, "step": 4035 }, { "epoch": 0.9252261536699874, "grad_norm": 0.3227653503417969, "learning_rate": 6.070597362296354e-05, "loss": 0.756, "step": 4040 }, { "epoch": 0.9263712355433413, "grad_norm": 0.21063637733459473, "learning_rate": 6.064132402379106e-05, "loss": 0.7525, "step": 4045 }, { "epoch": 0.9275163174166953, "grad_norm": 0.1530616283416748, "learning_rate": 6.057667442461857e-05, "loss": 0.7552, "step": 4050 }, { "epoch": 0.9286613992900492, "grad_norm": 0.3049672245979309, "learning_rate": 6.051202482544609e-05, "loss": 0.756, "step": 4055 }, { "epoch": 0.9298064811634031, "grad_norm": 0.20015349984169006, "learning_rate": 6.0447375226273595e-05, "loss": 0.7549, "step": 4060 }, { "epoch": 0.9309515630367571, "grad_norm": 0.2715403139591217, "learning_rate": 6.038272562710111e-05, "loss": 0.7544, "step": 4065 }, { "epoch": 0.9320966449101111, "grad_norm": 0.200375497341156, "learning_rate": 6.031807602792863e-05, "loss": 0.7525, "step": 4070 }, { "epoch": 0.9332417267834651, "grad_norm": 0.2582535445690155, "learning_rate": 6.0253426428756144e-05, "loss": 0.7506, "step": 4075 }, { "epoch": 0.934386808656819, "grad_norm": 0.18540117144584656, "learning_rate": 6.018877682958366e-05, "loss": 0.7555, "step": 4080 }, { "epoch": 0.9355318905301729, "grad_norm": 0.24330022931098938, "learning_rate": 6.012412723041117e-05, "loss": 0.7551, "step": 4085 }, { "epoch": 0.9366769724035269, "grad_norm": 0.32542139291763306, "learning_rate": 6.0059477631238694e-05, "loss": 0.7552, "step": 4090 }, { "epoch": 0.9378220542768808, "grad_norm": 0.24217607080936432, "learning_rate": 5.999482803206621e-05, "loss": 0.7557, "step": 4095 }, { "epoch": 0.9389671361502347, "grad_norm": 0.2282743602991104, "learning_rate": 5.993017843289372e-05, "loss": 0.7571, "step": 4100 }, { "epoch": 0.9401122180235887, "grad_norm": 0.18588009476661682, "learning_rate": 5.986552883372123e-05, "loss": 0.7529, "step": 4105 }, { "epoch": 0.9412572998969426, "grad_norm": 0.24249807000160217, "learning_rate": 5.9800879234548744e-05, "loss": 0.756, "step": 4110 }, { "epoch": 0.9424023817702966, "grad_norm": 0.20471017062664032, "learning_rate": 5.9736229635376265e-05, "loss": 0.7577, "step": 4115 }, { "epoch": 0.9435474636436505, "grad_norm": 0.2088862508535385, "learning_rate": 5.967158003620378e-05, "loss": 0.75, "step": 4120 }, { "epoch": 0.9446925455170044, "grad_norm": 0.16393999755382538, "learning_rate": 5.960693043703129e-05, "loss": 0.7508, "step": 4125 }, { "epoch": 0.9458376273903584, "grad_norm": 0.1825890988111496, "learning_rate": 5.954228083785881e-05, "loss": 0.7525, "step": 4130 }, { "epoch": 0.9469827092637123, "grad_norm": 0.14733733236789703, "learning_rate": 5.947763123868633e-05, "loss": 0.7552, "step": 4135 }, { "epoch": 0.9481277911370662, "grad_norm": 0.34639453887939453, "learning_rate": 5.941298163951384e-05, "loss": 0.752, "step": 4140 }, { "epoch": 0.9492728730104203, "grad_norm": 0.35142093896865845, "learning_rate": 5.934833204034135e-05, "loss": 0.7563, "step": 4145 }, { "epoch": 0.9504179548837742, "grad_norm": 0.20236782729625702, "learning_rate": 5.9283682441168864e-05, "loss": 0.7542, "step": 4150 }, { "epoch": 0.9515630367571282, "grad_norm": 0.2146655023097992, "learning_rate": 5.921903284199638e-05, "loss": 0.7539, "step": 4155 }, { "epoch": 0.9527081186304821, "grad_norm": 0.20262286067008972, "learning_rate": 5.91543832428239e-05, "loss": 0.7487, "step": 4160 }, { "epoch": 0.953853200503836, "grad_norm": 0.22617624700069427, "learning_rate": 5.9089733643651414e-05, "loss": 0.7551, "step": 4165 }, { "epoch": 0.95499828237719, "grad_norm": 0.18724945187568665, "learning_rate": 5.902508404447893e-05, "loss": 0.7558, "step": 4170 }, { "epoch": 0.9561433642505439, "grad_norm": 0.22609928250312805, "learning_rate": 5.896043444530644e-05, "loss": 0.7541, "step": 4175 }, { "epoch": 0.9572884461238979, "grad_norm": 0.28732478618621826, "learning_rate": 5.889578484613396e-05, "loss": 0.7535, "step": 4180 }, { "epoch": 0.9584335279972518, "grad_norm": 0.18549516797065735, "learning_rate": 5.8831135246961464e-05, "loss": 0.751, "step": 4185 }, { "epoch": 0.9595786098706057, "grad_norm": 0.18325352668762207, "learning_rate": 5.8766485647788985e-05, "loss": 0.754, "step": 4190 }, { "epoch": 0.9607236917439597, "grad_norm": 0.23209375143051147, "learning_rate": 5.87018360486165e-05, "loss": 0.7526, "step": 4195 }, { "epoch": 0.9618687736173136, "grad_norm": 0.312589168548584, "learning_rate": 5.863718644944401e-05, "loss": 0.7494, "step": 4200 }, { "epoch": 0.9630138554906675, "grad_norm": 0.20656633377075195, "learning_rate": 5.8572536850271534e-05, "loss": 0.7549, "step": 4205 }, { "epoch": 0.9641589373640215, "grad_norm": 0.16602040827274323, "learning_rate": 5.850788725109905e-05, "loss": 0.7553, "step": 4210 }, { "epoch": 0.9653040192373755, "grad_norm": 0.32111111283302307, "learning_rate": 5.844323765192656e-05, "loss": 0.7518, "step": 4215 }, { "epoch": 0.9664491011107295, "grad_norm": 0.3452571630477905, "learning_rate": 5.837858805275408e-05, "loss": 0.753, "step": 4220 }, { "epoch": 0.9675941829840834, "grad_norm": 0.18496818840503693, "learning_rate": 5.83139384535816e-05, "loss": 0.7565, "step": 4225 }, { "epoch": 0.9687392648574373, "grad_norm": 0.3221565783023834, "learning_rate": 5.82492888544091e-05, "loss": 0.7534, "step": 4230 }, { "epoch": 0.9698843467307913, "grad_norm": 0.24431121349334717, "learning_rate": 5.818463925523662e-05, "loss": 0.7519, "step": 4235 }, { "epoch": 0.9710294286041452, "grad_norm": 0.21201278269290924, "learning_rate": 5.8119989656064134e-05, "loss": 0.7536, "step": 4240 }, { "epoch": 0.9721745104774991, "grad_norm": 0.1579071283340454, "learning_rate": 5.805534005689165e-05, "loss": 0.755, "step": 4245 }, { "epoch": 0.9733195923508531, "grad_norm": 0.3869153559207916, "learning_rate": 5.799069045771917e-05, "loss": 0.7513, "step": 4250 }, { "epoch": 0.974464674224207, "grad_norm": 0.175177663564682, "learning_rate": 5.792604085854668e-05, "loss": 0.7513, "step": 4255 }, { "epoch": 0.975609756097561, "grad_norm": 0.2530837655067444, "learning_rate": 5.78613912593742e-05, "loss": 0.7546, "step": 4260 }, { "epoch": 0.9767548379709149, "grad_norm": 0.28631719946861267, "learning_rate": 5.779674166020171e-05, "loss": 0.7511, "step": 4265 }, { "epoch": 0.9778999198442688, "grad_norm": 0.1455335021018982, "learning_rate": 5.773209206102922e-05, "loss": 0.7532, "step": 4270 }, { "epoch": 0.9790450017176228, "grad_norm": 0.21537651121616364, "learning_rate": 5.766744246185673e-05, "loss": 0.7511, "step": 4275 }, { "epoch": 0.9801900835909767, "grad_norm": 0.22129929065704346, "learning_rate": 5.7602792862684254e-05, "loss": 0.7567, "step": 4280 }, { "epoch": 0.9813351654643307, "grad_norm": 0.33510035276412964, "learning_rate": 5.753814326351177e-05, "loss": 0.7509, "step": 4285 }, { "epoch": 0.9824802473376847, "grad_norm": 0.31159457564353943, "learning_rate": 5.747349366433928e-05, "loss": 0.7536, "step": 4290 }, { "epoch": 0.9836253292110386, "grad_norm": 0.225836381316185, "learning_rate": 5.74088440651668e-05, "loss": 0.7546, "step": 4295 }, { "epoch": 0.9847704110843926, "grad_norm": 0.18996797502040863, "learning_rate": 5.734419446599432e-05, "loss": 0.7512, "step": 4300 }, { "epoch": 0.9859154929577465, "grad_norm": 0.19356811046600342, "learning_rate": 5.727954486682183e-05, "loss": 0.7511, "step": 4305 }, { "epoch": 0.9870605748311004, "grad_norm": 0.24983671307563782, "learning_rate": 5.7214895267649346e-05, "loss": 0.7552, "step": 4310 }, { "epoch": 0.9882056567044544, "grad_norm": 0.2550750970840454, "learning_rate": 5.7150245668476854e-05, "loss": 0.7495, "step": 4315 }, { "epoch": 0.9893507385778083, "grad_norm": 0.20238997042179108, "learning_rate": 5.708559606930437e-05, "loss": 0.7531, "step": 4320 }, { "epoch": 0.9904958204511622, "grad_norm": 0.23092573881149292, "learning_rate": 5.702094647013189e-05, "loss": 0.7508, "step": 4325 }, { "epoch": 0.9916409023245162, "grad_norm": 0.245700404047966, "learning_rate": 5.69562968709594e-05, "loss": 0.7517, "step": 4330 }, { "epoch": 0.9927859841978701, "grad_norm": 0.2939732074737549, "learning_rate": 5.689164727178692e-05, "loss": 0.7532, "step": 4335 }, { "epoch": 0.9939310660712241, "grad_norm": 0.2650054693222046, "learning_rate": 5.682699767261443e-05, "loss": 0.7556, "step": 4340 }, { "epoch": 0.995076147944578, "grad_norm": 0.18158090114593506, "learning_rate": 5.676234807344195e-05, "loss": 0.7511, "step": 4345 }, { "epoch": 0.9962212298179319, "grad_norm": 0.1984746754169464, "learning_rate": 5.669769847426947e-05, "loss": 0.755, "step": 4350 }, { "epoch": 0.997366311691286, "grad_norm": 0.2304326891899109, "learning_rate": 5.6633048875096974e-05, "loss": 0.7536, "step": 4355 }, { "epoch": 0.9985113935646399, "grad_norm": 0.20437856018543243, "learning_rate": 5.656839927592449e-05, "loss": 0.7495, "step": 4360 }, { "epoch": 0.9996564754379939, "grad_norm": 0.2961784899234772, "learning_rate": 5.6503749676752e-05, "loss": 0.7505, "step": 4365 }, { "epoch": 1.0006870491240123, "grad_norm": 0.27757421135902405, "learning_rate": 5.6439100077579524e-05, "loss": 0.7592, "step": 4370 }, { "epoch": 1.0018321309973663, "grad_norm": 0.17433343827724457, "learning_rate": 5.637445047840704e-05, "loss": 0.75, "step": 4375 }, { "epoch": 1.0029772128707202, "grad_norm": 0.22225502133369446, "learning_rate": 5.630980087923455e-05, "loss": 0.7547, "step": 4380 }, { "epoch": 1.0041222947440742, "grad_norm": 0.21817201375961304, "learning_rate": 5.6245151280062066e-05, "loss": 0.7504, "step": 4385 }, { "epoch": 1.005267376617428, "grad_norm": 0.22865532338619232, "learning_rate": 5.618050168088959e-05, "loss": 0.7488, "step": 4390 }, { "epoch": 1.006412458490782, "grad_norm": 0.24642197787761688, "learning_rate": 5.611585208171709e-05, "loss": 0.7553, "step": 4395 }, { "epoch": 1.0075575403641361, "grad_norm": 0.20586034655570984, "learning_rate": 5.605120248254461e-05, "loss": 0.7516, "step": 4400 }, { "epoch": 1.00870262223749, "grad_norm": 0.20645256340503693, "learning_rate": 5.598655288337212e-05, "loss": 0.7481, "step": 4405 }, { "epoch": 1.009847704110844, "grad_norm": 0.227728009223938, "learning_rate": 5.592190328419964e-05, "loss": 0.7531, "step": 4410 }, { "epoch": 1.0109927859841978, "grad_norm": 0.19497741758823395, "learning_rate": 5.585725368502716e-05, "loss": 0.7534, "step": 4415 }, { "epoch": 1.0121378678575519, "grad_norm": 0.24544782936573029, "learning_rate": 5.579260408585467e-05, "loss": 0.7522, "step": 4420 }, { "epoch": 1.0132829497309057, "grad_norm": 0.21906015276908875, "learning_rate": 5.572795448668219e-05, "loss": 0.7531, "step": 4425 }, { "epoch": 1.0144280316042598, "grad_norm": 0.26968297362327576, "learning_rate": 5.56633048875097e-05, "loss": 0.7501, "step": 4430 }, { "epoch": 1.0155731134776136, "grad_norm": 0.16189736127853394, "learning_rate": 5.559865528833722e-05, "loss": 0.754, "step": 4435 }, { "epoch": 1.0167181953509676, "grad_norm": 0.17086242139339447, "learning_rate": 5.553400568916472e-05, "loss": 0.7526, "step": 4440 }, { "epoch": 1.0178632772243215, "grad_norm": 0.22381778061389923, "learning_rate": 5.5469356089992244e-05, "loss": 0.7541, "step": 4445 }, { "epoch": 1.0190083590976755, "grad_norm": 0.20228935778141022, "learning_rate": 5.540470649081976e-05, "loss": 0.7502, "step": 4450 }, { "epoch": 1.0201534409710293, "grad_norm": 0.25878095626831055, "learning_rate": 5.534005689164727e-05, "loss": 0.7493, "step": 4455 }, { "epoch": 1.0212985228443834, "grad_norm": 0.22310547530651093, "learning_rate": 5.527540729247479e-05, "loss": 0.755, "step": 4460 }, { "epoch": 1.0224436047177372, "grad_norm": 0.2674236297607422, "learning_rate": 5.521075769330231e-05, "loss": 0.7532, "step": 4465 }, { "epoch": 1.0235886865910913, "grad_norm": 0.28843796253204346, "learning_rate": 5.514610809412982e-05, "loss": 0.7561, "step": 4470 }, { "epoch": 1.0247337684644453, "grad_norm": 0.3136085271835327, "learning_rate": 5.5081458494957336e-05, "loss": 0.752, "step": 4475 }, { "epoch": 1.0258788503377991, "grad_norm": 0.2548903822898865, "learning_rate": 5.501680889578484e-05, "loss": 0.7495, "step": 4480 }, { "epoch": 1.0270239322111532, "grad_norm": 0.31399279832839966, "learning_rate": 5.495215929661236e-05, "loss": 0.754, "step": 4485 }, { "epoch": 1.028169014084507, "grad_norm": 0.2506180703639984, "learning_rate": 5.488750969743988e-05, "loss": 0.7486, "step": 4490 }, { "epoch": 1.029314095957861, "grad_norm": 0.21598592400550842, "learning_rate": 5.482286009826739e-05, "loss": 0.7464, "step": 4495 }, { "epoch": 1.0304591778312149, "grad_norm": 0.19644518196582794, "learning_rate": 5.475821049909491e-05, "loss": 0.7521, "step": 4500 }, { "epoch": 1.031604259704569, "grad_norm": 0.22480544447898865, "learning_rate": 5.469356089992243e-05, "loss": 0.7579, "step": 4505 }, { "epoch": 1.0327493415779228, "grad_norm": 0.2105066180229187, "learning_rate": 5.462891130074994e-05, "loss": 0.7517, "step": 4510 }, { "epoch": 1.0338944234512768, "grad_norm": 0.4357909858226776, "learning_rate": 5.4564261701577456e-05, "loss": 0.75, "step": 4515 }, { "epoch": 1.0350395053246306, "grad_norm": 0.3007284700870514, "learning_rate": 5.4499612102404964e-05, "loss": 0.7505, "step": 4520 }, { "epoch": 1.0361845871979847, "grad_norm": 0.13870058953762054, "learning_rate": 5.443496250323248e-05, "loss": 0.75, "step": 4525 }, { "epoch": 1.0373296690713385, "grad_norm": 0.2073334902524948, "learning_rate": 5.437031290405999e-05, "loss": 0.7487, "step": 4530 }, { "epoch": 1.0384747509446925, "grad_norm": 0.21007287502288818, "learning_rate": 5.430566330488751e-05, "loss": 0.7528, "step": 4535 }, { "epoch": 1.0396198328180466, "grad_norm": 0.24089570343494415, "learning_rate": 5.424101370571503e-05, "loss": 0.752, "step": 4540 }, { "epoch": 1.0407649146914004, "grad_norm": 0.21664293110370636, "learning_rate": 5.417636410654254e-05, "loss": 0.7502, "step": 4545 }, { "epoch": 1.0419099965647545, "grad_norm": 0.1680118888616562, "learning_rate": 5.4111714507370056e-05, "loss": 0.7476, "step": 4550 }, { "epoch": 1.0430550784381083, "grad_norm": 0.19197143614292145, "learning_rate": 5.404706490819758e-05, "loss": 0.7527, "step": 4555 }, { "epoch": 1.0442001603114623, "grad_norm": 0.15469254553318024, "learning_rate": 5.398241530902509e-05, "loss": 0.7508, "step": 4560 }, { "epoch": 1.0453452421848162, "grad_norm": 0.16881108283996582, "learning_rate": 5.39177657098526e-05, "loss": 0.7534, "step": 4565 }, { "epoch": 1.0464903240581702, "grad_norm": 0.28262412548065186, "learning_rate": 5.385311611068011e-05, "loss": 0.7519, "step": 4570 }, { "epoch": 1.047635405931524, "grad_norm": 0.2931879460811615, "learning_rate": 5.378846651150763e-05, "loss": 0.7536, "step": 4575 }, { "epoch": 1.048780487804878, "grad_norm": 0.30282384157180786, "learning_rate": 5.372381691233515e-05, "loss": 0.7514, "step": 4580 }, { "epoch": 1.049925569678232, "grad_norm": 0.1762247383594513, "learning_rate": 5.365916731316266e-05, "loss": 0.7526, "step": 4585 }, { "epoch": 1.051070651551586, "grad_norm": 0.2082643061876297, "learning_rate": 5.3594517713990177e-05, "loss": 0.7507, "step": 4590 }, { "epoch": 1.0522157334249398, "grad_norm": 0.26037630438804626, "learning_rate": 5.352986811481769e-05, "loss": 0.7518, "step": 4595 }, { "epoch": 1.0533608152982938, "grad_norm": 0.1809810996055603, "learning_rate": 5.346521851564521e-05, "loss": 0.754, "step": 4600 }, { "epoch": 1.0545058971716477, "grad_norm": 0.2725500762462616, "learning_rate": 5.340056891647271e-05, "loss": 0.7515, "step": 4605 }, { "epoch": 1.0556509790450017, "grad_norm": 0.28422823548316956, "learning_rate": 5.3335919317300233e-05, "loss": 0.7485, "step": 4610 }, { "epoch": 1.0567960609183558, "grad_norm": 0.19163936376571655, "learning_rate": 5.327126971812775e-05, "loss": 0.7507, "step": 4615 }, { "epoch": 1.0579411427917096, "grad_norm": 0.19279161095619202, "learning_rate": 5.320662011895526e-05, "loss": 0.75, "step": 4620 }, { "epoch": 1.0590862246650636, "grad_norm": 0.26162973046302795, "learning_rate": 5.314197051978278e-05, "loss": 0.7525, "step": 4625 }, { "epoch": 1.0602313065384175, "grad_norm": 0.18127937614917755, "learning_rate": 5.30773209206103e-05, "loss": 0.7504, "step": 4630 }, { "epoch": 1.0613763884117715, "grad_norm": 0.16717013716697693, "learning_rate": 5.301267132143781e-05, "loss": 0.7487, "step": 4635 }, { "epoch": 1.0625214702851253, "grad_norm": 0.22202427685260773, "learning_rate": 5.2948021722265326e-05, "loss": 0.7526, "step": 4640 }, { "epoch": 1.0636665521584794, "grad_norm": 0.17828546464443207, "learning_rate": 5.2883372123092847e-05, "loss": 0.75, "step": 4645 }, { "epoch": 1.0648116340318332, "grad_norm": 0.20090904831886292, "learning_rate": 5.281872252392035e-05, "loss": 0.748, "step": 4650 }, { "epoch": 1.0659567159051873, "grad_norm": 0.2585245370864868, "learning_rate": 5.275407292474787e-05, "loss": 0.7529, "step": 4655 }, { "epoch": 1.067101797778541, "grad_norm": 0.21099530160427094, "learning_rate": 5.268942332557538e-05, "loss": 0.7531, "step": 4660 }, { "epoch": 1.0682468796518951, "grad_norm": 0.2740081548690796, "learning_rate": 5.26247737264029e-05, "loss": 0.7507, "step": 4665 }, { "epoch": 1.069391961525249, "grad_norm": 0.2878158688545227, "learning_rate": 5.256012412723042e-05, "loss": 0.7513, "step": 4670 }, { "epoch": 1.070537043398603, "grad_norm": 0.1615872085094452, "learning_rate": 5.249547452805793e-05, "loss": 0.7505, "step": 4675 }, { "epoch": 1.0716821252719568, "grad_norm": 0.18126648664474487, "learning_rate": 5.2430824928885446e-05, "loss": 0.7493, "step": 4680 }, { "epoch": 1.0728272071453109, "grad_norm": 0.1734725534915924, "learning_rate": 5.236617532971296e-05, "loss": 0.7496, "step": 4685 }, { "epoch": 1.073972289018665, "grad_norm": 0.32179296016693115, "learning_rate": 5.230152573054047e-05, "loss": 0.7504, "step": 4690 }, { "epoch": 1.0751173708920188, "grad_norm": 0.37920457124710083, "learning_rate": 5.223687613136798e-05, "loss": 0.7471, "step": 4695 }, { "epoch": 1.0762624527653728, "grad_norm": 0.36169323325157166, "learning_rate": 5.21722265321955e-05, "loss": 0.7507, "step": 4700 }, { "epoch": 1.0774075346387266, "grad_norm": 0.22468143701553345, "learning_rate": 5.210757693302302e-05, "loss": 0.7534, "step": 4705 }, { "epoch": 1.0785526165120807, "grad_norm": 0.20241256058216095, "learning_rate": 5.204292733385053e-05, "loss": 0.7542, "step": 4710 }, { "epoch": 1.0796976983854345, "grad_norm": 0.24470794200897217, "learning_rate": 5.197827773467805e-05, "loss": 0.7513, "step": 4715 }, { "epoch": 1.0808427802587885, "grad_norm": 0.22353743016719818, "learning_rate": 5.1913628135505567e-05, "loss": 0.7493, "step": 4720 }, { "epoch": 1.0819878621321424, "grad_norm": 0.1666325479745865, "learning_rate": 5.184897853633308e-05, "loss": 0.7505, "step": 4725 }, { "epoch": 1.0831329440054964, "grad_norm": 0.23964060842990875, "learning_rate": 5.178432893716059e-05, "loss": 0.7515, "step": 4730 }, { "epoch": 1.0842780258788502, "grad_norm": 0.2576775550842285, "learning_rate": 5.17196793379881e-05, "loss": 0.753, "step": 4735 }, { "epoch": 1.0854231077522043, "grad_norm": 0.2827492952346802, "learning_rate": 5.165502973881562e-05, "loss": 0.7527, "step": 4740 }, { "epoch": 1.0865681896255581, "grad_norm": 0.2897157371044159, "learning_rate": 5.159038013964314e-05, "loss": 0.7508, "step": 4745 }, { "epoch": 1.0877132714989122, "grad_norm": 0.28682994842529297, "learning_rate": 5.152573054047065e-05, "loss": 0.7525, "step": 4750 }, { "epoch": 1.0888583533722662, "grad_norm": 0.14168643951416016, "learning_rate": 5.1461080941298166e-05, "loss": 0.7547, "step": 4755 }, { "epoch": 1.09000343524562, "grad_norm": 0.2121264487504959, "learning_rate": 5.139643134212569e-05, "loss": 0.7494, "step": 4760 }, { "epoch": 1.091148517118974, "grad_norm": 0.2246008962392807, "learning_rate": 5.13317817429532e-05, "loss": 0.7512, "step": 4765 }, { "epoch": 1.092293598992328, "grad_norm": 0.18789461255073547, "learning_rate": 5.1267132143780716e-05, "loss": 0.7499, "step": 4770 }, { "epoch": 1.093438680865682, "grad_norm": 0.2336086481809616, "learning_rate": 5.120248254460822e-05, "loss": 0.7478, "step": 4775 }, { "epoch": 1.0945837627390358, "grad_norm": 0.23505932092666626, "learning_rate": 5.113783294543574e-05, "loss": 0.7483, "step": 4780 }, { "epoch": 1.0957288446123898, "grad_norm": 0.24164508283138275, "learning_rate": 5.107318334626325e-05, "loss": 0.7524, "step": 4785 }, { "epoch": 1.0968739264857437, "grad_norm": 0.22491183876991272, "learning_rate": 5.100853374709077e-05, "loss": 0.7502, "step": 4790 }, { "epoch": 1.0980190083590977, "grad_norm": 0.17309866845607758, "learning_rate": 5.094388414791829e-05, "loss": 0.751, "step": 4795 }, { "epoch": 1.0991640902324515, "grad_norm": 0.20419526100158691, "learning_rate": 5.08792345487458e-05, "loss": 0.7462, "step": 4800 }, { "epoch": 1.1003091721058056, "grad_norm": 0.22812721133232117, "learning_rate": 5.0814584949573315e-05, "loss": 0.7499, "step": 4805 }, { "epoch": 1.1014542539791594, "grad_norm": 0.2595369815826416, "learning_rate": 5.0749935350400836e-05, "loss": 0.7511, "step": 4810 }, { "epoch": 1.1025993358525135, "grad_norm": 0.2934839427471161, "learning_rate": 5.068528575122834e-05, "loss": 0.7512, "step": 4815 }, { "epoch": 1.1037444177258675, "grad_norm": 0.2313292771577835, "learning_rate": 5.062063615205586e-05, "loss": 0.7491, "step": 4820 }, { "epoch": 1.1048894995992213, "grad_norm": 0.2503287196159363, "learning_rate": 5.055598655288337e-05, "loss": 0.7503, "step": 4825 }, { "epoch": 1.1060345814725754, "grad_norm": 0.16885490715503693, "learning_rate": 5.0491336953710886e-05, "loss": 0.7547, "step": 4830 }, { "epoch": 1.1071796633459292, "grad_norm": 0.23992787301540375, "learning_rate": 5.042668735453841e-05, "loss": 0.7535, "step": 4835 }, { "epoch": 1.1083247452192833, "grad_norm": 0.1906599998474121, "learning_rate": 5.036203775536592e-05, "loss": 0.7516, "step": 4840 }, { "epoch": 1.109469827092637, "grad_norm": 0.15855015814304352, "learning_rate": 5.0297388156193436e-05, "loss": 0.7478, "step": 4845 }, { "epoch": 1.1106149089659911, "grad_norm": 0.1988474577665329, "learning_rate": 5.023273855702095e-05, "loss": 0.7506, "step": 4850 }, { "epoch": 1.111759990839345, "grad_norm": 0.1842668205499649, "learning_rate": 5.016808895784847e-05, "loss": 0.7515, "step": 4855 }, { "epoch": 1.112905072712699, "grad_norm": 0.18278862535953522, "learning_rate": 5.010343935867597e-05, "loss": 0.7539, "step": 4860 }, { "epoch": 1.1140501545860528, "grad_norm": 0.18607094883918762, "learning_rate": 5.003878975950349e-05, "loss": 0.7488, "step": 4865 }, { "epoch": 1.1151952364594069, "grad_norm": 0.21544776856899261, "learning_rate": 4.997414016033101e-05, "loss": 0.7511, "step": 4870 }, { "epoch": 1.1163403183327607, "grad_norm": 0.21455517411231995, "learning_rate": 4.990949056115852e-05, "loss": 0.7526, "step": 4875 }, { "epoch": 1.1174854002061148, "grad_norm": 0.2741033434867859, "learning_rate": 4.984484096198604e-05, "loss": 0.7451, "step": 4880 }, { "epoch": 1.1186304820794686, "grad_norm": 0.22744566202163696, "learning_rate": 4.9780191362813556e-05, "loss": 0.7521, "step": 4885 }, { "epoch": 1.1197755639528226, "grad_norm": 0.18673773109912872, "learning_rate": 4.9715541763641064e-05, "loss": 0.7506, "step": 4890 }, { "epoch": 1.1209206458261765, "grad_norm": 0.19755394756793976, "learning_rate": 4.9650892164468585e-05, "loss": 0.7477, "step": 4895 }, { "epoch": 1.1220657276995305, "grad_norm": 0.21924807131290436, "learning_rate": 4.95862425652961e-05, "loss": 0.7469, "step": 4900 }, { "epoch": 1.1232108095728845, "grad_norm": 0.17516666650772095, "learning_rate": 4.952159296612361e-05, "loss": 0.7499, "step": 4905 }, { "epoch": 1.1243558914462384, "grad_norm": 0.18603424727916718, "learning_rate": 4.945694336695113e-05, "loss": 0.7494, "step": 4910 }, { "epoch": 1.1255009733195924, "grad_norm": 0.19065344333648682, "learning_rate": 4.939229376777864e-05, "loss": 0.748, "step": 4915 }, { "epoch": 1.1266460551929462, "grad_norm": 0.2856319546699524, "learning_rate": 4.9327644168606156e-05, "loss": 0.7501, "step": 4920 }, { "epoch": 1.1277911370663003, "grad_norm": 0.21050098538398743, "learning_rate": 4.926299456943368e-05, "loss": 0.7499, "step": 4925 }, { "epoch": 1.1289362189396541, "grad_norm": 0.19090114533901215, "learning_rate": 4.9198344970261184e-05, "loss": 0.7546, "step": 4930 }, { "epoch": 1.1300813008130082, "grad_norm": 0.2405533790588379, "learning_rate": 4.91336953710887e-05, "loss": 0.7514, "step": 4935 }, { "epoch": 1.131226382686362, "grad_norm": 0.18367047607898712, "learning_rate": 4.906904577191622e-05, "loss": 0.7486, "step": 4940 }, { "epoch": 1.132371464559716, "grad_norm": 0.18685941398143768, "learning_rate": 4.9004396172743734e-05, "loss": 0.7513, "step": 4945 }, { "epoch": 1.1335165464330699, "grad_norm": 0.18659988045692444, "learning_rate": 4.893974657357124e-05, "loss": 0.7511, "step": 4950 }, { "epoch": 1.134661628306424, "grad_norm": 0.22466859221458435, "learning_rate": 4.887509697439876e-05, "loss": 0.7478, "step": 4955 }, { "epoch": 1.1358067101797777, "grad_norm": 0.2367299497127533, "learning_rate": 4.8810447375226276e-05, "loss": 0.7518, "step": 4960 }, { "epoch": 1.1369517920531318, "grad_norm": 0.17893458902835846, "learning_rate": 4.874579777605379e-05, "loss": 0.7492, "step": 4965 }, { "epoch": 1.1380968739264858, "grad_norm": 0.22315214574337006, "learning_rate": 4.868114817688131e-05, "loss": 0.746, "step": 4970 }, { "epoch": 1.1392419557998397, "grad_norm": 0.24147474765777588, "learning_rate": 4.861649857770882e-05, "loss": 0.7509, "step": 4975 }, { "epoch": 1.1403870376731937, "grad_norm": 0.17804847657680511, "learning_rate": 4.855184897853633e-05, "loss": 0.7546, "step": 4980 }, { "epoch": 1.1415321195465475, "grad_norm": 0.17471164464950562, "learning_rate": 4.8487199379363854e-05, "loss": 0.7467, "step": 4985 }, { "epoch": 1.1426772014199016, "grad_norm": 0.21423661708831787, "learning_rate": 4.842254978019137e-05, "loss": 0.7511, "step": 4990 }, { "epoch": 1.1438222832932554, "grad_norm": 0.21214281022548676, "learning_rate": 4.8357900181018876e-05, "loss": 0.7453, "step": 4995 }, { "epoch": 1.1449673651666095, "grad_norm": 0.1819201558828354, "learning_rate": 4.82932505818464e-05, "loss": 0.7513, "step": 5000 }, { "epoch": 1.1461124470399633, "grad_norm": 0.17857210338115692, "learning_rate": 4.822860098267391e-05, "loss": 0.7504, "step": 5005 }, { "epoch": 1.1472575289133173, "grad_norm": 0.22314776480197906, "learning_rate": 4.8163951383501425e-05, "loss": 0.7535, "step": 5010 }, { "epoch": 1.1484026107866712, "grad_norm": 0.16417430341243744, "learning_rate": 4.809930178432894e-05, "loss": 0.7516, "step": 5015 }, { "epoch": 1.1495476926600252, "grad_norm": 0.16635441780090332, "learning_rate": 4.8034652185156454e-05, "loss": 0.7499, "step": 5020 }, { "epoch": 1.150692774533379, "grad_norm": 0.254543662071228, "learning_rate": 4.797000258598397e-05, "loss": 0.75, "step": 5025 }, { "epoch": 1.151837856406733, "grad_norm": 0.17547422647476196, "learning_rate": 4.790535298681149e-05, "loss": 0.752, "step": 5030 }, { "epoch": 1.1529829382800871, "grad_norm": 0.18553274869918823, "learning_rate": 4.7840703387638996e-05, "loss": 0.7505, "step": 5035 }, { "epoch": 1.154128020153441, "grad_norm": 0.24924322962760925, "learning_rate": 4.777605378846651e-05, "loss": 0.7488, "step": 5040 }, { "epoch": 1.155273102026795, "grad_norm": 0.3234774172306061, "learning_rate": 4.771140418929403e-05, "loss": 0.7501, "step": 5045 }, { "epoch": 1.1564181839001488, "grad_norm": 0.2411537766456604, "learning_rate": 4.7646754590121546e-05, "loss": 0.7476, "step": 5050 }, { "epoch": 1.1575632657735029, "grad_norm": 0.21320085227489471, "learning_rate": 4.758210499094905e-05, "loss": 0.7512, "step": 5055 }, { "epoch": 1.1587083476468567, "grad_norm": 0.20281052589416504, "learning_rate": 4.7517455391776574e-05, "loss": 0.7498, "step": 5060 }, { "epoch": 1.1598534295202108, "grad_norm": 0.1915905475616455, "learning_rate": 4.745280579260409e-05, "loss": 0.7465, "step": 5065 }, { "epoch": 1.1609985113935646, "grad_norm": 0.17410551011562347, "learning_rate": 4.73881561934316e-05, "loss": 0.7512, "step": 5070 }, { "epoch": 1.1621435932669186, "grad_norm": 0.16328193247318268, "learning_rate": 4.7323506594259124e-05, "loss": 0.7527, "step": 5075 }, { "epoch": 1.1632886751402725, "grad_norm": 0.23142258822917938, "learning_rate": 4.725885699508663e-05, "loss": 0.7484, "step": 5080 }, { "epoch": 1.1644337570136265, "grad_norm": 0.3409397006034851, "learning_rate": 4.7194207395914145e-05, "loss": 0.7464, "step": 5085 }, { "epoch": 1.1655788388869803, "grad_norm": 0.29608339071273804, "learning_rate": 4.7129557796741666e-05, "loss": 0.7493, "step": 5090 }, { "epoch": 1.1667239207603344, "grad_norm": 0.25607064366340637, "learning_rate": 4.706490819756918e-05, "loss": 0.7495, "step": 5095 }, { "epoch": 1.1678690026336884, "grad_norm": 0.21050728857517242, "learning_rate": 4.700025859839669e-05, "loss": 0.7517, "step": 5100 }, { "epoch": 1.1690140845070423, "grad_norm": 0.2006913423538208, "learning_rate": 4.693560899922421e-05, "loss": 0.7498, "step": 5105 }, { "epoch": 1.170159166380396, "grad_norm": 0.1534210741519928, "learning_rate": 4.687095940005172e-05, "loss": 0.749, "step": 5110 }, { "epoch": 1.1713042482537501, "grad_norm": 0.21326866745948792, "learning_rate": 4.680630980087924e-05, "loss": 0.753, "step": 5115 }, { "epoch": 1.1724493301271042, "grad_norm": 0.21120962500572205, "learning_rate": 4.674166020170675e-05, "loss": 0.7518, "step": 5120 }, { "epoch": 1.173594412000458, "grad_norm": 0.1823272407054901, "learning_rate": 4.6677010602534266e-05, "loss": 0.7484, "step": 5125 }, { "epoch": 1.174739493873812, "grad_norm": 0.19098564982414246, "learning_rate": 4.661236100336178e-05, "loss": 0.7551, "step": 5130 }, { "epoch": 1.1758845757471659, "grad_norm": 0.17055176198482513, "learning_rate": 4.65477114041893e-05, "loss": 0.7512, "step": 5135 }, { "epoch": 1.17702965762052, "grad_norm": 0.23654793202877045, "learning_rate": 4.648306180501681e-05, "loss": 0.7471, "step": 5140 }, { "epoch": 1.1781747394938737, "grad_norm": 0.18556205928325653, "learning_rate": 4.641841220584432e-05, "loss": 0.7527, "step": 5145 }, { "epoch": 1.1793198213672278, "grad_norm": 0.2395789623260498, "learning_rate": 4.6353762606671844e-05, "loss": 0.7502, "step": 5150 }, { "epoch": 1.1804649032405816, "grad_norm": 0.23787091672420502, "learning_rate": 4.628911300749936e-05, "loss": 0.7485, "step": 5155 }, { "epoch": 1.1816099851139357, "grad_norm": 0.1994669884443283, "learning_rate": 4.6224463408326865e-05, "loss": 0.7516, "step": 5160 }, { "epoch": 1.1827550669872897, "grad_norm": 0.218184232711792, "learning_rate": 4.6159813809154386e-05, "loss": 0.7542, "step": 5165 }, { "epoch": 1.1839001488606435, "grad_norm": 0.16488322615623474, "learning_rate": 4.60951642099819e-05, "loss": 0.7472, "step": 5170 }, { "epoch": 1.1850452307339974, "grad_norm": 0.2295355200767517, "learning_rate": 4.6030514610809415e-05, "loss": 0.7485, "step": 5175 }, { "epoch": 1.1861903126073514, "grad_norm": 0.19770926237106323, "learning_rate": 4.5965865011636936e-05, "loss": 0.7515, "step": 5180 }, { "epoch": 1.1873353944807055, "grad_norm": 0.20445016026496887, "learning_rate": 4.590121541246444e-05, "loss": 0.7492, "step": 5185 }, { "epoch": 1.1884804763540593, "grad_norm": 0.23801274597644806, "learning_rate": 4.583656581329196e-05, "loss": 0.7482, "step": 5190 }, { "epoch": 1.1896255582274133, "grad_norm": 0.22532589733600616, "learning_rate": 4.577191621411948e-05, "loss": 0.7491, "step": 5195 }, { "epoch": 1.1907706401007672, "grad_norm": 0.2098410725593567, "learning_rate": 4.570726661494699e-05, "loss": 0.7498, "step": 5200 }, { "epoch": 1.1919157219741212, "grad_norm": 0.17258141934871674, "learning_rate": 4.56426170157745e-05, "loss": 0.7464, "step": 5205 }, { "epoch": 1.193060803847475, "grad_norm": 0.23425732553005219, "learning_rate": 4.557796741660202e-05, "loss": 0.7486, "step": 5210 }, { "epoch": 1.194205885720829, "grad_norm": 0.32258808612823486, "learning_rate": 4.5513317817429535e-05, "loss": 0.7517, "step": 5215 }, { "epoch": 1.195350967594183, "grad_norm": 0.29464104771614075, "learning_rate": 4.544866821825705e-05, "loss": 0.7469, "step": 5220 }, { "epoch": 1.196496049467537, "grad_norm": 0.2603786587715149, "learning_rate": 4.5384018619084564e-05, "loss": 0.7532, "step": 5225 }, { "epoch": 1.1976411313408908, "grad_norm": 0.24306389689445496, "learning_rate": 4.531936901991208e-05, "loss": 0.7481, "step": 5230 }, { "epoch": 1.1987862132142448, "grad_norm": 0.2622597813606262, "learning_rate": 4.525471942073959e-05, "loss": 0.748, "step": 5235 }, { "epoch": 1.1999312950875987, "grad_norm": 0.19283488392829895, "learning_rate": 4.519006982156711e-05, "loss": 0.7476, "step": 5240 }, { "epoch": 1.2010763769609527, "grad_norm": 0.22162917256355286, "learning_rate": 4.512542022239462e-05, "loss": 0.7489, "step": 5245 }, { "epoch": 1.2022214588343068, "grad_norm": 0.2552875280380249, "learning_rate": 4.5060770623222135e-05, "loss": 0.7471, "step": 5250 }, { "epoch": 1.2033665407076606, "grad_norm": 0.19390220940113068, "learning_rate": 4.4996121024049656e-05, "loss": 0.7454, "step": 5255 }, { "epoch": 1.2045116225810146, "grad_norm": 0.19998888671398163, "learning_rate": 4.493147142487717e-05, "loss": 0.7462, "step": 5260 }, { "epoch": 1.2056567044543685, "grad_norm": 0.14607127010822296, "learning_rate": 4.486682182570468e-05, "loss": 0.7489, "step": 5265 }, { "epoch": 1.2068017863277225, "grad_norm": 0.17341439425945282, "learning_rate": 4.48021722265322e-05, "loss": 0.7495, "step": 5270 }, { "epoch": 1.2079468682010763, "grad_norm": 0.1995754837989807, "learning_rate": 4.473752262735971e-05, "loss": 0.7482, "step": 5275 }, { "epoch": 1.2090919500744304, "grad_norm": 0.24299126863479614, "learning_rate": 4.467287302818723e-05, "loss": 0.751, "step": 5280 }, { "epoch": 1.2102370319477842, "grad_norm": 0.24882306158542633, "learning_rate": 4.460822342901475e-05, "loss": 0.7521, "step": 5285 }, { "epoch": 1.2113821138211383, "grad_norm": 0.2369246929883957, "learning_rate": 4.4543573829842255e-05, "loss": 0.749, "step": 5290 }, { "epoch": 1.212527195694492, "grad_norm": 0.16573785245418549, "learning_rate": 4.447892423066977e-05, "loss": 0.7513, "step": 5295 }, { "epoch": 1.2136722775678461, "grad_norm": 0.25582486391067505, "learning_rate": 4.441427463149729e-05, "loss": 0.7494, "step": 5300 }, { "epoch": 1.2148173594412, "grad_norm": 0.2501291036605835, "learning_rate": 4.4349625032324805e-05, "loss": 0.7454, "step": 5305 }, { "epoch": 1.215962441314554, "grad_norm": 0.25190266966819763, "learning_rate": 4.428497543315231e-05, "loss": 0.7508, "step": 5310 }, { "epoch": 1.217107523187908, "grad_norm": 0.2685193121433258, "learning_rate": 4.422032583397983e-05, "loss": 0.7515, "step": 5315 }, { "epoch": 1.2182526050612619, "grad_norm": 0.21009624004364014, "learning_rate": 4.415567623480735e-05, "loss": 0.7524, "step": 5320 }, { "epoch": 1.219397686934616, "grad_norm": 0.17537476122379303, "learning_rate": 4.409102663563486e-05, "loss": 0.7456, "step": 5325 }, { "epoch": 1.2205427688079697, "grad_norm": 0.21760603785514832, "learning_rate": 4.4026377036462376e-05, "loss": 0.7504, "step": 5330 }, { "epoch": 1.2216878506813238, "grad_norm": 0.2105751931667328, "learning_rate": 4.396172743728989e-05, "loss": 0.7458, "step": 5335 }, { "epoch": 1.2228329325546776, "grad_norm": 0.3550606667995453, "learning_rate": 4.3897077838117404e-05, "loss": 0.7498, "step": 5340 }, { "epoch": 1.2239780144280317, "grad_norm": 0.22760871052742004, "learning_rate": 4.3832428238944925e-05, "loss": 0.7445, "step": 5345 }, { "epoch": 1.2251230963013855, "grad_norm": 0.2754494249820709, "learning_rate": 4.376777863977243e-05, "loss": 0.7484, "step": 5350 }, { "epoch": 1.2262681781747395, "grad_norm": 0.3162951171398163, "learning_rate": 4.370312904059995e-05, "loss": 0.7461, "step": 5355 }, { "epoch": 1.2274132600480934, "grad_norm": 0.30421894788742065, "learning_rate": 4.363847944142747e-05, "loss": 0.7468, "step": 5360 }, { "epoch": 1.2285583419214474, "grad_norm": 0.22896705567836761, "learning_rate": 4.357382984225498e-05, "loss": 0.7485, "step": 5365 }, { "epoch": 1.2297034237948012, "grad_norm": 0.2728241980075836, "learning_rate": 4.350918024308249e-05, "loss": 0.7482, "step": 5370 }, { "epoch": 1.2308485056681553, "grad_norm": 0.18981365859508514, "learning_rate": 4.344453064391001e-05, "loss": 0.7493, "step": 5375 }, { "epoch": 1.2319935875415093, "grad_norm": 0.24496853351593018, "learning_rate": 4.3379881044737525e-05, "loss": 0.7466, "step": 5380 }, { "epoch": 1.2331386694148632, "grad_norm": 0.20402029156684875, "learning_rate": 4.331523144556504e-05, "loss": 0.7522, "step": 5385 }, { "epoch": 1.234283751288217, "grad_norm": 0.21166785061359406, "learning_rate": 4.325058184639256e-05, "loss": 0.7484, "step": 5390 }, { "epoch": 1.235428833161571, "grad_norm": 0.19772662222385406, "learning_rate": 4.318593224722007e-05, "loss": 0.7492, "step": 5395 }, { "epoch": 1.236573915034925, "grad_norm": 0.18771900236606598, "learning_rate": 4.312128264804758e-05, "loss": 0.7473, "step": 5400 }, { "epoch": 1.237718996908279, "grad_norm": 0.19199180603027344, "learning_rate": 4.30566330488751e-05, "loss": 0.7474, "step": 5405 }, { "epoch": 1.238864078781633, "grad_norm": 0.2975488603115082, "learning_rate": 4.299198344970262e-05, "loss": 0.7519, "step": 5410 }, { "epoch": 1.2400091606549868, "grad_norm": 0.28295421600341797, "learning_rate": 4.2927333850530124e-05, "loss": 0.7485, "step": 5415 }, { "epoch": 1.2411542425283408, "grad_norm": 0.18072207272052765, "learning_rate": 4.2862684251357645e-05, "loss": 0.7446, "step": 5420 }, { "epoch": 1.2422993244016947, "grad_norm": 0.17685651779174805, "learning_rate": 4.279803465218516e-05, "loss": 0.7469, "step": 5425 }, { "epoch": 1.2434444062750487, "grad_norm": 0.16913194954395294, "learning_rate": 4.2733385053012674e-05, "loss": 0.7466, "step": 5430 }, { "epoch": 1.2445894881484025, "grad_norm": 0.2549387514591217, "learning_rate": 4.266873545384019e-05, "loss": 0.7487, "step": 5435 }, { "epoch": 1.2457345700217566, "grad_norm": 0.18447519838809967, "learning_rate": 4.26040858546677e-05, "loss": 0.7464, "step": 5440 }, { "epoch": 1.2468796518951104, "grad_norm": 0.28448691964149475, "learning_rate": 4.2539436255495217e-05, "loss": 0.7527, "step": 5445 }, { "epoch": 1.2480247337684645, "grad_norm": 0.15298114717006683, "learning_rate": 4.247478665632274e-05, "loss": 0.7492, "step": 5450 }, { "epoch": 1.2491698156418183, "grad_norm": 0.24181102216243744, "learning_rate": 4.2410137057150245e-05, "loss": 0.7444, "step": 5455 }, { "epoch": 1.2503148975151723, "grad_norm": 0.2764556109905243, "learning_rate": 4.234548745797776e-05, "loss": 0.7461, "step": 5460 }, { "epoch": 1.2514599793885264, "grad_norm": 0.23078586161136627, "learning_rate": 4.228083785880528e-05, "loss": 0.7471, "step": 5465 }, { "epoch": 1.2526050612618802, "grad_norm": 0.18252263963222504, "learning_rate": 4.2216188259632794e-05, "loss": 0.7476, "step": 5470 }, { "epoch": 1.2537501431352343, "grad_norm": 0.23740258812904358, "learning_rate": 4.21515386604603e-05, "loss": 0.7491, "step": 5475 }, { "epoch": 1.254895225008588, "grad_norm": 0.22824452817440033, "learning_rate": 4.208688906128782e-05, "loss": 0.7475, "step": 5480 }, { "epoch": 1.2560403068819421, "grad_norm": 0.3317946195602417, "learning_rate": 4.202223946211534e-05, "loss": 0.7441, "step": 5485 }, { "epoch": 1.257185388755296, "grad_norm": 0.19173449277877808, "learning_rate": 4.195758986294285e-05, "loss": 0.7484, "step": 5490 }, { "epoch": 1.25833047062865, "grad_norm": 0.2722282111644745, "learning_rate": 4.189294026377037e-05, "loss": 0.7473, "step": 5495 }, { "epoch": 1.2594755525020038, "grad_norm": 0.21009880304336548, "learning_rate": 4.182829066459788e-05, "loss": 0.7481, "step": 5500 }, { "epoch": 1.2606206343753579, "grad_norm": 0.25598183274269104, "learning_rate": 4.1763641065425394e-05, "loss": 0.7466, "step": 5505 }, { "epoch": 1.261765716248712, "grad_norm": 0.17408539354801178, "learning_rate": 4.1698991466252915e-05, "loss": 0.7497, "step": 5510 }, { "epoch": 1.2629107981220657, "grad_norm": 0.1897374391555786, "learning_rate": 4.163434186708043e-05, "loss": 0.747, "step": 5515 }, { "epoch": 1.2640558799954196, "grad_norm": 0.18324466049671173, "learning_rate": 4.1569692267907937e-05, "loss": 0.7476, "step": 5520 }, { "epoch": 1.2652009618687736, "grad_norm": 0.21715322136878967, "learning_rate": 4.150504266873546e-05, "loss": 0.747, "step": 5525 }, { "epoch": 1.2663460437421277, "grad_norm": 0.21431036293506622, "learning_rate": 4.144039306956297e-05, "loss": 0.7496, "step": 5530 }, { "epoch": 1.2674911256154815, "grad_norm": 0.26471084356307983, "learning_rate": 4.1375743470390486e-05, "loss": 0.7476, "step": 5535 }, { "epoch": 1.2686362074888353, "grad_norm": 0.14690008759498596, "learning_rate": 4.1311093871218e-05, "loss": 0.7527, "step": 5540 }, { "epoch": 1.2697812893621894, "grad_norm": 0.15945012867450714, "learning_rate": 4.1246444272045514e-05, "loss": 0.7463, "step": 5545 }, { "epoch": 1.2709263712355434, "grad_norm": 0.22661566734313965, "learning_rate": 4.118179467287303e-05, "loss": 0.7481, "step": 5550 }, { "epoch": 1.2720714531088972, "grad_norm": 0.19860315322875977, "learning_rate": 4.111714507370055e-05, "loss": 0.747, "step": 5555 }, { "epoch": 1.2732165349822513, "grad_norm": 0.23950666189193726, "learning_rate": 4.105249547452806e-05, "loss": 0.7447, "step": 5560 }, { "epoch": 1.2743616168556051, "grad_norm": 0.2150084674358368, "learning_rate": 4.098784587535557e-05, "loss": 0.7505, "step": 5565 }, { "epoch": 1.2755066987289592, "grad_norm": 0.24051061272621155, "learning_rate": 4.092319627618309e-05, "loss": 0.7494, "step": 5570 }, { "epoch": 1.276651780602313, "grad_norm": 0.19743932783603668, "learning_rate": 4.0858546677010607e-05, "loss": 0.7519, "step": 5575 }, { "epoch": 1.277796862475667, "grad_norm": 0.17389178276062012, "learning_rate": 4.0793897077838114e-05, "loss": 0.7521, "step": 5580 }, { "epoch": 1.2789419443490209, "grad_norm": 0.19800211489200592, "learning_rate": 4.0729247478665635e-05, "loss": 0.7524, "step": 5585 }, { "epoch": 1.280087026222375, "grad_norm": 0.21809372305870056, "learning_rate": 4.066459787949315e-05, "loss": 0.7489, "step": 5590 }, { "epoch": 1.281232108095729, "grad_norm": 0.1831665188074112, "learning_rate": 4.0599948280320663e-05, "loss": 0.7453, "step": 5595 }, { "epoch": 1.2823771899690828, "grad_norm": 0.1886046826839447, "learning_rate": 4.053529868114818e-05, "loss": 0.7483, "step": 5600 }, { "epoch": 1.2835222718424366, "grad_norm": 0.1644008457660675, "learning_rate": 4.047064908197569e-05, "loss": 0.7474, "step": 5605 }, { "epoch": 1.2846673537157907, "grad_norm": 0.15591102838516235, "learning_rate": 4.0405999482803206e-05, "loss": 0.7478, "step": 5610 }, { "epoch": 1.2858124355891447, "grad_norm": 0.24285583198070526, "learning_rate": 4.034134988363073e-05, "loss": 0.7502, "step": 5615 }, { "epoch": 1.2869575174624985, "grad_norm": 0.19036328792572021, "learning_rate": 4.027670028445824e-05, "loss": 0.7475, "step": 5620 }, { "epoch": 1.2881025993358526, "grad_norm": 0.1598913073539734, "learning_rate": 4.021205068528575e-05, "loss": 0.7462, "step": 5625 }, { "epoch": 1.2892476812092064, "grad_norm": 0.3020450472831726, "learning_rate": 4.014740108611327e-05, "loss": 0.7477, "step": 5630 }, { "epoch": 1.2903927630825605, "grad_norm": 0.20970624685287476, "learning_rate": 4.0082751486940784e-05, "loss": 0.7497, "step": 5635 }, { "epoch": 1.2915378449559143, "grad_norm": 0.24046367406845093, "learning_rate": 4.00181018877683e-05, "loss": 0.7441, "step": 5640 }, { "epoch": 1.2926829268292683, "grad_norm": 0.22961650788784027, "learning_rate": 3.995345228859581e-05, "loss": 0.7448, "step": 5645 }, { "epoch": 1.2938280087026222, "grad_norm": 0.1839102804660797, "learning_rate": 3.988880268942333e-05, "loss": 0.7464, "step": 5650 }, { "epoch": 1.2949730905759762, "grad_norm": 0.1463252604007721, "learning_rate": 3.982415309025084e-05, "loss": 0.7461, "step": 5655 }, { "epoch": 1.2961181724493303, "grad_norm": 0.22954629361629486, "learning_rate": 3.975950349107836e-05, "loss": 0.7458, "step": 5660 }, { "epoch": 1.297263254322684, "grad_norm": 0.17761455476284027, "learning_rate": 3.969485389190587e-05, "loss": 0.7475, "step": 5665 }, { "epoch": 1.298408336196038, "grad_norm": 0.21988537907600403, "learning_rate": 3.9630204292733384e-05, "loss": 0.7476, "step": 5670 }, { "epoch": 1.299553418069392, "grad_norm": 0.2295353263616562, "learning_rate": 3.9565554693560905e-05, "loss": 0.7453, "step": 5675 }, { "epoch": 1.300698499942746, "grad_norm": 0.1871115267276764, "learning_rate": 3.950090509438842e-05, "loss": 0.7477, "step": 5680 }, { "epoch": 1.3018435818160998, "grad_norm": 0.2102229744195938, "learning_rate": 3.9436255495215926e-05, "loss": 0.7469, "step": 5685 }, { "epoch": 1.3029886636894539, "grad_norm": 0.2002745121717453, "learning_rate": 3.937160589604345e-05, "loss": 0.751, "step": 5690 }, { "epoch": 1.3041337455628077, "grad_norm": 0.23087018728256226, "learning_rate": 3.930695629687096e-05, "loss": 0.7454, "step": 5695 }, { "epoch": 1.3052788274361617, "grad_norm": 0.23787514865398407, "learning_rate": 3.9242306697698476e-05, "loss": 0.7476, "step": 5700 }, { "epoch": 1.3064239093095156, "grad_norm": 0.17557412385940552, "learning_rate": 3.917765709852599e-05, "loss": 0.7471, "step": 5705 }, { "epoch": 1.3075689911828696, "grad_norm": 0.1856442093849182, "learning_rate": 3.9113007499353504e-05, "loss": 0.748, "step": 5710 }, { "epoch": 1.3087140730562234, "grad_norm": 0.242033913731575, "learning_rate": 3.904835790018102e-05, "loss": 0.7502, "step": 5715 }, { "epoch": 1.3098591549295775, "grad_norm": 0.19342093169689178, "learning_rate": 3.898370830100854e-05, "loss": 0.7477, "step": 5720 }, { "epoch": 1.3110042368029315, "grad_norm": 0.2315911203622818, "learning_rate": 3.8919058701836053e-05, "loss": 0.746, "step": 5725 }, { "epoch": 1.3121493186762854, "grad_norm": 0.23462465405464172, "learning_rate": 3.885440910266356e-05, "loss": 0.7466, "step": 5730 }, { "epoch": 1.3132944005496392, "grad_norm": 0.20711112022399902, "learning_rate": 3.878975950349108e-05, "loss": 0.7495, "step": 5735 }, { "epoch": 1.3144394824229932, "grad_norm": 0.21462731063365936, "learning_rate": 3.8725109904318596e-05, "loss": 0.7484, "step": 5740 }, { "epoch": 1.3155845642963473, "grad_norm": 0.1888807862997055, "learning_rate": 3.866046030514611e-05, "loss": 0.7455, "step": 5745 }, { "epoch": 1.3167296461697011, "grad_norm": 0.19394944608211517, "learning_rate": 3.8595810705973625e-05, "loss": 0.749, "step": 5750 }, { "epoch": 1.317874728043055, "grad_norm": 0.1983867883682251, "learning_rate": 3.853116110680114e-05, "loss": 0.7463, "step": 5755 }, { "epoch": 1.319019809916409, "grad_norm": 0.1493363082408905, "learning_rate": 3.846651150762865e-05, "loss": 0.7472, "step": 5760 }, { "epoch": 1.320164891789763, "grad_norm": 0.1743180751800537, "learning_rate": 3.8401861908456174e-05, "loss": 0.7462, "step": 5765 }, { "epoch": 1.3213099736631169, "grad_norm": 0.2253560572862625, "learning_rate": 3.833721230928368e-05, "loss": 0.7438, "step": 5770 }, { "epoch": 1.322455055536471, "grad_norm": 0.19323718547821045, "learning_rate": 3.8272562710111196e-05, "loss": 0.7486, "step": 5775 }, { "epoch": 1.3236001374098247, "grad_norm": 0.1969655156135559, "learning_rate": 3.820791311093872e-05, "loss": 0.7467, "step": 5780 }, { "epoch": 1.3247452192831788, "grad_norm": 0.18611446022987366, "learning_rate": 3.814326351176623e-05, "loss": 0.7459, "step": 5785 }, { "epoch": 1.3258903011565326, "grad_norm": 0.25435754656791687, "learning_rate": 3.807861391259374e-05, "loss": 0.7469, "step": 5790 }, { "epoch": 1.3270353830298867, "grad_norm": 0.2215898036956787, "learning_rate": 3.801396431342126e-05, "loss": 0.7477, "step": 5795 }, { "epoch": 1.3281804649032405, "grad_norm": 0.16783438622951508, "learning_rate": 3.7949314714248774e-05, "loss": 0.7452, "step": 5800 }, { "epoch": 1.3293255467765945, "grad_norm": 0.2280804067850113, "learning_rate": 3.788466511507629e-05, "loss": 0.7459, "step": 5805 }, { "epoch": 1.3304706286499486, "grad_norm": 0.1959756463766098, "learning_rate": 3.78200155159038e-05, "loss": 0.7471, "step": 5810 }, { "epoch": 1.3316157105233024, "grad_norm": 0.28425371646881104, "learning_rate": 3.7755365916731316e-05, "loss": 0.7485, "step": 5815 }, { "epoch": 1.3327607923966562, "grad_norm": 0.1959027647972107, "learning_rate": 3.769071631755883e-05, "loss": 0.7489, "step": 5820 }, { "epoch": 1.3339058742700103, "grad_norm": 0.2708321809768677, "learning_rate": 3.762606671838635e-05, "loss": 0.7471, "step": 5825 }, { "epoch": 1.3350509561433643, "grad_norm": 0.19583362340927124, "learning_rate": 3.7561417119213866e-05, "loss": 0.7453, "step": 5830 }, { "epoch": 1.3361960380167182, "grad_norm": 0.14683350920677185, "learning_rate": 3.749676752004137e-05, "loss": 0.7472, "step": 5835 }, { "epoch": 1.3373411198900722, "grad_norm": 0.24646610021591187, "learning_rate": 3.7432117920868894e-05, "loss": 0.7475, "step": 5840 }, { "epoch": 1.338486201763426, "grad_norm": 0.15096144378185272, "learning_rate": 3.736746832169641e-05, "loss": 0.7479, "step": 5845 }, { "epoch": 1.33963128363678, "grad_norm": 0.14911700785160065, "learning_rate": 3.730281872252392e-05, "loss": 0.7497, "step": 5850 }, { "epoch": 1.340776365510134, "grad_norm": 0.2568439245223999, "learning_rate": 3.723816912335144e-05, "loss": 0.7464, "step": 5855 }, { "epoch": 1.341921447383488, "grad_norm": 0.25193852186203003, "learning_rate": 3.717351952417895e-05, "loss": 0.7458, "step": 5860 }, { "epoch": 1.3430665292568418, "grad_norm": 0.1714603453874588, "learning_rate": 3.7108869925006465e-05, "loss": 0.747, "step": 5865 }, { "epoch": 1.3442116111301958, "grad_norm": 0.2510092854499817, "learning_rate": 3.7044220325833986e-05, "loss": 0.747, "step": 5870 }, { "epoch": 1.3453566930035499, "grad_norm": 0.20950113236904144, "learning_rate": 3.6979570726661494e-05, "loss": 0.7479, "step": 5875 }, { "epoch": 1.3465017748769037, "grad_norm": 0.17263996601104736, "learning_rate": 3.691492112748901e-05, "loss": 0.7462, "step": 5880 }, { "epoch": 1.3476468567502575, "grad_norm": 0.25733307003974915, "learning_rate": 3.685027152831653e-05, "loss": 0.7486, "step": 5885 }, { "epoch": 1.3487919386236116, "grad_norm": 0.23318397998809814, "learning_rate": 3.678562192914404e-05, "loss": 0.7503, "step": 5890 }, { "epoch": 1.3499370204969656, "grad_norm": 0.19126096367835999, "learning_rate": 3.672097232997155e-05, "loss": 0.7425, "step": 5895 }, { "epoch": 1.3510821023703194, "grad_norm": 0.2814708650112152, "learning_rate": 3.665632273079907e-05, "loss": 0.7476, "step": 5900 }, { "epoch": 1.3522271842436735, "grad_norm": 0.23234932124614716, "learning_rate": 3.6591673131626586e-05, "loss": 0.7485, "step": 5905 }, { "epoch": 1.3533722661170273, "grad_norm": 0.34967902302742004, "learning_rate": 3.65270235324541e-05, "loss": 0.7484, "step": 5910 }, { "epoch": 1.3545173479903814, "grad_norm": 0.17776401340961456, "learning_rate": 3.6462373933281614e-05, "loss": 0.7476, "step": 5915 }, { "epoch": 1.3556624298637352, "grad_norm": 0.20090113580226898, "learning_rate": 3.639772433410913e-05, "loss": 0.7461, "step": 5920 }, { "epoch": 1.3568075117370892, "grad_norm": 0.27608180046081543, "learning_rate": 3.633307473493664e-05, "loss": 0.7471, "step": 5925 }, { "epoch": 1.357952593610443, "grad_norm": 0.18629661202430725, "learning_rate": 3.6268425135764164e-05, "loss": 0.7479, "step": 5930 }, { "epoch": 1.3590976754837971, "grad_norm": 0.20956484973430634, "learning_rate": 3.620377553659168e-05, "loss": 0.7487, "step": 5935 }, { "epoch": 1.3602427573571512, "grad_norm": 0.2737008035182953, "learning_rate": 3.6139125937419185e-05, "loss": 0.7436, "step": 5940 }, { "epoch": 1.361387839230505, "grad_norm": 0.16381745040416718, "learning_rate": 3.6074476338246706e-05, "loss": 0.7469, "step": 5945 }, { "epoch": 1.3625329211038588, "grad_norm": 0.24451714754104614, "learning_rate": 3.600982673907422e-05, "loss": 0.748, "step": 5950 }, { "epoch": 1.3636780029772129, "grad_norm": 0.34765592217445374, "learning_rate": 3.5945177139901735e-05, "loss": 0.7451, "step": 5955 }, { "epoch": 1.364823084850567, "grad_norm": 0.1471840888261795, "learning_rate": 3.588052754072925e-05, "loss": 0.7486, "step": 5960 }, { "epoch": 1.3659681667239207, "grad_norm": 0.23996858298778534, "learning_rate": 3.581587794155676e-05, "loss": 0.7492, "step": 5965 }, { "epoch": 1.3671132485972748, "grad_norm": 0.22462098300457, "learning_rate": 3.575122834238428e-05, "loss": 0.7446, "step": 5970 }, { "epoch": 1.3682583304706286, "grad_norm": 0.22103479504585266, "learning_rate": 3.56865787432118e-05, "loss": 0.7508, "step": 5975 }, { "epoch": 1.3694034123439827, "grad_norm": 0.22594700753688812, "learning_rate": 3.5621929144039306e-05, "loss": 0.7501, "step": 5980 }, { "epoch": 1.3705484942173365, "grad_norm": 0.15880168974399567, "learning_rate": 3.555727954486682e-05, "loss": 0.7407, "step": 5985 }, { "epoch": 1.3716935760906905, "grad_norm": 0.18890678882598877, "learning_rate": 3.549262994569434e-05, "loss": 0.7498, "step": 5990 }, { "epoch": 1.3728386579640444, "grad_norm": 0.2300521731376648, "learning_rate": 3.5427980346521855e-05, "loss": 0.7429, "step": 5995 }, { "epoch": 1.3739837398373984, "grad_norm": 0.2027832716703415, "learning_rate": 3.536333074734936e-05, "loss": 0.7482, "step": 6000 }, { "epoch": 1.3751288217107522, "grad_norm": 0.1609947830438614, "learning_rate": 3.5298681148176884e-05, "loss": 0.7469, "step": 6005 }, { "epoch": 1.3762739035841063, "grad_norm": 0.1555599868297577, "learning_rate": 3.52340315490044e-05, "loss": 0.7448, "step": 6010 }, { "epoch": 1.37741898545746, "grad_norm": 0.19723734259605408, "learning_rate": 3.516938194983191e-05, "loss": 0.7445, "step": 6015 }, { "epoch": 1.3785640673308142, "grad_norm": 0.23402778804302216, "learning_rate": 3.5104732350659426e-05, "loss": 0.7437, "step": 6020 }, { "epoch": 1.3797091492041682, "grad_norm": 0.2760639488697052, "learning_rate": 3.504008275148694e-05, "loss": 0.7429, "step": 6025 }, { "epoch": 1.380854231077522, "grad_norm": 0.18020078539848328, "learning_rate": 3.4975433152314455e-05, "loss": 0.7464, "step": 6030 }, { "epoch": 1.3819993129508759, "grad_norm": 0.19109944999217987, "learning_rate": 3.4910783553141976e-05, "loss": 0.7438, "step": 6035 }, { "epoch": 1.38314439482423, "grad_norm": 0.2158491015434265, "learning_rate": 3.484613395396949e-05, "loss": 0.7462, "step": 6040 }, { "epoch": 1.384289476697584, "grad_norm": 0.2594541311264038, "learning_rate": 3.4781484354797e-05, "loss": 0.7483, "step": 6045 }, { "epoch": 1.3854345585709378, "grad_norm": 0.1272057443857193, "learning_rate": 3.471683475562452e-05, "loss": 0.7449, "step": 6050 }, { "epoch": 1.3865796404442918, "grad_norm": 0.1897531896829605, "learning_rate": 3.465218515645203e-05, "loss": 0.7491, "step": 6055 }, { "epoch": 1.3877247223176457, "grad_norm": 0.1548863649368286, "learning_rate": 3.458753555727955e-05, "loss": 0.7439, "step": 6060 }, { "epoch": 1.3888698041909997, "grad_norm": 0.16846147179603577, "learning_rate": 3.452288595810706e-05, "loss": 0.7437, "step": 6065 }, { "epoch": 1.3900148860643535, "grad_norm": 0.1798481047153473, "learning_rate": 3.4458236358934575e-05, "loss": 0.7464, "step": 6070 }, { "epoch": 1.3911599679377076, "grad_norm": 0.31617963314056396, "learning_rate": 3.439358675976209e-05, "loss": 0.7442, "step": 6075 }, { "epoch": 1.3923050498110614, "grad_norm": 0.17033277451992035, "learning_rate": 3.432893716058961e-05, "loss": 0.7494, "step": 6080 }, { "epoch": 1.3934501316844154, "grad_norm": 0.30593806505203247, "learning_rate": 3.426428756141712e-05, "loss": 0.744, "step": 6085 }, { "epoch": 1.3945952135577695, "grad_norm": 0.18552587926387787, "learning_rate": 3.419963796224463e-05, "loss": 0.7489, "step": 6090 }, { "epoch": 1.3957402954311233, "grad_norm": 0.15603308379650116, "learning_rate": 3.413498836307215e-05, "loss": 0.7412, "step": 6095 }, { "epoch": 1.3968853773044771, "grad_norm": 0.20843032002449036, "learning_rate": 3.407033876389967e-05, "loss": 0.7417, "step": 6100 }, { "epoch": 1.3980304591778312, "grad_norm": 0.20933730900287628, "learning_rate": 3.4005689164727175e-05, "loss": 0.7426, "step": 6105 }, { "epoch": 1.3991755410511852, "grad_norm": 0.17963987588882446, "learning_rate": 3.3941039565554696e-05, "loss": 0.747, "step": 6110 }, { "epoch": 1.400320622924539, "grad_norm": 0.22531652450561523, "learning_rate": 3.387638996638221e-05, "loss": 0.7469, "step": 6115 }, { "epoch": 1.4014657047978931, "grad_norm": 0.2187807708978653, "learning_rate": 3.3811740367209724e-05, "loss": 0.7439, "step": 6120 }, { "epoch": 1.402610786671247, "grad_norm": 0.21380850672721863, "learning_rate": 3.374709076803724e-05, "loss": 0.7443, "step": 6125 }, { "epoch": 1.403755868544601, "grad_norm": 0.22001244127750397, "learning_rate": 3.368244116886475e-05, "loss": 0.7509, "step": 6130 }, { "epoch": 1.4049009504179548, "grad_norm": 0.17430852353572845, "learning_rate": 3.361779156969227e-05, "loss": 0.7496, "step": 6135 }, { "epoch": 1.4060460322913089, "grad_norm": 0.2584846317768097, "learning_rate": 3.355314197051979e-05, "loss": 0.7483, "step": 6140 }, { "epoch": 1.4071911141646627, "grad_norm": 0.15371204912662506, "learning_rate": 3.34884923713473e-05, "loss": 0.7454, "step": 6145 }, { "epoch": 1.4083361960380167, "grad_norm": 0.24281756579875946, "learning_rate": 3.342384277217481e-05, "loss": 0.7459, "step": 6150 }, { "epoch": 1.4094812779113708, "grad_norm": 0.18498913943767548, "learning_rate": 3.335919317300233e-05, "loss": 0.7458, "step": 6155 }, { "epoch": 1.4106263597847246, "grad_norm": 0.19410498440265656, "learning_rate": 3.3294543573829845e-05, "loss": 0.7465, "step": 6160 }, { "epoch": 1.4117714416580784, "grad_norm": 0.1695197969675064, "learning_rate": 3.322989397465736e-05, "loss": 0.7485, "step": 6165 }, { "epoch": 1.4129165235314325, "grad_norm": 0.1701662689447403, "learning_rate": 3.316524437548487e-05, "loss": 0.7511, "step": 6170 }, { "epoch": 1.4140616054047865, "grad_norm": 0.1659391224384308, "learning_rate": 3.310059477631239e-05, "loss": 0.7421, "step": 6175 }, { "epoch": 1.4152066872781404, "grad_norm": 0.17819622159004211, "learning_rate": 3.30359451771399e-05, "loss": 0.7465, "step": 6180 }, { "epoch": 1.4163517691514944, "grad_norm": 0.2059786170721054, "learning_rate": 3.297129557796742e-05, "loss": 0.7437, "step": 6185 }, { "epoch": 1.4174968510248482, "grad_norm": 0.24188071489334106, "learning_rate": 3.290664597879493e-05, "loss": 0.7426, "step": 6190 }, { "epoch": 1.4186419328982023, "grad_norm": 0.2378331571817398, "learning_rate": 3.2841996379622444e-05, "loss": 0.7468, "step": 6195 }, { "epoch": 1.419787014771556, "grad_norm": 0.2121928632259369, "learning_rate": 3.2777346780449965e-05, "loss": 0.7437, "step": 6200 }, { "epoch": 1.4209320966449102, "grad_norm": 0.18138332664966583, "learning_rate": 3.271269718127748e-05, "loss": 0.7473, "step": 6205 }, { "epoch": 1.422077178518264, "grad_norm": 0.18859916925430298, "learning_rate": 3.264804758210499e-05, "loss": 0.7499, "step": 6210 }, { "epoch": 1.423222260391618, "grad_norm": 0.250404417514801, "learning_rate": 3.258339798293251e-05, "loss": 0.7473, "step": 6215 }, { "epoch": 1.424367342264972, "grad_norm": 0.26826900243759155, "learning_rate": 3.251874838376002e-05, "loss": 0.7442, "step": 6220 }, { "epoch": 1.425512424138326, "grad_norm": 0.20250312983989716, "learning_rate": 3.2454098784587536e-05, "loss": 0.7453, "step": 6225 }, { "epoch": 1.4266575060116797, "grad_norm": 0.18097352981567383, "learning_rate": 3.238944918541505e-05, "loss": 0.7525, "step": 6230 }, { "epoch": 1.4278025878850338, "grad_norm": 0.19572164118289948, "learning_rate": 3.2324799586242565e-05, "loss": 0.748, "step": 6235 }, { "epoch": 1.4289476697583878, "grad_norm": 0.2311696708202362, "learning_rate": 3.226014998707008e-05, "loss": 0.7479, "step": 6240 }, { "epoch": 1.4300927516317417, "grad_norm": 0.22108034789562225, "learning_rate": 3.21955003878976e-05, "loss": 0.7435, "step": 6245 }, { "epoch": 1.4312378335050955, "grad_norm": 0.19984404742717743, "learning_rate": 3.213085078872511e-05, "loss": 0.7446, "step": 6250 }, { "epoch": 1.4323829153784495, "grad_norm": 0.17798064649105072, "learning_rate": 3.206620118955262e-05, "loss": 0.7499, "step": 6255 }, { "epoch": 1.4335279972518036, "grad_norm": 0.19513164460659027, "learning_rate": 3.200155159038014e-05, "loss": 0.7478, "step": 6260 }, { "epoch": 1.4346730791251574, "grad_norm": 0.21562044322490692, "learning_rate": 3.193690199120766e-05, "loss": 0.7471, "step": 6265 }, { "epoch": 1.4358181609985115, "grad_norm": 0.16316361725330353, "learning_rate": 3.187225239203517e-05, "loss": 0.7473, "step": 6270 }, { "epoch": 1.4369632428718653, "grad_norm": 0.17712213099002838, "learning_rate": 3.1807602792862685e-05, "loss": 0.7458, "step": 6275 }, { "epoch": 1.4381083247452193, "grad_norm": 0.18155217170715332, "learning_rate": 3.17429531936902e-05, "loss": 0.7513, "step": 6280 }, { "epoch": 1.4392534066185732, "grad_norm": 0.18960131704807281, "learning_rate": 3.1678303594517714e-05, "loss": 0.7469, "step": 6285 }, { "epoch": 1.4403984884919272, "grad_norm": 0.15972264111042023, "learning_rate": 3.1613653995345235e-05, "loss": 0.7422, "step": 6290 }, { "epoch": 1.441543570365281, "grad_norm": 0.23653262853622437, "learning_rate": 3.154900439617274e-05, "loss": 0.7424, "step": 6295 }, { "epoch": 1.442688652238635, "grad_norm": 0.22679731249809265, "learning_rate": 3.1484354797000257e-05, "loss": 0.7452, "step": 6300 }, { "epoch": 1.4438337341119891, "grad_norm": 0.25342831015586853, "learning_rate": 3.141970519782778e-05, "loss": 0.7401, "step": 6305 }, { "epoch": 1.444978815985343, "grad_norm": 0.21492965519428253, "learning_rate": 3.135505559865529e-05, "loss": 0.7443, "step": 6310 }, { "epoch": 1.4461238978586968, "grad_norm": 0.19540521502494812, "learning_rate": 3.12904059994828e-05, "loss": 0.7442, "step": 6315 }, { "epoch": 1.4472689797320508, "grad_norm": 0.28760331869125366, "learning_rate": 3.122575640031032e-05, "loss": 0.7408, "step": 6320 }, { "epoch": 1.4484140616054049, "grad_norm": 0.1781582087278366, "learning_rate": 3.1161106801137834e-05, "loss": 0.7459, "step": 6325 }, { "epoch": 1.4495591434787587, "grad_norm": 0.21897757053375244, "learning_rate": 3.109645720196535e-05, "loss": 0.7438, "step": 6330 }, { "epoch": 1.4507042253521127, "grad_norm": 0.3101745843887329, "learning_rate": 3.103180760279286e-05, "loss": 0.7477, "step": 6335 }, { "epoch": 1.4518493072254666, "grad_norm": 0.18157465755939484, "learning_rate": 3.096715800362038e-05, "loss": 0.7497, "step": 6340 }, { "epoch": 1.4529943890988206, "grad_norm": 0.2329101860523224, "learning_rate": 3.090250840444789e-05, "loss": 0.7449, "step": 6345 }, { "epoch": 1.4541394709721744, "grad_norm": 0.17827336490154266, "learning_rate": 3.083785880527541e-05, "loss": 0.745, "step": 6350 }, { "epoch": 1.4552845528455285, "grad_norm": 0.22005167603492737, "learning_rate": 3.077320920610292e-05, "loss": 0.7458, "step": 6355 }, { "epoch": 1.4564296347188823, "grad_norm": 0.17929284274578094, "learning_rate": 3.0708559606930434e-05, "loss": 0.7451, "step": 6360 }, { "epoch": 1.4575747165922364, "grad_norm": 0.20535363256931305, "learning_rate": 3.0643910007757955e-05, "loss": 0.7398, "step": 6365 }, { "epoch": 1.4587197984655904, "grad_norm": 0.19280757009983063, "learning_rate": 3.057926040858547e-05, "loss": 0.7476, "step": 6370 }, { "epoch": 1.4598648803389442, "grad_norm": 0.19142794609069824, "learning_rate": 3.0514610809412987e-05, "loss": 0.7427, "step": 6375 }, { "epoch": 1.461009962212298, "grad_norm": 0.2704218626022339, "learning_rate": 3.0449961210240498e-05, "loss": 0.7455, "step": 6380 }, { "epoch": 1.4621550440856521, "grad_norm": 0.21759334206581116, "learning_rate": 3.0385311611068012e-05, "loss": 0.7429, "step": 6385 }, { "epoch": 1.4633001259590062, "grad_norm": 0.1478096842765808, "learning_rate": 3.032066201189553e-05, "loss": 0.7422, "step": 6390 }, { "epoch": 1.46444520783236, "grad_norm": 0.19636553525924683, "learning_rate": 3.0256012412723044e-05, "loss": 0.7478, "step": 6395 }, { "epoch": 1.465590289705714, "grad_norm": 0.1897573620080948, "learning_rate": 3.0191362813550554e-05, "loss": 0.7432, "step": 6400 }, { "epoch": 1.4667353715790679, "grad_norm": 0.17960022389888763, "learning_rate": 3.0126713214378072e-05, "loss": 0.746, "step": 6405 }, { "epoch": 1.467880453452422, "grad_norm": 0.18701818585395813, "learning_rate": 3.0062063615205586e-05, "loss": 0.7419, "step": 6410 }, { "epoch": 1.4690255353257757, "grad_norm": 0.1686282753944397, "learning_rate": 2.9997414016033104e-05, "loss": 0.743, "step": 6415 }, { "epoch": 1.4701706171991298, "grad_norm": 0.2564958333969116, "learning_rate": 2.9932764416860615e-05, "loss": 0.7475, "step": 6420 }, { "epoch": 1.4713156990724836, "grad_norm": 0.25114843249320984, "learning_rate": 2.9868114817688132e-05, "loss": 0.7476, "step": 6425 }, { "epoch": 1.4724607809458377, "grad_norm": 0.16929112374782562, "learning_rate": 2.9803465218515647e-05, "loss": 0.7429, "step": 6430 }, { "epoch": 1.4736058628191917, "grad_norm": 0.25373727083206177, "learning_rate": 2.9738815619343164e-05, "loss": 0.7429, "step": 6435 }, { "epoch": 1.4747509446925455, "grad_norm": 0.15113116800785065, "learning_rate": 2.9674166020170675e-05, "loss": 0.7416, "step": 6440 }, { "epoch": 1.4758960265658994, "grad_norm": 0.2088536024093628, "learning_rate": 2.960951642099819e-05, "loss": 0.7457, "step": 6445 }, { "epoch": 1.4770411084392534, "grad_norm": 0.16958044469356537, "learning_rate": 2.9544866821825707e-05, "loss": 0.7446, "step": 6450 }, { "epoch": 1.4781861903126075, "grad_norm": 0.2599340081214905, "learning_rate": 2.948021722265322e-05, "loss": 0.7422, "step": 6455 }, { "epoch": 1.4793312721859613, "grad_norm": 0.1796865016222, "learning_rate": 2.9415567623480732e-05, "loss": 0.7466, "step": 6460 }, { "epoch": 1.480476354059315, "grad_norm": 0.265708863735199, "learning_rate": 2.935091802430825e-05, "loss": 0.7434, "step": 6465 }, { "epoch": 1.4816214359326692, "grad_norm": 0.2140311598777771, "learning_rate": 2.9286268425135767e-05, "loss": 0.7485, "step": 6470 }, { "epoch": 1.4827665178060232, "grad_norm": 0.1847228854894638, "learning_rate": 2.922161882596328e-05, "loss": 0.743, "step": 6475 }, { "epoch": 1.483911599679377, "grad_norm": 0.24625612795352936, "learning_rate": 2.91569692267908e-05, "loss": 0.7481, "step": 6480 }, { "epoch": 1.485056681552731, "grad_norm": 0.199641615152359, "learning_rate": 2.909231962761831e-05, "loss": 0.7463, "step": 6485 }, { "epoch": 1.486201763426085, "grad_norm": 0.20555761456489563, "learning_rate": 2.9027670028445824e-05, "loss": 0.7499, "step": 6490 }, { "epoch": 1.487346845299439, "grad_norm": 0.1842489093542099, "learning_rate": 2.896302042927334e-05, "loss": 0.7497, "step": 6495 }, { "epoch": 1.4884919271727928, "grad_norm": 0.24474182724952698, "learning_rate": 2.8898370830100856e-05, "loss": 0.7429, "step": 6500 }, { "epoch": 1.4896370090461468, "grad_norm": 0.2305942177772522, "learning_rate": 2.8833721230928367e-05, "loss": 0.7456, "step": 6505 }, { "epoch": 1.4907820909195006, "grad_norm": 0.19151118397712708, "learning_rate": 2.8769071631755884e-05, "loss": 0.7469, "step": 6510 }, { "epoch": 1.4919271727928547, "grad_norm": 0.17481862008571625, "learning_rate": 2.87044220325834e-05, "loss": 0.7435, "step": 6515 }, { "epoch": 1.4930722546662087, "grad_norm": 0.1783713400363922, "learning_rate": 2.8639772433410916e-05, "loss": 0.7466, "step": 6520 }, { "epoch": 1.4942173365395626, "grad_norm": 0.18017755448818207, "learning_rate": 2.8575122834238427e-05, "loss": 0.7424, "step": 6525 }, { "epoch": 1.4953624184129164, "grad_norm": 0.25057896971702576, "learning_rate": 2.8510473235065945e-05, "loss": 0.7395, "step": 6530 }, { "epoch": 1.4965075002862704, "grad_norm": 0.25676342844963074, "learning_rate": 2.844582363589346e-05, "loss": 0.7402, "step": 6535 }, { "epoch": 1.4976525821596245, "grad_norm": 0.17843708395957947, "learning_rate": 2.8381174036720976e-05, "loss": 0.7485, "step": 6540 }, { "epoch": 1.4987976640329783, "grad_norm": 0.19747896492481232, "learning_rate": 2.8316524437548487e-05, "loss": 0.7466, "step": 6545 }, { "epoch": 1.4999427459063324, "grad_norm": 0.24598313868045807, "learning_rate": 2.8251874838376e-05, "loss": 0.7434, "step": 6550 }, { "epoch": 1.5010878277796862, "grad_norm": 0.1990305483341217, "learning_rate": 2.818722523920352e-05, "loss": 0.7455, "step": 6555 }, { "epoch": 1.5022329096530402, "grad_norm": 0.20389562845230103, "learning_rate": 2.8122575640031033e-05, "loss": 0.7472, "step": 6560 }, { "epoch": 1.5033779915263943, "grad_norm": 0.18377013504505157, "learning_rate": 2.8057926040858544e-05, "loss": 0.7445, "step": 6565 }, { "epoch": 1.5045230733997481, "grad_norm": 0.17699111998081207, "learning_rate": 2.799327644168606e-05, "loss": 0.7433, "step": 6570 }, { "epoch": 1.505668155273102, "grad_norm": 0.1928340494632721, "learning_rate": 2.792862684251358e-05, "loss": 0.7442, "step": 6575 }, { "epoch": 1.506813237146456, "grad_norm": 0.1634168028831482, "learning_rate": 2.7863977243341093e-05, "loss": 0.7422, "step": 6580 }, { "epoch": 1.50795831901981, "grad_norm": 0.1806732714176178, "learning_rate": 2.779932764416861e-05, "loss": 0.7436, "step": 6585 }, { "epoch": 1.5091034008931639, "grad_norm": 0.17415916919708252, "learning_rate": 2.7734678044996122e-05, "loss": 0.7457, "step": 6590 }, { "epoch": 1.5102484827665177, "grad_norm": 0.15790635347366333, "learning_rate": 2.7670028445823636e-05, "loss": 0.7445, "step": 6595 }, { "epoch": 1.5113935646398717, "grad_norm": 0.26694029569625854, "learning_rate": 2.7605378846651154e-05, "loss": 0.7469, "step": 6600 }, { "epoch": 1.5125386465132258, "grad_norm": 0.2555330991744995, "learning_rate": 2.7540729247478668e-05, "loss": 0.7449, "step": 6605 }, { "epoch": 1.5136837283865796, "grad_norm": 0.1820402443408966, "learning_rate": 2.747607964830618e-05, "loss": 0.7433, "step": 6610 }, { "epoch": 1.5148288102599334, "grad_norm": 0.17936120927333832, "learning_rate": 2.7411430049133696e-05, "loss": 0.7464, "step": 6615 }, { "epoch": 1.5159738921332875, "grad_norm": 0.17956532537937164, "learning_rate": 2.7346780449961214e-05, "loss": 0.7438, "step": 6620 }, { "epoch": 1.5171189740066415, "grad_norm": 0.23218995332717896, "learning_rate": 2.7282130850788728e-05, "loss": 0.7451, "step": 6625 }, { "epoch": 1.5182640558799956, "grad_norm": 0.21373704075813293, "learning_rate": 2.721748125161624e-05, "loss": 0.7436, "step": 6630 }, { "epoch": 1.5194091377533494, "grad_norm": 0.17974428832530975, "learning_rate": 2.7152831652443757e-05, "loss": 0.7414, "step": 6635 }, { "epoch": 1.5205542196267032, "grad_norm": 0.16060051321983337, "learning_rate": 2.708818205327127e-05, "loss": 0.7446, "step": 6640 }, { "epoch": 1.5216993015000573, "grad_norm": 0.21203061938285828, "learning_rate": 2.702353245409879e-05, "loss": 0.7418, "step": 6645 }, { "epoch": 1.5228443833734113, "grad_norm": 0.16728591918945312, "learning_rate": 2.69588828549263e-05, "loss": 0.7469, "step": 6650 }, { "epoch": 1.5239894652467652, "grad_norm": 0.1729734241962433, "learning_rate": 2.6894233255753814e-05, "loss": 0.7429, "step": 6655 }, { "epoch": 1.525134547120119, "grad_norm": 0.1781495064496994, "learning_rate": 2.682958365658133e-05, "loss": 0.7416, "step": 6660 }, { "epoch": 1.526279628993473, "grad_norm": 0.17667962610721588, "learning_rate": 2.6764934057408845e-05, "loss": 0.743, "step": 6665 }, { "epoch": 1.527424710866827, "grad_norm": 0.20562386512756348, "learning_rate": 2.6700284458236356e-05, "loss": 0.7439, "step": 6670 }, { "epoch": 1.528569792740181, "grad_norm": 0.22437416017055511, "learning_rate": 2.6635634859063874e-05, "loss": 0.7436, "step": 6675 }, { "epoch": 1.5297148746135347, "grad_norm": 0.20179647207260132, "learning_rate": 2.657098525989139e-05, "loss": 0.7446, "step": 6680 }, { "epoch": 1.5308599564868888, "grad_norm": 0.2053723931312561, "learning_rate": 2.6506335660718906e-05, "loss": 0.7424, "step": 6685 }, { "epoch": 1.5320050383602428, "grad_norm": 0.2075263410806656, "learning_rate": 2.6441686061546423e-05, "loss": 0.7439, "step": 6690 }, { "epoch": 1.5331501202335966, "grad_norm": 0.2276289463043213, "learning_rate": 2.6377036462373934e-05, "loss": 0.7428, "step": 6695 }, { "epoch": 1.5342952021069507, "grad_norm": 0.2265288233757019, "learning_rate": 2.631238686320145e-05, "loss": 0.7437, "step": 6700 }, { "epoch": 1.5354402839803045, "grad_norm": 0.17774851620197296, "learning_rate": 2.6247737264028966e-05, "loss": 0.7469, "step": 6705 }, { "epoch": 1.5365853658536586, "grad_norm": 0.1712876409292221, "learning_rate": 2.618308766485648e-05, "loss": 0.7398, "step": 6710 }, { "epoch": 1.5377304477270126, "grad_norm": 0.16502240300178528, "learning_rate": 2.611843806568399e-05, "loss": 0.7486, "step": 6715 }, { "epoch": 1.5388755296003664, "grad_norm": 0.17795094847679138, "learning_rate": 2.605378846651151e-05, "loss": 0.7474, "step": 6720 }, { "epoch": 1.5400206114737203, "grad_norm": 0.16823707520961761, "learning_rate": 2.5989138867339026e-05, "loss": 0.7411, "step": 6725 }, { "epoch": 1.5411656933470743, "grad_norm": 0.16455113887786865, "learning_rate": 2.592448926816654e-05, "loss": 0.7409, "step": 6730 }, { "epoch": 1.5423107752204284, "grad_norm": 0.1896887868642807, "learning_rate": 2.585983966899405e-05, "loss": 0.7445, "step": 6735 }, { "epoch": 1.5434558570937822, "grad_norm": 0.16402652859687805, "learning_rate": 2.579519006982157e-05, "loss": 0.748, "step": 6740 }, { "epoch": 1.544600938967136, "grad_norm": 0.17740361392498016, "learning_rate": 2.5730540470649083e-05, "loss": 0.7454, "step": 6745 }, { "epoch": 1.54574602084049, "grad_norm": 0.16312089562416077, "learning_rate": 2.56658908714766e-05, "loss": 0.7451, "step": 6750 }, { "epoch": 1.5468911027138441, "grad_norm": 0.15862400829792023, "learning_rate": 2.560124127230411e-05, "loss": 0.7472, "step": 6755 }, { "epoch": 1.548036184587198, "grad_norm": 0.23264558613300323, "learning_rate": 2.5536591673131626e-05, "loss": 0.7446, "step": 6760 }, { "epoch": 1.5491812664605518, "grad_norm": 0.20183593034744263, "learning_rate": 2.5471942073959143e-05, "loss": 0.7426, "step": 6765 }, { "epoch": 1.5503263483339058, "grad_norm": 0.16477009654045105, "learning_rate": 2.5407292474786658e-05, "loss": 0.7452, "step": 6770 }, { "epoch": 1.5514714302072599, "grad_norm": 0.14841727912425995, "learning_rate": 2.534264287561417e-05, "loss": 0.7439, "step": 6775 }, { "epoch": 1.552616512080614, "grad_norm": 0.16832435131072998, "learning_rate": 2.5277993276441686e-05, "loss": 0.7428, "step": 6780 }, { "epoch": 1.5537615939539677, "grad_norm": 0.16100366413593292, "learning_rate": 2.5213343677269204e-05, "loss": 0.7461, "step": 6785 }, { "epoch": 1.5549066758273216, "grad_norm": 0.16844795644283295, "learning_rate": 2.5148694078096718e-05, "loss": 0.7407, "step": 6790 }, { "epoch": 1.5560517577006756, "grad_norm": 0.2388097047805786, "learning_rate": 2.5084044478924235e-05, "loss": 0.7446, "step": 6795 }, { "epoch": 1.5571968395740297, "grad_norm": 0.2388181835412979, "learning_rate": 2.5019394879751746e-05, "loss": 0.7424, "step": 6800 }, { "epoch": 1.5583419214473835, "grad_norm": 0.1726500540971756, "learning_rate": 2.495474528057926e-05, "loss": 0.7445, "step": 6805 }, { "epoch": 1.5594870033207373, "grad_norm": 0.19557859003543854, "learning_rate": 2.4890095681406778e-05, "loss": 0.7411, "step": 6810 }, { "epoch": 1.5606320851940914, "grad_norm": 0.14564669132232666, "learning_rate": 2.4825446082234292e-05, "loss": 0.7441, "step": 6815 }, { "epoch": 1.5617771670674454, "grad_norm": 0.14897030591964722, "learning_rate": 2.4760796483061807e-05, "loss": 0.7449, "step": 6820 }, { "epoch": 1.5629222489407992, "grad_norm": 0.17548809945583344, "learning_rate": 2.469614688388932e-05, "loss": 0.7452, "step": 6825 }, { "epoch": 1.564067330814153, "grad_norm": 0.21593323349952698, "learning_rate": 2.463149728471684e-05, "loss": 0.7428, "step": 6830 }, { "epoch": 1.565212412687507, "grad_norm": 0.1762513667345047, "learning_rate": 2.456684768554435e-05, "loss": 0.7441, "step": 6835 }, { "epoch": 1.5663574945608612, "grad_norm": 0.2051253616809845, "learning_rate": 2.4502198086371867e-05, "loss": 0.743, "step": 6840 }, { "epoch": 1.5675025764342152, "grad_norm": 0.15214301645755768, "learning_rate": 2.443754848719938e-05, "loss": 0.7448, "step": 6845 }, { "epoch": 1.568647658307569, "grad_norm": 0.1648416966199875, "learning_rate": 2.4372898888026895e-05, "loss": 0.7378, "step": 6850 }, { "epoch": 1.5697927401809229, "grad_norm": 0.1519552320241928, "learning_rate": 2.430824928885441e-05, "loss": 0.7445, "step": 6855 }, { "epoch": 1.570937822054277, "grad_norm": 0.18640117347240448, "learning_rate": 2.4243599689681927e-05, "loss": 0.7416, "step": 6860 }, { "epoch": 1.572082903927631, "grad_norm": 0.19299635291099548, "learning_rate": 2.4178950090509438e-05, "loss": 0.7456, "step": 6865 }, { "epoch": 1.5732279858009848, "grad_norm": 0.19505447149276733, "learning_rate": 2.4114300491336955e-05, "loss": 0.7468, "step": 6870 }, { "epoch": 1.5743730676743386, "grad_norm": 0.17147518694400787, "learning_rate": 2.404965089216447e-05, "loss": 0.7414, "step": 6875 }, { "epoch": 1.5755181495476926, "grad_norm": 0.22106440365314484, "learning_rate": 2.3985001292991984e-05, "loss": 0.7444, "step": 6880 }, { "epoch": 1.5766632314210467, "grad_norm": 0.27256956696510315, "learning_rate": 2.3920351693819498e-05, "loss": 0.7448, "step": 6885 }, { "epoch": 1.5778083132944005, "grad_norm": 0.23254042863845825, "learning_rate": 2.3855702094647016e-05, "loss": 0.7449, "step": 6890 }, { "epoch": 1.5789533951677543, "grad_norm": 0.17433930933475494, "learning_rate": 2.3791052495474527e-05, "loss": 0.7433, "step": 6895 }, { "epoch": 1.5800984770411084, "grad_norm": 0.21645629405975342, "learning_rate": 2.3726402896302044e-05, "loss": 0.7444, "step": 6900 }, { "epoch": 1.5812435589144624, "grad_norm": 0.2150690108537674, "learning_rate": 2.3661753297129562e-05, "loss": 0.7435, "step": 6905 }, { "epoch": 1.5823886407878165, "grad_norm": 0.3046257793903351, "learning_rate": 2.3597103697957073e-05, "loss": 0.7452, "step": 6910 }, { "epoch": 1.5835337226611703, "grad_norm": 0.18164703249931335, "learning_rate": 2.353245409878459e-05, "loss": 0.7457, "step": 6915 }, { "epoch": 1.5846788045345241, "grad_norm": 0.16916827857494354, "learning_rate": 2.3467804499612104e-05, "loss": 0.7408, "step": 6920 }, { "epoch": 1.5858238864078782, "grad_norm": 0.17933885753154755, "learning_rate": 2.340315490043962e-05, "loss": 0.7411, "step": 6925 }, { "epoch": 1.5869689682812322, "grad_norm": 0.22185829281806946, "learning_rate": 2.3338505301267133e-05, "loss": 0.7463, "step": 6930 }, { "epoch": 1.588114050154586, "grad_norm": 0.21240991353988647, "learning_rate": 2.327385570209465e-05, "loss": 0.7432, "step": 6935 }, { "epoch": 1.58925913202794, "grad_norm": 0.1964518278837204, "learning_rate": 2.320920610292216e-05, "loss": 0.7425, "step": 6940 }, { "epoch": 1.590404213901294, "grad_norm": 0.19527554512023926, "learning_rate": 2.314455650374968e-05, "loss": 0.7439, "step": 6945 }, { "epoch": 1.591549295774648, "grad_norm": 0.16673538088798523, "learning_rate": 2.3079906904577193e-05, "loss": 0.7429, "step": 6950 }, { "epoch": 1.5926943776480018, "grad_norm": 0.2501964271068573, "learning_rate": 2.3015257305404707e-05, "loss": 0.7456, "step": 6955 }, { "epoch": 1.5938394595213556, "grad_norm": 0.21052591502666473, "learning_rate": 2.295060770623222e-05, "loss": 0.7451, "step": 6960 }, { "epoch": 1.5949845413947097, "grad_norm": 0.1688924878835678, "learning_rate": 2.288595810705974e-05, "loss": 0.7431, "step": 6965 }, { "epoch": 1.5961296232680637, "grad_norm": 0.2228127270936966, "learning_rate": 2.282130850788725e-05, "loss": 0.7368, "step": 6970 }, { "epoch": 1.5972747051414176, "grad_norm": 0.16818226873874664, "learning_rate": 2.2756658908714768e-05, "loss": 0.7402, "step": 6975 }, { "epoch": 1.5984197870147716, "grad_norm": 0.22381144762039185, "learning_rate": 2.2692009309542282e-05, "loss": 0.7424, "step": 6980 }, { "epoch": 1.5995648688881254, "grad_norm": 0.15830406546592712, "learning_rate": 2.2627359710369796e-05, "loss": 0.7477, "step": 6985 }, { "epoch": 1.6007099507614795, "grad_norm": 0.2143416404724121, "learning_rate": 2.256271011119731e-05, "loss": 0.7439, "step": 6990 }, { "epoch": 1.6018550326348335, "grad_norm": 0.20761322975158691, "learning_rate": 2.2498060512024828e-05, "loss": 0.7463, "step": 6995 }, { "epoch": 1.6030001145081874, "grad_norm": 0.23132385313510895, "learning_rate": 2.243341091285234e-05, "loss": 0.7459, "step": 7000 }, { "epoch": 1.6041451963815412, "grad_norm": 0.1932103931903839, "learning_rate": 2.2368761313679856e-05, "loss": 0.743, "step": 7005 }, { "epoch": 1.6052902782548952, "grad_norm": 0.18202821910381317, "learning_rate": 2.2304111714507374e-05, "loss": 0.742, "step": 7010 }, { "epoch": 1.6064353601282493, "grad_norm": 0.17258566617965698, "learning_rate": 2.2239462115334885e-05, "loss": 0.7414, "step": 7015 }, { "epoch": 1.607580442001603, "grad_norm": 0.223942369222641, "learning_rate": 2.2174812516162402e-05, "loss": 0.7452, "step": 7020 }, { "epoch": 1.608725523874957, "grad_norm": 0.15610426664352417, "learning_rate": 2.2110162916989917e-05, "loss": 0.744, "step": 7025 }, { "epoch": 1.609870605748311, "grad_norm": 0.1526680737733841, "learning_rate": 2.204551331781743e-05, "loss": 0.7444, "step": 7030 }, { "epoch": 1.611015687621665, "grad_norm": 0.15396279096603394, "learning_rate": 2.1980863718644945e-05, "loss": 0.738, "step": 7035 }, { "epoch": 1.6121607694950189, "grad_norm": 0.17607542872428894, "learning_rate": 2.1916214119472463e-05, "loss": 0.7445, "step": 7040 }, { "epoch": 1.6133058513683727, "grad_norm": 0.270062118768692, "learning_rate": 2.1851564520299974e-05, "loss": 0.7479, "step": 7045 }, { "epoch": 1.6144509332417267, "grad_norm": 0.18403637409210205, "learning_rate": 2.178691492112749e-05, "loss": 0.7425, "step": 7050 }, { "epoch": 1.6155960151150808, "grad_norm": 0.24163562059402466, "learning_rate": 2.1722265321955005e-05, "loss": 0.7418, "step": 7055 }, { "epoch": 1.6167410969884348, "grad_norm": 0.16625964641571045, "learning_rate": 2.165761572278252e-05, "loss": 0.7427, "step": 7060 }, { "epoch": 1.6178861788617886, "grad_norm": 0.19170767068862915, "learning_rate": 2.1592966123610034e-05, "loss": 0.7423, "step": 7065 }, { "epoch": 1.6190312607351425, "grad_norm": 0.1749299019575119, "learning_rate": 2.152831652443755e-05, "loss": 0.7442, "step": 7070 }, { "epoch": 1.6201763426084965, "grad_norm": 0.28893375396728516, "learning_rate": 2.1463666925265062e-05, "loss": 0.7441, "step": 7075 }, { "epoch": 1.6213214244818506, "grad_norm": 0.15919381380081177, "learning_rate": 2.139901732609258e-05, "loss": 0.7454, "step": 7080 }, { "epoch": 1.6224665063552044, "grad_norm": 0.18021824955940247, "learning_rate": 2.1334367726920094e-05, "loss": 0.7467, "step": 7085 }, { "epoch": 1.6236115882285582, "grad_norm": 0.21050041913986206, "learning_rate": 2.1269718127747608e-05, "loss": 0.7425, "step": 7090 }, { "epoch": 1.6247566701019123, "grad_norm": 0.1786871701478958, "learning_rate": 2.1205068528575122e-05, "loss": 0.7471, "step": 7095 }, { "epoch": 1.6259017519752663, "grad_norm": 0.18213413655757904, "learning_rate": 2.114041892940264e-05, "loss": 0.7435, "step": 7100 }, { "epoch": 1.6270468338486201, "grad_norm": 0.18353727459907532, "learning_rate": 2.107576933023015e-05, "loss": 0.7407, "step": 7105 }, { "epoch": 1.628191915721974, "grad_norm": 0.1717100739479065, "learning_rate": 2.101111973105767e-05, "loss": 0.7417, "step": 7110 }, { "epoch": 1.629336997595328, "grad_norm": 0.1539594978094101, "learning_rate": 2.0946470131885186e-05, "loss": 0.744, "step": 7115 }, { "epoch": 1.630482079468682, "grad_norm": 0.1720319539308548, "learning_rate": 2.0881820532712697e-05, "loss": 0.7472, "step": 7120 }, { "epoch": 1.6316271613420361, "grad_norm": 0.15920764207839966, "learning_rate": 2.0817170933540215e-05, "loss": 0.7439, "step": 7125 }, { "epoch": 1.63277224321539, "grad_norm": 0.2293880581855774, "learning_rate": 2.075252133436773e-05, "loss": 0.743, "step": 7130 }, { "epoch": 1.6339173250887438, "grad_norm": 0.16954880952835083, "learning_rate": 2.0687871735195243e-05, "loss": 0.7437, "step": 7135 }, { "epoch": 1.6350624069620978, "grad_norm": 0.2514009475708008, "learning_rate": 2.0623222136022757e-05, "loss": 0.7427, "step": 7140 }, { "epoch": 1.6362074888354519, "grad_norm": 0.1630130112171173, "learning_rate": 2.0558572536850275e-05, "loss": 0.7428, "step": 7145 }, { "epoch": 1.6373525707088057, "grad_norm": 0.2145601212978363, "learning_rate": 2.0493922937677786e-05, "loss": 0.7485, "step": 7150 }, { "epoch": 1.6384976525821595, "grad_norm": 0.2102821171283722, "learning_rate": 2.0429273338505303e-05, "loss": 0.7454, "step": 7155 }, { "epoch": 1.6396427344555136, "grad_norm": 0.24257567524909973, "learning_rate": 2.0364623739332818e-05, "loss": 0.7436, "step": 7160 }, { "epoch": 1.6407878163288676, "grad_norm": 0.198568657040596, "learning_rate": 2.0299974140160332e-05, "loss": 0.7436, "step": 7165 }, { "epoch": 1.6419328982022214, "grad_norm": 0.1587372124195099, "learning_rate": 2.0235324540987846e-05, "loss": 0.7453, "step": 7170 }, { "epoch": 1.6430779800755753, "grad_norm": 0.1822303682565689, "learning_rate": 2.0170674941815364e-05, "loss": 0.7407, "step": 7175 }, { "epoch": 1.6442230619489293, "grad_norm": 0.19832190871238708, "learning_rate": 2.0106025342642874e-05, "loss": 0.7437, "step": 7180 }, { "epoch": 1.6453681438222834, "grad_norm": 0.14971603453159332, "learning_rate": 2.0041375743470392e-05, "loss": 0.742, "step": 7185 }, { "epoch": 1.6465132256956372, "grad_norm": 0.22691278159618378, "learning_rate": 1.9976726144297906e-05, "loss": 0.7446, "step": 7190 }, { "epoch": 1.6476583075689912, "grad_norm": 0.2300955355167389, "learning_rate": 1.991207654512542e-05, "loss": 0.7449, "step": 7195 }, { "epoch": 1.648803389442345, "grad_norm": 0.22187408804893494, "learning_rate": 1.9847426945952935e-05, "loss": 0.7428, "step": 7200 }, { "epoch": 1.649948471315699, "grad_norm": 0.18663349747657776, "learning_rate": 1.9782777346780452e-05, "loss": 0.7431, "step": 7205 }, { "epoch": 1.6510935531890532, "grad_norm": 0.212881401181221, "learning_rate": 1.9718127747607963e-05, "loss": 0.7414, "step": 7210 }, { "epoch": 1.652238635062407, "grad_norm": 0.18411917984485626, "learning_rate": 1.965347814843548e-05, "loss": 0.738, "step": 7215 }, { "epoch": 1.6533837169357608, "grad_norm": 0.15346644818782806, "learning_rate": 1.9588828549262995e-05, "loss": 0.7422, "step": 7220 }, { "epoch": 1.6545287988091149, "grad_norm": 0.15337932109832764, "learning_rate": 1.952417895009051e-05, "loss": 0.7421, "step": 7225 }, { "epoch": 1.655673880682469, "grad_norm": 0.24280506372451782, "learning_rate": 1.9459529350918027e-05, "loss": 0.7395, "step": 7230 }, { "epoch": 1.6568189625558227, "grad_norm": 0.18352577090263367, "learning_rate": 1.939487975174554e-05, "loss": 0.7444, "step": 7235 }, { "epoch": 1.6579640444291766, "grad_norm": 0.1767406165599823, "learning_rate": 1.9330230152573055e-05, "loss": 0.7427, "step": 7240 }, { "epoch": 1.6591091263025306, "grad_norm": 0.20321263372898102, "learning_rate": 1.926558055340057e-05, "loss": 0.7432, "step": 7245 }, { "epoch": 1.6602542081758846, "grad_norm": 0.2268437296152115, "learning_rate": 1.9200930954228087e-05, "loss": 0.739, "step": 7250 }, { "epoch": 1.6613992900492385, "grad_norm": 0.25249117612838745, "learning_rate": 1.9136281355055598e-05, "loss": 0.7461, "step": 7255 }, { "epoch": 1.6625443719225923, "grad_norm": 0.24010123312473297, "learning_rate": 1.9071631755883115e-05, "loss": 0.741, "step": 7260 }, { "epoch": 1.6636894537959463, "grad_norm": 0.19712349772453308, "learning_rate": 1.900698215671063e-05, "loss": 0.738, "step": 7265 }, { "epoch": 1.6648345356693004, "grad_norm": 0.22672364115715027, "learning_rate": 1.8942332557538144e-05, "loss": 0.7448, "step": 7270 }, { "epoch": 1.6659796175426544, "grad_norm": 0.24020400643348694, "learning_rate": 1.8877682958365658e-05, "loss": 0.7407, "step": 7275 }, { "epoch": 1.6671246994160083, "grad_norm": 0.2094317078590393, "learning_rate": 1.8813033359193176e-05, "loss": 0.742, "step": 7280 }, { "epoch": 1.668269781289362, "grad_norm": 0.34122583270072937, "learning_rate": 1.8748383760020687e-05, "loss": 0.744, "step": 7285 }, { "epoch": 1.6694148631627161, "grad_norm": 0.29159530997276306, "learning_rate": 1.8683734160848204e-05, "loss": 0.7414, "step": 7290 }, { "epoch": 1.6705599450360702, "grad_norm": 0.2118174433708191, "learning_rate": 1.861908456167572e-05, "loss": 0.742, "step": 7295 }, { "epoch": 1.671705026909424, "grad_norm": 0.1960996836423874, "learning_rate": 1.8554434962503233e-05, "loss": 0.7412, "step": 7300 }, { "epoch": 1.6728501087827778, "grad_norm": 0.20879942178726196, "learning_rate": 1.8489785363330747e-05, "loss": 0.7422, "step": 7305 }, { "epoch": 1.673995190656132, "grad_norm": 0.1483440101146698, "learning_rate": 1.8425135764158264e-05, "loss": 0.742, "step": 7310 }, { "epoch": 1.675140272529486, "grad_norm": 0.15710671246051788, "learning_rate": 1.8360486164985775e-05, "loss": 0.7458, "step": 7315 }, { "epoch": 1.6762853544028398, "grad_norm": 0.15391378104686737, "learning_rate": 1.8295836565813293e-05, "loss": 0.7462, "step": 7320 }, { "epoch": 1.6774304362761936, "grad_norm": 0.15545009076595306, "learning_rate": 1.8231186966640807e-05, "loss": 0.7435, "step": 7325 }, { "epoch": 1.6785755181495476, "grad_norm": 0.16694271564483643, "learning_rate": 1.816653736746832e-05, "loss": 0.7436, "step": 7330 }, { "epoch": 1.6797206000229017, "grad_norm": 0.21401578187942505, "learning_rate": 1.810188776829584e-05, "loss": 0.7442, "step": 7335 }, { "epoch": 1.6808656818962557, "grad_norm": 0.1898529827594757, "learning_rate": 1.8037238169123353e-05, "loss": 0.743, "step": 7340 }, { "epoch": 1.6820107637696096, "grad_norm": 0.22047021985054016, "learning_rate": 1.7972588569950867e-05, "loss": 0.7419, "step": 7345 }, { "epoch": 1.6831558456429634, "grad_norm": 0.1430683583021164, "learning_rate": 1.790793897077838e-05, "loss": 0.7432, "step": 7350 }, { "epoch": 1.6843009275163174, "grad_norm": 0.1772889792919159, "learning_rate": 1.78432893716059e-05, "loss": 0.7423, "step": 7355 }, { "epoch": 1.6854460093896715, "grad_norm": 0.2118036150932312, "learning_rate": 1.777863977243341e-05, "loss": 0.743, "step": 7360 }, { "epoch": 1.6865910912630253, "grad_norm": 0.1891527622938156, "learning_rate": 1.7713990173260928e-05, "loss": 0.7424, "step": 7365 }, { "epoch": 1.6877361731363791, "grad_norm": 0.1738433688879013, "learning_rate": 1.7649340574088442e-05, "loss": 0.7403, "step": 7370 }, { "epoch": 1.6888812550097332, "grad_norm": 0.2036980539560318, "learning_rate": 1.7584690974915956e-05, "loss": 0.7409, "step": 7375 }, { "epoch": 1.6900263368830872, "grad_norm": 0.1397104263305664, "learning_rate": 1.752004137574347e-05, "loss": 0.7433, "step": 7380 }, { "epoch": 1.691171418756441, "grad_norm": 0.1923220157623291, "learning_rate": 1.7455391776570988e-05, "loss": 0.7432, "step": 7385 }, { "epoch": 1.6923165006297949, "grad_norm": 0.17131070792675018, "learning_rate": 1.73907421773985e-05, "loss": 0.7423, "step": 7390 }, { "epoch": 1.693461582503149, "grad_norm": 0.19466909766197205, "learning_rate": 1.7326092578226016e-05, "loss": 0.743, "step": 7395 }, { "epoch": 1.694606664376503, "grad_norm": 0.15853409469127655, "learning_rate": 1.726144297905353e-05, "loss": 0.7405, "step": 7400 }, { "epoch": 1.695751746249857, "grad_norm": 0.1643185168504715, "learning_rate": 1.7196793379881045e-05, "loss": 0.7466, "step": 7405 }, { "epoch": 1.6968968281232109, "grad_norm": 0.20322449505329132, "learning_rate": 1.713214378070856e-05, "loss": 0.7463, "step": 7410 }, { "epoch": 1.6980419099965647, "grad_norm": 0.1877737194299698, "learning_rate": 1.7067494181536077e-05, "loss": 0.737, "step": 7415 }, { "epoch": 1.6991869918699187, "grad_norm": 0.16088910400867462, "learning_rate": 1.7002844582363587e-05, "loss": 0.7466, "step": 7420 }, { "epoch": 1.7003320737432728, "grad_norm": 0.16621771454811096, "learning_rate": 1.6938194983191105e-05, "loss": 0.7427, "step": 7425 }, { "epoch": 1.7014771556166266, "grad_norm": 0.17676587402820587, "learning_rate": 1.687354538401862e-05, "loss": 0.7461, "step": 7430 }, { "epoch": 1.7026222374899804, "grad_norm": 0.22310510277748108, "learning_rate": 1.6808895784846133e-05, "loss": 0.7416, "step": 7435 }, { "epoch": 1.7037673193633345, "grad_norm": 0.1830226629972458, "learning_rate": 1.674424618567365e-05, "loss": 0.7393, "step": 7440 }, { "epoch": 1.7049124012366885, "grad_norm": 0.17793413996696472, "learning_rate": 1.6679596586501165e-05, "loss": 0.7447, "step": 7445 }, { "epoch": 1.7060574831100423, "grad_norm": 0.2182731181383133, "learning_rate": 1.661494698732868e-05, "loss": 0.7424, "step": 7450 }, { "epoch": 1.7072025649833962, "grad_norm": 0.15893946588039398, "learning_rate": 1.6550297388156194e-05, "loss": 0.7453, "step": 7455 }, { "epoch": 1.7083476468567502, "grad_norm": 0.18081578612327576, "learning_rate": 1.648564778898371e-05, "loss": 0.7473, "step": 7460 }, { "epoch": 1.7094927287301043, "grad_norm": 0.18333902955055237, "learning_rate": 1.6420998189811222e-05, "loss": 0.7386, "step": 7465 }, { "epoch": 1.710637810603458, "grad_norm": 0.18723733723163605, "learning_rate": 1.635634859063874e-05, "loss": 0.7439, "step": 7470 }, { "epoch": 1.7117828924768121, "grad_norm": 0.15768438577651978, "learning_rate": 1.6291698991466254e-05, "loss": 0.741, "step": 7475 }, { "epoch": 1.712927974350166, "grad_norm": 0.15292543172836304, "learning_rate": 1.6227049392293768e-05, "loss": 0.7439, "step": 7480 }, { "epoch": 1.71407305622352, "grad_norm": 0.15622615814208984, "learning_rate": 1.6162399793121282e-05, "loss": 0.7415, "step": 7485 }, { "epoch": 1.715218138096874, "grad_norm": 0.2150067538022995, "learning_rate": 1.60977501939488e-05, "loss": 0.7441, "step": 7490 }, { "epoch": 1.716363219970228, "grad_norm": 0.17251062393188477, "learning_rate": 1.603310059477631e-05, "loss": 0.7394, "step": 7495 }, { "epoch": 1.7175083018435817, "grad_norm": 0.1846706122159958, "learning_rate": 1.596845099560383e-05, "loss": 0.7433, "step": 7500 }, { "epoch": 1.7186533837169358, "grad_norm": 0.16406218707561493, "learning_rate": 1.5903801396431343e-05, "loss": 0.7412, "step": 7505 }, { "epoch": 1.7197984655902898, "grad_norm": 0.17380113899707794, "learning_rate": 1.5839151797258857e-05, "loss": 0.7407, "step": 7510 }, { "epoch": 1.7209435474636436, "grad_norm": 0.18964682519435883, "learning_rate": 1.577450219808637e-05, "loss": 0.7447, "step": 7515 }, { "epoch": 1.7220886293369975, "grad_norm": 0.24054868519306183, "learning_rate": 1.570985259891389e-05, "loss": 0.7437, "step": 7520 }, { "epoch": 1.7232337112103515, "grad_norm": 0.206729918718338, "learning_rate": 1.56452029997414e-05, "loss": 0.7403, "step": 7525 }, { "epoch": 1.7243787930837056, "grad_norm": 0.14928388595581055, "learning_rate": 1.5580553400568917e-05, "loss": 0.7429, "step": 7530 }, { "epoch": 1.7255238749570594, "grad_norm": 0.13767731189727783, "learning_rate": 1.551590380139643e-05, "loss": 0.7423, "step": 7535 }, { "epoch": 1.7266689568304132, "grad_norm": 0.21498754620552063, "learning_rate": 1.5451254202223946e-05, "loss": 0.7395, "step": 7540 }, { "epoch": 1.7278140387037673, "grad_norm": 0.19207172095775604, "learning_rate": 1.538660460305146e-05, "loss": 0.7423, "step": 7545 }, { "epoch": 1.7289591205771213, "grad_norm": 0.16116943955421448, "learning_rate": 1.5321955003878977e-05, "loss": 0.7404, "step": 7550 }, { "epoch": 1.7301042024504754, "grad_norm": 0.18703986704349518, "learning_rate": 1.5257305404706493e-05, "loss": 0.7387, "step": 7555 }, { "epoch": 1.7312492843238292, "grad_norm": 0.14413946866989136, "learning_rate": 1.5192655805534006e-05, "loss": 0.745, "step": 7560 }, { "epoch": 1.732394366197183, "grad_norm": 0.18334537744522095, "learning_rate": 1.5128006206361522e-05, "loss": 0.7456, "step": 7565 }, { "epoch": 1.733539448070537, "grad_norm": 0.1862698346376419, "learning_rate": 1.5063356607189036e-05, "loss": 0.7441, "step": 7570 }, { "epoch": 1.734684529943891, "grad_norm": 0.18286314606666565, "learning_rate": 1.4998707008016552e-05, "loss": 0.743, "step": 7575 }, { "epoch": 1.735829611817245, "grad_norm": 0.17219437658786774, "learning_rate": 1.4934057408844066e-05, "loss": 0.7403, "step": 7580 }, { "epoch": 1.7369746936905988, "grad_norm": 0.22169716656208038, "learning_rate": 1.4869407809671582e-05, "loss": 0.7379, "step": 7585 }, { "epoch": 1.7381197755639528, "grad_norm": 0.19314883649349213, "learning_rate": 1.4804758210499095e-05, "loss": 0.743, "step": 7590 }, { "epoch": 1.7392648574373069, "grad_norm": 0.22871583700180054, "learning_rate": 1.474010861132661e-05, "loss": 0.742, "step": 7595 }, { "epoch": 1.7404099393106607, "grad_norm": 0.17754560708999634, "learning_rate": 1.4675459012154125e-05, "loss": 0.7414, "step": 7600 }, { "epoch": 1.7415550211840145, "grad_norm": 0.16177891194820404, "learning_rate": 1.461080941298164e-05, "loss": 0.7418, "step": 7605 }, { "epoch": 1.7427001030573686, "grad_norm": 0.17212548851966858, "learning_rate": 1.4546159813809155e-05, "loss": 0.7438, "step": 7610 }, { "epoch": 1.7438451849307226, "grad_norm": 0.20453935861587524, "learning_rate": 1.448151021463667e-05, "loss": 0.7383, "step": 7615 }, { "epoch": 1.7449902668040767, "grad_norm": 0.17530488967895508, "learning_rate": 1.4416860615464183e-05, "loss": 0.7422, "step": 7620 }, { "epoch": 1.7461353486774305, "grad_norm": 0.15790791809558868, "learning_rate": 1.43522110162917e-05, "loss": 0.7426, "step": 7625 }, { "epoch": 1.7472804305507843, "grad_norm": 0.19352249801158905, "learning_rate": 1.4287561417119213e-05, "loss": 0.744, "step": 7630 }, { "epoch": 1.7484255124241384, "grad_norm": 0.19556356966495514, "learning_rate": 1.422291181794673e-05, "loss": 0.7412, "step": 7635 }, { "epoch": 1.7495705942974924, "grad_norm": 0.1878131479024887, "learning_rate": 1.4158262218774244e-05, "loss": 0.7456, "step": 7640 }, { "epoch": 1.7507156761708462, "grad_norm": 0.14573533833026886, "learning_rate": 1.409361261960176e-05, "loss": 0.743, "step": 7645 }, { "epoch": 1.7518607580442, "grad_norm": 0.15882503986358643, "learning_rate": 1.4028963020429272e-05, "loss": 0.7424, "step": 7650 }, { "epoch": 1.753005839917554, "grad_norm": 0.14983394742012024, "learning_rate": 1.396431342125679e-05, "loss": 0.745, "step": 7655 }, { "epoch": 1.7541509217909081, "grad_norm": 0.13475537300109863, "learning_rate": 1.3899663822084306e-05, "loss": 0.739, "step": 7660 }, { "epoch": 1.755296003664262, "grad_norm": 0.20346559584140778, "learning_rate": 1.3835014222911818e-05, "loss": 0.7376, "step": 7665 }, { "epoch": 1.7564410855376158, "grad_norm": 0.18719197809696198, "learning_rate": 1.3770364623739334e-05, "loss": 0.7371, "step": 7670 }, { "epoch": 1.7575861674109698, "grad_norm": 0.2041907161474228, "learning_rate": 1.3705715024566848e-05, "loss": 0.745, "step": 7675 }, { "epoch": 1.758731249284324, "grad_norm": 0.17985425889492035, "learning_rate": 1.3641065425394364e-05, "loss": 0.739, "step": 7680 }, { "epoch": 1.7598763311576777, "grad_norm": 0.1952625960111618, "learning_rate": 1.3576415826221878e-05, "loss": 0.7407, "step": 7685 }, { "epoch": 1.7610214130310318, "grad_norm": 0.19944436848163605, "learning_rate": 1.3511766227049394e-05, "loss": 0.7432, "step": 7690 }, { "epoch": 1.7621664949043856, "grad_norm": 0.1849888414144516, "learning_rate": 1.3447116627876907e-05, "loss": 0.7385, "step": 7695 }, { "epoch": 1.7633115767777396, "grad_norm": 0.20331071317195892, "learning_rate": 1.3382467028704423e-05, "loss": 0.7407, "step": 7700 }, { "epoch": 1.7644566586510937, "grad_norm": 0.15034550428390503, "learning_rate": 1.3317817429531937e-05, "loss": 0.742, "step": 7705 }, { "epoch": 1.7656017405244475, "grad_norm": 0.19090567529201508, "learning_rate": 1.3253167830359453e-05, "loss": 0.7434, "step": 7710 }, { "epoch": 1.7667468223978013, "grad_norm": 0.19512049853801727, "learning_rate": 1.3188518231186967e-05, "loss": 0.7418, "step": 7715 }, { "epoch": 1.7678919042711554, "grad_norm": 0.15119600296020508, "learning_rate": 1.3123868632014483e-05, "loss": 0.7441, "step": 7720 }, { "epoch": 1.7690369861445094, "grad_norm": 0.16292977333068848, "learning_rate": 1.3059219032841995e-05, "loss": 0.741, "step": 7725 }, { "epoch": 1.7701820680178633, "grad_norm": 0.14958208799362183, "learning_rate": 1.2994569433669513e-05, "loss": 0.7434, "step": 7730 }, { "epoch": 1.771327149891217, "grad_norm": 0.1666942834854126, "learning_rate": 1.2929919834497026e-05, "loss": 0.7415, "step": 7735 }, { "epoch": 1.7724722317645711, "grad_norm": 0.17329080402851105, "learning_rate": 1.2865270235324542e-05, "loss": 0.7401, "step": 7740 }, { "epoch": 1.7736173136379252, "grad_norm": 0.18390893936157227, "learning_rate": 1.2800620636152056e-05, "loss": 0.7392, "step": 7745 }, { "epoch": 1.774762395511279, "grad_norm": 0.20777173340320587, "learning_rate": 1.2735971036979572e-05, "loss": 0.7461, "step": 7750 }, { "epoch": 1.7759074773846328, "grad_norm": 0.16422134637832642, "learning_rate": 1.2671321437807084e-05, "loss": 0.7416, "step": 7755 }, { "epoch": 1.7770525592579869, "grad_norm": 0.20069563388824463, "learning_rate": 1.2606671838634602e-05, "loss": 0.7422, "step": 7760 }, { "epoch": 1.778197641131341, "grad_norm": 0.1575007140636444, "learning_rate": 1.2542022239462118e-05, "loss": 0.736, "step": 7765 }, { "epoch": 1.779342723004695, "grad_norm": 0.1856757402420044, "learning_rate": 1.247737264028963e-05, "loss": 0.7456, "step": 7770 }, { "epoch": 1.7804878048780488, "grad_norm": 0.1524512767791748, "learning_rate": 1.2412723041117146e-05, "loss": 0.7419, "step": 7775 }, { "epoch": 1.7816328867514026, "grad_norm": 0.17118653655052185, "learning_rate": 1.234807344194466e-05, "loss": 0.7465, "step": 7780 }, { "epoch": 1.7827779686247567, "grad_norm": 0.17673975229263306, "learning_rate": 1.2283423842772175e-05, "loss": 0.7377, "step": 7785 }, { "epoch": 1.7839230504981107, "grad_norm": 0.16801375150680542, "learning_rate": 1.221877424359969e-05, "loss": 0.7417, "step": 7790 }, { "epoch": 1.7850681323714646, "grad_norm": 0.21200990676879883, "learning_rate": 1.2154124644427205e-05, "loss": 0.7421, "step": 7795 }, { "epoch": 1.7862132142448184, "grad_norm": 0.1710125207901001, "learning_rate": 1.2089475045254719e-05, "loss": 0.7392, "step": 7800 }, { "epoch": 1.7873582961181724, "grad_norm": 0.17124149203300476, "learning_rate": 1.2024825446082235e-05, "loss": 0.7419, "step": 7805 }, { "epoch": 1.7885033779915265, "grad_norm": 0.1881469190120697, "learning_rate": 1.1960175846909749e-05, "loss": 0.7431, "step": 7810 }, { "epoch": 1.7896484598648803, "grad_norm": 0.138245090842247, "learning_rate": 1.1895526247737263e-05, "loss": 0.7416, "step": 7815 }, { "epoch": 1.7907935417382341, "grad_norm": 0.19728197157382965, "learning_rate": 1.1830876648564781e-05, "loss": 0.7388, "step": 7820 }, { "epoch": 1.7919386236115882, "grad_norm": 0.15021589398384094, "learning_rate": 1.1766227049392295e-05, "loss": 0.7395, "step": 7825 }, { "epoch": 1.7930837054849422, "grad_norm": 0.1715056300163269, "learning_rate": 1.170157745021981e-05, "loss": 0.7404, "step": 7830 }, { "epoch": 1.7942287873582963, "grad_norm": 0.14820553362369537, "learning_rate": 1.1636927851047325e-05, "loss": 0.745, "step": 7835 }, { "epoch": 1.79537386923165, "grad_norm": 0.1573396623134613, "learning_rate": 1.157227825187484e-05, "loss": 0.7412, "step": 7840 }, { "epoch": 1.796518951105004, "grad_norm": 0.17530624568462372, "learning_rate": 1.1507628652702354e-05, "loss": 0.7416, "step": 7845 }, { "epoch": 1.797664032978358, "grad_norm": 0.1957489401102066, "learning_rate": 1.144297905352987e-05, "loss": 0.7417, "step": 7850 }, { "epoch": 1.798809114851712, "grad_norm": 0.14243096113204956, "learning_rate": 1.1378329454357384e-05, "loss": 0.7407, "step": 7855 }, { "epoch": 1.7999541967250658, "grad_norm": 0.17198410630226135, "learning_rate": 1.1313679855184898e-05, "loss": 0.7396, "step": 7860 }, { "epoch": 1.8010992785984197, "grad_norm": 0.170758455991745, "learning_rate": 1.1249030256012414e-05, "loss": 0.7431, "step": 7865 }, { "epoch": 1.8022443604717737, "grad_norm": 0.1600438803434372, "learning_rate": 1.1184380656839928e-05, "loss": 0.7441, "step": 7870 }, { "epoch": 1.8033894423451278, "grad_norm": 0.18139535188674927, "learning_rate": 1.1119731057667442e-05, "loss": 0.7433, "step": 7875 }, { "epoch": 1.8045345242184816, "grad_norm": 0.24969196319580078, "learning_rate": 1.1055081458494958e-05, "loss": 0.7426, "step": 7880 }, { "epoch": 1.8056796060918354, "grad_norm": 0.16466961801052094, "learning_rate": 1.0990431859322473e-05, "loss": 0.7426, "step": 7885 }, { "epoch": 1.8068246879651895, "grad_norm": 0.19945576786994934, "learning_rate": 1.0925782260149987e-05, "loss": 0.7396, "step": 7890 }, { "epoch": 1.8079697698385435, "grad_norm": 0.2034262865781784, "learning_rate": 1.0861132660977503e-05, "loss": 0.7378, "step": 7895 }, { "epoch": 1.8091148517118976, "grad_norm": 0.16968347132205963, "learning_rate": 1.0796483061805017e-05, "loss": 0.7384, "step": 7900 }, { "epoch": 1.8102599335852514, "grad_norm": 0.20821303129196167, "learning_rate": 1.0731833462632531e-05, "loss": 0.7362, "step": 7905 }, { "epoch": 1.8114050154586052, "grad_norm": 0.20150309801101685, "learning_rate": 1.0667183863460047e-05, "loss": 0.7452, "step": 7910 }, { "epoch": 1.8125500973319593, "grad_norm": 0.16308322548866272, "learning_rate": 1.0602534264287561e-05, "loss": 0.7387, "step": 7915 }, { "epoch": 1.8136951792053133, "grad_norm": 0.19306686520576477, "learning_rate": 1.0537884665115075e-05, "loss": 0.7388, "step": 7920 }, { "epoch": 1.8148402610786671, "grad_norm": 0.1482827216386795, "learning_rate": 1.0473235065942593e-05, "loss": 0.7394, "step": 7925 }, { "epoch": 1.815985342952021, "grad_norm": 0.16145211458206177, "learning_rate": 1.0408585466770107e-05, "loss": 0.7387, "step": 7930 }, { "epoch": 1.817130424825375, "grad_norm": 0.15279000997543335, "learning_rate": 1.0343935867597622e-05, "loss": 0.7397, "step": 7935 }, { "epoch": 1.818275506698729, "grad_norm": 0.15904763340950012, "learning_rate": 1.0279286268425137e-05, "loss": 0.7404, "step": 7940 }, { "epoch": 1.8194205885720829, "grad_norm": 0.14108118414878845, "learning_rate": 1.0214636669252652e-05, "loss": 0.7407, "step": 7945 }, { "epoch": 1.8205656704454367, "grad_norm": 0.14389878511428833, "learning_rate": 1.0149987070080166e-05, "loss": 0.7454, "step": 7950 }, { "epoch": 1.8217107523187908, "grad_norm": 0.15871091187000275, "learning_rate": 1.0085337470907682e-05, "loss": 0.7398, "step": 7955 }, { "epoch": 1.8228558341921448, "grad_norm": 0.17688189446926117, "learning_rate": 1.0020687871735196e-05, "loss": 0.7433, "step": 7960 }, { "epoch": 1.8240009160654986, "grad_norm": 0.14944761991500854, "learning_rate": 9.95603827256271e-06, "loss": 0.7453, "step": 7965 }, { "epoch": 1.8251459979388527, "grad_norm": 0.19329968094825745, "learning_rate": 9.891388673390226e-06, "loss": 0.744, "step": 7970 }, { "epoch": 1.8262910798122065, "grad_norm": 0.15224604308605194, "learning_rate": 9.82673907421774e-06, "loss": 0.7419, "step": 7975 }, { "epoch": 1.8274361616855606, "grad_norm": 0.19345805048942566, "learning_rate": 9.762089475045255e-06, "loss": 0.7453, "step": 7980 }, { "epoch": 1.8285812435589146, "grad_norm": 0.17446333169937134, "learning_rate": 9.69743987587277e-06, "loss": 0.7404, "step": 7985 }, { "epoch": 1.8297263254322684, "grad_norm": 0.14764748513698578, "learning_rate": 9.632790276700285e-06, "loss": 0.7413, "step": 7990 }, { "epoch": 1.8308714073056223, "grad_norm": 0.15941238403320312, "learning_rate": 9.568140677527799e-06, "loss": 0.7429, "step": 7995 }, { "epoch": 1.8320164891789763, "grad_norm": 0.1465342789888382, "learning_rate": 9.503491078355315e-06, "loss": 0.7422, "step": 8000 }, { "epoch": 1.8331615710523304, "grad_norm": 0.15987376868724823, "learning_rate": 9.438841479182829e-06, "loss": 0.7432, "step": 8005 }, { "epoch": 1.8343066529256842, "grad_norm": 0.153104767203331, "learning_rate": 9.374191880010343e-06, "loss": 0.7391, "step": 8010 }, { "epoch": 1.835451734799038, "grad_norm": 0.14702925086021423, "learning_rate": 9.30954228083786e-06, "loss": 0.742, "step": 8015 }, { "epoch": 1.836596816672392, "grad_norm": 0.14339160919189453, "learning_rate": 9.244892681665373e-06, "loss": 0.743, "step": 8020 }, { "epoch": 1.837741898545746, "grad_norm": 0.16755425930023193, "learning_rate": 9.180243082492888e-06, "loss": 0.7434, "step": 8025 }, { "epoch": 1.8388869804191, "grad_norm": 0.19335520267486572, "learning_rate": 9.115593483320404e-06, "loss": 0.7422, "step": 8030 }, { "epoch": 1.8400320622924538, "grad_norm": 0.1825624406337738, "learning_rate": 9.05094388414792e-06, "loss": 0.741, "step": 8035 }, { "epoch": 1.8411771441658078, "grad_norm": 0.1814911663532257, "learning_rate": 8.986294284975434e-06, "loss": 0.7431, "step": 8040 }, { "epoch": 1.8423222260391618, "grad_norm": 0.16757063567638397, "learning_rate": 8.92164468580295e-06, "loss": 0.7367, "step": 8045 }, { "epoch": 1.843467307912516, "grad_norm": 0.18699325621128082, "learning_rate": 8.856995086630464e-06, "loss": 0.7431, "step": 8050 }, { "epoch": 1.8446123897858697, "grad_norm": 0.15374694764614105, "learning_rate": 8.792345487457978e-06, "loss": 0.7463, "step": 8055 }, { "epoch": 1.8457574716592235, "grad_norm": 0.17602622509002686, "learning_rate": 8.727695888285494e-06, "loss": 0.7433, "step": 8060 }, { "epoch": 1.8469025535325776, "grad_norm": 0.15638279914855957, "learning_rate": 8.663046289113008e-06, "loss": 0.7399, "step": 8065 }, { "epoch": 1.8480476354059316, "grad_norm": 0.16193757951259613, "learning_rate": 8.598396689940522e-06, "loss": 0.7422, "step": 8070 }, { "epoch": 1.8491927172792855, "grad_norm": 0.17268849909305573, "learning_rate": 8.533747090768038e-06, "loss": 0.7448, "step": 8075 }, { "epoch": 1.8503377991526393, "grad_norm": 0.14754731953144073, "learning_rate": 8.469097491595553e-06, "loss": 0.7401, "step": 8080 }, { "epoch": 1.8514828810259933, "grad_norm": 0.17625221610069275, "learning_rate": 8.404447892423067e-06, "loss": 0.7385, "step": 8085 }, { "epoch": 1.8526279628993474, "grad_norm": 0.14788591861724854, "learning_rate": 8.339798293250583e-06, "loss": 0.7409, "step": 8090 }, { "epoch": 1.8537730447727012, "grad_norm": 0.18604493141174316, "learning_rate": 8.275148694078097e-06, "loss": 0.7394, "step": 8095 }, { "epoch": 1.854918126646055, "grad_norm": 0.16041810810565948, "learning_rate": 8.210499094905611e-06, "loss": 0.7386, "step": 8100 }, { "epoch": 1.856063208519409, "grad_norm": 0.18122714757919312, "learning_rate": 8.145849495733127e-06, "loss": 0.7393, "step": 8105 }, { "epoch": 1.8572082903927631, "grad_norm": 0.16307488083839417, "learning_rate": 8.081199896560641e-06, "loss": 0.7413, "step": 8110 }, { "epoch": 1.8583533722661172, "grad_norm": 0.22801965475082397, "learning_rate": 8.016550297388155e-06, "loss": 0.7459, "step": 8115 }, { "epoch": 1.859498454139471, "grad_norm": 0.18238000571727753, "learning_rate": 7.951900698215671e-06, "loss": 0.74, "step": 8120 }, { "epoch": 1.8606435360128248, "grad_norm": 0.20741623640060425, "learning_rate": 7.887251099043186e-06, "loss": 0.7382, "step": 8125 }, { "epoch": 1.8617886178861789, "grad_norm": 0.22925831377506256, "learning_rate": 7.8226014998707e-06, "loss": 0.7391, "step": 8130 }, { "epoch": 1.862933699759533, "grad_norm": 0.16230937838554382, "learning_rate": 7.757951900698216e-06, "loss": 0.7397, "step": 8135 }, { "epoch": 1.8640787816328868, "grad_norm": 0.17243419587612152, "learning_rate": 7.69330230152573e-06, "loss": 0.7401, "step": 8140 }, { "epoch": 1.8652238635062406, "grad_norm": 0.1455468386411667, "learning_rate": 7.628652702353247e-06, "loss": 0.7411, "step": 8145 }, { "epoch": 1.8663689453795946, "grad_norm": 0.1580912470817566, "learning_rate": 7.564003103180761e-06, "loss": 0.7408, "step": 8150 }, { "epoch": 1.8675140272529487, "grad_norm": 0.18351049721240997, "learning_rate": 7.499353504008276e-06, "loss": 0.7425, "step": 8155 }, { "epoch": 1.8686591091263025, "grad_norm": 0.1617344319820404, "learning_rate": 7.434703904835791e-06, "loss": 0.7398, "step": 8160 }, { "epoch": 1.8698041909996563, "grad_norm": 0.14651767909526825, "learning_rate": 7.370054305663305e-06, "loss": 0.7403, "step": 8165 }, { "epoch": 1.8709492728730104, "grad_norm": 0.15270645916461945, "learning_rate": 7.30540470649082e-06, "loss": 0.7394, "step": 8170 }, { "epoch": 1.8720943547463644, "grad_norm": 0.15083663165569305, "learning_rate": 7.240755107318335e-06, "loss": 0.7384, "step": 8175 }, { "epoch": 1.8732394366197183, "grad_norm": 0.18049649894237518, "learning_rate": 7.17610550814585e-06, "loss": 0.7441, "step": 8180 }, { "epoch": 1.8743845184930723, "grad_norm": 0.18089523911476135, "learning_rate": 7.111455908973365e-06, "loss": 0.7386, "step": 8185 }, { "epoch": 1.8755296003664261, "grad_norm": 0.19794507324695587, "learning_rate": 7.04680630980088e-06, "loss": 0.7408, "step": 8190 }, { "epoch": 1.8766746822397802, "grad_norm": 0.15723812580108643, "learning_rate": 6.982156710628395e-06, "loss": 0.7404, "step": 8195 }, { "epoch": 1.8778197641131342, "grad_norm": 0.14659984409809113, "learning_rate": 6.917507111455909e-06, "loss": 0.7427, "step": 8200 }, { "epoch": 1.878964845986488, "grad_norm": 0.14966751635074615, "learning_rate": 6.852857512283424e-06, "loss": 0.7358, "step": 8205 }, { "epoch": 1.8801099278598419, "grad_norm": 0.1722017079591751, "learning_rate": 6.788207913110939e-06, "loss": 0.7426, "step": 8210 }, { "epoch": 1.881255009733196, "grad_norm": 0.15833428502082825, "learning_rate": 6.723558313938453e-06, "loss": 0.7438, "step": 8215 }, { "epoch": 1.88240009160655, "grad_norm": 0.15574586391448975, "learning_rate": 6.6589087147659685e-06, "loss": 0.7382, "step": 8220 }, { "epoch": 1.8835451734799038, "grad_norm": 0.18555963039398193, "learning_rate": 6.5942591155934835e-06, "loss": 0.7406, "step": 8225 }, { "epoch": 1.8846902553532576, "grad_norm": 0.14833788573741913, "learning_rate": 6.529609516420998e-06, "loss": 0.7443, "step": 8230 }, { "epoch": 1.8858353372266117, "grad_norm": 0.15129245817661285, "learning_rate": 6.464959917248513e-06, "loss": 0.7388, "step": 8235 }, { "epoch": 1.8869804190999657, "grad_norm": 0.16850748658180237, "learning_rate": 6.400310318076028e-06, "loss": 0.741, "step": 8240 }, { "epoch": 1.8881255009733195, "grad_norm": 0.1815343052148819, "learning_rate": 6.335660718903542e-06, "loss": 0.7415, "step": 8245 }, { "epoch": 1.8892705828466734, "grad_norm": 0.17904560267925262, "learning_rate": 6.271011119731059e-06, "loss": 0.741, "step": 8250 }, { "epoch": 1.8904156647200274, "grad_norm": 0.1715519279241562, "learning_rate": 6.206361520558573e-06, "loss": 0.7427, "step": 8255 }, { "epoch": 1.8915607465933815, "grad_norm": 0.16421706974506378, "learning_rate": 6.141711921386087e-06, "loss": 0.7419, "step": 8260 }, { "epoch": 1.8927058284667355, "grad_norm": 0.150858074426651, "learning_rate": 6.077062322213602e-06, "loss": 0.7424, "step": 8265 }, { "epoch": 1.8938509103400893, "grad_norm": 0.19547419250011444, "learning_rate": 6.0124127230411174e-06, "loss": 0.741, "step": 8270 }, { "epoch": 1.8949959922134432, "grad_norm": 0.1885257363319397, "learning_rate": 5.947763123868632e-06, "loss": 0.7398, "step": 8275 }, { "epoch": 1.8961410740867972, "grad_norm": 0.17618173360824585, "learning_rate": 5.8831135246961476e-06, "loss": 0.7426, "step": 8280 }, { "epoch": 1.8972861559601513, "grad_norm": 0.19151267409324646, "learning_rate": 5.818463925523663e-06, "loss": 0.7419, "step": 8285 }, { "epoch": 1.898431237833505, "grad_norm": 0.17066292464733124, "learning_rate": 5.753814326351177e-06, "loss": 0.7412, "step": 8290 }, { "epoch": 1.899576319706859, "grad_norm": 0.17010915279388428, "learning_rate": 5.689164727178692e-06, "loss": 0.7413, "step": 8295 }, { "epoch": 1.900721401580213, "grad_norm": 0.14419379830360413, "learning_rate": 5.624515128006207e-06, "loss": 0.7415, "step": 8300 }, { "epoch": 1.901866483453567, "grad_norm": 0.15456563234329224, "learning_rate": 5.559865528833721e-06, "loss": 0.7393, "step": 8305 }, { "epoch": 1.9030115653269208, "grad_norm": 0.15366721153259277, "learning_rate": 5.495215929661236e-06, "loss": 0.7393, "step": 8310 }, { "epoch": 1.9041566472002747, "grad_norm": 0.13521674275398254, "learning_rate": 5.430566330488751e-06, "loss": 0.7416, "step": 8315 }, { "epoch": 1.9053017290736287, "grad_norm": 0.1475028693675995, "learning_rate": 5.3659167313162656e-06, "loss": 0.7368, "step": 8320 }, { "epoch": 1.9064468109469828, "grad_norm": 0.14698173105716705, "learning_rate": 5.301267132143781e-06, "loss": 0.7395, "step": 8325 }, { "epoch": 1.9075918928203368, "grad_norm": 0.1603127121925354, "learning_rate": 5.2366175329712965e-06, "loss": 0.7422, "step": 8330 }, { "epoch": 1.9087369746936906, "grad_norm": 0.13494108617305756, "learning_rate": 5.171967933798811e-06, "loss": 0.7399, "step": 8335 }, { "epoch": 1.9098820565670445, "grad_norm": 0.1673167645931244, "learning_rate": 5.107318334626326e-06, "loss": 0.7398, "step": 8340 }, { "epoch": 1.9110271384403985, "grad_norm": 0.19458456337451935, "learning_rate": 5.042668735453841e-06, "loss": 0.7438, "step": 8345 }, { "epoch": 1.9121722203137526, "grad_norm": 0.1579926759004593, "learning_rate": 4.978019136281355e-06, "loss": 0.7418, "step": 8350 }, { "epoch": 1.9133173021871064, "grad_norm": 0.18511444330215454, "learning_rate": 4.91336953710887e-06, "loss": 0.7379, "step": 8355 }, { "epoch": 1.9144623840604602, "grad_norm": 0.1794387549161911, "learning_rate": 4.848719937936385e-06, "loss": 0.743, "step": 8360 }, { "epoch": 1.9156074659338143, "grad_norm": 0.14920532703399658, "learning_rate": 4.7840703387638995e-06, "loss": 0.7411, "step": 8365 }, { "epoch": 1.9167525478071683, "grad_norm": 0.14493878185749054, "learning_rate": 4.7194207395914145e-06, "loss": 0.7393, "step": 8370 }, { "epoch": 1.9178976296805221, "grad_norm": 0.17128613591194153, "learning_rate": 4.65477114041893e-06, "loss": 0.7426, "step": 8375 }, { "epoch": 1.919042711553876, "grad_norm": 0.13501162827014923, "learning_rate": 4.590121541246444e-06, "loss": 0.74, "step": 8380 }, { "epoch": 1.92018779342723, "grad_norm": 0.18030951917171478, "learning_rate": 4.52547194207396e-06, "loss": 0.7379, "step": 8385 }, { "epoch": 1.921332875300584, "grad_norm": 0.1932855099439621, "learning_rate": 4.460822342901475e-06, "loss": 0.7438, "step": 8390 }, { "epoch": 1.922477957173938, "grad_norm": 0.1626242697238922, "learning_rate": 4.396172743728989e-06, "loss": 0.743, "step": 8395 }, { "epoch": 1.923623039047292, "grad_norm": 0.1467551589012146, "learning_rate": 4.331523144556504e-06, "loss": 0.7406, "step": 8400 }, { "epoch": 1.9247681209206458, "grad_norm": 0.1600460261106491, "learning_rate": 4.266873545384019e-06, "loss": 0.7414, "step": 8405 }, { "epoch": 1.9259132027939998, "grad_norm": 0.1701911985874176, "learning_rate": 4.202223946211533e-06, "loss": 0.7399, "step": 8410 }, { "epoch": 1.9270582846673538, "grad_norm": 0.15907885134220123, "learning_rate": 4.1375743470390484e-06, "loss": 0.7355, "step": 8415 }, { "epoch": 1.9282033665407077, "grad_norm": 0.14833064377307892, "learning_rate": 4.0729247478665635e-06, "loss": 0.747, "step": 8420 }, { "epoch": 1.9293484484140615, "grad_norm": 0.1576046198606491, "learning_rate": 4.008275148694078e-06, "loss": 0.7372, "step": 8425 }, { "epoch": 1.9304935302874155, "grad_norm": 0.1346118301153183, "learning_rate": 3.943625549521593e-06, "loss": 0.7375, "step": 8430 }, { "epoch": 1.9316386121607696, "grad_norm": 0.16976960003376007, "learning_rate": 3.878975950349108e-06, "loss": 0.7393, "step": 8435 }, { "epoch": 1.9327836940341234, "grad_norm": 0.16566011309623718, "learning_rate": 3.8143263511766233e-06, "loss": 0.7351, "step": 8440 }, { "epoch": 1.9339287759074772, "grad_norm": 0.1495964080095291, "learning_rate": 3.749676752004138e-06, "loss": 0.7378, "step": 8445 }, { "epoch": 1.9350738577808313, "grad_norm": 0.17395488917827606, "learning_rate": 3.6850271528316526e-06, "loss": 0.7394, "step": 8450 }, { "epoch": 1.9362189396541853, "grad_norm": 0.1591861993074417, "learning_rate": 3.6203775536591677e-06, "loss": 0.7377, "step": 8455 }, { "epoch": 1.9373640215275392, "grad_norm": 0.14285223186016083, "learning_rate": 3.5557279544866823e-06, "loss": 0.7416, "step": 8460 }, { "epoch": 1.9385091034008932, "grad_norm": 0.1831120252609253, "learning_rate": 3.4910783553141974e-06, "loss": 0.7338, "step": 8465 }, { "epoch": 1.939654185274247, "grad_norm": 0.1750773787498474, "learning_rate": 3.426428756141712e-06, "loss": 0.7395, "step": 8470 }, { "epoch": 1.940799267147601, "grad_norm": 0.16022038459777832, "learning_rate": 3.3617791569692267e-06, "loss": 0.7399, "step": 8475 }, { "epoch": 1.9419443490209551, "grad_norm": 0.1562938690185547, "learning_rate": 3.2971295577967418e-06, "loss": 0.7374, "step": 8480 }, { "epoch": 1.943089430894309, "grad_norm": 0.1531076729297638, "learning_rate": 3.2324799586242564e-06, "loss": 0.7366, "step": 8485 }, { "epoch": 1.9442345127676628, "grad_norm": 0.13733965158462524, "learning_rate": 3.167830359451771e-06, "loss": 0.7387, "step": 8490 }, { "epoch": 1.9453795946410168, "grad_norm": 0.1592342108488083, "learning_rate": 3.1031807602792865e-06, "loss": 0.7412, "step": 8495 }, { "epoch": 1.9465246765143709, "grad_norm": 0.13735567033290863, "learning_rate": 3.038531161106801e-06, "loss": 0.7424, "step": 8500 }, { "epoch": 1.9476697583877247, "grad_norm": 0.1416022926568985, "learning_rate": 2.973881561934316e-06, "loss": 0.7371, "step": 8505 }, { "epoch": 1.9488148402610785, "grad_norm": 0.15780256688594818, "learning_rate": 2.9092319627618313e-06, "loss": 0.7413, "step": 8510 }, { "epoch": 1.9499599221344326, "grad_norm": 0.15426452457904816, "learning_rate": 2.844582363589346e-06, "loss": 0.7399, "step": 8515 }, { "epoch": 1.9511050040077866, "grad_norm": 0.1280713975429535, "learning_rate": 2.7799327644168606e-06, "loss": 0.7386, "step": 8520 }, { "epoch": 1.9522500858811405, "grad_norm": 0.1521618515253067, "learning_rate": 2.7152831652443757e-06, "loss": 0.7361, "step": 8525 }, { "epoch": 1.9533951677544943, "grad_norm": 0.15051406621932983, "learning_rate": 2.6506335660718903e-06, "loss": 0.7436, "step": 8530 }, { "epoch": 1.9545402496278483, "grad_norm": 0.19529180228710175, "learning_rate": 2.5859839668994054e-06, "loss": 0.7364, "step": 8535 }, { "epoch": 1.9556853315012024, "grad_norm": 0.17485053837299347, "learning_rate": 2.5213343677269204e-06, "loss": 0.7413, "step": 8540 }, { "epoch": 1.9568304133745564, "grad_norm": 0.13643115758895874, "learning_rate": 2.456684768554435e-06, "loss": 0.7377, "step": 8545 }, { "epoch": 1.9579754952479103, "grad_norm": 0.16808709502220154, "learning_rate": 2.3920351693819497e-06, "loss": 0.7398, "step": 8550 }, { "epoch": 1.959120577121264, "grad_norm": 0.14400559663772583, "learning_rate": 2.327385570209465e-06, "loss": 0.7403, "step": 8555 }, { "epoch": 1.9602656589946181, "grad_norm": 0.15056101977825165, "learning_rate": 2.26273597103698e-06, "loss": 0.7426, "step": 8560 }, { "epoch": 1.9614107408679722, "grad_norm": 0.1426333487033844, "learning_rate": 2.1980863718644945e-06, "loss": 0.7387, "step": 8565 }, { "epoch": 1.962555822741326, "grad_norm": 0.14333635568618774, "learning_rate": 2.1334367726920096e-06, "loss": 0.7378, "step": 8570 }, { "epoch": 1.9637009046146798, "grad_norm": 0.15889368951320648, "learning_rate": 2.0687871735195242e-06, "loss": 0.7417, "step": 8575 }, { "epoch": 1.9648459864880339, "grad_norm": 0.1589675098657608, "learning_rate": 2.004137574347039e-06, "loss": 0.7391, "step": 8580 }, { "epoch": 1.965991068361388, "grad_norm": 0.14005248248577118, "learning_rate": 1.939487975174554e-06, "loss": 0.7368, "step": 8585 }, { "epoch": 1.9671361502347418, "grad_norm": 0.12994040548801422, "learning_rate": 1.874838376002069e-06, "loss": 0.7369, "step": 8590 }, { "epoch": 1.9682812321080956, "grad_norm": 0.15902163088321686, "learning_rate": 1.8101887768295838e-06, "loss": 0.7406, "step": 8595 }, { "epoch": 1.9694263139814496, "grad_norm": 0.14345116913318634, "learning_rate": 1.7455391776570987e-06, "loss": 0.7397, "step": 8600 }, { "epoch": 1.9705713958548037, "grad_norm": 0.12906809151172638, "learning_rate": 1.6808895784846133e-06, "loss": 0.7412, "step": 8605 }, { "epoch": 1.9717164777281577, "grad_norm": 0.15647900104522705, "learning_rate": 1.6162399793121282e-06, "loss": 0.7395, "step": 8610 }, { "epoch": 1.9728615596015115, "grad_norm": 0.14507581293582916, "learning_rate": 1.5515903801396433e-06, "loss": 0.7436, "step": 8615 }, { "epoch": 1.9740066414748654, "grad_norm": 0.1292596310377121, "learning_rate": 1.486940780967158e-06, "loss": 0.7404, "step": 8620 }, { "epoch": 1.9751517233482194, "grad_norm": 0.13533343374729156, "learning_rate": 1.422291181794673e-06, "loss": 0.7395, "step": 8625 }, { "epoch": 1.9762968052215735, "grad_norm": 0.14757055044174194, "learning_rate": 1.3576415826221878e-06, "loss": 0.7355, "step": 8630 }, { "epoch": 1.9774418870949273, "grad_norm": 0.15465658903121948, "learning_rate": 1.2929919834497027e-06, "loss": 0.7399, "step": 8635 }, { "epoch": 1.9785869689682811, "grad_norm": 0.16538003087043762, "learning_rate": 1.2283423842772175e-06, "loss": 0.7384, "step": 8640 }, { "epoch": 1.9797320508416352, "grad_norm": 0.14361347258090973, "learning_rate": 1.1636927851047324e-06, "loss": 0.7421, "step": 8645 }, { "epoch": 1.9808771327149892, "grad_norm": 0.13651369512081146, "learning_rate": 1.0990431859322473e-06, "loss": 0.7371, "step": 8650 }, { "epoch": 1.982022214588343, "grad_norm": 0.1409565657377243, "learning_rate": 1.0343935867597621e-06, "loss": 0.7391, "step": 8655 }, { "epoch": 1.9831672964616969, "grad_norm": 0.13974834978580475, "learning_rate": 9.69743987587277e-07, "loss": 0.7457, "step": 8660 }, { "epoch": 1.984312378335051, "grad_norm": 0.14657549560070038, "learning_rate": 9.050943884147919e-07, "loss": 0.7348, "step": 8665 }, { "epoch": 1.985457460208405, "grad_norm": 0.156954824924469, "learning_rate": 8.404447892423067e-07, "loss": 0.7441, "step": 8670 }, { "epoch": 1.9866025420817588, "grad_norm": 0.1401233673095703, "learning_rate": 7.757951900698216e-07, "loss": 0.7386, "step": 8675 }, { "epoch": 1.9877476239551128, "grad_norm": 0.14088112115859985, "learning_rate": 7.111455908973365e-07, "loss": 0.7359, "step": 8680 }, { "epoch": 1.9888927058284667, "grad_norm": 0.14570772647857666, "learning_rate": 6.464959917248513e-07, "loss": 0.7392, "step": 8685 }, { "epoch": 1.9900377877018207, "grad_norm": 0.15030637383460999, "learning_rate": 5.818463925523662e-07, "loss": 0.744, "step": 8690 }, { "epoch": 1.9911828695751748, "grad_norm": 0.13755646347999573, "learning_rate": 5.171967933798811e-07, "loss": 0.7355, "step": 8695 }, { "epoch": 1.9923279514485286, "grad_norm": 0.155239075422287, "learning_rate": 4.5254719420739596e-07, "loss": 0.7408, "step": 8700 }, { "epoch": 1.9934730333218824, "grad_norm": 0.1403430551290512, "learning_rate": 3.878975950349108e-07, "loss": 0.7427, "step": 8705 }, { "epoch": 1.9946181151952365, "grad_norm": 0.14680492877960205, "learning_rate": 3.2324799586242567e-07, "loss": 0.7434, "step": 8710 }, { "epoch": 1.9957631970685905, "grad_norm": 0.1654711365699768, "learning_rate": 2.5859839668994053e-07, "loss": 0.74, "step": 8715 }, { "epoch": 1.9969082789419443, "grad_norm": 0.14024050533771515, "learning_rate": 1.939487975174554e-07, "loss": 0.7425, "step": 8720 }, { "epoch": 1.9980533608152982, "grad_norm": 0.12798629701137543, "learning_rate": 1.2929919834497026e-07, "loss": 0.7389, "step": 8725 }, { "epoch": 1.9991984426886522, "grad_norm": 0.1456129103899002, "learning_rate": 6.464959917248513e-08, "loss": 0.7414, "step": 8730 } ], "logging_steps": 5, "max_steps": 8734, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }