mbert_mar-deva / trainer_state.json
DGurgurov's picture
Uploading checkpoint-39000 for mbert - mar-deva
b8afc27 verified
{
"best_metric": 0.7045323252677917,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/mar-Deva/checkpoint-39000",
"epoch": 91.54929577464789,
"eval_steps": 500,
"global_step": 39000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.1737089201877935,
"grad_norm": 3.1700332164764404,
"learning_rate": 9.95e-05,
"loss": 1.3483,
"step": 500
},
{
"epoch": 1.1737089201877935,
"eval_accuracy": 0.749312885729062,
"eval_loss": 1.1869930028915405,
"eval_runtime": 145.5506,
"eval_samples_per_second": 121.484,
"eval_steps_per_second": 3.799,
"step": 500
},
{
"epoch": 2.347417840375587,
"grad_norm": 2.450835704803467,
"learning_rate": 9.900000000000001e-05,
"loss": 1.16,
"step": 1000
},
{
"epoch": 2.347417840375587,
"eval_accuracy": 0.7679429076763364,
"eval_loss": 1.1002804040908813,
"eval_runtime": 138.5842,
"eval_samples_per_second": 127.59,
"eval_steps_per_second": 3.99,
"step": 1000
},
{
"epoch": 3.52112676056338,
"grad_norm": 2.5313944816589355,
"learning_rate": 9.850000000000001e-05,
"loss": 1.0796,
"step": 1500
},
{
"epoch": 3.52112676056338,
"eval_accuracy": 0.7788953556992149,
"eval_loss": 1.0443540811538696,
"eval_runtime": 143.0988,
"eval_samples_per_second": 123.565,
"eval_steps_per_second": 3.864,
"step": 1500
},
{
"epoch": 4.694835680751174,
"grad_norm": 2.6745097637176514,
"learning_rate": 9.8e-05,
"loss": 1.023,
"step": 2000
},
{
"epoch": 4.694835680751174,
"eval_accuracy": 0.7875008678674885,
"eval_loss": 0.9982088208198547,
"eval_runtime": 147.3047,
"eval_samples_per_second": 120.037,
"eval_steps_per_second": 3.754,
"step": 2000
},
{
"epoch": 5.868544600938967,
"grad_norm": 2.8676445484161377,
"learning_rate": 9.75e-05,
"loss": 0.9784,
"step": 2500
},
{
"epoch": 5.868544600938967,
"eval_accuracy": 0.7940504772991072,
"eval_loss": 0.964560329914093,
"eval_runtime": 137.4737,
"eval_samples_per_second": 128.621,
"eval_steps_per_second": 4.023,
"step": 2500
},
{
"epoch": 7.042253521126761,
"grad_norm": 2.3849828243255615,
"learning_rate": 9.7e-05,
"loss": 0.9394,
"step": 3000
},
{
"epoch": 7.042253521126761,
"eval_accuracy": 0.7981764510868291,
"eval_loss": 0.9444334506988525,
"eval_runtime": 147.2364,
"eval_samples_per_second": 120.093,
"eval_steps_per_second": 3.756,
"step": 3000
},
{
"epoch": 8.215962441314554,
"grad_norm": 2.1890273094177246,
"learning_rate": 9.65e-05,
"loss": 0.9119,
"step": 3500
},
{
"epoch": 8.215962441314554,
"eval_accuracy": 0.8029835860860965,
"eval_loss": 0.9215248823165894,
"eval_runtime": 147.2738,
"eval_samples_per_second": 120.062,
"eval_steps_per_second": 3.755,
"step": 3500
},
{
"epoch": 9.389671361502348,
"grad_norm": 2.342008590698242,
"learning_rate": 9.6e-05,
"loss": 0.8872,
"step": 4000
},
{
"epoch": 9.389671361502348,
"eval_accuracy": 0.805941164259263,
"eval_loss": 0.916360080242157,
"eval_runtime": 147.2413,
"eval_samples_per_second": 120.089,
"eval_steps_per_second": 3.756,
"step": 4000
},
{
"epoch": 10.56338028169014,
"grad_norm": 2.2766637802124023,
"learning_rate": 9.55e-05,
"loss": 0.8629,
"step": 4500
},
{
"epoch": 10.56338028169014,
"eval_accuracy": 0.8102512145065337,
"eval_loss": 0.898952066898346,
"eval_runtime": 147.2077,
"eval_samples_per_second": 120.116,
"eval_steps_per_second": 3.757,
"step": 4500
},
{
"epoch": 11.737089201877934,
"grad_norm": 2.391869306564331,
"learning_rate": 9.5e-05,
"loss": 0.8477,
"step": 5000
},
{
"epoch": 11.737089201877934,
"eval_accuracy": 0.8117510986190105,
"eval_loss": 0.8836262822151184,
"eval_runtime": 145.6587,
"eval_samples_per_second": 121.393,
"eval_steps_per_second": 3.797,
"step": 5000
},
{
"epoch": 12.910798122065728,
"grad_norm": 2.4697470664978027,
"learning_rate": 9.449999999999999e-05,
"loss": 0.8261,
"step": 5500
},
{
"epoch": 12.910798122065728,
"eval_accuracy": 0.8150349185540299,
"eval_loss": 0.8613038063049316,
"eval_runtime": 145.7167,
"eval_samples_per_second": 121.345,
"eval_steps_per_second": 3.795,
"step": 5500
},
{
"epoch": 14.084507042253522,
"grad_norm": 2.36245059967041,
"learning_rate": 9.4e-05,
"loss": 0.8072,
"step": 6000
},
{
"epoch": 14.084507042253522,
"eval_accuracy": 0.816731876355394,
"eval_loss": 0.8536739945411682,
"eval_runtime": 147.07,
"eval_samples_per_second": 120.228,
"eval_steps_per_second": 3.76,
"step": 6000
},
{
"epoch": 15.258215962441314,
"grad_norm": 2.285848379135132,
"learning_rate": 9.350000000000001e-05,
"loss": 0.7924,
"step": 6500
},
{
"epoch": 15.258215962441314,
"eval_accuracy": 0.8197007489640474,
"eval_loss": 0.8382024765014648,
"eval_runtime": 147.1867,
"eval_samples_per_second": 120.133,
"eval_steps_per_second": 3.757,
"step": 6500
},
{
"epoch": 16.431924882629108,
"grad_norm": 2.218266010284424,
"learning_rate": 9.300000000000001e-05,
"loss": 0.7815,
"step": 7000
},
{
"epoch": 16.431924882629108,
"eval_accuracy": 0.8211554348763181,
"eval_loss": 0.8357976675033569,
"eval_runtime": 135.7542,
"eval_samples_per_second": 130.25,
"eval_steps_per_second": 4.074,
"step": 7000
},
{
"epoch": 17.6056338028169,
"grad_norm": 2.300184488296509,
"learning_rate": 9.250000000000001e-05,
"loss": 0.767,
"step": 7500
},
{
"epoch": 17.6056338028169,
"eval_accuracy": 0.8242465371712792,
"eval_loss": 0.8331694006919861,
"eval_runtime": 145.3933,
"eval_samples_per_second": 121.615,
"eval_steps_per_second": 3.803,
"step": 7500
},
{
"epoch": 18.779342723004696,
"grad_norm": 2.1632742881774902,
"learning_rate": 9.200000000000001e-05,
"loss": 0.7541,
"step": 8000
},
{
"epoch": 18.779342723004696,
"eval_accuracy": 0.8246549447936785,
"eval_loss": 0.8179985880851746,
"eval_runtime": 139.0754,
"eval_samples_per_second": 127.14,
"eval_steps_per_second": 3.976,
"step": 8000
},
{
"epoch": 19.953051643192488,
"grad_norm": 2.2807085514068604,
"learning_rate": 9.15e-05,
"loss": 0.7462,
"step": 8500
},
{
"epoch": 19.953051643192488,
"eval_accuracy": 0.8256249942473,
"eval_loss": 0.8264754414558411,
"eval_runtime": 147.2594,
"eval_samples_per_second": 120.074,
"eval_steps_per_second": 3.755,
"step": 8500
},
{
"epoch": 21.12676056338028,
"grad_norm": 2.210843563079834,
"learning_rate": 9.1e-05,
"loss": 0.7323,
"step": 9000
},
{
"epoch": 21.12676056338028,
"eval_accuracy": 0.8267119847302682,
"eval_loss": 0.8105438947677612,
"eval_runtime": 136.1823,
"eval_samples_per_second": 129.841,
"eval_steps_per_second": 4.061,
"step": 9000
},
{
"epoch": 22.300469483568076,
"grad_norm": 2.389461040496826,
"learning_rate": 9.05e-05,
"loss": 0.7212,
"step": 9500
},
{
"epoch": 22.300469483568076,
"eval_accuracy": 0.8280417403957945,
"eval_loss": 0.8193202018737793,
"eval_runtime": 147.4213,
"eval_samples_per_second": 119.942,
"eval_steps_per_second": 3.751,
"step": 9500
},
{
"epoch": 23.474178403755868,
"grad_norm": 2.059155225753784,
"learning_rate": 9e-05,
"loss": 0.7086,
"step": 10000
},
{
"epoch": 23.474178403755868,
"eval_accuracy": 0.8300135381668301,
"eval_loss": 0.8017289638519287,
"eval_runtime": 137.0188,
"eval_samples_per_second": 129.048,
"eval_steps_per_second": 4.036,
"step": 10000
},
{
"epoch": 24.647887323943664,
"grad_norm": 2.143441677093506,
"learning_rate": 8.950000000000001e-05,
"loss": 0.7018,
"step": 10500
},
{
"epoch": 24.647887323943664,
"eval_accuracy": 0.8309204998181491,
"eval_loss": 0.7943059206008911,
"eval_runtime": 136.1098,
"eval_samples_per_second": 129.91,
"eval_steps_per_second": 4.063,
"step": 10500
},
{
"epoch": 25.821596244131456,
"grad_norm": 2.3048555850982666,
"learning_rate": 8.900000000000001e-05,
"loss": 0.6949,
"step": 11000
},
{
"epoch": 25.821596244131456,
"eval_accuracy": 0.8313006683640194,
"eval_loss": 0.7950281500816345,
"eval_runtime": 147.4239,
"eval_samples_per_second": 119.94,
"eval_steps_per_second": 3.751,
"step": 11000
},
{
"epoch": 26.995305164319248,
"grad_norm": 1.9936089515686035,
"learning_rate": 8.850000000000001e-05,
"loss": 0.6917,
"step": 11500
},
{
"epoch": 26.995305164319248,
"eval_accuracy": 0.8331900538385245,
"eval_loss": 0.7858129143714905,
"eval_runtime": 137.4392,
"eval_samples_per_second": 128.653,
"eval_steps_per_second": 4.024,
"step": 11500
},
{
"epoch": 28.169014084507044,
"grad_norm": 2.1459641456604004,
"learning_rate": 8.800000000000001e-05,
"loss": 0.6782,
"step": 12000
},
{
"epoch": 28.169014084507044,
"eval_accuracy": 0.8343617601040733,
"eval_loss": 0.7777819037437439,
"eval_runtime": 147.5831,
"eval_samples_per_second": 119.81,
"eval_steps_per_second": 3.747,
"step": 12000
},
{
"epoch": 29.342723004694836,
"grad_norm": 2.0599782466888428,
"learning_rate": 8.75e-05,
"loss": 0.672,
"step": 12500
},
{
"epoch": 29.342723004694836,
"eval_accuracy": 0.8344647882838259,
"eval_loss": 0.7832308411598206,
"eval_runtime": 137.4669,
"eval_samples_per_second": 128.627,
"eval_steps_per_second": 4.023,
"step": 12500
},
{
"epoch": 30.516431924882628,
"grad_norm": 2.52083158493042,
"learning_rate": 8.7e-05,
"loss": 0.6612,
"step": 13000
},
{
"epoch": 30.516431924882628,
"eval_accuracy": 0.835733848691268,
"eval_loss": 0.7805770039558411,
"eval_runtime": 137.4676,
"eval_samples_per_second": 128.627,
"eval_steps_per_second": 4.023,
"step": 13000
},
{
"epoch": 31.690140845070424,
"grad_norm": 2.808067560195923,
"learning_rate": 8.65e-05,
"loss": 0.653,
"step": 13500
},
{
"epoch": 31.690140845070424,
"eval_accuracy": 0.8369936505384986,
"eval_loss": 0.7829101085662842,
"eval_runtime": 147.4085,
"eval_samples_per_second": 119.952,
"eval_steps_per_second": 3.751,
"step": 13500
},
{
"epoch": 32.863849765258216,
"grad_norm": 2.1012039184570312,
"learning_rate": 8.6e-05,
"loss": 0.6509,
"step": 14000
},
{
"epoch": 32.863849765258216,
"eval_accuracy": 0.8373059658134827,
"eval_loss": 0.7640124559402466,
"eval_runtime": 137.589,
"eval_samples_per_second": 128.513,
"eval_steps_per_second": 4.019,
"step": 14000
},
{
"epoch": 34.03755868544601,
"grad_norm": 2.1211578845977783,
"learning_rate": 8.55e-05,
"loss": 0.6403,
"step": 14500
},
{
"epoch": 34.03755868544601,
"eval_accuracy": 0.8385678802262423,
"eval_loss": 0.7672787308692932,
"eval_runtime": 146.9801,
"eval_samples_per_second": 120.302,
"eval_steps_per_second": 3.762,
"step": 14500
},
{
"epoch": 35.2112676056338,
"grad_norm": 2.017094135284424,
"learning_rate": 8.5e-05,
"loss": 0.6348,
"step": 15000
},
{
"epoch": 35.2112676056338,
"eval_accuracy": 0.8392093131517121,
"eval_loss": 0.7593186497688293,
"eval_runtime": 137.1351,
"eval_samples_per_second": 128.939,
"eval_steps_per_second": 4.033,
"step": 15000
},
{
"epoch": 36.3849765258216,
"grad_norm": 2.0924570560455322,
"learning_rate": 8.450000000000001e-05,
"loss": 0.6297,
"step": 15500
},
{
"epoch": 36.3849765258216,
"eval_accuracy": 0.8399583286650959,
"eval_loss": 0.763680636882782,
"eval_runtime": 147.5378,
"eval_samples_per_second": 119.847,
"eval_steps_per_second": 3.748,
"step": 15500
},
{
"epoch": 37.55868544600939,
"grad_norm": 2.131864070892334,
"learning_rate": 8.4e-05,
"loss": 0.6238,
"step": 16000
},
{
"epoch": 37.55868544600939,
"eval_accuracy": 0.840090867939712,
"eval_loss": 0.7627538442611694,
"eval_runtime": 137.0753,
"eval_samples_per_second": 128.995,
"eval_steps_per_second": 4.034,
"step": 16000
},
{
"epoch": 38.732394366197184,
"grad_norm": 2.12919545173645,
"learning_rate": 8.35e-05,
"loss": 0.6162,
"step": 16500
},
{
"epoch": 38.732394366197184,
"eval_accuracy": 0.8413376227933144,
"eval_loss": 0.7513773441314697,
"eval_runtime": 147.6687,
"eval_samples_per_second": 119.741,
"eval_steps_per_second": 3.745,
"step": 16500
},
{
"epoch": 39.906103286384976,
"grad_norm": 2.1172847747802734,
"learning_rate": 8.3e-05,
"loss": 0.6121,
"step": 17000
},
{
"epoch": 39.906103286384976,
"eval_accuracy": 0.8418376093750392,
"eval_loss": 0.753982663154602,
"eval_runtime": 136.0303,
"eval_samples_per_second": 129.986,
"eval_steps_per_second": 4.065,
"step": 17000
},
{
"epoch": 41.07981220657277,
"grad_norm": 2.195590019226074,
"learning_rate": 8.25e-05,
"loss": 0.6078,
"step": 17500
},
{
"epoch": 41.07981220657277,
"eval_accuracy": 0.8428805964270359,
"eval_loss": 0.7542482614517212,
"eval_runtime": 137.0472,
"eval_samples_per_second": 129.021,
"eval_steps_per_second": 4.035,
"step": 17500
},
{
"epoch": 42.25352112676056,
"grad_norm": 2.067308187484741,
"learning_rate": 8.2e-05,
"loss": 0.601,
"step": 18000
},
{
"epoch": 42.25352112676056,
"eval_accuracy": 0.8427417329846963,
"eval_loss": 0.7472436428070068,
"eval_runtime": 136.152,
"eval_samples_per_second": 129.87,
"eval_steps_per_second": 4.062,
"step": 18000
},
{
"epoch": 43.42723004694836,
"grad_norm": 2.278040885925293,
"learning_rate": 8.15e-05,
"loss": 0.5962,
"step": 18500
},
{
"epoch": 43.42723004694836,
"eval_accuracy": 0.8429750545039596,
"eval_loss": 0.7438804507255554,
"eval_runtime": 136.1815,
"eval_samples_per_second": 129.841,
"eval_steps_per_second": 4.061,
"step": 18500
},
{
"epoch": 44.60093896713615,
"grad_norm": 2.0480079650878906,
"learning_rate": 8.1e-05,
"loss": 0.5921,
"step": 19000
},
{
"epoch": 44.60093896713615,
"eval_accuracy": 0.8430666280582005,
"eval_loss": 0.7555158734321594,
"eval_runtime": 147.7547,
"eval_samples_per_second": 119.671,
"eval_steps_per_second": 3.743,
"step": 19000
},
{
"epoch": 45.774647887323944,
"grad_norm": 2.0856966972351074,
"learning_rate": 8.05e-05,
"loss": 0.5828,
"step": 19500
},
{
"epoch": 45.774647887323944,
"eval_accuracy": 0.8444825839615888,
"eval_loss": 0.7403737902641296,
"eval_runtime": 142.7228,
"eval_samples_per_second": 123.89,
"eval_steps_per_second": 3.875,
"step": 19500
},
{
"epoch": 46.948356807511736,
"grad_norm": 2.0801377296447754,
"learning_rate": 8e-05,
"loss": 0.5805,
"step": 20000
},
{
"epoch": 46.948356807511736,
"eval_accuracy": 0.8456604553444954,
"eval_loss": 0.7381341457366943,
"eval_runtime": 136.8181,
"eval_samples_per_second": 129.237,
"eval_steps_per_second": 4.042,
"step": 20000
},
{
"epoch": 48.12206572769953,
"grad_norm": 2.0854578018188477,
"learning_rate": 7.950000000000001e-05,
"loss": 0.577,
"step": 20500
},
{
"epoch": 48.12206572769953,
"eval_accuracy": 0.8459030596342113,
"eval_loss": 0.7439441680908203,
"eval_runtime": 137.0553,
"eval_samples_per_second": 129.014,
"eval_steps_per_second": 4.035,
"step": 20500
},
{
"epoch": 49.29577464788732,
"grad_norm": 2.0203611850738525,
"learning_rate": 7.900000000000001e-05,
"loss": 0.5702,
"step": 21000
},
{
"epoch": 49.29577464788732,
"eval_accuracy": 0.8458144906353607,
"eval_loss": 0.739824116230011,
"eval_runtime": 147.5519,
"eval_samples_per_second": 119.836,
"eval_steps_per_second": 3.748,
"step": 21000
},
{
"epoch": 50.46948356807512,
"grad_norm": 2.07930326461792,
"learning_rate": 7.850000000000001e-05,
"loss": 0.5626,
"step": 21500
},
{
"epoch": 50.46948356807512,
"eval_accuracy": 0.847225799976582,
"eval_loss": 0.7376012206077576,
"eval_runtime": 140.7844,
"eval_samples_per_second": 125.596,
"eval_steps_per_second": 3.928,
"step": 21500
},
{
"epoch": 51.64319248826291,
"grad_norm": 1.9633455276489258,
"learning_rate": 7.800000000000001e-05,
"loss": 0.5644,
"step": 22000
},
{
"epoch": 51.64319248826291,
"eval_accuracy": 0.8477510357072401,
"eval_loss": 0.7312297224998474,
"eval_runtime": 136.8418,
"eval_samples_per_second": 129.215,
"eval_steps_per_second": 4.041,
"step": 22000
},
{
"epoch": 52.816901408450704,
"grad_norm": 2.0171732902526855,
"learning_rate": 7.75e-05,
"loss": 0.5575,
"step": 22500
},
{
"epoch": 52.816901408450704,
"eval_accuracy": 0.8479556976991851,
"eval_loss": 0.7324073314666748,
"eval_runtime": 136.8371,
"eval_samples_per_second": 129.219,
"eval_steps_per_second": 4.041,
"step": 22500
},
{
"epoch": 53.990610328638496,
"grad_norm": 1.9490004777908325,
"learning_rate": 7.7e-05,
"loss": 0.5504,
"step": 23000
},
{
"epoch": 53.990610328638496,
"eval_accuracy": 0.8482885853024055,
"eval_loss": 0.739380955696106,
"eval_runtime": 136.7899,
"eval_samples_per_second": 129.264,
"eval_steps_per_second": 4.043,
"step": 23000
},
{
"epoch": 55.16431924882629,
"grad_norm": 2.1706230640411377,
"learning_rate": 7.65e-05,
"loss": 0.5467,
"step": 23500
},
{
"epoch": 55.16431924882629,
"eval_accuracy": 0.8485212684731438,
"eval_loss": 0.7348983287811279,
"eval_runtime": 137.1652,
"eval_samples_per_second": 128.91,
"eval_steps_per_second": 4.032,
"step": 23500
},
{
"epoch": 56.33802816901409,
"grad_norm": 2.046226978302002,
"learning_rate": 7.6e-05,
"loss": 0.5434,
"step": 24000
},
{
"epoch": 56.33802816901409,
"eval_accuracy": 0.8486974677705212,
"eval_loss": 0.736895740032196,
"eval_runtime": 136.9534,
"eval_samples_per_second": 129.11,
"eval_steps_per_second": 4.038,
"step": 24000
},
{
"epoch": 57.51173708920188,
"grad_norm": 1.9242944717407227,
"learning_rate": 7.55e-05,
"loss": 0.5378,
"step": 24500
},
{
"epoch": 57.51173708920188,
"eval_accuracy": 0.8492700287304648,
"eval_loss": 0.7279884815216064,
"eval_runtime": 136.7008,
"eval_samples_per_second": 129.348,
"eval_steps_per_second": 4.045,
"step": 24500
},
{
"epoch": 58.68544600938967,
"grad_norm": 1.9436827898025513,
"learning_rate": 7.500000000000001e-05,
"loss": 0.5382,
"step": 25000
},
{
"epoch": 58.68544600938967,
"eval_accuracy": 0.8496587662017004,
"eval_loss": 0.7233351469039917,
"eval_runtime": 136.9816,
"eval_samples_per_second": 129.083,
"eval_steps_per_second": 4.037,
"step": 25000
},
{
"epoch": 59.859154929577464,
"grad_norm": 2.442077159881592,
"learning_rate": 7.450000000000001e-05,
"loss": 0.5334,
"step": 25500
},
{
"epoch": 59.859154929577464,
"eval_accuracy": 0.8502614948790587,
"eval_loss": 0.7267663478851318,
"eval_runtime": 140.4178,
"eval_samples_per_second": 125.924,
"eval_steps_per_second": 3.938,
"step": 25500
},
{
"epoch": 61.032863849765256,
"grad_norm": 2.1809020042419434,
"learning_rate": 7.4e-05,
"loss": 0.5295,
"step": 26000
},
{
"epoch": 61.032863849765256,
"eval_accuracy": 0.8492560240081366,
"eval_loss": 0.7322823405265808,
"eval_runtime": 137.5974,
"eval_samples_per_second": 128.505,
"eval_steps_per_second": 4.019,
"step": 26000
},
{
"epoch": 62.20657276995305,
"grad_norm": 2.0946710109710693,
"learning_rate": 7.35e-05,
"loss": 0.5215,
"step": 26500
},
{
"epoch": 62.20657276995305,
"eval_accuracy": 0.8505152353921837,
"eval_loss": 0.721396803855896,
"eval_runtime": 136.7594,
"eval_samples_per_second": 129.293,
"eval_steps_per_second": 4.044,
"step": 26500
},
{
"epoch": 63.38028169014085,
"grad_norm": 2.1418216228485107,
"learning_rate": 7.3e-05,
"loss": 0.5188,
"step": 27000
},
{
"epoch": 63.38028169014085,
"eval_accuracy": 0.8503667583911142,
"eval_loss": 0.7287681698799133,
"eval_runtime": 136.8587,
"eval_samples_per_second": 129.199,
"eval_steps_per_second": 4.041,
"step": 27000
},
{
"epoch": 64.55399061032864,
"grad_norm": 2.0790982246398926,
"learning_rate": 7.25e-05,
"loss": 0.5168,
"step": 27500
},
{
"epoch": 64.55399061032864,
"eval_accuracy": 0.8514007297976166,
"eval_loss": 0.7309630513191223,
"eval_runtime": 136.1553,
"eval_samples_per_second": 129.866,
"eval_steps_per_second": 4.062,
"step": 27500
},
{
"epoch": 65.72769953051643,
"grad_norm": 2.085266351699829,
"learning_rate": 7.2e-05,
"loss": 0.5125,
"step": 28000
},
{
"epoch": 65.72769953051643,
"eval_accuracy": 0.8518668186024253,
"eval_loss": 0.7231945395469666,
"eval_runtime": 147.5131,
"eval_samples_per_second": 119.867,
"eval_steps_per_second": 3.749,
"step": 28000
},
{
"epoch": 66.90140845070422,
"grad_norm": 1.7428772449493408,
"learning_rate": 7.15e-05,
"loss": 0.5076,
"step": 28500
},
{
"epoch": 66.90140845070422,
"eval_accuracy": 0.8518548866516221,
"eval_loss": 0.726669430732727,
"eval_runtime": 139.3497,
"eval_samples_per_second": 126.889,
"eval_steps_per_second": 3.968,
"step": 28500
},
{
"epoch": 68.07511737089202,
"grad_norm": 1.9387340545654297,
"learning_rate": 7.1e-05,
"loss": 0.5018,
"step": 29000
},
{
"epoch": 68.07511737089202,
"eval_accuracy": 0.8516039425023846,
"eval_loss": 0.7305765151977539,
"eval_runtime": 136.8809,
"eval_samples_per_second": 129.178,
"eval_steps_per_second": 4.04,
"step": 29000
},
{
"epoch": 69.24882629107981,
"grad_norm": 2.090191125869751,
"learning_rate": 7.05e-05,
"loss": 0.5011,
"step": 29500
},
{
"epoch": 69.24882629107981,
"eval_accuracy": 0.8522613468602297,
"eval_loss": 0.723237931728363,
"eval_runtime": 137.4062,
"eval_samples_per_second": 128.684,
"eval_steps_per_second": 4.025,
"step": 29500
},
{
"epoch": 70.4225352112676,
"grad_norm": 1.8190377950668335,
"learning_rate": 7e-05,
"loss": 0.4986,
"step": 30000
},
{
"epoch": 70.4225352112676,
"eval_accuracy": 0.8533026826846422,
"eval_loss": 0.7156932353973389,
"eval_runtime": 136.1553,
"eval_samples_per_second": 129.866,
"eval_steps_per_second": 4.062,
"step": 30000
},
{
"epoch": 71.59624413145539,
"grad_norm": 2.007854700088501,
"learning_rate": 6.95e-05,
"loss": 0.4915,
"step": 30500
},
{
"epoch": 71.59624413145539,
"eval_accuracy": 0.8527300721103733,
"eval_loss": 0.7252740859985352,
"eval_runtime": 136.7888,
"eval_samples_per_second": 129.265,
"eval_steps_per_second": 4.043,
"step": 30500
},
{
"epoch": 72.7699530516432,
"grad_norm": 2.4322381019592285,
"learning_rate": 6.9e-05,
"loss": 0.4934,
"step": 31000
},
{
"epoch": 72.7699530516432,
"eval_accuracy": 0.8530242933183955,
"eval_loss": 0.7319638133049011,
"eval_runtime": 144.2929,
"eval_samples_per_second": 122.542,
"eval_steps_per_second": 3.832,
"step": 31000
},
{
"epoch": 73.94366197183099,
"grad_norm": 2.21516489982605,
"learning_rate": 6.850000000000001e-05,
"loss": 0.4895,
"step": 31500
},
{
"epoch": 73.94366197183099,
"eval_accuracy": 0.8537945053849306,
"eval_loss": 0.7180453538894653,
"eval_runtime": 141.0891,
"eval_samples_per_second": 125.325,
"eval_steps_per_second": 3.92,
"step": 31500
},
{
"epoch": 75.11737089201878,
"grad_norm": 2.0788252353668213,
"learning_rate": 6.800000000000001e-05,
"loss": 0.4822,
"step": 32000
},
{
"epoch": 75.11737089201878,
"eval_accuracy": 0.8538218958075418,
"eval_loss": 0.7125606536865234,
"eval_runtime": 137.0269,
"eval_samples_per_second": 129.04,
"eval_steps_per_second": 4.036,
"step": 32000
},
{
"epoch": 76.29107981220658,
"grad_norm": 2.038712978363037,
"learning_rate": 6.750000000000001e-05,
"loss": 0.4807,
"step": 32500
},
{
"epoch": 76.29107981220658,
"eval_accuracy": 0.8543182928024404,
"eval_loss": 0.7191519141197205,
"eval_runtime": 136.1481,
"eval_samples_per_second": 129.873,
"eval_steps_per_second": 4.062,
"step": 32500
},
{
"epoch": 77.46478873239437,
"grad_norm": 2.133082389831543,
"learning_rate": 6.7e-05,
"loss": 0.4743,
"step": 33000
},
{
"epoch": 77.46478873239437,
"eval_accuracy": 0.8547410139669203,
"eval_loss": 0.7057022452354431,
"eval_runtime": 136.0992,
"eval_samples_per_second": 129.92,
"eval_steps_per_second": 4.063,
"step": 33000
},
{
"epoch": 78.63849765258216,
"grad_norm": 1.9323476552963257,
"learning_rate": 6.65e-05,
"loss": 0.4749,
"step": 33500
},
{
"epoch": 78.63849765258216,
"eval_accuracy": 0.8546438959333029,
"eval_loss": 0.7180441617965698,
"eval_runtime": 141.0328,
"eval_samples_per_second": 125.375,
"eval_steps_per_second": 3.921,
"step": 33500
},
{
"epoch": 79.81220657276995,
"grad_norm": 2.1482994556427,
"learning_rate": 6.6e-05,
"loss": 0.4693,
"step": 34000
},
{
"epoch": 79.81220657276995,
"eval_accuracy": 0.8554924128329217,
"eval_loss": 0.7105869054794312,
"eval_runtime": 137.5148,
"eval_samples_per_second": 128.583,
"eval_steps_per_second": 4.021,
"step": 34000
},
{
"epoch": 80.98591549295774,
"grad_norm": 2.077798366546631,
"learning_rate": 6.55e-05,
"loss": 0.4675,
"step": 34500
},
{
"epoch": 80.98591549295774,
"eval_accuracy": 0.8544003573962279,
"eval_loss": 0.7216335535049438,
"eval_runtime": 142.3955,
"eval_samples_per_second": 124.175,
"eval_steps_per_second": 3.884,
"step": 34500
},
{
"epoch": 82.15962441314554,
"grad_norm": 2.2198646068573,
"learning_rate": 6.500000000000001e-05,
"loss": 0.4637,
"step": 35000
},
{
"epoch": 82.15962441314554,
"eval_accuracy": 0.855125887761347,
"eval_loss": 0.717957079410553,
"eval_runtime": 137.0613,
"eval_samples_per_second": 129.008,
"eval_steps_per_second": 4.035,
"step": 35000
},
{
"epoch": 83.33333333333333,
"grad_norm": 2.155299186706543,
"learning_rate": 6.450000000000001e-05,
"loss": 0.461,
"step": 35500
},
{
"epoch": 83.33333333333333,
"eval_accuracy": 0.8554556348895517,
"eval_loss": 0.7124961614608765,
"eval_runtime": 145.4916,
"eval_samples_per_second": 121.533,
"eval_steps_per_second": 3.801,
"step": 35500
},
{
"epoch": 84.50704225352112,
"grad_norm": 2.8300819396972656,
"learning_rate": 6.400000000000001e-05,
"loss": 0.4542,
"step": 36000
},
{
"epoch": 84.50704225352112,
"eval_accuracy": 0.856205421066434,
"eval_loss": 0.7241988182067871,
"eval_runtime": 138.1126,
"eval_samples_per_second": 128.026,
"eval_steps_per_second": 4.004,
"step": 36000
},
{
"epoch": 85.68075117370893,
"grad_norm": 2.4033689498901367,
"learning_rate": 6.35e-05,
"loss": 0.4564,
"step": 36500
},
{
"epoch": 85.68075117370893,
"eval_accuracy": 0.8557880398860113,
"eval_loss": 0.7188218832015991,
"eval_runtime": 138.5628,
"eval_samples_per_second": 127.61,
"eval_steps_per_second": 3.991,
"step": 36500
},
{
"epoch": 86.85446009389672,
"grad_norm": 2.104976177215576,
"learning_rate": 6.3e-05,
"loss": 0.4497,
"step": 37000
},
{
"epoch": 86.85446009389672,
"eval_accuracy": 0.8571525482780049,
"eval_loss": 0.718337893486023,
"eval_runtime": 136.8784,
"eval_samples_per_second": 129.18,
"eval_steps_per_second": 4.04,
"step": 37000
},
{
"epoch": 88.02816901408451,
"grad_norm": 1.9449199438095093,
"learning_rate": 6.25e-05,
"loss": 0.4485,
"step": 37500
},
{
"epoch": 88.02816901408451,
"eval_accuracy": 0.8572300673528399,
"eval_loss": 0.7067864537239075,
"eval_runtime": 140.6771,
"eval_samples_per_second": 125.692,
"eval_steps_per_second": 3.931,
"step": 37500
},
{
"epoch": 89.2018779342723,
"grad_norm": 2.0170960426330566,
"learning_rate": 6.2e-05,
"loss": 0.4486,
"step": 38000
},
{
"epoch": 89.2018779342723,
"eval_accuracy": 0.8567210575027162,
"eval_loss": 0.7134066820144653,
"eval_runtime": 137.1729,
"eval_samples_per_second": 128.903,
"eval_steps_per_second": 4.031,
"step": 38000
},
{
"epoch": 90.3755868544601,
"grad_norm": 2.048830032348633,
"learning_rate": 6.15e-05,
"loss": 0.4439,
"step": 38500
},
{
"epoch": 90.3755868544601,
"eval_accuracy": 0.8563134552098389,
"eval_loss": 0.7194843292236328,
"eval_runtime": 136.9411,
"eval_samples_per_second": 129.121,
"eval_steps_per_second": 4.038,
"step": 38500
},
{
"epoch": 91.54929577464789,
"grad_norm": 2.0259907245635986,
"learning_rate": 6.1e-05,
"loss": 0.4385,
"step": 39000
},
{
"epoch": 91.54929577464789,
"eval_accuracy": 0.8580620221342309,
"eval_loss": 0.7045323252677917,
"eval_runtime": 137.122,
"eval_samples_per_second": 128.951,
"eval_steps_per_second": 4.033,
"step": 39000
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 235,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2882090840162304e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}