alenphilip's picture
Training in progress, step 428, checkpoint
65901d7 verified
{
"best_global_step": 400,
"best_metric": 0.754337888795046,
"best_model_checkpoint": "./qwen2.5-7b-sft-qlora/checkpoint-400",
"epoch": 2.0,
"eval_steps": 50,
"global_step": 428,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.7451270699501038,
"epoch": 0.04678362573099415,
"grad_norm": 0.9813888669013977,
"learning_rate": 4.186046511627907e-05,
"loss": 1.6867,
"mean_token_accuracy": 0.6874781519174575,
"num_tokens": 237406.0,
"step": 10
},
{
"entropy": 0.980076114833355,
"epoch": 0.0935672514619883,
"grad_norm": 0.38563278317451477,
"learning_rate": 8.837209302325582e-05,
"loss": 1.1195,
"mean_token_accuracy": 0.7362557649612427,
"num_tokens": 477881.0,
"step": 20
},
{
"entropy": 0.8651187509298325,
"epoch": 0.14035087719298245,
"grad_norm": 0.1798514723777771,
"learning_rate": 0.00013488372093023256,
"loss": 0.8381,
"mean_token_accuracy": 0.7792193830013275,
"num_tokens": 722273.0,
"step": 30
},
{
"entropy": 0.7459597036242485,
"epoch": 0.1871345029239766,
"grad_norm": 0.1387411206960678,
"learning_rate": 0.0001813953488372093,
"loss": 0.7437,
"mean_token_accuracy": 0.7977154269814491,
"num_tokens": 965310.0,
"step": 40
},
{
"entropy": 0.7164520159363746,
"epoch": 0.23391812865497075,
"grad_norm": 0.16301865875720978,
"learning_rate": 0.00019988017042007065,
"loss": 0.7196,
"mean_token_accuracy": 0.8026821106672287,
"num_tokens": 1206059.0,
"step": 50
},
{
"epoch": 0.23391812865497075,
"eval_bleu": 62.7027023623289,
"eval_entropy": 0.6610592747176135,
"eval_loss": 0.6552960276603699,
"eval_mean_token_accuracy": 0.8157303515407774,
"eval_num_tokens": 1206059.0,
"eval_rougeL": 0.74902075023143,
"eval_runtime": 62.5464,
"eval_samples_per_second": 27.596,
"eval_steps_per_second": 1.727,
"step": 50
},
{
"entropy": 0.7049296364188194,
"epoch": 0.2807017543859649,
"grad_norm": 0.1793186366558075,
"learning_rate": 0.00019914891828692888,
"loss": 0.6999,
"mean_token_accuracy": 0.8054823115468025,
"num_tokens": 1443480.0,
"step": 60
},
{
"entropy": 0.6841731831431389,
"epoch": 0.32748538011695905,
"grad_norm": 0.17175541818141937,
"learning_rate": 0.0001977578464610077,
"loss": 0.6826,
"mean_token_accuracy": 0.8097251161932946,
"num_tokens": 1684201.0,
"step": 70
},
{
"entropy": 0.6653475634753704,
"epoch": 0.3742690058479532,
"grad_norm": 0.1718990057706833,
"learning_rate": 0.00019571621229579782,
"loss": 0.6671,
"mean_token_accuracy": 0.812454403936863,
"num_tokens": 1934153.0,
"step": 80
},
{
"entropy": 0.6634650066494941,
"epoch": 0.42105263157894735,
"grad_norm": 0.1641378253698349,
"learning_rate": 0.00019303760252982287,
"loss": 0.6563,
"mean_token_accuracy": 0.8153385534882546,
"num_tokens": 2176500.0,
"step": 90
},
{
"entropy": 0.6492825776338578,
"epoch": 0.4678362573099415,
"grad_norm": 0.18577228486537933,
"learning_rate": 0.00018973984286913584,
"loss": 0.6431,
"mean_token_accuracy": 0.8175705805420875,
"num_tokens": 2417754.0,
"step": 100
},
{
"epoch": 0.4678362573099415,
"eval_bleu": 60.52596113898074,
"eval_entropy": 0.5916264095792064,
"eval_loss": 0.6234877109527588,
"eval_mean_token_accuracy": 0.8231563827505818,
"eval_num_tokens": 2417754.0,
"eval_rougeL": 0.7388377291703244,
"eval_runtime": 63.4008,
"eval_samples_per_second": 27.224,
"eval_steps_per_second": 1.703,
"step": 100
},
{
"entropy": 0.6501711800694465,
"epoch": 0.5146198830409356,
"grad_norm": 0.1625615507364273,
"learning_rate": 0.00018584487936018661,
"loss": 0.6484,
"mean_token_accuracy": 0.8180312633514404,
"num_tokens": 2659238.0,
"step": 110
},
{
"entropy": 0.6405581876635551,
"epoch": 0.5614035087719298,
"grad_norm": 0.17417997121810913,
"learning_rate": 0.00018137863234250347,
"loss": 0.6404,
"mean_token_accuracy": 0.819054339826107,
"num_tokens": 2897816.0,
"step": 120
},
{
"entropy": 0.6380819544196129,
"epoch": 0.6081871345029239,
"grad_norm": 0.17349691689014435,
"learning_rate": 0.00017637082395311024,
"loss": 0.6366,
"mean_token_accuracy": 0.820624266564846,
"num_tokens": 3136294.0,
"step": 130
},
{
"entropy": 0.6500405013561249,
"epoch": 0.6549707602339181,
"grad_norm": 0.18412715196609497,
"learning_rate": 0.00017085478033060806,
"loss": 0.6426,
"mean_token_accuracy": 0.8185427248477936,
"num_tokens": 3375202.0,
"step": 140
},
{
"entropy": 0.6269903033971786,
"epoch": 0.7017543859649122,
"grad_norm": 0.1778886765241623,
"learning_rate": 0.00016486720983522156,
"loss": 0.6279,
"mean_token_accuracy": 0.8219256103038788,
"num_tokens": 3614721.0,
"step": 150
},
{
"epoch": 0.7017543859649122,
"eval_bleu": 61.15829556167586,
"eval_entropy": 0.5959388177703928,
"eval_loss": 0.6073054671287537,
"eval_mean_token_accuracy": 0.8267559442255232,
"eval_num_tokens": 3614721.0,
"eval_rougeL": 0.7485533859740823,
"eval_runtime": 63.4672,
"eval_samples_per_second": 27.195,
"eval_steps_per_second": 1.702,
"step": 150
},
{
"entropy": 0.6273025006055832,
"epoch": 0.7485380116959064,
"grad_norm": 0.17554914951324463,
"learning_rate": 0.000158447958760718,
"loss": 0.6235,
"mean_token_accuracy": 0.8232012897729873,
"num_tokens": 3852615.0,
"step": 160
},
{
"entropy": 0.6264464437961579,
"epoch": 0.7953216374269005,
"grad_norm": 0.17685498297214508,
"learning_rate": 0.0001516397461638962,
"loss": 0.6223,
"mean_token_accuracy": 0.8228656515479088,
"num_tokens": 4085589.0,
"step": 170
},
{
"entropy": 0.623998960852623,
"epoch": 0.8421052631578947,
"grad_norm": 0.1789834052324295,
"learning_rate": 0.0001444878795763121,
"loss": 0.6191,
"mean_token_accuracy": 0.8224357396364212,
"num_tokens": 4327626.0,
"step": 180
},
{
"entropy": 0.6093558698892594,
"epoch": 0.8888888888888888,
"grad_norm": 0.17523610591888428,
"learning_rate": 0.00013703995349013113,
"loss": 0.61,
"mean_token_accuracy": 0.8264237254858017,
"num_tokens": 4570278.0,
"step": 190
},
{
"entropy": 0.6039168611168861,
"epoch": 0.935672514619883,
"grad_norm": 0.18692275881767273,
"learning_rate": 0.00012934553262463548,
"loss": 0.6032,
"mean_token_accuracy": 0.828160648047924,
"num_tokens": 4806172.0,
"step": 200
},
{
"epoch": 0.935672514619883,
"eval_bleu": 60.260312927941236,
"eval_entropy": 0.5826076859677279,
"eval_loss": 0.6021928787231445,
"eval_mean_token_accuracy": 0.8273030961001361,
"eval_num_tokens": 4806172.0,
"eval_rougeL": 0.7492690359164101,
"eval_runtime": 63.3853,
"eval_samples_per_second": 27.23,
"eval_steps_per_second": 1.704,
"step": 200
},
{
"entropy": 0.607174352556467,
"epoch": 0.9824561403508771,
"grad_norm": 0.1787632554769516,
"learning_rate": 0.00012145582208119497,
"loss": 0.6041,
"mean_token_accuracy": 0.826733535528183,
"num_tokens": 5046903.0,
"step": 210
},
{
"entropy": 0.5812954008579254,
"epoch": 1.0280701754385966,
"grad_norm": 0.18372896313667297,
"learning_rate": 0.00011342332658176555,
"loss": 0.5672,
"mean_token_accuracy": 0.8368643965476599,
"num_tokens": 5286085.0,
"step": 220
},
{
"entropy": 0.5580198377370834,
"epoch": 1.0748538011695907,
"grad_norm": 0.19435207545757294,
"learning_rate": 0.00010530150105862748,
"loss": 0.5539,
"mean_token_accuracy": 0.8394016489386559,
"num_tokens": 5522827.0,
"step": 230
},
{
"entropy": 0.5524544611573219,
"epoch": 1.1216374269005849,
"grad_norm": 0.2061990201473236,
"learning_rate": 9.71443949206304e-05,
"loss": 0.5445,
"mean_token_accuracy": 0.8409878596663475,
"num_tokens": 5764879.0,
"step": 240
},
{
"entropy": 0.5744347527623177,
"epoch": 1.168421052631579,
"grad_norm": 0.21095068752765656,
"learning_rate": 8.900629236329482e-05,
"loss": 0.5672,
"mean_token_accuracy": 0.8354128882288933,
"num_tokens": 6002932.0,
"step": 250
},
{
"epoch": 1.168421052631579,
"eval_bleu": 61.178798380428454,
"eval_entropy": 0.5307412986402158,
"eval_loss": 0.6018245816230774,
"eval_mean_token_accuracy": 0.8288786930066568,
"eval_num_tokens": 6002932.0,
"eval_rougeL": 0.7482538683957054,
"eval_runtime": 63.5756,
"eval_samples_per_second": 27.149,
"eval_steps_per_second": 1.699,
"step": 250
},
{
"entropy": 0.5755763545632362,
"epoch": 1.2152046783625732,
"grad_norm": 0.20451681315898895,
"learning_rate": 8.094135111644742e-05,
"loss": 0.568,
"mean_token_accuracy": 0.8352775603532792,
"num_tokens": 6236870.0,
"step": 260
},
{
"entropy": 0.5376525044441223,
"epoch": 1.2619883040935673,
"grad_norm": 0.21364448964595795,
"learning_rate": 7.300324203346431e-05,
"loss": 0.5367,
"mean_token_accuracy": 0.8422502785921097,
"num_tokens": 6475853.0,
"step": 270
},
{
"entropy": 0.5575953021645546,
"epoch": 1.3087719298245615,
"grad_norm": 0.2244759500026703,
"learning_rate": 6.524479192059698e-05,
"loss": 0.5487,
"mean_token_accuracy": 0.8396281078457832,
"num_tokens": 6717478.0,
"step": 280
},
{
"entropy": 0.5532271094620228,
"epoch": 1.3555555555555556,
"grad_norm": 0.22009848058223724,
"learning_rate": 5.7717631983292375e-05,
"loss": 0.5539,
"mean_token_accuracy": 0.839461912214756,
"num_tokens": 6956096.0,
"step": 290
},
{
"entropy": 0.5678240090608597,
"epoch": 1.4023391812865498,
"grad_norm": 0.22350303828716278,
"learning_rate": 5.047185422903928e-05,
"loss": 0.5536,
"mean_token_accuracy": 0.8384527832269668,
"num_tokens": 7198597.0,
"step": 300
},
{
"epoch": 1.4023391812865498,
"eval_bleu": 61.61816935351767,
"eval_entropy": 0.529030436442958,
"eval_loss": 0.5979623198509216,
"eval_mean_token_accuracy": 0.8302161036818115,
"eval_num_tokens": 7198597.0,
"eval_rougeL": 0.7527724116774704,
"eval_runtime": 63.2813,
"eval_samples_per_second": 27.275,
"eval_steps_per_second": 1.707,
"step": 300
},
{
"entropy": 0.5341040723025798,
"epoch": 1.449122807017544,
"grad_norm": 0.2159273475408554,
"learning_rate": 4.355567811332311e-05,
"loss": 0.5294,
"mean_token_accuracy": 0.8453650638461113,
"num_tokens": 7444525.0,
"step": 310
},
{
"entropy": 0.5395594887435436,
"epoch": 1.495906432748538,
"grad_norm": 0.2247242033481598,
"learning_rate": 3.701512964710513e-05,
"loss": 0.5297,
"mean_token_accuracy": 0.84483852237463,
"num_tokens": 7692840.0,
"step": 320
},
{
"entropy": 0.5372945822775363,
"epoch": 1.5426900584795322,
"grad_norm": 0.23070968687534332,
"learning_rate": 3.089373510131354e-05,
"loss": 0.5324,
"mean_token_accuracy": 0.8450036272406578,
"num_tokens": 7927218.0,
"step": 330
},
{
"entropy": 0.5493481300771237,
"epoch": 1.5894736842105264,
"grad_norm": 0.23444080352783203,
"learning_rate": 2.523223134669157e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8414489045739174,
"num_tokens": 8164688.0,
"step": 340
},
{
"entropy": 0.5425299167633056,
"epoch": 1.6362573099415205,
"grad_norm": 0.21845506131649017,
"learning_rate": 2.0068294756643845e-05,
"loss": 0.538,
"mean_token_accuracy": 0.8418876558542252,
"num_tokens": 8406525.0,
"step": 350
},
{
"epoch": 1.6362573099415205,
"eval_bleu": 61.86199921842308,
"eval_entropy": 0.5256232053593353,
"eval_loss": 0.5959370732307434,
"eval_mean_token_accuracy": 0.8305901747058939,
"eval_num_tokens": 8406525.0,
"eval_rougeL": 0.7535288717921484,
"eval_runtime": 63.1544,
"eval_samples_per_second": 27.33,
"eval_steps_per_second": 1.71,
"step": 350
},
{
"entropy": 0.5370763696730136,
"epoch": 1.6830409356725147,
"grad_norm": 0.2211093306541443,
"learning_rate": 1.5436290477187587e-05,
"loss": 0.5294,
"mean_token_accuracy": 0.8450759872794151,
"num_tokens": 8647405.0,
"step": 360
},
{
"entropy": 0.5449528440833091,
"epoch": 1.7298245614035088,
"grad_norm": 0.2158094048500061,
"learning_rate": 1.1367043732575666e-05,
"loss": 0.5348,
"mean_token_accuracy": 0.843912661075592,
"num_tokens": 8888082.0,
"step": 370
},
{
"entropy": 0.5405852876603603,
"epoch": 1.776608187134503,
"grad_norm": 0.21288156509399414,
"learning_rate": 7.887634688515e-06,
"loss": 0.5314,
"mean_token_accuracy": 0.8438028782606125,
"num_tokens": 9140762.0,
"step": 380
},
{
"entropy": 0.5433258466422558,
"epoch": 1.8233918128654971,
"grad_norm": 0.23975639045238495,
"learning_rate": 5.021218238131719e-06,
"loss": 0.5395,
"mean_token_accuracy": 0.8432600662112236,
"num_tokens": 9374299.0,
"step": 390
},
{
"entropy": 0.5539177514612674,
"epoch": 1.8701754385964913,
"grad_norm": 0.23764148354530334,
"learning_rate": 2.7868699099777297e-06,
"loss": 0.5455,
"mean_token_accuracy": 0.8410582140088081,
"num_tokens": 9612371.0,
"step": 400
},
{
"epoch": 1.8701754385964913,
"eval_bleu": 61.99304439252053,
"eval_entropy": 0.5258893685208427,
"eval_loss": 0.5951277017593384,
"eval_mean_token_accuracy": 0.8308189588564413,
"eval_num_tokens": 9612371.0,
"eval_rougeL": 0.754337888795046,
"eval_runtime": 63.4001,
"eval_samples_per_second": 27.224,
"eval_steps_per_second": 1.703,
"step": 400
},
{
"entropy": 0.5425351396203041,
"epoch": 1.9169590643274854,
"grad_norm": 0.2328052669763565,
"learning_rate": 1.1994589235353681e-06,
"loss": 0.5376,
"mean_token_accuracy": 0.8435128018260002,
"num_tokens": 9853669.0,
"step": 410
},
{
"entropy": 0.533241743594408,
"epoch": 1.9637426900584796,
"grad_norm": 0.22453628480434418,
"learning_rate": 2.695492370149988e-07,
"loss": 0.5238,
"mean_token_accuracy": 0.8451769664883614,
"num_tokens": 10097512.0,
"step": 420
}
],
"logging_steps": 10,
"max_steps": 428,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.941697521363354e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}