{
  "best_global_step": 400,
  "best_metric": 0.754337888795046,
  "best_model_checkpoint": "./qwen2.5-7b-sft-qlora/checkpoint-400",
  "epoch": 2.0,
  "eval_steps": 50,
  "global_step": 428,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.7451270699501038,
      "epoch": 0.04678362573099415,
      "grad_norm": 0.9813888669013977,
      "learning_rate": 4.186046511627907e-05,
      "loss": 1.6867,
      "mean_token_accuracy": 0.6874781519174575,
      "num_tokens": 237406.0,
      "step": 10
    },
    {
      "entropy": 0.980076114833355,
      "epoch": 0.0935672514619883,
      "grad_norm": 0.38563278317451477,
      "learning_rate": 8.837209302325582e-05,
      "loss": 1.1195,
      "mean_token_accuracy": 0.7362557649612427,
      "num_tokens": 477881.0,
      "step": 20
    },
    {
      "entropy": 0.8651187509298325,
      "epoch": 0.14035087719298245,
      "grad_norm": 0.1798514723777771,
      "learning_rate": 0.00013488372093023256,
      "loss": 0.8381,
      "mean_token_accuracy": 0.7792193830013275,
      "num_tokens": 722273.0,
      "step": 30
    },
    {
      "entropy": 0.7459597036242485,
      "epoch": 0.1871345029239766,
      "grad_norm": 0.1387411206960678,
      "learning_rate": 0.0001813953488372093,
      "loss": 0.7437,
      "mean_token_accuracy": 0.7977154269814491,
      "num_tokens": 965310.0,
      "step": 40
    },
    {
      "entropy": 0.7164520159363746,
      "epoch": 0.23391812865497075,
      "grad_norm": 0.16301865875720978,
      "learning_rate": 0.00019988017042007065,
      "loss": 0.7196,
      "mean_token_accuracy": 0.8026821106672287,
      "num_tokens": 1206059.0,
      "step": 50
    },
    {
      "epoch": 0.23391812865497075,
      "eval_bleu": 62.7027023623289,
      "eval_entropy": 0.6610592747176135,
      "eval_loss": 0.6552960276603699,
      "eval_mean_token_accuracy": 0.8157303515407774,
      "eval_num_tokens": 1206059.0,
      "eval_rougeL": 0.74902075023143,
      "eval_runtime": 62.5464,
      "eval_samples_per_second": 27.596,
      "eval_steps_per_second": 1.727,
      "step": 50
    },
    {
      "entropy": 0.7049296364188194,
      "epoch": 0.2807017543859649,
      "grad_norm": 0.1793186366558075,
      "learning_rate": 0.00019914891828692888,
      "loss": 0.6999,
      "mean_token_accuracy": 0.8054823115468025,
      "num_tokens": 1443480.0,
      "step": 60
    },
    {
      "entropy": 0.6841731831431389,
      "epoch": 0.32748538011695905,
      "grad_norm": 0.17175541818141937,
      "learning_rate": 0.0001977578464610077,
      "loss": 0.6826,
      "mean_token_accuracy": 0.8097251161932946,
      "num_tokens": 1684201.0,
      "step": 70
    },
    {
      "entropy": 0.6653475634753704,
      "epoch": 0.3742690058479532,
      "grad_norm": 0.1718990057706833,
      "learning_rate": 0.00019571621229579782,
      "loss": 0.6671,
      "mean_token_accuracy": 0.812454403936863,
      "num_tokens": 1934153.0,
      "step": 80
    },
    {
      "entropy": 0.6634650066494941,
      "epoch": 0.42105263157894735,
      "grad_norm": 0.1641378253698349,
      "learning_rate": 0.00019303760252982287,
      "loss": 0.6563,
      "mean_token_accuracy": 0.8153385534882546,
      "num_tokens": 2176500.0,
      "step": 90
    },
    {
      "entropy": 0.6492825776338578,
      "epoch": 0.4678362573099415,
      "grad_norm": 0.18577228486537933,
      "learning_rate": 0.00018973984286913584,
      "loss": 0.6431,
      "mean_token_accuracy": 0.8175705805420875,
      "num_tokens": 2417754.0,
      "step": 100
    },
    {
      "epoch": 0.4678362573099415,
      "eval_bleu": 60.52596113898074,
      "eval_entropy": 0.5916264095792064,
      "eval_loss": 0.6234877109527588,
      "eval_mean_token_accuracy": 0.8231563827505818,
      "eval_num_tokens": 2417754.0,
      "eval_rougeL": 0.7388377291703244,
      "eval_runtime": 63.4008,
      "eval_samples_per_second": 27.224,
      "eval_steps_per_second": 1.703,
      "step": 100
    },
    {
      "entropy": 0.6501711800694465,
      "epoch": 0.5146198830409356,
      "grad_norm": 0.1625615507364273,
      "learning_rate": 0.00018584487936018661,
      "loss": 0.6484,
      "mean_token_accuracy": 0.8180312633514404,
      "num_tokens": 2659238.0,
      "step": 110
    },
    {
      "entropy": 0.6405581876635551,
      "epoch": 0.5614035087719298,
      "grad_norm": 0.17417997121810913,
      "learning_rate": 0.00018137863234250347,
      "loss": 0.6404,
      "mean_token_accuracy": 0.819054339826107,
      "num_tokens": 2897816.0,
      "step": 120
    },
    {
      "entropy": 0.6380819544196129,
      "epoch": 0.6081871345029239,
      "grad_norm": 0.17349691689014435,
      "learning_rate": 0.00017637082395311024,
      "loss": 0.6366,
      "mean_token_accuracy": 0.820624266564846,
      "num_tokens": 3136294.0,
      "step": 130
    },
    {
      "entropy": 0.6500405013561249,
      "epoch": 0.6549707602339181,
      "grad_norm": 0.18412715196609497,
      "learning_rate": 0.00017085478033060806,
      "loss": 0.6426,
      "mean_token_accuracy": 0.8185427248477936,
      "num_tokens": 3375202.0,
      "step": 140
    },
    {
      "entropy": 0.6269903033971786,
      "epoch": 0.7017543859649122,
      "grad_norm": 0.1778886765241623,
      "learning_rate": 0.00016486720983522156,
      "loss": 0.6279,
      "mean_token_accuracy": 0.8219256103038788,
      "num_tokens": 3614721.0,
      "step": 150
    },
    {
      "epoch": 0.7017543859649122,
      "eval_bleu": 61.15829556167586,
      "eval_entropy": 0.5959388177703928,
      "eval_loss": 0.6073054671287537,
      "eval_mean_token_accuracy": 0.8267559442255232,
      "eval_num_tokens": 3614721.0,
      "eval_rougeL": 0.7485533859740823,
      "eval_runtime": 63.4672,
      "eval_samples_per_second": 27.195,
      "eval_steps_per_second": 1.702,
      "step": 150
    },
    {
      "entropy": 0.6273025006055832,
      "epoch": 0.7485380116959064,
      "grad_norm": 0.17554914951324463,
      "learning_rate": 0.000158447958760718,
      "loss": 0.6235,
      "mean_token_accuracy": 0.8232012897729873,
      "num_tokens": 3852615.0,
      "step": 160
    },
    {
      "entropy": 0.6264464437961579,
      "epoch": 0.7953216374269005,
      "grad_norm": 0.17685498297214508,
      "learning_rate": 0.0001516397461638962,
      "loss": 0.6223,
      "mean_token_accuracy": 0.8228656515479088,
      "num_tokens": 4085589.0,
      "step": 170
    },
    {
      "entropy": 0.623998960852623,
      "epoch": 0.8421052631578947,
      "grad_norm": 0.1789834052324295,
      "learning_rate": 0.0001444878795763121,
      "loss": 0.6191,
      "mean_token_accuracy": 0.8224357396364212,
      "num_tokens": 4327626.0,
      "step": 180
    },
    {
      "entropy": 0.6093558698892594,
      "epoch": 0.8888888888888888,
      "grad_norm": 0.17523610591888428,
      "learning_rate": 0.00013703995349013113,
      "loss": 0.61,
      "mean_token_accuracy": 0.8264237254858017,
      "num_tokens": 4570278.0,
      "step": 190
    },
    {
      "entropy": 0.6039168611168861,
      "epoch": 0.935672514619883,
      "grad_norm": 0.18692275881767273,
      "learning_rate": 0.00012934553262463548,
      "loss": 0.6032,
      "mean_token_accuracy": 0.828160648047924,
      "num_tokens": 4806172.0,
      "step": 200
    },
    {
      "epoch": 0.935672514619883,
      "eval_bleu": 60.260312927941236,
      "eval_entropy": 0.5826076859677279,
      "eval_loss": 0.6021928787231445,
      "eval_mean_token_accuracy": 0.8273030961001361,
      "eval_num_tokens": 4806172.0,
      "eval_rougeL": 0.7492690359164101,
      "eval_runtime": 63.3853,
      "eval_samples_per_second": 27.23,
      "eval_steps_per_second": 1.704,
      "step": 200
    },
    {
      "entropy": 0.607174352556467,
      "epoch": 0.9824561403508771,
      "grad_norm": 0.1787632554769516,
      "learning_rate": 0.00012145582208119497,
      "loss": 0.6041,
      "mean_token_accuracy": 0.826733535528183,
      "num_tokens": 5046903.0,
      "step": 210
    },
    {
      "entropy": 0.5812954008579254,
      "epoch": 1.0280701754385966,
      "grad_norm": 0.18372896313667297,
      "learning_rate": 0.00011342332658176555,
      "loss": 0.5672,
      "mean_token_accuracy": 0.8368643965476599,
      "num_tokens": 5286085.0,
      "step": 220
    },
    {
      "entropy": 0.5580198377370834,
      "epoch": 1.0748538011695907,
      "grad_norm": 0.19435207545757294,
      "learning_rate": 0.00010530150105862748,
      "loss": 0.5539,
      "mean_token_accuracy": 0.8394016489386559,
      "num_tokens": 5522827.0,
      "step": 230
    },
    {
      "entropy": 0.5524544611573219,
      "epoch": 1.1216374269005849,
      "grad_norm": 0.2061990201473236,
      "learning_rate": 9.71443949206304e-05,
      "loss": 0.5445,
      "mean_token_accuracy": 0.8409878596663475,
      "num_tokens": 5764879.0,
      "step": 240
    },
    {
      "entropy": 0.5744347527623177,
      "epoch": 1.168421052631579,
      "grad_norm": 0.21095068752765656,
      "learning_rate": 8.900629236329482e-05,
      "loss": 0.5672,
      "mean_token_accuracy": 0.8354128882288933,
      "num_tokens": 6002932.0,
      "step": 250
    },
    {
      "epoch": 1.168421052631579,
      "eval_bleu": 61.178798380428454,
      "eval_entropy": 0.5307412986402158,
      "eval_loss": 0.6018245816230774,
      "eval_mean_token_accuracy": 0.8288786930066568,
      "eval_num_tokens": 6002932.0,
      "eval_rougeL": 0.7482538683957054,
      "eval_runtime": 63.5756,
      "eval_samples_per_second": 27.149,
      "eval_steps_per_second": 1.699,
      "step": 250
    },
    {
      "entropy": 0.5755763545632362,
      "epoch": 1.2152046783625732,
      "grad_norm": 0.20451681315898895,
      "learning_rate": 8.094135111644742e-05,
      "loss": 0.568,
      "mean_token_accuracy": 0.8352775603532792,
      "num_tokens": 6236870.0,
      "step": 260
    },
    {
      "entropy": 0.5376525044441223,
      "epoch": 1.2619883040935673,
      "grad_norm": 0.21364448964595795,
      "learning_rate": 7.300324203346431e-05,
      "loss": 0.5367,
      "mean_token_accuracy": 0.8422502785921097,
      "num_tokens": 6475853.0,
      "step": 270
    },
    {
      "entropy": 0.5575953021645546,
      "epoch": 1.3087719298245615,
      "grad_norm": 0.2244759500026703,
      "learning_rate": 6.524479192059698e-05,
      "loss": 0.5487,
      "mean_token_accuracy": 0.8396281078457832,
      "num_tokens": 6717478.0,
      "step": 280
    },
    {
      "entropy": 0.5532271094620228,
      "epoch": 1.3555555555555556,
      "grad_norm": 0.22009848058223724,
      "learning_rate": 5.7717631983292375e-05,
      "loss": 0.5539,
      "mean_token_accuracy": 0.839461912214756,
      "num_tokens": 6956096.0,
      "step": 290
    },
    {
      "entropy": 0.5678240090608597,
      "epoch": 1.4023391812865498,
      "grad_norm": 0.22350303828716278,
      "learning_rate": 5.047185422903928e-05,
      "loss": 0.5536,
      "mean_token_accuracy": 0.8384527832269668,
      "num_tokens": 7198597.0,
      "step": 300
    },
    {
      "epoch": 1.4023391812865498,
      "eval_bleu": 61.61816935351767,
      "eval_entropy": 0.529030436442958,
      "eval_loss": 0.5979623198509216,
      "eval_mean_token_accuracy": 0.8302161036818115,
      "eval_num_tokens": 7198597.0,
      "eval_rougeL": 0.7527724116774704,
      "eval_runtime": 63.2813,
      "eval_samples_per_second": 27.275,
      "eval_steps_per_second": 1.707,
      "step": 300
    },
    {
      "entropy": 0.5341040723025798,
      "epoch": 1.449122807017544,
      "grad_norm": 0.2159273475408554,
      "learning_rate": 4.355567811332311e-05,
      "loss": 0.5294,
      "mean_token_accuracy": 0.8453650638461113,
      "num_tokens": 7444525.0,
      "step": 310
    },
    {
      "entropy": 0.5395594887435436,
      "epoch": 1.495906432748538,
      "grad_norm": 0.2247242033481598,
      "learning_rate": 3.701512964710513e-05,
      "loss": 0.5297,
      "mean_token_accuracy": 0.84483852237463,
      "num_tokens": 7692840.0,
      "step": 320
    },
    {
      "entropy": 0.5372945822775363,
      "epoch": 1.5426900584795322,
      "grad_norm": 0.23070968687534332,
      "learning_rate": 3.089373510131354e-05,
      "loss": 0.5324,
      "mean_token_accuracy": 0.8450036272406578,
      "num_tokens": 7927218.0,
      "step": 330
    },
    {
      "entropy": 0.5493481300771237,
      "epoch": 1.5894736842105264,
      "grad_norm": 0.23444080352783203,
      "learning_rate": 2.523223134669157e-05,
      "loss": 0.5435,
      "mean_token_accuracy": 0.8414489045739174,
      "num_tokens": 8164688.0,
      "step": 340
    },
    {
      "entropy": 0.5425299167633056,
      "epoch": 1.6362573099415205,
      "grad_norm": 0.21845506131649017,
      "learning_rate": 2.0068294756643845e-05,
      "loss": 0.538,
      "mean_token_accuracy": 0.8418876558542252,
      "num_tokens": 8406525.0,
      "step": 350
    },
    {
      "epoch": 1.6362573099415205,
      "eval_bleu": 61.86199921842308,
      "eval_entropy": 0.5256232053593353,
      "eval_loss": 0.5959370732307434,
      "eval_mean_token_accuracy": 0.8305901747058939,
      "eval_num_tokens": 8406525.0,
      "eval_rougeL": 0.7535288717921484,
      "eval_runtime": 63.1544,
      "eval_samples_per_second": 27.33,
      "eval_steps_per_second": 1.71,
      "step": 350
    },
    {
      "entropy": 0.5370763696730136,
      "epoch": 1.6830409356725147,
      "grad_norm": 0.2211093306541443,
      "learning_rate": 1.5436290477187587e-05,
      "loss": 0.5294,
      "mean_token_accuracy": 0.8450759872794151,
      "num_tokens": 8647405.0,
      "step": 360
    },
    {
      "entropy": 0.5449528440833091,
      "epoch": 1.7298245614035088,
      "grad_norm": 0.2158094048500061,
      "learning_rate": 1.1367043732575666e-05,
      "loss": 0.5348,
      "mean_token_accuracy": 0.843912661075592,
      "num_tokens": 8888082.0,
      "step": 370
    },
    {
      "entropy": 0.5405852876603603,
      "epoch": 1.776608187134503,
      "grad_norm": 0.21288156509399414,
      "learning_rate": 7.887634688515e-06,
      "loss": 0.5314,
      "mean_token_accuracy": 0.8438028782606125,
      "num_tokens": 9140762.0,
      "step": 380
    },
    {
      "entropy": 0.5433258466422558,
      "epoch": 1.8233918128654971,
      "grad_norm": 0.23975639045238495,
      "learning_rate": 5.021218238131719e-06,
      "loss": 0.5395,
      "mean_token_accuracy": 0.8432600662112236,
      "num_tokens": 9374299.0,
      "step": 390
    },
    {
      "entropy": 0.5539177514612674,
      "epoch": 1.8701754385964913,
      "grad_norm": 0.23764148354530334,
      "learning_rate": 2.7868699099777297e-06,
      "loss": 0.5455,
      "mean_token_accuracy": 0.8410582140088081,
      "num_tokens": 9612371.0,
      "step": 400
    },
    {
      "epoch": 1.8701754385964913,
      "eval_bleu": 61.99304439252053,
      "eval_entropy": 0.5258893685208427,
      "eval_loss": 0.5951277017593384,
      "eval_mean_token_accuracy": 0.8308189588564413,
      "eval_num_tokens": 9612371.0,
      "eval_rougeL": 0.754337888795046,
      "eval_runtime": 63.4001,
      "eval_samples_per_second": 27.224,
      "eval_steps_per_second": 1.703,
      "step": 400
    },
    {
      "entropy": 0.5425351396203041,
      "epoch": 1.9169590643274854,
      "grad_norm": 0.2328052669763565,
      "learning_rate": 1.1994589235353681e-06,
      "loss": 0.5376,
      "mean_token_accuracy": 0.8435128018260002,
      "num_tokens": 9853669.0,
      "step": 410
    },
    {
      "entropy": 0.533241743594408,
      "epoch": 1.9637426900584796,
      "grad_norm": 0.22453628480434418,
      "learning_rate": 2.695492370149988e-07,
      "loss": 0.5238,
      "mean_token_accuracy": 0.8451769664883614,
      "num_tokens": 10097512.0,
      "step": 420
    }
  ],
  "logging_steps": 10,
  "max_steps": 428,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.941697521363354e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}