{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 500,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 19.50429916381836,
      "learning_rate": 3.125e-06,
      "loss": 1.7197,
      "step": 5
    },
    {
      "epoch": 0.032,
      "grad_norm": 7.2519917488098145,
      "learning_rate": 6.25e-06,
      "loss": 1.3639,
      "step": 10
    },
    {
      "epoch": 0.048,
      "grad_norm": 3.76739501953125,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.6994,
      "step": 15
    },
    {
      "epoch": 0.064,
      "grad_norm": 2.3495688438415527,
      "learning_rate": 1.25e-05,
      "loss": 0.3779,
      "step": 20
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.5075960159301758,
      "learning_rate": 1.5625e-05,
      "loss": 0.336,
      "step": 25
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.3261762857437134,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 0.3298,
      "step": 30
    },
    {
      "epoch": 0.112,
      "grad_norm": 1.2955995798110962,
      "learning_rate": 1.9994335583335336e-05,
      "loss": 0.3132,
      "step": 35
    },
    {
      "epoch": 0.128,
      "grad_norm": 1.1081024408340454,
      "learning_rate": 1.9959742939952393e-05,
      "loss": 0.302,
      "step": 40
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.9613136649131775,
      "learning_rate": 1.9893813260530368e-05,
      "loss": 0.2939,
      "step": 45
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.02515709400177,
      "learning_rate": 1.9796753984232357e-05,
      "loss": 0.2879,
      "step": 50
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.952653169631958,
      "learning_rate": 1.9668870495450064e-05,
      "loss": 0.2856,
      "step": 55
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.771054208278656,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.279,
      "step": 60
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.8033021688461304,
      "learning_rate": 1.9322336073880143e-05,
      "loss": 0.2782,
      "step": 65
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.9187172055244446,
      "learning_rate": 1.9104775466588162e-05,
      "loss": 0.2777,
      "step": 70
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.8861866593360901,
      "learning_rate": 1.88585678672358e-05,
      "loss": 0.2773,
      "step": 75
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.8510691523551941,
      "learning_rate": 1.8584487936018663e-05,
      "loss": 0.2712,
      "step": 80
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.734692394733429,
      "learning_rate": 1.8283398029800167e-05,
      "loss": 0.2746,
      "step": 85
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.5683246850967407,
      "learning_rate": 1.795624548881781e-05,
      "loss": 0.2725,
      "step": 90
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.8533231019973755,
      "learning_rate": 1.7604059656000313e-05,
      "loss": 0.2754,
      "step": 95
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.7900028228759766,
      "learning_rate": 1.7227948638273918e-05,
      "loss": 0.2738,
      "step": 100
    },
    {
      "epoch": 0.336,
      "grad_norm": 1.2377707958221436,
      "learning_rate": 1.682909582004807e-05,
      "loss": 0.2714,
      "step": 105
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.6560891270637512,
      "learning_rate": 1.6408756139850243e-05,
      "loss": 0.2749,
      "step": 110
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.7056406736373901,
      "learning_rate": 1.5968252141825038e-05,
      "loss": 0.2707,
      "step": 115
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.6733701229095459,
      "learning_rate": 1.5508969814521026e-05,
      "loss": 0.2681,
      "step": 120
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.7684840559959412,
      "learning_rate": 1.5032354230058004e-05,
      "loss": 0.2695,
      "step": 125
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.8542689085006714,
      "learning_rate": 1.4539904997395468e-05,
      "loss": 0.2651,
      "step": 130
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.5982159972190857,
      "learning_rate": 1.4033171544008053e-05,
      "loss": 0.2677,
      "step": 135
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.7119853496551514,
      "learning_rate": 1.3513748240813429e-05,
      "loss": 0.2717,
      "step": 140
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.6406282186508179,
      "learning_rate": 1.2983269385691562e-05,
      "loss": 0.2674,
      "step": 145
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.5884711146354675,
      "learning_rate": 1.2443404061378941e-05,
      "loss": 0.2706,
      "step": 150
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.7757735252380371,
      "learning_rate": 1.1895850883916786e-05,
      "loss": 0.2687,
      "step": 155
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.8187023997306824,
      "learning_rate": 1.1342332658176556e-05,
      "loss": 0.2664,
      "step": 160
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.6702791452407837,
      "learning_rate": 1.0784590957278452e-05,
      "loss": 0.2643,
      "step": 165
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.668189525604248,
      "learning_rate": 1.0224380642958052e-05,
      "loss": 0.2645,
      "step": 170
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7360859513282776,
      "learning_rate": 9.663464344122064e-06,
      "loss": 0.2625,
      "step": 175
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.5236344337463379,
      "learning_rate": 9.103606910965666e-06,
      "loss": 0.2672,
      "step": 180
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.6402584314346313,
      "learning_rate": 8.546569862100876e-06,
      "loss": 0.2643,
      "step": 185
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.6501412987709045,
      "learning_rate": 7.994105842167274e-06,
      "loss": 0.2627,
      "step": 190
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.5710216164588928,
      "learning_rate": 7.447953107363574e-06,
      "loss": 0.2627,
      "step": 195
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.408921241760254,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.259,
      "step": 200
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.4925643503665924,
      "learning_rate": 6.381429823033281e-06,
      "loss": 0.2648,
      "step": 205
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.6027353405952454,
      "learning_rate": 5.864414950334796e-06,
      "loss": 0.2628,
      "step": 210
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.6041746139526367,
      "learning_rate": 5.360412158221661e-06,
      "loss": 0.2624,
      "step": 215
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.7557849884033203,
      "learning_rate": 4.87100722594094e-06,
      "loss": 0.2601,
      "step": 220
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.7085661292076111,
      "learning_rate": 4.397740002471973e-06,
      "loss": 0.2642,
      "step": 225
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.4548507332801819,
      "learning_rate": 3.942099561591802e-06,
      "loss": 0.2597,
      "step": 230
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.5355045795440674,
      "learning_rate": 3.505519516698165e-06,
      "loss": 0.2578,
      "step": 235
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.5922319293022156,
      "learning_rate": 3.089373510131354e-06,
      "loss": 0.2588,
      "step": 240
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.4978267252445221,
      "learning_rate": 2.694970891187225e-06,
      "loss": 0.2616,
      "step": 245
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.4677634537220001,
      "learning_rate": 2.323552596419889e-06,
      "loss": 0.2597,
      "step": 250
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.7307313680648804,
      "learning_rate": 1.9762872451962214e-06,
      "loss": 0.2606,
      "step": 255
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.4911010265350342,
      "learning_rate": 1.6542674627869738e-06,
      "loss": 0.259,
      "step": 260
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.5247293710708618,
      "learning_rate": 1.3585064425634542e-06,
      "loss": 0.2612,
      "step": 265
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.45184415578842163,
      "learning_rate": 1.0899347581163222e-06,
      "loss": 0.2611,
      "step": 270
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.465379923582077,
      "learning_rate": 8.493974353268019e-07,
      "loss": 0.262,
      "step": 275
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.7584052085876465,
      "learning_rate": 6.37651293602628e-07,
      "loss": 0.2608,
      "step": 280
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.4854549169540405,
      "learning_rate": 4.553625646441928e-07,
      "loss": 0.2579,
      "step": 285
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.4984216094017029,
      "learning_rate": 3.0310479623313125e-07,
      "loss": 0.2569,
      "step": 290
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.4194032847881317,
      "learning_rate": 1.81357047638816e-07,
      "loss": 0.2536,
      "step": 295
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.506372332572937,
      "learning_rate": 9.0502382320653e-08,
      "loss": 0.2561,
      "step": 300
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.44982290267944336,
      "learning_rate": 3.082666266872036e-08,
      "loss": 0.2592,
      "step": 305
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.47311434149742126,
      "learning_rate": 2.5176505749346937e-09,
      "loss": 0.2545,
      "step": 310
    },
    {
      "epoch": 0.9984,
      "step": 312,
      "total_flos": 2.68577119859114e+17,
      "train_loss": 0.3196743833713042,
      "train_runtime": 939.7368,
      "train_samples_per_second": 31.924,
      "train_steps_per_second": 0.332
    }
  ],
  "logging_steps": 5,
  "max_steps": 312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.68577119859114e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}