{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 533, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018788163457022077, "grad_norm": 39.5539436340332, "learning_rate": 1.6666666666666667e-06, "loss": 1.7708, "step": 10 }, { "epoch": 0.03757632691404415, "grad_norm": 15.056221961975098, "learning_rate": 3.5185185185185187e-06, "loss": 0.8157, "step": 20 }, { "epoch": 0.05636449037106623, "grad_norm": 8.456525802612305, "learning_rate": 5.370370370370371e-06, "loss": 0.5049, "step": 30 }, { "epoch": 0.0751526538280883, "grad_norm": 9.841835021972656, "learning_rate": 7.222222222222223e-06, "loss": 0.4008, "step": 40 }, { "epoch": 0.09394081728511038, "grad_norm": 8.734881401062012, "learning_rate": 9.074074074074075e-06, "loss": 0.3481, "step": 50 }, { "epoch": 0.11272898074213246, "grad_norm": 5.559770107269287, "learning_rate": 9.997311749002358e-06, "loss": 0.3233, "step": 60 }, { "epoch": 0.13151714419915453, "grad_norm": 9.457917213439941, "learning_rate": 9.975823081977089e-06, "loss": 0.2962, "step": 70 }, { "epoch": 0.1503053076561766, "grad_norm": 8.458663940429688, "learning_rate": 9.93293815016874e-06, "loss": 0.3505, "step": 80 }, { "epoch": 0.1690934711131987, "grad_norm": 5.281663417816162, "learning_rate": 9.868841360727856e-06, "loss": 0.341, "step": 90 }, { "epoch": 0.18788163457022075, "grad_norm": 4.8133440017700195, "learning_rate": 9.783808332754242e-06, "loss": 0.3644, "step": 100 }, { "epoch": 0.20666979802724283, "grad_norm": 5.978756904602051, "learning_rate": 9.678204712122328e-06, "loss": 0.2572, "step": 110 }, { "epoch": 0.22545796148426492, "grad_norm": 3.6604723930358887, "learning_rate": 9.552484599187344e-06, "loss": 0.2801, "step": 120 }, { "epoch": 0.244246124941287, "grad_norm": 5.864062786102295, "learning_rate": 9.407188596133212e-06, "loss": 0.2438, "step": 130 }, { "epoch": 0.26303428839830906, "grad_norm": 4.652751445770264, "learning_rate": 9.242941482358646e-06, "loss": 0.2426, "step": 140 }, { "epoch": 0.28182245185533117, "grad_norm": 7.59506368637085, "learning_rate": 9.060449527897424e-06, "loss": 0.2699, "step": 150 }, { "epoch": 0.3006106153123532, "grad_norm": 5.299258232116699, "learning_rate": 8.860497456425226e-06, "loss": 0.2438, "step": 160 }, { "epoch": 0.3193987787693753, "grad_norm": 6.822812080383301, "learning_rate": 8.643945070912269e-06, "loss": 0.2703, "step": 170 }, { "epoch": 0.3381869422263974, "grad_norm": 4.299904823303223, "learning_rate": 8.411723556431555e-06, "loss": 0.2858, "step": 180 }, { "epoch": 0.35697510568341945, "grad_norm": 4.70183801651001, "learning_rate": 8.164831476020856e-06, "loss": 0.1675, "step": 190 }, { "epoch": 0.3757632691404415, "grad_norm": 6.74561071395874, "learning_rate": 7.904330476816391e-06, "loss": 0.2649, "step": 200 }, { "epoch": 0.3757632691404415, "eval_loss": 0.22606511414051056, "eval_runtime": 222.2942, "eval_samples_per_second": 4.256, "eval_steps_per_second": 1.066, "step": 200 }, { "epoch": 0.3945514325974636, "grad_norm": 5.191539287567139, "learning_rate": 7.631340724922023e-06, "loss": 0.2252, "step": 210 }, { "epoch": 0.41333959605448567, "grad_norm": 6.738773822784424, "learning_rate": 7.347036088644232e-06, "loss": 0.2396, "step": 220 }, { "epoch": 0.4321277595115077, "grad_norm": 5.1947808265686035, "learning_rate": 7.0526390908052e-06, "loss": 0.2902, "step": 230 }, { "epoch": 0.45091592296852984, "grad_norm": 4.540430545806885, "learning_rate": 6.7494156518392625e-06, "loss": 0.2185, "step": 240 }, { "epoch": 0.4697040864255519, "grad_norm": 7.267935752868652, "learning_rate": 6.43866964627766e-06, "loss": 0.2483, "step": 250 }, { "epoch": 0.488492249882574, "grad_norm": 6.775832653045654, "learning_rate": 6.121737296028959e-06, "loss": 0.2132, "step": 260 }, { "epoch": 0.5072804133395961, "grad_norm": 4.134191036224365, "learning_rate": 5.799981424564275e-06, "loss": 0.2062, "step": 270 }, { "epoch": 0.5260685767966181, "grad_norm": 3.4992332458496094, "learning_rate": 5.474785596714581e-06, "loss": 0.1665, "step": 280 }, { "epoch": 0.5448567402536402, "grad_norm": 3.856785297393799, "learning_rate": 5.1475481692792235e-06, "loss": 0.1902, "step": 290 }, { "epoch": 0.5636449037106623, "grad_norm": 5.150210857391357, "learning_rate": 4.819676278028305e-06, "loss": 0.2333, "step": 300 }, { "epoch": 0.5824330671676844, "grad_norm": 5.201061725616455, "learning_rate": 4.4925797869550865e-06, "loss": 0.2023, "step": 310 }, { "epoch": 0.6012212306247064, "grad_norm": 5.331509590148926, "learning_rate": 4.167665225796925e-06, "loss": 0.1776, "step": 320 }, { "epoch": 0.6200093940817285, "grad_norm": 3.550278902053833, "learning_rate": 3.846329741893646e-06, "loss": 0.2364, "step": 330 }, { "epoch": 0.6387975575387506, "grad_norm": 5.495516300201416, "learning_rate": 3.52995509239065e-06, "loss": 0.2192, "step": 340 }, { "epoch": 0.6575857209957726, "grad_norm": 4.215266704559326, "learning_rate": 3.2199017026205744e-06, "loss": 0.1601, "step": 350 }, { "epoch": 0.6763738844527948, "grad_norm": 3.173532247543335, "learning_rate": 2.917502816212685e-06, "loss": 0.1688, "step": 360 }, { "epoch": 0.6951620479098168, "grad_norm": 6.164701461791992, "learning_rate": 2.6240587620848512e-06, "loss": 0.2083, "step": 370 }, { "epoch": 0.7139502113668389, "grad_norm": 2.307567596435547, "learning_rate": 2.340831362970257e-06, "loss": 0.181, "step": 380 }, { "epoch": 0.732738374823861, "grad_norm": 5.143068313598633, "learning_rate": 2.0690385095224557e-06, "loss": 0.2234, "step": 390 }, { "epoch": 0.751526538280883, "grad_norm": 2.648653030395508, "learning_rate": 1.8098489233303595e-06, "loss": 0.1551, "step": 400 }, { "epoch": 0.751526538280883, "eval_loss": 0.17390167713165283, "eval_runtime": 222.2168, "eval_samples_per_second": 4.257, "eval_steps_per_second": 1.067, "step": 400 }, { "epoch": 0.7703147017379052, "grad_norm": 3.061445713043213, "learning_rate": 1.5643771313624394e-06, "loss": 0.1899, "step": 410 }, { "epoch": 0.7891028651949272, "grad_norm": 3.341491460800171, "learning_rate": 1.3336786734502294e-06, "loss": 0.1615, "step": 420 }, { "epoch": 0.8078910286519493, "grad_norm": 3.6715078353881836, "learning_rate": 1.1187455634192307e-06, "loss": 0.1671, "step": 430 }, { "epoch": 0.8266791921089713, "grad_norm": 2.944624423980713, "learning_rate": 9.205020233844736e-07, "loss": 0.1275, "step": 440 }, { "epoch": 0.8454673555659934, "grad_norm": 4.107908725738525, "learning_rate": 7.398005095535565e-07, "loss": 0.1797, "step": 450 }, { "epoch": 0.8642555190230155, "grad_norm": 5.744738578796387, "learning_rate": 5.774180466262985e-07, "loss": 0.2238, "step": 460 }, { "epoch": 0.8830436824800376, "grad_norm": 3.282698631286621, "learning_rate": 4.340528865533161e-07, "loss": 0.1766, "step": 470 }, { "epoch": 0.9018318459370597, "grad_norm": 6.375194549560547, "learning_rate": 3.103215060209902e-07, "loss": 0.2085, "step": 480 }, { "epoch": 0.9206200093940817, "grad_norm": 2.820821523666382, "learning_rate": 2.0675595557376916e-07, "loss": 0.1595, "step": 490 }, { "epoch": 0.9394081728511038, "grad_norm": 3.918117046356201, "learning_rate": 1.2380157177271369e-07, "loss": 0.2025, "step": 500 }, { "epoch": 0.9581963363081258, "grad_norm": 2.6554410457611084, "learning_rate": 6.181506222809885e-08, "loss": 0.1641, "step": 510 }, { "epoch": 0.976984499765148, "grad_norm": 4.8615899085998535, "learning_rate": 2.1062971740523076e-08, "loss": 0.1658, "step": 520 }, { "epoch": 0.9957726632221701, "grad_norm": 3.5291943550109863, "learning_rate": 1.7205361461825054e-09, "loss": 0.1645, "step": 530 }, { "epoch": 1.0, "step": 533, "total_flos": 78811436154880.0, "train_loss": 0.27306876374901345, "train_runtime": 11552.2832, "train_samples_per_second": 0.737, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 533, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 78811436154880.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }