{ "best_global_step": 3000, "best_metric": 3.7956512070627526, "best_model_checkpoint": "checkpoints/gpt2_sparse_moe_wiki/checkpoint-3000", "epoch": 0.45211788973974965, "eval_steps": 300, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007535298162329161, "grad_norm": 1.2304445505142212, "learning_rate": 8.166666666666667e-05, "loss": 9.5844, "step": 50 }, { "epoch": 0.015070596324658321, "grad_norm": 0.5569408535957336, "learning_rate": 0.000165, "loss": 7.6009, "step": 100 }, { "epoch": 0.022605894486987483, "grad_norm": 0.4758251905441284, "learning_rate": 0.0002483333333333333, "loss": 6.7993, "step": 150 }, { "epoch": 0.030141192649316643, "grad_norm": 0.6503773331642151, "learning_rate": 0.0003316666666666667, "loss": 6.2649, "step": 200 }, { "epoch": 0.037676490811645806, "grad_norm": 0.5381407141685486, "learning_rate": 0.000415, "loss": 5.9237, "step": 250 }, { "epoch": 0.045211788973974966, "grad_norm": 0.280143141746521, "learning_rate": 0.0004983333333333334, "loss": 5.6423, "step": 300 }, { "epoch": 0.045211788973974966, "eval_loss": 5.46437172209171, "eval_perplexity": 236.12745483202283, "eval_runtime": 43.9995, "eval_samples_per_second": 37.705, "eval_steps_per_second": 0.591, "step": 300 }, { "epoch": 0.052747087136304126, "grad_norm": 0.25354620814323425, "learning_rate": 0.0004909259259259259, "loss": 5.3993, "step": 350 }, { "epoch": 0.060282385298633286, "grad_norm": 0.3524823486804962, "learning_rate": 0.0004816666666666667, "loss": 5.1954, "step": 400 }, { "epoch": 0.06781768346096245, "grad_norm": 0.3906818628311157, "learning_rate": 0.0004724074074074074, "loss": 5.025, "step": 450 }, { "epoch": 0.07535298162329161, "grad_norm": 0.456912636756897, "learning_rate": 0.00046314814814814813, "loss": 4.859, "step": 500 }, { "epoch": 0.08288827978562077, "grad_norm": 0.3447301387786865, "learning_rate": 0.00045388888888888893, "loss": 4.728, "step": 550 }, { "epoch": 0.09042357794794993, "grad_norm": 0.400978147983551, "learning_rate": 0.00044462962962962967, "loss": 4.6308, "step": 600 }, { "epoch": 0.09042357794794993, "eval_loss": 4.516210105340873, "eval_perplexity": 91.48820943465822, "eval_runtime": 43.4593, "eval_samples_per_second": 38.174, "eval_steps_per_second": 0.598, "step": 600 }, { "epoch": 0.09795887611027909, "grad_norm": 0.30938830971717834, "learning_rate": 0.00043537037037037036, "loss": 4.5451, "step": 650 }, { "epoch": 0.10549417427260825, "grad_norm": 0.3306984603404999, "learning_rate": 0.0004261111111111111, "loss": 4.4871, "step": 700 }, { "epoch": 0.11302947243493741, "grad_norm": 0.30364739894866943, "learning_rate": 0.00041685185185185184, "loss": 4.4291, "step": 750 }, { "epoch": 0.12056477059726657, "grad_norm": 0.32504332065582275, "learning_rate": 0.00040759259259259264, "loss": 4.3825, "step": 800 }, { "epoch": 0.12810006875959573, "grad_norm": 0.32315653562545776, "learning_rate": 0.00039833333333333333, "loss": 4.3458, "step": 850 }, { "epoch": 0.1356353669219249, "grad_norm": 0.2794428765773773, "learning_rate": 0.00038907407407407407, "loss": 4.3127, "step": 900 }, { "epoch": 0.1356353669219249, "eval_loss": 4.218866269811809, "eval_perplexity": 67.95639638117432, "eval_runtime": 43.6842, "eval_samples_per_second": 37.977, "eval_steps_per_second": 0.595, "step": 900 }, { "epoch": 0.14317066508425405, "grad_norm": 0.2745271623134613, "learning_rate": 0.0003798148148148148, "loss": 4.2776, "step": 950 }, { "epoch": 0.15070596324658322, "grad_norm": 0.3081075847148895, "learning_rate": 0.0003705555555555556, "loss": 4.2474, "step": 1000 }, { "epoch": 0.15824126140891237, "grad_norm": 0.26861488819122314, "learning_rate": 0.0003612962962962963, "loss": 4.222, "step": 1050 }, { "epoch": 0.16577655957124154, "grad_norm": 0.26902666687965393, "learning_rate": 0.00035203703703703704, "loss": 4.1946, "step": 1100 }, { "epoch": 0.1733118577335707, "grad_norm": 0.24377495050430298, "learning_rate": 0.0003427777777777778, "loss": 4.1722, "step": 1150 }, { "epoch": 0.18084715589589986, "grad_norm": 0.2884402573108673, "learning_rate": 0.0003335185185185185, "loss": 4.1505, "step": 1200 }, { "epoch": 0.18084715589589986, "eval_loss": 4.069066753989171, "eval_perplexity": 58.50234003225461, "eval_runtime": 44.2137, "eval_samples_per_second": 37.522, "eval_steps_per_second": 0.588, "step": 1200 }, { "epoch": 0.188382454058229, "grad_norm": 0.2788221836090088, "learning_rate": 0.00032425925925925927, "loss": 4.1284, "step": 1250 }, { "epoch": 0.19591775222055818, "grad_norm": 0.25833043456077576, "learning_rate": 0.000315, "loss": 4.1179, "step": 1300 }, { "epoch": 0.20345305038288733, "grad_norm": 0.24359293282032013, "learning_rate": 0.00030574074074074076, "loss": 4.0981, "step": 1350 }, { "epoch": 0.2109883485452165, "grad_norm": 0.2737596929073334, "learning_rate": 0.00029648148148148144, "loss": 4.0855, "step": 1400 }, { "epoch": 0.21852364670754565, "grad_norm": 0.24729043245315552, "learning_rate": 0.00028722222222222224, "loss": 4.0639, "step": 1450 }, { "epoch": 0.22605894486987482, "grad_norm": 0.226772740483284, "learning_rate": 0.000277962962962963, "loss": 4.0538, "step": 1500 }, { "epoch": 0.22605894486987482, "eval_loss": 3.974391115922098, "eval_perplexity": 53.21770362637156, "eval_runtime": 43.3887, "eval_samples_per_second": 38.236, "eval_steps_per_second": 0.599, "step": 1500 }, { "epoch": 0.23359424303220397, "grad_norm": 0.2437480390071869, "learning_rate": 0.0002687037037037037, "loss": 4.0447, "step": 1550 }, { "epoch": 0.24112954119453314, "grad_norm": 0.2321815937757492, "learning_rate": 0.0002594444444444444, "loss": 4.0357, "step": 1600 }, { "epoch": 0.2486648393568623, "grad_norm": 0.23976509273052216, "learning_rate": 0.00025018518518518516, "loss": 4.0169, "step": 1650 }, { "epoch": 0.25620013751919146, "grad_norm": 0.28224146366119385, "learning_rate": 0.00024092592592592593, "loss": 4.0091, "step": 1700 }, { "epoch": 0.26373543568152064, "grad_norm": 0.23423795402050018, "learning_rate": 0.00023166666666666667, "loss": 4.0012, "step": 1750 }, { "epoch": 0.2712707338438498, "grad_norm": 0.22995537519454956, "learning_rate": 0.0002224074074074074, "loss": 3.9909, "step": 1800 }, { "epoch": 0.2712707338438498, "eval_loss": 3.910796159768955, "eval_perplexity": 49.93869533041372, "eval_runtime": 43.6043, "eval_samples_per_second": 38.047, "eval_steps_per_second": 0.596, "step": 1800 }, { "epoch": 0.2788060320061789, "grad_norm": 0.23875726759433746, "learning_rate": 0.00021314814814814815, "loss": 3.9818, "step": 1850 }, { "epoch": 0.2863413301685081, "grad_norm": 0.24082094430923462, "learning_rate": 0.0002038888888888889, "loss": 3.9765, "step": 1900 }, { "epoch": 0.2938766283308373, "grad_norm": 0.24363255500793457, "learning_rate": 0.00019462962962962964, "loss": 3.9654, "step": 1950 }, { "epoch": 0.30141192649316645, "grad_norm": 0.23058578372001648, "learning_rate": 0.00018537037037037038, "loss": 3.9612, "step": 2000 }, { "epoch": 0.30894722465549557, "grad_norm": 0.22857844829559326, "learning_rate": 0.00017611111111111112, "loss": 3.952, "step": 2050 }, { "epoch": 0.31648252281782474, "grad_norm": 0.223977729678154, "learning_rate": 0.00016685185185185187, "loss": 3.9433, "step": 2100 }, { "epoch": 0.31648252281782474, "eval_loss": 3.863826939699745, "eval_perplexity": 47.64734642630777, "eval_runtime": 42.7477, "eval_samples_per_second": 38.809, "eval_steps_per_second": 0.608, "step": 2100 }, { "epoch": 0.3240178209801539, "grad_norm": 0.21180413663387299, "learning_rate": 0.00015759259259259258, "loss": 3.9399, "step": 2150 }, { "epoch": 0.3315531191424831, "grad_norm": 0.218739852309227, "learning_rate": 0.00014833333333333335, "loss": 3.9309, "step": 2200 }, { "epoch": 0.3390884173048122, "grad_norm": 0.2420540750026703, "learning_rate": 0.00013907407407407407, "loss": 3.9235, "step": 2250 }, { "epoch": 0.3466237154671414, "grad_norm": 0.23727546632289886, "learning_rate": 0.00012981481481481484, "loss": 3.9189, "step": 2300 }, { "epoch": 0.35415901362947055, "grad_norm": 0.21484792232513428, "learning_rate": 0.00012055555555555555, "loss": 3.9085, "step": 2350 }, { "epoch": 0.3616943117917997, "grad_norm": 0.20896899700164795, "learning_rate": 0.0001112962962962963, "loss": 3.9139, "step": 2400 }, { "epoch": 0.3616943117917997, "eval_loss": 3.8293397981153188, "eval_perplexity": 46.03213769600957, "eval_runtime": 42.6597, "eval_samples_per_second": 38.889, "eval_steps_per_second": 0.609, "step": 2400 }, { "epoch": 0.36922960995412885, "grad_norm": 0.21242469549179077, "learning_rate": 0.00010203703703703704, "loss": 3.9004, "step": 2450 }, { "epoch": 0.376764908116458, "grad_norm": 0.2224598526954651, "learning_rate": 9.277777777777778e-05, "loss": 3.9026, "step": 2500 }, { "epoch": 0.3843002062787872, "grad_norm": 0.19466078281402588, "learning_rate": 8.351851851851852e-05, "loss": 3.894, "step": 2550 }, { "epoch": 0.39183550444111637, "grad_norm": 0.19186748564243317, "learning_rate": 7.425925925925927e-05, "loss": 3.8922, "step": 2600 }, { "epoch": 0.39937080260344554, "grad_norm": 0.1871800720691681, "learning_rate": 6.500000000000001e-05, "loss": 3.8897, "step": 2650 }, { "epoch": 0.40690610076577466, "grad_norm": 0.1799030303955078, "learning_rate": 5.5740740740740744e-05, "loss": 3.8854, "step": 2700 }, { "epoch": 0.40690610076577466, "eval_loss": 3.8063672254540974, "eval_perplexity": 44.98671506714982, "eval_runtime": 42.5792, "eval_samples_per_second": 38.963, "eval_steps_per_second": 0.611, "step": 2700 }, { "epoch": 0.41444139892810383, "grad_norm": 0.17992734909057617, "learning_rate": 4.6481481481481486e-05, "loss": 3.8828, "step": 2750 }, { "epoch": 0.421976697090433, "grad_norm": 0.1795385181903839, "learning_rate": 3.722222222222222e-05, "loss": 3.88, "step": 2800 }, { "epoch": 0.4295119952527622, "grad_norm": 0.17680570483207703, "learning_rate": 2.7962962962962965e-05, "loss": 3.8807, "step": 2850 }, { "epoch": 0.4370472934150913, "grad_norm": 0.16620458662509918, "learning_rate": 1.8703703703703707e-05, "loss": 3.8766, "step": 2900 }, { "epoch": 0.44458259157742047, "grad_norm": 0.227438822388649, "learning_rate": 9.444444444444445e-06, "loss": 3.8783, "step": 2950 }, { "epoch": 0.45211788973974965, "grad_norm": 0.1560600996017456, "learning_rate": 1.8518518518518518e-07, "loss": 3.876, "step": 3000 }, { "epoch": 0.45211788973974965, "eval_loss": 3.7956512070627526, "eval_perplexity": 44.507210380219355, "eval_runtime": 42.4711, "eval_samples_per_second": 39.062, "eval_steps_per_second": 0.612, "step": 3000 } ], "logging_steps": 50, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.244546904064e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }