{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9968652037617556,
"eval_steps": 500,
"global_step": 2391,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012539184952978057,
"grad_norm": 50.64135999803937,
"learning_rate": 0.0,
"loss": 11.1605,
"step": 1
},
{
"epoch": 0.0025078369905956114,
"grad_norm": 54.44006733109489,
"learning_rate": 2.0833333333333333e-07,
"loss": 11.1229,
"step": 2
},
{
"epoch": 0.003761755485893417,
"grad_norm": 53.544806176439245,
"learning_rate": 4.1666666666666667e-07,
"loss": 11.0341,
"step": 3
},
{
"epoch": 0.005015673981191223,
"grad_norm": 52.841468929574354,
"learning_rate": 6.25e-07,
"loss": 11.0757,
"step": 4
},
{
"epoch": 0.006269592476489028,
"grad_norm": 56.03226546075816,
"learning_rate": 8.333333333333333e-07,
"loss": 11.0005,
"step": 5
},
{
"epoch": 0.007523510971786834,
"grad_norm": 55.24840407403759,
"learning_rate": 1.0416666666666667e-06,
"loss": 11.0074,
"step": 6
},
{
"epoch": 0.00877742946708464,
"grad_norm": 53.135884342580766,
"learning_rate": 1.25e-06,
"loss": 11.072,
"step": 7
},
{
"epoch": 0.010031347962382446,
"grad_norm": 56.85722709985535,
"learning_rate": 1.4583333333333335e-06,
"loss": 10.8116,
"step": 8
},
{
"epoch": 0.01128526645768025,
"grad_norm": 63.80469694729264,
"learning_rate": 1.6666666666666667e-06,
"loss": 10.5033,
"step": 9
},
{
"epoch": 0.012539184952978056,
"grad_norm": 60.84758521078789,
"learning_rate": 1.875e-06,
"loss": 10.5154,
"step": 10
},
{
"epoch": 0.013793103448275862,
"grad_norm": 80.52702045151378,
"learning_rate": 2.0833333333333334e-06,
"loss": 9.5285,
"step": 11
},
{
"epoch": 0.015047021943573668,
"grad_norm": 80.82658311765395,
"learning_rate": 2.2916666666666666e-06,
"loss": 9.277,
"step": 12
},
{
"epoch": 0.016300940438871474,
"grad_norm": 93.9968144272142,
"learning_rate": 2.5e-06,
"loss": 8.7101,
"step": 13
},
{
"epoch": 0.01755485893416928,
"grad_norm": 74.73626923094774,
"learning_rate": 2.7083333333333334e-06,
"loss": 4.359,
"step": 14
},
{
"epoch": 0.018808777429467086,
"grad_norm": 63.325961403452666,
"learning_rate": 2.916666666666667e-06,
"loss": 3.911,
"step": 15
},
{
"epoch": 0.02006269592476489,
"grad_norm": 58.09507288811587,
"learning_rate": 3.125e-06,
"loss": 3.4933,
"step": 16
},
{
"epoch": 0.021316614420062698,
"grad_norm": 40.1003348158749,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.8272,
"step": 17
},
{
"epoch": 0.0225705329153605,
"grad_norm": 33.77104421019437,
"learning_rate": 3.541666666666667e-06,
"loss": 2.4705,
"step": 18
},
{
"epoch": 0.023824451410658306,
"grad_norm": 6.8720348871086685,
"learning_rate": 3.75e-06,
"loss": 1.4665,
"step": 19
},
{
"epoch": 0.025078369905956112,
"grad_norm": 4.909790572454527,
"learning_rate": 3.958333333333333e-06,
"loss": 1.3211,
"step": 20
},
{
"epoch": 0.026332288401253918,
"grad_norm": 4.23184759338397,
"learning_rate": 4.166666666666667e-06,
"loss": 1.3126,
"step": 21
},
{
"epoch": 0.027586206896551724,
"grad_norm": 3.312162777085822,
"learning_rate": 4.375e-06,
"loss": 1.215,
"step": 22
},
{
"epoch": 0.02884012539184953,
"grad_norm": 2.5405398827982117,
"learning_rate": 4.583333333333333e-06,
"loss": 1.1317,
"step": 23
},
{
"epoch": 0.030094043887147336,
"grad_norm": 2.208436217187417,
"learning_rate": 4.791666666666667e-06,
"loss": 1.1296,
"step": 24
},
{
"epoch": 0.03134796238244514,
"grad_norm": 1.7867671726648862,
"learning_rate": 5e-06,
"loss": 1.0361,
"step": 25
},
{
"epoch": 0.03260188087774295,
"grad_norm": 1.5288002406194068,
"learning_rate": 5.208333333333334e-06,
"loss": 1.0082,
"step": 26
},
{
"epoch": 0.03385579937304075,
"grad_norm": 2.1348491702205457,
"learning_rate": 5.416666666666667e-06,
"loss": 0.9204,
"step": 27
},
{
"epoch": 0.03510971786833856,
"grad_norm": 1.1714518049047233,
"learning_rate": 5.625e-06,
"loss": 0.8681,
"step": 28
},
{
"epoch": 0.03636363636363636,
"grad_norm": 0.9885877174050772,
"learning_rate": 5.833333333333334e-06,
"loss": 0.8577,
"step": 29
},
{
"epoch": 0.03761755485893417,
"grad_norm": 0.9930662635157355,
"learning_rate": 6.041666666666667e-06,
"loss": 0.8785,
"step": 30
},
{
"epoch": 0.038871473354231974,
"grad_norm": 0.8616092080743135,
"learning_rate": 6.25e-06,
"loss": 0.842,
"step": 31
},
{
"epoch": 0.04012539184952978,
"grad_norm": 0.8155694244853302,
"learning_rate": 6.458333333333334e-06,
"loss": 0.8055,
"step": 32
},
{
"epoch": 0.041379310344827586,
"grad_norm": 0.7248150081369437,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7889,
"step": 33
},
{
"epoch": 0.042633228840125395,
"grad_norm": 0.6315667513847614,
"learning_rate": 6.875000000000001e-06,
"loss": 0.7539,
"step": 34
},
{
"epoch": 0.0438871473354232,
"grad_norm": 0.6229264651088183,
"learning_rate": 7.083333333333334e-06,
"loss": 0.7374,
"step": 35
},
{
"epoch": 0.045141065830721,
"grad_norm": 0.5894468915785406,
"learning_rate": 7.2916666666666674e-06,
"loss": 0.7153,
"step": 36
},
{
"epoch": 0.04639498432601881,
"grad_norm": 0.7412298880666272,
"learning_rate": 7.5e-06,
"loss": 0.781,
"step": 37
},
{
"epoch": 0.04764890282131661,
"grad_norm": 0.6108396631779481,
"learning_rate": 7.708333333333334e-06,
"loss": 0.7598,
"step": 38
},
{
"epoch": 0.04890282131661442,
"grad_norm": 0.7331246782865163,
"learning_rate": 7.916666666666667e-06,
"loss": 0.765,
"step": 39
},
{
"epoch": 0.050156739811912224,
"grad_norm": 0.5367626650120569,
"learning_rate": 8.125000000000001e-06,
"loss": 0.7003,
"step": 40
},
{
"epoch": 0.05141065830721003,
"grad_norm": 0.5419134040669606,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6959,
"step": 41
},
{
"epoch": 0.052664576802507836,
"grad_norm": 0.47797671630724625,
"learning_rate": 8.541666666666666e-06,
"loss": 0.6716,
"step": 42
},
{
"epoch": 0.053918495297805645,
"grad_norm": 0.5198937653156491,
"learning_rate": 8.75e-06,
"loss": 0.701,
"step": 43
},
{
"epoch": 0.05517241379310345,
"grad_norm": 0.5183180691645413,
"learning_rate": 8.958333333333334e-06,
"loss": 0.7003,
"step": 44
},
{
"epoch": 0.05642633228840126,
"grad_norm": 0.5104629468915255,
"learning_rate": 9.166666666666666e-06,
"loss": 0.6805,
"step": 45
},
{
"epoch": 0.05768025078369906,
"grad_norm": 0.44568220181680257,
"learning_rate": 9.375000000000001e-06,
"loss": 0.6869,
"step": 46
},
{
"epoch": 0.05893416927899686,
"grad_norm": 0.45128493498903965,
"learning_rate": 9.583333333333334e-06,
"loss": 0.6606,
"step": 47
},
{
"epoch": 0.06018808777429467,
"grad_norm": 0.4534571381454502,
"learning_rate": 9.791666666666666e-06,
"loss": 0.6598,
"step": 48
},
{
"epoch": 0.061442006269592474,
"grad_norm": 0.42918816038128427,
"learning_rate": 1e-05,
"loss": 0.6152,
"step": 49
},
{
"epoch": 0.06269592476489028,
"grad_norm": 0.3792843446436007,
"learning_rate": 1.0208333333333334e-05,
"loss": 0.6609,
"step": 50
},
{
"epoch": 0.06394984326018809,
"grad_norm": 0.39532095227218156,
"learning_rate": 1.0416666666666668e-05,
"loss": 0.644,
"step": 51
},
{
"epoch": 0.0652037617554859,
"grad_norm": 0.4136624263590355,
"learning_rate": 1.0625e-05,
"loss": 0.6151,
"step": 52
},
{
"epoch": 0.0664576802507837,
"grad_norm": 0.40973984468365926,
"learning_rate": 1.0833333333333334e-05,
"loss": 0.6445,
"step": 53
},
{
"epoch": 0.0677115987460815,
"grad_norm": 0.3341251890648894,
"learning_rate": 1.1041666666666666e-05,
"loss": 0.6373,
"step": 54
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.32438360923545945,
"learning_rate": 1.125e-05,
"loss": 0.6062,
"step": 55
},
{
"epoch": 0.07021943573667712,
"grad_norm": 0.35971295394635117,
"learning_rate": 1.1458333333333333e-05,
"loss": 0.6403,
"step": 56
},
{
"epoch": 0.07147335423197493,
"grad_norm": 0.354471384444684,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.626,
"step": 57
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.35971680729134936,
"learning_rate": 1.1875e-05,
"loss": 0.6139,
"step": 58
},
{
"epoch": 0.07398119122257053,
"grad_norm": 0.2908796022942612,
"learning_rate": 1.2083333333333333e-05,
"loss": 0.5988,
"step": 59
},
{
"epoch": 0.07523510971786834,
"grad_norm": 0.3471442524579246,
"learning_rate": 1.2291666666666666e-05,
"loss": 0.608,
"step": 60
},
{
"epoch": 0.07648902821316614,
"grad_norm": 0.3239613734813439,
"learning_rate": 1.25e-05,
"loss": 0.6249,
"step": 61
},
{
"epoch": 0.07774294670846395,
"grad_norm": 0.34113715110751086,
"learning_rate": 1.2708333333333333e-05,
"loss": 0.647,
"step": 62
},
{
"epoch": 0.07899686520376176,
"grad_norm": 0.3280014258910571,
"learning_rate": 1.2916666666666668e-05,
"loss": 0.6212,
"step": 63
},
{
"epoch": 0.08025078369905957,
"grad_norm": 0.29294717778113777,
"learning_rate": 1.3125e-05,
"loss": 0.6132,
"step": 64
},
{
"epoch": 0.08150470219435736,
"grad_norm": 0.34717002848114126,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.5909,
"step": 65
},
{
"epoch": 0.08275862068965517,
"grad_norm": 0.3747315121598484,
"learning_rate": 1.3541666666666666e-05,
"loss": 0.5944,
"step": 66
},
{
"epoch": 0.08401253918495298,
"grad_norm": 0.3073539717304244,
"learning_rate": 1.3750000000000002e-05,
"loss": 0.5936,
"step": 67
},
{
"epoch": 0.08526645768025079,
"grad_norm": 0.35564050159510535,
"learning_rate": 1.3958333333333335e-05,
"loss": 0.6039,
"step": 68
},
{
"epoch": 0.08652037617554859,
"grad_norm": 0.30590372508933117,
"learning_rate": 1.4166666666666668e-05,
"loss": 0.5666,
"step": 69
},
{
"epoch": 0.0877742946708464,
"grad_norm": 0.3486426841772046,
"learning_rate": 1.4374999999999999e-05,
"loss": 0.6004,
"step": 70
},
{
"epoch": 0.0890282131661442,
"grad_norm": 0.28719759020338803,
"learning_rate": 1.4583333333333335e-05,
"loss": 0.5805,
"step": 71
},
{
"epoch": 0.090282131661442,
"grad_norm": 0.3409536205637824,
"learning_rate": 1.4791666666666668e-05,
"loss": 0.6086,
"step": 72
},
{
"epoch": 0.09153605015673981,
"grad_norm": 0.28436275528631205,
"learning_rate": 1.5e-05,
"loss": 0.5759,
"step": 73
},
{
"epoch": 0.09278996865203762,
"grad_norm": 0.3115793310495634,
"learning_rate": 1.5208333333333333e-05,
"loss": 0.6218,
"step": 74
},
{
"epoch": 0.09404388714733543,
"grad_norm": 0.3637587177467416,
"learning_rate": 1.5416666666666668e-05,
"loss": 0.5849,
"step": 75
},
{
"epoch": 0.09529780564263322,
"grad_norm": 0.3209888361303752,
"learning_rate": 1.5625e-05,
"loss": 0.5831,
"step": 76
},
{
"epoch": 0.09655172413793103,
"grad_norm": 0.3284865645004801,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.5782,
"step": 77
},
{
"epoch": 0.09780564263322884,
"grad_norm": 0.2806798636486924,
"learning_rate": 1.604166666666667e-05,
"loss": 0.5702,
"step": 78
},
{
"epoch": 0.09905956112852665,
"grad_norm": 0.4125948027547393,
"learning_rate": 1.6250000000000002e-05,
"loss": 0.6035,
"step": 79
},
{
"epoch": 0.10031347962382445,
"grad_norm": 0.27394791792012835,
"learning_rate": 1.6458333333333335e-05,
"loss": 0.5795,
"step": 80
},
{
"epoch": 0.10156739811912226,
"grad_norm": 0.35819924622567145,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6003,
"step": 81
},
{
"epoch": 0.10282131661442007,
"grad_norm": 0.3570062906995411,
"learning_rate": 1.6875000000000004e-05,
"loss": 0.5592,
"step": 82
},
{
"epoch": 0.10407523510971786,
"grad_norm": 0.3077594871557812,
"learning_rate": 1.7083333333333333e-05,
"loss": 0.5483,
"step": 83
},
{
"epoch": 0.10532915360501567,
"grad_norm": 0.30214124595300534,
"learning_rate": 1.7291666666666666e-05,
"loss": 0.5746,
"step": 84
},
{
"epoch": 0.10658307210031348,
"grad_norm": 0.3118032051740413,
"learning_rate": 1.75e-05,
"loss": 0.5438,
"step": 85
},
{
"epoch": 0.10783699059561129,
"grad_norm": 0.3579572942586854,
"learning_rate": 1.7708333333333335e-05,
"loss": 0.576,
"step": 86
},
{
"epoch": 0.10909090909090909,
"grad_norm": 0.32108754271927414,
"learning_rate": 1.7916666666666667e-05,
"loss": 0.5468,
"step": 87
},
{
"epoch": 0.1103448275862069,
"grad_norm": 0.36865303655426157,
"learning_rate": 1.8125e-05,
"loss": 0.5574,
"step": 88
},
{
"epoch": 0.1115987460815047,
"grad_norm": 0.34985718836885343,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.5805,
"step": 89
},
{
"epoch": 0.11285266457680251,
"grad_norm": 0.3905896291936876,
"learning_rate": 1.854166666666667e-05,
"loss": 0.5506,
"step": 90
},
{
"epoch": 0.11410658307210031,
"grad_norm": 0.39379899612118624,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.5743,
"step": 91
},
{
"epoch": 0.11536050156739812,
"grad_norm": 0.43105463144862316,
"learning_rate": 1.8958333333333334e-05,
"loss": 0.5609,
"step": 92
},
{
"epoch": 0.11661442006269593,
"grad_norm": 0.36454454801688724,
"learning_rate": 1.9166666666666667e-05,
"loss": 0.5622,
"step": 93
},
{
"epoch": 0.11786833855799372,
"grad_norm": 0.416860511322126,
"learning_rate": 1.9375e-05,
"loss": 0.5803,
"step": 94
},
{
"epoch": 0.11912225705329153,
"grad_norm": 0.39730819524122646,
"learning_rate": 1.9583333333333333e-05,
"loss": 0.5274,
"step": 95
},
{
"epoch": 0.12037617554858934,
"grad_norm": 0.40212091995349764,
"learning_rate": 1.9791666666666665e-05,
"loss": 0.5455,
"step": 96
},
{
"epoch": 0.12163009404388715,
"grad_norm": 0.37458755038175395,
"learning_rate": 2e-05,
"loss": 0.5595,
"step": 97
},
{
"epoch": 0.12288401253918495,
"grad_norm": 0.3808132592974076,
"learning_rate": 2.0208333333333334e-05,
"loss": 0.5816,
"step": 98
},
{
"epoch": 0.12413793103448276,
"grad_norm": 0.4217750038485611,
"learning_rate": 2.0416666666666667e-05,
"loss": 0.5296,
"step": 99
},
{
"epoch": 0.12539184952978055,
"grad_norm": 0.37094576242050337,
"learning_rate": 2.0625e-05,
"loss": 0.5484,
"step": 100
},
{
"epoch": 0.12664576802507838,
"grad_norm": 0.39890335445372344,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.5472,
"step": 101
},
{
"epoch": 0.12789968652037617,
"grad_norm": 0.33217111889520756,
"learning_rate": 2.104166666666667e-05,
"loss": 0.5399,
"step": 102
},
{
"epoch": 0.129153605015674,
"grad_norm": 0.39995424423412806,
"learning_rate": 2.125e-05,
"loss": 0.5381,
"step": 103
},
{
"epoch": 0.1304075235109718,
"grad_norm": 0.3488175697417937,
"learning_rate": 2.1458333333333334e-05,
"loss": 0.5192,
"step": 104
},
{
"epoch": 0.13166144200626959,
"grad_norm": 0.4414103381681035,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.5273,
"step": 105
},
{
"epoch": 0.1329153605015674,
"grad_norm": 0.3249563897635953,
"learning_rate": 2.1875e-05,
"loss": 0.5336,
"step": 106
},
{
"epoch": 0.1341692789968652,
"grad_norm": 0.4445301972510841,
"learning_rate": 2.2083333333333333e-05,
"loss": 0.5528,
"step": 107
},
{
"epoch": 0.135423197492163,
"grad_norm": 0.35906405137265135,
"learning_rate": 2.229166666666667e-05,
"loss": 0.5207,
"step": 108
},
{
"epoch": 0.13667711598746082,
"grad_norm": 0.38423102560771794,
"learning_rate": 2.25e-05,
"loss": 0.5439,
"step": 109
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.38389715115947437,
"learning_rate": 2.2708333333333334e-05,
"loss": 0.527,
"step": 110
},
{
"epoch": 0.13918495297805641,
"grad_norm": 0.45226627300518973,
"learning_rate": 2.2916666666666667e-05,
"loss": 0.5509,
"step": 111
},
{
"epoch": 0.14043887147335424,
"grad_norm": 0.3696238426430523,
"learning_rate": 2.3125000000000003e-05,
"loss": 0.5407,
"step": 112
},
{
"epoch": 0.14169278996865203,
"grad_norm": 0.39372933251851006,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.5343,
"step": 113
},
{
"epoch": 0.14294670846394986,
"grad_norm": 0.3996721172055718,
"learning_rate": 2.354166666666667e-05,
"loss": 0.5368,
"step": 114
},
{
"epoch": 0.14420062695924765,
"grad_norm": 0.407734338511496,
"learning_rate": 2.375e-05,
"loss": 0.5577,
"step": 115
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.40796451893774355,
"learning_rate": 2.3958333333333334e-05,
"loss": 0.5339,
"step": 116
},
{
"epoch": 0.14670846394984327,
"grad_norm": 0.3213804794889667,
"learning_rate": 2.4166666666666667e-05,
"loss": 0.5291,
"step": 117
},
{
"epoch": 0.14796238244514107,
"grad_norm": 0.4091567515542283,
"learning_rate": 2.4375e-05,
"loss": 0.5432,
"step": 118
},
{
"epoch": 0.14921630094043886,
"grad_norm": 0.42486732455445864,
"learning_rate": 2.4583333333333332e-05,
"loss": 0.5318,
"step": 119
},
{
"epoch": 0.15047021943573669,
"grad_norm": 0.37592138198897335,
"learning_rate": 2.479166666666667e-05,
"loss": 0.5369,
"step": 120
},
{
"epoch": 0.15172413793103448,
"grad_norm": 0.3416524853312382,
"learning_rate": 2.5e-05,
"loss": 0.5153,
"step": 121
},
{
"epoch": 0.15297805642633228,
"grad_norm": 0.4098677518261708,
"learning_rate": 2.5208333333333334e-05,
"loss": 0.5582,
"step": 122
},
{
"epoch": 0.1542319749216301,
"grad_norm": 0.3875929609009856,
"learning_rate": 2.5416666666666667e-05,
"loss": 0.5207,
"step": 123
},
{
"epoch": 0.1554858934169279,
"grad_norm": 0.3604683885209445,
"learning_rate": 2.5625e-05,
"loss": 0.5185,
"step": 124
},
{
"epoch": 0.15673981191222572,
"grad_norm": 0.3532423904897217,
"learning_rate": 2.5833333333333336e-05,
"loss": 0.5264,
"step": 125
},
{
"epoch": 0.1579937304075235,
"grad_norm": 0.34891556028635146,
"learning_rate": 2.604166666666667e-05,
"loss": 0.513,
"step": 126
},
{
"epoch": 0.1592476489028213,
"grad_norm": 0.3751612400036874,
"learning_rate": 2.625e-05,
"loss": 0.5249,
"step": 127
},
{
"epoch": 0.16050156739811913,
"grad_norm": 0.3872297855360959,
"learning_rate": 2.6458333333333334e-05,
"loss": 0.5256,
"step": 128
},
{
"epoch": 0.16175548589341693,
"grad_norm": 0.36704590533276366,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.5277,
"step": 129
},
{
"epoch": 0.16300940438871472,
"grad_norm": 0.39445158768296706,
"learning_rate": 2.6875e-05,
"loss": 0.5413,
"step": 130
},
{
"epoch": 0.16426332288401255,
"grad_norm": 0.36608518722578337,
"learning_rate": 2.7083333333333332e-05,
"loss": 0.529,
"step": 131
},
{
"epoch": 0.16551724137931034,
"grad_norm": 0.4338212197641358,
"learning_rate": 2.7291666666666665e-05,
"loss": 0.5511,
"step": 132
},
{
"epoch": 0.16677115987460814,
"grad_norm": 0.5213913964423107,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.5238,
"step": 133
},
{
"epoch": 0.16802507836990596,
"grad_norm": 0.4392428081829348,
"learning_rate": 2.7708333333333337e-05,
"loss": 0.4967,
"step": 134
},
{
"epoch": 0.16927899686520376,
"grad_norm": 0.41321211622317533,
"learning_rate": 2.791666666666667e-05,
"loss": 0.5406,
"step": 135
},
{
"epoch": 0.17053291536050158,
"grad_norm": 0.6380367806099768,
"learning_rate": 2.8125000000000003e-05,
"loss": 0.5141,
"step": 136
},
{
"epoch": 0.17178683385579938,
"grad_norm": 0.49566635260262487,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.5137,
"step": 137
},
{
"epoch": 0.17304075235109717,
"grad_norm": 0.5231550884340032,
"learning_rate": 2.8541666666666668e-05,
"loss": 0.538,
"step": 138
},
{
"epoch": 0.174294670846395,
"grad_norm": 0.5035173445845378,
"learning_rate": 2.8749999999999997e-05,
"loss": 0.5449,
"step": 139
},
{
"epoch": 0.1755485893416928,
"grad_norm": 0.5629214360819548,
"learning_rate": 2.8958333333333337e-05,
"loss": 0.5375,
"step": 140
},
{
"epoch": 0.17680250783699059,
"grad_norm": 0.4627158393358165,
"learning_rate": 2.916666666666667e-05,
"loss": 0.5253,
"step": 141
},
{
"epoch": 0.1780564263322884,
"grad_norm": 0.4769639696868499,
"learning_rate": 2.9375000000000003e-05,
"loss": 0.5294,
"step": 142
},
{
"epoch": 0.1793103448275862,
"grad_norm": 0.49553224020515213,
"learning_rate": 2.9583333333333335e-05,
"loss": 0.521,
"step": 143
},
{
"epoch": 0.180564263322884,
"grad_norm": 0.49597455350511377,
"learning_rate": 2.9791666666666668e-05,
"loss": 0.5325,
"step": 144
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.4372379817912527,
"learning_rate": 3e-05,
"loss": 0.5192,
"step": 145
},
{
"epoch": 0.18307210031347962,
"grad_norm": 0.39089665616220565,
"learning_rate": 3.0208333333333334e-05,
"loss": 0.5037,
"step": 146
},
{
"epoch": 0.18432601880877744,
"grad_norm": 0.4631748610553936,
"learning_rate": 3.0416666666666666e-05,
"loss": 0.5243,
"step": 147
},
{
"epoch": 0.18557993730407524,
"grad_norm": 0.44111356395890394,
"learning_rate": 3.0625000000000006e-05,
"loss": 0.4944,
"step": 148
},
{
"epoch": 0.18683385579937303,
"grad_norm": 0.49207043974323883,
"learning_rate": 3.0833333333333335e-05,
"loss": 0.532,
"step": 149
},
{
"epoch": 0.18808777429467086,
"grad_norm": 0.3682857959701906,
"learning_rate": 3.104166666666667e-05,
"loss": 0.5173,
"step": 150
},
{
"epoch": 0.18934169278996865,
"grad_norm": 0.44594000200971,
"learning_rate": 3.125e-05,
"loss": 0.5209,
"step": 151
},
{
"epoch": 0.19059561128526645,
"grad_norm": 0.41556645278574755,
"learning_rate": 3.145833333333334e-05,
"loss": 0.5507,
"step": 152
},
{
"epoch": 0.19184952978056427,
"grad_norm": 0.3525220437226554,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.5066,
"step": 153
},
{
"epoch": 0.19310344827586207,
"grad_norm": 0.41653646429173874,
"learning_rate": 3.1875e-05,
"loss": 0.5094,
"step": 154
},
{
"epoch": 0.19435736677115986,
"grad_norm": 0.3701330521331497,
"learning_rate": 3.208333333333334e-05,
"loss": 0.5175,
"step": 155
},
{
"epoch": 0.19561128526645769,
"grad_norm": 0.39758417203237806,
"learning_rate": 3.229166666666667e-05,
"loss": 0.4919,
"step": 156
},
{
"epoch": 0.19686520376175548,
"grad_norm": 0.3808307697795449,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.511,
"step": 157
},
{
"epoch": 0.1981191222570533,
"grad_norm": 0.3772929738645413,
"learning_rate": 3.270833333333333e-05,
"loss": 0.5038,
"step": 158
},
{
"epoch": 0.1993730407523511,
"grad_norm": 0.5048389797769592,
"learning_rate": 3.291666666666667e-05,
"loss": 0.5245,
"step": 159
},
{
"epoch": 0.2006269592476489,
"grad_norm": 0.49424359925818384,
"learning_rate": 3.3125e-05,
"loss": 0.5385,
"step": 160
},
{
"epoch": 0.20188087774294672,
"grad_norm": 0.43000466584271196,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4833,
"step": 161
},
{
"epoch": 0.2031347962382445,
"grad_norm": 0.486919767715433,
"learning_rate": 3.3541666666666664e-05,
"loss": 0.5115,
"step": 162
},
{
"epoch": 0.2043887147335423,
"grad_norm": 0.42268016944782205,
"learning_rate": 3.375000000000001e-05,
"loss": 0.5358,
"step": 163
},
{
"epoch": 0.20564263322884013,
"grad_norm": 0.4687576813971511,
"learning_rate": 3.3958333333333337e-05,
"loss": 0.525,
"step": 164
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.49541629801161974,
"learning_rate": 3.4166666666666666e-05,
"loss": 0.5032,
"step": 165
},
{
"epoch": 0.20815047021943572,
"grad_norm": 0.4077298375054488,
"learning_rate": 3.4375e-05,
"loss": 0.4982,
"step": 166
},
{
"epoch": 0.20940438871473355,
"grad_norm": 0.4095341348243131,
"learning_rate": 3.458333333333333e-05,
"loss": 0.4935,
"step": 167
},
{
"epoch": 0.21065830721003134,
"grad_norm": 0.4809124301556673,
"learning_rate": 3.479166666666667e-05,
"loss": 0.5048,
"step": 168
},
{
"epoch": 0.21191222570532917,
"grad_norm": 0.5058627374289052,
"learning_rate": 3.5e-05,
"loss": 0.5018,
"step": 169
},
{
"epoch": 0.21316614420062696,
"grad_norm": 0.43743026274447805,
"learning_rate": 3.520833333333334e-05,
"loss": 0.5197,
"step": 170
},
{
"epoch": 0.21442006269592476,
"grad_norm": 0.6171180805029759,
"learning_rate": 3.541666666666667e-05,
"loss": 0.5212,
"step": 171
},
{
"epoch": 0.21567398119122258,
"grad_norm": 0.676039126782024,
"learning_rate": 3.5625000000000005e-05,
"loss": 0.4844,
"step": 172
},
{
"epoch": 0.21692789968652038,
"grad_norm": 0.47849477754149533,
"learning_rate": 3.5833333333333335e-05,
"loss": 0.5086,
"step": 173
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.6310336028014092,
"learning_rate": 3.604166666666667e-05,
"loss": 0.5224,
"step": 174
},
{
"epoch": 0.219435736677116,
"grad_norm": 0.5636747812278621,
"learning_rate": 3.625e-05,
"loss": 0.5408,
"step": 175
},
{
"epoch": 0.2206896551724138,
"grad_norm": 0.5570968365909392,
"learning_rate": 3.6458333333333336e-05,
"loss": 0.4818,
"step": 176
},
{
"epoch": 0.22194357366771159,
"grad_norm": 0.6106021635715896,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.4755,
"step": 177
},
{
"epoch": 0.2231974921630094,
"grad_norm": 0.7303014896013104,
"learning_rate": 3.6875e-05,
"loss": 0.5336,
"step": 178
},
{
"epoch": 0.2244514106583072,
"grad_norm": 0.5322823620607594,
"learning_rate": 3.708333333333334e-05,
"loss": 0.5071,
"step": 179
},
{
"epoch": 0.22570532915360503,
"grad_norm": 0.629222839853943,
"learning_rate": 3.729166666666667e-05,
"loss": 0.4989,
"step": 180
},
{
"epoch": 0.22695924764890282,
"grad_norm": 0.7990024468563582,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.506,
"step": 181
},
{
"epoch": 0.22821316614420062,
"grad_norm": 0.6115760581214867,
"learning_rate": 3.770833333333333e-05,
"loss": 0.4992,
"step": 182
},
{
"epoch": 0.22946708463949844,
"grad_norm": 0.8289460841173271,
"learning_rate": 3.791666666666667e-05,
"loss": 0.5417,
"step": 183
},
{
"epoch": 0.23072100313479624,
"grad_norm": 0.7116486634014242,
"learning_rate": 3.8125e-05,
"loss": 0.4785,
"step": 184
},
{
"epoch": 0.23197492163009403,
"grad_norm": 0.608416412159041,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.4838,
"step": 185
},
{
"epoch": 0.23322884012539186,
"grad_norm": 0.7574053178094695,
"learning_rate": 3.854166666666667e-05,
"loss": 0.5164,
"step": 186
},
{
"epoch": 0.23448275862068965,
"grad_norm": 0.8292040718716209,
"learning_rate": 3.875e-05,
"loss": 0.5359,
"step": 187
},
{
"epoch": 0.23573667711598745,
"grad_norm": 0.49541552951596945,
"learning_rate": 3.8958333333333336e-05,
"loss": 0.519,
"step": 188
},
{
"epoch": 0.23699059561128527,
"grad_norm": 0.7727007379400457,
"learning_rate": 3.9166666666666665e-05,
"loss": 0.4788,
"step": 189
},
{
"epoch": 0.23824451410658307,
"grad_norm": 0.6921735078607218,
"learning_rate": 3.9375e-05,
"loss": 0.4671,
"step": 190
},
{
"epoch": 0.2394984326018809,
"grad_norm": 0.520293356251696,
"learning_rate": 3.958333333333333e-05,
"loss": 0.4852,
"step": 191
},
{
"epoch": 0.24075235109717869,
"grad_norm": 0.9551925220125765,
"learning_rate": 3.979166666666667e-05,
"loss": 0.5113,
"step": 192
},
{
"epoch": 0.24200626959247648,
"grad_norm": 0.58492603542875,
"learning_rate": 4e-05,
"loss": 0.5114,
"step": 193
},
{
"epoch": 0.2432601880877743,
"grad_norm": 0.6989988687763795,
"learning_rate": 4.020833333333334e-05,
"loss": 0.4817,
"step": 194
},
{
"epoch": 0.2445141065830721,
"grad_norm": 0.629896566723547,
"learning_rate": 4.041666666666667e-05,
"loss": 0.4897,
"step": 195
},
{
"epoch": 0.2457680250783699,
"grad_norm": 0.48539607730798956,
"learning_rate": 4.0625000000000005e-05,
"loss": 0.5028,
"step": 196
},
{
"epoch": 0.24702194357366772,
"grad_norm": 0.7707520829218591,
"learning_rate": 4.0833333333333334e-05,
"loss": 0.4989,
"step": 197
},
{
"epoch": 0.2482758620689655,
"grad_norm": 0.5691094495312282,
"learning_rate": 4.104166666666667e-05,
"loss": 0.4732,
"step": 198
},
{
"epoch": 0.2495297805642633,
"grad_norm": 0.6943195873156014,
"learning_rate": 4.125e-05,
"loss": 0.5031,
"step": 199
},
{
"epoch": 0.2507836990595611,
"grad_norm": 0.712773047741249,
"learning_rate": 4.1458333333333336e-05,
"loss": 0.4921,
"step": 200
},
{
"epoch": 0.25203761755485893,
"grad_norm": 0.5961161161114042,
"learning_rate": 4.166666666666667e-05,
"loss": 0.4984,
"step": 201
},
{
"epoch": 0.25329153605015675,
"grad_norm": 0.8735265727722459,
"learning_rate": 4.1875e-05,
"loss": 0.5119,
"step": 202
},
{
"epoch": 0.2545454545454545,
"grad_norm": 0.528431235946117,
"learning_rate": 4.208333333333334e-05,
"loss": 0.4877,
"step": 203
},
{
"epoch": 0.25579937304075234,
"grad_norm": 0.783879350756353,
"learning_rate": 4.229166666666667e-05,
"loss": 0.4721,
"step": 204
},
{
"epoch": 0.25705329153605017,
"grad_norm": 0.5385752314897841,
"learning_rate": 4.25e-05,
"loss": 0.4784,
"step": 205
},
{
"epoch": 0.258307210031348,
"grad_norm": 0.6840418910143427,
"learning_rate": 4.270833333333333e-05,
"loss": 0.4936,
"step": 206
},
{
"epoch": 0.25956112852664576,
"grad_norm": 0.7128109766661378,
"learning_rate": 4.291666666666667e-05,
"loss": 0.5224,
"step": 207
},
{
"epoch": 0.2608150470219436,
"grad_norm": 0.5283517499670323,
"learning_rate": 4.3125000000000005e-05,
"loss": 0.5327,
"step": 208
},
{
"epoch": 0.2620689655172414,
"grad_norm": 0.6170509034688557,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.5041,
"step": 209
},
{
"epoch": 0.26332288401253917,
"grad_norm": 0.45886905059882427,
"learning_rate": 4.354166666666667e-05,
"loss": 0.4866,
"step": 210
},
{
"epoch": 0.264576802507837,
"grad_norm": 0.5068179366201936,
"learning_rate": 4.375e-05,
"loss": 0.4987,
"step": 211
},
{
"epoch": 0.2658307210031348,
"grad_norm": 0.4239254240707721,
"learning_rate": 4.3958333333333336e-05,
"loss": 0.4497,
"step": 212
},
{
"epoch": 0.2670846394984326,
"grad_norm": 0.47930699343351496,
"learning_rate": 4.4166666666666665e-05,
"loss": 0.491,
"step": 213
},
{
"epoch": 0.2683385579937304,
"grad_norm": 0.459639895097182,
"learning_rate": 4.4375e-05,
"loss": 0.4762,
"step": 214
},
{
"epoch": 0.26959247648902823,
"grad_norm": 0.46706346153816425,
"learning_rate": 4.458333333333334e-05,
"loss": 0.4855,
"step": 215
},
{
"epoch": 0.270846394984326,
"grad_norm": 0.4077873420597743,
"learning_rate": 4.4791666666666673e-05,
"loss": 0.4629,
"step": 216
},
{
"epoch": 0.2721003134796238,
"grad_norm": 0.4692514537435917,
"learning_rate": 4.5e-05,
"loss": 0.5057,
"step": 217
},
{
"epoch": 0.27335423197492165,
"grad_norm": 0.4584119463386754,
"learning_rate": 4.520833333333334e-05,
"loss": 0.4393,
"step": 218
},
{
"epoch": 0.2746081504702194,
"grad_norm": 0.5616500648463426,
"learning_rate": 4.541666666666667e-05,
"loss": 0.4814,
"step": 219
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.5243656217945699,
"learning_rate": 4.5625e-05,
"loss": 0.4823,
"step": 220
},
{
"epoch": 0.27711598746081506,
"grad_norm": 0.5823329265200899,
"learning_rate": 4.5833333333333334e-05,
"loss": 0.4706,
"step": 221
},
{
"epoch": 0.27836990595611283,
"grad_norm": 0.5659926552895286,
"learning_rate": 4.604166666666666e-05,
"loss": 0.4927,
"step": 222
},
{
"epoch": 0.27962382445141065,
"grad_norm": 0.6807705151064477,
"learning_rate": 4.6250000000000006e-05,
"loss": 0.5038,
"step": 223
},
{
"epoch": 0.2808777429467085,
"grad_norm": 0.4138370821754332,
"learning_rate": 4.6458333333333335e-05,
"loss": 0.472,
"step": 224
},
{
"epoch": 0.28213166144200624,
"grad_norm": 0.6850631623254443,
"learning_rate": 4.666666666666667e-05,
"loss": 0.4941,
"step": 225
},
{
"epoch": 0.28338557993730407,
"grad_norm": 0.5210182816385369,
"learning_rate": 4.6875e-05,
"loss": 0.5169,
"step": 226
},
{
"epoch": 0.2846394984326019,
"grad_norm": 0.467078247567627,
"learning_rate": 4.708333333333334e-05,
"loss": 0.4815,
"step": 227
},
{
"epoch": 0.2858934169278997,
"grad_norm": 0.46595442143311094,
"learning_rate": 4.7291666666666666e-05,
"loss": 0.4839,
"step": 228
},
{
"epoch": 0.2871473354231975,
"grad_norm": 0.5645855074611064,
"learning_rate": 4.75e-05,
"loss": 0.4955,
"step": 229
},
{
"epoch": 0.2884012539184953,
"grad_norm": 0.42658264996587325,
"learning_rate": 4.770833333333334e-05,
"loss": 0.5177,
"step": 230
},
{
"epoch": 0.2896551724137931,
"grad_norm": 0.49224882792610747,
"learning_rate": 4.791666666666667e-05,
"loss": 0.4749,
"step": 231
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.5374704903674536,
"learning_rate": 4.8125000000000004e-05,
"loss": 0.4965,
"step": 232
},
{
"epoch": 0.2921630094043887,
"grad_norm": 0.4705574421240699,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.4688,
"step": 233
},
{
"epoch": 0.29341692789968654,
"grad_norm": 0.5606358019371714,
"learning_rate": 4.854166666666667e-05,
"loss": 0.4878,
"step": 234
},
{
"epoch": 0.2946708463949843,
"grad_norm": 0.6221337358761776,
"learning_rate": 4.875e-05,
"loss": 0.4866,
"step": 235
},
{
"epoch": 0.29592476489028213,
"grad_norm": 0.48574929189072236,
"learning_rate": 4.8958333333333335e-05,
"loss": 0.455,
"step": 236
},
{
"epoch": 0.29717868338557996,
"grad_norm": 0.6705266761693125,
"learning_rate": 4.9166666666666665e-05,
"loss": 0.4803,
"step": 237
},
{
"epoch": 0.2984326018808777,
"grad_norm": 0.6870254770180511,
"learning_rate": 4.937500000000001e-05,
"loss": 0.4976,
"step": 238
},
{
"epoch": 0.29968652037617555,
"grad_norm": 0.5083788129327743,
"learning_rate": 4.958333333333334e-05,
"loss": 0.5222,
"step": 239
},
{
"epoch": 0.30094043887147337,
"grad_norm": 0.6814631715062539,
"learning_rate": 4.979166666666667e-05,
"loss": 0.4645,
"step": 240
},
{
"epoch": 0.30219435736677114,
"grad_norm": 0.6603401739256765,
"learning_rate": 5e-05,
"loss": 0.4977,
"step": 241
},
{
"epoch": 0.30344827586206896,
"grad_norm": 0.6020651627761702,
"learning_rate": 4.99767549976755e-05,
"loss": 0.5059,
"step": 242
},
{
"epoch": 0.3047021943573668,
"grad_norm": 0.735582642694233,
"learning_rate": 4.9953509995351e-05,
"loss": 0.4611,
"step": 243
},
{
"epoch": 0.30595611285266455,
"grad_norm": 0.6032363670330652,
"learning_rate": 4.99302649930265e-05,
"loss": 0.4814,
"step": 244
},
{
"epoch": 0.3072100313479624,
"grad_norm": 0.6492020557756347,
"learning_rate": 4.9907019990702e-05,
"loss": 0.4785,
"step": 245
},
{
"epoch": 0.3084639498432602,
"grad_norm": 0.6168227024352553,
"learning_rate": 4.9883774988377505e-05,
"loss": 0.4662,
"step": 246
},
{
"epoch": 0.30971786833855797,
"grad_norm": 0.6560724880378253,
"learning_rate": 4.9860529986053e-05,
"loss": 0.4546,
"step": 247
},
{
"epoch": 0.3109717868338558,
"grad_norm": 0.528188670724647,
"learning_rate": 4.98372849837285e-05,
"loss": 0.4701,
"step": 248
},
{
"epoch": 0.3122257053291536,
"grad_norm": 0.7454020417813048,
"learning_rate": 4.9814039981404e-05,
"loss": 0.4969,
"step": 249
},
{
"epoch": 0.31347962382445144,
"grad_norm": 0.5630795919961847,
"learning_rate": 4.97907949790795e-05,
"loss": 0.491,
"step": 250
},
{
"epoch": 0.3147335423197492,
"grad_norm": 0.4693918541611371,
"learning_rate": 4.9767549976755e-05,
"loss": 0.4949,
"step": 251
},
{
"epoch": 0.315987460815047,
"grad_norm": 0.4897501759892825,
"learning_rate": 4.97443049744305e-05,
"loss": 0.4552,
"step": 252
},
{
"epoch": 0.31724137931034485,
"grad_norm": 0.37569551375112076,
"learning_rate": 4.9721059972106e-05,
"loss": 0.4514,
"step": 253
},
{
"epoch": 0.3184952978056426,
"grad_norm": 0.5258111553824133,
"learning_rate": 4.9697814969781495e-05,
"loss": 0.4823,
"step": 254
},
{
"epoch": 0.31974921630094044,
"grad_norm": 0.3686657615893019,
"learning_rate": 4.9674569967457e-05,
"loss": 0.4708,
"step": 255
},
{
"epoch": 0.32100313479623827,
"grad_norm": 0.47087906120750406,
"learning_rate": 4.9651324965132504e-05,
"loss": 0.475,
"step": 256
},
{
"epoch": 0.32225705329153603,
"grad_norm": 0.5524651678854905,
"learning_rate": 4.9628079962808e-05,
"loss": 0.4579,
"step": 257
},
{
"epoch": 0.32351097178683386,
"grad_norm": 0.423116411912461,
"learning_rate": 4.96048349604835e-05,
"loss": 0.4908,
"step": 258
},
{
"epoch": 0.3247648902821317,
"grad_norm": 0.6069807875747713,
"learning_rate": 4.9581589958159e-05,
"loss": 0.4808,
"step": 259
},
{
"epoch": 0.32601880877742945,
"grad_norm": 0.4760143132533877,
"learning_rate": 4.9558344955834495e-05,
"loss": 0.4876,
"step": 260
},
{
"epoch": 0.32727272727272727,
"grad_norm": 0.48588421662783765,
"learning_rate": 4.953509995351e-05,
"loss": 0.4718,
"step": 261
},
{
"epoch": 0.3285266457680251,
"grad_norm": 0.4032225621917967,
"learning_rate": 4.95118549511855e-05,
"loss": 0.5034,
"step": 262
},
{
"epoch": 0.32978056426332286,
"grad_norm": 0.5172385259408704,
"learning_rate": 4.9488609948860995e-05,
"loss": 0.5026,
"step": 263
},
{
"epoch": 0.3310344827586207,
"grad_norm": 0.43935155032241585,
"learning_rate": 4.946536494653649e-05,
"loss": 0.4849,
"step": 264
},
{
"epoch": 0.3322884012539185,
"grad_norm": 0.5210091010311008,
"learning_rate": 4.9442119944212e-05,
"loss": 0.4649,
"step": 265
},
{
"epoch": 0.3335423197492163,
"grad_norm": 0.5127006074114845,
"learning_rate": 4.9418874941887496e-05,
"loss": 0.4863,
"step": 266
},
{
"epoch": 0.3347962382445141,
"grad_norm": 0.5232623011492532,
"learning_rate": 4.9395629939563e-05,
"loss": 0.4928,
"step": 267
},
{
"epoch": 0.3360501567398119,
"grad_norm": 0.5210087141542927,
"learning_rate": 4.93723849372385e-05,
"loss": 0.4754,
"step": 268
},
{
"epoch": 0.3373040752351097,
"grad_norm": 0.5507622983510155,
"learning_rate": 4.9349139934913996e-05,
"loss": 0.4789,
"step": 269
},
{
"epoch": 0.3385579937304075,
"grad_norm": 0.46087422461302735,
"learning_rate": 4.9325894932589494e-05,
"loss": 0.4786,
"step": 270
},
{
"epoch": 0.33981191222570534,
"grad_norm": 0.5289218324689127,
"learning_rate": 4.930264993026499e-05,
"loss": 0.4771,
"step": 271
},
{
"epoch": 0.34106583072100316,
"grad_norm": 0.44893174768129207,
"learning_rate": 4.9279404927940496e-05,
"loss": 0.5034,
"step": 272
},
{
"epoch": 0.34231974921630093,
"grad_norm": 0.5437415215596336,
"learning_rate": 4.9256159925615994e-05,
"loss": 0.4896,
"step": 273
},
{
"epoch": 0.34357366771159875,
"grad_norm": 0.4969623206045752,
"learning_rate": 4.92329149232915e-05,
"loss": 0.4769,
"step": 274
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.5531636133397371,
"learning_rate": 4.9209669920967e-05,
"loss": 0.4632,
"step": 275
},
{
"epoch": 0.34608150470219434,
"grad_norm": 0.6314502391048888,
"learning_rate": 4.9186424918642495e-05,
"loss": 0.467,
"step": 276
},
{
"epoch": 0.34733542319749217,
"grad_norm": 0.53728076650209,
"learning_rate": 4.916317991631799e-05,
"loss": 0.4885,
"step": 277
},
{
"epoch": 0.34858934169279,
"grad_norm": 0.530870867023985,
"learning_rate": 4.91399349139935e-05,
"loss": 0.4868,
"step": 278
},
{
"epoch": 0.34984326018808776,
"grad_norm": 0.44170515709646435,
"learning_rate": 4.9116689911668995e-05,
"loss": 0.4289,
"step": 279
},
{
"epoch": 0.3510971786833856,
"grad_norm": 0.6091468207281537,
"learning_rate": 4.909344490934449e-05,
"loss": 0.4892,
"step": 280
},
{
"epoch": 0.3523510971786834,
"grad_norm": 0.5104743246529412,
"learning_rate": 4.907019990701999e-05,
"loss": 0.4508,
"step": 281
},
{
"epoch": 0.35360501567398117,
"grad_norm": 0.5398316208625654,
"learning_rate": 4.904695490469549e-05,
"loss": 0.492,
"step": 282
},
{
"epoch": 0.354858934169279,
"grad_norm": 0.42722427991400835,
"learning_rate": 4.902370990237099e-05,
"loss": 0.4928,
"step": 283
},
{
"epoch": 0.3561128526645768,
"grad_norm": 0.5847725962858734,
"learning_rate": 4.90004649000465e-05,
"loss": 0.4937,
"step": 284
},
{
"epoch": 0.3573667711598746,
"grad_norm": 0.510193455151422,
"learning_rate": 4.8977219897721995e-05,
"loss": 0.4643,
"step": 285
},
{
"epoch": 0.3586206896551724,
"grad_norm": 0.4859715471427984,
"learning_rate": 4.895397489539749e-05,
"loss": 0.4937,
"step": 286
},
{
"epoch": 0.35987460815047023,
"grad_norm": 0.5387982730096461,
"learning_rate": 4.893072989307299e-05,
"loss": 0.4829,
"step": 287
},
{
"epoch": 0.361128526645768,
"grad_norm": 0.4507527622134857,
"learning_rate": 4.890748489074849e-05,
"loss": 0.4873,
"step": 288
},
{
"epoch": 0.3623824451410658,
"grad_norm": 0.5491397804770694,
"learning_rate": 4.8884239888423994e-05,
"loss": 0.4741,
"step": 289
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.5211779214939708,
"learning_rate": 4.886099488609949e-05,
"loss": 0.4671,
"step": 290
},
{
"epoch": 0.36489028213166147,
"grad_norm": 0.44333701204831655,
"learning_rate": 4.883774988377499e-05,
"loss": 0.4636,
"step": 291
},
{
"epoch": 0.36614420062695924,
"grad_norm": 0.45876488694139744,
"learning_rate": 4.881450488145049e-05,
"loss": 0.4579,
"step": 292
},
{
"epoch": 0.36739811912225706,
"grad_norm": 0.5521421971063752,
"learning_rate": 4.879125987912599e-05,
"loss": 0.5002,
"step": 293
},
{
"epoch": 0.3686520376175549,
"grad_norm": 0.3653430107371124,
"learning_rate": 4.876801487680149e-05,
"loss": 0.4746,
"step": 294
},
{
"epoch": 0.36990595611285265,
"grad_norm": 0.6776708812084911,
"learning_rate": 4.8744769874476994e-05,
"loss": 0.5,
"step": 295
},
{
"epoch": 0.3711598746081505,
"grad_norm": 0.3702289267169548,
"learning_rate": 4.872152487215249e-05,
"loss": 0.486,
"step": 296
},
{
"epoch": 0.3724137931034483,
"grad_norm": 0.5879621150747804,
"learning_rate": 4.869827986982799e-05,
"loss": 0.4624,
"step": 297
},
{
"epoch": 0.37366771159874607,
"grad_norm": 0.41287899552387985,
"learning_rate": 4.867503486750349e-05,
"loss": 0.4865,
"step": 298
},
{
"epoch": 0.3749216300940439,
"grad_norm": 0.4817917535098043,
"learning_rate": 4.8651789865178985e-05,
"loss": 0.4608,
"step": 299
},
{
"epoch": 0.3761755485893417,
"grad_norm": 0.4368217387259026,
"learning_rate": 4.862854486285448e-05,
"loss": 0.4921,
"step": 300
},
{
"epoch": 0.3774294670846395,
"grad_norm": 0.5372763333680445,
"learning_rate": 4.860529986052999e-05,
"loss": 0.4863,
"step": 301
},
{
"epoch": 0.3786833855799373,
"grad_norm": 0.45703472969190484,
"learning_rate": 4.8582054858205486e-05,
"loss": 0.4823,
"step": 302
},
{
"epoch": 0.3799373040752351,
"grad_norm": 0.5370022756456122,
"learning_rate": 4.855880985588099e-05,
"loss": 0.4777,
"step": 303
},
{
"epoch": 0.3811912225705329,
"grad_norm": 0.4138387904772327,
"learning_rate": 4.853556485355649e-05,
"loss": 0.4573,
"step": 304
},
{
"epoch": 0.3824451410658307,
"grad_norm": 0.620840174109759,
"learning_rate": 4.8512319851231986e-05,
"loss": 0.4649,
"step": 305
},
{
"epoch": 0.38369905956112854,
"grad_norm": 0.4369848501494815,
"learning_rate": 4.848907484890749e-05,
"loss": 0.4613,
"step": 306
},
{
"epoch": 0.3849529780564263,
"grad_norm": 0.5684832269997437,
"learning_rate": 4.846582984658299e-05,
"loss": 0.4381,
"step": 307
},
{
"epoch": 0.38620689655172413,
"grad_norm": 0.5507617345625007,
"learning_rate": 4.8442584844258486e-05,
"loss": 0.4747,
"step": 308
},
{
"epoch": 0.38746081504702196,
"grad_norm": 0.5463104310328326,
"learning_rate": 4.8419339841933984e-05,
"loss": 0.4698,
"step": 309
},
{
"epoch": 0.3887147335423197,
"grad_norm": 0.6196984750343147,
"learning_rate": 4.839609483960948e-05,
"loss": 0.4606,
"step": 310
},
{
"epoch": 0.38996865203761755,
"grad_norm": 0.489286676284584,
"learning_rate": 4.837284983728499e-05,
"loss": 0.49,
"step": 311
},
{
"epoch": 0.39122257053291537,
"grad_norm": 0.5734801518212188,
"learning_rate": 4.834960483496049e-05,
"loss": 0.4798,
"step": 312
},
{
"epoch": 0.3924764890282132,
"grad_norm": 0.48262484716747817,
"learning_rate": 4.832635983263599e-05,
"loss": 0.4822,
"step": 313
},
{
"epoch": 0.39373040752351096,
"grad_norm": 0.5606294536970398,
"learning_rate": 4.830311483031149e-05,
"loss": 0.479,
"step": 314
},
{
"epoch": 0.3949843260188088,
"grad_norm": 0.46658060823593994,
"learning_rate": 4.8279869827986985e-05,
"loss": 0.4655,
"step": 315
},
{
"epoch": 0.3962382445141066,
"grad_norm": 0.5523042690934277,
"learning_rate": 4.825662482566248e-05,
"loss": 0.4796,
"step": 316
},
{
"epoch": 0.3974921630094044,
"grad_norm": 0.3859032495694748,
"learning_rate": 4.823337982333798e-05,
"loss": 0.4683,
"step": 317
},
{
"epoch": 0.3987460815047022,
"grad_norm": 0.5604441029249502,
"learning_rate": 4.8210134821013485e-05,
"loss": 0.4603,
"step": 318
},
{
"epoch": 0.4,
"grad_norm": 0.44954023705862173,
"learning_rate": 4.818688981868898e-05,
"loss": 0.493,
"step": 319
},
{
"epoch": 0.4012539184952978,
"grad_norm": 0.45921180178001103,
"learning_rate": 4.816364481636448e-05,
"loss": 0.4531,
"step": 320
},
{
"epoch": 0.4025078369905956,
"grad_norm": 0.41076422108004323,
"learning_rate": 4.8140399814039985e-05,
"loss": 0.4594,
"step": 321
},
{
"epoch": 0.40376175548589344,
"grad_norm": 0.4139935158373309,
"learning_rate": 4.811715481171548e-05,
"loss": 0.46,
"step": 322
},
{
"epoch": 0.4050156739811912,
"grad_norm": 0.401425056550118,
"learning_rate": 4.809390980939099e-05,
"loss": 0.4475,
"step": 323
},
{
"epoch": 0.406269592476489,
"grad_norm": 0.4332562786755319,
"learning_rate": 4.8070664807066486e-05,
"loss": 0.4996,
"step": 324
},
{
"epoch": 0.40752351097178685,
"grad_norm": 0.4267790196330538,
"learning_rate": 4.8047419804741983e-05,
"loss": 0.5026,
"step": 325
},
{
"epoch": 0.4087774294670846,
"grad_norm": 0.4084461798934156,
"learning_rate": 4.802417480241748e-05,
"loss": 0.4403,
"step": 326
},
{
"epoch": 0.41003134796238244,
"grad_norm": 0.4554500338944917,
"learning_rate": 4.800092980009298e-05,
"loss": 0.4986,
"step": 327
},
{
"epoch": 0.41128526645768027,
"grad_norm": 0.42892875040816536,
"learning_rate": 4.797768479776848e-05,
"loss": 0.48,
"step": 328
},
{
"epoch": 0.41253918495297803,
"grad_norm": 0.5003412140354901,
"learning_rate": 4.795443979544398e-05,
"loss": 0.5036,
"step": 329
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.524930881452445,
"learning_rate": 4.793119479311948e-05,
"loss": 0.4859,
"step": 330
},
{
"epoch": 0.4150470219435737,
"grad_norm": 0.4949223106178305,
"learning_rate": 4.7907949790794984e-05,
"loss": 0.452,
"step": 331
},
{
"epoch": 0.41630094043887145,
"grad_norm": 0.4704944882323933,
"learning_rate": 4.788470478847048e-05,
"loss": 0.4733,
"step": 332
},
{
"epoch": 0.41755485893416927,
"grad_norm": 0.48760149439196204,
"learning_rate": 4.786145978614598e-05,
"loss": 0.4769,
"step": 333
},
{
"epoch": 0.4188087774294671,
"grad_norm": 0.4696455372734968,
"learning_rate": 4.7838214783821484e-05,
"loss": 0.5046,
"step": 334
},
{
"epoch": 0.4200626959247649,
"grad_norm": 0.44385399818670784,
"learning_rate": 4.781496978149698e-05,
"loss": 0.4735,
"step": 335
},
{
"epoch": 0.4213166144200627,
"grad_norm": 0.45733772771784004,
"learning_rate": 4.779172477917248e-05,
"loss": 0.4738,
"step": 336
},
{
"epoch": 0.4225705329153605,
"grad_norm": 0.49753069847399267,
"learning_rate": 4.776847977684798e-05,
"loss": 0.4742,
"step": 337
},
{
"epoch": 0.42382445141065833,
"grad_norm": 0.45364657385254264,
"learning_rate": 4.7745234774523476e-05,
"loss": 0.4341,
"step": 338
},
{
"epoch": 0.4250783699059561,
"grad_norm": 0.5297970207043281,
"learning_rate": 4.772198977219898e-05,
"loss": 0.4942,
"step": 339
},
{
"epoch": 0.4263322884012539,
"grad_norm": 0.39788626445249636,
"learning_rate": 4.7698744769874485e-05,
"loss": 0.4799,
"step": 340
},
{
"epoch": 0.42758620689655175,
"grad_norm": 0.4831887454380128,
"learning_rate": 4.767549976754998e-05,
"loss": 0.4719,
"step": 341
},
{
"epoch": 0.4288401253918495,
"grad_norm": 0.4277270711057265,
"learning_rate": 4.765225476522548e-05,
"loss": 0.4533,
"step": 342
},
{
"epoch": 0.43009404388714734,
"grad_norm": 0.5304736508255485,
"learning_rate": 4.762900976290098e-05,
"loss": 0.5031,
"step": 343
},
{
"epoch": 0.43134796238244516,
"grad_norm": 0.4275118367284353,
"learning_rate": 4.7605764760576476e-05,
"loss": 0.4883,
"step": 344
},
{
"epoch": 0.43260188087774293,
"grad_norm": 0.4794661540430207,
"learning_rate": 4.7582519758251974e-05,
"loss": 0.4756,
"step": 345
},
{
"epoch": 0.43385579937304075,
"grad_norm": 0.5666286593259459,
"learning_rate": 4.755927475592748e-05,
"loss": 0.4792,
"step": 346
},
{
"epoch": 0.4351097178683386,
"grad_norm": 0.614794043971327,
"learning_rate": 4.7536029753602977e-05,
"loss": 0.4741,
"step": 347
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.44599102698260285,
"learning_rate": 4.7512784751278474e-05,
"loss": 0.448,
"step": 348
},
{
"epoch": 0.43761755485893417,
"grad_norm": 0.5643176425888605,
"learning_rate": 4.748953974895398e-05,
"loss": 0.4668,
"step": 349
},
{
"epoch": 0.438871473354232,
"grad_norm": 0.5242952958843355,
"learning_rate": 4.746629474662948e-05,
"loss": 0.4804,
"step": 350
},
{
"epoch": 0.44012539184952976,
"grad_norm": 0.42362017189763784,
"learning_rate": 4.744304974430498e-05,
"loss": 0.4439,
"step": 351
},
{
"epoch": 0.4413793103448276,
"grad_norm": 0.48760710845678346,
"learning_rate": 4.741980474198048e-05,
"loss": 0.4603,
"step": 352
},
{
"epoch": 0.4426332288401254,
"grad_norm": 0.5388416968067885,
"learning_rate": 4.739655973965598e-05,
"loss": 0.4747,
"step": 353
},
{
"epoch": 0.44388714733542317,
"grad_norm": 0.4069720312748789,
"learning_rate": 4.7373314737331475e-05,
"loss": 0.4634,
"step": 354
},
{
"epoch": 0.445141065830721,
"grad_norm": 0.4861430060861511,
"learning_rate": 4.735006973500697e-05,
"loss": 0.4507,
"step": 355
},
{
"epoch": 0.4463949843260188,
"grad_norm": 0.4347976238611198,
"learning_rate": 4.732682473268247e-05,
"loss": 0.4585,
"step": 356
},
{
"epoch": 0.44764890282131664,
"grad_norm": 0.45613763311120487,
"learning_rate": 4.7303579730357975e-05,
"loss": 0.4549,
"step": 357
},
{
"epoch": 0.4489028213166144,
"grad_norm": 0.47715978309733,
"learning_rate": 4.728033472803347e-05,
"loss": 0.4418,
"step": 358
},
{
"epoch": 0.45015673981191223,
"grad_norm": 0.40539677796073686,
"learning_rate": 4.725708972570898e-05,
"loss": 0.4602,
"step": 359
},
{
"epoch": 0.45141065830721006,
"grad_norm": 0.4253949963890617,
"learning_rate": 4.7233844723384476e-05,
"loss": 0.4442,
"step": 360
},
{
"epoch": 0.4526645768025078,
"grad_norm": 0.40223292930240656,
"learning_rate": 4.7210599721059973e-05,
"loss": 0.4577,
"step": 361
},
{
"epoch": 0.45391849529780565,
"grad_norm": 0.4402489576396967,
"learning_rate": 4.718735471873547e-05,
"loss": 0.4379,
"step": 362
},
{
"epoch": 0.45517241379310347,
"grad_norm": 0.3737967115397889,
"learning_rate": 4.7164109716410976e-05,
"loss": 0.4673,
"step": 363
},
{
"epoch": 0.45642633228840124,
"grad_norm": 0.4732865595068881,
"learning_rate": 4.7140864714086474e-05,
"loss": 0.4653,
"step": 364
},
{
"epoch": 0.45768025078369906,
"grad_norm": 0.3603644709400332,
"learning_rate": 4.711761971176197e-05,
"loss": 0.4607,
"step": 365
},
{
"epoch": 0.4589341692789969,
"grad_norm": 0.4790366167429156,
"learning_rate": 4.709437470943747e-05,
"loss": 0.4941,
"step": 366
},
{
"epoch": 0.46018808777429465,
"grad_norm": 0.3844034274037309,
"learning_rate": 4.7071129707112974e-05,
"loss": 0.4607,
"step": 367
},
{
"epoch": 0.4614420062695925,
"grad_norm": 0.3866441600775973,
"learning_rate": 4.704788470478847e-05,
"loss": 0.4704,
"step": 368
},
{
"epoch": 0.4626959247648903,
"grad_norm": 0.4021991947907984,
"learning_rate": 4.7024639702463976e-05,
"loss": 0.4687,
"step": 369
},
{
"epoch": 0.46394984326018807,
"grad_norm": 0.40911956138034244,
"learning_rate": 4.7001394700139474e-05,
"loss": 0.4629,
"step": 370
},
{
"epoch": 0.4652037617554859,
"grad_norm": 0.5333968012917225,
"learning_rate": 4.697814969781497e-05,
"loss": 0.4699,
"step": 371
},
{
"epoch": 0.4664576802507837,
"grad_norm": 0.40544260540785443,
"learning_rate": 4.695490469549047e-05,
"loss": 0.4635,
"step": 372
},
{
"epoch": 0.4677115987460815,
"grad_norm": 0.4560656731289972,
"learning_rate": 4.693165969316597e-05,
"loss": 0.4646,
"step": 373
},
{
"epoch": 0.4689655172413793,
"grad_norm": 0.46024196925339683,
"learning_rate": 4.690841469084147e-05,
"loss": 0.4849,
"step": 374
},
{
"epoch": 0.4702194357366771,
"grad_norm": 0.37700282266875795,
"learning_rate": 4.688516968851697e-05,
"loss": 0.4431,
"step": 375
},
{
"epoch": 0.4714733542319749,
"grad_norm": 0.5176514827905447,
"learning_rate": 4.686192468619247e-05,
"loss": 0.4708,
"step": 376
},
{
"epoch": 0.4727272727272727,
"grad_norm": 0.4509673653001928,
"learning_rate": 4.683867968386797e-05,
"loss": 0.4542,
"step": 377
},
{
"epoch": 0.47398119122257054,
"grad_norm": 0.43731438773603404,
"learning_rate": 4.681543468154347e-05,
"loss": 0.4571,
"step": 378
},
{
"epoch": 0.47523510971786836,
"grad_norm": 0.4037517071668997,
"learning_rate": 4.6792189679218975e-05,
"loss": 0.4885,
"step": 379
},
{
"epoch": 0.47648902821316613,
"grad_norm": 0.44514960645515494,
"learning_rate": 4.676894467689447e-05,
"loss": 0.4659,
"step": 380
},
{
"epoch": 0.47774294670846396,
"grad_norm": 0.5139387355750772,
"learning_rate": 4.674569967456997e-05,
"loss": 0.4537,
"step": 381
},
{
"epoch": 0.4789968652037618,
"grad_norm": 0.3774914322482863,
"learning_rate": 4.672245467224547e-05,
"loss": 0.4463,
"step": 382
},
{
"epoch": 0.48025078369905955,
"grad_norm": 0.6127115023728015,
"learning_rate": 4.6699209669920967e-05,
"loss": 0.4689,
"step": 383
},
{
"epoch": 0.48150470219435737,
"grad_norm": 0.4233004414986294,
"learning_rate": 4.6675964667596464e-05,
"loss": 0.5031,
"step": 384
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.49756576438965816,
"learning_rate": 4.665271966527197e-05,
"loss": 0.5005,
"step": 385
},
{
"epoch": 0.48401253918495296,
"grad_norm": 0.39079434423094395,
"learning_rate": 4.662947466294747e-05,
"loss": 0.4661,
"step": 386
},
{
"epoch": 0.4852664576802508,
"grad_norm": 0.3795492480110091,
"learning_rate": 4.660622966062297e-05,
"loss": 0.4461,
"step": 387
},
{
"epoch": 0.4865203761755486,
"grad_norm": 0.40000993693946724,
"learning_rate": 4.658298465829847e-05,
"loss": 0.4751,
"step": 388
},
{
"epoch": 0.4877742946708464,
"grad_norm": 0.4383004879348865,
"learning_rate": 4.655973965597397e-05,
"loss": 0.4479,
"step": 389
},
{
"epoch": 0.4890282131661442,
"grad_norm": 0.4582085218001207,
"learning_rate": 4.6536494653649465e-05,
"loss": 0.5261,
"step": 390
},
{
"epoch": 0.490282131661442,
"grad_norm": 0.44401655955505615,
"learning_rate": 4.651324965132497e-05,
"loss": 0.4644,
"step": 391
},
{
"epoch": 0.4915360501567398,
"grad_norm": 0.4054245193919015,
"learning_rate": 4.649000464900047e-05,
"loss": 0.4739,
"step": 392
},
{
"epoch": 0.4927899686520376,
"grad_norm": 0.4604111394207155,
"learning_rate": 4.6466759646675965e-05,
"loss": 0.4608,
"step": 393
},
{
"epoch": 0.49404388714733544,
"grad_norm": 0.4223875772606713,
"learning_rate": 4.644351464435146e-05,
"loss": 0.4598,
"step": 394
},
{
"epoch": 0.4952978056426332,
"grad_norm": 0.4482962880146626,
"learning_rate": 4.642026964202697e-05,
"loss": 0.4499,
"step": 395
},
{
"epoch": 0.496551724137931,
"grad_norm": 0.4353709395597604,
"learning_rate": 4.6397024639702466e-05,
"loss": 0.4746,
"step": 396
},
{
"epoch": 0.49780564263322885,
"grad_norm": 0.3944037148752727,
"learning_rate": 4.637377963737797e-05,
"loss": 0.4517,
"step": 397
},
{
"epoch": 0.4990595611285266,
"grad_norm": 0.5074016464155598,
"learning_rate": 4.635053463505347e-05,
"loss": 0.4368,
"step": 398
},
{
"epoch": 0.5003134796238244,
"grad_norm": 0.41485057650720636,
"learning_rate": 4.6327289632728966e-05,
"loss": 0.4713,
"step": 399
},
{
"epoch": 0.5015673981191222,
"grad_norm": 0.5749200566932813,
"learning_rate": 4.6304044630404464e-05,
"loss": 0.4648,
"step": 400
},
{
"epoch": 0.5028213166144201,
"grad_norm": 0.41087466774729886,
"learning_rate": 4.628079962807996e-05,
"loss": 0.4549,
"step": 401
},
{
"epoch": 0.5040752351097179,
"grad_norm": 0.5349643556838662,
"learning_rate": 4.6257554625755466e-05,
"loss": 0.4571,
"step": 402
},
{
"epoch": 0.5053291536050156,
"grad_norm": 0.4095477700673694,
"learning_rate": 4.6234309623430964e-05,
"loss": 0.4505,
"step": 403
},
{
"epoch": 0.5065830721003135,
"grad_norm": 0.43425134078417815,
"learning_rate": 4.621106462110646e-05,
"loss": 0.4644,
"step": 404
},
{
"epoch": 0.5078369905956113,
"grad_norm": 0.40866776962736456,
"learning_rate": 4.6187819618781966e-05,
"loss": 0.4863,
"step": 405
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.4584287273904148,
"learning_rate": 4.6164574616457464e-05,
"loss": 0.4425,
"step": 406
},
{
"epoch": 0.5103448275862069,
"grad_norm": 0.41999708527857005,
"learning_rate": 4.614132961413296e-05,
"loss": 0.4498,
"step": 407
},
{
"epoch": 0.5115987460815047,
"grad_norm": 0.3357483896495483,
"learning_rate": 4.611808461180847e-05,
"loss": 0.4268,
"step": 408
},
{
"epoch": 0.5128526645768025,
"grad_norm": 0.49565288790236917,
"learning_rate": 4.6094839609483965e-05,
"loss": 0.4785,
"step": 409
},
{
"epoch": 0.5141065830721003,
"grad_norm": 0.4149164241874618,
"learning_rate": 4.607159460715946e-05,
"loss": 0.4809,
"step": 410
},
{
"epoch": 0.5153605015673981,
"grad_norm": 0.4338885663365593,
"learning_rate": 4.604834960483496e-05,
"loss": 0.435,
"step": 411
},
{
"epoch": 0.516614420062696,
"grad_norm": 0.410123241789722,
"learning_rate": 4.602510460251046e-05,
"loss": 0.4867,
"step": 412
},
{
"epoch": 0.5178683385579937,
"grad_norm": 0.4243070381918125,
"learning_rate": 4.600185960018596e-05,
"loss": 0.4527,
"step": 413
},
{
"epoch": 0.5191222570532915,
"grad_norm": 0.40643473099211014,
"learning_rate": 4.597861459786146e-05,
"loss": 0.4457,
"step": 414
},
{
"epoch": 0.5203761755485894,
"grad_norm": 0.44769405072594354,
"learning_rate": 4.5955369595536965e-05,
"loss": 0.4716,
"step": 415
},
{
"epoch": 0.5216300940438872,
"grad_norm": 0.4246666651538923,
"learning_rate": 4.593212459321246e-05,
"loss": 0.4614,
"step": 416
},
{
"epoch": 0.5228840125391849,
"grad_norm": 0.5380067296828854,
"learning_rate": 4.590887959088796e-05,
"loss": 0.4519,
"step": 417
},
{
"epoch": 0.5241379310344828,
"grad_norm": 0.4738547844853042,
"learning_rate": 4.588563458856346e-05,
"loss": 0.421,
"step": 418
},
{
"epoch": 0.5253918495297806,
"grad_norm": 0.45344802895857406,
"learning_rate": 4.586238958623896e-05,
"loss": 0.4433,
"step": 419
},
{
"epoch": 0.5266457680250783,
"grad_norm": 0.45517902643421526,
"learning_rate": 4.583914458391446e-05,
"loss": 0.4641,
"step": 420
},
{
"epoch": 0.5278996865203762,
"grad_norm": 0.41994002120153556,
"learning_rate": 4.581589958158996e-05,
"loss": 0.4419,
"step": 421
},
{
"epoch": 0.529153605015674,
"grad_norm": 0.4290576107267562,
"learning_rate": 4.579265457926546e-05,
"loss": 0.4509,
"step": 422
},
{
"epoch": 0.5304075235109718,
"grad_norm": 0.4055408855270868,
"learning_rate": 4.576940957694096e-05,
"loss": 0.4586,
"step": 423
},
{
"epoch": 0.5316614420062696,
"grad_norm": 0.4060880162875057,
"learning_rate": 4.574616457461646e-05,
"loss": 0.4692,
"step": 424
},
{
"epoch": 0.5329153605015674,
"grad_norm": 0.39691975932141443,
"learning_rate": 4.5722919572291964e-05,
"loss": 0.4199,
"step": 425
},
{
"epoch": 0.5341692789968652,
"grad_norm": 0.4288367102434096,
"learning_rate": 4.569967456996746e-05,
"loss": 0.4478,
"step": 426
},
{
"epoch": 0.535423197492163,
"grad_norm": 0.4442918186547453,
"learning_rate": 4.567642956764296e-05,
"loss": 0.4725,
"step": 427
},
{
"epoch": 0.5366771159874608,
"grad_norm": 0.4605003172530893,
"learning_rate": 4.565318456531846e-05,
"loss": 0.498,
"step": 428
},
{
"epoch": 0.5379310344827586,
"grad_norm": 0.35082140331504397,
"learning_rate": 4.5629939562993955e-05,
"loss": 0.4369,
"step": 429
},
{
"epoch": 0.5391849529780565,
"grad_norm": 0.44252317622451215,
"learning_rate": 4.560669456066946e-05,
"loss": 0.4705,
"step": 430
},
{
"epoch": 0.5404388714733542,
"grad_norm": 0.41077287654486566,
"learning_rate": 4.558344955834496e-05,
"loss": 0.4574,
"step": 431
},
{
"epoch": 0.541692789968652,
"grad_norm": 0.3504025567861513,
"learning_rate": 4.5560204556020455e-05,
"loss": 0.4648,
"step": 432
},
{
"epoch": 0.5429467084639499,
"grad_norm": 0.3843803030390814,
"learning_rate": 4.553695955369596e-05,
"loss": 0.438,
"step": 433
},
{
"epoch": 0.5442006269592476,
"grad_norm": 0.3620843675593709,
"learning_rate": 4.551371455137146e-05,
"loss": 0.4638,
"step": 434
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.4431391268135086,
"learning_rate": 4.5490469549046956e-05,
"loss": 0.4468,
"step": 435
},
{
"epoch": 0.5467084639498433,
"grad_norm": 0.5411507117324692,
"learning_rate": 4.546722454672246e-05,
"loss": 0.4919,
"step": 436
},
{
"epoch": 0.5479623824451411,
"grad_norm": 0.40028621656097885,
"learning_rate": 4.544397954439796e-05,
"loss": 0.4406,
"step": 437
},
{
"epoch": 0.5492163009404388,
"grad_norm": 0.5365076618271549,
"learning_rate": 4.5420734542073456e-05,
"loss": 0.4769,
"step": 438
},
{
"epoch": 0.5504702194357367,
"grad_norm": 0.5157194904195961,
"learning_rate": 4.5397489539748954e-05,
"loss": 0.4707,
"step": 439
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.4898686931521917,
"learning_rate": 4.537424453742445e-05,
"loss": 0.4534,
"step": 440
},
{
"epoch": 0.5529780564263322,
"grad_norm": 0.37061388345242957,
"learning_rate": 4.5350999535099956e-05,
"loss": 0.4581,
"step": 441
},
{
"epoch": 0.5542319749216301,
"grad_norm": 0.47603317457886424,
"learning_rate": 4.5327754532775454e-05,
"loss": 0.478,
"step": 442
},
{
"epoch": 0.5554858934169279,
"grad_norm": 0.31858706585325125,
"learning_rate": 4.530450953045096e-05,
"loss": 0.4505,
"step": 443
},
{
"epoch": 0.5567398119122257,
"grad_norm": 0.418611226624613,
"learning_rate": 4.528126452812646e-05,
"loss": 0.4339,
"step": 444
},
{
"epoch": 0.5579937304075235,
"grad_norm": 0.4100650595722824,
"learning_rate": 4.5258019525801954e-05,
"loss": 0.478,
"step": 445
},
{
"epoch": 0.5592476489028213,
"grad_norm": 0.4312879419672854,
"learning_rate": 4.523477452347745e-05,
"loss": 0.4502,
"step": 446
},
{
"epoch": 0.5605015673981191,
"grad_norm": 0.3919936378559303,
"learning_rate": 4.521152952115296e-05,
"loss": 0.4472,
"step": 447
},
{
"epoch": 0.561755485893417,
"grad_norm": 0.5293030962546723,
"learning_rate": 4.5188284518828455e-05,
"loss": 0.4585,
"step": 448
},
{
"epoch": 0.5630094043887147,
"grad_norm": 0.44132321648941525,
"learning_rate": 4.516503951650395e-05,
"loss": 0.4386,
"step": 449
},
{
"epoch": 0.5642633228840125,
"grad_norm": 0.44084357221612386,
"learning_rate": 4.514179451417945e-05,
"loss": 0.4682,
"step": 450
},
{
"epoch": 0.5655172413793104,
"grad_norm": 0.46705086654333383,
"learning_rate": 4.5118549511854955e-05,
"loss": 0.4517,
"step": 451
},
{
"epoch": 0.5667711598746081,
"grad_norm": 0.4357084314597459,
"learning_rate": 4.509530450953045e-05,
"loss": 0.4473,
"step": 452
},
{
"epoch": 0.568025078369906,
"grad_norm": 0.45195957429219125,
"learning_rate": 4.507205950720596e-05,
"loss": 0.4369,
"step": 453
},
{
"epoch": 0.5692789968652038,
"grad_norm": 0.37125644056125484,
"learning_rate": 4.5048814504881455e-05,
"loss": 0.45,
"step": 454
},
{
"epoch": 0.5705329153605015,
"grad_norm": 0.4819496385771428,
"learning_rate": 4.502556950255695e-05,
"loss": 0.4307,
"step": 455
},
{
"epoch": 0.5717868338557994,
"grad_norm": 0.4374914864886535,
"learning_rate": 4.500232450023245e-05,
"loss": 0.4488,
"step": 456
},
{
"epoch": 0.5730407523510972,
"grad_norm": 0.49960328160641915,
"learning_rate": 4.497907949790795e-05,
"loss": 0.4894,
"step": 457
},
{
"epoch": 0.574294670846395,
"grad_norm": 0.49564150104825117,
"learning_rate": 4.4955834495583453e-05,
"loss": 0.4494,
"step": 458
},
{
"epoch": 0.5755485893416928,
"grad_norm": 0.4111199739333019,
"learning_rate": 4.493258949325895e-05,
"loss": 0.4695,
"step": 459
},
{
"epoch": 0.5768025078369906,
"grad_norm": 0.5775648425183937,
"learning_rate": 4.490934449093445e-05,
"loss": 0.4261,
"step": 460
},
{
"epoch": 0.5780564263322884,
"grad_norm": 0.4352889251035282,
"learning_rate": 4.4886099488609954e-05,
"loss": 0.4585,
"step": 461
},
{
"epoch": 0.5793103448275863,
"grad_norm": 0.5799013676447736,
"learning_rate": 4.486285448628545e-05,
"loss": 0.4438,
"step": 462
},
{
"epoch": 0.580564263322884,
"grad_norm": 0.4206819109265725,
"learning_rate": 4.483960948396095e-05,
"loss": 0.459,
"step": 463
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.5604967509509768,
"learning_rate": 4.4816364481636454e-05,
"loss": 0.4694,
"step": 464
},
{
"epoch": 0.5830721003134797,
"grad_norm": 0.47115505321929096,
"learning_rate": 4.479311947931195e-05,
"loss": 0.4558,
"step": 465
},
{
"epoch": 0.5843260188087774,
"grad_norm": 0.48907729218125795,
"learning_rate": 4.476987447698745e-05,
"loss": 0.4613,
"step": 466
},
{
"epoch": 0.5855799373040752,
"grad_norm": 0.5262700030595355,
"learning_rate": 4.474662947466295e-05,
"loss": 0.448,
"step": 467
},
{
"epoch": 0.5868338557993731,
"grad_norm": 0.42596159133819944,
"learning_rate": 4.4723384472338445e-05,
"loss": 0.4469,
"step": 468
},
{
"epoch": 0.5880877742946709,
"grad_norm": 0.54186736592769,
"learning_rate": 4.470013947001394e-05,
"loss": 0.4839,
"step": 469
},
{
"epoch": 0.5893416927899686,
"grad_norm": 0.4492487738616722,
"learning_rate": 4.467689446768945e-05,
"loss": 0.4728,
"step": 470
},
{
"epoch": 0.5905956112852665,
"grad_norm": 0.46806868743862295,
"learning_rate": 4.465364946536495e-05,
"loss": 0.4966,
"step": 471
},
{
"epoch": 0.5918495297805643,
"grad_norm": 0.46664790452160076,
"learning_rate": 4.463040446304045e-05,
"loss": 0.4328,
"step": 472
},
{
"epoch": 0.593103448275862,
"grad_norm": 0.4395526252816186,
"learning_rate": 4.460715946071595e-05,
"loss": 0.4444,
"step": 473
},
{
"epoch": 0.5943573667711599,
"grad_norm": 0.43623340397967686,
"learning_rate": 4.4583914458391446e-05,
"loss": 0.4782,
"step": 474
},
{
"epoch": 0.5956112852664577,
"grad_norm": 0.49154033792721324,
"learning_rate": 4.456066945606695e-05,
"loss": 0.4387,
"step": 475
},
{
"epoch": 0.5968652037617554,
"grad_norm": 0.4526010194115278,
"learning_rate": 4.453742445374245e-05,
"loss": 0.4535,
"step": 476
},
{
"epoch": 0.5981191222570533,
"grad_norm": 0.4762879053539736,
"learning_rate": 4.4514179451417946e-05,
"loss": 0.4525,
"step": 477
},
{
"epoch": 0.5993730407523511,
"grad_norm": 0.47683411609169724,
"learning_rate": 4.4490934449093444e-05,
"loss": 0.4928,
"step": 478
},
{
"epoch": 0.6006269592476489,
"grad_norm": 0.49596051363912796,
"learning_rate": 4.446768944676894e-05,
"loss": 0.4854,
"step": 479
},
{
"epoch": 0.6018808777429467,
"grad_norm": 0.42808115164066035,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.4556,
"step": 480
},
{
"epoch": 0.6031347962382445,
"grad_norm": 0.36721454411799004,
"learning_rate": 4.442119944211995e-05,
"loss": 0.4553,
"step": 481
},
{
"epoch": 0.6043887147335423,
"grad_norm": 0.5127311515936736,
"learning_rate": 4.439795443979545e-05,
"loss": 0.4504,
"step": 482
},
{
"epoch": 0.6056426332288402,
"grad_norm": 0.4136469123029096,
"learning_rate": 4.437470943747095e-05,
"loss": 0.4555,
"step": 483
},
{
"epoch": 0.6068965517241379,
"grad_norm": 0.42459446278515117,
"learning_rate": 4.4351464435146445e-05,
"loss": 0.4564,
"step": 484
},
{
"epoch": 0.6081504702194357,
"grad_norm": 0.4015416958194172,
"learning_rate": 4.432821943282194e-05,
"loss": 0.4575,
"step": 485
},
{
"epoch": 0.6094043887147336,
"grad_norm": 0.45615159647275383,
"learning_rate": 4.430497443049745e-05,
"loss": 0.4816,
"step": 486
},
{
"epoch": 0.6106583072100313,
"grad_norm": 0.40297734848636646,
"learning_rate": 4.4281729428172945e-05,
"loss": 0.457,
"step": 487
},
{
"epoch": 0.6119122257053291,
"grad_norm": 0.43631622743375215,
"learning_rate": 4.425848442584844e-05,
"loss": 0.4613,
"step": 488
},
{
"epoch": 0.613166144200627,
"grad_norm": 0.4374338407494167,
"learning_rate": 4.423523942352395e-05,
"loss": 0.4628,
"step": 489
},
{
"epoch": 0.6144200626959248,
"grad_norm": 0.4556869658651037,
"learning_rate": 4.4211994421199445e-05,
"loss": 0.4713,
"step": 490
},
{
"epoch": 0.6156739811912225,
"grad_norm": 0.5057700208806412,
"learning_rate": 4.418874941887494e-05,
"loss": 0.447,
"step": 491
},
{
"epoch": 0.6169278996865204,
"grad_norm": 0.4140654371734097,
"learning_rate": 4.416550441655045e-05,
"loss": 0.4594,
"step": 492
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.49421178710838287,
"learning_rate": 4.4142259414225946e-05,
"loss": 0.4796,
"step": 493
},
{
"epoch": 0.6194357366771159,
"grad_norm": 0.46960671019515576,
"learning_rate": 4.4119014411901443e-05,
"loss": 0.4418,
"step": 494
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.44413745159472173,
"learning_rate": 4.409576940957694e-05,
"loss": 0.4437,
"step": 495
},
{
"epoch": 0.6219435736677116,
"grad_norm": 0.5160913146910245,
"learning_rate": 4.407252440725244e-05,
"loss": 0.4323,
"step": 496
},
{
"epoch": 0.6231974921630095,
"grad_norm": 0.47959238620080746,
"learning_rate": 4.404927940492794e-05,
"loss": 0.4574,
"step": 497
},
{
"epoch": 0.6244514106583072,
"grad_norm": 0.5131218347134716,
"learning_rate": 4.402603440260344e-05,
"loss": 0.4724,
"step": 498
},
{
"epoch": 0.625705329153605,
"grad_norm": 0.4819713541190505,
"learning_rate": 4.4002789400278946e-05,
"loss": 0.4164,
"step": 499
},
{
"epoch": 0.6269592476489029,
"grad_norm": 0.522376823665628,
"learning_rate": 4.3979544397954444e-05,
"loss": 0.4681,
"step": 500
},
{
"epoch": 0.6282131661442006,
"grad_norm": 0.5186270472334603,
"learning_rate": 4.395629939562994e-05,
"loss": 0.4162,
"step": 501
},
{
"epoch": 0.6294670846394984,
"grad_norm": 0.4212002305095572,
"learning_rate": 4.393305439330544e-05,
"loss": 0.4546,
"step": 502
},
{
"epoch": 0.6307210031347963,
"grad_norm": 0.6120858685869168,
"learning_rate": 4.3909809390980944e-05,
"loss": 0.4452,
"step": 503
},
{
"epoch": 0.631974921630094,
"grad_norm": 0.3647094463520387,
"learning_rate": 4.388656438865644e-05,
"loss": 0.4359,
"step": 504
},
{
"epoch": 0.6332288401253918,
"grad_norm": 0.5274555431394217,
"learning_rate": 4.386331938633194e-05,
"loss": 0.4312,
"step": 505
},
{
"epoch": 0.6344827586206897,
"grad_norm": 0.5060280656581896,
"learning_rate": 4.384007438400744e-05,
"loss": 0.4524,
"step": 506
},
{
"epoch": 0.6357366771159875,
"grad_norm": 0.5949640852466459,
"learning_rate": 4.3816829381682936e-05,
"loss": 0.4654,
"step": 507
},
{
"epoch": 0.6369905956112852,
"grad_norm": 0.6805468622322287,
"learning_rate": 4.379358437935844e-05,
"loss": 0.4613,
"step": 508
},
{
"epoch": 0.6382445141065831,
"grad_norm": 0.4527120367977408,
"learning_rate": 4.3770339377033945e-05,
"loss": 0.4847,
"step": 509
},
{
"epoch": 0.6394984326018809,
"grad_norm": 0.6392217446398496,
"learning_rate": 4.374709437470944e-05,
"loss": 0.4358,
"step": 510
},
{
"epoch": 0.6407523510971787,
"grad_norm": 0.502059732040333,
"learning_rate": 4.372384937238494e-05,
"loss": 0.4799,
"step": 511
},
{
"epoch": 0.6420062695924765,
"grad_norm": 0.45897823279348016,
"learning_rate": 4.370060437006044e-05,
"loss": 0.4342,
"step": 512
},
{
"epoch": 0.6432601880877743,
"grad_norm": 0.4924185079090007,
"learning_rate": 4.3677359367735936e-05,
"loss": 0.4209,
"step": 513
},
{
"epoch": 0.6445141065830721,
"grad_norm": 0.45012021164458105,
"learning_rate": 4.3654114365411434e-05,
"loss": 0.4657,
"step": 514
},
{
"epoch": 0.64576802507837,
"grad_norm": 0.477789609875453,
"learning_rate": 4.363086936308694e-05,
"loss": 0.4735,
"step": 515
},
{
"epoch": 0.6470219435736677,
"grad_norm": 0.40170067440542634,
"learning_rate": 4.3607624360762437e-05,
"loss": 0.4237,
"step": 516
},
{
"epoch": 0.6482758620689655,
"grad_norm": 0.5235780556652827,
"learning_rate": 4.358437935843794e-05,
"loss": 0.4839,
"step": 517
},
{
"epoch": 0.6495297805642634,
"grad_norm": 0.39649167876744906,
"learning_rate": 4.356113435611344e-05,
"loss": 0.4476,
"step": 518
},
{
"epoch": 0.6507836990595611,
"grad_norm": 0.43016091808187806,
"learning_rate": 4.353788935378894e-05,
"loss": 0.4494,
"step": 519
},
{
"epoch": 0.6520376175548589,
"grad_norm": 0.4253107171743552,
"learning_rate": 4.351464435146444e-05,
"loss": 0.4555,
"step": 520
},
{
"epoch": 0.6532915360501568,
"grad_norm": 0.4715609755020508,
"learning_rate": 4.349139934913994e-05,
"loss": 0.4539,
"step": 521
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.4161895825880825,
"learning_rate": 4.346815434681544e-05,
"loss": 0.4334,
"step": 522
},
{
"epoch": 0.6557993730407523,
"grad_norm": 0.40569675155834556,
"learning_rate": 4.3444909344490935e-05,
"loss": 0.4462,
"step": 523
},
{
"epoch": 0.6570532915360502,
"grad_norm": 0.49878486878255424,
"learning_rate": 4.342166434216643e-05,
"loss": 0.4412,
"step": 524
},
{
"epoch": 0.658307210031348,
"grad_norm": 0.4466517892077318,
"learning_rate": 4.339841933984193e-05,
"loss": 0.4654,
"step": 525
},
{
"epoch": 0.6595611285266457,
"grad_norm": 0.524754699462782,
"learning_rate": 4.3375174337517435e-05,
"loss": 0.4448,
"step": 526
},
{
"epoch": 0.6608150470219436,
"grad_norm": 0.44079340833455766,
"learning_rate": 4.335192933519294e-05,
"loss": 0.4329,
"step": 527
},
{
"epoch": 0.6620689655172414,
"grad_norm": 0.4136609544376585,
"learning_rate": 4.332868433286844e-05,
"loss": 0.4373,
"step": 528
},
{
"epoch": 0.6633228840125391,
"grad_norm": 0.4313970987350847,
"learning_rate": 4.3305439330543936e-05,
"loss": 0.432,
"step": 529
},
{
"epoch": 0.664576802507837,
"grad_norm": 0.4587923334003133,
"learning_rate": 4.328219432821943e-05,
"loss": 0.4406,
"step": 530
},
{
"epoch": 0.6658307210031348,
"grad_norm": 0.43153221314103307,
"learning_rate": 4.325894932589494e-05,
"loss": 0.4446,
"step": 531
},
{
"epoch": 0.6670846394984326,
"grad_norm": 0.44165077475744957,
"learning_rate": 4.3235704323570436e-05,
"loss": 0.4428,
"step": 532
},
{
"epoch": 0.6683385579937304,
"grad_norm": 0.4034830539175131,
"learning_rate": 4.3212459321245934e-05,
"loss": 0.4527,
"step": 533
},
{
"epoch": 0.6695924764890282,
"grad_norm": 0.46798616495567885,
"learning_rate": 4.318921431892143e-05,
"loss": 0.4449,
"step": 534
},
{
"epoch": 0.670846394984326,
"grad_norm": 0.4126249669255497,
"learning_rate": 4.316596931659693e-05,
"loss": 0.4693,
"step": 535
},
{
"epoch": 0.6721003134796238,
"grad_norm": 0.4831127565083721,
"learning_rate": 4.3142724314272434e-05,
"loss": 0.4428,
"step": 536
},
{
"epoch": 0.6733542319749216,
"grad_norm": 0.48894878685681126,
"learning_rate": 4.311947931194794e-05,
"loss": 0.424,
"step": 537
},
{
"epoch": 0.6746081504702194,
"grad_norm": 0.4623086456966088,
"learning_rate": 4.3096234309623436e-05,
"loss": 0.4674,
"step": 538
},
{
"epoch": 0.6758620689655173,
"grad_norm": 0.4924048623169994,
"learning_rate": 4.3072989307298934e-05,
"loss": 0.4308,
"step": 539
},
{
"epoch": 0.677115987460815,
"grad_norm": 0.4275894401626009,
"learning_rate": 4.304974430497443e-05,
"loss": 0.442,
"step": 540
},
{
"epoch": 0.6783699059561129,
"grad_norm": 0.4908592728035608,
"learning_rate": 4.302649930264993e-05,
"loss": 0.4419,
"step": 541
},
{
"epoch": 0.6796238244514107,
"grad_norm": 0.3680854873982936,
"learning_rate": 4.300325430032543e-05,
"loss": 0.468,
"step": 542
},
{
"epoch": 0.6808777429467084,
"grad_norm": 0.45237860638037075,
"learning_rate": 4.298000929800093e-05,
"loss": 0.429,
"step": 543
},
{
"epoch": 0.6821316614420063,
"grad_norm": 0.4750717557928149,
"learning_rate": 4.295676429567643e-05,
"loss": 0.475,
"step": 544
},
{
"epoch": 0.6833855799373041,
"grad_norm": 0.4792002757754514,
"learning_rate": 4.2933519293351935e-05,
"loss": 0.4172,
"step": 545
},
{
"epoch": 0.6846394984326019,
"grad_norm": 0.36346060017724313,
"learning_rate": 4.291027429102743e-05,
"loss": 0.4329,
"step": 546
},
{
"epoch": 0.6858934169278997,
"grad_norm": 0.40621610179365425,
"learning_rate": 4.288702928870293e-05,
"loss": 0.4137,
"step": 547
},
{
"epoch": 0.6871473354231975,
"grad_norm": 0.41939227651449457,
"learning_rate": 4.2863784286378435e-05,
"loss": 0.4425,
"step": 548
},
{
"epoch": 0.6884012539184953,
"grad_norm": 0.4628567152711822,
"learning_rate": 4.284053928405393e-05,
"loss": 0.4694,
"step": 549
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.3834763148443634,
"learning_rate": 4.281729428172943e-05,
"loss": 0.4529,
"step": 550
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.3459860394273285,
"learning_rate": 4.279404927940493e-05,
"loss": 0.4096,
"step": 551
},
{
"epoch": 0.6921630094043887,
"grad_norm": 0.409636854397069,
"learning_rate": 4.2770804277080426e-05,
"loss": 0.4613,
"step": 552
},
{
"epoch": 0.6934169278996866,
"grad_norm": 0.377955124605852,
"learning_rate": 4.2747559274755924e-05,
"loss": 0.45,
"step": 553
},
{
"epoch": 0.6946708463949843,
"grad_norm": 0.41894278253414846,
"learning_rate": 4.272431427243143e-05,
"loss": 0.468,
"step": 554
},
{
"epoch": 0.6959247648902821,
"grad_norm": 0.4254994116044244,
"learning_rate": 4.2701069270106934e-05,
"loss": 0.4899,
"step": 555
},
{
"epoch": 0.69717868338558,
"grad_norm": 0.38739660796354775,
"learning_rate": 4.267782426778243e-05,
"loss": 0.4589,
"step": 556
},
{
"epoch": 0.6984326018808777,
"grad_norm": 0.4744843487085416,
"learning_rate": 4.265457926545793e-05,
"loss": 0.46,
"step": 557
},
{
"epoch": 0.6996865203761755,
"grad_norm": 0.41696412327381654,
"learning_rate": 4.263133426313343e-05,
"loss": 0.4585,
"step": 558
},
{
"epoch": 0.7009404388714734,
"grad_norm": 0.5407149752385829,
"learning_rate": 4.2608089260808925e-05,
"loss": 0.4616,
"step": 559
},
{
"epoch": 0.7021943573667712,
"grad_norm": 0.3400903407973187,
"learning_rate": 4.258484425848443e-05,
"loss": 0.465,
"step": 560
},
{
"epoch": 0.7034482758620689,
"grad_norm": 0.470250176673341,
"learning_rate": 4.256159925615993e-05,
"loss": 0.4389,
"step": 561
},
{
"epoch": 0.7047021943573668,
"grad_norm": 0.4396219153457148,
"learning_rate": 4.2538354253835425e-05,
"loss": 0.4543,
"step": 562
},
{
"epoch": 0.7059561128526646,
"grad_norm": 0.4460348567827846,
"learning_rate": 4.251510925151092e-05,
"loss": 0.458,
"step": 563
},
{
"epoch": 0.7072100313479623,
"grad_norm": 0.4236771049362164,
"learning_rate": 4.249186424918643e-05,
"loss": 0.4669,
"step": 564
},
{
"epoch": 0.7084639498432602,
"grad_norm": 0.5972075650162256,
"learning_rate": 4.246861924686193e-05,
"loss": 0.4832,
"step": 565
},
{
"epoch": 0.709717868338558,
"grad_norm": 0.40025195898832533,
"learning_rate": 4.244537424453743e-05,
"loss": 0.4392,
"step": 566
},
{
"epoch": 0.7109717868338558,
"grad_norm": 0.5386699452026774,
"learning_rate": 4.242212924221293e-05,
"loss": 0.4452,
"step": 567
},
{
"epoch": 0.7122257053291536,
"grad_norm": 0.4709898606482836,
"learning_rate": 4.2398884239888426e-05,
"loss": 0.457,
"step": 568
},
{
"epoch": 0.7134796238244514,
"grad_norm": 0.36772096899562573,
"learning_rate": 4.2375639237563924e-05,
"loss": 0.422,
"step": 569
},
{
"epoch": 0.7147335423197492,
"grad_norm": 0.44682734833923365,
"learning_rate": 4.235239423523942e-05,
"loss": 0.4564,
"step": 570
},
{
"epoch": 0.715987460815047,
"grad_norm": 0.42058325573939465,
"learning_rate": 4.2329149232914926e-05,
"loss": 0.4405,
"step": 571
},
{
"epoch": 0.7172413793103448,
"grad_norm": 0.43804950426711875,
"learning_rate": 4.2305904230590424e-05,
"loss": 0.4369,
"step": 572
},
{
"epoch": 0.7184952978056426,
"grad_norm": 0.4690517635456882,
"learning_rate": 4.228265922826592e-05,
"loss": 0.441,
"step": 573
},
{
"epoch": 0.7197492163009405,
"grad_norm": 0.49756254831204344,
"learning_rate": 4.2259414225941426e-05,
"loss": 0.4349,
"step": 574
},
{
"epoch": 0.7210031347962382,
"grad_norm": 0.4402418523600355,
"learning_rate": 4.2236169223616924e-05,
"loss": 0.4609,
"step": 575
},
{
"epoch": 0.722257053291536,
"grad_norm": 0.4850109984052213,
"learning_rate": 4.221292422129242e-05,
"loss": 0.4497,
"step": 576
},
{
"epoch": 0.7235109717868339,
"grad_norm": 0.37872644094272734,
"learning_rate": 4.218967921896793e-05,
"loss": 0.4203,
"step": 577
},
{
"epoch": 0.7247648902821316,
"grad_norm": 0.45788648564212436,
"learning_rate": 4.2166434216643424e-05,
"loss": 0.445,
"step": 578
},
{
"epoch": 0.7260188087774294,
"grad_norm": 0.3834717079171693,
"learning_rate": 4.214318921431892e-05,
"loss": 0.4539,
"step": 579
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.44998645522375025,
"learning_rate": 4.211994421199442e-05,
"loss": 0.4696,
"step": 580
},
{
"epoch": 0.7285266457680251,
"grad_norm": 0.439969956431325,
"learning_rate": 4.209669920966992e-05,
"loss": 0.4524,
"step": 581
},
{
"epoch": 0.7297805642633229,
"grad_norm": 0.3551255855112613,
"learning_rate": 4.207345420734542e-05,
"loss": 0.4398,
"step": 582
},
{
"epoch": 0.7310344827586207,
"grad_norm": 0.32843829725910395,
"learning_rate": 4.205020920502093e-05,
"loss": 0.4767,
"step": 583
},
{
"epoch": 0.7322884012539185,
"grad_norm": 0.35011648681185925,
"learning_rate": 4.2026964202696425e-05,
"loss": 0.4345,
"step": 584
},
{
"epoch": 0.7335423197492164,
"grad_norm": 0.39356813990759876,
"learning_rate": 4.200371920037192e-05,
"loss": 0.4221,
"step": 585
},
{
"epoch": 0.7347962382445141,
"grad_norm": 0.3801587036629978,
"learning_rate": 4.198047419804742e-05,
"loss": 0.4444,
"step": 586
},
{
"epoch": 0.7360501567398119,
"grad_norm": 0.4517419201746339,
"learning_rate": 4.195722919572292e-05,
"loss": 0.4255,
"step": 587
},
{
"epoch": 0.7373040752351098,
"grad_norm": 0.4731000100112271,
"learning_rate": 4.193398419339842e-05,
"loss": 0.4556,
"step": 588
},
{
"epoch": 0.7385579937304075,
"grad_norm": 0.376372272988303,
"learning_rate": 4.191073919107392e-05,
"loss": 0.4324,
"step": 589
},
{
"epoch": 0.7398119122257053,
"grad_norm": 0.35485452705302434,
"learning_rate": 4.188749418874942e-05,
"loss": 0.4411,
"step": 590
},
{
"epoch": 0.7410658307210032,
"grad_norm": 0.41733033938766756,
"learning_rate": 4.186424918642492e-05,
"loss": 0.4483,
"step": 591
},
{
"epoch": 0.742319749216301,
"grad_norm": 0.3932372408901751,
"learning_rate": 4.184100418410042e-05,
"loss": 0.4628,
"step": 592
},
{
"epoch": 0.7435736677115987,
"grad_norm": 0.5054952554852729,
"learning_rate": 4.1817759181775926e-05,
"loss": 0.4373,
"step": 593
},
{
"epoch": 0.7448275862068966,
"grad_norm": 0.4244530772275813,
"learning_rate": 4.1794514179451424e-05,
"loss": 0.4584,
"step": 594
},
{
"epoch": 0.7460815047021944,
"grad_norm": 0.4288574860573689,
"learning_rate": 4.177126917712692e-05,
"loss": 0.4624,
"step": 595
},
{
"epoch": 0.7473354231974921,
"grad_norm": 0.5008326798135792,
"learning_rate": 4.174802417480242e-05,
"loss": 0.4309,
"step": 596
},
{
"epoch": 0.74858934169279,
"grad_norm": 0.3348693650103407,
"learning_rate": 4.172477917247792e-05,
"loss": 0.4525,
"step": 597
},
{
"epoch": 0.7498432601880878,
"grad_norm": 0.5404638952450502,
"learning_rate": 4.1701534170153415e-05,
"loss": 0.4411,
"step": 598
},
{
"epoch": 0.7510971786833855,
"grad_norm": 0.3735376668928673,
"learning_rate": 4.167828916782892e-05,
"loss": 0.4317,
"step": 599
},
{
"epoch": 0.7523510971786834,
"grad_norm": 0.5192913433527879,
"learning_rate": 4.165504416550442e-05,
"loss": 0.4615,
"step": 600
},
{
"epoch": 0.7536050156739812,
"grad_norm": 0.433684996995179,
"learning_rate": 4.1631799163179915e-05,
"loss": 0.4247,
"step": 601
},
{
"epoch": 0.754858934169279,
"grad_norm": 0.492518559279571,
"learning_rate": 4.160855416085542e-05,
"loss": 0.4257,
"step": 602
},
{
"epoch": 0.7561128526645768,
"grad_norm": 0.4633010870223474,
"learning_rate": 4.158530915853092e-05,
"loss": 0.4493,
"step": 603
},
{
"epoch": 0.7573667711598746,
"grad_norm": 0.43861146117170763,
"learning_rate": 4.1562064156206416e-05,
"loss": 0.4474,
"step": 604
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.42044465360598166,
"learning_rate": 4.153881915388192e-05,
"loss": 0.4406,
"step": 605
},
{
"epoch": 0.7598746081504703,
"grad_norm": 0.34434574835317766,
"learning_rate": 4.151557415155742e-05,
"loss": 0.4237,
"step": 606
},
{
"epoch": 0.761128526645768,
"grad_norm": 0.34709218412953896,
"learning_rate": 4.1492329149232916e-05,
"loss": 0.4452,
"step": 607
},
{
"epoch": 0.7623824451410658,
"grad_norm": 0.40017123170948476,
"learning_rate": 4.1469084146908414e-05,
"loss": 0.4523,
"step": 608
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.3657386600793048,
"learning_rate": 4.144583914458391e-05,
"loss": 0.4738,
"step": 609
},
{
"epoch": 0.7648902821316614,
"grad_norm": 0.34832457505198877,
"learning_rate": 4.1422594142259416e-05,
"loss": 0.4303,
"step": 610
},
{
"epoch": 0.7661442006269592,
"grad_norm": 0.3358501055695873,
"learning_rate": 4.139934913993492e-05,
"loss": 0.4513,
"step": 611
},
{
"epoch": 0.7673981191222571,
"grad_norm": 0.3449210514199624,
"learning_rate": 4.137610413761042e-05,
"loss": 0.4386,
"step": 612
},
{
"epoch": 0.7686520376175549,
"grad_norm": 0.3257413827982445,
"learning_rate": 4.1352859135285917e-05,
"loss": 0.453,
"step": 613
},
{
"epoch": 0.7699059561128526,
"grad_norm": 0.4280638984829963,
"learning_rate": 4.1329614132961414e-05,
"loss": 0.4617,
"step": 614
},
{
"epoch": 0.7711598746081505,
"grad_norm": 0.3542490528925714,
"learning_rate": 4.130636913063691e-05,
"loss": 0.4402,
"step": 615
},
{
"epoch": 0.7724137931034483,
"grad_norm": 0.38414899726870655,
"learning_rate": 4.128312412831242e-05,
"loss": 0.4265,
"step": 616
},
{
"epoch": 0.773667711598746,
"grad_norm": 0.40154859236592044,
"learning_rate": 4.1259879125987915e-05,
"loss": 0.4626,
"step": 617
},
{
"epoch": 0.7749216300940439,
"grad_norm": 0.32727493978003236,
"learning_rate": 4.123663412366341e-05,
"loss": 0.4502,
"step": 618
},
{
"epoch": 0.7761755485893417,
"grad_norm": 0.46809920387039416,
"learning_rate": 4.121338912133891e-05,
"loss": 0.4811,
"step": 619
},
{
"epoch": 0.7774294670846394,
"grad_norm": 0.43044464466955124,
"learning_rate": 4.1190144119014415e-05,
"loss": 0.4718,
"step": 620
},
{
"epoch": 0.7786833855799373,
"grad_norm": 0.3509149224133549,
"learning_rate": 4.116689911668991e-05,
"loss": 0.4076,
"step": 621
},
{
"epoch": 0.7799373040752351,
"grad_norm": 0.47344713586756076,
"learning_rate": 4.114365411436542e-05,
"loss": 0.4373,
"step": 622
},
{
"epoch": 0.7811912225705329,
"grad_norm": 0.3679345061423892,
"learning_rate": 4.1120409112040915e-05,
"loss": 0.4568,
"step": 623
},
{
"epoch": 0.7824451410658307,
"grad_norm": 0.4680763305077707,
"learning_rate": 4.109716410971641e-05,
"loss": 0.4342,
"step": 624
},
{
"epoch": 0.7836990595611285,
"grad_norm": 0.35228091169658793,
"learning_rate": 4.107391910739191e-05,
"loss": 0.4107,
"step": 625
},
{
"epoch": 0.7849529780564264,
"grad_norm": 0.4260486105635574,
"learning_rate": 4.105067410506741e-05,
"loss": 0.4354,
"step": 626
},
{
"epoch": 0.7862068965517242,
"grad_norm": 0.4242559250603418,
"learning_rate": 4.1027429102742913e-05,
"loss": 0.4206,
"step": 627
},
{
"epoch": 0.7874608150470219,
"grad_norm": 0.3984246735081666,
"learning_rate": 4.100418410041841e-05,
"loss": 0.4571,
"step": 628
},
{
"epoch": 0.7887147335423198,
"grad_norm": 0.41686733551548905,
"learning_rate": 4.098093909809391e-05,
"loss": 0.4349,
"step": 629
},
{
"epoch": 0.7899686520376176,
"grad_norm": 0.4562513048974343,
"learning_rate": 4.0957694095769414e-05,
"loss": 0.4623,
"step": 630
},
{
"epoch": 0.7912225705329153,
"grad_norm": 0.3918475369982096,
"learning_rate": 4.093444909344491e-05,
"loss": 0.4702,
"step": 631
},
{
"epoch": 0.7924764890282132,
"grad_norm": 0.45201687189489226,
"learning_rate": 4.091120409112041e-05,
"loss": 0.4701,
"step": 632
},
{
"epoch": 0.793730407523511,
"grad_norm": 0.401905147543322,
"learning_rate": 4.0887959088795914e-05,
"loss": 0.4427,
"step": 633
},
{
"epoch": 0.7949843260188088,
"grad_norm": 0.37403961895421406,
"learning_rate": 4.086471408647141e-05,
"loss": 0.4239,
"step": 634
},
{
"epoch": 0.7962382445141066,
"grad_norm": 0.5160057762494031,
"learning_rate": 4.084146908414691e-05,
"loss": 0.4678,
"step": 635
},
{
"epoch": 0.7974921630094044,
"grad_norm": 0.3600666934862265,
"learning_rate": 4.081822408182241e-05,
"loss": 0.4299,
"step": 636
},
{
"epoch": 0.7987460815047022,
"grad_norm": 0.47031370346919515,
"learning_rate": 4.0794979079497905e-05,
"loss": 0.4475,
"step": 637
},
{
"epoch": 0.8,
"grad_norm": 0.4274730318832894,
"learning_rate": 4.077173407717341e-05,
"loss": 0.474,
"step": 638
},
{
"epoch": 0.8012539184952978,
"grad_norm": 0.43988837868112113,
"learning_rate": 4.074848907484891e-05,
"loss": 0.4395,
"step": 639
},
{
"epoch": 0.8025078369905956,
"grad_norm": 0.4296420456638087,
"learning_rate": 4.072524407252441e-05,
"loss": 0.4414,
"step": 640
},
{
"epoch": 0.8037617554858935,
"grad_norm": 0.4165024780807054,
"learning_rate": 4.070199907019991e-05,
"loss": 0.4616,
"step": 641
},
{
"epoch": 0.8050156739811912,
"grad_norm": 0.45326227140295505,
"learning_rate": 4.067875406787541e-05,
"loss": 0.4281,
"step": 642
},
{
"epoch": 0.806269592476489,
"grad_norm": 0.34282163946528493,
"learning_rate": 4.0655509065550906e-05,
"loss": 0.445,
"step": 643
},
{
"epoch": 0.8075235109717869,
"grad_norm": 0.5318675123136467,
"learning_rate": 4.063226406322641e-05,
"loss": 0.4459,
"step": 644
},
{
"epoch": 0.8087774294670846,
"grad_norm": 0.3613681407566905,
"learning_rate": 4.060901906090191e-05,
"loss": 0.4516,
"step": 645
},
{
"epoch": 0.8100313479623824,
"grad_norm": 0.4256761740418918,
"learning_rate": 4.0585774058577406e-05,
"loss": 0.4442,
"step": 646
},
{
"epoch": 0.8112852664576803,
"grad_norm": 0.44490082318018265,
"learning_rate": 4.0562529056252904e-05,
"loss": 0.4482,
"step": 647
},
{
"epoch": 0.812539184952978,
"grad_norm": 0.37621044972900686,
"learning_rate": 4.053928405392841e-05,
"loss": 0.4491,
"step": 648
},
{
"epoch": 0.8137931034482758,
"grad_norm": 0.532445760596184,
"learning_rate": 4.0516039051603907e-05,
"loss": 0.4564,
"step": 649
},
{
"epoch": 0.8150470219435737,
"grad_norm": 0.36401298940190185,
"learning_rate": 4.049279404927941e-05,
"loss": 0.4418,
"step": 650
},
{
"epoch": 0.8163009404388715,
"grad_norm": 0.3954914213293213,
"learning_rate": 4.046954904695491e-05,
"loss": 0.4303,
"step": 651
},
{
"epoch": 0.8175548589341692,
"grad_norm": 0.41504912290851065,
"learning_rate": 4.044630404463041e-05,
"loss": 0.4472,
"step": 652
},
{
"epoch": 0.8188087774294671,
"grad_norm": 0.4362312488557668,
"learning_rate": 4.0423059042305905e-05,
"loss": 0.4463,
"step": 653
},
{
"epoch": 0.8200626959247649,
"grad_norm": 0.363133079675419,
"learning_rate": 4.03998140399814e-05,
"loss": 0.4589,
"step": 654
},
{
"epoch": 0.8213166144200627,
"grad_norm": 0.3575720551118513,
"learning_rate": 4.037656903765691e-05,
"loss": 0.4652,
"step": 655
},
{
"epoch": 0.8225705329153605,
"grad_norm": 0.41133781145325254,
"learning_rate": 4.0353324035332405e-05,
"loss": 0.4315,
"step": 656
},
{
"epoch": 0.8238244514106583,
"grad_norm": 0.35933201397974796,
"learning_rate": 4.03300790330079e-05,
"loss": 0.4612,
"step": 657
},
{
"epoch": 0.8250783699059561,
"grad_norm": 0.3748434098154192,
"learning_rate": 4.030683403068341e-05,
"loss": 0.4264,
"step": 658
},
{
"epoch": 0.826332288401254,
"grad_norm": 0.38850604823504337,
"learning_rate": 4.0283589028358905e-05,
"loss": 0.4735,
"step": 659
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.33065641567657567,
"learning_rate": 4.02603440260344e-05,
"loss": 0.4461,
"step": 660
},
{
"epoch": 0.8288401253918495,
"grad_norm": 0.43085807279603316,
"learning_rate": 4.023709902370991e-05,
"loss": 0.4251,
"step": 661
},
{
"epoch": 0.8300940438871474,
"grad_norm": 0.3595094430705895,
"learning_rate": 4.0213854021385406e-05,
"loss": 0.4312,
"step": 662
},
{
"epoch": 0.8313479623824451,
"grad_norm": 0.355828984446344,
"learning_rate": 4.01906090190609e-05,
"loss": 0.4578,
"step": 663
},
{
"epoch": 0.8326018808777429,
"grad_norm": 0.42076853255285496,
"learning_rate": 4.01673640167364e-05,
"loss": 0.434,
"step": 664
},
{
"epoch": 0.8338557993730408,
"grad_norm": 0.40723695207312305,
"learning_rate": 4.01441190144119e-05,
"loss": 0.454,
"step": 665
},
{
"epoch": 0.8351097178683385,
"grad_norm": 0.41762296739356664,
"learning_rate": 4.0120874012087404e-05,
"loss": 0.4512,
"step": 666
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.3819699457963685,
"learning_rate": 4.00976290097629e-05,
"loss": 0.4362,
"step": 667
},
{
"epoch": 0.8376175548589342,
"grad_norm": 0.35068266268176346,
"learning_rate": 4.0074384007438406e-05,
"loss": 0.4461,
"step": 668
},
{
"epoch": 0.838871473354232,
"grad_norm": 0.39649081608198083,
"learning_rate": 4.0051139005113904e-05,
"loss": 0.4454,
"step": 669
},
{
"epoch": 0.8401253918495298,
"grad_norm": 0.39734611029312455,
"learning_rate": 4.00278940027894e-05,
"loss": 0.4351,
"step": 670
},
{
"epoch": 0.8413793103448276,
"grad_norm": 0.38682440100730897,
"learning_rate": 4.00046490004649e-05,
"loss": 0.4602,
"step": 671
},
{
"epoch": 0.8426332288401254,
"grad_norm": 0.4795449884401517,
"learning_rate": 3.9981403998140404e-05,
"loss": 0.4297,
"step": 672
},
{
"epoch": 0.8438871473354232,
"grad_norm": 0.4183237637019898,
"learning_rate": 3.99581589958159e-05,
"loss": 0.4643,
"step": 673
},
{
"epoch": 0.845141065830721,
"grad_norm": 0.4816675099146434,
"learning_rate": 3.99349139934914e-05,
"loss": 0.4369,
"step": 674
},
{
"epoch": 0.8463949843260188,
"grad_norm": 0.43678836647530106,
"learning_rate": 3.99116689911669e-05,
"loss": 0.4576,
"step": 675
},
{
"epoch": 0.8476489028213167,
"grad_norm": 0.4014082893270169,
"learning_rate": 3.98884239888424e-05,
"loss": 0.4468,
"step": 676
},
{
"epoch": 0.8489028213166144,
"grad_norm": 0.4899131199626283,
"learning_rate": 3.98651789865179e-05,
"loss": 0.4374,
"step": 677
},
{
"epoch": 0.8501567398119122,
"grad_norm": 0.39332886476211765,
"learning_rate": 3.9841933984193405e-05,
"loss": 0.4361,
"step": 678
},
{
"epoch": 0.8514106583072101,
"grad_norm": 0.4927351490596207,
"learning_rate": 3.98186889818689e-05,
"loss": 0.4525,
"step": 679
},
{
"epoch": 0.8526645768025078,
"grad_norm": 0.42172755391576666,
"learning_rate": 3.97954439795444e-05,
"loss": 0.4746,
"step": 680
},
{
"epoch": 0.8539184952978056,
"grad_norm": 0.4087673953808888,
"learning_rate": 3.97721989772199e-05,
"loss": 0.4673,
"step": 681
},
{
"epoch": 0.8551724137931035,
"grad_norm": 0.5375183511161877,
"learning_rate": 3.9748953974895396e-05,
"loss": 0.4464,
"step": 682
},
{
"epoch": 0.8564263322884013,
"grad_norm": 0.39062065710900234,
"learning_rate": 3.9725708972570894e-05,
"loss": 0.4352,
"step": 683
},
{
"epoch": 0.857680250783699,
"grad_norm": 0.5953572833773415,
"learning_rate": 3.97024639702464e-05,
"loss": 0.4284,
"step": 684
},
{
"epoch": 0.8589341692789969,
"grad_norm": 0.33514013958393235,
"learning_rate": 3.9679218967921896e-05,
"loss": 0.4217,
"step": 685
},
{
"epoch": 0.8601880877742947,
"grad_norm": 0.5237764392389324,
"learning_rate": 3.96559739655974e-05,
"loss": 0.4304,
"step": 686
},
{
"epoch": 0.8614420062695924,
"grad_norm": 0.40137572413031947,
"learning_rate": 3.96327289632729e-05,
"loss": 0.4625,
"step": 687
},
{
"epoch": 0.8626959247648903,
"grad_norm": 0.42057815080049593,
"learning_rate": 3.96094839609484e-05,
"loss": 0.426,
"step": 688
},
{
"epoch": 0.8639498432601881,
"grad_norm": 0.314735642790944,
"learning_rate": 3.95862389586239e-05,
"loss": 0.4225,
"step": 689
},
{
"epoch": 0.8652037617554859,
"grad_norm": 0.44918266396436557,
"learning_rate": 3.95629939562994e-05,
"loss": 0.4515,
"step": 690
},
{
"epoch": 0.8664576802507837,
"grad_norm": 0.48175255895936353,
"learning_rate": 3.95397489539749e-05,
"loss": 0.4383,
"step": 691
},
{
"epoch": 0.8677115987460815,
"grad_norm": 0.39258857299109656,
"learning_rate": 3.9516503951650395e-05,
"loss": 0.4652,
"step": 692
},
{
"epoch": 0.8689655172413793,
"grad_norm": 0.6009718395971831,
"learning_rate": 3.949325894932589e-05,
"loss": 0.4373,
"step": 693
},
{
"epoch": 0.8702194357366771,
"grad_norm": 0.3945486230234752,
"learning_rate": 3.94700139470014e-05,
"loss": 0.4301,
"step": 694
},
{
"epoch": 0.8714733542319749,
"grad_norm": 0.5852734929237384,
"learning_rate": 3.9446768944676895e-05,
"loss": 0.4428,
"step": 695
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.4449595681990464,
"learning_rate": 3.94235239423524e-05,
"loss": 0.4304,
"step": 696
},
{
"epoch": 0.8739811912225706,
"grad_norm": 0.4660303675792538,
"learning_rate": 3.94002789400279e-05,
"loss": 0.4034,
"step": 697
},
{
"epoch": 0.8752351097178683,
"grad_norm": 0.5388373868278207,
"learning_rate": 3.9377033937703395e-05,
"loss": 0.4423,
"step": 698
},
{
"epoch": 0.8764890282131661,
"grad_norm": 0.4110155562271871,
"learning_rate": 3.935378893537889e-05,
"loss": 0.4561,
"step": 699
},
{
"epoch": 0.877742946708464,
"grad_norm": 0.6188802753764778,
"learning_rate": 3.93305439330544e-05,
"loss": 0.4531,
"step": 700
},
{
"epoch": 0.8789968652037617,
"grad_norm": 0.293784421219067,
"learning_rate": 3.9307298930729896e-05,
"loss": 0.4147,
"step": 701
},
{
"epoch": 0.8802507836990595,
"grad_norm": 0.4889122220919684,
"learning_rate": 3.9284053928405394e-05,
"loss": 0.419,
"step": 702
},
{
"epoch": 0.8815047021943574,
"grad_norm": 0.37411750166940383,
"learning_rate": 3.926080892608089e-05,
"loss": 0.4196,
"step": 703
},
{
"epoch": 0.8827586206896552,
"grad_norm": 0.44900732914904506,
"learning_rate": 3.9237563923756396e-05,
"loss": 0.4283,
"step": 704
},
{
"epoch": 0.8840125391849529,
"grad_norm": 0.4411973046400579,
"learning_rate": 3.9214318921431894e-05,
"loss": 0.4702,
"step": 705
},
{
"epoch": 0.8852664576802508,
"grad_norm": 0.4338848294319396,
"learning_rate": 3.91910739191074e-05,
"loss": 0.4577,
"step": 706
},
{
"epoch": 0.8865203761755486,
"grad_norm": 0.46452893224081204,
"learning_rate": 3.9167828916782896e-05,
"loss": 0.4378,
"step": 707
},
{
"epoch": 0.8877742946708463,
"grad_norm": 0.3966606406585799,
"learning_rate": 3.9144583914458394e-05,
"loss": 0.4449,
"step": 708
},
{
"epoch": 0.8890282131661442,
"grad_norm": 0.34130278584514134,
"learning_rate": 3.912133891213389e-05,
"loss": 0.4385,
"step": 709
},
{
"epoch": 0.890282131661442,
"grad_norm": 0.40418391019162747,
"learning_rate": 3.909809390980939e-05,
"loss": 0.4231,
"step": 710
},
{
"epoch": 0.8915360501567398,
"grad_norm": 0.4115936548946021,
"learning_rate": 3.907484890748489e-05,
"loss": 0.4368,
"step": 711
},
{
"epoch": 0.8927899686520376,
"grad_norm": 0.4030504029549712,
"learning_rate": 3.905160390516039e-05,
"loss": 0.4354,
"step": 712
},
{
"epoch": 0.8940438871473354,
"grad_norm": 0.3739188343719914,
"learning_rate": 3.902835890283589e-05,
"loss": 0.447,
"step": 713
},
{
"epoch": 0.8952978056426333,
"grad_norm": 0.40044177012078397,
"learning_rate": 3.9005113900511395e-05,
"loss": 0.4307,
"step": 714
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.3858341447231616,
"learning_rate": 3.898186889818689e-05,
"loss": 0.4076,
"step": 715
},
{
"epoch": 0.8978056426332288,
"grad_norm": 0.4295674230745876,
"learning_rate": 3.895862389586239e-05,
"loss": 0.4334,
"step": 716
},
{
"epoch": 0.8990595611285267,
"grad_norm": 0.3921402451860404,
"learning_rate": 3.8935378893537895e-05,
"loss": 0.4351,
"step": 717
},
{
"epoch": 0.9003134796238245,
"grad_norm": 0.4592046259993288,
"learning_rate": 3.891213389121339e-05,
"loss": 0.442,
"step": 718
},
{
"epoch": 0.9015673981191222,
"grad_norm": 0.3677900297795609,
"learning_rate": 3.888888888888889e-05,
"loss": 0.4241,
"step": 719
},
{
"epoch": 0.9028213166144201,
"grad_norm": 0.43783434421178363,
"learning_rate": 3.886564388656439e-05,
"loss": 0.458,
"step": 720
},
{
"epoch": 0.9040752351097179,
"grad_norm": 0.37794444667480087,
"learning_rate": 3.8842398884239886e-05,
"loss": 0.4093,
"step": 721
},
{
"epoch": 0.9053291536050156,
"grad_norm": 0.4111021747377529,
"learning_rate": 3.881915388191539e-05,
"loss": 0.4247,
"step": 722
},
{
"epoch": 0.9065830721003135,
"grad_norm": 0.3382431332402009,
"learning_rate": 3.879590887959089e-05,
"loss": 0.4274,
"step": 723
},
{
"epoch": 0.9078369905956113,
"grad_norm": 0.42549178198388476,
"learning_rate": 3.8772663877266393e-05,
"loss": 0.4115,
"step": 724
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.35358674171716353,
"learning_rate": 3.874941887494189e-05,
"loss": 0.4483,
"step": 725
},
{
"epoch": 0.9103448275862069,
"grad_norm": 0.5182257228025047,
"learning_rate": 3.872617387261739e-05,
"loss": 0.429,
"step": 726
},
{
"epoch": 0.9115987460815047,
"grad_norm": 0.3579463307963988,
"learning_rate": 3.870292887029289e-05,
"loss": 0.4182,
"step": 727
},
{
"epoch": 0.9128526645768025,
"grad_norm": 0.5200426421071437,
"learning_rate": 3.8679683867968385e-05,
"loss": 0.4717,
"step": 728
},
{
"epoch": 0.9141065830721004,
"grad_norm": 0.34204846949747253,
"learning_rate": 3.865643886564389e-05,
"loss": 0.4453,
"step": 729
},
{
"epoch": 0.9153605015673981,
"grad_norm": 0.43390374144923183,
"learning_rate": 3.863319386331939e-05,
"loss": 0.4556,
"step": 730
},
{
"epoch": 0.9166144200626959,
"grad_norm": 0.3179813998704739,
"learning_rate": 3.8609948860994885e-05,
"loss": 0.4229,
"step": 731
},
{
"epoch": 0.9178683385579938,
"grad_norm": 0.4313926721196517,
"learning_rate": 3.858670385867039e-05,
"loss": 0.4498,
"step": 732
},
{
"epoch": 0.9191222570532915,
"grad_norm": 0.3486800132604066,
"learning_rate": 3.856345885634589e-05,
"loss": 0.3987,
"step": 733
},
{
"epoch": 0.9203761755485893,
"grad_norm": 0.4093310044174384,
"learning_rate": 3.854021385402139e-05,
"loss": 0.4689,
"step": 734
},
{
"epoch": 0.9216300940438872,
"grad_norm": 0.3300295268966952,
"learning_rate": 3.851696885169689e-05,
"loss": 0.433,
"step": 735
},
{
"epoch": 0.922884012539185,
"grad_norm": 0.3649647980646909,
"learning_rate": 3.849372384937239e-05,
"loss": 0.4669,
"step": 736
},
{
"epoch": 0.9241379310344827,
"grad_norm": 0.33104567398581297,
"learning_rate": 3.8470478847047886e-05,
"loss": 0.444,
"step": 737
},
{
"epoch": 0.9253918495297806,
"grad_norm": 0.3229451212832747,
"learning_rate": 3.8447233844723384e-05,
"loss": 0.4477,
"step": 738
},
{
"epoch": 0.9266457680250784,
"grad_norm": 0.40001882211903245,
"learning_rate": 3.842398884239888e-05,
"loss": 0.4259,
"step": 739
},
{
"epoch": 0.9278996865203761,
"grad_norm": 0.3541498442260299,
"learning_rate": 3.8400743840074386e-05,
"loss": 0.4453,
"step": 740
},
{
"epoch": 0.929153605015674,
"grad_norm": 0.4248492056362718,
"learning_rate": 3.8377498837749884e-05,
"loss": 0.4457,
"step": 741
},
{
"epoch": 0.9304075235109718,
"grad_norm": 0.3881464460810205,
"learning_rate": 3.835425383542539e-05,
"loss": 0.429,
"step": 742
},
{
"epoch": 0.9316614420062695,
"grad_norm": 0.4591108868127738,
"learning_rate": 3.8331008833100886e-05,
"loss": 0.4455,
"step": 743
},
{
"epoch": 0.9329153605015674,
"grad_norm": 0.3418940103579778,
"learning_rate": 3.8307763830776384e-05,
"loss": 0.4307,
"step": 744
},
{
"epoch": 0.9341692789968652,
"grad_norm": 0.40109320113862956,
"learning_rate": 3.828451882845189e-05,
"loss": 0.4498,
"step": 745
},
{
"epoch": 0.935423197492163,
"grad_norm": 0.36917457756408695,
"learning_rate": 3.8261273826127387e-05,
"loss": 0.4368,
"step": 746
},
{
"epoch": 0.9366771159874608,
"grad_norm": 0.39671558983504107,
"learning_rate": 3.8238028823802884e-05,
"loss": 0.4581,
"step": 747
},
{
"epoch": 0.9379310344827586,
"grad_norm": 0.34784376642465636,
"learning_rate": 3.821478382147838e-05,
"loss": 0.4426,
"step": 748
},
{
"epoch": 0.9391849529780564,
"grad_norm": 0.43349542227911075,
"learning_rate": 3.819153881915388e-05,
"loss": 0.4456,
"step": 749
},
{
"epoch": 0.9404388714733543,
"grad_norm": 0.3482142524911942,
"learning_rate": 3.816829381682938e-05,
"loss": 0.4231,
"step": 750
},
{
"epoch": 0.941692789968652,
"grad_norm": 0.4864291799143706,
"learning_rate": 3.814504881450488e-05,
"loss": 0.426,
"step": 751
},
{
"epoch": 0.9429467084639498,
"grad_norm": 0.4348069108962186,
"learning_rate": 3.812180381218039e-05,
"loss": 0.4373,
"step": 752
},
{
"epoch": 0.9442006269592477,
"grad_norm": 0.45365965712213363,
"learning_rate": 3.8098558809855885e-05,
"loss": 0.4237,
"step": 753
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.4588714112471688,
"learning_rate": 3.807531380753138e-05,
"loss": 0.4457,
"step": 754
},
{
"epoch": 0.9467084639498433,
"grad_norm": 0.4135021815534546,
"learning_rate": 3.805206880520688e-05,
"loss": 0.4536,
"step": 755
},
{
"epoch": 0.9479623824451411,
"grad_norm": 0.48009965822507994,
"learning_rate": 3.802882380288238e-05,
"loss": 0.4474,
"step": 756
},
{
"epoch": 0.9492163009404389,
"grad_norm": 0.45027781600946143,
"learning_rate": 3.800557880055788e-05,
"loss": 0.4625,
"step": 757
},
{
"epoch": 0.9504702194357367,
"grad_norm": 0.43379167642414934,
"learning_rate": 3.798233379823338e-05,
"loss": 0.4041,
"step": 758
},
{
"epoch": 0.9517241379310345,
"grad_norm": 0.3958388411270611,
"learning_rate": 3.795908879590888e-05,
"loss": 0.4199,
"step": 759
},
{
"epoch": 0.9529780564263323,
"grad_norm": 0.404710109527787,
"learning_rate": 3.7935843793584383e-05,
"loss": 0.4244,
"step": 760
},
{
"epoch": 0.9542319749216301,
"grad_norm": 0.4220689315091264,
"learning_rate": 3.791259879125988e-05,
"loss": 0.4127,
"step": 761
},
{
"epoch": 0.9554858934169279,
"grad_norm": 0.3963203966654246,
"learning_rate": 3.7889353788935386e-05,
"loss": 0.4432,
"step": 762
},
{
"epoch": 0.9567398119122257,
"grad_norm": 0.43366993825049666,
"learning_rate": 3.7866108786610884e-05,
"loss": 0.4193,
"step": 763
},
{
"epoch": 0.9579937304075236,
"grad_norm": 0.37270002414466524,
"learning_rate": 3.784286378428638e-05,
"loss": 0.4443,
"step": 764
},
{
"epoch": 0.9592476489028213,
"grad_norm": 0.4499406455180983,
"learning_rate": 3.781961878196188e-05,
"loss": 0.4331,
"step": 765
},
{
"epoch": 0.9605015673981191,
"grad_norm": 0.44128307900518904,
"learning_rate": 3.779637377963738e-05,
"loss": 0.439,
"step": 766
},
{
"epoch": 0.961755485893417,
"grad_norm": 0.37204323539515627,
"learning_rate": 3.7773128777312875e-05,
"loss": 0.437,
"step": 767
},
{
"epoch": 0.9630094043887147,
"grad_norm": 0.4426904090630473,
"learning_rate": 3.774988377498838e-05,
"loss": 0.4281,
"step": 768
},
{
"epoch": 0.9642633228840125,
"grad_norm": 0.35646587964190823,
"learning_rate": 3.772663877266388e-05,
"loss": 0.4429,
"step": 769
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.3953639155864673,
"learning_rate": 3.770339377033938e-05,
"loss": 0.4486,
"step": 770
},
{
"epoch": 0.9667711598746082,
"grad_norm": 0.34565034118616056,
"learning_rate": 3.768014876801488e-05,
"loss": 0.4338,
"step": 771
},
{
"epoch": 0.9680250783699059,
"grad_norm": 0.37410908295539147,
"learning_rate": 3.765690376569038e-05,
"loss": 0.4323,
"step": 772
},
{
"epoch": 0.9692789968652038,
"grad_norm": 0.357569081805058,
"learning_rate": 3.7633658763365876e-05,
"loss": 0.4303,
"step": 773
},
{
"epoch": 0.9705329153605016,
"grad_norm": 0.4391647650098833,
"learning_rate": 3.761041376104138e-05,
"loss": 0.4131,
"step": 774
},
{
"epoch": 0.9717868338557993,
"grad_norm": 0.38180115652006974,
"learning_rate": 3.758716875871688e-05,
"loss": 0.4413,
"step": 775
},
{
"epoch": 0.9730407523510972,
"grad_norm": 0.4418938722591234,
"learning_rate": 3.7563923756392376e-05,
"loss": 0.4255,
"step": 776
},
{
"epoch": 0.974294670846395,
"grad_norm": 0.3598254170091579,
"learning_rate": 3.7540678754067874e-05,
"loss": 0.4267,
"step": 777
},
{
"epoch": 0.9755485893416928,
"grad_norm": 0.4883852066871885,
"learning_rate": 3.751743375174337e-05,
"loss": 0.426,
"step": 778
},
{
"epoch": 0.9768025078369906,
"grad_norm": 0.39752441762468294,
"learning_rate": 3.7494188749418876e-05,
"loss": 0.4513,
"step": 779
},
{
"epoch": 0.9780564263322884,
"grad_norm": 0.4133117353580822,
"learning_rate": 3.747094374709438e-05,
"loss": 0.4432,
"step": 780
},
{
"epoch": 0.9793103448275862,
"grad_norm": 0.383688692752082,
"learning_rate": 3.744769874476988e-05,
"loss": 0.4267,
"step": 781
},
{
"epoch": 0.980564263322884,
"grad_norm": 0.45458899312347334,
"learning_rate": 3.7424453742445377e-05,
"loss": 0.4621,
"step": 782
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.4437363304300539,
"learning_rate": 3.7401208740120874e-05,
"loss": 0.4423,
"step": 783
},
{
"epoch": 0.9830721003134796,
"grad_norm": 0.3996211443377733,
"learning_rate": 3.737796373779637e-05,
"loss": 0.4377,
"step": 784
},
{
"epoch": 0.9843260188087775,
"grad_norm": 0.41412945179976307,
"learning_rate": 3.735471873547188e-05,
"loss": 0.4318,
"step": 785
},
{
"epoch": 0.9855799373040752,
"grad_norm": 0.4063083545270137,
"learning_rate": 3.7331473733147375e-05,
"loss": 0.4316,
"step": 786
},
{
"epoch": 0.986833855799373,
"grad_norm": 0.3691515758184706,
"learning_rate": 3.730822873082287e-05,
"loss": 0.4186,
"step": 787
},
{
"epoch": 0.9880877742946709,
"grad_norm": 0.3514449638098499,
"learning_rate": 3.728498372849838e-05,
"loss": 0.4417,
"step": 788
},
{
"epoch": 0.9893416927899686,
"grad_norm": 0.40940914699788417,
"learning_rate": 3.7261738726173875e-05,
"loss": 0.4353,
"step": 789
},
{
"epoch": 0.9905956112852664,
"grad_norm": 0.3254892811991773,
"learning_rate": 3.723849372384937e-05,
"loss": 0.4144,
"step": 790
},
{
"epoch": 0.9918495297805643,
"grad_norm": 0.3840077962749874,
"learning_rate": 3.721524872152488e-05,
"loss": 0.4152,
"step": 791
},
{
"epoch": 0.993103448275862,
"grad_norm": 0.37747685803577424,
"learning_rate": 3.7192003719200375e-05,
"loss": 0.4337,
"step": 792
},
{
"epoch": 0.9943573667711598,
"grad_norm": 0.3709134076301068,
"learning_rate": 3.716875871687587e-05,
"loss": 0.4351,
"step": 793
},
{
"epoch": 0.9956112852664577,
"grad_norm": 0.44699407313248746,
"learning_rate": 3.714551371455137e-05,
"loss": 0.4499,
"step": 794
},
{
"epoch": 0.9968652037617555,
"grad_norm": 0.35076114261740376,
"learning_rate": 3.712226871222687e-05,
"loss": 0.3983,
"step": 795
},
{
"epoch": 0.9981191222570532,
"grad_norm": 0.4091275373219224,
"learning_rate": 3.7099023709902373e-05,
"loss": 0.4265,
"step": 796
},
{
"epoch": 0.9993730407523511,
"grad_norm": 0.3747380606858744,
"learning_rate": 3.707577870757787e-05,
"loss": 0.4235,
"step": 797
},
{
"epoch": 1.0,
"grad_norm": 0.3747380606858744,
"learning_rate": 3.7052533705253376e-05,
"loss": 0.4343,
"step": 798
},
{
"epoch": 1.0012539184952978,
"grad_norm": 0.5324056664987192,
"learning_rate": 3.7029288702928874e-05,
"loss": 0.3615,
"step": 799
},
{
"epoch": 1.0025078369905955,
"grad_norm": 0.3961023941119493,
"learning_rate": 3.700604370060437e-05,
"loss": 0.3779,
"step": 800
},
{
"epoch": 1.0037617554858935,
"grad_norm": 0.4110314749405901,
"learning_rate": 3.698279869827987e-05,
"loss": 0.3862,
"step": 801
},
{
"epoch": 1.0050156739811913,
"grad_norm": 0.34474529657625996,
"learning_rate": 3.6959553695955374e-05,
"loss": 0.3852,
"step": 802
},
{
"epoch": 1.006269592476489,
"grad_norm": 0.40959483530789786,
"learning_rate": 3.693630869363087e-05,
"loss": 0.3731,
"step": 803
},
{
"epoch": 1.0075235109717868,
"grad_norm": 0.35563550035639624,
"learning_rate": 3.691306369130637e-05,
"loss": 0.3672,
"step": 804
},
{
"epoch": 1.0087774294670846,
"grad_norm": 0.4264557474917855,
"learning_rate": 3.688981868898187e-05,
"loss": 0.3816,
"step": 805
},
{
"epoch": 1.0100313479623824,
"grad_norm": 0.3603147380266279,
"learning_rate": 3.6866573686657365e-05,
"loss": 0.3593,
"step": 806
},
{
"epoch": 1.0112852664576804,
"grad_norm": 0.39491925792488675,
"learning_rate": 3.684332868433287e-05,
"loss": 0.3744,
"step": 807
},
{
"epoch": 1.0125391849529781,
"grad_norm": 0.35938737223013634,
"learning_rate": 3.6820083682008375e-05,
"loss": 0.3606,
"step": 808
},
{
"epoch": 1.013793103448276,
"grad_norm": 0.34649859843490766,
"learning_rate": 3.679683867968387e-05,
"loss": 0.38,
"step": 809
},
{
"epoch": 1.0150470219435737,
"grad_norm": 0.3681488889650523,
"learning_rate": 3.677359367735937e-05,
"loss": 0.3752,
"step": 810
},
{
"epoch": 1.0163009404388714,
"grad_norm": 0.3404735603429267,
"learning_rate": 3.675034867503487e-05,
"loss": 0.3732,
"step": 811
},
{
"epoch": 1.0175548589341692,
"grad_norm": 0.37962467977084524,
"learning_rate": 3.6727103672710366e-05,
"loss": 0.3926,
"step": 812
},
{
"epoch": 1.0188087774294672,
"grad_norm": 0.31688460649216416,
"learning_rate": 3.670385867038587e-05,
"loss": 0.3547,
"step": 813
},
{
"epoch": 1.020062695924765,
"grad_norm": 0.40612431499962337,
"learning_rate": 3.668061366806137e-05,
"loss": 0.3896,
"step": 814
},
{
"epoch": 1.0213166144200627,
"grad_norm": 0.35391931001157073,
"learning_rate": 3.6657368665736866e-05,
"loss": 0.3662,
"step": 815
},
{
"epoch": 1.0225705329153605,
"grad_norm": 0.3657932735808597,
"learning_rate": 3.663412366341237e-05,
"loss": 0.3872,
"step": 816
},
{
"epoch": 1.0238244514106583,
"grad_norm": 0.30743218037419384,
"learning_rate": 3.661087866108787e-05,
"loss": 0.3643,
"step": 817
},
{
"epoch": 1.025078369905956,
"grad_norm": 0.36591971475807206,
"learning_rate": 3.6587633658763366e-05,
"loss": 0.3618,
"step": 818
},
{
"epoch": 1.026332288401254,
"grad_norm": 0.3315769906618183,
"learning_rate": 3.656438865643887e-05,
"loss": 0.3634,
"step": 819
},
{
"epoch": 1.0275862068965518,
"grad_norm": 0.3197019550927611,
"learning_rate": 3.654114365411437e-05,
"loss": 0.3896,
"step": 820
},
{
"epoch": 1.0288401253918495,
"grad_norm": 0.3476184629711736,
"learning_rate": 3.651789865178987e-05,
"loss": 0.364,
"step": 821
},
{
"epoch": 1.0300940438871473,
"grad_norm": 0.3484878760418995,
"learning_rate": 3.6494653649465365e-05,
"loss": 0.3637,
"step": 822
},
{
"epoch": 1.031347962382445,
"grad_norm": 0.3891074882395375,
"learning_rate": 3.647140864714086e-05,
"loss": 0.397,
"step": 823
},
{
"epoch": 1.0326018808777429,
"grad_norm": 0.35725860158084216,
"learning_rate": 3.644816364481637e-05,
"loss": 0.3791,
"step": 824
},
{
"epoch": 1.0338557993730408,
"grad_norm": 0.3327937870043701,
"learning_rate": 3.6424918642491865e-05,
"loss": 0.3879,
"step": 825
},
{
"epoch": 1.0351097178683386,
"grad_norm": 0.333496867012471,
"learning_rate": 3.640167364016737e-05,
"loss": 0.3732,
"step": 826
},
{
"epoch": 1.0363636363636364,
"grad_norm": 0.3101006194721215,
"learning_rate": 3.637842863784287e-05,
"loss": 0.3701,
"step": 827
},
{
"epoch": 1.0376175548589341,
"grad_norm": 0.30338452349427386,
"learning_rate": 3.6355183635518365e-05,
"loss": 0.3964,
"step": 828
},
{
"epoch": 1.038871473354232,
"grad_norm": 0.3574169302755533,
"learning_rate": 3.633193863319386e-05,
"loss": 0.3613,
"step": 829
},
{
"epoch": 1.0401253918495297,
"grad_norm": 0.3728534359024157,
"learning_rate": 3.630869363086937e-05,
"loss": 0.3587,
"step": 830
},
{
"epoch": 1.0413793103448277,
"grad_norm": 0.3432507046860381,
"learning_rate": 3.6285448628544866e-05,
"loss": 0.3713,
"step": 831
},
{
"epoch": 1.0426332288401254,
"grad_norm": 0.3586528145446376,
"learning_rate": 3.626220362622036e-05,
"loss": 0.3806,
"step": 832
},
{
"epoch": 1.0438871473354232,
"grad_norm": 0.4214295755793986,
"learning_rate": 3.623895862389586e-05,
"loss": 0.381,
"step": 833
},
{
"epoch": 1.045141065830721,
"grad_norm": 0.43161509090540484,
"learning_rate": 3.621571362157136e-05,
"loss": 0.3937,
"step": 834
},
{
"epoch": 1.0463949843260187,
"grad_norm": 0.36667471248715844,
"learning_rate": 3.6192468619246864e-05,
"loss": 0.3744,
"step": 835
},
{
"epoch": 1.0476489028213165,
"grad_norm": 0.3670161287798812,
"learning_rate": 3.616922361692237e-05,
"loss": 0.403,
"step": 836
},
{
"epoch": 1.0489028213166145,
"grad_norm": 0.3617131834675146,
"learning_rate": 3.6145978614597866e-05,
"loss": 0.3907,
"step": 837
},
{
"epoch": 1.0501567398119123,
"grad_norm": 0.39093698501758384,
"learning_rate": 3.6122733612273364e-05,
"loss": 0.3705,
"step": 838
},
{
"epoch": 1.05141065830721,
"grad_norm": 0.4073586332233651,
"learning_rate": 3.609948860994886e-05,
"loss": 0.3784,
"step": 839
},
{
"epoch": 1.0526645768025078,
"grad_norm": 0.4607319515223548,
"learning_rate": 3.607624360762436e-05,
"loss": 0.3758,
"step": 840
},
{
"epoch": 1.0539184952978056,
"grad_norm": 0.36646060450692164,
"learning_rate": 3.6052998605299864e-05,
"loss": 0.3785,
"step": 841
},
{
"epoch": 1.0551724137931036,
"grad_norm": 0.34195014703458954,
"learning_rate": 3.602975360297536e-05,
"loss": 0.3668,
"step": 842
},
{
"epoch": 1.0564263322884013,
"grad_norm": 0.3723467368722267,
"learning_rate": 3.600650860065086e-05,
"loss": 0.3672,
"step": 843
},
{
"epoch": 1.057680250783699,
"grad_norm": 0.3770852571663006,
"learning_rate": 3.598326359832636e-05,
"loss": 0.3928,
"step": 844
},
{
"epoch": 1.0589341692789969,
"grad_norm": 0.34777428888529577,
"learning_rate": 3.596001859600186e-05,
"loss": 0.3715,
"step": 845
},
{
"epoch": 1.0601880877742946,
"grad_norm": 0.3709129780585532,
"learning_rate": 3.593677359367736e-05,
"loss": 0.3868,
"step": 846
},
{
"epoch": 1.0614420062695924,
"grad_norm": 0.3435765302778911,
"learning_rate": 3.5913528591352865e-05,
"loss": 0.3829,
"step": 847
},
{
"epoch": 1.0626959247648902,
"grad_norm": 0.37151344994641194,
"learning_rate": 3.589028358902836e-05,
"loss": 0.3805,
"step": 848
},
{
"epoch": 1.0639498432601882,
"grad_norm": 0.33668045441755573,
"learning_rate": 3.586703858670386e-05,
"loss": 0.3826,
"step": 849
},
{
"epoch": 1.065203761755486,
"grad_norm": 0.3537446610670913,
"learning_rate": 3.584379358437936e-05,
"loss": 0.3585,
"step": 850
},
{
"epoch": 1.0664576802507837,
"grad_norm": 0.3650945811794558,
"learning_rate": 3.5820548582054856e-05,
"loss": 0.3873,
"step": 851
},
{
"epoch": 1.0677115987460815,
"grad_norm": 0.4116314168975181,
"learning_rate": 3.579730357973036e-05,
"loss": 0.3849,
"step": 852
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.3292534744630128,
"learning_rate": 3.577405857740586e-05,
"loss": 0.3746,
"step": 853
},
{
"epoch": 1.0702194357366772,
"grad_norm": 0.3708437092000687,
"learning_rate": 3.575081357508136e-05,
"loss": 0.3657,
"step": 854
},
{
"epoch": 1.071473354231975,
"grad_norm": 0.3093845208754774,
"learning_rate": 3.572756857275686e-05,
"loss": 0.3717,
"step": 855
},
{
"epoch": 1.0727272727272728,
"grad_norm": 0.40161939274972974,
"learning_rate": 3.570432357043236e-05,
"loss": 0.3495,
"step": 856
},
{
"epoch": 1.0739811912225705,
"grad_norm": 0.3853171270753142,
"learning_rate": 3.568107856810786e-05,
"loss": 0.3478,
"step": 857
},
{
"epoch": 1.0752351097178683,
"grad_norm": 0.2934408372532566,
"learning_rate": 3.565783356578336e-05,
"loss": 0.3978,
"step": 858
},
{
"epoch": 1.076489028213166,
"grad_norm": 0.34660059800785364,
"learning_rate": 3.563458856345886e-05,
"loss": 0.3655,
"step": 859
},
{
"epoch": 1.077742946708464,
"grad_norm": 0.3554792216429317,
"learning_rate": 3.561134356113436e-05,
"loss": 0.3659,
"step": 860
},
{
"epoch": 1.0789968652037618,
"grad_norm": 0.3510087892804202,
"learning_rate": 3.5588098558809855e-05,
"loss": 0.4156,
"step": 861
},
{
"epoch": 1.0802507836990596,
"grad_norm": 0.41383676414963233,
"learning_rate": 3.556485355648535e-05,
"loss": 0.3675,
"step": 862
},
{
"epoch": 1.0815047021943573,
"grad_norm": 0.40123840778470365,
"learning_rate": 3.554160855416086e-05,
"loss": 0.3606,
"step": 863
},
{
"epoch": 1.0827586206896551,
"grad_norm": 0.3780209497693519,
"learning_rate": 3.551836355183636e-05,
"loss": 0.3617,
"step": 864
},
{
"epoch": 1.0840125391849529,
"grad_norm": 0.4445600107405926,
"learning_rate": 3.549511854951186e-05,
"loss": 0.3743,
"step": 865
},
{
"epoch": 1.0852664576802509,
"grad_norm": 0.3823162294394944,
"learning_rate": 3.547187354718736e-05,
"loss": 0.3975,
"step": 866
},
{
"epoch": 1.0865203761755486,
"grad_norm": 0.40391369627037194,
"learning_rate": 3.5448628544862855e-05,
"loss": 0.3592,
"step": 867
},
{
"epoch": 1.0877742946708464,
"grad_norm": 0.3646053174398756,
"learning_rate": 3.542538354253835e-05,
"loss": 0.379,
"step": 868
},
{
"epoch": 1.0890282131661442,
"grad_norm": 0.3396424334896394,
"learning_rate": 3.540213854021386e-05,
"loss": 0.3639,
"step": 869
},
{
"epoch": 1.090282131661442,
"grad_norm": 0.49001954837134815,
"learning_rate": 3.5378893537889356e-05,
"loss": 0.3499,
"step": 870
},
{
"epoch": 1.0915360501567397,
"grad_norm": 0.2821440216724557,
"learning_rate": 3.5355648535564854e-05,
"loss": 0.3772,
"step": 871
},
{
"epoch": 1.0927899686520377,
"grad_norm": 0.45741474865265763,
"learning_rate": 3.533240353324035e-05,
"loss": 0.3809,
"step": 872
},
{
"epoch": 1.0940438871473355,
"grad_norm": 0.3270250477739141,
"learning_rate": 3.5309158530915856e-05,
"loss": 0.3603,
"step": 873
},
{
"epoch": 1.0952978056426332,
"grad_norm": 0.4375932275582306,
"learning_rate": 3.5285913528591354e-05,
"loss": 0.3852,
"step": 874
},
{
"epoch": 1.096551724137931,
"grad_norm": 0.33754034724833476,
"learning_rate": 3.526266852626686e-05,
"loss": 0.3491,
"step": 875
},
{
"epoch": 1.0978056426332288,
"grad_norm": 0.3174298231552671,
"learning_rate": 3.5239423523942356e-05,
"loss": 0.3841,
"step": 876
},
{
"epoch": 1.0990595611285268,
"grad_norm": 0.36421414053527174,
"learning_rate": 3.5216178521617854e-05,
"loss": 0.3888,
"step": 877
},
{
"epoch": 1.1003134796238245,
"grad_norm": 0.3429244928543595,
"learning_rate": 3.519293351929335e-05,
"loss": 0.3669,
"step": 878
},
{
"epoch": 1.1015673981191223,
"grad_norm": 0.2810504954459836,
"learning_rate": 3.516968851696885e-05,
"loss": 0.3765,
"step": 879
},
{
"epoch": 1.10282131661442,
"grad_norm": 0.4114642020080427,
"learning_rate": 3.514644351464435e-05,
"loss": 0.3683,
"step": 880
},
{
"epoch": 1.1040752351097178,
"grad_norm": 0.3352767105786651,
"learning_rate": 3.512319851231985e-05,
"loss": 0.4023,
"step": 881
},
{
"epoch": 1.1053291536050156,
"grad_norm": 0.3300061342772888,
"learning_rate": 3.509995350999536e-05,
"loss": 0.37,
"step": 882
},
{
"epoch": 1.1065830721003134,
"grad_norm": 0.3929591756641327,
"learning_rate": 3.5076708507670855e-05,
"loss": 0.3675,
"step": 883
},
{
"epoch": 1.1078369905956114,
"grad_norm": 0.3255401289323663,
"learning_rate": 3.505346350534635e-05,
"loss": 0.3654,
"step": 884
},
{
"epoch": 1.1090909090909091,
"grad_norm": 0.33093175717696516,
"learning_rate": 3.503021850302185e-05,
"loss": 0.3842,
"step": 885
},
{
"epoch": 1.110344827586207,
"grad_norm": 0.3823513513235503,
"learning_rate": 3.5006973500697355e-05,
"loss": 0.3725,
"step": 886
},
{
"epoch": 1.1115987460815047,
"grad_norm": 0.3455809723697425,
"learning_rate": 3.498372849837285e-05,
"loss": 0.3948,
"step": 887
},
{
"epoch": 1.1128526645768024,
"grad_norm": 0.36298045529101447,
"learning_rate": 3.496048349604835e-05,
"loss": 0.3707,
"step": 888
},
{
"epoch": 1.1141065830721004,
"grad_norm": 0.3577398478755335,
"learning_rate": 3.493723849372385e-05,
"loss": 0.386,
"step": 889
},
{
"epoch": 1.1153605015673982,
"grad_norm": 0.33920327654704957,
"learning_rate": 3.4913993491399346e-05,
"loss": 0.3972,
"step": 890
},
{
"epoch": 1.116614420062696,
"grad_norm": 0.4021584059525569,
"learning_rate": 3.489074848907485e-05,
"loss": 0.3844,
"step": 891
},
{
"epoch": 1.1178683385579937,
"grad_norm": 0.3623120553614596,
"learning_rate": 3.4867503486750356e-05,
"loss": 0.3697,
"step": 892
},
{
"epoch": 1.1191222570532915,
"grad_norm": 0.2969194209550022,
"learning_rate": 3.4844258484425853e-05,
"loss": 0.3628,
"step": 893
},
{
"epoch": 1.1203761755485893,
"grad_norm": 0.34328455215433556,
"learning_rate": 3.482101348210135e-05,
"loss": 0.3663,
"step": 894
},
{
"epoch": 1.1216300940438872,
"grad_norm": 0.3236396765394609,
"learning_rate": 3.479776847977685e-05,
"loss": 0.3703,
"step": 895
},
{
"epoch": 1.122884012539185,
"grad_norm": 0.2838115185547167,
"learning_rate": 3.477452347745235e-05,
"loss": 0.3842,
"step": 896
},
{
"epoch": 1.1241379310344828,
"grad_norm": 0.38753611183176745,
"learning_rate": 3.475127847512785e-05,
"loss": 0.4066,
"step": 897
},
{
"epoch": 1.1253918495297806,
"grad_norm": 0.3170886396870046,
"learning_rate": 3.472803347280335e-05,
"loss": 0.3901,
"step": 898
},
{
"epoch": 1.1266457680250783,
"grad_norm": 0.4132176046888832,
"learning_rate": 3.470478847047885e-05,
"loss": 0.3794,
"step": 899
},
{
"epoch": 1.127899686520376,
"grad_norm": 0.3225966750941761,
"learning_rate": 3.4681543468154345e-05,
"loss": 0.3627,
"step": 900
},
{
"epoch": 1.129153605015674,
"grad_norm": 0.2959707565856821,
"learning_rate": 3.465829846582985e-05,
"loss": 0.3505,
"step": 901
},
{
"epoch": 1.1304075235109718,
"grad_norm": 0.3943627108047089,
"learning_rate": 3.463505346350535e-05,
"loss": 0.3755,
"step": 902
},
{
"epoch": 1.1316614420062696,
"grad_norm": 0.33976852793314655,
"learning_rate": 3.461180846118085e-05,
"loss": 0.3551,
"step": 903
},
{
"epoch": 1.1329153605015674,
"grad_norm": 0.33470280669961944,
"learning_rate": 3.458856345885635e-05,
"loss": 0.3688,
"step": 904
},
{
"epoch": 1.1341692789968651,
"grad_norm": 0.36175351457866767,
"learning_rate": 3.456531845653185e-05,
"loss": 0.3867,
"step": 905
},
{
"epoch": 1.135423197492163,
"grad_norm": 0.3735786224446521,
"learning_rate": 3.4542073454207346e-05,
"loss": 0.3685,
"step": 906
},
{
"epoch": 1.136677115987461,
"grad_norm": 0.3297481155518958,
"learning_rate": 3.4518828451882844e-05,
"loss": 0.3765,
"step": 907
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.4943388839145542,
"learning_rate": 3.449558344955834e-05,
"loss": 0.3825,
"step": 908
},
{
"epoch": 1.1391849529780564,
"grad_norm": 0.34953682024434474,
"learning_rate": 3.4472338447233846e-05,
"loss": 0.3932,
"step": 909
},
{
"epoch": 1.1404388714733542,
"grad_norm": 0.3534692603329101,
"learning_rate": 3.4449093444909344e-05,
"loss": 0.3627,
"step": 910
},
{
"epoch": 1.141692789968652,
"grad_norm": 0.3307901088846704,
"learning_rate": 3.442584844258485e-05,
"loss": 0.3681,
"step": 911
},
{
"epoch": 1.14294670846395,
"grad_norm": 0.320559829887424,
"learning_rate": 3.4402603440260346e-05,
"loss": 0.3667,
"step": 912
},
{
"epoch": 1.1442006269592477,
"grad_norm": 0.32129590555554777,
"learning_rate": 3.4379358437935844e-05,
"loss": 0.3676,
"step": 913
},
{
"epoch": 1.1454545454545455,
"grad_norm": 0.3726468150761692,
"learning_rate": 3.435611343561135e-05,
"loss": 0.3878,
"step": 914
},
{
"epoch": 1.1467084639498433,
"grad_norm": 0.3548687657463299,
"learning_rate": 3.4332868433286847e-05,
"loss": 0.3757,
"step": 915
},
{
"epoch": 1.147962382445141,
"grad_norm": 0.35183930573064065,
"learning_rate": 3.4309623430962344e-05,
"loss": 0.3764,
"step": 916
},
{
"epoch": 1.1492163009404388,
"grad_norm": 0.3286383660245774,
"learning_rate": 3.428637842863784e-05,
"loss": 0.3678,
"step": 917
},
{
"epoch": 1.1504702194357366,
"grad_norm": 0.33075594327981284,
"learning_rate": 3.426313342631334e-05,
"loss": 0.3551,
"step": 918
},
{
"epoch": 1.1517241379310346,
"grad_norm": 0.29243115330169595,
"learning_rate": 3.4239888423988845e-05,
"loss": 0.3826,
"step": 919
},
{
"epoch": 1.1529780564263323,
"grad_norm": 0.3680690471985968,
"learning_rate": 3.421664342166435e-05,
"loss": 0.3776,
"step": 920
},
{
"epoch": 1.15423197492163,
"grad_norm": 0.3417175867776188,
"learning_rate": 3.419339841933985e-05,
"loss": 0.3784,
"step": 921
},
{
"epoch": 1.1554858934169279,
"grad_norm": 0.3360921280700044,
"learning_rate": 3.4170153417015345e-05,
"loss": 0.3826,
"step": 922
},
{
"epoch": 1.1567398119122256,
"grad_norm": 0.3417126594764177,
"learning_rate": 3.414690841469084e-05,
"loss": 0.4042,
"step": 923
},
{
"epoch": 1.1579937304075236,
"grad_norm": 0.313458391916607,
"learning_rate": 3.412366341236634e-05,
"loss": 0.3443,
"step": 924
},
{
"epoch": 1.1592476489028214,
"grad_norm": 0.34958070453714846,
"learning_rate": 3.410041841004184e-05,
"loss": 0.3703,
"step": 925
},
{
"epoch": 1.1605015673981192,
"grad_norm": 0.3093695265808923,
"learning_rate": 3.407717340771734e-05,
"loss": 0.3587,
"step": 926
},
{
"epoch": 1.161755485893417,
"grad_norm": 0.3924706814915906,
"learning_rate": 3.405392840539284e-05,
"loss": 0.378,
"step": 927
},
{
"epoch": 1.1630094043887147,
"grad_norm": 0.3259438665545285,
"learning_rate": 3.403068340306834e-05,
"loss": 0.3635,
"step": 928
},
{
"epoch": 1.1642633228840125,
"grad_norm": 0.355223700148204,
"learning_rate": 3.4007438400743843e-05,
"loss": 0.3723,
"step": 929
},
{
"epoch": 1.1655172413793102,
"grad_norm": 0.3957997728780803,
"learning_rate": 3.398419339841934e-05,
"loss": 0.383,
"step": 930
},
{
"epoch": 1.1667711598746082,
"grad_norm": 0.3093882884657544,
"learning_rate": 3.3960948396094846e-05,
"loss": 0.3811,
"step": 931
},
{
"epoch": 1.168025078369906,
"grad_norm": 0.3381847277244351,
"learning_rate": 3.3937703393770344e-05,
"loss": 0.378,
"step": 932
},
{
"epoch": 1.1692789968652038,
"grad_norm": 0.3843086285705784,
"learning_rate": 3.391445839144584e-05,
"loss": 0.3613,
"step": 933
},
{
"epoch": 1.1705329153605015,
"grad_norm": 0.329683429216027,
"learning_rate": 3.389121338912134e-05,
"loss": 0.3731,
"step": 934
},
{
"epoch": 1.1717868338557993,
"grad_norm": 0.2898734853216082,
"learning_rate": 3.386796838679684e-05,
"loss": 0.3907,
"step": 935
},
{
"epoch": 1.1730407523510973,
"grad_norm": 0.35935375386803053,
"learning_rate": 3.3844723384472335e-05,
"loss": 0.36,
"step": 936
},
{
"epoch": 1.174294670846395,
"grad_norm": 0.2984203474833215,
"learning_rate": 3.382147838214784e-05,
"loss": 0.3843,
"step": 937
},
{
"epoch": 1.1755485893416928,
"grad_norm": 0.3200295336608809,
"learning_rate": 3.379823337982334e-05,
"loss": 0.3687,
"step": 938
},
{
"epoch": 1.1768025078369906,
"grad_norm": 0.33119391410095295,
"learning_rate": 3.377498837749884e-05,
"loss": 0.3749,
"step": 939
},
{
"epoch": 1.1780564263322884,
"grad_norm": 0.3452085014392748,
"learning_rate": 3.375174337517434e-05,
"loss": 0.3827,
"step": 940
},
{
"epoch": 1.1793103448275861,
"grad_norm": 0.3224616757395363,
"learning_rate": 3.372849837284984e-05,
"loss": 0.3581,
"step": 941
},
{
"epoch": 1.1805642633228839,
"grad_norm": 0.3938565275437319,
"learning_rate": 3.3705253370525336e-05,
"loss": 0.3758,
"step": 942
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.32998312392655144,
"learning_rate": 3.368200836820084e-05,
"loss": 0.3905,
"step": 943
},
{
"epoch": 1.1830721003134796,
"grad_norm": 0.4180483775764404,
"learning_rate": 3.365876336587634e-05,
"loss": 0.3864,
"step": 944
},
{
"epoch": 1.1843260188087774,
"grad_norm": 0.33016916949286657,
"learning_rate": 3.3635518363551836e-05,
"loss": 0.3753,
"step": 945
},
{
"epoch": 1.1855799373040752,
"grad_norm": 0.3392344998437932,
"learning_rate": 3.3612273361227334e-05,
"loss": 0.3832,
"step": 946
},
{
"epoch": 1.186833855799373,
"grad_norm": 0.3627457065038813,
"learning_rate": 3.358902835890284e-05,
"loss": 0.3949,
"step": 947
},
{
"epoch": 1.188087774294671,
"grad_norm": 0.3482929035320908,
"learning_rate": 3.356578335657834e-05,
"loss": 0.3961,
"step": 948
},
{
"epoch": 1.1893416927899687,
"grad_norm": 0.40015332495681544,
"learning_rate": 3.354253835425384e-05,
"loss": 0.3981,
"step": 949
},
{
"epoch": 1.1905956112852665,
"grad_norm": 0.3688511914478272,
"learning_rate": 3.351929335192934e-05,
"loss": 0.3729,
"step": 950
},
{
"epoch": 1.1918495297805642,
"grad_norm": 0.4050871458203982,
"learning_rate": 3.3496048349604837e-05,
"loss": 0.4016,
"step": 951
},
{
"epoch": 1.193103448275862,
"grad_norm": 0.45912649388869103,
"learning_rate": 3.3472803347280334e-05,
"loss": 0.3719,
"step": 952
},
{
"epoch": 1.1943573667711598,
"grad_norm": 0.3747423687567928,
"learning_rate": 3.344955834495583e-05,
"loss": 0.3616,
"step": 953
},
{
"epoch": 1.1956112852664578,
"grad_norm": 0.43075366160899115,
"learning_rate": 3.342631334263134e-05,
"loss": 0.3814,
"step": 954
},
{
"epoch": 1.1968652037617555,
"grad_norm": 0.3237597563543521,
"learning_rate": 3.3403068340306835e-05,
"loss": 0.3692,
"step": 955
},
{
"epoch": 1.1981191222570533,
"grad_norm": 0.3898749992167921,
"learning_rate": 3.337982333798233e-05,
"loss": 0.3715,
"step": 956
},
{
"epoch": 1.199373040752351,
"grad_norm": 0.3666536166229499,
"learning_rate": 3.335657833565784e-05,
"loss": 0.3813,
"step": 957
},
{
"epoch": 1.2006269592476488,
"grad_norm": 0.3292028428831756,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.3748,
"step": 958
},
{
"epoch": 1.2018808777429468,
"grad_norm": 0.3872238181481738,
"learning_rate": 3.331008833100884e-05,
"loss": 0.3973,
"step": 959
},
{
"epoch": 1.2031347962382446,
"grad_norm": 0.3314373527127078,
"learning_rate": 3.328684332868434e-05,
"loss": 0.3795,
"step": 960
},
{
"epoch": 1.2043887147335424,
"grad_norm": 0.3297645167891427,
"learning_rate": 3.3263598326359835e-05,
"loss": 0.3826,
"step": 961
},
{
"epoch": 1.2056426332288401,
"grad_norm": 0.3751637811277324,
"learning_rate": 3.324035332403533e-05,
"loss": 0.3689,
"step": 962
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.3499285693338164,
"learning_rate": 3.321710832171083e-05,
"loss": 0.3813,
"step": 963
},
{
"epoch": 1.2081504702194357,
"grad_norm": 0.3938186585116234,
"learning_rate": 3.319386331938633e-05,
"loss": 0.3614,
"step": 964
},
{
"epoch": 1.2094043887147334,
"grad_norm": 0.38809156037637615,
"learning_rate": 3.317061831706183e-05,
"loss": 0.3754,
"step": 965
},
{
"epoch": 1.2106583072100314,
"grad_norm": 0.3432234541614212,
"learning_rate": 3.314737331473733e-05,
"loss": 0.3761,
"step": 966
},
{
"epoch": 1.2119122257053292,
"grad_norm": 0.4098389543355861,
"learning_rate": 3.3124128312412836e-05,
"loss": 0.3868,
"step": 967
},
{
"epoch": 1.213166144200627,
"grad_norm": 0.3034470239207852,
"learning_rate": 3.3100883310088334e-05,
"loss": 0.3792,
"step": 968
},
{
"epoch": 1.2144200626959247,
"grad_norm": 0.3732265161698448,
"learning_rate": 3.307763830776383e-05,
"loss": 0.3623,
"step": 969
},
{
"epoch": 1.2156739811912225,
"grad_norm": 0.33480694526956495,
"learning_rate": 3.305439330543933e-05,
"loss": 0.3811,
"step": 970
},
{
"epoch": 1.2169278996865205,
"grad_norm": 0.3398917668667268,
"learning_rate": 3.3031148303114834e-05,
"loss": 0.377,
"step": 971
},
{
"epoch": 1.2181818181818183,
"grad_norm": 0.4490922408331678,
"learning_rate": 3.300790330079033e-05,
"loss": 0.3956,
"step": 972
},
{
"epoch": 1.219435736677116,
"grad_norm": 4.2925624149195905,
"learning_rate": 3.298465829846583e-05,
"loss": 0.3716,
"step": 973
},
{
"epoch": 1.2206896551724138,
"grad_norm": 0.4418407190642969,
"learning_rate": 3.296141329614133e-05,
"loss": 0.3817,
"step": 974
},
{
"epoch": 1.2219435736677116,
"grad_norm": 0.3199169434646702,
"learning_rate": 3.293816829381683e-05,
"loss": 0.3872,
"step": 975
},
{
"epoch": 1.2231974921630093,
"grad_norm": 0.44392427192637907,
"learning_rate": 3.291492329149234e-05,
"loss": 0.364,
"step": 976
},
{
"epoch": 1.224451410658307,
"grad_norm": 0.3495296267856878,
"learning_rate": 3.2891678289167835e-05,
"loss": 0.3798,
"step": 977
},
{
"epoch": 1.225705329153605,
"grad_norm": 0.3747107439918024,
"learning_rate": 3.286843328684333e-05,
"loss": 0.3825,
"step": 978
},
{
"epoch": 1.2269592476489029,
"grad_norm": 0.3316467506892561,
"learning_rate": 3.284518828451883e-05,
"loss": 0.356,
"step": 979
},
{
"epoch": 1.2282131661442006,
"grad_norm": 0.30544062731963156,
"learning_rate": 3.282194328219433e-05,
"loss": 0.3731,
"step": 980
},
{
"epoch": 1.2294670846394984,
"grad_norm": 0.3597081998281626,
"learning_rate": 3.2798698279869826e-05,
"loss": 0.3978,
"step": 981
},
{
"epoch": 1.2307210031347962,
"grad_norm": 0.2952696526454703,
"learning_rate": 3.277545327754533e-05,
"loss": 0.3945,
"step": 982
},
{
"epoch": 1.2319749216300941,
"grad_norm": 0.3489116012438407,
"learning_rate": 3.275220827522083e-05,
"loss": 0.3752,
"step": 983
},
{
"epoch": 1.233228840125392,
"grad_norm": 0.36137404867174416,
"learning_rate": 3.2728963272896326e-05,
"loss": 0.3669,
"step": 984
},
{
"epoch": 1.2344827586206897,
"grad_norm": 0.35212892019476943,
"learning_rate": 3.270571827057183e-05,
"loss": 0.3702,
"step": 985
},
{
"epoch": 1.2357366771159874,
"grad_norm": 0.40575887204726746,
"learning_rate": 3.268247326824733e-05,
"loss": 0.4223,
"step": 986
},
{
"epoch": 1.2369905956112852,
"grad_norm": 0.4536861444902782,
"learning_rate": 3.2659228265922826e-05,
"loss": 0.3732,
"step": 987
},
{
"epoch": 1.238244514106583,
"grad_norm": 0.36340169528642347,
"learning_rate": 3.263598326359833e-05,
"loss": 0.353,
"step": 988
},
{
"epoch": 1.239498432601881,
"grad_norm": 0.44566079966682354,
"learning_rate": 3.261273826127383e-05,
"loss": 0.4201,
"step": 989
},
{
"epoch": 1.2407523510971787,
"grad_norm": 0.38066553848886414,
"learning_rate": 3.258949325894933e-05,
"loss": 0.3814,
"step": 990
},
{
"epoch": 1.2420062695924765,
"grad_norm": 0.38063604661272665,
"learning_rate": 3.2566248256624825e-05,
"loss": 0.3904,
"step": 991
},
{
"epoch": 1.2432601880877743,
"grad_norm": 0.4096337855916192,
"learning_rate": 3.254300325430032e-05,
"loss": 0.3577,
"step": 992
},
{
"epoch": 1.244514106583072,
"grad_norm": 0.3336225841350501,
"learning_rate": 3.251975825197583e-05,
"loss": 0.385,
"step": 993
},
{
"epoch": 1.2457680250783698,
"grad_norm": 0.3674247872290705,
"learning_rate": 3.2496513249651325e-05,
"loss": 0.3951,
"step": 994
},
{
"epoch": 1.2470219435736678,
"grad_norm": 0.4089162999518996,
"learning_rate": 3.247326824732683e-05,
"loss": 0.3756,
"step": 995
},
{
"epoch": 1.2482758620689656,
"grad_norm": 0.330175524132324,
"learning_rate": 3.245002324500233e-05,
"loss": 0.4046,
"step": 996
},
{
"epoch": 1.2495297805642633,
"grad_norm": 0.31827643713416004,
"learning_rate": 3.2426778242677825e-05,
"loss": 0.3705,
"step": 997
},
{
"epoch": 1.250783699059561,
"grad_norm": 0.33632976359988254,
"learning_rate": 3.240353324035332e-05,
"loss": 0.3716,
"step": 998
},
{
"epoch": 1.2520376175548589,
"grad_norm": 0.36862300734855774,
"learning_rate": 3.238028823802883e-05,
"loss": 0.3869,
"step": 999
},
{
"epoch": 1.2532915360501566,
"grad_norm": 0.3619979919592418,
"learning_rate": 3.2357043235704325e-05,
"loss": 0.3942,
"step": 1000
},
{
"epoch": 1.2545454545454544,
"grad_norm": 0.4057545486158015,
"learning_rate": 3.233379823337982e-05,
"loss": 0.3666,
"step": 1001
},
{
"epoch": 1.2557993730407524,
"grad_norm": 0.3005200098747564,
"learning_rate": 3.231055323105532e-05,
"loss": 0.3648,
"step": 1002
},
{
"epoch": 1.2570532915360502,
"grad_norm": 0.40128767007417543,
"learning_rate": 3.2287308228730826e-05,
"loss": 0.3935,
"step": 1003
},
{
"epoch": 1.258307210031348,
"grad_norm": 0.30036245022784264,
"learning_rate": 3.2264063226406324e-05,
"loss": 0.3497,
"step": 1004
},
{
"epoch": 1.2595611285266457,
"grad_norm": 0.2928048442618705,
"learning_rate": 3.224081822408183e-05,
"loss": 0.389,
"step": 1005
},
{
"epoch": 1.2608150470219437,
"grad_norm": 0.33397208312012183,
"learning_rate": 3.2217573221757326e-05,
"loss": 0.3759,
"step": 1006
},
{
"epoch": 1.2620689655172415,
"grad_norm": 0.3267571841726079,
"learning_rate": 3.2194328219432824e-05,
"loss": 0.3695,
"step": 1007
},
{
"epoch": 1.2633228840125392,
"grad_norm": 0.3594700149018815,
"learning_rate": 3.217108321710832e-05,
"loss": 0.3754,
"step": 1008
},
{
"epoch": 1.264576802507837,
"grad_norm": 0.3050691926059993,
"learning_rate": 3.214783821478382e-05,
"loss": 0.355,
"step": 1009
},
{
"epoch": 1.2658307210031348,
"grad_norm": 0.3323756030966606,
"learning_rate": 3.2124593212459324e-05,
"loss": 0.3791,
"step": 1010
},
{
"epoch": 1.2670846394984325,
"grad_norm": 0.35488165910721925,
"learning_rate": 3.210134821013482e-05,
"loss": 0.3898,
"step": 1011
},
{
"epoch": 1.2683385579937303,
"grad_norm": 0.34189942646137783,
"learning_rate": 3.207810320781032e-05,
"loss": 0.4059,
"step": 1012
},
{
"epoch": 1.2695924764890283,
"grad_norm": 0.3093764045474573,
"learning_rate": 3.2054858205485824e-05,
"loss": 0.3931,
"step": 1013
},
{
"epoch": 1.270846394984326,
"grad_norm": 0.35831397839050483,
"learning_rate": 3.203161320316132e-05,
"loss": 0.3728,
"step": 1014
},
{
"epoch": 1.2721003134796238,
"grad_norm": 0.3487738768104738,
"learning_rate": 3.200836820083682e-05,
"loss": 0.3754,
"step": 1015
},
{
"epoch": 1.2733542319749216,
"grad_norm": 0.32681712620832987,
"learning_rate": 3.1985123198512325e-05,
"loss": 0.3741,
"step": 1016
},
{
"epoch": 1.2746081504702194,
"grad_norm": 0.30383054676660065,
"learning_rate": 3.196187819618782e-05,
"loss": 0.3829,
"step": 1017
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.28773822591861126,
"learning_rate": 3.193863319386332e-05,
"loss": 0.3616,
"step": 1018
},
{
"epoch": 1.2771159874608151,
"grad_norm": 0.3028871574296117,
"learning_rate": 3.191538819153882e-05,
"loss": 0.3791,
"step": 1019
},
{
"epoch": 1.2783699059561129,
"grad_norm": 0.3764726164878204,
"learning_rate": 3.1892143189214316e-05,
"loss": 0.3739,
"step": 1020
},
{
"epoch": 1.2796238244514107,
"grad_norm": 0.30250051709708436,
"learning_rate": 3.186889818688982e-05,
"loss": 0.3558,
"step": 1021
},
{
"epoch": 1.2808777429467084,
"grad_norm": 0.3853369490201514,
"learning_rate": 3.184565318456532e-05,
"loss": 0.3772,
"step": 1022
},
{
"epoch": 1.2821316614420062,
"grad_norm": 0.3426038268473075,
"learning_rate": 3.182240818224082e-05,
"loss": 0.3688,
"step": 1023
},
{
"epoch": 1.283385579937304,
"grad_norm": 0.2963200277131022,
"learning_rate": 3.179916317991632e-05,
"loss": 0.3967,
"step": 1024
},
{
"epoch": 1.284639498432602,
"grad_norm": 0.40617319594581214,
"learning_rate": 3.177591817759182e-05,
"loss": 0.3771,
"step": 1025
},
{
"epoch": 1.2858934169278997,
"grad_norm": 0.3828052711836533,
"learning_rate": 3.175267317526732e-05,
"loss": 0.3835,
"step": 1026
},
{
"epoch": 1.2871473354231975,
"grad_norm": 0.4149265666984591,
"learning_rate": 3.172942817294282e-05,
"loss": 0.4015,
"step": 1027
},
{
"epoch": 1.2884012539184952,
"grad_norm": 0.3610210469839352,
"learning_rate": 3.170618317061832e-05,
"loss": 0.3611,
"step": 1028
},
{
"epoch": 1.2896551724137932,
"grad_norm": 0.3774507235407678,
"learning_rate": 3.168293816829382e-05,
"loss": 0.3853,
"step": 1029
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.34867263479398986,
"learning_rate": 3.1659693165969315e-05,
"loss": 0.3516,
"step": 1030
},
{
"epoch": 1.2921630094043888,
"grad_norm": 0.39228726189989105,
"learning_rate": 3.163644816364482e-05,
"loss": 0.3958,
"step": 1031
},
{
"epoch": 1.2934169278996865,
"grad_norm": 0.30630357581093287,
"learning_rate": 3.161320316132032e-05,
"loss": 0.363,
"step": 1032
},
{
"epoch": 1.2946708463949843,
"grad_norm": 0.334258686392191,
"learning_rate": 3.158995815899582e-05,
"loss": 0.3951,
"step": 1033
},
{
"epoch": 1.295924764890282,
"grad_norm": 0.3199276450756677,
"learning_rate": 3.156671315667132e-05,
"loss": 0.3815,
"step": 1034
},
{
"epoch": 1.2971786833855798,
"grad_norm": 0.37110603670772996,
"learning_rate": 3.154346815434682e-05,
"loss": 0.3769,
"step": 1035
},
{
"epoch": 1.2984326018808776,
"grad_norm": 0.30878548326089394,
"learning_rate": 3.1520223152022315e-05,
"loss": 0.3463,
"step": 1036
},
{
"epoch": 1.2996865203761756,
"grad_norm": 0.28467825949649783,
"learning_rate": 3.149697814969781e-05,
"loss": 0.3495,
"step": 1037
},
{
"epoch": 1.3009404388714734,
"grad_norm": 0.32588336662695927,
"learning_rate": 3.147373314737332e-05,
"loss": 0.3889,
"step": 1038
},
{
"epoch": 1.3021943573667711,
"grad_norm": 0.2804036900322404,
"learning_rate": 3.1450488145048816e-05,
"loss": 0.3558,
"step": 1039
},
{
"epoch": 1.303448275862069,
"grad_norm": 0.3123315325241931,
"learning_rate": 3.1427243142724314e-05,
"loss": 0.3353,
"step": 1040
},
{
"epoch": 1.304702194357367,
"grad_norm": 0.34997557329908946,
"learning_rate": 3.140399814039982e-05,
"loss": 0.3862,
"step": 1041
},
{
"epoch": 1.3059561128526647,
"grad_norm": 0.28790104696411384,
"learning_rate": 3.1380753138075316e-05,
"loss": 0.3855,
"step": 1042
},
{
"epoch": 1.3072100313479624,
"grad_norm": 0.41821139479121994,
"learning_rate": 3.1357508135750814e-05,
"loss": 0.3623,
"step": 1043
},
{
"epoch": 1.3084639498432602,
"grad_norm": 0.26090863914278367,
"learning_rate": 3.133426313342632e-05,
"loss": 0.3544,
"step": 1044
},
{
"epoch": 1.309717868338558,
"grad_norm": 0.2989627815279436,
"learning_rate": 3.1311018131101816e-05,
"loss": 0.3565,
"step": 1045
},
{
"epoch": 1.3109717868338557,
"grad_norm": 0.36950952714199786,
"learning_rate": 3.1287773128777314e-05,
"loss": 0.3638,
"step": 1046
},
{
"epoch": 1.3122257053291535,
"grad_norm": 0.31628545426259136,
"learning_rate": 3.126452812645281e-05,
"loss": 0.3804,
"step": 1047
},
{
"epoch": 1.3134796238244515,
"grad_norm": 0.340742452617921,
"learning_rate": 3.124128312412831e-05,
"loss": 0.3864,
"step": 1048
},
{
"epoch": 1.3147335423197493,
"grad_norm": 0.33444285709589544,
"learning_rate": 3.121803812180381e-05,
"loss": 0.3767,
"step": 1049
},
{
"epoch": 1.315987460815047,
"grad_norm": 0.31402315508813977,
"learning_rate": 3.119479311947931e-05,
"loss": 0.3842,
"step": 1050
},
{
"epoch": 1.3172413793103448,
"grad_norm": 0.3918212089186475,
"learning_rate": 3.117154811715482e-05,
"loss": 0.3645,
"step": 1051
},
{
"epoch": 1.3184952978056426,
"grad_norm": 0.3191302282575996,
"learning_rate": 3.1148303114830315e-05,
"loss": 0.3811,
"step": 1052
},
{
"epoch": 1.3197492163009406,
"grad_norm": 0.3262532274198052,
"learning_rate": 3.112505811250581e-05,
"loss": 0.3951,
"step": 1053
},
{
"epoch": 1.3210031347962383,
"grad_norm": 0.34703093764415727,
"learning_rate": 3.110181311018131e-05,
"loss": 0.3587,
"step": 1054
},
{
"epoch": 1.322257053291536,
"grad_norm": 0.28488982821026776,
"learning_rate": 3.1078568107856815e-05,
"loss": 0.395,
"step": 1055
},
{
"epoch": 1.3235109717868339,
"grad_norm": 0.3787168023556999,
"learning_rate": 3.105532310553231e-05,
"loss": 0.4098,
"step": 1056
},
{
"epoch": 1.3247648902821316,
"grad_norm": 0.30623894072651964,
"learning_rate": 3.103207810320781e-05,
"loss": 0.3466,
"step": 1057
},
{
"epoch": 1.3260188087774294,
"grad_norm": 0.32483387304000394,
"learning_rate": 3.100883310088331e-05,
"loss": 0.3765,
"step": 1058
},
{
"epoch": 1.3272727272727272,
"grad_norm": 0.3521984912168087,
"learning_rate": 3.098558809855881e-05,
"loss": 0.3794,
"step": 1059
},
{
"epoch": 1.3285266457680251,
"grad_norm": 0.297249612427211,
"learning_rate": 3.096234309623431e-05,
"loss": 0.3664,
"step": 1060
},
{
"epoch": 1.329780564263323,
"grad_norm": 0.3397463204741916,
"learning_rate": 3.0939098093909816e-05,
"loss": 0.3617,
"step": 1061
},
{
"epoch": 1.3310344827586207,
"grad_norm": 0.33202454815906446,
"learning_rate": 3.0915853091585313e-05,
"loss": 0.3696,
"step": 1062
},
{
"epoch": 1.3322884012539185,
"grad_norm": 0.374197319647271,
"learning_rate": 3.089260808926081e-05,
"loss": 0.3963,
"step": 1063
},
{
"epoch": 1.3335423197492162,
"grad_norm": 0.3402687873157731,
"learning_rate": 3.086936308693631e-05,
"loss": 0.3608,
"step": 1064
},
{
"epoch": 1.3347962382445142,
"grad_norm": 0.3070848599627315,
"learning_rate": 3.084611808461181e-05,
"loss": 0.366,
"step": 1065
},
{
"epoch": 1.336050156739812,
"grad_norm": 0.37481044176761413,
"learning_rate": 3.082287308228731e-05,
"loss": 0.3848,
"step": 1066
},
{
"epoch": 1.3373040752351097,
"grad_norm": 0.35898042425626686,
"learning_rate": 3.079962807996281e-05,
"loss": 0.3909,
"step": 1067
},
{
"epoch": 1.3385579937304075,
"grad_norm": 0.2986153373772032,
"learning_rate": 3.077638307763831e-05,
"loss": 0.3954,
"step": 1068
},
{
"epoch": 1.3398119122257053,
"grad_norm": 0.30610102548895973,
"learning_rate": 3.075313807531381e-05,
"loss": 0.3563,
"step": 1069
},
{
"epoch": 1.341065830721003,
"grad_norm": 0.34974720431763995,
"learning_rate": 3.072989307298931e-05,
"loss": 0.3726,
"step": 1070
},
{
"epoch": 1.3423197492163008,
"grad_norm": 0.3312492009249531,
"learning_rate": 3.070664807066481e-05,
"loss": 0.3846,
"step": 1071
},
{
"epoch": 1.3435736677115988,
"grad_norm": 0.39924873070469363,
"learning_rate": 3.068340306834031e-05,
"loss": 0.3584,
"step": 1072
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.39886344953930025,
"learning_rate": 3.066015806601581e-05,
"loss": 0.3777,
"step": 1073
},
{
"epoch": 1.3460815047021943,
"grad_norm": 0.3397373777936394,
"learning_rate": 3.063691306369131e-05,
"loss": 0.359,
"step": 1074
},
{
"epoch": 1.347335423197492,
"grad_norm": 0.4101189596574695,
"learning_rate": 3.0613668061366806e-05,
"loss": 0.3713,
"step": 1075
},
{
"epoch": 1.34858934169279,
"grad_norm": 0.3111825274796488,
"learning_rate": 3.0590423059042303e-05,
"loss": 0.3746,
"step": 1076
},
{
"epoch": 1.3498432601880879,
"grad_norm": 0.433754758595782,
"learning_rate": 3.05671780567178e-05,
"loss": 0.381,
"step": 1077
},
{
"epoch": 1.3510971786833856,
"grad_norm": 0.35123159191370645,
"learning_rate": 3.0543933054393306e-05,
"loss": 0.3693,
"step": 1078
},
{
"epoch": 1.3523510971786834,
"grad_norm": 0.3876006539662075,
"learning_rate": 3.052068805206881e-05,
"loss": 0.3878,
"step": 1079
},
{
"epoch": 1.3536050156739812,
"grad_norm": 0.3028741658428645,
"learning_rate": 3.0497443049744305e-05,
"loss": 0.354,
"step": 1080
},
{
"epoch": 1.354858934169279,
"grad_norm": 0.37652175280329386,
"learning_rate": 3.0474198047419806e-05,
"loss": 0.4066,
"step": 1081
},
{
"epoch": 1.3561128526645767,
"grad_norm": 0.30280936922767054,
"learning_rate": 3.0450953045095304e-05,
"loss": 0.3722,
"step": 1082
},
{
"epoch": 1.3573667711598745,
"grad_norm": 0.3232123600984817,
"learning_rate": 3.042770804277081e-05,
"loss": 0.3579,
"step": 1083
},
{
"epoch": 1.3586206896551725,
"grad_norm": 0.33759185451698404,
"learning_rate": 3.0404463040446307e-05,
"loss": 0.3658,
"step": 1084
},
{
"epoch": 1.3598746081504702,
"grad_norm": 0.34285060332483686,
"learning_rate": 3.0381218038121804e-05,
"loss": 0.3758,
"step": 1085
},
{
"epoch": 1.361128526645768,
"grad_norm": 0.3102741206309484,
"learning_rate": 3.0357973035797306e-05,
"loss": 0.3695,
"step": 1086
},
{
"epoch": 1.3623824451410658,
"grad_norm": 0.3373379461875471,
"learning_rate": 3.0334728033472803e-05,
"loss": 0.3718,
"step": 1087
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.29980759874819407,
"learning_rate": 3.03114830311483e-05,
"loss": 0.3917,
"step": 1088
},
{
"epoch": 1.3648902821316615,
"grad_norm": 0.29849564640284376,
"learning_rate": 3.0288238028823806e-05,
"loss": 0.3665,
"step": 1089
},
{
"epoch": 1.3661442006269593,
"grad_norm": 0.3133928885269342,
"learning_rate": 3.0264993026499304e-05,
"loss": 0.3795,
"step": 1090
},
{
"epoch": 1.367398119122257,
"grad_norm": 0.32040408469455606,
"learning_rate": 3.0241748024174805e-05,
"loss": 0.3657,
"step": 1091
},
{
"epoch": 1.3686520376175548,
"grad_norm": 0.3426078976328265,
"learning_rate": 3.0218503021850303e-05,
"loss": 0.3833,
"step": 1092
},
{
"epoch": 1.3699059561128526,
"grad_norm": 0.32338946062071877,
"learning_rate": 3.01952580195258e-05,
"loss": 0.3999,
"step": 1093
},
{
"epoch": 1.3711598746081504,
"grad_norm": 0.33452416833231946,
"learning_rate": 3.0172013017201302e-05,
"loss": 0.3586,
"step": 1094
},
{
"epoch": 1.3724137931034484,
"grad_norm": 0.32719541692949544,
"learning_rate": 3.0148768014876806e-05,
"loss": 0.3618,
"step": 1095
},
{
"epoch": 1.3736677115987461,
"grad_norm": 0.31861213442934905,
"learning_rate": 3.0125523012552304e-05,
"loss": 0.3623,
"step": 1096
},
{
"epoch": 1.374921630094044,
"grad_norm": 0.2904842263582729,
"learning_rate": 3.0102278010227802e-05,
"loss": 0.3811,
"step": 1097
},
{
"epoch": 1.3761755485893417,
"grad_norm": 0.2781429202459624,
"learning_rate": 3.00790330079033e-05,
"loss": 0.3792,
"step": 1098
},
{
"epoch": 1.3774294670846394,
"grad_norm": 0.3162685390781678,
"learning_rate": 3.00557880055788e-05,
"loss": 0.3694,
"step": 1099
},
{
"epoch": 1.3786833855799374,
"grad_norm": 0.3184969694018343,
"learning_rate": 3.0032543003254306e-05,
"loss": 0.3722,
"step": 1100
},
{
"epoch": 1.3799373040752352,
"grad_norm": 0.2835638093959142,
"learning_rate": 3.0009298000929804e-05,
"loss": 0.3918,
"step": 1101
},
{
"epoch": 1.381191222570533,
"grad_norm": 0.3646734870241203,
"learning_rate": 2.99860529986053e-05,
"loss": 0.3893,
"step": 1102
},
{
"epoch": 1.3824451410658307,
"grad_norm": 0.28876745741859894,
"learning_rate": 2.99628079962808e-05,
"loss": 0.369,
"step": 1103
},
{
"epoch": 1.3836990595611285,
"grad_norm": 0.30129504460034895,
"learning_rate": 2.99395629939563e-05,
"loss": 0.388,
"step": 1104
},
{
"epoch": 1.3849529780564263,
"grad_norm": 0.2970506197854972,
"learning_rate": 2.99163179916318e-05,
"loss": 0.3758,
"step": 1105
},
{
"epoch": 1.386206896551724,
"grad_norm": 0.3151186814068136,
"learning_rate": 2.9893072989307303e-05,
"loss": 0.3776,
"step": 1106
},
{
"epoch": 1.387460815047022,
"grad_norm": 0.31651262134444097,
"learning_rate": 2.98698279869828e-05,
"loss": 0.3509,
"step": 1107
},
{
"epoch": 1.3887147335423198,
"grad_norm": 0.32389745777967976,
"learning_rate": 2.98465829846583e-05,
"loss": 0.3874,
"step": 1108
},
{
"epoch": 1.3899686520376175,
"grad_norm": 0.34156477841520744,
"learning_rate": 2.98233379823338e-05,
"loss": 0.3829,
"step": 1109
},
{
"epoch": 1.3912225705329153,
"grad_norm": 0.28186198470406915,
"learning_rate": 2.9800092980009298e-05,
"loss": 0.3664,
"step": 1110
},
{
"epoch": 1.3924764890282133,
"grad_norm": 0.3372325114536071,
"learning_rate": 2.9776847977684802e-05,
"loss": 0.3899,
"step": 1111
},
{
"epoch": 1.393730407523511,
"grad_norm": 0.2943320326430922,
"learning_rate": 2.97536029753603e-05,
"loss": 0.3775,
"step": 1112
},
{
"epoch": 1.3949843260188088,
"grad_norm": 0.3417902746390271,
"learning_rate": 2.9730357973035798e-05,
"loss": 0.3875,
"step": 1113
},
{
"epoch": 1.3962382445141066,
"grad_norm": 0.31238576232089316,
"learning_rate": 2.97071129707113e-05,
"loss": 0.3726,
"step": 1114
},
{
"epoch": 1.3974921630094044,
"grad_norm": 0.37774683455960933,
"learning_rate": 2.9683867968386797e-05,
"loss": 0.3623,
"step": 1115
},
{
"epoch": 1.3987460815047021,
"grad_norm": 0.2984459820607882,
"learning_rate": 2.9660622966062295e-05,
"loss": 0.3614,
"step": 1116
},
{
"epoch": 1.4,
"grad_norm": 0.307699001042636,
"learning_rate": 2.96373779637378e-05,
"loss": 0.3935,
"step": 1117
},
{
"epoch": 1.4012539184952977,
"grad_norm": 0.38584786655756026,
"learning_rate": 2.9614132961413297e-05,
"loss": 0.3636,
"step": 1118
},
{
"epoch": 1.4025078369905957,
"grad_norm": 0.29517010202862654,
"learning_rate": 2.95908879590888e-05,
"loss": 0.3568,
"step": 1119
},
{
"epoch": 1.4037617554858934,
"grad_norm": 0.29270347618215203,
"learning_rate": 2.9567642956764296e-05,
"loss": 0.351,
"step": 1120
},
{
"epoch": 1.4050156739811912,
"grad_norm": 0.4054863448032912,
"learning_rate": 2.9544397954439794e-05,
"loss": 0.4094,
"step": 1121
},
{
"epoch": 1.406269592476489,
"grad_norm": 0.31397620177134383,
"learning_rate": 2.9521152952115296e-05,
"loss": 0.3834,
"step": 1122
},
{
"epoch": 1.407523510971787,
"grad_norm": 0.37005732697907445,
"learning_rate": 2.9497907949790797e-05,
"loss": 0.3826,
"step": 1123
},
{
"epoch": 1.4087774294670847,
"grad_norm": 0.2949578159643034,
"learning_rate": 2.9474662947466298e-05,
"loss": 0.3732,
"step": 1124
},
{
"epoch": 1.4100313479623825,
"grad_norm": 0.32511271158248467,
"learning_rate": 2.9451417945141796e-05,
"loss": 0.3608,
"step": 1125
},
{
"epoch": 1.4112852664576803,
"grad_norm": 0.328037281455968,
"learning_rate": 2.9428172942817294e-05,
"loss": 0.3499,
"step": 1126
},
{
"epoch": 1.412539184952978,
"grad_norm": 0.30580936091610456,
"learning_rate": 2.9404927940492795e-05,
"loss": 0.3664,
"step": 1127
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.32622344037757717,
"learning_rate": 2.93816829381683e-05,
"loss": 0.3725,
"step": 1128
},
{
"epoch": 1.4150470219435736,
"grad_norm": 0.3195341843388382,
"learning_rate": 2.9358437935843797e-05,
"loss": 0.3692,
"step": 1129
},
{
"epoch": 1.4163009404388713,
"grad_norm": 0.27459826351080907,
"learning_rate": 2.9335192933519295e-05,
"loss": 0.3562,
"step": 1130
},
{
"epoch": 1.4175548589341693,
"grad_norm": 0.2634351710500294,
"learning_rate": 2.9311947931194793e-05,
"loss": 0.3879,
"step": 1131
},
{
"epoch": 1.418808777429467,
"grad_norm": 0.2847195589891995,
"learning_rate": 2.9288702928870294e-05,
"loss": 0.3553,
"step": 1132
},
{
"epoch": 1.4200626959247649,
"grad_norm": 0.32930486598883424,
"learning_rate": 2.9265457926545792e-05,
"loss": 0.3764,
"step": 1133
},
{
"epoch": 1.4213166144200626,
"grad_norm": 0.3158337742985722,
"learning_rate": 2.9242212924221297e-05,
"loss": 0.3625,
"step": 1134
},
{
"epoch": 1.4225705329153606,
"grad_norm": 0.28242455102610103,
"learning_rate": 2.9218967921896795e-05,
"loss": 0.3848,
"step": 1135
},
{
"epoch": 1.4238244514106584,
"grad_norm": 0.3158693856762095,
"learning_rate": 2.9195722919572292e-05,
"loss": 0.3994,
"step": 1136
},
{
"epoch": 1.4250783699059562,
"grad_norm": 0.3220194601870974,
"learning_rate": 2.9172477917247794e-05,
"loss": 0.3813,
"step": 1137
},
{
"epoch": 1.426332288401254,
"grad_norm": 0.34364705020789377,
"learning_rate": 2.914923291492329e-05,
"loss": 0.3753,
"step": 1138
},
{
"epoch": 1.4275862068965517,
"grad_norm": 0.2924667754102532,
"learning_rate": 2.912598791259879e-05,
"loss": 0.3538,
"step": 1139
},
{
"epoch": 1.4288401253918495,
"grad_norm": 0.2802746498562486,
"learning_rate": 2.9102742910274294e-05,
"loss": 0.3623,
"step": 1140
},
{
"epoch": 1.4300940438871472,
"grad_norm": 0.3549521004805964,
"learning_rate": 2.9079497907949792e-05,
"loss": 0.3808,
"step": 1141
},
{
"epoch": 1.4313479623824452,
"grad_norm": 0.3433556905764778,
"learning_rate": 2.9056252905625293e-05,
"loss": 0.347,
"step": 1142
},
{
"epoch": 1.432601880877743,
"grad_norm": 0.3765304771477011,
"learning_rate": 2.903300790330079e-05,
"loss": 0.3957,
"step": 1143
},
{
"epoch": 1.4338557993730408,
"grad_norm": 0.3845951032367623,
"learning_rate": 2.900976290097629e-05,
"loss": 0.341,
"step": 1144
},
{
"epoch": 1.4351097178683385,
"grad_norm": 0.29349178041438684,
"learning_rate": 2.8986517898651793e-05,
"loss": 0.378,
"step": 1145
},
{
"epoch": 1.4363636363636363,
"grad_norm": 0.32709095117100656,
"learning_rate": 2.896327289632729e-05,
"loss": 0.382,
"step": 1146
},
{
"epoch": 1.4376175548589343,
"grad_norm": 0.3324045259904513,
"learning_rate": 2.8940027894002792e-05,
"loss": 0.3701,
"step": 1147
},
{
"epoch": 1.438871473354232,
"grad_norm": 0.2717457442233135,
"learning_rate": 2.891678289167829e-05,
"loss": 0.364,
"step": 1148
},
{
"epoch": 1.4401253918495298,
"grad_norm": 0.336623047982674,
"learning_rate": 2.8893537889353788e-05,
"loss": 0.3634,
"step": 1149
},
{
"epoch": 1.4413793103448276,
"grad_norm": 0.32967167383978807,
"learning_rate": 2.887029288702929e-05,
"loss": 0.411,
"step": 1150
},
{
"epoch": 1.4426332288401253,
"grad_norm": 0.30318340455544046,
"learning_rate": 2.884704788470479e-05,
"loss": 0.4004,
"step": 1151
},
{
"epoch": 1.4438871473354231,
"grad_norm": 0.2978484696563557,
"learning_rate": 2.882380288238029e-05,
"loss": 0.3754,
"step": 1152
},
{
"epoch": 1.4451410658307209,
"grad_norm": 0.3319186827805567,
"learning_rate": 2.880055788005579e-05,
"loss": 0.362,
"step": 1153
},
{
"epoch": 1.4463949843260189,
"grad_norm": 0.288702829484456,
"learning_rate": 2.8777312877731287e-05,
"loss": 0.3772,
"step": 1154
},
{
"epoch": 1.4476489028213166,
"grad_norm": 0.35416156728794385,
"learning_rate": 2.875406787540679e-05,
"loss": 0.3755,
"step": 1155
},
{
"epoch": 1.4489028213166144,
"grad_norm": 0.3161706599250288,
"learning_rate": 2.8730822873082286e-05,
"loss": 0.3888,
"step": 1156
},
{
"epoch": 1.4501567398119122,
"grad_norm": 0.3418855051912887,
"learning_rate": 2.870757787075779e-05,
"loss": 0.3615,
"step": 1157
},
{
"epoch": 1.4514106583072102,
"grad_norm": 0.35929628373109807,
"learning_rate": 2.868433286843329e-05,
"loss": 0.3975,
"step": 1158
},
{
"epoch": 1.452664576802508,
"grad_norm": 0.3083552398934555,
"learning_rate": 2.8661087866108787e-05,
"loss": 0.3774,
"step": 1159
},
{
"epoch": 1.4539184952978057,
"grad_norm": 0.3774133525548284,
"learning_rate": 2.8637842863784288e-05,
"loss": 0.3572,
"step": 1160
},
{
"epoch": 1.4551724137931035,
"grad_norm": 0.31565489121133405,
"learning_rate": 2.8614597861459786e-05,
"loss": 0.373,
"step": 1161
},
{
"epoch": 1.4564263322884012,
"grad_norm": 0.3626911852901575,
"learning_rate": 2.859135285913529e-05,
"loss": 0.379,
"step": 1162
},
{
"epoch": 1.457680250783699,
"grad_norm": 0.3711198522152786,
"learning_rate": 2.8568107856810788e-05,
"loss": 0.3698,
"step": 1163
},
{
"epoch": 1.4589341692789968,
"grad_norm": 0.2871526162975372,
"learning_rate": 2.8544862854486286e-05,
"loss": 0.3712,
"step": 1164
},
{
"epoch": 1.4601880877742945,
"grad_norm": 0.4029745712870587,
"learning_rate": 2.8521617852161787e-05,
"loss": 0.3822,
"step": 1165
},
{
"epoch": 1.4614420062695925,
"grad_norm": 0.3418096493637921,
"learning_rate": 2.8498372849837285e-05,
"loss": 0.3833,
"step": 1166
},
{
"epoch": 1.4626959247648903,
"grad_norm": 0.3454736935126704,
"learning_rate": 2.8475127847512783e-05,
"loss": 0.3833,
"step": 1167
},
{
"epoch": 1.463949843260188,
"grad_norm": 0.40992993005001943,
"learning_rate": 2.8451882845188288e-05,
"loss": 0.3576,
"step": 1168
},
{
"epoch": 1.4652037617554858,
"grad_norm": 0.30425253305200006,
"learning_rate": 2.8428637842863785e-05,
"loss": 0.3663,
"step": 1169
},
{
"epoch": 1.4664576802507838,
"grad_norm": 0.3117836341867886,
"learning_rate": 2.8405392840539287e-05,
"loss": 0.3544,
"step": 1170
},
{
"epoch": 1.4677115987460816,
"grad_norm": 0.35072982739315506,
"learning_rate": 2.8382147838214784e-05,
"loss": 0.3579,
"step": 1171
},
{
"epoch": 1.4689655172413794,
"grad_norm": 0.34422047482807094,
"learning_rate": 2.8358902835890282e-05,
"loss": 0.3672,
"step": 1172
},
{
"epoch": 1.4702194357366771,
"grad_norm": 0.3278436447786807,
"learning_rate": 2.8335657833565787e-05,
"loss": 0.3816,
"step": 1173
},
{
"epoch": 1.471473354231975,
"grad_norm": 0.3229189569594198,
"learning_rate": 2.8312412831241285e-05,
"loss": 0.3622,
"step": 1174
},
{
"epoch": 1.4727272727272727,
"grad_norm": 0.3547171850833711,
"learning_rate": 2.8289167828916786e-05,
"loss": 0.3788,
"step": 1175
},
{
"epoch": 1.4739811912225704,
"grad_norm": 0.32295156189339,
"learning_rate": 2.8265922826592284e-05,
"loss": 0.3612,
"step": 1176
},
{
"epoch": 1.4752351097178684,
"grad_norm": 0.3352100323798505,
"learning_rate": 2.824267782426778e-05,
"loss": 0.3826,
"step": 1177
},
{
"epoch": 1.4764890282131662,
"grad_norm": 0.326433218716937,
"learning_rate": 2.8219432821943283e-05,
"loss": 0.3574,
"step": 1178
},
{
"epoch": 1.477742946708464,
"grad_norm": 0.2973090679185412,
"learning_rate": 2.8196187819618784e-05,
"loss": 0.3787,
"step": 1179
},
{
"epoch": 1.4789968652037617,
"grad_norm": 0.317118767737772,
"learning_rate": 2.8172942817294285e-05,
"loss": 0.3694,
"step": 1180
},
{
"epoch": 1.4802507836990595,
"grad_norm": 0.29051503792196637,
"learning_rate": 2.8149697814969783e-05,
"loss": 0.3843,
"step": 1181
},
{
"epoch": 1.4815047021943575,
"grad_norm": 0.318426703601805,
"learning_rate": 2.812645281264528e-05,
"loss": 0.3661,
"step": 1182
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.3175360761624218,
"learning_rate": 2.8103207810320782e-05,
"loss": 0.3711,
"step": 1183
},
{
"epoch": 1.484012539184953,
"grad_norm": 0.34114285672467565,
"learning_rate": 2.807996280799628e-05,
"loss": 0.3861,
"step": 1184
},
{
"epoch": 1.4852664576802508,
"grad_norm": 0.27373668040723376,
"learning_rate": 2.8056717805671785e-05,
"loss": 0.3779,
"step": 1185
},
{
"epoch": 1.4865203761755486,
"grad_norm": 0.2979999653876826,
"learning_rate": 2.8033472803347283e-05,
"loss": 0.3728,
"step": 1186
},
{
"epoch": 1.4877742946708463,
"grad_norm": 0.30357289560760664,
"learning_rate": 2.801022780102278e-05,
"loss": 0.3458,
"step": 1187
},
{
"epoch": 1.489028213166144,
"grad_norm": 0.35124629826302295,
"learning_rate": 2.798698279869828e-05,
"loss": 0.3822,
"step": 1188
},
{
"epoch": 1.490282131661442,
"grad_norm": 0.27995556717290876,
"learning_rate": 2.796373779637378e-05,
"loss": 0.382,
"step": 1189
},
{
"epoch": 1.4915360501567398,
"grad_norm": 0.3681713481559872,
"learning_rate": 2.7940492794049284e-05,
"loss": 0.3767,
"step": 1190
},
{
"epoch": 1.4927899686520376,
"grad_norm": 0.2780326023157995,
"learning_rate": 2.7917247791724782e-05,
"loss": 0.3994,
"step": 1191
},
{
"epoch": 1.4940438871473354,
"grad_norm": 0.323394686145301,
"learning_rate": 2.789400278940028e-05,
"loss": 0.3791,
"step": 1192
},
{
"epoch": 1.4952978056426331,
"grad_norm": 0.31289847469282805,
"learning_rate": 2.787075778707578e-05,
"loss": 0.3965,
"step": 1193
},
{
"epoch": 1.4965517241379311,
"grad_norm": 0.29509229874495807,
"learning_rate": 2.784751278475128e-05,
"loss": 0.3608,
"step": 1194
},
{
"epoch": 1.497805642633229,
"grad_norm": 0.28842010048811634,
"learning_rate": 2.7824267782426777e-05,
"loss": 0.3625,
"step": 1195
},
{
"epoch": 1.4990595611285267,
"grad_norm": 0.3044672058862012,
"learning_rate": 2.780102278010228e-05,
"loss": 0.3646,
"step": 1196
},
{
"epoch": 1.5003134796238244,
"grad_norm": 0.3519622856297419,
"learning_rate": 2.777777777777778e-05,
"loss": 0.3703,
"step": 1197
},
{
"epoch": 1.5015673981191222,
"grad_norm": 0.37922131273225357,
"learning_rate": 2.775453277545328e-05,
"loss": 0.365,
"step": 1198
},
{
"epoch": 1.50282131661442,
"grad_norm": 0.3137698494524432,
"learning_rate": 2.7731287773128778e-05,
"loss": 0.3897,
"step": 1199
},
{
"epoch": 1.5040752351097177,
"grad_norm": 0.3593878207092214,
"learning_rate": 2.7708042770804276e-05,
"loss": 0.3771,
"step": 1200
},
{
"epoch": 1.5053291536050155,
"grad_norm": 0.3725603258178067,
"learning_rate": 2.7684797768479774e-05,
"loss": 0.365,
"step": 1201
},
{
"epoch": 1.5065830721003135,
"grad_norm": 0.31422888373756097,
"learning_rate": 2.766155276615528e-05,
"loss": 0.3768,
"step": 1202
},
{
"epoch": 1.5078369905956113,
"grad_norm": 0.3029392200737052,
"learning_rate": 2.763830776383078e-05,
"loss": 0.3823,
"step": 1203
},
{
"epoch": 1.509090909090909,
"grad_norm": 0.3847242907960294,
"learning_rate": 2.7615062761506278e-05,
"loss": 0.3786,
"step": 1204
},
{
"epoch": 1.510344827586207,
"grad_norm": 0.34308644348612977,
"learning_rate": 2.7591817759181775e-05,
"loss": 0.386,
"step": 1205
},
{
"epoch": 1.5115987460815048,
"grad_norm": 0.33720515161183184,
"learning_rate": 2.7568572756857277e-05,
"loss": 0.3773,
"step": 1206
},
{
"epoch": 1.5128526645768026,
"grad_norm": 0.32368210287012056,
"learning_rate": 2.7545327754532778e-05,
"loss": 0.3872,
"step": 1207
},
{
"epoch": 1.5141065830721003,
"grad_norm": 0.363928244942658,
"learning_rate": 2.752208275220828e-05,
"loss": 0.367,
"step": 1208
},
{
"epoch": 1.515360501567398,
"grad_norm": 0.3470457704665735,
"learning_rate": 2.7498837749883777e-05,
"loss": 0.3593,
"step": 1209
},
{
"epoch": 1.5166144200626959,
"grad_norm": 0.30376450027050594,
"learning_rate": 2.7475592747559275e-05,
"loss": 0.3831,
"step": 1210
},
{
"epoch": 1.5178683385579936,
"grad_norm": 0.467513939507491,
"learning_rate": 2.7452347745234776e-05,
"loss": 0.3484,
"step": 1211
},
{
"epoch": 1.5191222570532914,
"grad_norm": 0.2926291990842129,
"learning_rate": 2.7429102742910274e-05,
"loss": 0.3452,
"step": 1212
},
{
"epoch": 1.5203761755485894,
"grad_norm": 0.4149007300722941,
"learning_rate": 2.740585774058578e-05,
"loss": 0.3754,
"step": 1213
},
{
"epoch": 1.5216300940438872,
"grad_norm": 0.3648967019678895,
"learning_rate": 2.7382612738261276e-05,
"loss": 0.3855,
"step": 1214
},
{
"epoch": 1.522884012539185,
"grad_norm": 0.3560842602690653,
"learning_rate": 2.7359367735936774e-05,
"loss": 0.3483,
"step": 1215
},
{
"epoch": 1.524137931034483,
"grad_norm": 0.3919176750938763,
"learning_rate": 2.7336122733612275e-05,
"loss": 0.362,
"step": 1216
},
{
"epoch": 1.5253918495297807,
"grad_norm": 0.3443869161327948,
"learning_rate": 2.7312877731287773e-05,
"loss": 0.3614,
"step": 1217
},
{
"epoch": 1.5266457680250785,
"grad_norm": 0.28083927451491336,
"learning_rate": 2.7289632728963278e-05,
"loss": 0.3791,
"step": 1218
},
{
"epoch": 1.5278996865203762,
"grad_norm": 0.42444479822815556,
"learning_rate": 2.7266387726638776e-05,
"loss": 0.3658,
"step": 1219
},
{
"epoch": 1.529153605015674,
"grad_norm": 0.3470287825220272,
"learning_rate": 2.7243142724314273e-05,
"loss": 0.3722,
"step": 1220
},
{
"epoch": 1.5304075235109718,
"grad_norm": 0.32850404489825025,
"learning_rate": 2.7219897721989775e-05,
"loss": 0.3394,
"step": 1221
},
{
"epoch": 1.5316614420062695,
"grad_norm": 0.3306511125285833,
"learning_rate": 2.7196652719665273e-05,
"loss": 0.3765,
"step": 1222
},
{
"epoch": 1.5329153605015673,
"grad_norm": 0.2948528994165521,
"learning_rate": 2.717340771734077e-05,
"loss": 0.3709,
"step": 1223
},
{
"epoch": 1.534169278996865,
"grad_norm": 0.3236257011938977,
"learning_rate": 2.7150162715016275e-05,
"loss": 0.3664,
"step": 1224
},
{
"epoch": 1.535423197492163,
"grad_norm": 0.317396882972685,
"learning_rate": 2.7126917712691773e-05,
"loss": 0.3664,
"step": 1225
},
{
"epoch": 1.5366771159874608,
"grad_norm": 0.3558511513087048,
"learning_rate": 2.7103672710367274e-05,
"loss": 0.3656,
"step": 1226
},
{
"epoch": 1.5379310344827586,
"grad_norm": 0.282223108367491,
"learning_rate": 2.7080427708042772e-05,
"loss": 0.3767,
"step": 1227
},
{
"epoch": 1.5391849529780566,
"grad_norm": 0.2966682348062727,
"learning_rate": 2.705718270571827e-05,
"loss": 0.362,
"step": 1228
},
{
"epoch": 1.5404388714733543,
"grad_norm": 0.33017968699801015,
"learning_rate": 2.7033937703393768e-05,
"loss": 0.3591,
"step": 1229
},
{
"epoch": 1.541692789968652,
"grad_norm": 0.28954649487215567,
"learning_rate": 2.7010692701069272e-05,
"loss": 0.3578,
"step": 1230
},
{
"epoch": 1.5429467084639499,
"grad_norm": 0.29952497302846415,
"learning_rate": 2.6987447698744773e-05,
"loss": 0.3767,
"step": 1231
},
{
"epoch": 1.5442006269592476,
"grad_norm": 0.3400513827030496,
"learning_rate": 2.696420269642027e-05,
"loss": 0.381,
"step": 1232
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.3042649482702414,
"learning_rate": 2.694095769409577e-05,
"loss": 0.4042,
"step": 1233
},
{
"epoch": 1.5467084639498432,
"grad_norm": 0.4007447191062732,
"learning_rate": 2.6917712691771267e-05,
"loss": 0.3782,
"step": 1234
},
{
"epoch": 1.547962382445141,
"grad_norm": 0.2804770147927339,
"learning_rate": 2.689446768944677e-05,
"loss": 0.3546,
"step": 1235
},
{
"epoch": 1.5492163009404387,
"grad_norm": 0.31186163517561927,
"learning_rate": 2.6871222687122273e-05,
"loss": 0.3682,
"step": 1236
},
{
"epoch": 1.5504702194357367,
"grad_norm": 0.3335105065651361,
"learning_rate": 2.684797768479777e-05,
"loss": 0.3681,
"step": 1237
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.3115161106367867,
"learning_rate": 2.682473268247327e-05,
"loss": 0.3835,
"step": 1238
},
{
"epoch": 1.5529780564263322,
"grad_norm": 0.33460195526403075,
"learning_rate": 2.680148768014877e-05,
"loss": 0.3913,
"step": 1239
},
{
"epoch": 1.5542319749216302,
"grad_norm": 0.3016734090218823,
"learning_rate": 2.6778242677824267e-05,
"loss": 0.3553,
"step": 1240
},
{
"epoch": 1.555485893416928,
"grad_norm": 0.32026413466320425,
"learning_rate": 2.6754997675499772e-05,
"loss": 0.3781,
"step": 1241
},
{
"epoch": 1.5567398119122258,
"grad_norm": 0.29534944847410155,
"learning_rate": 2.673175267317527e-05,
"loss": 0.3885,
"step": 1242
},
{
"epoch": 1.5579937304075235,
"grad_norm": 0.2949347402611695,
"learning_rate": 2.6708507670850768e-05,
"loss": 0.3767,
"step": 1243
},
{
"epoch": 1.5592476489028213,
"grad_norm": 0.3410760087836751,
"learning_rate": 2.668526266852627e-05,
"loss": 0.352,
"step": 1244
},
{
"epoch": 1.560501567398119,
"grad_norm": 0.25151120600013593,
"learning_rate": 2.6662017666201767e-05,
"loss": 0.3768,
"step": 1245
},
{
"epoch": 1.5617554858934168,
"grad_norm": 0.3232715544908255,
"learning_rate": 2.6638772663877265e-05,
"loss": 0.3622,
"step": 1246
},
{
"epoch": 1.5630094043887146,
"grad_norm": 0.28181449105499473,
"learning_rate": 2.661552766155277e-05,
"loss": 0.3627,
"step": 1247
},
{
"epoch": 1.5642633228840124,
"grad_norm": 0.3156619672768744,
"learning_rate": 2.6592282659228267e-05,
"loss": 0.3816,
"step": 1248
},
{
"epoch": 1.5655172413793104,
"grad_norm": 0.2704869472532353,
"learning_rate": 2.656903765690377e-05,
"loss": 0.3886,
"step": 1249
},
{
"epoch": 1.5667711598746081,
"grad_norm": 0.32012277258527566,
"learning_rate": 2.6545792654579266e-05,
"loss": 0.3684,
"step": 1250
},
{
"epoch": 1.5680250783699061,
"grad_norm": 0.29679513268152086,
"learning_rate": 2.6522547652254764e-05,
"loss": 0.3549,
"step": 1251
},
{
"epoch": 1.569278996865204,
"grad_norm": 0.32333853062339724,
"learning_rate": 2.649930264993027e-05,
"loss": 0.3706,
"step": 1252
},
{
"epoch": 1.5705329153605017,
"grad_norm": 0.29972880999781026,
"learning_rate": 2.6476057647605766e-05,
"loss": 0.3672,
"step": 1253
},
{
"epoch": 1.5717868338557994,
"grad_norm": 0.3392277557366304,
"learning_rate": 2.6452812645281268e-05,
"loss": 0.3876,
"step": 1254
},
{
"epoch": 1.5730407523510972,
"grad_norm": 0.2822933205889421,
"learning_rate": 2.6429567642956766e-05,
"loss": 0.3701,
"step": 1255
},
{
"epoch": 1.574294670846395,
"grad_norm": 0.2794754114547274,
"learning_rate": 2.6406322640632263e-05,
"loss": 0.3618,
"step": 1256
},
{
"epoch": 1.5755485893416927,
"grad_norm": 0.33999292452636615,
"learning_rate": 2.638307763830776e-05,
"loss": 0.3737,
"step": 1257
},
{
"epoch": 1.5768025078369905,
"grad_norm": 0.2849582210564151,
"learning_rate": 2.6359832635983266e-05,
"loss": 0.3786,
"step": 1258
},
{
"epoch": 1.5780564263322883,
"grad_norm": 0.3059511918146187,
"learning_rate": 2.6336587633658767e-05,
"loss": 0.3667,
"step": 1259
},
{
"epoch": 1.5793103448275863,
"grad_norm": 0.31328583003765426,
"learning_rate": 2.6313342631334265e-05,
"loss": 0.3851,
"step": 1260
},
{
"epoch": 1.580564263322884,
"grad_norm": 0.2742013185190023,
"learning_rate": 2.6290097629009763e-05,
"loss": 0.3601,
"step": 1261
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.2928946142542193,
"learning_rate": 2.626685262668526e-05,
"loss": 0.3705,
"step": 1262
},
{
"epoch": 1.5830721003134798,
"grad_norm": 0.3366573792579463,
"learning_rate": 2.6243607624360762e-05,
"loss": 0.3613,
"step": 1263
},
{
"epoch": 1.5843260188087775,
"grad_norm": 0.3111749028049115,
"learning_rate": 2.6220362622036266e-05,
"loss": 0.3641,
"step": 1264
},
{
"epoch": 1.5855799373040753,
"grad_norm": 0.3260748041545167,
"learning_rate": 2.6197117619711764e-05,
"loss": 0.3663,
"step": 1265
},
{
"epoch": 1.586833855799373,
"grad_norm": 0.3301301988585554,
"learning_rate": 2.6173872617387262e-05,
"loss": 0.3696,
"step": 1266
},
{
"epoch": 1.5880877742946709,
"grad_norm": 0.3196176256456356,
"learning_rate": 2.615062761506276e-05,
"loss": 0.3736,
"step": 1267
},
{
"epoch": 1.5893416927899686,
"grad_norm": 0.32634556842314766,
"learning_rate": 2.612738261273826e-05,
"loss": 0.3701,
"step": 1268
},
{
"epoch": 1.5905956112852664,
"grad_norm": 0.3145428077717483,
"learning_rate": 2.6104137610413766e-05,
"loss": 0.3562,
"step": 1269
},
{
"epoch": 1.5918495297805642,
"grad_norm": 0.30679675400083434,
"learning_rate": 2.6080892608089264e-05,
"loss": 0.3903,
"step": 1270
},
{
"epoch": 1.593103448275862,
"grad_norm": 0.30580744765757983,
"learning_rate": 2.605764760576476e-05,
"loss": 0.3452,
"step": 1271
},
{
"epoch": 1.59435736677116,
"grad_norm": 0.34995114038152003,
"learning_rate": 2.6034402603440263e-05,
"loss": 0.3668,
"step": 1272
},
{
"epoch": 1.5956112852664577,
"grad_norm": 0.2852215084619047,
"learning_rate": 2.601115760111576e-05,
"loss": 0.3556,
"step": 1273
},
{
"epoch": 1.5968652037617554,
"grad_norm": 0.32308107680121567,
"learning_rate": 2.598791259879126e-05,
"loss": 0.3912,
"step": 1274
},
{
"epoch": 1.5981191222570534,
"grad_norm": 0.3117997009705635,
"learning_rate": 2.5964667596466763e-05,
"loss": 0.3653,
"step": 1275
},
{
"epoch": 1.5993730407523512,
"grad_norm": 0.30191570863047035,
"learning_rate": 2.594142259414226e-05,
"loss": 0.3936,
"step": 1276
},
{
"epoch": 1.600626959247649,
"grad_norm": 0.3046298506288419,
"learning_rate": 2.5918177591817762e-05,
"loss": 0.3847,
"step": 1277
},
{
"epoch": 1.6018808777429467,
"grad_norm": 0.36988414539690406,
"learning_rate": 2.589493258949326e-05,
"loss": 0.371,
"step": 1278
},
{
"epoch": 1.6031347962382445,
"grad_norm": 0.2971733303053073,
"learning_rate": 2.5871687587168758e-05,
"loss": 0.3627,
"step": 1279
},
{
"epoch": 1.6043887147335423,
"grad_norm": 0.29567444763209766,
"learning_rate": 2.5848442584844262e-05,
"loss": 0.3679,
"step": 1280
},
{
"epoch": 1.60564263322884,
"grad_norm": 0.2769804853752955,
"learning_rate": 2.582519758251976e-05,
"loss": 0.3433,
"step": 1281
},
{
"epoch": 1.6068965517241378,
"grad_norm": 0.27828759869368747,
"learning_rate": 2.580195258019526e-05,
"loss": 0.3733,
"step": 1282
},
{
"epoch": 1.6081504702194356,
"grad_norm": 0.3019720415350354,
"learning_rate": 2.577870757787076e-05,
"loss": 0.3599,
"step": 1283
},
{
"epoch": 1.6094043887147336,
"grad_norm": 0.28227012399728313,
"learning_rate": 2.5755462575546257e-05,
"loss": 0.3877,
"step": 1284
},
{
"epoch": 1.6106583072100313,
"grad_norm": 0.2934505881387088,
"learning_rate": 2.5732217573221755e-05,
"loss": 0.3631,
"step": 1285
},
{
"epoch": 1.611912225705329,
"grad_norm": 0.30224719444926684,
"learning_rate": 2.570897257089726e-05,
"loss": 0.3575,
"step": 1286
},
{
"epoch": 1.613166144200627,
"grad_norm": 0.29632858940878215,
"learning_rate": 2.568572756857276e-05,
"loss": 0.3896,
"step": 1287
},
{
"epoch": 1.6144200626959249,
"grad_norm": 0.3056425115185049,
"learning_rate": 2.566248256624826e-05,
"loss": 0.372,
"step": 1288
},
{
"epoch": 1.6156739811912226,
"grad_norm": 0.349744706215169,
"learning_rate": 2.5639237563923756e-05,
"loss": 0.3826,
"step": 1289
},
{
"epoch": 1.6169278996865204,
"grad_norm": 0.30823992425925817,
"learning_rate": 2.5615992561599254e-05,
"loss": 0.3667,
"step": 1290
},
{
"epoch": 1.6181818181818182,
"grad_norm": 0.28200370076099807,
"learning_rate": 2.5592747559274755e-05,
"loss": 0.3822,
"step": 1291
},
{
"epoch": 1.619435736677116,
"grad_norm": 0.27893476020433033,
"learning_rate": 2.556950255695026e-05,
"loss": 0.3808,
"step": 1292
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.3192382372713905,
"learning_rate": 2.5546257554625758e-05,
"loss": 0.364,
"step": 1293
},
{
"epoch": 1.6219435736677115,
"grad_norm": 0.304873697957866,
"learning_rate": 2.5523012552301256e-05,
"loss": 0.3712,
"step": 1294
},
{
"epoch": 1.6231974921630095,
"grad_norm": 0.28292354127195873,
"learning_rate": 2.5499767549976754e-05,
"loss": 0.3728,
"step": 1295
},
{
"epoch": 1.6244514106583072,
"grad_norm": 0.27925862840608556,
"learning_rate": 2.5476522547652255e-05,
"loss": 0.3667,
"step": 1296
},
{
"epoch": 1.625705329153605,
"grad_norm": 0.34026837592203407,
"learning_rate": 2.545327754532776e-05,
"loss": 0.3853,
"step": 1297
},
{
"epoch": 1.626959247648903,
"grad_norm": 0.2697763085176464,
"learning_rate": 2.5430032543003257e-05,
"loss": 0.3529,
"step": 1298
},
{
"epoch": 1.6282131661442008,
"grad_norm": 0.32451527974487243,
"learning_rate": 2.5406787540678755e-05,
"loss": 0.3538,
"step": 1299
},
{
"epoch": 1.6294670846394985,
"grad_norm": 0.3596905323874691,
"learning_rate": 2.5383542538354253e-05,
"loss": 0.3686,
"step": 1300
},
{
"epoch": 1.6307210031347963,
"grad_norm": 0.3060017422009383,
"learning_rate": 2.5360297536029754e-05,
"loss": 0.3653,
"step": 1301
},
{
"epoch": 1.631974921630094,
"grad_norm": 0.30767801371122067,
"learning_rate": 2.5337052533705252e-05,
"loss": 0.3701,
"step": 1302
},
{
"epoch": 1.6332288401253918,
"grad_norm": 0.3386303292351153,
"learning_rate": 2.5313807531380757e-05,
"loss": 0.3618,
"step": 1303
},
{
"epoch": 1.6344827586206896,
"grad_norm": 0.31511975493890587,
"learning_rate": 2.5290562529056255e-05,
"loss": 0.3855,
"step": 1304
},
{
"epoch": 1.6357366771159874,
"grad_norm": 0.3577982128947377,
"learning_rate": 2.5267317526731756e-05,
"loss": 0.3602,
"step": 1305
},
{
"epoch": 1.6369905956112851,
"grad_norm": 0.26724857793468815,
"learning_rate": 2.5244072524407254e-05,
"loss": 0.3549,
"step": 1306
},
{
"epoch": 1.6382445141065831,
"grad_norm": 0.3661675693470131,
"learning_rate": 2.522082752208275e-05,
"loss": 0.3546,
"step": 1307
},
{
"epoch": 1.6394984326018809,
"grad_norm": 0.273282807649188,
"learning_rate": 2.519758251975825e-05,
"loss": 0.382,
"step": 1308
},
{
"epoch": 1.6407523510971787,
"grad_norm": 0.3392800593037572,
"learning_rate": 2.5174337517433754e-05,
"loss": 0.3623,
"step": 1309
},
{
"epoch": 1.6420062695924766,
"grad_norm": 0.34950173289304776,
"learning_rate": 2.5151092515109255e-05,
"loss": 0.3468,
"step": 1310
},
{
"epoch": 1.6432601880877744,
"grad_norm": 0.324944508854198,
"learning_rate": 2.5127847512784753e-05,
"loss": 0.3691,
"step": 1311
},
{
"epoch": 1.6445141065830722,
"grad_norm": 0.3651079050299609,
"learning_rate": 2.510460251046025e-05,
"loss": 0.3931,
"step": 1312
},
{
"epoch": 1.64576802507837,
"grad_norm": 0.3287933864768733,
"learning_rate": 2.508135750813575e-05,
"loss": 0.3526,
"step": 1313
},
{
"epoch": 1.6470219435736677,
"grad_norm": 0.3012322066240432,
"learning_rate": 2.5058112505811253e-05,
"loss": 0.3406,
"step": 1314
},
{
"epoch": 1.6482758620689655,
"grad_norm": 0.4336694039457402,
"learning_rate": 2.5034867503486754e-05,
"loss": 0.3864,
"step": 1315
},
{
"epoch": 1.6495297805642632,
"grad_norm": 0.35659750124043077,
"learning_rate": 2.5011622501162252e-05,
"loss": 0.3705,
"step": 1316
},
{
"epoch": 1.650783699059561,
"grad_norm": 0.3411772125975015,
"learning_rate": 2.498837749883775e-05,
"loss": 0.3621,
"step": 1317
},
{
"epoch": 1.6520376175548588,
"grad_norm": 0.35737587239560864,
"learning_rate": 2.496513249651325e-05,
"loss": 0.3911,
"step": 1318
},
{
"epoch": 1.6532915360501568,
"grad_norm": 0.3870698176719414,
"learning_rate": 2.4941887494188753e-05,
"loss": 0.356,
"step": 1319
},
{
"epoch": 1.6545454545454545,
"grad_norm": 0.3072731667805527,
"learning_rate": 2.491864249186425e-05,
"loss": 0.3595,
"step": 1320
},
{
"epoch": 1.6557993730407523,
"grad_norm": 0.3737750784281979,
"learning_rate": 2.489539748953975e-05,
"loss": 0.3627,
"step": 1321
},
{
"epoch": 1.6570532915360503,
"grad_norm": 0.3297813226259859,
"learning_rate": 2.487215248721525e-05,
"loss": 0.3714,
"step": 1322
},
{
"epoch": 1.658307210031348,
"grad_norm": 0.30484493799359275,
"learning_rate": 2.4848907484890747e-05,
"loss": 0.3971,
"step": 1323
},
{
"epoch": 1.6595611285266458,
"grad_norm": 0.3419619355216738,
"learning_rate": 2.4825662482566252e-05,
"loss": 0.3817,
"step": 1324
},
{
"epoch": 1.6608150470219436,
"grad_norm": 0.351155455265902,
"learning_rate": 2.480241748024175e-05,
"loss": 0.3579,
"step": 1325
},
{
"epoch": 1.6620689655172414,
"grad_norm": 0.3345159514445059,
"learning_rate": 2.4779172477917248e-05,
"loss": 0.3673,
"step": 1326
},
{
"epoch": 1.6633228840125391,
"grad_norm": 0.36387995218245983,
"learning_rate": 2.475592747559275e-05,
"loss": 0.3499,
"step": 1327
},
{
"epoch": 1.664576802507837,
"grad_norm": 0.3805220459745232,
"learning_rate": 2.4732682473268247e-05,
"loss": 0.3726,
"step": 1328
},
{
"epoch": 1.6658307210031347,
"grad_norm": 0.29330857264075166,
"learning_rate": 2.4709437470943748e-05,
"loss": 0.3675,
"step": 1329
},
{
"epoch": 1.6670846394984324,
"grad_norm": 0.36267869727655133,
"learning_rate": 2.468619246861925e-05,
"loss": 0.3899,
"step": 1330
},
{
"epoch": 1.6683385579937304,
"grad_norm": 0.4527636731801724,
"learning_rate": 2.4662947466294747e-05,
"loss": 0.3562,
"step": 1331
},
{
"epoch": 1.6695924764890282,
"grad_norm": 0.2883546907493381,
"learning_rate": 2.4639702463970248e-05,
"loss": 0.3667,
"step": 1332
},
{
"epoch": 1.670846394984326,
"grad_norm": 0.46922574664611116,
"learning_rate": 2.461645746164575e-05,
"loss": 0.3891,
"step": 1333
},
{
"epoch": 1.672100313479624,
"grad_norm": 0.3361443706321037,
"learning_rate": 2.4593212459321247e-05,
"loss": 0.3879,
"step": 1334
},
{
"epoch": 1.6733542319749217,
"grad_norm": 0.3905302199753036,
"learning_rate": 2.456996745699675e-05,
"loss": 0.3654,
"step": 1335
},
{
"epoch": 1.6746081504702195,
"grad_norm": 0.34530546732827877,
"learning_rate": 2.4546722454672246e-05,
"loss": 0.3656,
"step": 1336
},
{
"epoch": 1.6758620689655173,
"grad_norm": 0.3564766280381503,
"learning_rate": 2.4523477452347744e-05,
"loss": 0.3993,
"step": 1337
},
{
"epoch": 1.677115987460815,
"grad_norm": 0.38898553216993953,
"learning_rate": 2.450023245002325e-05,
"loss": 0.3742,
"step": 1338
},
{
"epoch": 1.6783699059561128,
"grad_norm": 0.3279151254751747,
"learning_rate": 2.4476987447698747e-05,
"loss": 0.3528,
"step": 1339
},
{
"epoch": 1.6796238244514106,
"grad_norm": 0.3468511646158206,
"learning_rate": 2.4453742445374244e-05,
"loss": 0.37,
"step": 1340
},
{
"epoch": 1.6808777429467083,
"grad_norm": 0.34729825674731196,
"learning_rate": 2.4430497443049746e-05,
"loss": 0.3935,
"step": 1341
},
{
"epoch": 1.6821316614420063,
"grad_norm": 0.37279475429381376,
"learning_rate": 2.4407252440725244e-05,
"loss": 0.3466,
"step": 1342
},
{
"epoch": 1.683385579937304,
"grad_norm": 0.29759455056463047,
"learning_rate": 2.4384007438400745e-05,
"loss": 0.394,
"step": 1343
},
{
"epoch": 1.6846394984326019,
"grad_norm": 0.34196537221250456,
"learning_rate": 2.4360762436076246e-05,
"loss": 0.3961,
"step": 1344
},
{
"epoch": 1.6858934169278998,
"grad_norm": 0.35739061507611486,
"learning_rate": 2.4337517433751744e-05,
"loss": 0.3562,
"step": 1345
},
{
"epoch": 1.6871473354231976,
"grad_norm": 0.36429635795994103,
"learning_rate": 2.431427243142724e-05,
"loss": 0.3836,
"step": 1346
},
{
"epoch": 1.6884012539184954,
"grad_norm": 0.3024443453278967,
"learning_rate": 2.4291027429102743e-05,
"loss": 0.3775,
"step": 1347
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.38247560896556215,
"learning_rate": 2.4267782426778244e-05,
"loss": 0.3856,
"step": 1348
},
{
"epoch": 1.690909090909091,
"grad_norm": 0.3197278676342063,
"learning_rate": 2.4244537424453745e-05,
"loss": 0.3798,
"step": 1349
},
{
"epoch": 1.6921630094043887,
"grad_norm": 0.3885909941940012,
"learning_rate": 2.4221292422129243e-05,
"loss": 0.3749,
"step": 1350
},
{
"epoch": 1.6934169278996865,
"grad_norm": 0.3531165243945794,
"learning_rate": 2.419804741980474e-05,
"loss": 0.3862,
"step": 1351
},
{
"epoch": 1.6946708463949842,
"grad_norm": 0.4513662236911339,
"learning_rate": 2.4174802417480246e-05,
"loss": 0.3734,
"step": 1352
},
{
"epoch": 1.695924764890282,
"grad_norm": 0.29851398009362173,
"learning_rate": 2.4151557415155743e-05,
"loss": 0.3712,
"step": 1353
},
{
"epoch": 1.69717868338558,
"grad_norm": 0.3273737170821241,
"learning_rate": 2.412831241283124e-05,
"loss": 0.3844,
"step": 1354
},
{
"epoch": 1.6984326018808777,
"grad_norm": 0.3547848799073937,
"learning_rate": 2.4105067410506743e-05,
"loss": 0.362,
"step": 1355
},
{
"epoch": 1.6996865203761755,
"grad_norm": 0.2645742941114308,
"learning_rate": 2.408182240818224e-05,
"loss": 0.3675,
"step": 1356
},
{
"epoch": 1.7009404388714735,
"grad_norm": 0.28308830152836767,
"learning_rate": 2.405857740585774e-05,
"loss": 0.3827,
"step": 1357
},
{
"epoch": 1.7021943573667713,
"grad_norm": 0.29715034621091674,
"learning_rate": 2.4035332403533243e-05,
"loss": 0.3697,
"step": 1358
},
{
"epoch": 1.703448275862069,
"grad_norm": 0.29834139450336045,
"learning_rate": 2.401208740120874e-05,
"loss": 0.3814,
"step": 1359
},
{
"epoch": 1.7047021943573668,
"grad_norm": 0.3103521198042419,
"learning_rate": 2.398884239888424e-05,
"loss": 0.3767,
"step": 1360
},
{
"epoch": 1.7059561128526646,
"grad_norm": 0.29741778052247986,
"learning_rate": 2.396559739655974e-05,
"loss": 0.3625,
"step": 1361
},
{
"epoch": 1.7072100313479623,
"grad_norm": 0.2781826582946832,
"learning_rate": 2.394235239423524e-05,
"loss": 0.3986,
"step": 1362
},
{
"epoch": 1.70846394984326,
"grad_norm": 0.31633066134677895,
"learning_rate": 2.3919107391910742e-05,
"loss": 0.3491,
"step": 1363
},
{
"epoch": 1.7097178683385579,
"grad_norm": 0.3163399806437815,
"learning_rate": 2.389586238958624e-05,
"loss": 0.3627,
"step": 1364
},
{
"epoch": 1.7109717868338556,
"grad_norm": 0.2736285543144872,
"learning_rate": 2.3872617387261738e-05,
"loss": 0.3679,
"step": 1365
},
{
"epoch": 1.7122257053291536,
"grad_norm": 0.2995860715564722,
"learning_rate": 2.3849372384937242e-05,
"loss": 0.3509,
"step": 1366
},
{
"epoch": 1.7134796238244514,
"grad_norm": 0.32219252852931046,
"learning_rate": 2.382612738261274e-05,
"loss": 0.4001,
"step": 1367
},
{
"epoch": 1.7147335423197492,
"grad_norm": 0.3296493984524979,
"learning_rate": 2.3802882380288238e-05,
"loss": 0.3918,
"step": 1368
},
{
"epoch": 1.7159874608150472,
"grad_norm": 0.3334896708903385,
"learning_rate": 2.377963737796374e-05,
"loss": 0.3889,
"step": 1369
},
{
"epoch": 1.717241379310345,
"grad_norm": 0.2774803962764674,
"learning_rate": 2.3756392375639237e-05,
"loss": 0.3741,
"step": 1370
},
{
"epoch": 1.7184952978056427,
"grad_norm": 0.31149731663691543,
"learning_rate": 2.373314737331474e-05,
"loss": 0.3697,
"step": 1371
},
{
"epoch": 1.7197492163009405,
"grad_norm": 0.3327716500988019,
"learning_rate": 2.370990237099024e-05,
"loss": 0.3756,
"step": 1372
},
{
"epoch": 1.7210031347962382,
"grad_norm": 0.297583278329512,
"learning_rate": 2.3686657368665737e-05,
"loss": 0.3729,
"step": 1373
},
{
"epoch": 1.722257053291536,
"grad_norm": 0.3451761070050823,
"learning_rate": 2.3663412366341235e-05,
"loss": 0.3891,
"step": 1374
},
{
"epoch": 1.7235109717868338,
"grad_norm": 0.3015864962985455,
"learning_rate": 2.3640167364016737e-05,
"loss": 0.3692,
"step": 1375
},
{
"epoch": 1.7247648902821315,
"grad_norm": 0.286962972105643,
"learning_rate": 2.3616922361692238e-05,
"loss": 0.3844,
"step": 1376
},
{
"epoch": 1.7260188087774293,
"grad_norm": 0.30868576027083955,
"learning_rate": 2.3593677359367736e-05,
"loss": 0.3843,
"step": 1377
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.26597808580517474,
"learning_rate": 2.3570432357043237e-05,
"loss": 0.3693,
"step": 1378
},
{
"epoch": 1.728526645768025,
"grad_norm": 0.3365938476685139,
"learning_rate": 2.3547187354718735e-05,
"loss": 0.3826,
"step": 1379
},
{
"epoch": 1.729780564263323,
"grad_norm": 0.29548330761818253,
"learning_rate": 2.3523942352394236e-05,
"loss": 0.3705,
"step": 1380
},
{
"epoch": 1.7310344827586208,
"grad_norm": 0.29119701660694275,
"learning_rate": 2.3500697350069737e-05,
"loss": 0.379,
"step": 1381
},
{
"epoch": 1.7322884012539186,
"grad_norm": 0.3475216514776969,
"learning_rate": 2.3477452347745235e-05,
"loss": 0.3681,
"step": 1382
},
{
"epoch": 1.7335423197492164,
"grad_norm": 0.33805407415192995,
"learning_rate": 2.3454207345420736e-05,
"loss": 0.3635,
"step": 1383
},
{
"epoch": 1.7347962382445141,
"grad_norm": 0.32582709271419186,
"learning_rate": 2.3430962343096234e-05,
"loss": 0.3841,
"step": 1384
},
{
"epoch": 1.736050156739812,
"grad_norm": 0.3707397985078461,
"learning_rate": 2.3407717340771735e-05,
"loss": 0.3576,
"step": 1385
},
{
"epoch": 1.7373040752351097,
"grad_norm": 0.3558921065610254,
"learning_rate": 2.3384472338447236e-05,
"loss": 0.3723,
"step": 1386
},
{
"epoch": 1.7385579937304074,
"grad_norm": 0.3178463778737209,
"learning_rate": 2.3361227336122734e-05,
"loss": 0.3748,
"step": 1387
},
{
"epoch": 1.7398119122257052,
"grad_norm": 0.33814954202438563,
"learning_rate": 2.3337982333798232e-05,
"loss": 0.3691,
"step": 1388
},
{
"epoch": 1.7410658307210032,
"grad_norm": 0.2824256405535515,
"learning_rate": 2.3314737331473733e-05,
"loss": 0.3606,
"step": 1389
},
{
"epoch": 1.742319749216301,
"grad_norm": 0.3388677433104451,
"learning_rate": 2.3291492329149235e-05,
"loss": 0.3851,
"step": 1390
},
{
"epoch": 1.7435736677115987,
"grad_norm": 0.2987010692636604,
"learning_rate": 2.3268247326824732e-05,
"loss": 0.3454,
"step": 1391
},
{
"epoch": 1.7448275862068967,
"grad_norm": 0.3148962903282769,
"learning_rate": 2.3245002324500234e-05,
"loss": 0.3661,
"step": 1392
},
{
"epoch": 1.7460815047021945,
"grad_norm": 0.2858844004005626,
"learning_rate": 2.322175732217573e-05,
"loss": 0.3686,
"step": 1393
},
{
"epoch": 1.7473354231974922,
"grad_norm": 0.2934286479415983,
"learning_rate": 2.3198512319851233e-05,
"loss": 0.3732,
"step": 1394
},
{
"epoch": 1.74858934169279,
"grad_norm": 0.3009998198254273,
"learning_rate": 2.3175267317526734e-05,
"loss": 0.333,
"step": 1395
},
{
"epoch": 1.7498432601880878,
"grad_norm": 0.2678436289503578,
"learning_rate": 2.3152022315202232e-05,
"loss": 0.3518,
"step": 1396
},
{
"epoch": 1.7510971786833855,
"grad_norm": 0.32306789707020284,
"learning_rate": 2.3128777312877733e-05,
"loss": 0.3702,
"step": 1397
},
{
"epoch": 1.7523510971786833,
"grad_norm": 0.2440966679143964,
"learning_rate": 2.310553231055323e-05,
"loss": 0.3557,
"step": 1398
},
{
"epoch": 1.753605015673981,
"grad_norm": 0.3615768932271486,
"learning_rate": 2.3082287308228732e-05,
"loss": 0.3654,
"step": 1399
},
{
"epoch": 1.7548589341692789,
"grad_norm": 0.3142026867370723,
"learning_rate": 2.3059042305904233e-05,
"loss": 0.3704,
"step": 1400
},
{
"epoch": 1.7561128526645768,
"grad_norm": 0.2832300365694455,
"learning_rate": 2.303579730357973e-05,
"loss": 0.3824,
"step": 1401
},
{
"epoch": 1.7573667711598746,
"grad_norm": 0.2553310943522007,
"learning_rate": 2.301255230125523e-05,
"loss": 0.3717,
"step": 1402
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.2565053579842024,
"learning_rate": 2.298930729893073e-05,
"loss": 0.3571,
"step": 1403
},
{
"epoch": 1.7598746081504704,
"grad_norm": 0.3387319137860899,
"learning_rate": 2.296606229660623e-05,
"loss": 0.3739,
"step": 1404
},
{
"epoch": 1.7611285266457681,
"grad_norm": 0.3260781762776723,
"learning_rate": 2.294281729428173e-05,
"loss": 0.3853,
"step": 1405
},
{
"epoch": 1.762382445141066,
"grad_norm": 0.26892651208197976,
"learning_rate": 2.291957229195723e-05,
"loss": 0.3481,
"step": 1406
},
{
"epoch": 1.7636363636363637,
"grad_norm": 0.31250090744618964,
"learning_rate": 2.289632728963273e-05,
"loss": 0.3819,
"step": 1407
},
{
"epoch": 1.7648902821316614,
"grad_norm": 0.35869789334578606,
"learning_rate": 2.287308228730823e-05,
"loss": 0.3925,
"step": 1408
},
{
"epoch": 1.7661442006269592,
"grad_norm": 0.2886997455521158,
"learning_rate": 2.284983728498373e-05,
"loss": 0.3785,
"step": 1409
},
{
"epoch": 1.767398119122257,
"grad_norm": 0.277507526836416,
"learning_rate": 2.282659228265923e-05,
"loss": 0.3707,
"step": 1410
},
{
"epoch": 1.7686520376175547,
"grad_norm": 0.2793645955905433,
"learning_rate": 2.280334728033473e-05,
"loss": 0.3515,
"step": 1411
},
{
"epoch": 1.7699059561128525,
"grad_norm": 0.2803340634791546,
"learning_rate": 2.2780102278010228e-05,
"loss": 0.3782,
"step": 1412
},
{
"epoch": 1.7711598746081505,
"grad_norm": 0.2796878740197575,
"learning_rate": 2.275685727568573e-05,
"loss": 0.3644,
"step": 1413
},
{
"epoch": 1.7724137931034483,
"grad_norm": 0.33816419223448935,
"learning_rate": 2.273361227336123e-05,
"loss": 0.3825,
"step": 1414
},
{
"epoch": 1.773667711598746,
"grad_norm": 0.29817394448501267,
"learning_rate": 2.2710367271036728e-05,
"loss": 0.3729,
"step": 1415
},
{
"epoch": 1.774921630094044,
"grad_norm": 0.28860745252804154,
"learning_rate": 2.2687122268712226e-05,
"loss": 0.3594,
"step": 1416
},
{
"epoch": 1.7761755485893418,
"grad_norm": 0.28292515546437086,
"learning_rate": 2.2663877266387727e-05,
"loss": 0.3648,
"step": 1417
},
{
"epoch": 1.7774294670846396,
"grad_norm": 0.3221455083238854,
"learning_rate": 2.264063226406323e-05,
"loss": 0.3513,
"step": 1418
},
{
"epoch": 1.7786833855799373,
"grad_norm": 0.3137959415926161,
"learning_rate": 2.2617387261738726e-05,
"loss": 0.3826,
"step": 1419
},
{
"epoch": 1.779937304075235,
"grad_norm": 0.3249760991303327,
"learning_rate": 2.2594142259414227e-05,
"loss": 0.3855,
"step": 1420
},
{
"epoch": 1.7811912225705329,
"grad_norm": 0.331004037331532,
"learning_rate": 2.2570897257089725e-05,
"loss": 0.3818,
"step": 1421
},
{
"epoch": 1.7824451410658306,
"grad_norm": 0.2701461776615874,
"learning_rate": 2.2547652254765226e-05,
"loss": 0.3725,
"step": 1422
},
{
"epoch": 1.7836990595611284,
"grad_norm": 0.36547260014387944,
"learning_rate": 2.2524407252440728e-05,
"loss": 0.3727,
"step": 1423
},
{
"epoch": 1.7849529780564264,
"grad_norm": 0.31164521678214785,
"learning_rate": 2.2501162250116226e-05,
"loss": 0.3628,
"step": 1424
},
{
"epoch": 1.7862068965517242,
"grad_norm": 0.3260959505599915,
"learning_rate": 2.2477917247791727e-05,
"loss": 0.3757,
"step": 1425
},
{
"epoch": 1.787460815047022,
"grad_norm": 0.36259022507630856,
"learning_rate": 2.2454672245467225e-05,
"loss": 0.3612,
"step": 1426
},
{
"epoch": 1.78871473354232,
"grad_norm": 0.2586395549376829,
"learning_rate": 2.2431427243142726e-05,
"loss": 0.3867,
"step": 1427
},
{
"epoch": 1.7899686520376177,
"grad_norm": 0.4054970407295695,
"learning_rate": 2.2408182240818227e-05,
"loss": 0.3611,
"step": 1428
},
{
"epoch": 1.7912225705329154,
"grad_norm": 0.26606172939153233,
"learning_rate": 2.2384937238493725e-05,
"loss": 0.3775,
"step": 1429
},
{
"epoch": 1.7924764890282132,
"grad_norm": 0.3470941393671849,
"learning_rate": 2.2361692236169223e-05,
"loss": 0.3804,
"step": 1430
},
{
"epoch": 1.793730407523511,
"grad_norm": 0.27856089173060883,
"learning_rate": 2.2338447233844724e-05,
"loss": 0.3796,
"step": 1431
},
{
"epoch": 1.7949843260188088,
"grad_norm": 0.30389323608490737,
"learning_rate": 2.2315202231520225e-05,
"loss": 0.3528,
"step": 1432
},
{
"epoch": 1.7962382445141065,
"grad_norm": 0.2804612932473104,
"learning_rate": 2.2291957229195723e-05,
"loss": 0.3517,
"step": 1433
},
{
"epoch": 1.7974921630094043,
"grad_norm": 0.3101765104354032,
"learning_rate": 2.2268712226871224e-05,
"loss": 0.3725,
"step": 1434
},
{
"epoch": 1.798746081504702,
"grad_norm": 0.2785437876810824,
"learning_rate": 2.2245467224546722e-05,
"loss": 0.3552,
"step": 1435
},
{
"epoch": 1.8,
"grad_norm": 0.30448638433852304,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.3715,
"step": 1436
},
{
"epoch": 1.8012539184952978,
"grad_norm": 0.3153110023426444,
"learning_rate": 2.2198977219897725e-05,
"loss": 0.3907,
"step": 1437
},
{
"epoch": 1.8025078369905956,
"grad_norm": 0.29444837435257554,
"learning_rate": 2.2175732217573222e-05,
"loss": 0.4088,
"step": 1438
},
{
"epoch": 1.8037617554858936,
"grad_norm": 0.3128358125634626,
"learning_rate": 2.2152487215248724e-05,
"loss": 0.37,
"step": 1439
},
{
"epoch": 1.8050156739811913,
"grad_norm": 0.25403764007137764,
"learning_rate": 2.212924221292422e-05,
"loss": 0.3657,
"step": 1440
},
{
"epoch": 1.806269592476489,
"grad_norm": 0.339803321172258,
"learning_rate": 2.2105997210599723e-05,
"loss": 0.3867,
"step": 1441
},
{
"epoch": 1.8075235109717869,
"grad_norm": 0.2907642059494259,
"learning_rate": 2.2082752208275224e-05,
"loss": 0.3884,
"step": 1442
},
{
"epoch": 1.8087774294670846,
"grad_norm": 0.28221536322031504,
"learning_rate": 2.2059507205950722e-05,
"loss": 0.3405,
"step": 1443
},
{
"epoch": 1.8100313479623824,
"grad_norm": 0.38973743270815375,
"learning_rate": 2.203626220362622e-05,
"loss": 0.3758,
"step": 1444
},
{
"epoch": 1.8112852664576802,
"grad_norm": 0.30654566564398156,
"learning_rate": 2.201301720130172e-05,
"loss": 0.3488,
"step": 1445
},
{
"epoch": 1.812539184952978,
"grad_norm": 0.3236711626381763,
"learning_rate": 2.1989772198977222e-05,
"loss": 0.3682,
"step": 1446
},
{
"epoch": 1.8137931034482757,
"grad_norm": 0.28076270904862144,
"learning_rate": 2.196652719665272e-05,
"loss": 0.372,
"step": 1447
},
{
"epoch": 1.8150470219435737,
"grad_norm": 0.28763822106248194,
"learning_rate": 2.194328219432822e-05,
"loss": 0.353,
"step": 1448
},
{
"epoch": 1.8163009404388715,
"grad_norm": 0.32379344858979753,
"learning_rate": 2.192003719200372e-05,
"loss": 0.3552,
"step": 1449
},
{
"epoch": 1.8175548589341692,
"grad_norm": 0.28094629830350715,
"learning_rate": 2.189679218967922e-05,
"loss": 0.3662,
"step": 1450
},
{
"epoch": 1.8188087774294672,
"grad_norm": 0.339277728732555,
"learning_rate": 2.187354718735472e-05,
"loss": 0.3901,
"step": 1451
},
{
"epoch": 1.820062695924765,
"grad_norm": 0.32994391267917134,
"learning_rate": 2.185030218503022e-05,
"loss": 0.3627,
"step": 1452
},
{
"epoch": 1.8213166144200628,
"grad_norm": 0.299207945307361,
"learning_rate": 2.1827057182705717e-05,
"loss": 0.3585,
"step": 1453
},
{
"epoch": 1.8225705329153605,
"grad_norm": 0.3128379730554713,
"learning_rate": 2.1803812180381218e-05,
"loss": 0.3738,
"step": 1454
},
{
"epoch": 1.8238244514106583,
"grad_norm": 0.3111282634532266,
"learning_rate": 2.178056717805672e-05,
"loss": 0.3832,
"step": 1455
},
{
"epoch": 1.825078369905956,
"grad_norm": 0.30577729399863385,
"learning_rate": 2.175732217573222e-05,
"loss": 0.3724,
"step": 1456
},
{
"epoch": 1.8263322884012538,
"grad_norm": 0.2812338483545835,
"learning_rate": 2.173407717340772e-05,
"loss": 0.3675,
"step": 1457
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.2851855662391339,
"learning_rate": 2.1710832171083216e-05,
"loss": 0.3557,
"step": 1458
},
{
"epoch": 1.8288401253918494,
"grad_norm": 0.2915073733194022,
"learning_rate": 2.1687587168758718e-05,
"loss": 0.3892,
"step": 1459
},
{
"epoch": 1.8300940438871474,
"grad_norm": 0.31565342834509724,
"learning_rate": 2.166434216643422e-05,
"loss": 0.3811,
"step": 1460
},
{
"epoch": 1.8313479623824451,
"grad_norm": 0.26509783820749083,
"learning_rate": 2.1641097164109717e-05,
"loss": 0.3463,
"step": 1461
},
{
"epoch": 1.832601880877743,
"grad_norm": 0.30913125908925554,
"learning_rate": 2.1617852161785218e-05,
"loss": 0.3561,
"step": 1462
},
{
"epoch": 1.8338557993730409,
"grad_norm": 0.2812585155461713,
"learning_rate": 2.1594607159460716e-05,
"loss": 0.3582,
"step": 1463
},
{
"epoch": 1.8351097178683387,
"grad_norm": 0.2682208141971448,
"learning_rate": 2.1571362157136217e-05,
"loss": 0.3741,
"step": 1464
},
{
"epoch": 1.8363636363636364,
"grad_norm": 0.35179405802676933,
"learning_rate": 2.1548117154811718e-05,
"loss": 0.3603,
"step": 1465
},
{
"epoch": 1.8376175548589342,
"grad_norm": 0.27384856779345024,
"learning_rate": 2.1524872152487216e-05,
"loss": 0.3807,
"step": 1466
},
{
"epoch": 1.838871473354232,
"grad_norm": 0.31096316059154866,
"learning_rate": 2.1501627150162714e-05,
"loss": 0.3879,
"step": 1467
},
{
"epoch": 1.8401253918495297,
"grad_norm": 0.3724161321957051,
"learning_rate": 2.1478382147838215e-05,
"loss": 0.3646,
"step": 1468
},
{
"epoch": 1.8413793103448275,
"grad_norm": 0.33912741441555716,
"learning_rate": 2.1455137145513716e-05,
"loss": 0.3685,
"step": 1469
},
{
"epoch": 1.8426332288401253,
"grad_norm": 0.3319786195105875,
"learning_rate": 2.1431892143189218e-05,
"loss": 0.3503,
"step": 1470
},
{
"epoch": 1.8438871473354232,
"grad_norm": 0.28716872904649443,
"learning_rate": 2.1408647140864715e-05,
"loss": 0.3654,
"step": 1471
},
{
"epoch": 1.845141065830721,
"grad_norm": 0.33513787941717216,
"learning_rate": 2.1385402138540213e-05,
"loss": 0.3724,
"step": 1472
},
{
"epoch": 1.8463949843260188,
"grad_norm": 0.3251288722920424,
"learning_rate": 2.1362157136215714e-05,
"loss": 0.3629,
"step": 1473
},
{
"epoch": 1.8476489028213168,
"grad_norm": 0.32120413464618575,
"learning_rate": 2.1338912133891216e-05,
"loss": 0.3524,
"step": 1474
},
{
"epoch": 1.8489028213166145,
"grad_norm": 0.31850613610700723,
"learning_rate": 2.1315667131566714e-05,
"loss": 0.3723,
"step": 1475
},
{
"epoch": 1.8501567398119123,
"grad_norm": 0.25763036942704703,
"learning_rate": 2.1292422129242215e-05,
"loss": 0.3828,
"step": 1476
},
{
"epoch": 1.85141065830721,
"grad_norm": 0.28187053837408677,
"learning_rate": 2.1269177126917713e-05,
"loss": 0.3725,
"step": 1477
},
{
"epoch": 1.8526645768025078,
"grad_norm": 0.29100629336751177,
"learning_rate": 2.1245932124593214e-05,
"loss": 0.3718,
"step": 1478
},
{
"epoch": 1.8539184952978056,
"grad_norm": 0.30948663195822285,
"learning_rate": 2.1222687122268715e-05,
"loss": 0.3715,
"step": 1479
},
{
"epoch": 1.8551724137931034,
"grad_norm": 0.29669076194325517,
"learning_rate": 2.1199442119944213e-05,
"loss": 0.3772,
"step": 1480
},
{
"epoch": 1.8564263322884011,
"grad_norm": 0.33685973598953634,
"learning_rate": 2.117619711761971e-05,
"loss": 0.3581,
"step": 1481
},
{
"epoch": 1.857680250783699,
"grad_norm": 0.31840275295093284,
"learning_rate": 2.1152952115295212e-05,
"loss": 0.3507,
"step": 1482
},
{
"epoch": 1.858934169278997,
"grad_norm": 0.35194330747558816,
"learning_rate": 2.1129707112970713e-05,
"loss": 0.3541,
"step": 1483
},
{
"epoch": 1.8601880877742947,
"grad_norm": 0.336980443855765,
"learning_rate": 2.110646211064621e-05,
"loss": 0.3787,
"step": 1484
},
{
"epoch": 1.8614420062695924,
"grad_norm": 0.3114433262636965,
"learning_rate": 2.1083217108321712e-05,
"loss": 0.3927,
"step": 1485
},
{
"epoch": 1.8626959247648904,
"grad_norm": 0.35527128015245135,
"learning_rate": 2.105997210599721e-05,
"loss": 0.3458,
"step": 1486
},
{
"epoch": 1.8639498432601882,
"grad_norm": 0.26937515084737335,
"learning_rate": 2.103672710367271e-05,
"loss": 0.3736,
"step": 1487
},
{
"epoch": 1.865203761755486,
"grad_norm": 0.31673355870717723,
"learning_rate": 2.1013482101348213e-05,
"loss": 0.3578,
"step": 1488
},
{
"epoch": 1.8664576802507837,
"grad_norm": 0.28083954597134003,
"learning_rate": 2.099023709902371e-05,
"loss": 0.3652,
"step": 1489
},
{
"epoch": 1.8677115987460815,
"grad_norm": 0.23277289433483409,
"learning_rate": 2.096699209669921e-05,
"loss": 0.3469,
"step": 1490
},
{
"epoch": 1.8689655172413793,
"grad_norm": 0.3674166267193647,
"learning_rate": 2.094374709437471e-05,
"loss": 0.3782,
"step": 1491
},
{
"epoch": 1.870219435736677,
"grad_norm": 0.255865536286177,
"learning_rate": 2.092050209205021e-05,
"loss": 0.3626,
"step": 1492
},
{
"epoch": 1.8714733542319748,
"grad_norm": 0.3098186724024676,
"learning_rate": 2.0897257089725712e-05,
"loss": 0.422,
"step": 1493
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.3586247055296875,
"learning_rate": 2.087401208740121e-05,
"loss": 0.4032,
"step": 1494
},
{
"epoch": 1.8739811912225706,
"grad_norm": 0.2701872437019008,
"learning_rate": 2.0850767085076708e-05,
"loss": 0.3844,
"step": 1495
},
{
"epoch": 1.8752351097178683,
"grad_norm": 0.3427401686147468,
"learning_rate": 2.082752208275221e-05,
"loss": 0.3568,
"step": 1496
},
{
"epoch": 1.876489028213166,
"grad_norm": 0.2848916270943444,
"learning_rate": 2.080427708042771e-05,
"loss": 0.3632,
"step": 1497
},
{
"epoch": 1.877742946708464,
"grad_norm": 0.35125303316430767,
"learning_rate": 2.0781032078103208e-05,
"loss": 0.3753,
"step": 1498
},
{
"epoch": 1.8789968652037619,
"grad_norm": 0.2649520696914522,
"learning_rate": 2.075778707577871e-05,
"loss": 0.3523,
"step": 1499
},
{
"epoch": 1.8802507836990596,
"grad_norm": 0.3609362980141422,
"learning_rate": 2.0734542073454207e-05,
"loss": 0.3758,
"step": 1500
},
{
"epoch": 1.8815047021943574,
"grad_norm": 0.3312529882787716,
"learning_rate": 2.0711297071129708e-05,
"loss": 0.396,
"step": 1501
},
{
"epoch": 1.8827586206896552,
"grad_norm": 0.2957788757128359,
"learning_rate": 2.068805206880521e-05,
"loss": 0.3747,
"step": 1502
},
{
"epoch": 1.884012539184953,
"grad_norm": 0.29402600218320174,
"learning_rate": 2.0664807066480707e-05,
"loss": 0.3399,
"step": 1503
},
{
"epoch": 1.8852664576802507,
"grad_norm": 0.2591908246621551,
"learning_rate": 2.064156206415621e-05,
"loss": 0.3803,
"step": 1504
},
{
"epoch": 1.8865203761755485,
"grad_norm": 0.3181006268984123,
"learning_rate": 2.0618317061831706e-05,
"loss": 0.3595,
"step": 1505
},
{
"epoch": 1.8877742946708462,
"grad_norm": 0.31082944907450727,
"learning_rate": 2.0595072059507208e-05,
"loss": 0.3582,
"step": 1506
},
{
"epoch": 1.8890282131661442,
"grad_norm": 0.3102727743860222,
"learning_rate": 2.057182705718271e-05,
"loss": 0.3646,
"step": 1507
},
{
"epoch": 1.890282131661442,
"grad_norm": 0.3353343298609206,
"learning_rate": 2.0548582054858207e-05,
"loss": 0.3619,
"step": 1508
},
{
"epoch": 1.8915360501567398,
"grad_norm": 0.27520214392102293,
"learning_rate": 2.0525337052533704e-05,
"loss": 0.3597,
"step": 1509
},
{
"epoch": 1.8927899686520377,
"grad_norm": 0.30783336206825795,
"learning_rate": 2.0502092050209206e-05,
"loss": 0.3517,
"step": 1510
},
{
"epoch": 1.8940438871473355,
"grad_norm": 0.3474974368738588,
"learning_rate": 2.0478847047884707e-05,
"loss": 0.3829,
"step": 1511
},
{
"epoch": 1.8952978056426333,
"grad_norm": 0.2744302344377205,
"learning_rate": 2.0455602045560205e-05,
"loss": 0.3716,
"step": 1512
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.2928884428769485,
"learning_rate": 2.0432357043235706e-05,
"loss": 0.3753,
"step": 1513
},
{
"epoch": 1.8978056426332288,
"grad_norm": 0.37592471382832643,
"learning_rate": 2.0409112040911204e-05,
"loss": 0.3704,
"step": 1514
},
{
"epoch": 1.8990595611285266,
"grad_norm": 0.3158686762855482,
"learning_rate": 2.0385867038586705e-05,
"loss": 0.3744,
"step": 1515
},
{
"epoch": 1.9003134796238244,
"grad_norm": 0.3659370950336824,
"learning_rate": 2.0362622036262206e-05,
"loss": 0.3707,
"step": 1516
},
{
"epoch": 1.9015673981191221,
"grad_norm": 0.2903814557049549,
"learning_rate": 2.0339377033937704e-05,
"loss": 0.3839,
"step": 1517
},
{
"epoch": 1.90282131661442,
"grad_norm": 0.3135557114194547,
"learning_rate": 2.0316132031613205e-05,
"loss": 0.3562,
"step": 1518
},
{
"epoch": 1.9040752351097179,
"grad_norm": 0.3606058059119533,
"learning_rate": 2.0292887029288703e-05,
"loss": 0.3576,
"step": 1519
},
{
"epoch": 1.9053291536050156,
"grad_norm": 0.27877245239721915,
"learning_rate": 2.0269642026964204e-05,
"loss": 0.3719,
"step": 1520
},
{
"epoch": 1.9065830721003136,
"grad_norm": 0.3675405247866679,
"learning_rate": 2.0246397024639706e-05,
"loss": 0.3898,
"step": 1521
},
{
"epoch": 1.9078369905956114,
"grad_norm": 0.34529408443476134,
"learning_rate": 2.0223152022315203e-05,
"loss": 0.3869,
"step": 1522
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.3175319552125657,
"learning_rate": 2.01999070199907e-05,
"loss": 0.3729,
"step": 1523
},
{
"epoch": 1.910344827586207,
"grad_norm": 0.27013214551827336,
"learning_rate": 2.0176662017666202e-05,
"loss": 0.3754,
"step": 1524
},
{
"epoch": 1.9115987460815047,
"grad_norm": 0.2801322422668281,
"learning_rate": 2.0153417015341704e-05,
"loss": 0.3516,
"step": 1525
},
{
"epoch": 1.9128526645768025,
"grad_norm": 0.28541183621948846,
"learning_rate": 2.01301720130172e-05,
"loss": 0.3889,
"step": 1526
},
{
"epoch": 1.9141065830721002,
"grad_norm": 0.272543286027577,
"learning_rate": 2.0106927010692703e-05,
"loss": 0.3739,
"step": 1527
},
{
"epoch": 1.915360501567398,
"grad_norm": 0.32415603661296616,
"learning_rate": 2.00836820083682e-05,
"loss": 0.3887,
"step": 1528
},
{
"epoch": 1.9166144200626958,
"grad_norm": 0.27301534720358955,
"learning_rate": 2.0060437006043702e-05,
"loss": 0.3659,
"step": 1529
},
{
"epoch": 1.9178683385579938,
"grad_norm": 0.281208859834148,
"learning_rate": 2.0037192003719203e-05,
"loss": 0.362,
"step": 1530
},
{
"epoch": 1.9191222570532915,
"grad_norm": 0.2942699483746739,
"learning_rate": 2.00139470013947e-05,
"loss": 0.3865,
"step": 1531
},
{
"epoch": 1.9203761755485893,
"grad_norm": 0.28300532567647857,
"learning_rate": 1.9990701999070202e-05,
"loss": 0.3542,
"step": 1532
},
{
"epoch": 1.9216300940438873,
"grad_norm": 0.34516470854970555,
"learning_rate": 1.99674569967457e-05,
"loss": 0.3752,
"step": 1533
},
{
"epoch": 1.922884012539185,
"grad_norm": 0.26054089757615567,
"learning_rate": 1.99442119944212e-05,
"loss": 0.3575,
"step": 1534
},
{
"epoch": 1.9241379310344828,
"grad_norm": 0.30214267498189995,
"learning_rate": 1.9920966992096702e-05,
"loss": 0.3688,
"step": 1535
},
{
"epoch": 1.9253918495297806,
"grad_norm": 0.2778871746368888,
"learning_rate": 1.98977219897722e-05,
"loss": 0.3525,
"step": 1536
},
{
"epoch": 1.9266457680250784,
"grad_norm": 0.27196913610926204,
"learning_rate": 1.9874476987447698e-05,
"loss": 0.3631,
"step": 1537
},
{
"epoch": 1.9278996865203761,
"grad_norm": 0.3035606828872824,
"learning_rate": 1.98512319851232e-05,
"loss": 0.364,
"step": 1538
},
{
"epoch": 1.929153605015674,
"grad_norm": 0.28549507241151667,
"learning_rate": 1.98279869827987e-05,
"loss": 0.3769,
"step": 1539
},
{
"epoch": 1.9304075235109717,
"grad_norm": 0.3225450019377393,
"learning_rate": 1.98047419804742e-05,
"loss": 0.3741,
"step": 1540
},
{
"epoch": 1.9316614420062694,
"grad_norm": 0.32078297268671735,
"learning_rate": 1.97814969781497e-05,
"loss": 0.3779,
"step": 1541
},
{
"epoch": 1.9329153605015674,
"grad_norm": 0.33475517118184944,
"learning_rate": 1.9758251975825197e-05,
"loss": 0.3701,
"step": 1542
},
{
"epoch": 1.9341692789968652,
"grad_norm": 0.3923773819803438,
"learning_rate": 1.97350069735007e-05,
"loss": 0.3677,
"step": 1543
},
{
"epoch": 1.935423197492163,
"grad_norm": 0.29920165325522446,
"learning_rate": 1.97117619711762e-05,
"loss": 0.3758,
"step": 1544
},
{
"epoch": 1.936677115987461,
"grad_norm": 0.28203813711075765,
"learning_rate": 1.9688516968851698e-05,
"loss": 0.3638,
"step": 1545
},
{
"epoch": 1.9379310344827587,
"grad_norm": 0.3289395781272717,
"learning_rate": 1.96652719665272e-05,
"loss": 0.3668,
"step": 1546
},
{
"epoch": 1.9391849529780565,
"grad_norm": 0.3029056546285961,
"learning_rate": 1.9642026964202697e-05,
"loss": 0.3947,
"step": 1547
},
{
"epoch": 1.9404388714733543,
"grad_norm": 0.4281606077377055,
"learning_rate": 1.9618781961878198e-05,
"loss": 0.3672,
"step": 1548
},
{
"epoch": 1.941692789968652,
"grad_norm": 0.36949669403311997,
"learning_rate": 1.95955369595537e-05,
"loss": 0.3884,
"step": 1549
},
{
"epoch": 1.9429467084639498,
"grad_norm": 0.314588003275736,
"learning_rate": 1.9572291957229197e-05,
"loss": 0.3545,
"step": 1550
},
{
"epoch": 1.9442006269592476,
"grad_norm": 0.29392391434586407,
"learning_rate": 1.9549046954904695e-05,
"loss": 0.3588,
"step": 1551
},
{
"epoch": 1.9454545454545453,
"grad_norm": 0.28758518791856613,
"learning_rate": 1.9525801952580196e-05,
"loss": 0.3476,
"step": 1552
},
{
"epoch": 1.9467084639498433,
"grad_norm": 0.3225376322940252,
"learning_rate": 1.9502556950255697e-05,
"loss": 0.3627,
"step": 1553
},
{
"epoch": 1.947962382445141,
"grad_norm": 0.30274274313118144,
"learning_rate": 1.9479311947931195e-05,
"loss": 0.3663,
"step": 1554
},
{
"epoch": 1.9492163009404389,
"grad_norm": 0.32464558833484214,
"learning_rate": 1.9456066945606696e-05,
"loss": 0.3714,
"step": 1555
},
{
"epoch": 1.9504702194357368,
"grad_norm": 0.28406697435207134,
"learning_rate": 1.9432821943282194e-05,
"loss": 0.3523,
"step": 1556
},
{
"epoch": 1.9517241379310346,
"grad_norm": 0.2569792849791072,
"learning_rate": 1.9409576940957696e-05,
"loss": 0.365,
"step": 1557
},
{
"epoch": 1.9529780564263324,
"grad_norm": 0.3364322489397566,
"learning_rate": 1.9386331938633197e-05,
"loss": 0.3838,
"step": 1558
},
{
"epoch": 1.9542319749216301,
"grad_norm": 0.27889005978625525,
"learning_rate": 1.9363086936308695e-05,
"loss": 0.3735,
"step": 1559
},
{
"epoch": 1.955485893416928,
"grad_norm": 0.2617158342813191,
"learning_rate": 1.9339841933984192e-05,
"loss": 0.3514,
"step": 1560
},
{
"epoch": 1.9567398119122257,
"grad_norm": 0.3025752025246799,
"learning_rate": 1.9316596931659694e-05,
"loss": 0.3729,
"step": 1561
},
{
"epoch": 1.9579937304075234,
"grad_norm": 0.2915335405665987,
"learning_rate": 1.9293351929335195e-05,
"loss": 0.3686,
"step": 1562
},
{
"epoch": 1.9592476489028212,
"grad_norm": 0.30794310339711595,
"learning_rate": 1.9270106927010696e-05,
"loss": 0.3728,
"step": 1563
},
{
"epoch": 1.960501567398119,
"grad_norm": 0.27934303608082084,
"learning_rate": 1.9246861924686194e-05,
"loss": 0.3433,
"step": 1564
},
{
"epoch": 1.961755485893417,
"grad_norm": 0.27745314806095756,
"learning_rate": 1.9223616922361692e-05,
"loss": 0.3648,
"step": 1565
},
{
"epoch": 1.9630094043887147,
"grad_norm": 0.2835864719168281,
"learning_rate": 1.9200371920037193e-05,
"loss": 0.3628,
"step": 1566
},
{
"epoch": 1.9642633228840125,
"grad_norm": 0.28412119472722186,
"learning_rate": 1.9177126917712694e-05,
"loss": 0.3785,
"step": 1567
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.3145396595600263,
"learning_rate": 1.9153881915388192e-05,
"loss": 0.3954,
"step": 1568
},
{
"epoch": 1.9667711598746083,
"grad_norm": 0.26656425038965087,
"learning_rate": 1.9130636913063693e-05,
"loss": 0.3735,
"step": 1569
},
{
"epoch": 1.968025078369906,
"grad_norm": 0.3456114069686159,
"learning_rate": 1.910739191073919e-05,
"loss": 0.386,
"step": 1570
},
{
"epoch": 1.9692789968652038,
"grad_norm": 0.29545903133236817,
"learning_rate": 1.908414690841469e-05,
"loss": 0.3667,
"step": 1571
},
{
"epoch": 1.9705329153605016,
"grad_norm": 0.26193044980856894,
"learning_rate": 1.9060901906090194e-05,
"loss": 0.3606,
"step": 1572
},
{
"epoch": 1.9717868338557993,
"grad_norm": 0.3062711434772209,
"learning_rate": 1.903765690376569e-05,
"loss": 0.3627,
"step": 1573
},
{
"epoch": 1.973040752351097,
"grad_norm": 0.2550810012007726,
"learning_rate": 1.901441190144119e-05,
"loss": 0.328,
"step": 1574
},
{
"epoch": 1.9742946708463949,
"grad_norm": 0.25916941167661195,
"learning_rate": 1.899116689911669e-05,
"loss": 0.3479,
"step": 1575
},
{
"epoch": 1.9755485893416926,
"grad_norm": 0.2928707796536945,
"learning_rate": 1.8967921896792192e-05,
"loss": 0.3737,
"step": 1576
},
{
"epoch": 1.9768025078369906,
"grad_norm": 0.28412714755301394,
"learning_rate": 1.8944676894467693e-05,
"loss": 0.3394,
"step": 1577
},
{
"epoch": 1.9780564263322884,
"grad_norm": 0.2594753676807368,
"learning_rate": 1.892143189214319e-05,
"loss": 0.3651,
"step": 1578
},
{
"epoch": 1.9793103448275862,
"grad_norm": 0.2963150235256023,
"learning_rate": 1.889818688981869e-05,
"loss": 0.3575,
"step": 1579
},
{
"epoch": 1.9805642633228842,
"grad_norm": 0.2928615177339626,
"learning_rate": 1.887494188749419e-05,
"loss": 0.3803,
"step": 1580
},
{
"epoch": 1.981818181818182,
"grad_norm": 0.3511465268980666,
"learning_rate": 1.885169688516969e-05,
"loss": 0.3868,
"step": 1581
},
{
"epoch": 1.9830721003134797,
"grad_norm": 0.2897056843303611,
"learning_rate": 1.882845188284519e-05,
"loss": 0.3707,
"step": 1582
},
{
"epoch": 1.9843260188087775,
"grad_norm": 0.24873624626371305,
"learning_rate": 1.880520688052069e-05,
"loss": 0.3665,
"step": 1583
},
{
"epoch": 1.9855799373040752,
"grad_norm": 0.2930432507642075,
"learning_rate": 1.8781961878196188e-05,
"loss": 0.3554,
"step": 1584
},
{
"epoch": 1.986833855799373,
"grad_norm": 0.290095300600869,
"learning_rate": 1.8758716875871686e-05,
"loss": 0.3774,
"step": 1585
},
{
"epoch": 1.9880877742946708,
"grad_norm": 0.30462622636275394,
"learning_rate": 1.873547187354719e-05,
"loss": 0.375,
"step": 1586
},
{
"epoch": 1.9893416927899685,
"grad_norm": 0.25289770717878934,
"learning_rate": 1.8712226871222688e-05,
"loss": 0.3635,
"step": 1587
},
{
"epoch": 1.9905956112852663,
"grad_norm": 0.2858410111563425,
"learning_rate": 1.8688981868898186e-05,
"loss": 0.3726,
"step": 1588
},
{
"epoch": 1.9918495297805643,
"grad_norm": 0.3039957504945038,
"learning_rate": 1.8665736866573687e-05,
"loss": 0.3552,
"step": 1589
},
{
"epoch": 1.993103448275862,
"grad_norm": 0.24817289834109557,
"learning_rate": 1.864249186424919e-05,
"loss": 0.3623,
"step": 1590
},
{
"epoch": 1.9943573667711598,
"grad_norm": 0.29634803398301096,
"learning_rate": 1.8619246861924686e-05,
"loss": 0.3798,
"step": 1591
},
{
"epoch": 1.9956112852664578,
"grad_norm": 0.28287762182447046,
"learning_rate": 1.8596001859600188e-05,
"loss": 0.3738,
"step": 1592
},
{
"epoch": 1.9968652037617556,
"grad_norm": 0.2640411792002975,
"learning_rate": 1.8572756857275685e-05,
"loss": 0.3898,
"step": 1593
},
{
"epoch": 1.9981191222570533,
"grad_norm": 0.28746039819420166,
"learning_rate": 1.8549511854951187e-05,
"loss": 0.3508,
"step": 1594
},
{
"epoch": 1.9993730407523511,
"grad_norm": 0.3059212324417853,
"learning_rate": 1.8526266852626688e-05,
"loss": 0.3577,
"step": 1595
},
{
"epoch": 2.0,
"grad_norm": 0.43555117164068585,
"learning_rate": 1.8503021850302186e-05,
"loss": 0.2932,
"step": 1596
},
{
"epoch": 2.0012539184952978,
"grad_norm": 0.32776157698619945,
"learning_rate": 1.8479776847977687e-05,
"loss": 0.2931,
"step": 1597
},
{
"epoch": 2.0025078369905955,
"grad_norm": 0.28310508560703196,
"learning_rate": 1.8456531845653185e-05,
"loss": 0.2848,
"step": 1598
},
{
"epoch": 2.0037617554858933,
"grad_norm": 0.3255444195818248,
"learning_rate": 1.8433286843328683e-05,
"loss": 0.283,
"step": 1599
},
{
"epoch": 2.005015673981191,
"grad_norm": 0.3340342223522126,
"learning_rate": 1.8410041841004187e-05,
"loss": 0.2917,
"step": 1600
},
{
"epoch": 2.006269592476489,
"grad_norm": 0.32606393850359916,
"learning_rate": 1.8386796838679685e-05,
"loss": 0.3041,
"step": 1601
},
{
"epoch": 2.007523510971787,
"grad_norm": 0.31916382827689804,
"learning_rate": 1.8363551836355183e-05,
"loss": 0.3029,
"step": 1602
},
{
"epoch": 2.008777429467085,
"grad_norm": 0.3291449662387648,
"learning_rate": 1.8340306834030684e-05,
"loss": 0.2967,
"step": 1603
},
{
"epoch": 2.0100313479623826,
"grad_norm": 0.33062801246997664,
"learning_rate": 1.8317061831706185e-05,
"loss": 0.2851,
"step": 1604
},
{
"epoch": 2.0112852664576804,
"grad_norm": 0.32468569433656613,
"learning_rate": 1.8293816829381683e-05,
"loss": 0.3106,
"step": 1605
},
{
"epoch": 2.012539184952978,
"grad_norm": 0.35085415625601685,
"learning_rate": 1.8270571827057184e-05,
"loss": 0.3158,
"step": 1606
},
{
"epoch": 2.013793103448276,
"grad_norm": 0.29593229673798865,
"learning_rate": 1.8247326824732682e-05,
"loss": 0.3112,
"step": 1607
},
{
"epoch": 2.0150470219435737,
"grad_norm": 0.3594694690507236,
"learning_rate": 1.8224081822408184e-05,
"loss": 0.3011,
"step": 1608
},
{
"epoch": 2.0163009404388714,
"grad_norm": 0.32229804931951406,
"learning_rate": 1.8200836820083685e-05,
"loss": 0.307,
"step": 1609
},
{
"epoch": 2.017554858934169,
"grad_norm": 0.2886947567362647,
"learning_rate": 1.8177591817759183e-05,
"loss": 0.299,
"step": 1610
},
{
"epoch": 2.018808777429467,
"grad_norm": 0.32858361464230046,
"learning_rate": 1.8154346815434684e-05,
"loss": 0.301,
"step": 1611
},
{
"epoch": 2.0200626959247647,
"grad_norm": 0.2981804693177194,
"learning_rate": 1.813110181311018e-05,
"loss": 0.29,
"step": 1612
},
{
"epoch": 2.0213166144200625,
"grad_norm": 0.28199880877738054,
"learning_rate": 1.810785681078568e-05,
"loss": 0.3032,
"step": 1613
},
{
"epoch": 2.0225705329153607,
"grad_norm": 0.27345395666479827,
"learning_rate": 1.8084611808461184e-05,
"loss": 0.3121,
"step": 1614
},
{
"epoch": 2.0238244514106585,
"grad_norm": 0.2654918017906298,
"learning_rate": 1.8061366806136682e-05,
"loss": 0.2973,
"step": 1615
},
{
"epoch": 2.0250783699059562,
"grad_norm": 0.2847143947393576,
"learning_rate": 1.803812180381218e-05,
"loss": 0.2826,
"step": 1616
},
{
"epoch": 2.026332288401254,
"grad_norm": 0.3309014723469459,
"learning_rate": 1.801487680148768e-05,
"loss": 0.3138,
"step": 1617
},
{
"epoch": 2.027586206896552,
"grad_norm": 0.30052354285801514,
"learning_rate": 1.799163179916318e-05,
"loss": 0.3058,
"step": 1618
},
{
"epoch": 2.0288401253918495,
"grad_norm": 0.2568613632252142,
"learning_rate": 1.796838679683868e-05,
"loss": 0.2988,
"step": 1619
},
{
"epoch": 2.0300940438871473,
"grad_norm": 0.30266489320082407,
"learning_rate": 1.794514179451418e-05,
"loss": 0.3012,
"step": 1620
},
{
"epoch": 2.031347962382445,
"grad_norm": 0.2978555892487361,
"learning_rate": 1.792189679218968e-05,
"loss": 0.2823,
"step": 1621
},
{
"epoch": 2.032601880877743,
"grad_norm": 0.2678399945553755,
"learning_rate": 1.789865178986518e-05,
"loss": 0.3068,
"step": 1622
},
{
"epoch": 2.0338557993730406,
"grad_norm": 0.2886119536073379,
"learning_rate": 1.787540678754068e-05,
"loss": 0.2954,
"step": 1623
},
{
"epoch": 2.0351097178683384,
"grad_norm": 0.288880489765786,
"learning_rate": 1.785216178521618e-05,
"loss": 0.2936,
"step": 1624
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.2866595477649304,
"learning_rate": 1.782891678289168e-05,
"loss": 0.3017,
"step": 1625
},
{
"epoch": 2.0376175548589344,
"grad_norm": 0.23704502343007677,
"learning_rate": 1.780567178056718e-05,
"loss": 0.2995,
"step": 1626
},
{
"epoch": 2.038871473354232,
"grad_norm": 0.2546468388431892,
"learning_rate": 1.7782426778242676e-05,
"loss": 0.3062,
"step": 1627
},
{
"epoch": 2.04012539184953,
"grad_norm": 0.2526466984734851,
"learning_rate": 1.775918177591818e-05,
"loss": 0.2962,
"step": 1628
},
{
"epoch": 2.0413793103448277,
"grad_norm": 0.3762450884074059,
"learning_rate": 1.773593677359368e-05,
"loss": 0.2925,
"step": 1629
},
{
"epoch": 2.0426332288401254,
"grad_norm": 0.2443923169144196,
"learning_rate": 1.7712691771269177e-05,
"loss": 0.2954,
"step": 1630
},
{
"epoch": 2.043887147335423,
"grad_norm": 0.23960553475020602,
"learning_rate": 1.7689446768944678e-05,
"loss": 0.3003,
"step": 1631
},
{
"epoch": 2.045141065830721,
"grad_norm": 0.2349436937787735,
"learning_rate": 1.7666201766620176e-05,
"loss": 0.2798,
"step": 1632
},
{
"epoch": 2.0463949843260187,
"grad_norm": 0.235457665056093,
"learning_rate": 1.7642956764295677e-05,
"loss": 0.3049,
"step": 1633
},
{
"epoch": 2.0476489028213165,
"grad_norm": 0.28163494908422015,
"learning_rate": 1.7619711761971178e-05,
"loss": 0.3019,
"step": 1634
},
{
"epoch": 2.0489028213166143,
"grad_norm": 0.2552425850639124,
"learning_rate": 1.7596466759646676e-05,
"loss": 0.2938,
"step": 1635
},
{
"epoch": 2.050156739811912,
"grad_norm": 0.25412103648843043,
"learning_rate": 1.7573221757322174e-05,
"loss": 0.3064,
"step": 1636
},
{
"epoch": 2.0514106583072103,
"grad_norm": 0.25435537561854416,
"learning_rate": 1.754997675499768e-05,
"loss": 0.2874,
"step": 1637
},
{
"epoch": 2.052664576802508,
"grad_norm": 0.2527049097697476,
"learning_rate": 1.7526731752673176e-05,
"loss": 0.3116,
"step": 1638
},
{
"epoch": 2.053918495297806,
"grad_norm": 0.2281281399080251,
"learning_rate": 1.7503486750348678e-05,
"loss": 0.2821,
"step": 1639
},
{
"epoch": 2.0551724137931036,
"grad_norm": 0.2601387939348469,
"learning_rate": 1.7480241748024175e-05,
"loss": 0.2952,
"step": 1640
},
{
"epoch": 2.0564263322884013,
"grad_norm": 0.2516428545196864,
"learning_rate": 1.7456996745699673e-05,
"loss": 0.2896,
"step": 1641
},
{
"epoch": 2.057680250783699,
"grad_norm": 0.2390530337912706,
"learning_rate": 1.7433751743375178e-05,
"loss": 0.2924,
"step": 1642
},
{
"epoch": 2.058934169278997,
"grad_norm": 0.26102880674242857,
"learning_rate": 1.7410506741050676e-05,
"loss": 0.31,
"step": 1643
},
{
"epoch": 2.0601880877742946,
"grad_norm": 0.25290939843222604,
"learning_rate": 1.7387261738726173e-05,
"loss": 0.293,
"step": 1644
},
{
"epoch": 2.0614420062695924,
"grad_norm": 0.2511776129707411,
"learning_rate": 1.7364016736401675e-05,
"loss": 0.2907,
"step": 1645
},
{
"epoch": 2.06269592476489,
"grad_norm": 0.23751982661983917,
"learning_rate": 1.7340771734077173e-05,
"loss": 0.3123,
"step": 1646
},
{
"epoch": 2.063949843260188,
"grad_norm": 0.2587170326238229,
"learning_rate": 1.7317526731752674e-05,
"loss": 0.2808,
"step": 1647
},
{
"epoch": 2.0652037617554857,
"grad_norm": 0.274270866624352,
"learning_rate": 1.7294281729428175e-05,
"loss": 0.3269,
"step": 1648
},
{
"epoch": 2.066457680250784,
"grad_norm": 0.24223876900123517,
"learning_rate": 1.7271036727103673e-05,
"loss": 0.2993,
"step": 1649
},
{
"epoch": 2.0677115987460817,
"grad_norm": 0.26017198670143954,
"learning_rate": 1.724779172477917e-05,
"loss": 0.308,
"step": 1650
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.2414553906799462,
"learning_rate": 1.7224546722454672e-05,
"loss": 0.2929,
"step": 1651
},
{
"epoch": 2.070219435736677,
"grad_norm": 0.260347870264937,
"learning_rate": 1.7201301720130173e-05,
"loss": 0.2926,
"step": 1652
},
{
"epoch": 2.071473354231975,
"grad_norm": 0.25531311274368323,
"learning_rate": 1.7178056717805674e-05,
"loss": 0.2941,
"step": 1653
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.25262046723105586,
"learning_rate": 1.7154811715481172e-05,
"loss": 0.3082,
"step": 1654
},
{
"epoch": 2.0739811912225705,
"grad_norm": 0.28640100478378216,
"learning_rate": 1.713156671315667e-05,
"loss": 0.2968,
"step": 1655
},
{
"epoch": 2.0752351097178683,
"grad_norm": 0.2701421588514749,
"learning_rate": 1.7108321710832175e-05,
"loss": 0.3097,
"step": 1656
},
{
"epoch": 2.076489028213166,
"grad_norm": 0.2548903064495605,
"learning_rate": 1.7085076708507672e-05,
"loss": 0.2781,
"step": 1657
},
{
"epoch": 2.077742946708464,
"grad_norm": 0.22826318283907418,
"learning_rate": 1.706183170618317e-05,
"loss": 0.2837,
"step": 1658
},
{
"epoch": 2.0789968652037616,
"grad_norm": 0.2568651461394605,
"learning_rate": 1.703858670385867e-05,
"loss": 0.2824,
"step": 1659
},
{
"epoch": 2.0802507836990594,
"grad_norm": 0.27821183997776355,
"learning_rate": 1.701534170153417e-05,
"loss": 0.3005,
"step": 1660
},
{
"epoch": 2.0815047021943576,
"grad_norm": 0.2599406674469079,
"learning_rate": 1.699209669920967e-05,
"loss": 0.2901,
"step": 1661
},
{
"epoch": 2.0827586206896553,
"grad_norm": 0.2556622167958506,
"learning_rate": 1.6968851696885172e-05,
"loss": 0.2872,
"step": 1662
},
{
"epoch": 2.084012539184953,
"grad_norm": 0.24360208669217062,
"learning_rate": 1.694560669456067e-05,
"loss": 0.2915,
"step": 1663
},
{
"epoch": 2.085266457680251,
"grad_norm": 0.2684600947254053,
"learning_rate": 1.6922361692236168e-05,
"loss": 0.2904,
"step": 1664
},
{
"epoch": 2.0865203761755486,
"grad_norm": 0.25999300436828277,
"learning_rate": 1.689911668991167e-05,
"loss": 0.297,
"step": 1665
},
{
"epoch": 2.0877742946708464,
"grad_norm": 0.2358333863311693,
"learning_rate": 1.687587168758717e-05,
"loss": 0.3044,
"step": 1666
},
{
"epoch": 2.089028213166144,
"grad_norm": 0.25816625969687124,
"learning_rate": 1.6852626685262668e-05,
"loss": 0.3031,
"step": 1667
},
{
"epoch": 2.090282131661442,
"grad_norm": 0.26662395384036086,
"learning_rate": 1.682938168293817e-05,
"loss": 0.3015,
"step": 1668
},
{
"epoch": 2.0915360501567397,
"grad_norm": 0.2422096753684432,
"learning_rate": 1.6806136680613667e-05,
"loss": 0.3066,
"step": 1669
},
{
"epoch": 2.0927899686520375,
"grad_norm": 0.24792919652832754,
"learning_rate": 1.678289167828917e-05,
"loss": 0.315,
"step": 1670
},
{
"epoch": 2.0940438871473352,
"grad_norm": 0.2905693079552795,
"learning_rate": 1.675964667596467e-05,
"loss": 0.2995,
"step": 1671
},
{
"epoch": 2.095297805642633,
"grad_norm": 0.22899845063430602,
"learning_rate": 1.6736401673640167e-05,
"loss": 0.3028,
"step": 1672
},
{
"epoch": 2.0965517241379312,
"grad_norm": 0.23261592542431805,
"learning_rate": 1.671315667131567e-05,
"loss": 0.3035,
"step": 1673
},
{
"epoch": 2.097805642633229,
"grad_norm": 0.23947754281889433,
"learning_rate": 1.6689911668991166e-05,
"loss": 0.282,
"step": 1674
},
{
"epoch": 2.0990595611285268,
"grad_norm": 0.2355975652757473,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2755,
"step": 1675
},
{
"epoch": 2.1003134796238245,
"grad_norm": 0.2460716071341843,
"learning_rate": 1.664342166434217e-05,
"loss": 0.2996,
"step": 1676
},
{
"epoch": 2.1015673981191223,
"grad_norm": 0.24736060132884669,
"learning_rate": 1.6620176662017667e-05,
"loss": 0.3121,
"step": 1677
},
{
"epoch": 2.10282131661442,
"grad_norm": 0.25707317986612627,
"learning_rate": 1.6596931659693164e-05,
"loss": 0.3003,
"step": 1678
},
{
"epoch": 2.104075235109718,
"grad_norm": 0.2789535335639655,
"learning_rate": 1.6573686657368666e-05,
"loss": 0.3068,
"step": 1679
},
{
"epoch": 2.1053291536050156,
"grad_norm": 0.26041881204653883,
"learning_rate": 1.6550441655044167e-05,
"loss": 0.295,
"step": 1680
},
{
"epoch": 2.1065830721003134,
"grad_norm": 0.2269275756150215,
"learning_rate": 1.6527196652719665e-05,
"loss": 0.2846,
"step": 1681
},
{
"epoch": 2.107836990595611,
"grad_norm": 0.2813733578829718,
"learning_rate": 1.6503951650395166e-05,
"loss": 0.3127,
"step": 1682
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.2656569790485734,
"learning_rate": 1.6480706648070664e-05,
"loss": 0.3,
"step": 1683
},
{
"epoch": 2.110344827586207,
"grad_norm": 0.28244433841744226,
"learning_rate": 1.645746164574617e-05,
"loss": 0.31,
"step": 1684
},
{
"epoch": 2.111598746081505,
"grad_norm": 0.24827476042194124,
"learning_rate": 1.6434216643421666e-05,
"loss": 0.2961,
"step": 1685
},
{
"epoch": 2.1128526645768027,
"grad_norm": 0.2595052202484928,
"learning_rate": 1.6410971641097164e-05,
"loss": 0.3029,
"step": 1686
},
{
"epoch": 2.1141065830721004,
"grad_norm": 0.2778767223203673,
"learning_rate": 1.6387726638772665e-05,
"loss": 0.3072,
"step": 1687
},
{
"epoch": 2.115360501567398,
"grad_norm": 0.2612833617017513,
"learning_rate": 1.6364481636448163e-05,
"loss": 0.2984,
"step": 1688
},
{
"epoch": 2.116614420062696,
"grad_norm": 0.2833506248520489,
"learning_rate": 1.6341236634123664e-05,
"loss": 0.276,
"step": 1689
},
{
"epoch": 2.1178683385579937,
"grad_norm": 0.252695424725672,
"learning_rate": 1.6317991631799166e-05,
"loss": 0.2914,
"step": 1690
},
{
"epoch": 2.1191222570532915,
"grad_norm": 0.255741410052811,
"learning_rate": 1.6294746629474663e-05,
"loss": 0.3006,
"step": 1691
},
{
"epoch": 2.1203761755485893,
"grad_norm": 0.3012359261742704,
"learning_rate": 1.627150162715016e-05,
"loss": 0.3083,
"step": 1692
},
{
"epoch": 2.121630094043887,
"grad_norm": 0.27675291752766756,
"learning_rate": 1.6248256624825662e-05,
"loss": 0.2885,
"step": 1693
},
{
"epoch": 2.122884012539185,
"grad_norm": 0.24239216111817363,
"learning_rate": 1.6225011622501164e-05,
"loss": 0.301,
"step": 1694
},
{
"epoch": 2.1241379310344826,
"grad_norm": 0.3280031322924894,
"learning_rate": 1.620176662017666e-05,
"loss": 0.2959,
"step": 1695
},
{
"epoch": 2.1253918495297803,
"grad_norm": 0.2751686248111108,
"learning_rate": 1.6178521617852163e-05,
"loss": 0.2823,
"step": 1696
},
{
"epoch": 2.1266457680250785,
"grad_norm": 0.26946863080851347,
"learning_rate": 1.615527661552766e-05,
"loss": 0.2975,
"step": 1697
},
{
"epoch": 2.1278996865203763,
"grad_norm": 0.2761381410448136,
"learning_rate": 1.6132031613203162e-05,
"loss": 0.2973,
"step": 1698
},
{
"epoch": 2.129153605015674,
"grad_norm": 0.2577320238537325,
"learning_rate": 1.6108786610878663e-05,
"loss": 0.3017,
"step": 1699
},
{
"epoch": 2.130407523510972,
"grad_norm": 0.2851860181827438,
"learning_rate": 1.608554160855416e-05,
"loss": 0.302,
"step": 1700
},
{
"epoch": 2.1316614420062696,
"grad_norm": 0.2605861142931671,
"learning_rate": 1.6062296606229662e-05,
"loss": 0.3025,
"step": 1701
},
{
"epoch": 2.1329153605015674,
"grad_norm": 0.3687669548935718,
"learning_rate": 1.603905160390516e-05,
"loss": 0.3086,
"step": 1702
},
{
"epoch": 2.134169278996865,
"grad_norm": 0.2818183169232765,
"learning_rate": 1.601580660158066e-05,
"loss": 0.3039,
"step": 1703
},
{
"epoch": 2.135423197492163,
"grad_norm": 0.28843378944776665,
"learning_rate": 1.5992561599256162e-05,
"loss": 0.2943,
"step": 1704
},
{
"epoch": 2.1366771159874607,
"grad_norm": 0.2695889240904266,
"learning_rate": 1.596931659693166e-05,
"loss": 0.2986,
"step": 1705
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.2804815246383472,
"learning_rate": 1.5946071594607158e-05,
"loss": 0.2841,
"step": 1706
},
{
"epoch": 2.139184952978056,
"grad_norm": 0.2589536730529997,
"learning_rate": 1.592282659228266e-05,
"loss": 0.2931,
"step": 1707
},
{
"epoch": 2.1404388714733544,
"grad_norm": 0.29194586096474073,
"learning_rate": 1.589958158995816e-05,
"loss": 0.3,
"step": 1708
},
{
"epoch": 2.141692789968652,
"grad_norm": 0.27501851236629254,
"learning_rate": 1.587633658763366e-05,
"loss": 0.2858,
"step": 1709
},
{
"epoch": 2.14294670846395,
"grad_norm": 0.2764850900175734,
"learning_rate": 1.585309158530916e-05,
"loss": 0.2932,
"step": 1710
},
{
"epoch": 2.1442006269592477,
"grad_norm": 0.2716413475475733,
"learning_rate": 1.5829846582984657e-05,
"loss": 0.3074,
"step": 1711
},
{
"epoch": 2.1454545454545455,
"grad_norm": 0.3031067300222352,
"learning_rate": 1.580660158066016e-05,
"loss": 0.2909,
"step": 1712
},
{
"epoch": 2.1467084639498433,
"grad_norm": 0.2728386823724064,
"learning_rate": 1.578335657833566e-05,
"loss": 0.3185,
"step": 1713
},
{
"epoch": 2.147962382445141,
"grad_norm": 0.2847610754419473,
"learning_rate": 1.5760111576011158e-05,
"loss": 0.2912,
"step": 1714
},
{
"epoch": 2.149216300940439,
"grad_norm": 0.3354917896092358,
"learning_rate": 1.573686657368666e-05,
"loss": 0.2871,
"step": 1715
},
{
"epoch": 2.1504702194357366,
"grad_norm": 0.2952592970697281,
"learning_rate": 1.5713621571362157e-05,
"loss": 0.2954,
"step": 1716
},
{
"epoch": 2.1517241379310343,
"grad_norm": 0.26270761671816617,
"learning_rate": 1.5690376569037658e-05,
"loss": 0.2927,
"step": 1717
},
{
"epoch": 2.152978056426332,
"grad_norm": 0.3324757005834936,
"learning_rate": 1.566713156671316e-05,
"loss": 0.3073,
"step": 1718
},
{
"epoch": 2.15423197492163,
"grad_norm": 0.27387160272449745,
"learning_rate": 1.5643886564388657e-05,
"loss": 0.2976,
"step": 1719
},
{
"epoch": 2.155485893416928,
"grad_norm": 0.2529746406088967,
"learning_rate": 1.5620641562064155e-05,
"loss": 0.3065,
"step": 1720
},
{
"epoch": 2.156739811912226,
"grad_norm": 0.2523141289427067,
"learning_rate": 1.5597396559739656e-05,
"loss": 0.2956,
"step": 1721
},
{
"epoch": 2.1579937304075236,
"grad_norm": 0.25109642198367715,
"learning_rate": 1.5574151557415157e-05,
"loss": 0.3016,
"step": 1722
},
{
"epoch": 2.1592476489028214,
"grad_norm": 0.30014196094360335,
"learning_rate": 1.5550906555090655e-05,
"loss": 0.2832,
"step": 1723
},
{
"epoch": 2.160501567398119,
"grad_norm": 0.26035306057702534,
"learning_rate": 1.5527661552766156e-05,
"loss": 0.2877,
"step": 1724
},
{
"epoch": 2.161755485893417,
"grad_norm": 0.24085696719541644,
"learning_rate": 1.5504416550441654e-05,
"loss": 0.2873,
"step": 1725
},
{
"epoch": 2.1630094043887147,
"grad_norm": 0.25997224575081246,
"learning_rate": 1.5481171548117155e-05,
"loss": 0.2992,
"step": 1726
},
{
"epoch": 2.1642633228840125,
"grad_norm": 0.29091307556450674,
"learning_rate": 1.5457926545792657e-05,
"loss": 0.3182,
"step": 1727
},
{
"epoch": 2.1655172413793102,
"grad_norm": 0.24833029293213535,
"learning_rate": 1.5434681543468155e-05,
"loss": 0.2933,
"step": 1728
},
{
"epoch": 2.166771159874608,
"grad_norm": 0.2331338816303229,
"learning_rate": 1.5411436541143656e-05,
"loss": 0.2866,
"step": 1729
},
{
"epoch": 2.1680250783699058,
"grad_norm": 0.25534229109316736,
"learning_rate": 1.5388191538819154e-05,
"loss": 0.3044,
"step": 1730
},
{
"epoch": 2.169278996865204,
"grad_norm": 0.26521054746429445,
"learning_rate": 1.5364946536494655e-05,
"loss": 0.3038,
"step": 1731
},
{
"epoch": 2.1705329153605017,
"grad_norm": 0.22887293262136296,
"learning_rate": 1.5341701534170156e-05,
"loss": 0.3032,
"step": 1732
},
{
"epoch": 2.1717868338557995,
"grad_norm": 0.2452245812215447,
"learning_rate": 1.5318456531845654e-05,
"loss": 0.2919,
"step": 1733
},
{
"epoch": 2.1730407523510973,
"grad_norm": 0.24441499851055087,
"learning_rate": 1.5295211529521152e-05,
"loss": 0.2858,
"step": 1734
},
{
"epoch": 2.174294670846395,
"grad_norm": 0.24774815453852347,
"learning_rate": 1.5271966527196653e-05,
"loss": 0.3232,
"step": 1735
},
{
"epoch": 2.175548589341693,
"grad_norm": 0.2690271424270524,
"learning_rate": 1.5248721524872153e-05,
"loss": 0.2888,
"step": 1736
},
{
"epoch": 2.1768025078369906,
"grad_norm": 0.2325555477593293,
"learning_rate": 1.5225476522547652e-05,
"loss": 0.2719,
"step": 1737
},
{
"epoch": 2.1780564263322884,
"grad_norm": 0.2575986029814697,
"learning_rate": 1.5202231520223153e-05,
"loss": 0.2829,
"step": 1738
},
{
"epoch": 2.179310344827586,
"grad_norm": 0.24108758592536866,
"learning_rate": 1.5178986517898653e-05,
"loss": 0.2844,
"step": 1739
},
{
"epoch": 2.180564263322884,
"grad_norm": 0.2868121810490177,
"learning_rate": 1.515574151557415e-05,
"loss": 0.3058,
"step": 1740
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.28825523285927934,
"learning_rate": 1.5132496513249652e-05,
"loss": 0.2962,
"step": 1741
},
{
"epoch": 2.1830721003134794,
"grad_norm": 0.21705837051436072,
"learning_rate": 1.5109251510925151e-05,
"loss": 0.3074,
"step": 1742
},
{
"epoch": 2.1843260188087776,
"grad_norm": 0.2962532780119198,
"learning_rate": 1.5086006508600651e-05,
"loss": 0.3103,
"step": 1743
},
{
"epoch": 2.1855799373040754,
"grad_norm": 0.28787458166507135,
"learning_rate": 1.5062761506276152e-05,
"loss": 0.2975,
"step": 1744
},
{
"epoch": 2.186833855799373,
"grad_norm": 0.2521153919337955,
"learning_rate": 1.503951650395165e-05,
"loss": 0.2867,
"step": 1745
},
{
"epoch": 2.188087774294671,
"grad_norm": 0.2557459082484896,
"learning_rate": 1.5016271501627153e-05,
"loss": 0.3071,
"step": 1746
},
{
"epoch": 2.1893416927899687,
"grad_norm": 0.29110302991113063,
"learning_rate": 1.499302649930265e-05,
"loss": 0.3056,
"step": 1747
},
{
"epoch": 2.1905956112852665,
"grad_norm": 0.2796066257618519,
"learning_rate": 1.496978149697815e-05,
"loss": 0.3172,
"step": 1748
},
{
"epoch": 2.1918495297805642,
"grad_norm": 0.24508228303145715,
"learning_rate": 1.4946536494653652e-05,
"loss": 0.2969,
"step": 1749
},
{
"epoch": 2.193103448275862,
"grad_norm": 0.2652716182615862,
"learning_rate": 1.492329149232915e-05,
"loss": 0.3093,
"step": 1750
},
{
"epoch": 2.19435736677116,
"grad_norm": 0.2845054162586105,
"learning_rate": 1.4900046490004649e-05,
"loss": 0.3034,
"step": 1751
},
{
"epoch": 2.1956112852664575,
"grad_norm": 0.23868474100302203,
"learning_rate": 1.487680148768015e-05,
"loss": 0.3027,
"step": 1752
},
{
"epoch": 2.1968652037617553,
"grad_norm": 0.27579393015987025,
"learning_rate": 1.485355648535565e-05,
"loss": 0.3024,
"step": 1753
},
{
"epoch": 2.1981191222570535,
"grad_norm": 0.25664009909193886,
"learning_rate": 1.4830311483031147e-05,
"loss": 0.2988,
"step": 1754
},
{
"epoch": 2.1993730407523513,
"grad_norm": 0.27891889990712265,
"learning_rate": 1.4807066480706649e-05,
"loss": 0.3048,
"step": 1755
},
{
"epoch": 2.200626959247649,
"grad_norm": 0.2601054650687977,
"learning_rate": 1.4783821478382148e-05,
"loss": 0.3012,
"step": 1756
},
{
"epoch": 2.201880877742947,
"grad_norm": 0.25366887127536314,
"learning_rate": 1.4760576476057648e-05,
"loss": 0.2896,
"step": 1757
},
{
"epoch": 2.2031347962382446,
"grad_norm": 0.25454562990165575,
"learning_rate": 1.4737331473733149e-05,
"loss": 0.3003,
"step": 1758
},
{
"epoch": 2.2043887147335424,
"grad_norm": 0.26057606875990985,
"learning_rate": 1.4714086471408647e-05,
"loss": 0.2786,
"step": 1759
},
{
"epoch": 2.20564263322884,
"grad_norm": 0.23718809489660356,
"learning_rate": 1.469084146908415e-05,
"loss": 0.2991,
"step": 1760
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.24622272940933318,
"learning_rate": 1.4667596466759648e-05,
"loss": 0.2889,
"step": 1761
},
{
"epoch": 2.2081504702194357,
"grad_norm": 0.27150314225977396,
"learning_rate": 1.4644351464435147e-05,
"loss": 0.3048,
"step": 1762
},
{
"epoch": 2.2094043887147334,
"grad_norm": 0.23125410194005938,
"learning_rate": 1.4621106462110648e-05,
"loss": 0.2905,
"step": 1763
},
{
"epoch": 2.210658307210031,
"grad_norm": 0.26248502505373195,
"learning_rate": 1.4597861459786146e-05,
"loss": 0.3113,
"step": 1764
},
{
"epoch": 2.211912225705329,
"grad_norm": 0.2542846686125078,
"learning_rate": 1.4574616457461646e-05,
"loss": 0.2974,
"step": 1765
},
{
"epoch": 2.2131661442006267,
"grad_norm": 0.22786350423336788,
"learning_rate": 1.4551371455137147e-05,
"loss": 0.3002,
"step": 1766
},
{
"epoch": 2.214420062695925,
"grad_norm": 0.25253263058636827,
"learning_rate": 1.4528126452812646e-05,
"loss": 0.301,
"step": 1767
},
{
"epoch": 2.2156739811912227,
"grad_norm": 0.26360996585963353,
"learning_rate": 1.4504881450488144e-05,
"loss": 0.2893,
"step": 1768
},
{
"epoch": 2.2169278996865205,
"grad_norm": 0.25306131240460633,
"learning_rate": 1.4481636448163646e-05,
"loss": 0.2987,
"step": 1769
},
{
"epoch": 2.2181818181818183,
"grad_norm": 0.24576183166675925,
"learning_rate": 1.4458391445839145e-05,
"loss": 0.2716,
"step": 1770
},
{
"epoch": 2.219435736677116,
"grad_norm": 0.24073514255324385,
"learning_rate": 1.4435146443514645e-05,
"loss": 0.3097,
"step": 1771
},
{
"epoch": 2.220689655172414,
"grad_norm": 0.23124027929365323,
"learning_rate": 1.4411901441190146e-05,
"loss": 0.2824,
"step": 1772
},
{
"epoch": 2.2219435736677116,
"grad_norm": 0.24465248887195926,
"learning_rate": 1.4388656438865644e-05,
"loss": 0.2884,
"step": 1773
},
{
"epoch": 2.2231974921630093,
"grad_norm": 0.24503450795488546,
"learning_rate": 1.4365411436541143e-05,
"loss": 0.2925,
"step": 1774
},
{
"epoch": 2.224451410658307,
"grad_norm": 0.24048241732687545,
"learning_rate": 1.4342166434216644e-05,
"loss": 0.307,
"step": 1775
},
{
"epoch": 2.225705329153605,
"grad_norm": 0.2374429243882282,
"learning_rate": 1.4318921431892144e-05,
"loss": 0.2817,
"step": 1776
},
{
"epoch": 2.2269592476489026,
"grad_norm": 0.22375047321728367,
"learning_rate": 1.4295676429567645e-05,
"loss": 0.2897,
"step": 1777
},
{
"epoch": 2.228213166144201,
"grad_norm": 0.24107965166184647,
"learning_rate": 1.4272431427243143e-05,
"loss": 0.292,
"step": 1778
},
{
"epoch": 2.2294670846394986,
"grad_norm": 0.24621071533460973,
"learning_rate": 1.4249186424918643e-05,
"loss": 0.3088,
"step": 1779
},
{
"epoch": 2.2307210031347964,
"grad_norm": 0.24544570806581822,
"learning_rate": 1.4225941422594144e-05,
"loss": 0.3076,
"step": 1780
},
{
"epoch": 2.231974921630094,
"grad_norm": 0.22295073914991279,
"learning_rate": 1.4202696420269643e-05,
"loss": 0.288,
"step": 1781
},
{
"epoch": 2.233228840125392,
"grad_norm": 0.23983740135690798,
"learning_rate": 1.4179451417945141e-05,
"loss": 0.2972,
"step": 1782
},
{
"epoch": 2.2344827586206897,
"grad_norm": 0.24920857961692358,
"learning_rate": 1.4156206415620642e-05,
"loss": 0.3102,
"step": 1783
},
{
"epoch": 2.2357366771159874,
"grad_norm": 0.21565401899205883,
"learning_rate": 1.4132961413296142e-05,
"loss": 0.3073,
"step": 1784
},
{
"epoch": 2.236990595611285,
"grad_norm": 0.2507528424538107,
"learning_rate": 1.4109716410971641e-05,
"loss": 0.3128,
"step": 1785
},
{
"epoch": 2.238244514106583,
"grad_norm": 0.2494575191416315,
"learning_rate": 1.4086471408647143e-05,
"loss": 0.2962,
"step": 1786
},
{
"epoch": 2.2394984326018808,
"grad_norm": 0.23079004970781097,
"learning_rate": 1.406322640632264e-05,
"loss": 0.3111,
"step": 1787
},
{
"epoch": 2.2407523510971785,
"grad_norm": 0.2503321899301802,
"learning_rate": 1.403998140399814e-05,
"loss": 0.3112,
"step": 1788
},
{
"epoch": 2.2420062695924763,
"grad_norm": 0.25451268004314437,
"learning_rate": 1.4016736401673641e-05,
"loss": 0.2978,
"step": 1789
},
{
"epoch": 2.2432601880877745,
"grad_norm": 0.23986164107903973,
"learning_rate": 1.399349139934914e-05,
"loss": 0.3104,
"step": 1790
},
{
"epoch": 2.2445141065830723,
"grad_norm": 0.23898192440615235,
"learning_rate": 1.3970246397024642e-05,
"loss": 0.2928,
"step": 1791
},
{
"epoch": 2.24576802507837,
"grad_norm": 0.2559494647147087,
"learning_rate": 1.394700139470014e-05,
"loss": 0.3053,
"step": 1792
},
{
"epoch": 2.247021943573668,
"grad_norm": 0.2281051634399345,
"learning_rate": 1.392375639237564e-05,
"loss": 0.2868,
"step": 1793
},
{
"epoch": 2.2482758620689656,
"grad_norm": 0.2689696044144527,
"learning_rate": 1.390051139005114e-05,
"loss": 0.294,
"step": 1794
},
{
"epoch": 2.2495297805642633,
"grad_norm": 0.2770703008802369,
"learning_rate": 1.387726638772664e-05,
"loss": 0.2915,
"step": 1795
},
{
"epoch": 2.250783699059561,
"grad_norm": 0.224202694772019,
"learning_rate": 1.3854021385402138e-05,
"loss": 0.2969,
"step": 1796
},
{
"epoch": 2.252037617554859,
"grad_norm": 0.24944068075770623,
"learning_rate": 1.383077638307764e-05,
"loss": 0.3051,
"step": 1797
},
{
"epoch": 2.2532915360501566,
"grad_norm": 0.3096950828871202,
"learning_rate": 1.3807531380753139e-05,
"loss": 0.3116,
"step": 1798
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.26513630248744924,
"learning_rate": 1.3784286378428638e-05,
"loss": 0.2851,
"step": 1799
},
{
"epoch": 2.255799373040752,
"grad_norm": 0.2265136385393993,
"learning_rate": 1.376104137610414e-05,
"loss": 0.297,
"step": 1800
},
{
"epoch": 2.2570532915360504,
"grad_norm": 0.2642252439982428,
"learning_rate": 1.3737796373779637e-05,
"loss": 0.3179,
"step": 1801
},
{
"epoch": 2.258307210031348,
"grad_norm": 0.26411665896176467,
"learning_rate": 1.3714551371455137e-05,
"loss": 0.3116,
"step": 1802
},
{
"epoch": 2.259561128526646,
"grad_norm": 0.24844971639752572,
"learning_rate": 1.3691306369130638e-05,
"loss": 0.2951,
"step": 1803
},
{
"epoch": 2.2608150470219437,
"grad_norm": 0.25602790818764526,
"learning_rate": 1.3668061366806138e-05,
"loss": 0.3015,
"step": 1804
},
{
"epoch": 2.2620689655172415,
"grad_norm": 0.2697875008479732,
"learning_rate": 1.3644816364481639e-05,
"loss": 0.2994,
"step": 1805
},
{
"epoch": 2.2633228840125392,
"grad_norm": 0.2530950011191231,
"learning_rate": 1.3621571362157137e-05,
"loss": 0.2993,
"step": 1806
},
{
"epoch": 2.264576802507837,
"grad_norm": 0.23520151102243847,
"learning_rate": 1.3598326359832636e-05,
"loss": 0.2991,
"step": 1807
},
{
"epoch": 2.2658307210031348,
"grad_norm": 0.22808887712403603,
"learning_rate": 1.3575081357508137e-05,
"loss": 0.2706,
"step": 1808
},
{
"epoch": 2.2670846394984325,
"grad_norm": 0.25481970544139526,
"learning_rate": 1.3551836355183637e-05,
"loss": 0.3118,
"step": 1809
},
{
"epoch": 2.2683385579937303,
"grad_norm": 0.24758023017379646,
"learning_rate": 1.3528591352859135e-05,
"loss": 0.3108,
"step": 1810
},
{
"epoch": 2.269592476489028,
"grad_norm": 0.2589484719516585,
"learning_rate": 1.3505346350534636e-05,
"loss": 0.2903,
"step": 1811
},
{
"epoch": 2.270846394984326,
"grad_norm": 0.25685282321840003,
"learning_rate": 1.3482101348210136e-05,
"loss": 0.3039,
"step": 1812
},
{
"epoch": 2.2721003134796236,
"grad_norm": 0.26151979975516665,
"learning_rate": 1.3458856345885633e-05,
"loss": 0.2864,
"step": 1813
},
{
"epoch": 2.273354231974922,
"grad_norm": 0.2578657434556848,
"learning_rate": 1.3435611343561136e-05,
"loss": 0.3226,
"step": 1814
},
{
"epoch": 2.2746081504702196,
"grad_norm": 0.2323405364026528,
"learning_rate": 1.3412366341236634e-05,
"loss": 0.2836,
"step": 1815
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.23456589802326505,
"learning_rate": 1.3389121338912134e-05,
"loss": 0.2903,
"step": 1816
},
{
"epoch": 2.277115987460815,
"grad_norm": 0.25189106987612125,
"learning_rate": 1.3365876336587635e-05,
"loss": 0.3068,
"step": 1817
},
{
"epoch": 2.278369905956113,
"grad_norm": 0.2316715501499928,
"learning_rate": 1.3342631334263134e-05,
"loss": 0.2918,
"step": 1818
},
{
"epoch": 2.2796238244514107,
"grad_norm": 0.24482342138142388,
"learning_rate": 1.3319386331938632e-05,
"loss": 0.3118,
"step": 1819
},
{
"epoch": 2.2808777429467084,
"grad_norm": 0.22990617420544732,
"learning_rate": 1.3296141329614134e-05,
"loss": 0.2875,
"step": 1820
},
{
"epoch": 2.282131661442006,
"grad_norm": 0.23942731533808612,
"learning_rate": 1.3272896327289633e-05,
"loss": 0.2966,
"step": 1821
},
{
"epoch": 2.283385579937304,
"grad_norm": 0.2859995124393401,
"learning_rate": 1.3249651324965134e-05,
"loss": 0.2942,
"step": 1822
},
{
"epoch": 2.2846394984326017,
"grad_norm": 0.2222153211232312,
"learning_rate": 1.3226406322640634e-05,
"loss": 0.287,
"step": 1823
},
{
"epoch": 2.2858934169279,
"grad_norm": 0.2359604510795701,
"learning_rate": 1.3203161320316132e-05,
"loss": 0.3162,
"step": 1824
},
{
"epoch": 2.2871473354231977,
"grad_norm": 0.217254814164127,
"learning_rate": 1.3179916317991633e-05,
"loss": 0.2833,
"step": 1825
},
{
"epoch": 2.2884012539184955,
"grad_norm": 0.25608857460547596,
"learning_rate": 1.3156671315667132e-05,
"loss": 0.2914,
"step": 1826
},
{
"epoch": 2.2896551724137932,
"grad_norm": 0.23603162611167613,
"learning_rate": 1.313342631334263e-05,
"loss": 0.3063,
"step": 1827
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.23416430275449057,
"learning_rate": 1.3110181311018133e-05,
"loss": 0.2945,
"step": 1828
},
{
"epoch": 2.2921630094043888,
"grad_norm": 0.2550486240306843,
"learning_rate": 1.3086936308693631e-05,
"loss": 0.2787,
"step": 1829
},
{
"epoch": 2.2934169278996865,
"grad_norm": 0.24283116724068896,
"learning_rate": 1.306369130636913e-05,
"loss": 0.2996,
"step": 1830
},
{
"epoch": 2.2946708463949843,
"grad_norm": 0.22113873321912694,
"learning_rate": 1.3040446304044632e-05,
"loss": 0.2973,
"step": 1831
},
{
"epoch": 2.295924764890282,
"grad_norm": 0.26365978139822815,
"learning_rate": 1.3017201301720131e-05,
"loss": 0.2899,
"step": 1832
},
{
"epoch": 2.29717868338558,
"grad_norm": 0.246463692513242,
"learning_rate": 1.299395629939563e-05,
"loss": 0.2806,
"step": 1833
},
{
"epoch": 2.2984326018808776,
"grad_norm": 0.22338249855127093,
"learning_rate": 1.297071129707113e-05,
"loss": 0.2988,
"step": 1834
},
{
"epoch": 2.2996865203761754,
"grad_norm": 0.2533685754359579,
"learning_rate": 1.294746629474663e-05,
"loss": 0.3017,
"step": 1835
},
{
"epoch": 2.300940438871473,
"grad_norm": 0.2558768690797258,
"learning_rate": 1.2924221292422131e-05,
"loss": 0.2986,
"step": 1836
},
{
"epoch": 2.302194357366771,
"grad_norm": 0.2300497203038705,
"learning_rate": 1.290097629009763e-05,
"loss": 0.3071,
"step": 1837
},
{
"epoch": 2.303448275862069,
"grad_norm": 0.2693610956365079,
"learning_rate": 1.2877731287773129e-05,
"loss": 0.2968,
"step": 1838
},
{
"epoch": 2.304702194357367,
"grad_norm": 0.24240367659938294,
"learning_rate": 1.285448628544863e-05,
"loss": 0.3002,
"step": 1839
},
{
"epoch": 2.3059561128526647,
"grad_norm": 0.2602728787603176,
"learning_rate": 1.283124128312413e-05,
"loss": 0.3057,
"step": 1840
},
{
"epoch": 2.3072100313479624,
"grad_norm": 0.22392950789042568,
"learning_rate": 1.2807996280799627e-05,
"loss": 0.281,
"step": 1841
},
{
"epoch": 2.30846394984326,
"grad_norm": 0.2404968816099336,
"learning_rate": 1.278475127847513e-05,
"loss": 0.284,
"step": 1842
},
{
"epoch": 2.309717868338558,
"grad_norm": 0.22330145528498577,
"learning_rate": 1.2761506276150628e-05,
"loss": 0.2845,
"step": 1843
},
{
"epoch": 2.3109717868338557,
"grad_norm": 0.2563825988799964,
"learning_rate": 1.2738261273826127e-05,
"loss": 0.28,
"step": 1844
},
{
"epoch": 2.3122257053291535,
"grad_norm": 0.2723687168014003,
"learning_rate": 1.2715016271501629e-05,
"loss": 0.3054,
"step": 1845
},
{
"epoch": 2.3134796238244513,
"grad_norm": 0.23659187545647253,
"learning_rate": 1.2691771269177126e-05,
"loss": 0.2979,
"step": 1846
},
{
"epoch": 2.314733542319749,
"grad_norm": 0.23745120932751412,
"learning_rate": 1.2668526266852626e-05,
"loss": 0.3159,
"step": 1847
},
{
"epoch": 2.3159874608150472,
"grad_norm": 0.24106705852048868,
"learning_rate": 1.2645281264528127e-05,
"loss": 0.3078,
"step": 1848
},
{
"epoch": 2.317241379310345,
"grad_norm": 0.22414663304813792,
"learning_rate": 1.2622036262203627e-05,
"loss": 0.2905,
"step": 1849
},
{
"epoch": 2.318495297805643,
"grad_norm": 0.23564295722236267,
"learning_rate": 1.2598791259879125e-05,
"loss": 0.2953,
"step": 1850
},
{
"epoch": 2.3197492163009406,
"grad_norm": 0.2223107912232997,
"learning_rate": 1.2575546257554628e-05,
"loss": 0.2903,
"step": 1851
},
{
"epoch": 2.3210031347962383,
"grad_norm": 0.22198710095872146,
"learning_rate": 1.2552301255230125e-05,
"loss": 0.2887,
"step": 1852
},
{
"epoch": 2.322257053291536,
"grad_norm": 0.23260131787247323,
"learning_rate": 1.2529056252905627e-05,
"loss": 0.3005,
"step": 1853
},
{
"epoch": 2.323510971786834,
"grad_norm": 0.27372614945679513,
"learning_rate": 1.2505811250581126e-05,
"loss": 0.3113,
"step": 1854
},
{
"epoch": 2.3247648902821316,
"grad_norm": 0.2555058884703125,
"learning_rate": 1.2482566248256626e-05,
"loss": 0.3098,
"step": 1855
},
{
"epoch": 2.3260188087774294,
"grad_norm": 0.23305356871592678,
"learning_rate": 1.2459321245932125e-05,
"loss": 0.2989,
"step": 1856
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.22119688624283995,
"learning_rate": 1.2436076243607625e-05,
"loss": 0.2959,
"step": 1857
},
{
"epoch": 2.328526645768025,
"grad_norm": 0.237214165838946,
"learning_rate": 1.2412831241283126e-05,
"loss": 0.293,
"step": 1858
},
{
"epoch": 2.3297805642633227,
"grad_norm": 0.2367572646709813,
"learning_rate": 1.2389586238958624e-05,
"loss": 0.3036,
"step": 1859
},
{
"epoch": 2.3310344827586205,
"grad_norm": 0.24289671233998794,
"learning_rate": 1.2366341236634123e-05,
"loss": 0.2896,
"step": 1860
},
{
"epoch": 2.3322884012539187,
"grad_norm": 0.24073365677774425,
"learning_rate": 1.2343096234309625e-05,
"loss": 0.295,
"step": 1861
},
{
"epoch": 2.3335423197492164,
"grad_norm": 0.23864044255017353,
"learning_rate": 1.2319851231985124e-05,
"loss": 0.2847,
"step": 1862
},
{
"epoch": 2.334796238244514,
"grad_norm": 0.25061511286511257,
"learning_rate": 1.2296606229660624e-05,
"loss": 0.308,
"step": 1863
},
{
"epoch": 2.336050156739812,
"grad_norm": 0.23735725169550467,
"learning_rate": 1.2273361227336123e-05,
"loss": 0.2738,
"step": 1864
},
{
"epoch": 2.3373040752351097,
"grad_norm": 0.26291491368660946,
"learning_rate": 1.2250116225011624e-05,
"loss": 0.2951,
"step": 1865
},
{
"epoch": 2.3385579937304075,
"grad_norm": 0.25094296313455955,
"learning_rate": 1.2226871222687122e-05,
"loss": 0.2897,
"step": 1866
},
{
"epoch": 2.3398119122257053,
"grad_norm": 0.24654178486751796,
"learning_rate": 1.2203626220362622e-05,
"loss": 0.3229,
"step": 1867
},
{
"epoch": 2.341065830721003,
"grad_norm": 0.218891906308536,
"learning_rate": 1.2180381218038123e-05,
"loss": 0.2806,
"step": 1868
},
{
"epoch": 2.342319749216301,
"grad_norm": 0.2430565185488236,
"learning_rate": 1.215713621571362e-05,
"loss": 0.3059,
"step": 1869
},
{
"epoch": 2.3435736677115986,
"grad_norm": 0.22967263313197203,
"learning_rate": 1.2133891213389122e-05,
"loss": 0.2861,
"step": 1870
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.2630564520936018,
"learning_rate": 1.2110646211064622e-05,
"loss": 0.3036,
"step": 1871
},
{
"epoch": 2.3460815047021946,
"grad_norm": 0.24724141108211778,
"learning_rate": 1.2087401208740123e-05,
"loss": 0.2919,
"step": 1872
},
{
"epoch": 2.3473354231974923,
"grad_norm": 0.21249577436540593,
"learning_rate": 1.206415620641562e-05,
"loss": 0.2882,
"step": 1873
},
{
"epoch": 2.34858934169279,
"grad_norm": 0.22054441548456047,
"learning_rate": 1.204091120409112e-05,
"loss": 0.2834,
"step": 1874
},
{
"epoch": 2.349843260188088,
"grad_norm": 0.26178740022806385,
"learning_rate": 1.2017666201766621e-05,
"loss": 0.3199,
"step": 1875
},
{
"epoch": 2.3510971786833856,
"grad_norm": 0.25506226612228133,
"learning_rate": 1.199442119944212e-05,
"loss": 0.2951,
"step": 1876
},
{
"epoch": 2.3523510971786834,
"grad_norm": 0.23011939324869257,
"learning_rate": 1.197117619711762e-05,
"loss": 0.2744,
"step": 1877
},
{
"epoch": 2.353605015673981,
"grad_norm": 0.27161337589385526,
"learning_rate": 1.194793119479312e-05,
"loss": 0.3103,
"step": 1878
},
{
"epoch": 2.354858934169279,
"grad_norm": 0.24633231191882923,
"learning_rate": 1.1924686192468621e-05,
"loss": 0.3062,
"step": 1879
},
{
"epoch": 2.3561128526645767,
"grad_norm": 0.246234662415482,
"learning_rate": 1.1901441190144119e-05,
"loss": 0.2917,
"step": 1880
},
{
"epoch": 2.3573667711598745,
"grad_norm": 0.23604335990700848,
"learning_rate": 1.1878196187819619e-05,
"loss": 0.2884,
"step": 1881
},
{
"epoch": 2.3586206896551722,
"grad_norm": 0.25258897568539856,
"learning_rate": 1.185495118549512e-05,
"loss": 0.2983,
"step": 1882
},
{
"epoch": 2.35987460815047,
"grad_norm": 0.23116247100815568,
"learning_rate": 1.1831706183170618e-05,
"loss": 0.3021,
"step": 1883
},
{
"epoch": 2.3611285266457678,
"grad_norm": 0.24770989313063693,
"learning_rate": 1.1808461180846119e-05,
"loss": 0.3125,
"step": 1884
},
{
"epoch": 2.362382445141066,
"grad_norm": 0.24016954240679278,
"learning_rate": 1.1785216178521618e-05,
"loss": 0.2808,
"step": 1885
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.26982930144885103,
"learning_rate": 1.1761971176197118e-05,
"loss": 0.2988,
"step": 1886
},
{
"epoch": 2.3648902821316615,
"grad_norm": 0.23508019304616323,
"learning_rate": 1.1738726173872617e-05,
"loss": 0.2874,
"step": 1887
},
{
"epoch": 2.3661442006269593,
"grad_norm": 0.2679402420926217,
"learning_rate": 1.1715481171548117e-05,
"loss": 0.2918,
"step": 1888
},
{
"epoch": 2.367398119122257,
"grad_norm": 0.23835395013813668,
"learning_rate": 1.1692236169223618e-05,
"loss": 0.295,
"step": 1889
},
{
"epoch": 2.368652037617555,
"grad_norm": 0.27534941806320345,
"learning_rate": 1.1668991166899116e-05,
"loss": 0.305,
"step": 1890
},
{
"epoch": 2.3699059561128526,
"grad_norm": 0.2464610723646613,
"learning_rate": 1.1645746164574617e-05,
"loss": 0.2976,
"step": 1891
},
{
"epoch": 2.3711598746081504,
"grad_norm": 0.22450584548751218,
"learning_rate": 1.1622501162250117e-05,
"loss": 0.2974,
"step": 1892
},
{
"epoch": 2.372413793103448,
"grad_norm": 0.23515678667991885,
"learning_rate": 1.1599256159925616e-05,
"loss": 0.2981,
"step": 1893
},
{
"epoch": 2.373667711598746,
"grad_norm": 0.2636054422630499,
"learning_rate": 1.1576011157601116e-05,
"loss": 0.3117,
"step": 1894
},
{
"epoch": 2.374921630094044,
"grad_norm": 0.24188292879490123,
"learning_rate": 1.1552766155276615e-05,
"loss": 0.304,
"step": 1895
},
{
"epoch": 2.376175548589342,
"grad_norm": 0.22456015200779658,
"learning_rate": 1.1529521152952117e-05,
"loss": 0.2984,
"step": 1896
},
{
"epoch": 2.3774294670846396,
"grad_norm": 0.23061577926208102,
"learning_rate": 1.1506276150627615e-05,
"loss": 0.2941,
"step": 1897
},
{
"epoch": 2.3786833855799374,
"grad_norm": 0.2737608287298759,
"learning_rate": 1.1483031148303116e-05,
"loss": 0.3124,
"step": 1898
},
{
"epoch": 2.379937304075235,
"grad_norm": 0.24343385602908038,
"learning_rate": 1.1459786145978615e-05,
"loss": 0.2875,
"step": 1899
},
{
"epoch": 2.381191222570533,
"grad_norm": 0.24657988182567064,
"learning_rate": 1.1436541143654115e-05,
"loss": 0.3036,
"step": 1900
},
{
"epoch": 2.3824451410658307,
"grad_norm": 0.24043081266899255,
"learning_rate": 1.1413296141329614e-05,
"loss": 0.3055,
"step": 1901
},
{
"epoch": 2.3836990595611285,
"grad_norm": 0.24042971192645293,
"learning_rate": 1.1390051139005114e-05,
"loss": 0.2909,
"step": 1902
},
{
"epoch": 2.3849529780564263,
"grad_norm": 0.23135878978778382,
"learning_rate": 1.1366806136680615e-05,
"loss": 0.2963,
"step": 1903
},
{
"epoch": 2.386206896551724,
"grad_norm": 0.24689948406512166,
"learning_rate": 1.1343561134356113e-05,
"loss": 0.2867,
"step": 1904
},
{
"epoch": 2.387460815047022,
"grad_norm": 0.2405940817410217,
"learning_rate": 1.1320316132031614e-05,
"loss": 0.2949,
"step": 1905
},
{
"epoch": 2.3887147335423196,
"grad_norm": 0.21899582527678876,
"learning_rate": 1.1297071129707114e-05,
"loss": 0.2826,
"step": 1906
},
{
"epoch": 2.3899686520376173,
"grad_norm": 0.27559239023151383,
"learning_rate": 1.1273826127382613e-05,
"loss": 0.3057,
"step": 1907
},
{
"epoch": 2.3912225705329155,
"grad_norm": 0.22885115357751626,
"learning_rate": 1.1250581125058113e-05,
"loss": 0.3078,
"step": 1908
},
{
"epoch": 2.3924764890282133,
"grad_norm": 0.22744370403524714,
"learning_rate": 1.1227336122733612e-05,
"loss": 0.2893,
"step": 1909
},
{
"epoch": 2.393730407523511,
"grad_norm": 0.23565235168065998,
"learning_rate": 1.1204091120409114e-05,
"loss": 0.2899,
"step": 1910
},
{
"epoch": 2.394984326018809,
"grad_norm": 0.22584987057834907,
"learning_rate": 1.1180846118084611e-05,
"loss": 0.2629,
"step": 1911
},
{
"epoch": 2.3962382445141066,
"grad_norm": 0.23019918284333077,
"learning_rate": 1.1157601115760113e-05,
"loss": 0.3162,
"step": 1912
},
{
"epoch": 2.3974921630094044,
"grad_norm": 0.24037968118048542,
"learning_rate": 1.1134356113435612e-05,
"loss": 0.3212,
"step": 1913
},
{
"epoch": 2.398746081504702,
"grad_norm": 0.2470068444057083,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.306,
"step": 1914
},
{
"epoch": 2.4,
"grad_norm": 0.22631986671905424,
"learning_rate": 1.1087866108786611e-05,
"loss": 0.2898,
"step": 1915
},
{
"epoch": 2.4012539184952977,
"grad_norm": 0.23030553087099617,
"learning_rate": 1.106462110646211e-05,
"loss": 0.2854,
"step": 1916
},
{
"epoch": 2.4025078369905954,
"grad_norm": 0.2317182276267081,
"learning_rate": 1.1041376104137612e-05,
"loss": 0.3004,
"step": 1917
},
{
"epoch": 2.4037617554858937,
"grad_norm": 0.23730327378468818,
"learning_rate": 1.101813110181311e-05,
"loss": 0.3014,
"step": 1918
},
{
"epoch": 2.4050156739811914,
"grad_norm": 0.24183309955858545,
"learning_rate": 1.0994886099488611e-05,
"loss": 0.2853,
"step": 1919
},
{
"epoch": 2.406269592476489,
"grad_norm": 0.22299630961455638,
"learning_rate": 1.097164109716411e-05,
"loss": 0.283,
"step": 1920
},
{
"epoch": 2.407523510971787,
"grad_norm": 0.23637233986000028,
"learning_rate": 1.094839609483961e-05,
"loss": 0.2966,
"step": 1921
},
{
"epoch": 2.4087774294670847,
"grad_norm": 0.2476700225880769,
"learning_rate": 1.092515109251511e-05,
"loss": 0.3113,
"step": 1922
},
{
"epoch": 2.4100313479623825,
"grad_norm": 0.23072255509871312,
"learning_rate": 1.0901906090190609e-05,
"loss": 0.2929,
"step": 1923
},
{
"epoch": 2.4112852664576803,
"grad_norm": 0.2387894686261957,
"learning_rate": 1.087866108786611e-05,
"loss": 0.2851,
"step": 1924
},
{
"epoch": 2.412539184952978,
"grad_norm": 0.21824532821563378,
"learning_rate": 1.0855416085541608e-05,
"loss": 0.3161,
"step": 1925
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.22523837610732778,
"learning_rate": 1.083217108321711e-05,
"loss": 0.2943,
"step": 1926
},
{
"epoch": 2.4150470219435736,
"grad_norm": 0.23255544322967403,
"learning_rate": 1.0808926080892609e-05,
"loss": 0.2801,
"step": 1927
},
{
"epoch": 2.4163009404388713,
"grad_norm": 0.2575463498605155,
"learning_rate": 1.0785681078568108e-05,
"loss": 0.3077,
"step": 1928
},
{
"epoch": 2.417554858934169,
"grad_norm": 0.2242501014257073,
"learning_rate": 1.0762436076243608e-05,
"loss": 0.2948,
"step": 1929
},
{
"epoch": 2.418808777429467,
"grad_norm": 0.2259144431106535,
"learning_rate": 1.0739191073919108e-05,
"loss": 0.294,
"step": 1930
},
{
"epoch": 2.420062695924765,
"grad_norm": 0.23445491309506405,
"learning_rate": 1.0715946071594609e-05,
"loss": 0.3,
"step": 1931
},
{
"epoch": 2.421316614420063,
"grad_norm": 0.23614295267856342,
"learning_rate": 1.0692701069270107e-05,
"loss": 0.3138,
"step": 1932
},
{
"epoch": 2.4225705329153606,
"grad_norm": 0.22964548983740213,
"learning_rate": 1.0669456066945608e-05,
"loss": 0.2986,
"step": 1933
},
{
"epoch": 2.4238244514106584,
"grad_norm": 0.2233389460112934,
"learning_rate": 1.0646211064621107e-05,
"loss": 0.2843,
"step": 1934
},
{
"epoch": 2.425078369905956,
"grad_norm": 0.24551137813939305,
"learning_rate": 1.0622966062296607e-05,
"loss": 0.2828,
"step": 1935
},
{
"epoch": 2.426332288401254,
"grad_norm": 0.24668498254319793,
"learning_rate": 1.0599721059972106e-05,
"loss": 0.2933,
"step": 1936
},
{
"epoch": 2.4275862068965517,
"grad_norm": 0.24515060786893805,
"learning_rate": 1.0576476057647606e-05,
"loss": 0.3005,
"step": 1937
},
{
"epoch": 2.4288401253918495,
"grad_norm": 0.21935210794037746,
"learning_rate": 1.0553231055323106e-05,
"loss": 0.2825,
"step": 1938
},
{
"epoch": 2.4300940438871472,
"grad_norm": 0.22883240614822306,
"learning_rate": 1.0529986052998605e-05,
"loss": 0.3047,
"step": 1939
},
{
"epoch": 2.431347962382445,
"grad_norm": 0.25782500318618273,
"learning_rate": 1.0506741050674106e-05,
"loss": 0.2815,
"step": 1940
},
{
"epoch": 2.4326018808777428,
"grad_norm": 0.23344478913984304,
"learning_rate": 1.0483496048349606e-05,
"loss": 0.3032,
"step": 1941
},
{
"epoch": 2.433855799373041,
"grad_norm": 0.21974198767381103,
"learning_rate": 1.0460251046025105e-05,
"loss": 0.3026,
"step": 1942
},
{
"epoch": 2.4351097178683387,
"grad_norm": 0.22696460577758568,
"learning_rate": 1.0437006043700605e-05,
"loss": 0.2885,
"step": 1943
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.23042902322331643,
"learning_rate": 1.0413761041376104e-05,
"loss": 0.2963,
"step": 1944
},
{
"epoch": 2.4376175548589343,
"grad_norm": 0.19556078177649527,
"learning_rate": 1.0390516039051604e-05,
"loss": 0.2828,
"step": 1945
},
{
"epoch": 2.438871473354232,
"grad_norm": 0.21732101280858943,
"learning_rate": 1.0367271036727103e-05,
"loss": 0.2978,
"step": 1946
},
{
"epoch": 2.44012539184953,
"grad_norm": 0.23778543760760792,
"learning_rate": 1.0344026034402605e-05,
"loss": 0.3021,
"step": 1947
},
{
"epoch": 2.4413793103448276,
"grad_norm": 0.21772115295896313,
"learning_rate": 1.0320781032078104e-05,
"loss": 0.2996,
"step": 1948
},
{
"epoch": 2.4426332288401253,
"grad_norm": 0.2276931450991446,
"learning_rate": 1.0297536029753604e-05,
"loss": 0.2951,
"step": 1949
},
{
"epoch": 2.443887147335423,
"grad_norm": 0.23097301752435995,
"learning_rate": 1.0274291027429103e-05,
"loss": 0.3023,
"step": 1950
},
{
"epoch": 2.445141065830721,
"grad_norm": 0.22766768878965543,
"learning_rate": 1.0251046025104603e-05,
"loss": 0.2934,
"step": 1951
},
{
"epoch": 2.4463949843260187,
"grad_norm": 0.25109522988013216,
"learning_rate": 1.0227801022780102e-05,
"loss": 0.2906,
"step": 1952
},
{
"epoch": 2.4476489028213164,
"grad_norm": 0.21914882985426679,
"learning_rate": 1.0204556020455602e-05,
"loss": 0.2882,
"step": 1953
},
{
"epoch": 2.448902821316614,
"grad_norm": 0.2343844382578704,
"learning_rate": 1.0181311018131103e-05,
"loss": 0.2977,
"step": 1954
},
{
"epoch": 2.4501567398119124,
"grad_norm": 0.2724369529419338,
"learning_rate": 1.0158066015806603e-05,
"loss": 0.3079,
"step": 1955
},
{
"epoch": 2.45141065830721,
"grad_norm": 0.20529977458638315,
"learning_rate": 1.0134821013482102e-05,
"loss": 0.2655,
"step": 1956
},
{
"epoch": 2.452664576802508,
"grad_norm": 0.22223373222761772,
"learning_rate": 1.0111576011157602e-05,
"loss": 0.3158,
"step": 1957
},
{
"epoch": 2.4539184952978057,
"grad_norm": 0.22633994113737155,
"learning_rate": 1.0088331008833101e-05,
"loss": 0.302,
"step": 1958
},
{
"epoch": 2.4551724137931035,
"grad_norm": 0.23372119758725732,
"learning_rate": 1.00650860065086e-05,
"loss": 0.3021,
"step": 1959
},
{
"epoch": 2.4564263322884012,
"grad_norm": 0.21822036113039833,
"learning_rate": 1.00418410041841e-05,
"loss": 0.2653,
"step": 1960
},
{
"epoch": 2.457680250783699,
"grad_norm": 0.23874543517103422,
"learning_rate": 1.0018596001859602e-05,
"loss": 0.3162,
"step": 1961
},
{
"epoch": 2.4589341692789968,
"grad_norm": 0.23826140051909903,
"learning_rate": 9.995350999535101e-06,
"loss": 0.3087,
"step": 1962
},
{
"epoch": 2.4601880877742945,
"grad_norm": 0.24872505723965782,
"learning_rate": 9.9721059972106e-06,
"loss": 0.3208,
"step": 1963
},
{
"epoch": 2.4614420062695923,
"grad_norm": 0.22458824502934685,
"learning_rate": 9.9488609948861e-06,
"loss": 0.2751,
"step": 1964
},
{
"epoch": 2.4626959247648905,
"grad_norm": 0.23917573959672733,
"learning_rate": 9.9256159925616e-06,
"loss": 0.326,
"step": 1965
},
{
"epoch": 2.4639498432601883,
"grad_norm": 0.2035389700763071,
"learning_rate": 9.9023709902371e-06,
"loss": 0.2756,
"step": 1966
},
{
"epoch": 2.465203761755486,
"grad_norm": 0.2286715517176188,
"learning_rate": 9.879125987912599e-06,
"loss": 0.3005,
"step": 1967
},
{
"epoch": 2.466457680250784,
"grad_norm": 0.23088474542487866,
"learning_rate": 9.8558809855881e-06,
"loss": 0.2966,
"step": 1968
},
{
"epoch": 2.4677115987460816,
"grad_norm": 0.2513615807991823,
"learning_rate": 9.8326359832636e-06,
"loss": 0.3138,
"step": 1969
},
{
"epoch": 2.4689655172413794,
"grad_norm": 0.22317502915458207,
"learning_rate": 9.809390980939099e-06,
"loss": 0.3052,
"step": 1970
},
{
"epoch": 2.470219435736677,
"grad_norm": 0.24740385255673059,
"learning_rate": 9.786145978614599e-06,
"loss": 0.3086,
"step": 1971
},
{
"epoch": 2.471473354231975,
"grad_norm": 0.24258126908826164,
"learning_rate": 9.762900976290098e-06,
"loss": 0.2965,
"step": 1972
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.24344141888434317,
"learning_rate": 9.739655973965598e-06,
"loss": 0.2913,
"step": 1973
},
{
"epoch": 2.4739811912225704,
"grad_norm": 0.22973589596933353,
"learning_rate": 9.716410971641097e-06,
"loss": 0.2919,
"step": 1974
},
{
"epoch": 2.475235109717868,
"grad_norm": 0.297685974444787,
"learning_rate": 9.693165969316598e-06,
"loss": 0.3025,
"step": 1975
},
{
"epoch": 2.476489028213166,
"grad_norm": 0.2588257555789097,
"learning_rate": 9.669920966992096e-06,
"loss": 0.2921,
"step": 1976
},
{
"epoch": 2.4777429467084637,
"grad_norm": 0.2640425247123717,
"learning_rate": 9.646675964667597e-06,
"loss": 0.2983,
"step": 1977
},
{
"epoch": 2.478996865203762,
"grad_norm": 0.24546295435197846,
"learning_rate": 9.623430962343097e-06,
"loss": 0.3002,
"step": 1978
},
{
"epoch": 2.4802507836990597,
"grad_norm": 0.27662837081187397,
"learning_rate": 9.600185960018597e-06,
"loss": 0.3123,
"step": 1979
},
{
"epoch": 2.4815047021943575,
"grad_norm": 0.31293845717938346,
"learning_rate": 9.576940957694096e-06,
"loss": 0.3067,
"step": 1980
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.23008539614617915,
"learning_rate": 9.553695955369596e-06,
"loss": 0.2896,
"step": 1981
},
{
"epoch": 2.484012539184953,
"grad_norm": 0.24481404053492406,
"learning_rate": 9.530450953045097e-06,
"loss": 0.3086,
"step": 1982
},
{
"epoch": 2.485266457680251,
"grad_norm": 0.2762089962644676,
"learning_rate": 9.507205950720595e-06,
"loss": 0.2906,
"step": 1983
},
{
"epoch": 2.4865203761755486,
"grad_norm": 0.25824342132858885,
"learning_rate": 9.483960948396096e-06,
"loss": 0.2837,
"step": 1984
},
{
"epoch": 2.4877742946708463,
"grad_norm": 0.22536914497913446,
"learning_rate": 9.460715946071595e-06,
"loss": 0.2813,
"step": 1985
},
{
"epoch": 2.489028213166144,
"grad_norm": 0.2538133951419674,
"learning_rate": 9.437470943747095e-06,
"loss": 0.3106,
"step": 1986
},
{
"epoch": 2.490282131661442,
"grad_norm": 0.23179508161963552,
"learning_rate": 9.414225941422594e-06,
"loss": 0.2994,
"step": 1987
},
{
"epoch": 2.4915360501567396,
"grad_norm": 0.26596142513119964,
"learning_rate": 9.390980939098094e-06,
"loss": 0.2968,
"step": 1988
},
{
"epoch": 2.492789968652038,
"grad_norm": 0.23989256937777473,
"learning_rate": 9.367735936773595e-06,
"loss": 0.2892,
"step": 1989
},
{
"epoch": 2.4940438871473356,
"grad_norm": 0.242536870687771,
"learning_rate": 9.344490934449093e-06,
"loss": 0.2994,
"step": 1990
},
{
"epoch": 2.4952978056426334,
"grad_norm": 0.22910688388274683,
"learning_rate": 9.321245932124594e-06,
"loss": 0.3045,
"step": 1991
},
{
"epoch": 2.496551724137931,
"grad_norm": 0.22894474440044707,
"learning_rate": 9.298000929800094e-06,
"loss": 0.3041,
"step": 1992
},
{
"epoch": 2.497805642633229,
"grad_norm": 0.20909905660306244,
"learning_rate": 9.274755927475593e-06,
"loss": 0.2832,
"step": 1993
},
{
"epoch": 2.4990595611285267,
"grad_norm": 0.25070920838820865,
"learning_rate": 9.251510925151093e-06,
"loss": 0.2964,
"step": 1994
},
{
"epoch": 2.5003134796238244,
"grad_norm": 0.22411088569156984,
"learning_rate": 9.228265922826592e-06,
"loss": 0.2842,
"step": 1995
},
{
"epoch": 2.501567398119122,
"grad_norm": 0.23379125993880462,
"learning_rate": 9.205020920502094e-06,
"loss": 0.3006,
"step": 1996
},
{
"epoch": 2.50282131661442,
"grad_norm": 0.21618252739317512,
"learning_rate": 9.181775918177591e-06,
"loss": 0.2911,
"step": 1997
},
{
"epoch": 2.5040752351097177,
"grad_norm": 0.21427291664483095,
"learning_rate": 9.158530915853093e-06,
"loss": 0.278,
"step": 1998
},
{
"epoch": 2.5053291536050155,
"grad_norm": 0.23487158770650052,
"learning_rate": 9.135285913528592e-06,
"loss": 0.3032,
"step": 1999
},
{
"epoch": 2.5065830721003133,
"grad_norm": 0.22254956495224906,
"learning_rate": 9.112040911204092e-06,
"loss": 0.2973,
"step": 2000
},
{
"epoch": 2.507836990595611,
"grad_norm": 0.23291658880419727,
"learning_rate": 9.088795908879591e-06,
"loss": 0.2978,
"step": 2001
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.2296187008258446,
"learning_rate": 9.06555090655509e-06,
"loss": 0.2729,
"step": 2002
},
{
"epoch": 2.510344827586207,
"grad_norm": 0.24845635946736178,
"learning_rate": 9.042305904230592e-06,
"loss": 0.3068,
"step": 2003
},
{
"epoch": 2.511598746081505,
"grad_norm": 0.21477291136685564,
"learning_rate": 9.01906090190609e-06,
"loss": 0.3162,
"step": 2004
},
{
"epoch": 2.5128526645768026,
"grad_norm": 0.21600020275245546,
"learning_rate": 8.99581589958159e-06,
"loss": 0.2773,
"step": 2005
},
{
"epoch": 2.5141065830721003,
"grad_norm": 0.23031281410643145,
"learning_rate": 8.97257089725709e-06,
"loss": 0.2885,
"step": 2006
},
{
"epoch": 2.515360501567398,
"grad_norm": 0.2128167671876887,
"learning_rate": 8.94932589493259e-06,
"loss": 0.2856,
"step": 2007
},
{
"epoch": 2.516614420062696,
"grad_norm": 0.25674296333103286,
"learning_rate": 8.92608089260809e-06,
"loss": 0.2989,
"step": 2008
},
{
"epoch": 2.5178683385579936,
"grad_norm": 0.2265892801052551,
"learning_rate": 8.90283589028359e-06,
"loss": 0.3002,
"step": 2009
},
{
"epoch": 2.5191222570532914,
"grad_norm": 0.211433729736665,
"learning_rate": 8.87959088795909e-06,
"loss": 0.2931,
"step": 2010
},
{
"epoch": 2.5203761755485896,
"grad_norm": 0.21724860685283304,
"learning_rate": 8.856345885634588e-06,
"loss": 0.3061,
"step": 2011
},
{
"epoch": 2.5216300940438874,
"grad_norm": 0.2075298965923879,
"learning_rate": 8.833100883310088e-06,
"loss": 0.3055,
"step": 2012
},
{
"epoch": 2.522884012539185,
"grad_norm": 0.2263306355558275,
"learning_rate": 8.809855880985589e-06,
"loss": 0.3018,
"step": 2013
},
{
"epoch": 2.524137931034483,
"grad_norm": 0.21466904461551342,
"learning_rate": 8.786610878661087e-06,
"loss": 0.2907,
"step": 2014
},
{
"epoch": 2.5253918495297807,
"grad_norm": 0.2061977290582163,
"learning_rate": 8.763365876336588e-06,
"loss": 0.2867,
"step": 2015
},
{
"epoch": 2.5266457680250785,
"grad_norm": 0.19636981525707242,
"learning_rate": 8.740120874012088e-06,
"loss": 0.2917,
"step": 2016
},
{
"epoch": 2.527899686520376,
"grad_norm": 0.20728238174112748,
"learning_rate": 8.716875871687589e-06,
"loss": 0.2917,
"step": 2017
},
{
"epoch": 2.529153605015674,
"grad_norm": 0.22726994745018206,
"learning_rate": 8.693630869363087e-06,
"loss": 0.2898,
"step": 2018
},
{
"epoch": 2.5304075235109718,
"grad_norm": 0.23863774922620945,
"learning_rate": 8.670385867038586e-06,
"loss": 0.2955,
"step": 2019
},
{
"epoch": 2.5316614420062695,
"grad_norm": 0.20620104840092898,
"learning_rate": 8.647140864714088e-06,
"loss": 0.2819,
"step": 2020
},
{
"epoch": 2.5329153605015673,
"grad_norm": 0.25045972355272994,
"learning_rate": 8.623895862389585e-06,
"loss": 0.302,
"step": 2021
},
{
"epoch": 2.534169278996865,
"grad_norm": 0.2564848642265122,
"learning_rate": 8.600650860065087e-06,
"loss": 0.3063,
"step": 2022
},
{
"epoch": 2.535423197492163,
"grad_norm": 0.20108841784441198,
"learning_rate": 8.577405857740586e-06,
"loss": 0.2752,
"step": 2023
},
{
"epoch": 2.5366771159874606,
"grad_norm": 0.21803605239721083,
"learning_rate": 8.554160855416087e-06,
"loss": 0.2878,
"step": 2024
},
{
"epoch": 2.5379310344827584,
"grad_norm": 0.21209072688753275,
"learning_rate": 8.530915853091585e-06,
"loss": 0.2779,
"step": 2025
},
{
"epoch": 2.5391849529780566,
"grad_norm": 0.22110935219658392,
"learning_rate": 8.507670850767085e-06,
"loss": 0.2947,
"step": 2026
},
{
"epoch": 2.5404388714733543,
"grad_norm": 0.26644490179194263,
"learning_rate": 8.484425848442586e-06,
"loss": 0.3089,
"step": 2027
},
{
"epoch": 2.541692789968652,
"grad_norm": 0.21202533681957053,
"learning_rate": 8.461180846118084e-06,
"loss": 0.3013,
"step": 2028
},
{
"epoch": 2.54294670846395,
"grad_norm": 0.2111275410570272,
"learning_rate": 8.437935843793585e-06,
"loss": 0.2836,
"step": 2029
},
{
"epoch": 2.5442006269592476,
"grad_norm": 0.22704697763455753,
"learning_rate": 8.414690841469085e-06,
"loss": 0.3044,
"step": 2030
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.2289082161109731,
"learning_rate": 8.391445839144586e-06,
"loss": 0.3111,
"step": 2031
},
{
"epoch": 2.546708463949843,
"grad_norm": 0.21094087378367685,
"learning_rate": 8.368200836820084e-06,
"loss": 0.2836,
"step": 2032
},
{
"epoch": 2.547962382445141,
"grad_norm": 0.2958357047288123,
"learning_rate": 8.344955834495583e-06,
"loss": 0.3195,
"step": 2033
},
{
"epoch": 2.5492163009404387,
"grad_norm": 0.2201383027136679,
"learning_rate": 8.321710832171084e-06,
"loss": 0.2796,
"step": 2034
},
{
"epoch": 2.550470219435737,
"grad_norm": 0.21736125658553962,
"learning_rate": 8.298465829846582e-06,
"loss": 0.3087,
"step": 2035
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.23985952592361925,
"learning_rate": 8.275220827522083e-06,
"loss": 0.3154,
"step": 2036
},
{
"epoch": 2.5529780564263325,
"grad_norm": 0.23648458106891493,
"learning_rate": 8.251975825197583e-06,
"loss": 0.3082,
"step": 2037
},
{
"epoch": 2.5542319749216302,
"grad_norm": 0.23694274455826497,
"learning_rate": 8.228730822873084e-06,
"loss": 0.2926,
"step": 2038
},
{
"epoch": 2.555485893416928,
"grad_norm": 0.22250132325144314,
"learning_rate": 8.205485820548582e-06,
"loss": 0.2988,
"step": 2039
},
{
"epoch": 2.5567398119122258,
"grad_norm": 0.22029750737485526,
"learning_rate": 8.182240818224082e-06,
"loss": 0.3075,
"step": 2040
},
{
"epoch": 2.5579937304075235,
"grad_norm": 0.23917807956534487,
"learning_rate": 8.158995815899583e-06,
"loss": 0.2965,
"step": 2041
},
{
"epoch": 2.5592476489028213,
"grad_norm": 0.2135133211298134,
"learning_rate": 8.13575081357508e-06,
"loss": 0.2929,
"step": 2042
},
{
"epoch": 2.560501567398119,
"grad_norm": 0.2108404428033065,
"learning_rate": 8.112505811250582e-06,
"loss": 0.2874,
"step": 2043
},
{
"epoch": 2.561755485893417,
"grad_norm": 0.2388149299898232,
"learning_rate": 8.089260808926081e-06,
"loss": 0.3099,
"step": 2044
},
{
"epoch": 2.5630094043887146,
"grad_norm": 0.2618375388901706,
"learning_rate": 8.066015806601581e-06,
"loss": 0.3026,
"step": 2045
},
{
"epoch": 2.5642633228840124,
"grad_norm": 0.23830988659117774,
"learning_rate": 8.04277080427708e-06,
"loss": 0.2907,
"step": 2046
},
{
"epoch": 2.56551724137931,
"grad_norm": 0.2164393758232813,
"learning_rate": 8.01952580195258e-06,
"loss": 0.2906,
"step": 2047
},
{
"epoch": 2.566771159874608,
"grad_norm": 0.20370471226214665,
"learning_rate": 7.996280799628081e-06,
"loss": 0.283,
"step": 2048
},
{
"epoch": 2.568025078369906,
"grad_norm": 0.23225764971465201,
"learning_rate": 7.973035797303579e-06,
"loss": 0.3014,
"step": 2049
},
{
"epoch": 2.569278996865204,
"grad_norm": 0.23342743853222753,
"learning_rate": 7.94979079497908e-06,
"loss": 0.3145,
"step": 2050
},
{
"epoch": 2.5705329153605017,
"grad_norm": 0.2247883581075607,
"learning_rate": 7.92654579265458e-06,
"loss": 0.2989,
"step": 2051
},
{
"epoch": 2.5717868338557994,
"grad_norm": 0.22073978544993683,
"learning_rate": 7.90330079033008e-06,
"loss": 0.2989,
"step": 2052
},
{
"epoch": 2.573040752351097,
"grad_norm": 0.2034871186721174,
"learning_rate": 7.880055788005579e-06,
"loss": 0.2949,
"step": 2053
},
{
"epoch": 2.574294670846395,
"grad_norm": 0.2127877972155832,
"learning_rate": 7.856810785681078e-06,
"loss": 0.3015,
"step": 2054
},
{
"epoch": 2.5755485893416927,
"grad_norm": 0.2122882383899126,
"learning_rate": 7.83356578335658e-06,
"loss": 0.2888,
"step": 2055
},
{
"epoch": 2.5768025078369905,
"grad_norm": 0.21424991280202563,
"learning_rate": 7.810320781032077e-06,
"loss": 0.2877,
"step": 2056
},
{
"epoch": 2.5780564263322883,
"grad_norm": 0.2205153546437731,
"learning_rate": 7.787075778707579e-06,
"loss": 0.2893,
"step": 2057
},
{
"epoch": 2.5793103448275865,
"grad_norm": 0.21860793812421378,
"learning_rate": 7.763830776383078e-06,
"loss": 0.2774,
"step": 2058
},
{
"epoch": 2.5805642633228842,
"grad_norm": 0.21783537676995388,
"learning_rate": 7.740585774058578e-06,
"loss": 0.3032,
"step": 2059
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.23475815753496462,
"learning_rate": 7.717340771734077e-06,
"loss": 0.3014,
"step": 2060
},
{
"epoch": 2.58307210031348,
"grad_norm": 0.22144795774472195,
"learning_rate": 7.694095769409577e-06,
"loss": 0.2934,
"step": 2061
},
{
"epoch": 2.5843260188087775,
"grad_norm": 0.20763852011641964,
"learning_rate": 7.670850767085078e-06,
"loss": 0.2937,
"step": 2062
},
{
"epoch": 2.5855799373040753,
"grad_norm": 0.2265150321430951,
"learning_rate": 7.647605764760576e-06,
"loss": 0.2989,
"step": 2063
},
{
"epoch": 2.586833855799373,
"grad_norm": 0.24764754438696004,
"learning_rate": 7.624360762436076e-06,
"loss": 0.2973,
"step": 2064
},
{
"epoch": 2.588087774294671,
"grad_norm": 0.24082527048654187,
"learning_rate": 7.601115760111577e-06,
"loss": 0.3031,
"step": 2065
},
{
"epoch": 2.5893416927899686,
"grad_norm": 0.22526964794352033,
"learning_rate": 7.577870757787075e-06,
"loss": 0.2932,
"step": 2066
},
{
"epoch": 2.5905956112852664,
"grad_norm": 0.23100801709248528,
"learning_rate": 7.554625755462576e-06,
"loss": 0.2919,
"step": 2067
},
{
"epoch": 2.591849529780564,
"grad_norm": 0.22561283495403692,
"learning_rate": 7.531380753138076e-06,
"loss": 0.305,
"step": 2068
},
{
"epoch": 2.593103448275862,
"grad_norm": 0.2168275532912764,
"learning_rate": 7.5081357508135765e-06,
"loss": 0.306,
"step": 2069
},
{
"epoch": 2.5943573667711597,
"grad_norm": 0.21593364375898774,
"learning_rate": 7.484890748489075e-06,
"loss": 0.2896,
"step": 2070
},
{
"epoch": 2.5956112852664575,
"grad_norm": 0.2668812108070002,
"learning_rate": 7.461645746164575e-06,
"loss": 0.3184,
"step": 2071
},
{
"epoch": 2.5968652037617552,
"grad_norm": 0.21492864724843774,
"learning_rate": 7.438400743840075e-06,
"loss": 0.3002,
"step": 2072
},
{
"epoch": 2.5981191222570534,
"grad_norm": 0.22118781093440343,
"learning_rate": 7.415155741515574e-06,
"loss": 0.3284,
"step": 2073
},
{
"epoch": 2.599373040752351,
"grad_norm": 0.23092386530652023,
"learning_rate": 7.391910739191074e-06,
"loss": 0.2954,
"step": 2074
},
{
"epoch": 2.600626959247649,
"grad_norm": 0.202328229827701,
"learning_rate": 7.3686657368665745e-06,
"loss": 0.2849,
"step": 2075
},
{
"epoch": 2.6018808777429467,
"grad_norm": 0.21955968113838428,
"learning_rate": 7.345420734542075e-06,
"loss": 0.2908,
"step": 2076
},
{
"epoch": 2.6031347962382445,
"grad_norm": 0.24497203705862275,
"learning_rate": 7.3221757322175736e-06,
"loss": 0.3098,
"step": 2077
},
{
"epoch": 2.6043887147335423,
"grad_norm": 0.22013705970692762,
"learning_rate": 7.298930729893073e-06,
"loss": 0.2942,
"step": 2078
},
{
"epoch": 2.60564263322884,
"grad_norm": 0.22216727276583528,
"learning_rate": 7.2756857275685735e-06,
"loss": 0.3058,
"step": 2079
},
{
"epoch": 2.606896551724138,
"grad_norm": 0.2252568306331399,
"learning_rate": 7.252440725244072e-06,
"loss": 0.2835,
"step": 2080
},
{
"epoch": 2.6081504702194356,
"grad_norm": 0.22708340521183282,
"learning_rate": 7.2291957229195725e-06,
"loss": 0.2818,
"step": 2081
},
{
"epoch": 2.609404388714734,
"grad_norm": 0.24534512766504848,
"learning_rate": 7.205950720595073e-06,
"loss": 0.2862,
"step": 2082
},
{
"epoch": 2.6106583072100316,
"grad_norm": 0.24760434013192498,
"learning_rate": 7.182705718270572e-06,
"loss": 0.2952,
"step": 2083
},
{
"epoch": 2.6119122257053293,
"grad_norm": 0.23325781984201546,
"learning_rate": 7.159460715946072e-06,
"loss": 0.309,
"step": 2084
},
{
"epoch": 2.613166144200627,
"grad_norm": 0.2507347163320025,
"learning_rate": 7.1362157136215715e-06,
"loss": 0.3065,
"step": 2085
},
{
"epoch": 2.614420062695925,
"grad_norm": 0.2209984845783113,
"learning_rate": 7.112970711297072e-06,
"loss": 0.3137,
"step": 2086
},
{
"epoch": 2.6156739811912226,
"grad_norm": 0.21249840529599395,
"learning_rate": 7.089725708972571e-06,
"loss": 0.2963,
"step": 2087
},
{
"epoch": 2.6169278996865204,
"grad_norm": 0.21848180691015628,
"learning_rate": 7.066480706648071e-06,
"loss": 0.2833,
"step": 2088
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.228598141414097,
"learning_rate": 7.043235704323571e-06,
"loss": 0.3026,
"step": 2089
},
{
"epoch": 2.619435736677116,
"grad_norm": 0.22602483733747722,
"learning_rate": 7.01999070199907e-06,
"loss": 0.2866,
"step": 2090
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.23429087976602817,
"learning_rate": 6.99674569967457e-06,
"loss": 0.2849,
"step": 2091
},
{
"epoch": 2.6219435736677115,
"grad_norm": 0.21123501019500698,
"learning_rate": 6.97350069735007e-06,
"loss": 0.2913,
"step": 2092
},
{
"epoch": 2.6231974921630092,
"grad_norm": 0.2098653620182775,
"learning_rate": 6.95025569502557e-06,
"loss": 0.2901,
"step": 2093
},
{
"epoch": 2.624451410658307,
"grad_norm": 0.20007522422923021,
"learning_rate": 6.927010692701069e-06,
"loss": 0.287,
"step": 2094
},
{
"epoch": 2.6257053291536048,
"grad_norm": 0.2689021544151461,
"learning_rate": 6.903765690376569e-06,
"loss": 0.2939,
"step": 2095
},
{
"epoch": 2.626959247648903,
"grad_norm": 0.2293640248509719,
"learning_rate": 6.88052068805207e-06,
"loss": 0.2814,
"step": 2096
},
{
"epoch": 2.6282131661442008,
"grad_norm": 0.24365622744047755,
"learning_rate": 6.8572756857275684e-06,
"loss": 0.287,
"step": 2097
},
{
"epoch": 2.6294670846394985,
"grad_norm": 0.22989674892602757,
"learning_rate": 6.834030683403069e-06,
"loss": 0.2853,
"step": 2098
},
{
"epoch": 2.6307210031347963,
"grad_norm": 0.24128730703236384,
"learning_rate": 6.810785681078568e-06,
"loss": 0.2926,
"step": 2099
},
{
"epoch": 2.631974921630094,
"grad_norm": 0.25776402987487673,
"learning_rate": 6.787540678754069e-06,
"loss": 0.2895,
"step": 2100
},
{
"epoch": 2.633228840125392,
"grad_norm": 0.2212025240316488,
"learning_rate": 6.7642956764295674e-06,
"loss": 0.2854,
"step": 2101
},
{
"epoch": 2.6344827586206896,
"grad_norm": 0.2207089771944118,
"learning_rate": 6.741050674105068e-06,
"loss": 0.304,
"step": 2102
},
{
"epoch": 2.6357366771159874,
"grad_norm": 0.23229748738863193,
"learning_rate": 6.717805671780568e-06,
"loss": 0.3048,
"step": 2103
},
{
"epoch": 2.636990595611285,
"grad_norm": 0.21752594518515606,
"learning_rate": 6.694560669456067e-06,
"loss": 0.3172,
"step": 2104
},
{
"epoch": 2.6382445141065833,
"grad_norm": 0.21876453036875265,
"learning_rate": 6.671315667131567e-06,
"loss": 0.2867,
"step": 2105
},
{
"epoch": 2.639498432601881,
"grad_norm": 0.22292812720835084,
"learning_rate": 6.648070664807067e-06,
"loss": 0.2782,
"step": 2106
},
{
"epoch": 2.640752351097179,
"grad_norm": 0.20101697395415835,
"learning_rate": 6.624825662482567e-06,
"loss": 0.2962,
"step": 2107
},
{
"epoch": 2.6420062695924766,
"grad_norm": 0.2313104667535263,
"learning_rate": 6.601580660158066e-06,
"loss": 0.2988,
"step": 2108
},
{
"epoch": 2.6432601880877744,
"grad_norm": 0.24694192465654377,
"learning_rate": 6.578335657833566e-06,
"loss": 0.295,
"step": 2109
},
{
"epoch": 2.644514106583072,
"grad_norm": 0.23396429120142956,
"learning_rate": 6.555090655509067e-06,
"loss": 0.2918,
"step": 2110
},
{
"epoch": 2.64576802507837,
"grad_norm": 0.22040008984662823,
"learning_rate": 6.531845653184565e-06,
"loss": 0.2833,
"step": 2111
},
{
"epoch": 2.6470219435736677,
"grad_norm": 0.20748715519209388,
"learning_rate": 6.508600650860066e-06,
"loss": 0.3007,
"step": 2112
},
{
"epoch": 2.6482758620689655,
"grad_norm": 0.25611457709285484,
"learning_rate": 6.485355648535565e-06,
"loss": 0.2818,
"step": 2113
},
{
"epoch": 2.6495297805642632,
"grad_norm": 0.23608173874478974,
"learning_rate": 6.462110646211066e-06,
"loss": 0.2927,
"step": 2114
},
{
"epoch": 2.650783699059561,
"grad_norm": 0.23997693567399708,
"learning_rate": 6.438865643886564e-06,
"loss": 0.3007,
"step": 2115
},
{
"epoch": 2.652037617554859,
"grad_norm": 0.21891772707308046,
"learning_rate": 6.415620641562065e-06,
"loss": 0.2944,
"step": 2116
},
{
"epoch": 2.6532915360501566,
"grad_norm": 0.24655074418457784,
"learning_rate": 6.392375639237565e-06,
"loss": 0.2805,
"step": 2117
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.22144040470372692,
"learning_rate": 6.369130636913064e-06,
"loss": 0.2919,
"step": 2118
},
{
"epoch": 2.655799373040752,
"grad_norm": 0.22063665925015127,
"learning_rate": 6.345885634588563e-06,
"loss": 0.2896,
"step": 2119
},
{
"epoch": 2.6570532915360503,
"grad_norm": 0.22566136187835217,
"learning_rate": 6.322640632264064e-06,
"loss": 0.2798,
"step": 2120
},
{
"epoch": 2.658307210031348,
"grad_norm": 0.21213161150840273,
"learning_rate": 6.299395629939562e-06,
"loss": 0.2956,
"step": 2121
},
{
"epoch": 2.659561128526646,
"grad_norm": 0.2277583134769657,
"learning_rate": 6.276150627615063e-06,
"loss": 0.2822,
"step": 2122
},
{
"epoch": 2.6608150470219436,
"grad_norm": 0.26890041701469597,
"learning_rate": 6.252905625290563e-06,
"loss": 0.2973,
"step": 2123
},
{
"epoch": 2.6620689655172414,
"grad_norm": 0.2343183888940753,
"learning_rate": 6.229660622966063e-06,
"loss": 0.2873,
"step": 2124
},
{
"epoch": 2.663322884012539,
"grad_norm": 0.23687868174300106,
"learning_rate": 6.206415620641563e-06,
"loss": 0.2937,
"step": 2125
},
{
"epoch": 2.664576802507837,
"grad_norm": 0.23003758721501227,
"learning_rate": 6.183170618317062e-06,
"loss": 0.2846,
"step": 2126
},
{
"epoch": 2.6658307210031347,
"grad_norm": 0.24784890494915912,
"learning_rate": 6.159925615992562e-06,
"loss": 0.2761,
"step": 2127
},
{
"epoch": 2.6670846394984324,
"grad_norm": 0.2676599721646094,
"learning_rate": 6.136680613668062e-06,
"loss": 0.287,
"step": 2128
},
{
"epoch": 2.6683385579937307,
"grad_norm": 0.22198593870462943,
"learning_rate": 6.113435611343561e-06,
"loss": 0.2907,
"step": 2129
},
{
"epoch": 2.6695924764890284,
"grad_norm": 0.23401362141215412,
"learning_rate": 6.0901906090190615e-06,
"loss": 0.2851,
"step": 2130
},
{
"epoch": 2.670846394984326,
"grad_norm": 0.26403393639093214,
"learning_rate": 6.066945606694561e-06,
"loss": 0.2903,
"step": 2131
},
{
"epoch": 2.672100313479624,
"grad_norm": 0.24448186614837755,
"learning_rate": 6.043700604370061e-06,
"loss": 0.3085,
"step": 2132
},
{
"epoch": 2.6733542319749217,
"grad_norm": 0.27220913377700745,
"learning_rate": 6.02045560204556e-06,
"loss": 0.2907,
"step": 2133
},
{
"epoch": 2.6746081504702195,
"grad_norm": 0.20532211378850904,
"learning_rate": 5.99721059972106e-06,
"loss": 0.2842,
"step": 2134
},
{
"epoch": 2.6758620689655173,
"grad_norm": 0.22479459804323915,
"learning_rate": 5.97396559739656e-06,
"loss": 0.3132,
"step": 2135
},
{
"epoch": 2.677115987460815,
"grad_norm": 0.24593108944307965,
"learning_rate": 5.9507205950720595e-06,
"loss": 0.2941,
"step": 2136
},
{
"epoch": 2.678369905956113,
"grad_norm": 0.24403848787514615,
"learning_rate": 5.92747559274756e-06,
"loss": 0.2852,
"step": 2137
},
{
"epoch": 2.6796238244514106,
"grad_norm": 0.21581616748796212,
"learning_rate": 5.9042305904230594e-06,
"loss": 0.2896,
"step": 2138
},
{
"epoch": 2.6808777429467083,
"grad_norm": 0.21094757538534978,
"learning_rate": 5.880985588098559e-06,
"loss": 0.2813,
"step": 2139
},
{
"epoch": 2.682131661442006,
"grad_norm": 0.22038555206054597,
"learning_rate": 5.8577405857740585e-06,
"loss": 0.2956,
"step": 2140
},
{
"epoch": 2.683385579937304,
"grad_norm": 0.2060931153084385,
"learning_rate": 5.834495583449558e-06,
"loss": 0.2821,
"step": 2141
},
{
"epoch": 2.6846394984326016,
"grad_norm": 0.21335778446259562,
"learning_rate": 5.811250581125058e-06,
"loss": 0.2898,
"step": 2142
},
{
"epoch": 2.6858934169279,
"grad_norm": 0.21639727337674242,
"learning_rate": 5.788005578800558e-06,
"loss": 0.2976,
"step": 2143
},
{
"epoch": 2.6871473354231976,
"grad_norm": 0.2206356849349169,
"learning_rate": 5.764760576476058e-06,
"loss": 0.2768,
"step": 2144
},
{
"epoch": 2.6884012539184954,
"grad_norm": 0.22892111117928005,
"learning_rate": 5.741515574151558e-06,
"loss": 0.2861,
"step": 2145
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.25275454389705854,
"learning_rate": 5.718270571827057e-06,
"loss": 0.2989,
"step": 2146
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.2154215419685078,
"learning_rate": 5.695025569502557e-06,
"loss": 0.3072,
"step": 2147
},
{
"epoch": 2.6921630094043887,
"grad_norm": 0.20393847551344146,
"learning_rate": 5.6717805671780565e-06,
"loss": 0.3051,
"step": 2148
},
{
"epoch": 2.6934169278996865,
"grad_norm": 0.22037801854691835,
"learning_rate": 5.648535564853557e-06,
"loss": 0.3014,
"step": 2149
},
{
"epoch": 2.694670846394984,
"grad_norm": 0.22260997202188965,
"learning_rate": 5.625290562529056e-06,
"loss": 0.2922,
"step": 2150
},
{
"epoch": 2.695924764890282,
"grad_norm": 0.22879685384321616,
"learning_rate": 5.602045560204557e-06,
"loss": 0.2974,
"step": 2151
},
{
"epoch": 2.69717868338558,
"grad_norm": 0.21295443417699395,
"learning_rate": 5.578800557880056e-06,
"loss": 0.3056,
"step": 2152
},
{
"epoch": 2.698432601880878,
"grad_norm": 0.204073362481689,
"learning_rate": 5.555555555555556e-06,
"loss": 0.2912,
"step": 2153
},
{
"epoch": 2.6996865203761757,
"grad_norm": 0.22369191061921434,
"learning_rate": 5.532310553231055e-06,
"loss": 0.2961,
"step": 2154
},
{
"epoch": 2.7009404388714735,
"grad_norm": 0.209772641275576,
"learning_rate": 5.509065550906555e-06,
"loss": 0.3091,
"step": 2155
},
{
"epoch": 2.7021943573667713,
"grad_norm": 0.19139735972648408,
"learning_rate": 5.485820548582055e-06,
"loss": 0.2693,
"step": 2156
},
{
"epoch": 2.703448275862069,
"grad_norm": 0.20874524554287796,
"learning_rate": 5.462575546257555e-06,
"loss": 0.2957,
"step": 2157
},
{
"epoch": 2.704702194357367,
"grad_norm": 0.22930205407484275,
"learning_rate": 5.439330543933055e-06,
"loss": 0.2965,
"step": 2158
},
{
"epoch": 2.7059561128526646,
"grad_norm": 0.21930942557061348,
"learning_rate": 5.416085541608555e-06,
"loss": 0.2869,
"step": 2159
},
{
"epoch": 2.7072100313479623,
"grad_norm": 0.2086461560257158,
"learning_rate": 5.392840539284054e-06,
"loss": 0.2977,
"step": 2160
},
{
"epoch": 2.70846394984326,
"grad_norm": 0.20977844806033516,
"learning_rate": 5.369595536959554e-06,
"loss": 0.2994,
"step": 2161
},
{
"epoch": 2.709717868338558,
"grad_norm": 0.2179723897929952,
"learning_rate": 5.346350534635053e-06,
"loss": 0.3001,
"step": 2162
},
{
"epoch": 2.7109717868338556,
"grad_norm": 0.1965056693792897,
"learning_rate": 5.323105532310554e-06,
"loss": 0.279,
"step": 2163
},
{
"epoch": 2.7122257053291534,
"grad_norm": 0.23646958012930633,
"learning_rate": 5.299860529986053e-06,
"loss": 0.2956,
"step": 2164
},
{
"epoch": 2.713479623824451,
"grad_norm": 0.2128629797528962,
"learning_rate": 5.276615527661553e-06,
"loss": 0.2959,
"step": 2165
},
{
"epoch": 2.714733542319749,
"grad_norm": 0.1942271780320406,
"learning_rate": 5.253370525337053e-06,
"loss": 0.2917,
"step": 2166
},
{
"epoch": 2.715987460815047,
"grad_norm": 0.207283601788188,
"learning_rate": 5.230125523012553e-06,
"loss": 0.29,
"step": 2167
},
{
"epoch": 2.717241379310345,
"grad_norm": 0.20489931375480455,
"learning_rate": 5.206880520688052e-06,
"loss": 0.2824,
"step": 2168
},
{
"epoch": 2.7184952978056427,
"grad_norm": 0.22297585556766433,
"learning_rate": 5.183635518363552e-06,
"loss": 0.2877,
"step": 2169
},
{
"epoch": 2.7197492163009405,
"grad_norm": 0.22152130968970676,
"learning_rate": 5.160390516039052e-06,
"loss": 0.3188,
"step": 2170
},
{
"epoch": 2.7210031347962382,
"grad_norm": 0.2170287629331641,
"learning_rate": 5.137145513714552e-06,
"loss": 0.294,
"step": 2171
},
{
"epoch": 2.722257053291536,
"grad_norm": 0.21101577823823872,
"learning_rate": 5.113900511390051e-06,
"loss": 0.2914,
"step": 2172
},
{
"epoch": 2.7235109717868338,
"grad_norm": 0.20653554962445694,
"learning_rate": 5.0906555090655516e-06,
"loss": 0.3019,
"step": 2173
},
{
"epoch": 2.7247648902821315,
"grad_norm": 0.21574064782934554,
"learning_rate": 5.067410506741051e-06,
"loss": 0.306,
"step": 2174
},
{
"epoch": 2.7260188087774293,
"grad_norm": 0.23252861190184293,
"learning_rate": 5.044165504416551e-06,
"loss": 0.3176,
"step": 2175
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.2030366946925913,
"learning_rate": 5.02092050209205e-06,
"loss": 0.273,
"step": 2176
},
{
"epoch": 2.7285266457680253,
"grad_norm": 0.21622810408571755,
"learning_rate": 4.9976754997675505e-06,
"loss": 0.2942,
"step": 2177
},
{
"epoch": 2.729780564263323,
"grad_norm": 0.20511590208440628,
"learning_rate": 4.97443049744305e-06,
"loss": 0.2894,
"step": 2178
},
{
"epoch": 2.731034482758621,
"grad_norm": 0.19846539811729075,
"learning_rate": 4.95118549511855e-06,
"loss": 0.284,
"step": 2179
},
{
"epoch": 2.7322884012539186,
"grad_norm": 0.22519540038900182,
"learning_rate": 4.92794049279405e-06,
"loss": 0.2907,
"step": 2180
},
{
"epoch": 2.7335423197492164,
"grad_norm": 0.21581855003472294,
"learning_rate": 4.9046954904695495e-06,
"loss": 0.3132,
"step": 2181
},
{
"epoch": 2.734796238244514,
"grad_norm": 0.20773059691977439,
"learning_rate": 4.881450488145049e-06,
"loss": 0.2921,
"step": 2182
},
{
"epoch": 2.736050156739812,
"grad_norm": 0.2094165466834347,
"learning_rate": 4.8582054858205486e-06,
"loss": 0.2948,
"step": 2183
},
{
"epoch": 2.7373040752351097,
"grad_norm": 0.24140828474778792,
"learning_rate": 4.834960483496048e-06,
"loss": 0.2996,
"step": 2184
},
{
"epoch": 2.7385579937304074,
"grad_norm": 0.21889190467353561,
"learning_rate": 4.8117154811715485e-06,
"loss": 0.2943,
"step": 2185
},
{
"epoch": 2.739811912225705,
"grad_norm": 0.21795569708669968,
"learning_rate": 4.788470478847048e-06,
"loss": 0.2966,
"step": 2186
},
{
"epoch": 2.741065830721003,
"grad_norm": 0.20558556883597737,
"learning_rate": 4.765225476522548e-06,
"loss": 0.2887,
"step": 2187
},
{
"epoch": 2.7423197492163007,
"grad_norm": 0.20671821535775967,
"learning_rate": 4.741980474198048e-06,
"loss": 0.3093,
"step": 2188
},
{
"epoch": 2.7435736677115985,
"grad_norm": 0.21944375055084284,
"learning_rate": 4.7187354718735475e-06,
"loss": 0.2913,
"step": 2189
},
{
"epoch": 2.7448275862068967,
"grad_norm": 0.22761448856484903,
"learning_rate": 4.695490469549047e-06,
"loss": 0.2963,
"step": 2190
},
{
"epoch": 2.7460815047021945,
"grad_norm": 0.2159003608522351,
"learning_rate": 4.6722454672245465e-06,
"loss": 0.3199,
"step": 2191
},
{
"epoch": 2.7473354231974922,
"grad_norm": 0.2130699827625645,
"learning_rate": 4.649000464900047e-06,
"loss": 0.2773,
"step": 2192
},
{
"epoch": 2.74858934169279,
"grad_norm": 0.201013997134557,
"learning_rate": 4.6257554625755464e-06,
"loss": 0.2892,
"step": 2193
},
{
"epoch": 2.749843260188088,
"grad_norm": 0.24926431440860733,
"learning_rate": 4.602510460251047e-06,
"loss": 0.316,
"step": 2194
},
{
"epoch": 2.7510971786833855,
"grad_norm": 0.24390385736406175,
"learning_rate": 4.579265457926546e-06,
"loss": 0.2987,
"step": 2195
},
{
"epoch": 2.7523510971786833,
"grad_norm": 0.2156247432245583,
"learning_rate": 4.556020455602046e-06,
"loss": 0.2859,
"step": 2196
},
{
"epoch": 2.753605015673981,
"grad_norm": 0.21341843544658673,
"learning_rate": 4.532775453277545e-06,
"loss": 0.3097,
"step": 2197
},
{
"epoch": 2.754858934169279,
"grad_norm": 0.21984478628090215,
"learning_rate": 4.509530450953045e-06,
"loss": 0.2932,
"step": 2198
},
{
"epoch": 2.756112852664577,
"grad_norm": 0.2132357652599372,
"learning_rate": 4.486285448628545e-06,
"loss": 0.2916,
"step": 2199
},
{
"epoch": 2.757366771159875,
"grad_norm": 0.25343075450217545,
"learning_rate": 4.463040446304045e-06,
"loss": 0.3043,
"step": 2200
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.2200850110509242,
"learning_rate": 4.439795443979545e-06,
"loss": 0.3095,
"step": 2201
},
{
"epoch": 2.7598746081504704,
"grad_norm": 0.224838557137271,
"learning_rate": 4.416550441655044e-06,
"loss": 0.2937,
"step": 2202
},
{
"epoch": 2.761128526645768,
"grad_norm": 0.22249712873979094,
"learning_rate": 4.3933054393305435e-06,
"loss": 0.2996,
"step": 2203
},
{
"epoch": 2.762382445141066,
"grad_norm": 0.21036785461700405,
"learning_rate": 4.370060437006044e-06,
"loss": 0.3042,
"step": 2204
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.2144697563601178,
"learning_rate": 4.346815434681543e-06,
"loss": 0.2934,
"step": 2205
},
{
"epoch": 2.7648902821316614,
"grad_norm": 0.22728767027335608,
"learning_rate": 4.323570432357044e-06,
"loss": 0.3103,
"step": 2206
},
{
"epoch": 2.766144200626959,
"grad_norm": 0.21660341320196436,
"learning_rate": 4.300325430032543e-06,
"loss": 0.2916,
"step": 2207
},
{
"epoch": 2.767398119122257,
"grad_norm": 0.20510614740054917,
"learning_rate": 4.277080427708044e-06,
"loss": 0.283,
"step": 2208
},
{
"epoch": 2.7686520376175547,
"grad_norm": 0.20242619787832544,
"learning_rate": 4.253835425383542e-06,
"loss": 0.2894,
"step": 2209
},
{
"epoch": 2.7699059561128525,
"grad_norm": 0.21086600073762995,
"learning_rate": 4.230590423059042e-06,
"loss": 0.2928,
"step": 2210
},
{
"epoch": 2.7711598746081503,
"grad_norm": 0.21946206083540706,
"learning_rate": 4.207345420734542e-06,
"loss": 0.3042,
"step": 2211
},
{
"epoch": 2.772413793103448,
"grad_norm": 0.20223220970676523,
"learning_rate": 4.184100418410042e-06,
"loss": 0.2975,
"step": 2212
},
{
"epoch": 2.773667711598746,
"grad_norm": 0.20659766191501672,
"learning_rate": 4.160855416085542e-06,
"loss": 0.2932,
"step": 2213
},
{
"epoch": 2.774921630094044,
"grad_norm": 0.20354686030786606,
"learning_rate": 4.137610413761042e-06,
"loss": 0.2949,
"step": 2214
},
{
"epoch": 2.776175548589342,
"grad_norm": 0.20277238541514575,
"learning_rate": 4.114365411436542e-06,
"loss": 0.2971,
"step": 2215
},
{
"epoch": 2.7774294670846396,
"grad_norm": 0.20672700234712535,
"learning_rate": 4.091120409112041e-06,
"loss": 0.3072,
"step": 2216
},
{
"epoch": 2.7786833855799373,
"grad_norm": 0.22067000494392908,
"learning_rate": 4.06787540678754e-06,
"loss": 0.2884,
"step": 2217
},
{
"epoch": 2.779937304075235,
"grad_norm": 0.2289464526547561,
"learning_rate": 4.044630404463041e-06,
"loss": 0.3055,
"step": 2218
},
{
"epoch": 2.781191222570533,
"grad_norm": 0.20842639076317304,
"learning_rate": 4.02138540213854e-06,
"loss": 0.2954,
"step": 2219
},
{
"epoch": 2.7824451410658306,
"grad_norm": 0.1997652433580996,
"learning_rate": 3.998140399814041e-06,
"loss": 0.2935,
"step": 2220
},
{
"epoch": 2.7836990595611284,
"grad_norm": 0.2408396639862395,
"learning_rate": 3.97489539748954e-06,
"loss": 0.2909,
"step": 2221
},
{
"epoch": 2.7849529780564266,
"grad_norm": 0.210277802073483,
"learning_rate": 3.95165039516504e-06,
"loss": 0.2972,
"step": 2222
},
{
"epoch": 2.7862068965517244,
"grad_norm": 0.20908815764872085,
"learning_rate": 3.928405392840539e-06,
"loss": 0.295,
"step": 2223
},
{
"epoch": 2.787460815047022,
"grad_norm": 0.2094396521103149,
"learning_rate": 3.905160390516039e-06,
"loss": 0.2751,
"step": 2224
},
{
"epoch": 2.78871473354232,
"grad_norm": 0.2004454299180258,
"learning_rate": 3.881915388191539e-06,
"loss": 0.2836,
"step": 2225
},
{
"epoch": 2.7899686520376177,
"grad_norm": 0.19766912817123855,
"learning_rate": 3.858670385867039e-06,
"loss": 0.2717,
"step": 2226
},
{
"epoch": 2.7912225705329154,
"grad_norm": 0.2106192302982002,
"learning_rate": 3.835425383542539e-06,
"loss": 0.3075,
"step": 2227
},
{
"epoch": 2.792476489028213,
"grad_norm": 0.20055614292460974,
"learning_rate": 3.812180381218038e-06,
"loss": 0.2826,
"step": 2228
},
{
"epoch": 2.793730407523511,
"grad_norm": 0.19698790696995888,
"learning_rate": 3.7889353788935377e-06,
"loss": 0.288,
"step": 2229
},
{
"epoch": 2.7949843260188088,
"grad_norm": 0.21640009392468632,
"learning_rate": 3.765690376569038e-06,
"loss": 0.3038,
"step": 2230
},
{
"epoch": 2.7962382445141065,
"grad_norm": 0.19993985564291575,
"learning_rate": 3.7424453742445376e-06,
"loss": 0.2973,
"step": 2231
},
{
"epoch": 2.7974921630094043,
"grad_norm": 0.20651944444665726,
"learning_rate": 3.7192003719200375e-06,
"loss": 0.3092,
"step": 2232
},
{
"epoch": 2.798746081504702,
"grad_norm": 0.19791532028790657,
"learning_rate": 3.695955369595537e-06,
"loss": 0.29,
"step": 2233
},
{
"epoch": 2.8,
"grad_norm": 0.2214704558679226,
"learning_rate": 3.6727103672710374e-06,
"loss": 0.3035,
"step": 2234
},
{
"epoch": 2.8012539184952976,
"grad_norm": 0.20615056745432095,
"learning_rate": 3.6494653649465365e-06,
"loss": 0.2885,
"step": 2235
},
{
"epoch": 2.8025078369905954,
"grad_norm": 0.20663799263367244,
"learning_rate": 3.626220362622036e-06,
"loss": 0.3057,
"step": 2236
},
{
"epoch": 2.8037617554858936,
"grad_norm": 0.2015856313934883,
"learning_rate": 3.6029753602975365e-06,
"loss": 0.2868,
"step": 2237
},
{
"epoch": 2.8050156739811913,
"grad_norm": 0.20954135278703587,
"learning_rate": 3.579730357973036e-06,
"loss": 0.3115,
"step": 2238
},
{
"epoch": 2.806269592476489,
"grad_norm": 0.19995159589153724,
"learning_rate": 3.556485355648536e-06,
"loss": 0.2927,
"step": 2239
},
{
"epoch": 2.807523510971787,
"grad_norm": 0.19978637016749703,
"learning_rate": 3.5332403533240355e-06,
"loss": 0.2902,
"step": 2240
},
{
"epoch": 2.8087774294670846,
"grad_norm": 0.2325190078880887,
"learning_rate": 3.509995350999535e-06,
"loss": 0.3078,
"step": 2241
},
{
"epoch": 2.8100313479623824,
"grad_norm": 0.22157967523187208,
"learning_rate": 3.486750348675035e-06,
"loss": 0.2987,
"step": 2242
},
{
"epoch": 2.81128526645768,
"grad_norm": 0.20788092343623144,
"learning_rate": 3.4635053463505345e-06,
"loss": 0.3093,
"step": 2243
},
{
"epoch": 2.812539184952978,
"grad_norm": 0.20483412435741102,
"learning_rate": 3.440260344026035e-06,
"loss": 0.2919,
"step": 2244
},
{
"epoch": 2.8137931034482757,
"grad_norm": 0.20135702368785538,
"learning_rate": 3.4170153417015344e-06,
"loss": 0.2918,
"step": 2245
},
{
"epoch": 2.815047021943574,
"grad_norm": 0.21055325625373814,
"learning_rate": 3.3937703393770344e-06,
"loss": 0.2823,
"step": 2246
},
{
"epoch": 2.8163009404388717,
"grad_norm": 0.21354738904825524,
"learning_rate": 3.370525337052534e-06,
"loss": 0.3018,
"step": 2247
},
{
"epoch": 2.8175548589341695,
"grad_norm": 0.21300853601792943,
"learning_rate": 3.3472803347280334e-06,
"loss": 0.2864,
"step": 2248
},
{
"epoch": 2.8188087774294672,
"grad_norm": 0.2131417302185367,
"learning_rate": 3.3240353324035334e-06,
"loss": 0.2931,
"step": 2249
},
{
"epoch": 2.820062695924765,
"grad_norm": 0.21669046862329203,
"learning_rate": 3.300790330079033e-06,
"loss": 0.2977,
"step": 2250
},
{
"epoch": 2.8213166144200628,
"grad_norm": 0.21539163715672938,
"learning_rate": 3.2775453277545333e-06,
"loss": 0.3061,
"step": 2251
},
{
"epoch": 2.8225705329153605,
"grad_norm": 0.20246356396505,
"learning_rate": 3.254300325430033e-06,
"loss": 0.2906,
"step": 2252
},
{
"epoch": 2.8238244514106583,
"grad_norm": 0.20683765190432812,
"learning_rate": 3.231055323105533e-06,
"loss": 0.2797,
"step": 2253
},
{
"epoch": 2.825078369905956,
"grad_norm": 0.20314423174486654,
"learning_rate": 3.2078103207810323e-06,
"loss": 0.3014,
"step": 2254
},
{
"epoch": 2.826332288401254,
"grad_norm": 0.21123010891468888,
"learning_rate": 3.184565318456532e-06,
"loss": 0.3085,
"step": 2255
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.19637211099582083,
"learning_rate": 3.161320316132032e-06,
"loss": 0.2816,
"step": 2256
},
{
"epoch": 2.8288401253918494,
"grad_norm": 0.21034149328623092,
"learning_rate": 3.1380753138075313e-06,
"loss": 0.2914,
"step": 2257
},
{
"epoch": 2.830094043887147,
"grad_norm": 0.19385978088486228,
"learning_rate": 3.1148303114830313e-06,
"loss": 0.2985,
"step": 2258
},
{
"epoch": 2.831347962382445,
"grad_norm": 0.2103762670559537,
"learning_rate": 3.091585309158531e-06,
"loss": 0.3015,
"step": 2259
},
{
"epoch": 2.8326018808777427,
"grad_norm": 0.2157478844179482,
"learning_rate": 3.068340306834031e-06,
"loss": 0.2955,
"step": 2260
},
{
"epoch": 2.833855799373041,
"grad_norm": 0.1995963180713835,
"learning_rate": 3.0450953045095307e-06,
"loss": 0.291,
"step": 2261
},
{
"epoch": 2.8351097178683387,
"grad_norm": 0.20545985242459566,
"learning_rate": 3.0218503021850307e-06,
"loss": 0.2915,
"step": 2262
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.20122247320271056,
"learning_rate": 2.99860529986053e-06,
"loss": 0.2893,
"step": 2263
},
{
"epoch": 2.837617554858934,
"grad_norm": 0.2389966303846273,
"learning_rate": 2.9753602975360298e-06,
"loss": 0.308,
"step": 2264
},
{
"epoch": 2.838871473354232,
"grad_norm": 0.20428874813453898,
"learning_rate": 2.9521152952115297e-06,
"loss": 0.2864,
"step": 2265
},
{
"epoch": 2.8401253918495297,
"grad_norm": 0.19676643869209967,
"learning_rate": 2.9288702928870293e-06,
"loss": 0.2674,
"step": 2266
},
{
"epoch": 2.8413793103448275,
"grad_norm": 0.21854490136691634,
"learning_rate": 2.905625290562529e-06,
"loss": 0.3029,
"step": 2267
},
{
"epoch": 2.8426332288401253,
"grad_norm": 0.19964021303624394,
"learning_rate": 2.882380288238029e-06,
"loss": 0.2958,
"step": 2268
},
{
"epoch": 2.8438871473354235,
"grad_norm": 0.2021236267034733,
"learning_rate": 2.8591352859135287e-06,
"loss": 0.3018,
"step": 2269
},
{
"epoch": 2.8451410658307212,
"grad_norm": 0.2242151107866929,
"learning_rate": 2.8358902835890282e-06,
"loss": 0.3253,
"step": 2270
},
{
"epoch": 2.846394984326019,
"grad_norm": 0.23350519655815705,
"learning_rate": 2.812645281264528e-06,
"loss": 0.3006,
"step": 2271
},
{
"epoch": 2.8476489028213168,
"grad_norm": 0.20952852232824917,
"learning_rate": 2.789400278940028e-06,
"loss": 0.3101,
"step": 2272
},
{
"epoch": 2.8489028213166145,
"grad_norm": 0.18028284553458165,
"learning_rate": 2.7661552766155277e-06,
"loss": 0.2628,
"step": 2273
},
{
"epoch": 2.8501567398119123,
"grad_norm": 0.2116470727173841,
"learning_rate": 2.7429102742910276e-06,
"loss": 0.3272,
"step": 2274
},
{
"epoch": 2.85141065830721,
"grad_norm": 0.2126356910873682,
"learning_rate": 2.7196652719665276e-06,
"loss": 0.3036,
"step": 2275
},
{
"epoch": 2.852664576802508,
"grad_norm": 0.21453602750827225,
"learning_rate": 2.696420269642027e-06,
"loss": 0.2798,
"step": 2276
},
{
"epoch": 2.8539184952978056,
"grad_norm": 0.214169495165591,
"learning_rate": 2.6731752673175267e-06,
"loss": 0.2924,
"step": 2277
},
{
"epoch": 2.8551724137931034,
"grad_norm": 0.19616104208190385,
"learning_rate": 2.6499302649930266e-06,
"loss": 0.2762,
"step": 2278
},
{
"epoch": 2.856426332288401,
"grad_norm": 0.20241727891811703,
"learning_rate": 2.6266852626685266e-06,
"loss": 0.3148,
"step": 2279
},
{
"epoch": 2.857680250783699,
"grad_norm": 0.20979759317462815,
"learning_rate": 2.603440260344026e-06,
"loss": 0.2951,
"step": 2280
},
{
"epoch": 2.8589341692789967,
"grad_norm": 0.20046568302134266,
"learning_rate": 2.580195258019526e-06,
"loss": 0.274,
"step": 2281
},
{
"epoch": 2.8601880877742945,
"grad_norm": 0.2030361265513294,
"learning_rate": 2.5569502556950256e-06,
"loss": 0.2917,
"step": 2282
},
{
"epoch": 2.861442006269592,
"grad_norm": 0.20301598383896585,
"learning_rate": 2.5337052533705255e-06,
"loss": 0.2872,
"step": 2283
},
{
"epoch": 2.8626959247648904,
"grad_norm": 0.20619042209801836,
"learning_rate": 2.510460251046025e-06,
"loss": 0.2963,
"step": 2284
},
{
"epoch": 2.863949843260188,
"grad_norm": 0.19687703989118813,
"learning_rate": 2.487215248721525e-06,
"loss": 0.2998,
"step": 2285
},
{
"epoch": 2.865203761755486,
"grad_norm": 0.2043280088740131,
"learning_rate": 2.463970246397025e-06,
"loss": 0.3104,
"step": 2286
},
{
"epoch": 2.8664576802507837,
"grad_norm": 0.20572123273966572,
"learning_rate": 2.4407252440725245e-06,
"loss": 0.2821,
"step": 2287
},
{
"epoch": 2.8677115987460815,
"grad_norm": 0.20593164681939544,
"learning_rate": 2.417480241748024e-06,
"loss": 0.309,
"step": 2288
},
{
"epoch": 2.8689655172413793,
"grad_norm": 0.2085986484307882,
"learning_rate": 2.394235239423524e-06,
"loss": 0.2945,
"step": 2289
},
{
"epoch": 2.870219435736677,
"grad_norm": 0.19400650866341707,
"learning_rate": 2.370990237099024e-06,
"loss": 0.2777,
"step": 2290
},
{
"epoch": 2.871473354231975,
"grad_norm": 0.20953067461669264,
"learning_rate": 2.3477452347745235e-06,
"loss": 0.2839,
"step": 2291
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.20554501614948803,
"learning_rate": 2.3245002324500235e-06,
"loss": 0.2936,
"step": 2292
},
{
"epoch": 2.873981191222571,
"grad_norm": 0.2163222235188518,
"learning_rate": 2.3012552301255234e-06,
"loss": 0.2918,
"step": 2293
},
{
"epoch": 2.8752351097178686,
"grad_norm": 0.1978933814894312,
"learning_rate": 2.278010227801023e-06,
"loss": 0.2945,
"step": 2294
},
{
"epoch": 2.8764890282131663,
"grad_norm": 0.2221860415063494,
"learning_rate": 2.2547652254765225e-06,
"loss": 0.2981,
"step": 2295
},
{
"epoch": 2.877742946708464,
"grad_norm": 0.19189558927093897,
"learning_rate": 2.2315202231520224e-06,
"loss": 0.2839,
"step": 2296
},
{
"epoch": 2.878996865203762,
"grad_norm": 0.19751164454495532,
"learning_rate": 2.208275220827522e-06,
"loss": 0.286,
"step": 2297
},
{
"epoch": 2.8802507836990596,
"grad_norm": 0.2111622833531837,
"learning_rate": 2.185030218503022e-06,
"loss": 0.3214,
"step": 2298
},
{
"epoch": 2.8815047021943574,
"grad_norm": 0.23163188966438952,
"learning_rate": 2.161785216178522e-06,
"loss": 0.2937,
"step": 2299
},
{
"epoch": 2.882758620689655,
"grad_norm": 0.21456286921454013,
"learning_rate": 2.138540213854022e-06,
"loss": 0.2944,
"step": 2300
},
{
"epoch": 2.884012539184953,
"grad_norm": 0.20090442988312812,
"learning_rate": 2.115295211529521e-06,
"loss": 0.3006,
"step": 2301
},
{
"epoch": 2.8852664576802507,
"grad_norm": 0.2190947106957158,
"learning_rate": 2.092050209205021e-06,
"loss": 0.2945,
"step": 2302
},
{
"epoch": 2.8865203761755485,
"grad_norm": 0.21562700195411966,
"learning_rate": 2.068805206880521e-06,
"loss": 0.3018,
"step": 2303
},
{
"epoch": 2.8877742946708462,
"grad_norm": 0.2028068615733502,
"learning_rate": 2.0455602045560204e-06,
"loss": 0.2996,
"step": 2304
},
{
"epoch": 2.889028213166144,
"grad_norm": 0.20932063028742529,
"learning_rate": 2.0223152022315203e-06,
"loss": 0.292,
"step": 2305
},
{
"epoch": 2.8902821316614418,
"grad_norm": 0.21011810966161135,
"learning_rate": 1.9990701999070203e-06,
"loss": 0.3163,
"step": 2306
},
{
"epoch": 2.8915360501567395,
"grad_norm": 0.1945311627334373,
"learning_rate": 1.97582519758252e-06,
"loss": 0.2774,
"step": 2307
},
{
"epoch": 2.8927899686520377,
"grad_norm": 0.20621899866557716,
"learning_rate": 1.9525801952580194e-06,
"loss": 0.3094,
"step": 2308
},
{
"epoch": 2.8940438871473355,
"grad_norm": 0.21173818714517176,
"learning_rate": 1.9293351929335193e-06,
"loss": 0.3056,
"step": 2309
},
{
"epoch": 2.8952978056426333,
"grad_norm": 0.210724829882866,
"learning_rate": 1.906090190609019e-06,
"loss": 0.3058,
"step": 2310
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.19853522731292658,
"learning_rate": 1.882845188284519e-06,
"loss": 0.2981,
"step": 2311
},
{
"epoch": 2.897805642633229,
"grad_norm": 0.20586521896073104,
"learning_rate": 1.8596001859600188e-06,
"loss": 0.3136,
"step": 2312
},
{
"epoch": 2.8990595611285266,
"grad_norm": 0.20425313321494604,
"learning_rate": 1.8363551836355187e-06,
"loss": 0.2914,
"step": 2313
},
{
"epoch": 2.9003134796238244,
"grad_norm": 0.1936889728174295,
"learning_rate": 1.813110181311018e-06,
"loss": 0.2846,
"step": 2314
},
{
"epoch": 2.901567398119122,
"grad_norm": 0.20002397465340344,
"learning_rate": 1.789865178986518e-06,
"loss": 0.2908,
"step": 2315
},
{
"epoch": 2.9028213166144203,
"grad_norm": 0.19168591687457545,
"learning_rate": 1.7666201766620177e-06,
"loss": 0.2954,
"step": 2316
},
{
"epoch": 2.904075235109718,
"grad_norm": 0.20092648225906654,
"learning_rate": 1.7433751743375175e-06,
"loss": 0.2828,
"step": 2317
},
{
"epoch": 2.905329153605016,
"grad_norm": 0.21640434764332747,
"learning_rate": 1.7201301720130174e-06,
"loss": 0.304,
"step": 2318
},
{
"epoch": 2.9065830721003136,
"grad_norm": 0.1998804263301631,
"learning_rate": 1.6968851696885172e-06,
"loss": 0.307,
"step": 2319
},
{
"epoch": 2.9078369905956114,
"grad_norm": 0.20397801885160524,
"learning_rate": 1.6736401673640167e-06,
"loss": 0.2772,
"step": 2320
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.20742121992162296,
"learning_rate": 1.6503951650395165e-06,
"loss": 0.299,
"step": 2321
},
{
"epoch": 2.910344827586207,
"grad_norm": 0.20990697163943367,
"learning_rate": 1.6271501627150164e-06,
"loss": 0.2912,
"step": 2322
},
{
"epoch": 2.9115987460815047,
"grad_norm": 0.20655696195177622,
"learning_rate": 1.6039051603905162e-06,
"loss": 0.3092,
"step": 2323
},
{
"epoch": 2.9128526645768025,
"grad_norm": 0.23517275116203465,
"learning_rate": 1.580660158066016e-06,
"loss": 0.3127,
"step": 2324
},
{
"epoch": 2.9141065830721002,
"grad_norm": 0.19795970376113417,
"learning_rate": 1.5574151557415157e-06,
"loss": 0.2867,
"step": 2325
},
{
"epoch": 2.915360501567398,
"grad_norm": 0.21279201163410058,
"learning_rate": 1.5341701534170154e-06,
"loss": 0.307,
"step": 2326
},
{
"epoch": 2.916614420062696,
"grad_norm": 0.1914552299902782,
"learning_rate": 1.5109251510925154e-06,
"loss": 0.2775,
"step": 2327
},
{
"epoch": 2.9178683385579935,
"grad_norm": 0.2182109426451137,
"learning_rate": 1.4876801487680149e-06,
"loss": 0.3221,
"step": 2328
},
{
"epoch": 2.9191222570532913,
"grad_norm": 0.19809030885202905,
"learning_rate": 1.4644351464435146e-06,
"loss": 0.2988,
"step": 2329
},
{
"epoch": 2.920376175548589,
"grad_norm": 0.20698557514147936,
"learning_rate": 1.4411901441190146e-06,
"loss": 0.2823,
"step": 2330
},
{
"epoch": 2.9216300940438873,
"grad_norm": 0.2027349291483169,
"learning_rate": 1.4179451417945141e-06,
"loss": 0.3036,
"step": 2331
},
{
"epoch": 2.922884012539185,
"grad_norm": 0.20807743236959528,
"learning_rate": 1.394700139470014e-06,
"loss": 0.3041,
"step": 2332
},
{
"epoch": 2.924137931034483,
"grad_norm": 0.18906245353149478,
"learning_rate": 1.3714551371455138e-06,
"loss": 0.2772,
"step": 2333
},
{
"epoch": 2.9253918495297806,
"grad_norm": 0.22321567575870133,
"learning_rate": 1.3482101348210136e-06,
"loss": 0.2924,
"step": 2334
},
{
"epoch": 2.9266457680250784,
"grad_norm": 0.2006706188495219,
"learning_rate": 1.3249651324965133e-06,
"loss": 0.3112,
"step": 2335
},
{
"epoch": 2.927899686520376,
"grad_norm": 0.22224315210643172,
"learning_rate": 1.301720130172013e-06,
"loss": 0.3007,
"step": 2336
},
{
"epoch": 2.929153605015674,
"grad_norm": 0.20996915821093046,
"learning_rate": 1.2784751278475128e-06,
"loss": 0.2857,
"step": 2337
},
{
"epoch": 2.9304075235109717,
"grad_norm": 0.20918208904037947,
"learning_rate": 1.2552301255230125e-06,
"loss": 0.3061,
"step": 2338
},
{
"epoch": 2.9316614420062694,
"grad_norm": 0.23093164456512272,
"learning_rate": 1.2319851231985125e-06,
"loss": 0.3017,
"step": 2339
},
{
"epoch": 2.9329153605015676,
"grad_norm": 0.207091102351485,
"learning_rate": 1.208740120874012e-06,
"loss": 0.2834,
"step": 2340
},
{
"epoch": 2.9341692789968654,
"grad_norm": 0.1848851999939273,
"learning_rate": 1.185495118549512e-06,
"loss": 0.2784,
"step": 2341
},
{
"epoch": 2.935423197492163,
"grad_norm": 0.19222379663914346,
"learning_rate": 1.1622501162250117e-06,
"loss": 0.2827,
"step": 2342
},
{
"epoch": 2.936677115987461,
"grad_norm": 0.19740926257774238,
"learning_rate": 1.1390051139005115e-06,
"loss": 0.2901,
"step": 2343
},
{
"epoch": 2.9379310344827587,
"grad_norm": 0.19852687988613646,
"learning_rate": 1.1157601115760112e-06,
"loss": 0.3063,
"step": 2344
},
{
"epoch": 2.9391849529780565,
"grad_norm": 0.1909804567325446,
"learning_rate": 1.092515109251511e-06,
"loss": 0.2868,
"step": 2345
},
{
"epoch": 2.9404388714733543,
"grad_norm": 0.20559222551045442,
"learning_rate": 1.069270106927011e-06,
"loss": 0.2958,
"step": 2346
},
{
"epoch": 2.941692789968652,
"grad_norm": 0.19356482320176072,
"learning_rate": 1.0460251046025104e-06,
"loss": 0.2935,
"step": 2347
},
{
"epoch": 2.94294670846395,
"grad_norm": 0.20283806371422422,
"learning_rate": 1.0227801022780102e-06,
"loss": 0.2934,
"step": 2348
},
{
"epoch": 2.9442006269592476,
"grad_norm": 0.22642907942148052,
"learning_rate": 9.995350999535101e-07,
"loss": 0.2896,
"step": 2349
},
{
"epoch": 2.9454545454545453,
"grad_norm": 0.20099169852608226,
"learning_rate": 9.762900976290097e-07,
"loss": 0.2903,
"step": 2350
},
{
"epoch": 2.946708463949843,
"grad_norm": 0.19894948407151314,
"learning_rate": 9.530450953045095e-07,
"loss": 0.2997,
"step": 2351
},
{
"epoch": 2.947962382445141,
"grad_norm": 0.20143395662011787,
"learning_rate": 9.298000929800094e-07,
"loss": 0.3001,
"step": 2352
},
{
"epoch": 2.9492163009404386,
"grad_norm": 0.20330899127482283,
"learning_rate": 9.06555090655509e-07,
"loss": 0.2928,
"step": 2353
},
{
"epoch": 2.950470219435737,
"grad_norm": 0.20412086596448156,
"learning_rate": 8.833100883310089e-07,
"loss": 0.2982,
"step": 2354
},
{
"epoch": 2.9517241379310346,
"grad_norm": 0.19550415660679465,
"learning_rate": 8.600650860065087e-07,
"loss": 0.316,
"step": 2355
},
{
"epoch": 2.9529780564263324,
"grad_norm": 0.20446226555718794,
"learning_rate": 8.368200836820084e-07,
"loss": 0.2919,
"step": 2356
},
{
"epoch": 2.95423197492163,
"grad_norm": 0.1883147420873677,
"learning_rate": 8.135750813575082e-07,
"loss": 0.2852,
"step": 2357
},
{
"epoch": 2.955485893416928,
"grad_norm": 0.21041349179904822,
"learning_rate": 7.90330079033008e-07,
"loss": 0.3091,
"step": 2358
},
{
"epoch": 2.9567398119122257,
"grad_norm": 0.18403073174668855,
"learning_rate": 7.670850767085077e-07,
"loss": 0.2889,
"step": 2359
},
{
"epoch": 2.9579937304075234,
"grad_norm": 0.2346403359142298,
"learning_rate": 7.438400743840074e-07,
"loss": 0.3005,
"step": 2360
},
{
"epoch": 2.959247648902821,
"grad_norm": 0.18892079497477632,
"learning_rate": 7.205950720595073e-07,
"loss": 0.2868,
"step": 2361
},
{
"epoch": 2.960501567398119,
"grad_norm": 0.1983556779297143,
"learning_rate": 6.97350069735007e-07,
"loss": 0.2931,
"step": 2362
},
{
"epoch": 2.961755485893417,
"grad_norm": 0.20391170004547984,
"learning_rate": 6.741050674105068e-07,
"loss": 0.3183,
"step": 2363
},
{
"epoch": 2.963009404388715,
"grad_norm": 0.19058540773459545,
"learning_rate": 6.508600650860065e-07,
"loss": 0.297,
"step": 2364
},
{
"epoch": 2.9642633228840127,
"grad_norm": 0.1945341447901286,
"learning_rate": 6.276150627615063e-07,
"loss": 0.2873,
"step": 2365
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.1949026641278786,
"learning_rate": 6.04370060437006e-07,
"loss": 0.3004,
"step": 2366
},
{
"epoch": 2.9667711598746083,
"grad_norm": 0.20056221059761192,
"learning_rate": 5.811250581125059e-07,
"loss": 0.2908,
"step": 2367
},
{
"epoch": 2.968025078369906,
"grad_norm": 0.1879816038229043,
"learning_rate": 5.578800557880056e-07,
"loss": 0.2776,
"step": 2368
},
{
"epoch": 2.969278996865204,
"grad_norm": 0.1965198127526313,
"learning_rate": 5.346350534635055e-07,
"loss": 0.2957,
"step": 2369
},
{
"epoch": 2.9705329153605016,
"grad_norm": 0.20626120816136498,
"learning_rate": 5.113900511390051e-07,
"loss": 0.2774,
"step": 2370
},
{
"epoch": 2.9717868338557993,
"grad_norm": 0.19994605803243184,
"learning_rate": 4.881450488145048e-07,
"loss": 0.3097,
"step": 2371
},
{
"epoch": 2.973040752351097,
"grad_norm": 0.2092089315296401,
"learning_rate": 4.649000464900047e-07,
"loss": 0.3013,
"step": 2372
},
{
"epoch": 2.974294670846395,
"grad_norm": 0.1932846915100285,
"learning_rate": 4.4165504416550444e-07,
"loss": 0.2896,
"step": 2373
},
{
"epoch": 2.9755485893416926,
"grad_norm": 0.19003447769080256,
"learning_rate": 4.184100418410042e-07,
"loss": 0.2875,
"step": 2374
},
{
"epoch": 2.9768025078369904,
"grad_norm": 0.21097782544154126,
"learning_rate": 3.95165039516504e-07,
"loss": 0.2877,
"step": 2375
},
{
"epoch": 2.978056426332288,
"grad_norm": 0.18710836723112906,
"learning_rate": 3.719200371920037e-07,
"loss": 0.2675,
"step": 2376
},
{
"epoch": 2.979310344827586,
"grad_norm": 0.1938458936977835,
"learning_rate": 3.486750348675035e-07,
"loss": 0.3011,
"step": 2377
},
{
"epoch": 2.980564263322884,
"grad_norm": 0.1979924022021737,
"learning_rate": 3.2543003254300326e-07,
"loss": 0.3,
"step": 2378
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.21782432269517585,
"learning_rate": 3.02185030218503e-07,
"loss": 0.3009,
"step": 2379
},
{
"epoch": 2.9830721003134797,
"grad_norm": 0.1944424238953003,
"learning_rate": 2.789400278940028e-07,
"loss": 0.2756,
"step": 2380
},
{
"epoch": 2.9843260188087775,
"grad_norm": 0.2093910297313566,
"learning_rate": 2.5569502556950255e-07,
"loss": 0.2878,
"step": 2381
},
{
"epoch": 2.9855799373040752,
"grad_norm": 0.20478800367739342,
"learning_rate": 2.3245002324500235e-07,
"loss": 0.3146,
"step": 2382
},
{
"epoch": 2.986833855799373,
"grad_norm": 0.1887440301951096,
"learning_rate": 2.092050209205021e-07,
"loss": 0.2845,
"step": 2383
},
{
"epoch": 2.9880877742946708,
"grad_norm": 0.2036016241742681,
"learning_rate": 1.8596001859600186e-07,
"loss": 0.291,
"step": 2384
},
{
"epoch": 2.9893416927899685,
"grad_norm": 0.19639033548865972,
"learning_rate": 1.6271501627150163e-07,
"loss": 0.2927,
"step": 2385
},
{
"epoch": 2.9905956112852663,
"grad_norm": 0.18925143939264202,
"learning_rate": 1.394700139470014e-07,
"loss": 0.2953,
"step": 2386
},
{
"epoch": 2.9918495297805645,
"grad_norm": 0.17972416877800695,
"learning_rate": 1.1622501162250117e-07,
"loss": 0.2686,
"step": 2387
},
{
"epoch": 2.9931034482758623,
"grad_norm": 0.2127589384949437,
"learning_rate": 9.298000929800093e-08,
"loss": 0.2979,
"step": 2388
},
{
"epoch": 2.99435736677116,
"grad_norm": 0.18987324300449337,
"learning_rate": 6.97350069735007e-08,
"loss": 0.2791,
"step": 2389
},
{
"epoch": 2.995611285266458,
"grad_norm": 0.1943521997902393,
"learning_rate": 4.6490004649000465e-08,
"loss": 0.2906,
"step": 2390
},
{
"epoch": 2.9968652037617556,
"grad_norm": 0.19917213413119425,
"learning_rate": 2.3245002324500233e-08,
"loss": 0.2873,
"step": 2391
},
{
"epoch": 2.9968652037617556,
"step": 2391,
"total_flos": 2.653350006194438e+19,
"train_loss": 0.4467789067079312,
"train_runtime": 143906.2234,
"train_samples_per_second": 0.266,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 2391,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.653350006194438e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}