{ "best_global_step": 5000, "best_metric": 0.25055432372505543, "best_model_checkpoint": "out_qwen_4b_sft_augmented/checkpoint-5000", "epoch": 7.886959952653383, "eval_steps": 50, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015782205563227462, "grad_norm": 174.16427506976208, "learning_rate": 8.858267716535433e-08, "loss": 36.8525, "step": 10, "true_loss": 4.5242 }, { "epoch": 0.031564411126454923, "grad_norm": 167.6085061757231, "learning_rate": 1.8700787401574806e-07, "loss": 36.1013, "step": 20, "true_loss": 4.3914 }, { "epoch": 0.04734661668968238, "grad_norm": 177.49081213144376, "learning_rate": 2.8543307086614175e-07, "loss": 37.3428, "step": 30, "true_loss": 4.816 }, { "epoch": 0.06312882225290985, "grad_norm": 223.9851508820277, "learning_rate": 3.8385826771653546e-07, "loss": 36.5105, "step": 40, "true_loss": 4.6624 }, { "epoch": 0.0789110278161373, "grad_norm": 231.61545863242762, "learning_rate": 4.822834645669292e-07, "loss": 36.4121, "step": 50, "true_loss": 4.3832 }, { "epoch": 0.0789110278161373, "eval_accuracy": 0.03215077605321508, "eval_loss": 4.5295867919921875, "eval_runtime": 23.635, "eval_samples_per_second": 38.164, "eval_steps_per_second": 4.781, "step": 50 }, { "epoch": 0.09469323337936476, "grad_norm": 152.10660153770212, "learning_rate": 5.807086614173228e-07, "loss": 36.1448, "step": 60, "true_loss": 4.4281 }, { "epoch": 0.11047543894259222, "grad_norm": 160.9903980922439, "learning_rate": 6.791338582677166e-07, "loss": 35.3635, "step": 70, "true_loss": 4.3689 }, { "epoch": 0.1262576445058197, "grad_norm": 197.0259688154707, "learning_rate": 7.775590551181103e-07, "loss": 35.1756, "step": 80, "true_loss": 4.587 }, { "epoch": 0.14203985006904715, "grad_norm": 188.2581089740781, "learning_rate": 8.759842519685039e-07, "loss": 34.6323, "step": 90, "true_loss": 4.2268 }, { "epoch": 0.1578220556322746, "grad_norm": 189.99944001910137, "learning_rate": 9.744094488188976e-07, "loss": 34.0603, "step": 100, "true_loss": 4.2365 }, { "epoch": 0.1578220556322746, "eval_accuracy": 0.025498891352549888, "eval_loss": 4.291808128356934, "eval_runtime": 23.5974, "eval_samples_per_second": 38.225, "eval_steps_per_second": 4.789, "step": 100 }, { "epoch": 0.17360426119550207, "grad_norm": 217.95488292008284, "learning_rate": 1.0728346456692915e-06, "loss": 33.2188, "step": 110, "true_loss": 4.1615 }, { "epoch": 0.18938646675872953, "grad_norm": 165.066552245844, "learning_rate": 1.1712598425196851e-06, "loss": 33.3914, "step": 120, "true_loss": 4.2157 }, { "epoch": 0.20516867232195699, "grad_norm": 195.1015279558477, "learning_rate": 1.2696850393700788e-06, "loss": 33.0292, "step": 130, "true_loss": 4.2275 }, { "epoch": 0.22095087788518444, "grad_norm": 190.91494851629787, "learning_rate": 1.3681102362204727e-06, "loss": 33.7361, "step": 140, "true_loss": 4.2454 }, { "epoch": 0.2367330834484119, "grad_norm": 188.2976996602128, "learning_rate": 1.4665354330708661e-06, "loss": 33.5512, "step": 150, "true_loss": 4.3871 }, { "epoch": 0.2367330834484119, "eval_accuracy": 0.037694013303769404, "eval_loss": 4.101545333862305, "eval_runtime": 23.3706, "eval_samples_per_second": 38.596, "eval_steps_per_second": 4.835, "step": 150 }, { "epoch": 0.2525152890116394, "grad_norm": 184.36326052249296, "learning_rate": 1.56496062992126e-06, "loss": 32.907, "step": 160, "true_loss": 4.1332 }, { "epoch": 0.26829749457486685, "grad_norm": 270.21563922540906, "learning_rate": 1.6633858267716539e-06, "loss": 32.8046, "step": 170, "true_loss": 4.109 }, { "epoch": 0.2840797001380943, "grad_norm": 172.75566257709625, "learning_rate": 1.7618110236220473e-06, "loss": 32.8227, "step": 180, "true_loss": 4.1665 }, { "epoch": 0.29986190570132176, "grad_norm": 193.33897708701926, "learning_rate": 1.8602362204724412e-06, "loss": 33.0616, "step": 190, "true_loss": 3.9749 }, { "epoch": 0.3156441112645492, "grad_norm": 190.82576930912984, "learning_rate": 1.958661417322835e-06, "loss": 32.8765, "step": 200, "true_loss": 4.1028 }, { "epoch": 0.3156441112645492, "eval_accuracy": 0.052106430155210645, "eval_loss": 4.016829013824463, "eval_runtime": 23.4288, "eval_samples_per_second": 38.5, "eval_steps_per_second": 4.823, "step": 200 }, { "epoch": 0.3314263168277767, "grad_norm": 136.42168368750973, "learning_rate": 2.0570866141732285e-06, "loss": 33.0946, "step": 210, "true_loss": 4.0481 }, { "epoch": 0.34720852239100414, "grad_norm": 154.43641797464133, "learning_rate": 2.155511811023622e-06, "loss": 32.6089, "step": 220, "true_loss": 4.1939 }, { "epoch": 0.3629907279542316, "grad_norm": 133.3441398301233, "learning_rate": 2.253937007874016e-06, "loss": 32.9959, "step": 230, "true_loss": 4.1266 }, { "epoch": 0.37877293351745905, "grad_norm": 130.95877792011237, "learning_rate": 2.3523622047244095e-06, "loss": 32.7445, "step": 240, "true_loss": 3.9952 }, { "epoch": 0.3945551390806865, "grad_norm": 133.98923934847613, "learning_rate": 2.4507874015748036e-06, "loss": 32.7257, "step": 250, "true_loss": 4.151 }, { "epoch": 0.3945551390806865, "eval_accuracy": 0.058758314855875834, "eval_loss": 3.9920618534088135, "eval_runtime": 23.3601, "eval_samples_per_second": 38.613, "eval_steps_per_second": 4.837, "step": 250 }, { "epoch": 0.41033734464391397, "grad_norm": 121.38525842724825, "learning_rate": 2.549212598425197e-06, "loss": 32.9611, "step": 260, "true_loss": 4.108 }, { "epoch": 0.42611955020714143, "grad_norm": 114.63942456849516, "learning_rate": 2.6476377952755905e-06, "loss": 32.7922, "step": 270, "true_loss": 4.1449 }, { "epoch": 0.4419017557703689, "grad_norm": 105.4529770040223, "learning_rate": 2.7460629921259846e-06, "loss": 32.0971, "step": 280, "true_loss": 3.9221 }, { "epoch": 0.45768396133359635, "grad_norm": 127.05577792211547, "learning_rate": 2.8444881889763782e-06, "loss": 32.5, "step": 290, "true_loss": 4.0502 }, { "epoch": 0.4734661668968238, "grad_norm": 104.32509789241224, "learning_rate": 2.9429133858267715e-06, "loss": 32.4098, "step": 300, "true_loss": 4.1742 }, { "epoch": 0.4734661668968238, "eval_accuracy": 0.06984478935698447, "eval_loss": 3.9740073680877686, "eval_runtime": 23.7165, "eval_samples_per_second": 38.033, "eval_steps_per_second": 4.765, "step": 300 }, { "epoch": 0.4892483724600513, "grad_norm": 100.02995214077195, "learning_rate": 3.0413385826771656e-06, "loss": 32.8842, "step": 310, "true_loss": 3.9566 }, { "epoch": 0.5050305780232788, "grad_norm": 188.732126374455, "learning_rate": 3.1397637795275592e-06, "loss": 32.4406, "step": 320, "true_loss": 4.1384 }, { "epoch": 0.5208127835865062, "grad_norm": 102.94686318366128, "learning_rate": 3.238188976377953e-06, "loss": 32.9819, "step": 330, "true_loss": 4.3307 }, { "epoch": 0.5365949891497337, "grad_norm": 108.37307217795114, "learning_rate": 3.336614173228347e-06, "loss": 32.4922, "step": 340, "true_loss": 4.1693 }, { "epoch": 0.5523771947129611, "grad_norm": 94.15863065658708, "learning_rate": 3.4350393700787406e-06, "loss": 33.0664, "step": 350, "true_loss": 4.065 }, { "epoch": 0.5523771947129611, "eval_accuracy": 0.07427937915742794, "eval_loss": 3.9935474395751953, "eval_runtime": 23.476, "eval_samples_per_second": 38.422, "eval_steps_per_second": 4.813, "step": 350 }, { "epoch": 0.5681594002761886, "grad_norm": 104.24180086472042, "learning_rate": 3.533464566929134e-06, "loss": 32.0742, "step": 360, "true_loss": 4.0407 }, { "epoch": 0.583941605839416, "grad_norm": 94.16448088348912, "learning_rate": 3.631889763779528e-06, "loss": 32.2163, "step": 370, "true_loss": 4.0543 }, { "epoch": 0.5997238114026435, "grad_norm": 83.41510765519034, "learning_rate": 3.7303149606299216e-06, "loss": 31.936, "step": 380, "true_loss": 4.0115 }, { "epoch": 0.6155060169658709, "grad_norm": 87.51703962604186, "learning_rate": 3.828740157480315e-06, "loss": 32.5367, "step": 390, "true_loss": 4.2035 }, { "epoch": 0.6312882225290984, "grad_norm": 80.60904438703314, "learning_rate": 3.927165354330709e-06, "loss": 32.6616, "step": 400, "true_loss": 3.9478 }, { "epoch": 0.6312882225290984, "eval_accuracy": 0.0787139689578714, "eval_loss": 3.9456546306610107, "eval_runtime": 23.3617, "eval_samples_per_second": 38.61, "eval_steps_per_second": 4.837, "step": 400 }, { "epoch": 0.647070428092326, "grad_norm": 82.68754107900482, "learning_rate": 4.025590551181103e-06, "loss": 32.4794, "step": 410, "true_loss": 4.002 }, { "epoch": 0.6628526336555534, "grad_norm": 85.78266911527552, "learning_rate": 4.124015748031497e-06, "loss": 32.3882, "step": 420, "true_loss": 4.0118 }, { "epoch": 0.6786348392187809, "grad_norm": 86.11358238695115, "learning_rate": 4.222440944881891e-06, "loss": 32.5061, "step": 430, "true_loss": 3.9504 }, { "epoch": 0.6944170447820083, "grad_norm": 92.30640559386582, "learning_rate": 4.320866141732284e-06, "loss": 32.4052, "step": 440, "true_loss": 4.1456 }, { "epoch": 0.7101992503452358, "grad_norm": 89.58650504794474, "learning_rate": 4.419291338582677e-06, "loss": 32.4321, "step": 450, "true_loss": 4.0698 }, { "epoch": 0.7101992503452358, "eval_accuracy": 0.08093126385809313, "eval_loss": 3.902703285217285, "eval_runtime": 23.2537, "eval_samples_per_second": 38.789, "eval_steps_per_second": 4.859, "step": 450 }, { "epoch": 0.7259814559084632, "grad_norm": 90.04454010872678, "learning_rate": 4.517716535433071e-06, "loss": 32.5713, "step": 460, "true_loss": 4.1408 }, { "epoch": 0.7417636614716907, "grad_norm": 80.81511157664153, "learning_rate": 4.616141732283465e-06, "loss": 32.1417, "step": 470, "true_loss": 3.974 }, { "epoch": 0.7575458670349181, "grad_norm": 88.1484516973123, "learning_rate": 4.714566929133859e-06, "loss": 32.5118, "step": 480, "true_loss": 3.9936 }, { "epoch": 0.7733280725981456, "grad_norm": 86.34135386078943, "learning_rate": 4.812992125984252e-06, "loss": 32.4657, "step": 490, "true_loss": 4.0883 }, { "epoch": 0.789110278161373, "grad_norm": 135.76868833155947, "learning_rate": 4.911417322834646e-06, "loss": 31.8174, "step": 500, "true_loss": 3.9796 }, { "epoch": 0.789110278161373, "eval_accuracy": 0.09866962305986696, "eval_loss": 3.8499600887298584, "eval_runtime": 23.5068, "eval_samples_per_second": 38.372, "eval_steps_per_second": 4.807, "step": 500 }, { "epoch": 0.8048924837246005, "grad_norm": 86.1711640049212, "learning_rate": 4.998904469763366e-06, "loss": 32.7917, "step": 510, "true_loss": 4.1162 }, { "epoch": 0.8206746892878279, "grad_norm": 83.66765407731347, "learning_rate": 4.987949167397021e-06, "loss": 32.1801, "step": 520, "true_loss": 4.1161 }, { "epoch": 0.8364568948510555, "grad_norm": 76.21119570368347, "learning_rate": 4.976993865030675e-06, "loss": 32.2797, "step": 530, "true_loss": 4.0858 }, { "epoch": 0.8522391004142829, "grad_norm": 76.03527006332567, "learning_rate": 4.96603856266433e-06, "loss": 32.3483, "step": 540, "true_loss": 3.9666 }, { "epoch": 0.8680213059775104, "grad_norm": 90.40138000679266, "learning_rate": 4.9550832602979845e-06, "loss": 32.3033, "step": 550, "true_loss": 3.9968 }, { "epoch": 0.8680213059775104, "eval_accuracy": 0.10421286031042129, "eval_loss": 3.775171995162964, "eval_runtime": 23.3348, "eval_samples_per_second": 38.655, "eval_steps_per_second": 4.843, "step": 550 }, { "epoch": 0.8838035115407378, "grad_norm": 80.65084468026359, "learning_rate": 4.944127957931639e-06, "loss": 31.7379, "step": 560, "true_loss": 3.9121 }, { "epoch": 0.8995857171039653, "grad_norm": 85.27373385064382, "learning_rate": 4.933172655565295e-06, "loss": 32.4789, "step": 570, "true_loss": 4.0935 }, { "epoch": 0.9153679226671927, "grad_norm": 83.42739374632569, "learning_rate": 4.922217353198948e-06, "loss": 31.7407, "step": 580, "true_loss": 3.9859 }, { "epoch": 0.9311501282304202, "grad_norm": 85.11856503695495, "learning_rate": 4.911262050832604e-06, "loss": 31.779, "step": 590, "true_loss": 4.0604 }, { "epoch": 0.9469323337936476, "grad_norm": 96.49759195029552, "learning_rate": 4.9003067484662584e-06, "loss": 31.6683, "step": 600, "true_loss": 3.9236 }, { "epoch": 0.9469323337936476, "eval_accuracy": 0.10532150776053215, "eval_loss": 3.7553915977478027, "eval_runtime": 23.5486, "eval_samples_per_second": 38.304, "eval_steps_per_second": 4.799, "step": 600 }, { "epoch": 0.9627145393568751, "grad_norm": 77.40515202969824, "learning_rate": 4.889351446099912e-06, "loss": 31.6165, "step": 610, "true_loss": 3.9 }, { "epoch": 0.9784967449201026, "grad_norm": 78.65684975765593, "learning_rate": 4.878396143733568e-06, "loss": 32.0743, "step": 620, "true_loss": 3.8977 }, { "epoch": 0.99427895048333, "grad_norm": 84.00632303949844, "learning_rate": 4.867440841367222e-06, "loss": 32.053, "step": 630, "true_loss": 3.9405 }, { "epoch": 1.0094693233379364, "grad_norm": 86.53116951522864, "learning_rate": 4.856485539000877e-06, "loss": 30.8277, "step": 640, "true_loss": 4.0447 }, { "epoch": 1.0252515289011639, "grad_norm": 76.42542739289013, "learning_rate": 4.8455302366345315e-06, "loss": 31.4788, "step": 650, "true_loss": 4.0438 }, { "epoch": 1.0252515289011639, "eval_accuracy": 0.1197339246119734, "eval_loss": 3.684516191482544, "eval_runtime": 23.3178, "eval_samples_per_second": 38.683, "eval_steps_per_second": 4.846, "step": 650 }, { "epoch": 1.0410337344643914, "grad_norm": 83.79936090415067, "learning_rate": 4.834574934268186e-06, "loss": 31.1704, "step": 660, "true_loss": 4.1986 }, { "epoch": 1.056815940027619, "grad_norm": 78.2353569729384, "learning_rate": 4.823619631901841e-06, "loss": 31.2897, "step": 670, "true_loss": 4.1662 }, { "epoch": 1.0725981455908462, "grad_norm": 86.79746116556045, "learning_rate": 4.812664329535496e-06, "loss": 31.4329, "step": 680, "true_loss": 3.954 }, { "epoch": 1.0883803511540737, "grad_norm": 91.03308204547182, "learning_rate": 4.80170902716915e-06, "loss": 31.0246, "step": 690, "true_loss": 3.629 }, { "epoch": 1.1041625567173012, "grad_norm": 95.5543326586022, "learning_rate": 4.790753724802805e-06, "loss": 30.7064, "step": 700, "true_loss": 3.8903 }, { "epoch": 1.1041625567173012, "eval_accuracy": 0.13414634146341464, "eval_loss": 3.6483702659606934, "eval_runtime": 23.2932, "eval_samples_per_second": 38.724, "eval_steps_per_second": 4.851, "step": 700 }, { "epoch": 1.1199447622805287, "grad_norm": 79.97842689644025, "learning_rate": 4.77979842243646e-06, "loss": 30.7059, "step": 710, "true_loss": 3.7666 }, { "epoch": 1.1357269678437563, "grad_norm": 81.58486936450443, "learning_rate": 4.768843120070114e-06, "loss": 31.2974, "step": 720, "true_loss": 3.7239 }, { "epoch": 1.1515091734069836, "grad_norm": 81.62750665419482, "learning_rate": 4.757887817703769e-06, "loss": 30.8988, "step": 730, "true_loss": 3.8959 }, { "epoch": 1.167291378970211, "grad_norm": 76.29181528031711, "learning_rate": 4.746932515337424e-06, "loss": 30.4343, "step": 740, "true_loss": 4.1375 }, { "epoch": 1.1830735845334386, "grad_norm": 89.83766546896902, "learning_rate": 4.735977212971078e-06, "loss": 31.1135, "step": 750, "true_loss": 3.9016 }, { "epoch": 1.1830735845334386, "eval_accuracy": 0.14745011086474502, "eval_loss": 3.5865719318389893, "eval_runtime": 23.4877, "eval_samples_per_second": 38.403, "eval_steps_per_second": 4.811, "step": 750 }, { "epoch": 1.198855790096666, "grad_norm": 78.24399623511877, "learning_rate": 4.725021910604733e-06, "loss": 30.5554, "step": 760, "true_loss": 3.8632 }, { "epoch": 1.2146379956598934, "grad_norm": 76.12755346059757, "learning_rate": 4.7140666082383876e-06, "loss": 30.8955, "step": 770, "true_loss": 3.859 }, { "epoch": 1.230420201223121, "grad_norm": 127.92778602678045, "learning_rate": 4.703111305872042e-06, "loss": 30.8453, "step": 780, "true_loss": 3.7739 }, { "epoch": 1.2462024067863484, "grad_norm": 76.80423404390017, "learning_rate": 4.692156003505698e-06, "loss": 30.5773, "step": 790, "true_loss": 3.959 }, { "epoch": 1.261984612349576, "grad_norm": 81.5502755186819, "learning_rate": 4.681200701139351e-06, "loss": 30.7506, "step": 800, "true_loss": 3.7586 }, { "epoch": 1.261984612349576, "eval_accuracy": 0.14966740576496673, "eval_loss": 3.5678353309631348, "eval_runtime": 23.4696, "eval_samples_per_second": 38.433, "eval_steps_per_second": 4.815, "step": 800 }, { "epoch": 1.2777668179128034, "grad_norm": 85.80122726399287, "learning_rate": 4.670245398773007e-06, "loss": 30.8062, "step": 810, "true_loss": 3.8887 }, { "epoch": 1.2935490234760307, "grad_norm": 84.42365154547282, "learning_rate": 4.6592900964066614e-06, "loss": 30.036, "step": 820, "true_loss": 3.6739 }, { "epoch": 1.3093312290392582, "grad_norm": 78.03930782671614, "learning_rate": 4.648334794040315e-06, "loss": 31.0261, "step": 830, "true_loss": 3.8727 }, { "epoch": 1.3251134346024858, "grad_norm": 85.75983079835166, "learning_rate": 4.637379491673971e-06, "loss": 30.5831, "step": 840, "true_loss": 3.9457 }, { "epoch": 1.340895640165713, "grad_norm": 79.40570200457304, "learning_rate": 4.626424189307625e-06, "loss": 30.7713, "step": 850, "true_loss": 3.9606 }, { "epoch": 1.340895640165713, "eval_accuracy": 0.1385809312638581, "eval_loss": 3.5111687183380127, "eval_runtime": 23.5209, "eval_samples_per_second": 38.349, "eval_steps_per_second": 4.804, "step": 850 }, { "epoch": 1.3566778457289406, "grad_norm": 87.96181357071613, "learning_rate": 4.61546888694128e-06, "loss": 30.4215, "step": 860, "true_loss": 4.0675 }, { "epoch": 1.372460051292168, "grad_norm": 84.31171636959083, "learning_rate": 4.6045135845749345e-06, "loss": 30.8854, "step": 870, "true_loss": 3.8743 }, { "epoch": 1.3882422568553956, "grad_norm": 88.71895588552248, "learning_rate": 4.593558282208589e-06, "loss": 30.6506, "step": 880, "true_loss": 4.0285 }, { "epoch": 1.404024462418623, "grad_norm": 79.5801198749425, "learning_rate": 4.582602979842244e-06, "loss": 30.6195, "step": 890, "true_loss": 3.8911 }, { "epoch": 1.4198066679818504, "grad_norm": 79.43549032930792, "learning_rate": 4.571647677475899e-06, "loss": 30.9371, "step": 900, "true_loss": 3.6768 }, { "epoch": 1.4198066679818504, "eval_accuracy": 0.15631929046563192, "eval_loss": 3.516885280609131, "eval_runtime": 23.4984, "eval_samples_per_second": 38.386, "eval_steps_per_second": 4.809, "step": 900 }, { "epoch": 1.435588873545078, "grad_norm": 81.53400694175745, "learning_rate": 4.560692375109553e-06, "loss": 31.3002, "step": 910, "true_loss": 3.8526 }, { "epoch": 1.4513710791083054, "grad_norm": 75.1443329518506, "learning_rate": 4.549737072743208e-06, "loss": 30.3588, "step": 920, "true_loss": 3.9476 }, { "epoch": 1.4671532846715327, "grad_norm": 85.40910472916514, "learning_rate": 4.538781770376863e-06, "loss": 30.3214, "step": 930, "true_loss": 3.5832 }, { "epoch": 1.4829354902347602, "grad_norm": 88.4516515835087, "learning_rate": 4.5278264680105176e-06, "loss": 30.4432, "step": 940, "true_loss": 3.7606 }, { "epoch": 1.4987176957979877, "grad_norm": 83.7163741797305, "learning_rate": 4.516871165644172e-06, "loss": 30.0341, "step": 950, "true_loss": 3.708 }, { "epoch": 1.4987176957979877, "eval_accuracy": 0.14745011086474502, "eval_loss": 3.501782178878784, "eval_runtime": 23.4541, "eval_samples_per_second": 38.458, "eval_steps_per_second": 4.818, "step": 950 }, { "epoch": 1.5144999013612153, "grad_norm": 84.89672851982097, "learning_rate": 4.505915863277827e-06, "loss": 29.8211, "step": 960, "true_loss": 3.636 }, { "epoch": 1.5302821069244428, "grad_norm": 98.531441708619, "learning_rate": 4.494960560911481e-06, "loss": 30.3922, "step": 970, "true_loss": 3.9862 }, { "epoch": 1.5460643124876703, "grad_norm": 80.35703030333623, "learning_rate": 4.484005258545136e-06, "loss": 30.5338, "step": 980, "true_loss": 3.6081 }, { "epoch": 1.5618465180508976, "grad_norm": 82.07595103508196, "learning_rate": 4.473049956178791e-06, "loss": 30.3083, "step": 990, "true_loss": 3.8274 }, { "epoch": 1.577628723614125, "grad_norm": 79.59671721927485, "learning_rate": 4.462094653812445e-06, "loss": 30.4286, "step": 1000, "true_loss": 3.6957 }, { "epoch": 1.577628723614125, "eval_accuracy": 0.13636363636363635, "eval_loss": 3.4948031902313232, "eval_runtime": 24.2585, "eval_samples_per_second": 37.183, "eval_steps_per_second": 4.658, "step": 1000 }, { "epoch": 1.5934109291773524, "grad_norm": 82.47604231181, "learning_rate": 4.451139351446101e-06, "loss": 30.322, "step": 1010, "true_loss": 3.7297 }, { "epoch": 1.60919313474058, "grad_norm": 77.40578844926777, "learning_rate": 4.440184049079755e-06, "loss": 30.4083, "step": 1020, "true_loss": 4.0732 }, { "epoch": 1.6249753403038074, "grad_norm": 82.94825841620002, "learning_rate": 4.42922874671341e-06, "loss": 29.8286, "step": 1030, "true_loss": 3.822 }, { "epoch": 1.640757545867035, "grad_norm": 83.44180849899234, "learning_rate": 4.4182734443470645e-06, "loss": 30.0889, "step": 1040, "true_loss": 3.92 }, { "epoch": 1.6565397514302624, "grad_norm": 75.34397566776506, "learning_rate": 4.407318141980719e-06, "loss": 30.0305, "step": 1050, "true_loss": 3.748 }, { "epoch": 1.6565397514302624, "eval_accuracy": 0.16186252771618626, "eval_loss": 3.4837870597839355, "eval_runtime": 23.4392, "eval_samples_per_second": 38.482, "eval_steps_per_second": 4.821, "step": 1050 }, { "epoch": 1.67232195699349, "grad_norm": 86.58622739826328, "learning_rate": 4.396362839614374e-06, "loss": 29.9125, "step": 1060, "true_loss": 3.5295 }, { "epoch": 1.6881041625567172, "grad_norm": 83.26525049158347, "learning_rate": 4.385407537248028e-06, "loss": 30.0806, "step": 1070, "true_loss": 3.9204 }, { "epoch": 1.7038863681199448, "grad_norm": 78.14206749946366, "learning_rate": 4.374452234881683e-06, "loss": 30.1753, "step": 1080, "true_loss": 3.8661 }, { "epoch": 1.7196685736831723, "grad_norm": 91.73104580537138, "learning_rate": 4.3634969325153375e-06, "loss": 30.3678, "step": 1090, "true_loss": 3.7271 }, { "epoch": 1.7354507792463996, "grad_norm": 85.70308212054894, "learning_rate": 4.352541630148993e-06, "loss": 30.0965, "step": 1100, "true_loss": 3.7848 }, { "epoch": 1.7354507792463996, "eval_accuracy": 0.17184035476718404, "eval_loss": 3.456275463104248, "eval_runtime": 23.5809, "eval_samples_per_second": 38.251, "eval_steps_per_second": 4.792, "step": 1100 }, { "epoch": 1.751232984809627, "grad_norm": 113.05298933496893, "learning_rate": 4.341586327782647e-06, "loss": 29.6654, "step": 1110, "true_loss": 3.683 }, { "epoch": 1.7670151903728546, "grad_norm": 83.82656225986186, "learning_rate": 4.330631025416302e-06, "loss": 29.6349, "step": 1120, "true_loss": 3.8404 }, { "epoch": 1.782797395936082, "grad_norm": 85.63660392692398, "learning_rate": 4.319675723049957e-06, "loss": 29.7907, "step": 1130, "true_loss": 3.8374 }, { "epoch": 1.7985796014993096, "grad_norm": 83.32739107388537, "learning_rate": 4.308720420683611e-06, "loss": 30.2289, "step": 1140, "true_loss": 3.7084 }, { "epoch": 1.8143618070625371, "grad_norm": 83.64576027399661, "learning_rate": 4.297765118317266e-06, "loss": 30.3927, "step": 1150, "true_loss": 3.8711 }, { "epoch": 1.8143618070625371, "eval_accuracy": 0.16851441241685144, "eval_loss": 3.45339035987854, "eval_runtime": 23.4334, "eval_samples_per_second": 38.492, "eval_steps_per_second": 4.822, "step": 1150 }, { "epoch": 1.8301440126257644, "grad_norm": 80.2640158820411, "learning_rate": 4.286809815950921e-06, "loss": 29.8517, "step": 1160, "true_loss": 3.8122 }, { "epoch": 1.845926218188992, "grad_norm": 79.88431250597438, "learning_rate": 4.275854513584575e-06, "loss": 30.3006, "step": 1170, "true_loss": 3.9271 }, { "epoch": 1.8617084237522192, "grad_norm": 72.52904684567581, "learning_rate": 4.26489921121823e-06, "loss": 30.7417, "step": 1180, "true_loss": 3.6993 }, { "epoch": 1.8774906293154467, "grad_norm": 78.8291797715746, "learning_rate": 4.253943908851884e-06, "loss": 30.1426, "step": 1190, "true_loss": 3.5637 }, { "epoch": 1.8932728348786743, "grad_norm": 80.92121268530722, "learning_rate": 4.242988606485539e-06, "loss": 29.8786, "step": 1200, "true_loss": 3.6987 }, { "epoch": 1.8932728348786743, "eval_accuracy": 0.16518847006651885, "eval_loss": 3.4521777629852295, "eval_runtime": 23.525, "eval_samples_per_second": 38.342, "eval_steps_per_second": 4.803, "step": 1200 }, { "epoch": 1.9090550404419018, "grad_norm": 98.662691098247, "learning_rate": 4.2320333041191945e-06, "loss": 29.6018, "step": 1210, "true_loss": 3.6699 }, { "epoch": 1.9248372460051293, "grad_norm": 86.77145422694792, "learning_rate": 4.221078001752848e-06, "loss": 30.1541, "step": 1220, "true_loss": 3.6909 }, { "epoch": 1.9406194515683568, "grad_norm": 80.78042974871043, "learning_rate": 4.210122699386504e-06, "loss": 29.4607, "step": 1230, "true_loss": 3.5711 }, { "epoch": 1.956401657131584, "grad_norm": 81.04377653758199, "learning_rate": 4.199167397020158e-06, "loss": 29.3211, "step": 1240, "true_loss": 3.8011 }, { "epoch": 1.9721838626948116, "grad_norm": 86.55563161686023, "learning_rate": 4.188212094653813e-06, "loss": 29.9817, "step": 1250, "true_loss": 3.7149 }, { "epoch": 1.9721838626948116, "eval_accuracy": 0.15964523281596452, "eval_loss": 3.449939727783203, "eval_runtime": 23.6136, "eval_samples_per_second": 38.198, "eval_steps_per_second": 4.785, "step": 1250 }, { "epoch": 1.987966068258039, "grad_norm": 89.14780268157863, "learning_rate": 4.1772567922874675e-06, "loss": 29.7539, "step": 1260, "true_loss": 3.8852 }, { "epoch": 2.0031564411126457, "grad_norm": 85.67873888388756, "learning_rate": 4.166301489921122e-06, "loss": 27.7418, "step": 1270, "true_loss": 3.4435 }, { "epoch": 2.0189386466758728, "grad_norm": 91.84834530303071, "learning_rate": 4.155346187554777e-06, "loss": 28.0091, "step": 1280, "true_loss": 3.6655 }, { "epoch": 2.0347208522391003, "grad_norm": 92.68958340455731, "learning_rate": 4.144390885188432e-06, "loss": 28.1967, "step": 1290, "true_loss": 3.4276 }, { "epoch": 2.0505030578023278, "grad_norm": 89.43624828935572, "learning_rate": 4.133435582822086e-06, "loss": 28.5997, "step": 1300, "true_loss": 3.5717 }, { "epoch": 2.0505030578023278, "eval_accuracy": 0.17738359201773837, "eval_loss": 3.4442319869995117, "eval_runtime": 23.8191, "eval_samples_per_second": 37.869, "eval_steps_per_second": 4.744, "step": 1300 }, { "epoch": 2.0662852633655553, "grad_norm": 99.35761435133952, "learning_rate": 4.1224802804557405e-06, "loss": 28.5825, "step": 1310, "true_loss": 3.6905 }, { "epoch": 2.082067468928783, "grad_norm": 107.54245855574793, "learning_rate": 4.111524978089396e-06, "loss": 28.3337, "step": 1320, "true_loss": 3.1879 }, { "epoch": 2.0978496744920103, "grad_norm": 101.36444136892075, "learning_rate": 4.10056967572305e-06, "loss": 28.1448, "step": 1330, "true_loss": 3.3838 }, { "epoch": 2.113631880055238, "grad_norm": 91.57881684087138, "learning_rate": 4.089614373356705e-06, "loss": 28.439, "step": 1340, "true_loss": 3.4833 }, { "epoch": 2.1294140856184653, "grad_norm": 98.34826339265827, "learning_rate": 4.07865907099036e-06, "loss": 28.1968, "step": 1350, "true_loss": 3.5336 }, { "epoch": 2.1294140856184653, "eval_accuracy": 0.17405764966740578, "eval_loss": 3.4274425506591797, "eval_runtime": 23.5103, "eval_samples_per_second": 38.366, "eval_steps_per_second": 4.806, "step": 1350 }, { "epoch": 2.1451962911816924, "grad_norm": 96.2167621370401, "learning_rate": 4.067703768624014e-06, "loss": 28.4515, "step": 1360, "true_loss": 3.4945 }, { "epoch": 2.16097849674492, "grad_norm": 94.64497682878313, "learning_rate": 4.056748466257669e-06, "loss": 27.6678, "step": 1370, "true_loss": 3.4098 }, { "epoch": 2.1767607023081474, "grad_norm": 100.20459116401894, "learning_rate": 4.045793163891324e-06, "loss": 28.8158, "step": 1380, "true_loss": 3.5925 }, { "epoch": 2.192542907871375, "grad_norm": 106.69369311300359, "learning_rate": 4.034837861524978e-06, "loss": 28.4355, "step": 1390, "true_loss": 3.6526 }, { "epoch": 2.2083251134346025, "grad_norm": 95.16067729523829, "learning_rate": 4.023882559158634e-06, "loss": 28.3308, "step": 1400, "true_loss": 3.4854 }, { "epoch": 2.2083251134346025, "eval_accuracy": 0.17073170731707318, "eval_loss": 3.468222141265869, "eval_runtime": 23.3676, "eval_samples_per_second": 38.601, "eval_steps_per_second": 4.836, "step": 1400 }, { "epoch": 2.22410731899783, "grad_norm": 106.31029544813128, "learning_rate": 4.0129272567922874e-06, "loss": 27.7881, "step": 1410, "true_loss": 3.5438 }, { "epoch": 2.2398895245610575, "grad_norm": 118.12691339175365, "learning_rate": 4.001971954425942e-06, "loss": 28.4915, "step": 1420, "true_loss": 3.7524 }, { "epoch": 2.255671730124285, "grad_norm": 102.56498798791145, "learning_rate": 3.9910166520595975e-06, "loss": 28.4596, "step": 1430, "true_loss": 3.5802 }, { "epoch": 2.2714539356875125, "grad_norm": 92.83492025526797, "learning_rate": 3.980061349693251e-06, "loss": 29.4852, "step": 1440, "true_loss": 3.697 }, { "epoch": 2.28723614125074, "grad_norm": 104.63474442877235, "learning_rate": 3.969106047326907e-06, "loss": 28.4732, "step": 1450, "true_loss": 3.6749 }, { "epoch": 2.28723614125074, "eval_accuracy": 0.16518847006651885, "eval_loss": 3.4468021392822266, "eval_runtime": 24.1619, "eval_samples_per_second": 37.331, "eval_steps_per_second": 4.677, "step": 1450 }, { "epoch": 2.303018346813967, "grad_norm": 99.15997067939499, "learning_rate": 3.958150744960561e-06, "loss": 28.6682, "step": 1460, "true_loss": 3.705 }, { "epoch": 2.3188005523771946, "grad_norm": 91.4398616289482, "learning_rate": 3.947195442594216e-06, "loss": 28.56, "step": 1470, "true_loss": 3.6423 }, { "epoch": 2.334582757940422, "grad_norm": 96.16402518504508, "learning_rate": 3.9362401402278705e-06, "loss": 28.3552, "step": 1480, "true_loss": 3.5847 }, { "epoch": 2.3503649635036497, "grad_norm": 100.32044951837123, "learning_rate": 3.925284837861525e-06, "loss": 27.8238, "step": 1490, "true_loss": 3.4951 }, { "epoch": 2.366147169066877, "grad_norm": 108.30929919855534, "learning_rate": 3.91432953549518e-06, "loss": 28.0564, "step": 1500, "true_loss": 3.4013 }, { "epoch": 2.366147169066877, "eval_accuracy": 0.19068736141906872, "eval_loss": 3.4383325576782227, "eval_runtime": 23.4626, "eval_samples_per_second": 38.444, "eval_steps_per_second": 4.816, "step": 1500 }, { "epoch": 2.3819293746301047, "grad_norm": 101.32218106896308, "learning_rate": 3.903374233128835e-06, "loss": 28.136, "step": 1510, "true_loss": 3.4468 }, { "epoch": 2.397711580193332, "grad_norm": 119.46781444424694, "learning_rate": 3.892418930762489e-06, "loss": 28.1505, "step": 1520, "true_loss": 3.3804 }, { "epoch": 2.4134937857565593, "grad_norm": 98.2054285284842, "learning_rate": 3.881463628396144e-06, "loss": 27.6035, "step": 1530, "true_loss": 3.3891 }, { "epoch": 2.4292759913197868, "grad_norm": 112.97473372070763, "learning_rate": 3.870508326029799e-06, "loss": 28.4998, "step": 1540, "true_loss": 3.5322 }, { "epoch": 2.4450581968830143, "grad_norm": 117.64901710704211, "learning_rate": 3.859553023663454e-06, "loss": 27.5489, "step": 1550, "true_loss": 3.6249 }, { "epoch": 2.4450581968830143, "eval_accuracy": 0.18514412416851442, "eval_loss": 3.4197070598602295, "eval_runtime": 23.6222, "eval_samples_per_second": 38.184, "eval_steps_per_second": 4.784, "step": 1550 }, { "epoch": 2.460840402446242, "grad_norm": 109.35620586092381, "learning_rate": 3.848597721297108e-06, "loss": 28.0459, "step": 1560, "true_loss": 3.457 }, { "epoch": 2.4766226080094693, "grad_norm": 107.89850154465785, "learning_rate": 3.837642418930763e-06, "loss": 28.2065, "step": 1570, "true_loss": 3.4175 }, { "epoch": 2.492404813572697, "grad_norm": 107.9485757956826, "learning_rate": 3.8266871165644174e-06, "loss": 28.0224, "step": 1580, "true_loss": 3.381 }, { "epoch": 2.5081870191359243, "grad_norm": 104.63289094164804, "learning_rate": 3.815731814198072e-06, "loss": 27.5733, "step": 1590, "true_loss": 3.3078 }, { "epoch": 2.523969224699152, "grad_norm": 108.18649246205509, "learning_rate": 3.804776511831727e-06, "loss": 27.2167, "step": 1600, "true_loss": 3.4603 }, { "epoch": 2.523969224699152, "eval_accuracy": 0.18070953436807094, "eval_loss": 3.431137800216675, "eval_runtime": 23.504, "eval_samples_per_second": 38.376, "eval_steps_per_second": 4.808, "step": 1600 }, { "epoch": 2.5397514302623794, "grad_norm": 114.14145896126591, "learning_rate": 3.7938212094653817e-06, "loss": 27.6766, "step": 1610, "true_loss": 3.5939 }, { "epoch": 2.555533635825607, "grad_norm": 101.41778644287517, "learning_rate": 3.7828659070990363e-06, "loss": 28.5637, "step": 1620, "true_loss": 3.5553 }, { "epoch": 2.571315841388834, "grad_norm": 118.96886281607144, "learning_rate": 3.771910604732691e-06, "loss": 28.2376, "step": 1630, "true_loss": 3.4421 }, { "epoch": 2.5870980469520615, "grad_norm": 105.63150744436943, "learning_rate": 3.760955302366346e-06, "loss": 27.8268, "step": 1640, "true_loss": 3.658 }, { "epoch": 2.602880252515289, "grad_norm": 101.02996745585388, "learning_rate": 3.7500000000000005e-06, "loss": 27.7274, "step": 1650, "true_loss": 3.4037 }, { "epoch": 2.602880252515289, "eval_accuracy": 0.17960088691796008, "eval_loss": 3.376224994659424, "eval_runtime": 23.5541, "eval_samples_per_second": 38.295, "eval_steps_per_second": 4.797, "step": 1650 }, { "epoch": 2.6186624580785165, "grad_norm": 116.24820801533909, "learning_rate": 3.7390446976336547e-06, "loss": 27.7633, "step": 1660, "true_loss": 3.6444 }, { "epoch": 2.634444663641744, "grad_norm": 110.02247087865383, "learning_rate": 3.7280893952673097e-06, "loss": 27.6797, "step": 1670, "true_loss": 3.435 }, { "epoch": 2.6502268692049715, "grad_norm": 120.06730846501912, "learning_rate": 3.7171340929009643e-06, "loss": 27.2491, "step": 1680, "true_loss": 3.1836 }, { "epoch": 2.6660090747681986, "grad_norm": 121.81270597045369, "learning_rate": 3.706178790534619e-06, "loss": 28.073, "step": 1690, "true_loss": 3.3306 }, { "epoch": 2.681791280331426, "grad_norm": 110.21441029227314, "learning_rate": 3.6952234881682735e-06, "loss": 27.938, "step": 1700, "true_loss": 3.4295 }, { "epoch": 2.681791280331426, "eval_accuracy": 0.18292682926829268, "eval_loss": 3.401149034500122, "eval_runtime": 23.5868, "eval_samples_per_second": 38.242, "eval_steps_per_second": 4.791, "step": 1700 }, { "epoch": 2.6975734858946536, "grad_norm": 110.62961142172523, "learning_rate": 3.6842681858019286e-06, "loss": 27.7022, "step": 1710, "true_loss": 3.6102 }, { "epoch": 2.713355691457881, "grad_norm": 121.59511608435213, "learning_rate": 3.673312883435583e-06, "loss": 27.8377, "step": 1720, "true_loss": 3.3472 }, { "epoch": 2.7291378970211086, "grad_norm": 110.26717672982993, "learning_rate": 3.662357581069238e-06, "loss": 27.6534, "step": 1730, "true_loss": 3.3431 }, { "epoch": 2.744920102584336, "grad_norm": 117.14013110668091, "learning_rate": 3.6514022787028924e-06, "loss": 28.1573, "step": 1740, "true_loss": 3.5585 }, { "epoch": 2.7607023081475637, "grad_norm": 103.22808556925521, "learning_rate": 3.6404469763365474e-06, "loss": 28.8963, "step": 1750, "true_loss": 3.7366 }, { "epoch": 2.7607023081475637, "eval_accuracy": 0.21507760532150777, "eval_loss": 3.3600847721099854, "eval_runtime": 23.7689, "eval_samples_per_second": 37.949, "eval_steps_per_second": 4.754, "step": 1750 }, { "epoch": 2.776484513710791, "grad_norm": 119.68672918874461, "learning_rate": 3.629491673970202e-06, "loss": 27.1578, "step": 1760, "true_loss": 3.2452 }, { "epoch": 2.7922667192740187, "grad_norm": 113.81962909510548, "learning_rate": 3.6185363716038562e-06, "loss": 27.3928, "step": 1770, "true_loss": 3.4252 }, { "epoch": 2.808048924837246, "grad_norm": 110.9797561464594, "learning_rate": 3.6075810692375112e-06, "loss": 27.0098, "step": 1780, "true_loss": 3.5172 }, { "epoch": 2.8238311304004737, "grad_norm": 135.79617775683806, "learning_rate": 3.596625766871166e-06, "loss": 27.3674, "step": 1790, "true_loss": 3.3797 }, { "epoch": 2.839613335963701, "grad_norm": 119.33589374926603, "learning_rate": 3.585670464504821e-06, "loss": 27.2162, "step": 1800, "true_loss": 3.3234 }, { "epoch": 2.839613335963701, "eval_accuracy": 0.1951219512195122, "eval_loss": 3.3591113090515137, "eval_runtime": 23.4441, "eval_samples_per_second": 38.474, "eval_steps_per_second": 4.82, "step": 1800 }, { "epoch": 2.8553955415269283, "grad_norm": 119.37156598330388, "learning_rate": 3.574715162138475e-06, "loss": 27.3081, "step": 1810, "true_loss": 3.2923 }, { "epoch": 2.871177747090156, "grad_norm": 132.69349392778963, "learning_rate": 3.56375985977213e-06, "loss": 26.8125, "step": 1820, "true_loss": 3.536 }, { "epoch": 2.8869599526533833, "grad_norm": 116.2205229286727, "learning_rate": 3.5528045574057847e-06, "loss": 26.5684, "step": 1830, "true_loss": 3.1813 }, { "epoch": 2.902742158216611, "grad_norm": 111.1425353804662, "learning_rate": 3.5418492550394397e-06, "loss": 26.8854, "step": 1840, "true_loss": 3.1503 }, { "epoch": 2.9185243637798384, "grad_norm": 113.60358152983127, "learning_rate": 3.530893952673094e-06, "loss": 26.6366, "step": 1850, "true_loss": 2.9838 }, { "epoch": 2.9185243637798384, "eval_accuracy": 0.19955654101995565, "eval_loss": 3.397289276123047, "eval_runtime": 23.961, "eval_samples_per_second": 37.644, "eval_steps_per_second": 4.716, "step": 1850 }, { "epoch": 2.9343065693430654, "grad_norm": 112.20064363741088, "learning_rate": 3.519938650306749e-06, "loss": 27.5522, "step": 1860, "true_loss": 3.4456 }, { "epoch": 2.950088774906293, "grad_norm": 116.35639667251797, "learning_rate": 3.5089833479404035e-06, "loss": 27.3933, "step": 1870, "true_loss": 3.2476 }, { "epoch": 2.9658709804695205, "grad_norm": 113.51852904309564, "learning_rate": 3.4980280455740586e-06, "loss": 27.3116, "step": 1880, "true_loss": 3.4052 }, { "epoch": 2.981653186032748, "grad_norm": 140.06716231141482, "learning_rate": 3.4870727432077128e-06, "loss": 27.529, "step": 1890, "true_loss": 3.2584 }, { "epoch": 2.9974353915959755, "grad_norm": 135.07195993457486, "learning_rate": 3.4761174408413674e-06, "loss": 26.2922, "step": 1900, "true_loss": 3.6881 }, { "epoch": 2.9974353915959755, "eval_accuracy": 0.21175166297117518, "eval_loss": 3.410022020339966, "eval_runtime": 23.388, "eval_samples_per_second": 38.567, "eval_steps_per_second": 4.832, "step": 1900 }, { "epoch": 3.012625764450582, "grad_norm": 137.85175905803882, "learning_rate": 3.4651621384750224e-06, "loss": 23.6846, "step": 1910, "true_loss": 3.1292 }, { "epoch": 3.0284079700138093, "grad_norm": 172.1559954781924, "learning_rate": 3.4542068361086766e-06, "loss": 24.3275, "step": 1920, "true_loss": 3.029 }, { "epoch": 3.044190175577037, "grad_norm": 162.1531120047678, "learning_rate": 3.4432515337423316e-06, "loss": 23.6388, "step": 1930, "true_loss": 3.1426 }, { "epoch": 3.0599723811402644, "grad_norm": 172.67413218718934, "learning_rate": 3.432296231375986e-06, "loss": 23.244, "step": 1940, "true_loss": 3.1781 }, { "epoch": 3.075754586703492, "grad_norm": 187.41213076128065, "learning_rate": 3.4213409290096412e-06, "loss": 22.4652, "step": 1950, "true_loss": 2.8263 }, { "epoch": 3.075754586703492, "eval_accuracy": 0.2073170731707317, "eval_loss": 3.5123629570007324, "eval_runtime": 23.4919, "eval_samples_per_second": 38.396, "eval_steps_per_second": 4.81, "step": 1950 }, { "epoch": 3.0915367922667194, "grad_norm": 166.74434221165308, "learning_rate": 3.4103856266432954e-06, "loss": 23.1014, "step": 1960, "true_loss": 2.6932 }, { "epoch": 3.107318997829947, "grad_norm": 184.15815191712096, "learning_rate": 3.3994303242769504e-06, "loss": 23.5115, "step": 1970, "true_loss": 2.9855 }, { "epoch": 3.123101203393174, "grad_norm": 193.10485815495565, "learning_rate": 3.388475021910605e-06, "loss": 23.2851, "step": 1980, "true_loss": 3.0261 }, { "epoch": 3.1388834089564015, "grad_norm": 176.70237245748737, "learning_rate": 3.37751971954426e-06, "loss": 22.5552, "step": 1990, "true_loss": 2.7875 }, { "epoch": 3.154665614519629, "grad_norm": 175.45874674208062, "learning_rate": 3.3665644171779143e-06, "loss": 23.0779, "step": 2000, "true_loss": 3.0755 }, { "epoch": 3.154665614519629, "eval_accuracy": 0.1951219512195122, "eval_loss": 3.518279790878296, "eval_runtime": 23.429, "eval_samples_per_second": 38.499, "eval_steps_per_second": 4.823, "step": 2000 }, { "epoch": 3.1704478200828565, "grad_norm": 204.6875043602089, "learning_rate": 3.355609114811569e-06, "loss": 23.9188, "step": 2010, "true_loss": 2.9287 }, { "epoch": 3.186230025646084, "grad_norm": 181.8017919728403, "learning_rate": 3.344653812445224e-06, "loss": 23.0104, "step": 2020, "true_loss": 2.5328 }, { "epoch": 3.2020122312093116, "grad_norm": 173.98317233976945, "learning_rate": 3.333698510078878e-06, "loss": 23.7301, "step": 2030, "true_loss": 2.871 }, { "epoch": 3.217794436772539, "grad_norm": 174.0443479722808, "learning_rate": 3.322743207712533e-06, "loss": 24.3288, "step": 2040, "true_loss": 3.1793 }, { "epoch": 3.2335766423357666, "grad_norm": 185.90013088197432, "learning_rate": 3.3117879053461877e-06, "loss": 22.5937, "step": 2050, "true_loss": 3.1133 }, { "epoch": 3.2335766423357666, "eval_accuracy": 0.18625277161862527, "eval_loss": 3.5166099071502686, "eval_runtime": 23.593, "eval_samples_per_second": 38.232, "eval_steps_per_second": 4.79, "step": 2050 }, { "epoch": 3.2493588478989937, "grad_norm": 189.857780105008, "learning_rate": 3.3008326029798428e-06, "loss": 22.687, "step": 2060, "true_loss": 2.9259 }, { "epoch": 3.265141053462221, "grad_norm": 198.11103175678093, "learning_rate": 3.289877300613497e-06, "loss": 22.464, "step": 2070, "true_loss": 2.717 }, { "epoch": 3.2809232590254487, "grad_norm": 173.06723146578037, "learning_rate": 3.278921998247152e-06, "loss": 23.7517, "step": 2080, "true_loss": 2.8572 }, { "epoch": 3.296705464588676, "grad_norm": 177.6202554246561, "learning_rate": 3.2679666958808066e-06, "loss": 23.381, "step": 2090, "true_loss": 2.8204 }, { "epoch": 3.3124876701519037, "grad_norm": 194.9987607510721, "learning_rate": 3.2570113935144616e-06, "loss": 23.0476, "step": 2100, "true_loss": 2.7996 }, { "epoch": 3.3124876701519037, "eval_accuracy": 0.20953436807095344, "eval_loss": 3.53348445892334, "eval_runtime": 23.5604, "eval_samples_per_second": 38.285, "eval_steps_per_second": 4.796, "step": 2100 }, { "epoch": 3.328269875715131, "grad_norm": 196.92681142174288, "learning_rate": 3.2460560911481158e-06, "loss": 23.0719, "step": 2110, "true_loss": 2.9121 }, { "epoch": 3.3440520812783587, "grad_norm": 195.62741425178288, "learning_rate": 3.235100788781771e-06, "loss": 23.2077, "step": 2120, "true_loss": 3.0814 }, { "epoch": 3.3598342868415862, "grad_norm": 177.23029524504884, "learning_rate": 3.2241454864154254e-06, "loss": 22.3518, "step": 2130, "true_loss": 2.7904 }, { "epoch": 3.3756164924048138, "grad_norm": 202.04264278641077, "learning_rate": 3.2131901840490796e-06, "loss": 23.189, "step": 2140, "true_loss": 2.9555 }, { "epoch": 3.3913986979680413, "grad_norm": 185.33417167714055, "learning_rate": 3.2022348816827346e-06, "loss": 23.3729, "step": 2150, "true_loss": 2.8725 }, { "epoch": 3.3913986979680413, "eval_accuracy": 0.21951219512195122, "eval_loss": 3.5268046855926514, "eval_runtime": 24.2636, "eval_samples_per_second": 37.175, "eval_steps_per_second": 4.657, "step": 2150 }, { "epoch": 3.4071809035312683, "grad_norm": 192.10647604764068, "learning_rate": 3.1912795793163892e-06, "loss": 22.5887, "step": 2160, "true_loss": 2.9837 }, { "epoch": 3.422963109094496, "grad_norm": 202.2192284482673, "learning_rate": 3.1803242769500443e-06, "loss": 23.7945, "step": 2170, "true_loss": 3.1055 }, { "epoch": 3.4387453146577234, "grad_norm": 204.53307345645257, "learning_rate": 3.1693689745836984e-06, "loss": 23.563, "step": 2180, "true_loss": 2.9075 }, { "epoch": 3.454527520220951, "grad_norm": 207.29464247219633, "learning_rate": 3.1584136722173535e-06, "loss": 22.7859, "step": 2190, "true_loss": 2.7214 }, { "epoch": 3.4703097257841784, "grad_norm": 198.53826456839306, "learning_rate": 3.147458369851008e-06, "loss": 23.0344, "step": 2200, "true_loss": 2.7901 }, { "epoch": 3.4703097257841784, "eval_accuracy": 0.21618625277161863, "eval_loss": 3.5072102546691895, "eval_runtime": 23.4117, "eval_samples_per_second": 38.528, "eval_steps_per_second": 4.827, "step": 2200 }, { "epoch": 3.486091931347406, "grad_norm": 190.81780201345606, "learning_rate": 3.136503067484663e-06, "loss": 23.5435, "step": 2210, "true_loss": 3.0691 }, { "epoch": 3.501874136910633, "grad_norm": 214.328858364254, "learning_rate": 3.1255477651183173e-06, "loss": 23.68, "step": 2220, "true_loss": 2.6397 }, { "epoch": 3.5176563424738605, "grad_norm": 206.68952001153212, "learning_rate": 3.1145924627519723e-06, "loss": 23.0952, "step": 2230, "true_loss": 2.7205 }, { "epoch": 3.533438548037088, "grad_norm": 208.75152165326494, "learning_rate": 3.103637160385627e-06, "loss": 22.5336, "step": 2240, "true_loss": 3.036 }, { "epoch": 3.5492207536003155, "grad_norm": 184.9626693955468, "learning_rate": 3.0926818580192815e-06, "loss": 23.4844, "step": 2250, "true_loss": 2.9116 }, { "epoch": 3.5492207536003155, "eval_accuracy": 0.2106430155210643, "eval_loss": 3.498487949371338, "eval_runtime": 23.483, "eval_samples_per_second": 38.411, "eval_steps_per_second": 4.812, "step": 2250 }, { "epoch": 3.565002959163543, "grad_norm": 198.96691829413248, "learning_rate": 3.081726555652936e-06, "loss": 23.3554, "step": 2260, "true_loss": 2.9216 }, { "epoch": 3.5807851647267706, "grad_norm": 180.92135475830804, "learning_rate": 3.0707712532865907e-06, "loss": 22.7764, "step": 2270, "true_loss": 2.9984 }, { "epoch": 3.596567370289998, "grad_norm": 183.4421393512338, "learning_rate": 3.0598159509202458e-06, "loss": 23.372, "step": 2280, "true_loss": 2.9573 }, { "epoch": 3.6123495758532256, "grad_norm": 202.35817132936737, "learning_rate": 3.0488606485539004e-06, "loss": 23.4257, "step": 2290, "true_loss": 2.6883 }, { "epoch": 3.628131781416453, "grad_norm": 199.82230242607557, "learning_rate": 3.037905346187555e-06, "loss": 22.1407, "step": 2300, "true_loss": 2.74 }, { "epoch": 3.628131781416453, "eval_accuracy": 0.20842572062084258, "eval_loss": 3.5309500694274902, "eval_runtime": 24.0391, "eval_samples_per_second": 37.522, "eval_steps_per_second": 4.701, "step": 2300 }, { "epoch": 3.6439139869796806, "grad_norm": 194.28708558330584, "learning_rate": 3.0269500438212096e-06, "loss": 22.5618, "step": 2310, "true_loss": 2.6648 }, { "epoch": 3.659696192542908, "grad_norm": 193.7524928452101, "learning_rate": 3.0159947414548646e-06, "loss": 22.1443, "step": 2320, "true_loss": 2.8645 }, { "epoch": 3.675478398106135, "grad_norm": 192.96315977280997, "learning_rate": 3.0050394390885192e-06, "loss": 23.2362, "step": 2330, "true_loss": 2.8177 }, { "epoch": 3.6912606036693627, "grad_norm": 198.51233866275422, "learning_rate": 2.994084136722174e-06, "loss": 22.4093, "step": 2340, "true_loss": 2.7964 }, { "epoch": 3.70704280923259, "grad_norm": 208.98437504465784, "learning_rate": 2.9831288343558284e-06, "loss": 23.2475, "step": 2350, "true_loss": 3.0637 }, { "epoch": 3.70704280923259, "eval_accuracy": 0.21286031042128603, "eval_loss": 3.5128560066223145, "eval_runtime": 23.431, "eval_samples_per_second": 38.496, "eval_steps_per_second": 4.823, "step": 2350 }, { "epoch": 3.7228250147958177, "grad_norm": 182.89852894050068, "learning_rate": 2.972173531989483e-06, "loss": 22.6541, "step": 2360, "true_loss": 2.9868 }, { "epoch": 3.7386072203590452, "grad_norm": 223.98374293428023, "learning_rate": 2.961218229623138e-06, "loss": 22.4632, "step": 2370, "true_loss": 2.6294 }, { "epoch": 3.7543894259222728, "grad_norm": 182.3598356504213, "learning_rate": 2.9502629272567923e-06, "loss": 22.1748, "step": 2380, "true_loss": 2.5931 }, { "epoch": 3.7701716314855, "grad_norm": 213.11724384324478, "learning_rate": 2.9393076248904473e-06, "loss": 22.3503, "step": 2390, "true_loss": 2.8395 }, { "epoch": 3.7859538370487273, "grad_norm": 208.8162547470841, "learning_rate": 2.928352322524102e-06, "loss": 23.484, "step": 2400, "true_loss": 2.8696 }, { "epoch": 3.7859538370487273, "eval_accuracy": 0.21951219512195122, "eval_loss": 3.5314548015594482, "eval_runtime": 23.301, "eval_samples_per_second": 38.711, "eval_steps_per_second": 4.85, "step": 2400 }, { "epoch": 3.801736042611955, "grad_norm": 194.12324067147534, "learning_rate": 2.917397020157757e-06, "loss": 22.9102, "step": 2410, "true_loss": 3.0521 }, { "epoch": 3.8175182481751824, "grad_norm": 202.83827734728044, "learning_rate": 2.906441717791411e-06, "loss": 22.6028, "step": 2420, "true_loss": 2.8784 }, { "epoch": 3.83330045373841, "grad_norm": 195.96924169110147, "learning_rate": 2.895486415425066e-06, "loss": 22.2391, "step": 2430, "true_loss": 2.9091 }, { "epoch": 3.8490826593016374, "grad_norm": 188.2089301960466, "learning_rate": 2.8845311130587207e-06, "loss": 22.8444, "step": 2440, "true_loss": 2.9223 }, { "epoch": 3.864864864864865, "grad_norm": 201.5088960578464, "learning_rate": 2.8735758106923758e-06, "loss": 22.4999, "step": 2450, "true_loss": 2.8717 }, { "epoch": 3.864864864864865, "eval_accuracy": 0.20842572062084258, "eval_loss": 3.4804859161376953, "eval_runtime": 23.7691, "eval_samples_per_second": 37.948, "eval_steps_per_second": 4.754, "step": 2450 }, { "epoch": 3.8806470704280924, "grad_norm": 198.094728115419, "learning_rate": 2.86262050832603e-06, "loss": 23.0397, "step": 2460, "true_loss": 2.7442 }, { "epoch": 3.89642927599132, "grad_norm": 195.44923168057892, "learning_rate": 2.851665205959685e-06, "loss": 23.0451, "step": 2470, "true_loss": 2.8521 }, { "epoch": 3.9122114815545475, "grad_norm": 201.32496209702103, "learning_rate": 2.8407099035933396e-06, "loss": 22.3542, "step": 2480, "true_loss": 2.9322 }, { "epoch": 3.927993687117775, "grad_norm": 194.7632340960965, "learning_rate": 2.8297546012269938e-06, "loss": 23.3372, "step": 2490, "true_loss": 3.1217 }, { "epoch": 3.943775892681002, "grad_norm": 200.353777290284, "learning_rate": 2.818799298860649e-06, "loss": 21.9466, "step": 2500, "true_loss": 2.5317 }, { "epoch": 3.943775892681002, "eval_accuracy": 0.21286031042128603, "eval_loss": 3.514308452606201, "eval_runtime": 23.534, "eval_samples_per_second": 38.328, "eval_steps_per_second": 4.802, "step": 2500 }, { "epoch": 3.9595580982442296, "grad_norm": 187.63799888370244, "learning_rate": 2.8078439964943034e-06, "loss": 22.6469, "step": 2510, "true_loss": 2.8054 }, { "epoch": 3.975340303807457, "grad_norm": 205.7323322057032, "learning_rate": 2.7968886941279584e-06, "loss": 21.98, "step": 2520, "true_loss": 2.8249 }, { "epoch": 3.9911225093706846, "grad_norm": 197.38257436289075, "learning_rate": 2.7859333917616126e-06, "loss": 21.7159, "step": 2530, "true_loss": 2.7724 }, { "epoch": 4.006312882225291, "grad_norm": 222.54953462158178, "learning_rate": 2.7749780893952676e-06, "loss": 20.2579, "step": 2540, "true_loss": 2.5081 }, { "epoch": 4.022095087788519, "grad_norm": 263.179309170716, "learning_rate": 2.7640227870289223e-06, "loss": 15.9901, "step": 2550, "true_loss": 1.9248 }, { "epoch": 4.022095087788519, "eval_accuracy": 0.19623059866962306, "eval_loss": 3.57926344871521, "eval_runtime": 23.416, "eval_samples_per_second": 38.521, "eval_steps_per_second": 4.826, "step": 2550 }, { "epoch": 4.0378772933517455, "grad_norm": 224.06117268420874, "learning_rate": 2.7530674846625773e-06, "loss": 15.1076, "step": 2560, "true_loss": 1.7561 }, { "epoch": 4.053659498914973, "grad_norm": 245.64790419721942, "learning_rate": 2.7421121822962315e-06, "loss": 15.5596, "step": 2570, "true_loss": 1.995 }, { "epoch": 4.0694417044782005, "grad_norm": 254.73726115807497, "learning_rate": 2.7311568799298865e-06, "loss": 14.7854, "step": 2580, "true_loss": 1.8945 }, { "epoch": 4.085223910041428, "grad_norm": 217.37217358949115, "learning_rate": 2.720201577563541e-06, "loss": 14.8279, "step": 2590, "true_loss": 1.8225 }, { "epoch": 4.1010061156046556, "grad_norm": 249.87912566036826, "learning_rate": 2.7092462751971953e-06, "loss": 15.3187, "step": 2600, "true_loss": 2.1311 }, { "epoch": 4.1010061156046556, "eval_accuracy": 0.21618625277161863, "eval_loss": 3.5809571743011475, "eval_runtime": 23.3476, "eval_samples_per_second": 38.634, "eval_steps_per_second": 4.84, "step": 2600 }, { "epoch": 4.116788321167883, "grad_norm": 231.73774788572183, "learning_rate": 2.6982909728308503e-06, "loss": 14.5893, "step": 2610, "true_loss": 1.724 }, { "epoch": 4.132570526731111, "grad_norm": 258.1470312272524, "learning_rate": 2.687335670464505e-06, "loss": 15.7209, "step": 2620, "true_loss": 2.1887 }, { "epoch": 4.148352732294338, "grad_norm": 245.45889800670054, "learning_rate": 2.67638036809816e-06, "loss": 15.1269, "step": 2630, "true_loss": 1.7815 }, { "epoch": 4.164134937857566, "grad_norm": 264.72298003757504, "learning_rate": 2.665425065731814e-06, "loss": 15.0722, "step": 2640, "true_loss": 1.7324 }, { "epoch": 4.179917143420793, "grad_norm": 253.08272787030552, "learning_rate": 2.654469763365469e-06, "loss": 15.2283, "step": 2650, "true_loss": 1.9495 }, { "epoch": 4.179917143420793, "eval_accuracy": 0.21729490022172948, "eval_loss": 3.5796923637390137, "eval_runtime": 23.5115, "eval_samples_per_second": 38.364, "eval_steps_per_second": 4.806, "step": 2650 }, { "epoch": 4.195699348984021, "grad_norm": 248.27551163388904, "learning_rate": 2.6435144609991238e-06, "loss": 15.6778, "step": 2660, "true_loss": 1.8399 }, { "epoch": 4.211481554547248, "grad_norm": 269.8800669651974, "learning_rate": 2.632559158632779e-06, "loss": 14.883, "step": 2670, "true_loss": 1.7684 }, { "epoch": 4.227263760110476, "grad_norm": 248.74538903579307, "learning_rate": 2.621603856266433e-06, "loss": 14.5076, "step": 2680, "true_loss": 1.6692 }, { "epoch": 4.243045965673703, "grad_norm": 294.13582834762326, "learning_rate": 2.610648553900088e-06, "loss": 15.4048, "step": 2690, "true_loss": 1.8551 }, { "epoch": 4.258828171236931, "grad_norm": 268.07178788749997, "learning_rate": 2.5996932515337426e-06, "loss": 16.0024, "step": 2700, "true_loss": 1.9951 }, { "epoch": 4.258828171236931, "eval_accuracy": 0.22062084257206208, "eval_loss": 3.612994909286499, "eval_runtime": 23.4031, "eval_samples_per_second": 38.542, "eval_steps_per_second": 4.828, "step": 2700 }, { "epoch": 4.274610376800158, "grad_norm": 280.6505601935841, "learning_rate": 2.588737949167397e-06, "loss": 15.4043, "step": 2710, "true_loss": 1.9804 }, { "epoch": 4.290392582363385, "grad_norm": 266.4103588432262, "learning_rate": 2.577782646801052e-06, "loss": 16.1067, "step": 2720, "true_loss": 2.0907 }, { "epoch": 4.306174787926612, "grad_norm": 239.77626855795776, "learning_rate": 2.5668273444347064e-06, "loss": 15.0776, "step": 2730, "true_loss": 1.8379 }, { "epoch": 4.32195699348984, "grad_norm": 243.11694972613176, "learning_rate": 2.5558720420683615e-06, "loss": 15.342, "step": 2740, "true_loss": 1.805 }, { "epoch": 4.337739199053067, "grad_norm": 245.8940946348113, "learning_rate": 2.5449167397020156e-06, "loss": 14.451, "step": 2750, "true_loss": 1.867 }, { "epoch": 4.337739199053067, "eval_accuracy": 0.22394678492239467, "eval_loss": 3.6279399394989014, "eval_runtime": 23.5051, "eval_samples_per_second": 38.375, "eval_steps_per_second": 4.807, "step": 2750 }, { "epoch": 4.353521404616295, "grad_norm": 262.68860671766873, "learning_rate": 2.5339614373356707e-06, "loss": 14.9646, "step": 2760, "true_loss": 1.835 }, { "epoch": 4.369303610179522, "grad_norm": 274.50927252942034, "learning_rate": 2.5230061349693253e-06, "loss": 15.7225, "step": 2770, "true_loss": 2.03 }, { "epoch": 4.38508581574275, "grad_norm": 256.87195772482625, "learning_rate": 2.5120508326029803e-06, "loss": 15.1995, "step": 2780, "true_loss": 1.9858 }, { "epoch": 4.400868021305977, "grad_norm": 261.71908712411187, "learning_rate": 2.5010955302366345e-06, "loss": 15.8069, "step": 2790, "true_loss": 2.1286 }, { "epoch": 4.416650226869205, "grad_norm": 263.7677468649738, "learning_rate": 2.4901402278702895e-06, "loss": 14.8953, "step": 2800, "true_loss": 1.8668 }, { "epoch": 4.416650226869205, "eval_accuracy": 0.22062084257206208, "eval_loss": 3.6281204223632812, "eval_runtime": 23.6222, "eval_samples_per_second": 38.184, "eval_steps_per_second": 4.784, "step": 2800 }, { "epoch": 4.4324324324324325, "grad_norm": 240.0068241706479, "learning_rate": 2.479184925503944e-06, "loss": 15.3401, "step": 2810, "true_loss": 1.8909 }, { "epoch": 4.44821463799566, "grad_norm": 350.84965505742645, "learning_rate": 2.4682296231375987e-06, "loss": 15.8098, "step": 2820, "true_loss": 2.192 }, { "epoch": 4.4639968435588875, "grad_norm": 260.6311152796187, "learning_rate": 2.4572743207712533e-06, "loss": 15.242, "step": 2830, "true_loss": 2.0686 }, { "epoch": 4.479779049122115, "grad_norm": 256.6005106575228, "learning_rate": 2.4463190184049084e-06, "loss": 15.8606, "step": 2840, "true_loss": 1.753 }, { "epoch": 4.4955612546853425, "grad_norm": 248.6497689178351, "learning_rate": 2.435363716038563e-06, "loss": 15.3305, "step": 2850, "true_loss": 2.0909 }, { "epoch": 4.4955612546853425, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.613753318786621, "eval_runtime": 24.0498, "eval_samples_per_second": 37.506, "eval_steps_per_second": 4.699, "step": 2850 }, { "epoch": 4.51134346024857, "grad_norm": 272.58834462165584, "learning_rate": 2.4244084136722176e-06, "loss": 15.1298, "step": 2860, "true_loss": 1.7107 }, { "epoch": 4.5271256658117975, "grad_norm": 279.58563220463907, "learning_rate": 2.413453111305872e-06, "loss": 15.0806, "step": 2870, "true_loss": 1.7587 }, { "epoch": 4.542907871375025, "grad_norm": 292.7850680133941, "learning_rate": 2.4024978089395272e-06, "loss": 15.4111, "step": 2880, "true_loss": 1.8804 }, { "epoch": 4.558690076938252, "grad_norm": 254.6269287705716, "learning_rate": 2.3915425065731814e-06, "loss": 15.5332, "step": 2890, "true_loss": 1.9938 }, { "epoch": 4.57447228250148, "grad_norm": 284.08524286820415, "learning_rate": 2.380587204206836e-06, "loss": 14.8029, "step": 2900, "true_loss": 1.901 }, { "epoch": 4.57447228250148, "eval_accuracy": 0.23170731707317074, "eval_loss": 3.6279330253601074, "eval_runtime": 23.461, "eval_samples_per_second": 38.447, "eval_steps_per_second": 4.817, "step": 2900 }, { "epoch": 4.590254488064707, "grad_norm": 291.652737916272, "learning_rate": 2.369631901840491e-06, "loss": 15.6408, "step": 2910, "true_loss": 1.9064 }, { "epoch": 4.606036693627934, "grad_norm": 264.13342924096054, "learning_rate": 2.3586765994741456e-06, "loss": 14.467, "step": 2920, "true_loss": 1.9756 }, { "epoch": 4.621818899191162, "grad_norm": 280.70128406326563, "learning_rate": 2.3477212971078002e-06, "loss": 15.741, "step": 2930, "true_loss": 2.3466 }, { "epoch": 4.637601104754389, "grad_norm": 254.55499933002392, "learning_rate": 2.336765994741455e-06, "loss": 15.4684, "step": 2940, "true_loss": 2.0214 }, { "epoch": 4.653383310317617, "grad_norm": 268.4422835691179, "learning_rate": 2.32581069237511e-06, "loss": 15.1953, "step": 2950, "true_loss": 2.011 }, { "epoch": 4.653383310317617, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.6416101455688477, "eval_runtime": 23.5806, "eval_samples_per_second": 38.252, "eval_steps_per_second": 4.792, "step": 2950 }, { "epoch": 4.669165515880844, "grad_norm": 294.88576397111973, "learning_rate": 2.3148553900087645e-06, "loss": 15.3099, "step": 2960, "true_loss": 1.9063 }, { "epoch": 4.684947721444072, "grad_norm": 265.25779761578593, "learning_rate": 2.303900087642419e-06, "loss": 14.8606, "step": 2970, "true_loss": 1.6947 }, { "epoch": 4.700729927007299, "grad_norm": 278.77122203926234, "learning_rate": 2.2929447852760737e-06, "loss": 16.0485, "step": 2980, "true_loss": 2.3105 }, { "epoch": 4.716512132570527, "grad_norm": 292.0770295818184, "learning_rate": 2.2819894829097287e-06, "loss": 15.8231, "step": 2990, "true_loss": 2.2769 }, { "epoch": 4.732294338133754, "grad_norm": 267.4329313410888, "learning_rate": 2.2710341805433833e-06, "loss": 15.5546, "step": 3000, "true_loss": 1.7627 }, { "epoch": 4.732294338133754, "eval_accuracy": 0.22505543237250555, "eval_loss": 3.6318559646606445, "eval_runtime": 24.1259, "eval_samples_per_second": 37.387, "eval_steps_per_second": 4.684, "step": 3000 }, { "epoch": 4.748076543696982, "grad_norm": 264.0371238056488, "learning_rate": 2.260078878177038e-06, "loss": 15.1835, "step": 3010, "true_loss": 1.5888 }, { "epoch": 4.763858749260209, "grad_norm": 246.3488064797015, "learning_rate": 2.2491235758106925e-06, "loss": 15.439, "step": 3020, "true_loss": 1.7396 }, { "epoch": 4.779640954823437, "grad_norm": 238.79276861638553, "learning_rate": 2.238168273444347e-06, "loss": 15.0343, "step": 3030, "true_loss": 1.9508 }, { "epoch": 4.795423160386664, "grad_norm": 273.5360275134248, "learning_rate": 2.2272129710780018e-06, "loss": 15.2999, "step": 3040, "true_loss": 2.0503 }, { "epoch": 4.811205365949892, "grad_norm": 254.79695828112452, "learning_rate": 2.2162576687116568e-06, "loss": 14.106, "step": 3050, "true_loss": 1.8271 }, { "epoch": 4.811205365949892, "eval_accuracy": 0.23059866962305986, "eval_loss": 3.6070680618286133, "eval_runtime": 23.3702, "eval_samples_per_second": 38.596, "eval_steps_per_second": 4.835, "step": 3050 }, { "epoch": 4.8269875715131185, "grad_norm": 255.99867755555348, "learning_rate": 2.2053023663453114e-06, "loss": 14.9771, "step": 3060, "true_loss": 1.73 }, { "epoch": 4.842769777076346, "grad_norm": 286.00668633943013, "learning_rate": 2.194347063978966e-06, "loss": 14.9414, "step": 3070, "true_loss": 1.8704 }, { "epoch": 4.8585519826395736, "grad_norm": 378.8631646508851, "learning_rate": 2.1833917616126206e-06, "loss": 15.6775, "step": 3080, "true_loss": 1.9395 }, { "epoch": 4.874334188202801, "grad_norm": 279.52842733819, "learning_rate": 2.1724364592462756e-06, "loss": 15.3267, "step": 3090, "true_loss": 1.9507 }, { "epoch": 4.890116393766029, "grad_norm": 270.44390815755895, "learning_rate": 2.1614811568799302e-06, "loss": 15.51, "step": 3100, "true_loss": 1.9118 }, { "epoch": 4.890116393766029, "eval_accuracy": 0.23170731707317074, "eval_loss": 3.612380266189575, "eval_runtime": 23.6236, "eval_samples_per_second": 38.182, "eval_steps_per_second": 4.783, "step": 3100 }, { "epoch": 4.905898599329256, "grad_norm": 278.3564236753552, "learning_rate": 2.150525854513585e-06, "loss": 15.5043, "step": 3110, "true_loss": 1.8665 }, { "epoch": 4.921680804892484, "grad_norm": 265.6572339709949, "learning_rate": 2.1395705521472395e-06, "loss": 14.8824, "step": 3120, "true_loss": 1.8847 }, { "epoch": 4.937463010455711, "grad_norm": 250.981550661661, "learning_rate": 2.128615249780894e-06, "loss": 14.6781, "step": 3130, "true_loss": 2.0143 }, { "epoch": 4.953245216018939, "grad_norm": 295.97281421299397, "learning_rate": 2.1176599474145487e-06, "loss": 15.1454, "step": 3140, "true_loss": 1.7135 }, { "epoch": 4.969027421582166, "grad_norm": 266.2528643762291, "learning_rate": 2.1067046450482033e-06, "loss": 14.8432, "step": 3150, "true_loss": 1.8743 }, { "epoch": 4.969027421582166, "eval_accuracy": 0.23059866962305986, "eval_loss": 3.6412768363952637, "eval_runtime": 23.7259, "eval_samples_per_second": 38.018, "eval_steps_per_second": 4.763, "step": 3150 }, { "epoch": 4.984809627145394, "grad_norm": 273.5417128312429, "learning_rate": 2.0957493426818583e-06, "loss": 14.7765, "step": 3160, "true_loss": 1.9702 }, { "epoch": 5.0, "grad_norm": 210.60631521758378, "learning_rate": 2.084794040315513e-06, "loss": 14.8339, "step": 3170, "true_loss": 1.781 }, { "epoch": 5.0157822055632275, "grad_norm": 211.4880629717372, "learning_rate": 2.0738387379491675e-06, "loss": 8.6916, "step": 3180, "true_loss": 1.1461 }, { "epoch": 5.031564411126455, "grad_norm": 215.6130576412713, "learning_rate": 2.062883435582822e-06, "loss": 8.0195, "step": 3190, "true_loss": 1.2088 }, { "epoch": 5.0473466166896825, "grad_norm": 222.8585820076581, "learning_rate": 2.051928133216477e-06, "loss": 7.5374, "step": 3200, "true_loss": 0.893 }, { "epoch": 5.0473466166896825, "eval_accuracy": 0.21840354767184036, "eval_loss": 3.659017324447632, "eval_runtime": 23.3909, "eval_samples_per_second": 38.562, "eval_steps_per_second": 4.831, "step": 3200 }, { "epoch": 5.06312882225291, "grad_norm": 205.4653306406008, "learning_rate": 2.0409728308501318e-06, "loss": 7.928, "step": 3210, "true_loss": 0.9474 }, { "epoch": 5.078911027816138, "grad_norm": 229.57368500445037, "learning_rate": 2.0300175284837864e-06, "loss": 8.1057, "step": 3220, "true_loss": 0.9603 }, { "epoch": 5.094693233379365, "grad_norm": 241.01143837842798, "learning_rate": 2.019062226117441e-06, "loss": 7.6067, "step": 3230, "true_loss": 0.7023 }, { "epoch": 5.110475438942593, "grad_norm": 215.27457747718245, "learning_rate": 2.0081069237510956e-06, "loss": 7.7044, "step": 3240, "true_loss": 0.8728 }, { "epoch": 5.12625764450582, "grad_norm": 246.10635321679487, "learning_rate": 1.99715162138475e-06, "loss": 7.8058, "step": 3250, "true_loss": 0.8129 }, { "epoch": 5.12625764450582, "eval_accuracy": 0.22727272727272727, "eval_loss": 3.668959856033325, "eval_runtime": 23.3895, "eval_samples_per_second": 38.564, "eval_steps_per_second": 4.831, "step": 3250 }, { "epoch": 5.142039850069047, "grad_norm": 229.81102790302637, "learning_rate": 1.9861963190184048e-06, "loss": 7.1821, "step": 3260, "true_loss": 0.9617 }, { "epoch": 5.157822055632274, "grad_norm": 251.61305994055863, "learning_rate": 1.97524101665206e-06, "loss": 8.0239, "step": 3270, "true_loss": 0.9278 }, { "epoch": 5.173604261195502, "grad_norm": 250.23396953331886, "learning_rate": 1.9642857142857144e-06, "loss": 7.5576, "step": 3280, "true_loss": 1.0655 }, { "epoch": 5.189386466758729, "grad_norm": 217.51604136251817, "learning_rate": 1.953330411919369e-06, "loss": 7.8428, "step": 3290, "true_loss": 1.0078 }, { "epoch": 5.205168672321957, "grad_norm": 224.37251696424042, "learning_rate": 1.9423751095530236e-06, "loss": 6.8004, "step": 3300, "true_loss": 0.9819 }, { "epoch": 5.205168672321957, "eval_accuracy": 0.21729490022172948, "eval_loss": 3.69161057472229, "eval_runtime": 23.5695, "eval_samples_per_second": 38.27, "eval_steps_per_second": 4.794, "step": 3300 }, { "epoch": 5.220950877885184, "grad_norm": 255.1996913911392, "learning_rate": 1.9314198071866787e-06, "loss": 8.2128, "step": 3310, "true_loss": 0.9753 }, { "epoch": 5.236733083448412, "grad_norm": 264.71065827646686, "learning_rate": 1.9204645048203333e-06, "loss": 8.0909, "step": 3320, "true_loss": 1.0297 }, { "epoch": 5.252515289011639, "grad_norm": 225.7360252554658, "learning_rate": 1.909509202453988e-06, "loss": 7.4812, "step": 3330, "true_loss": 1.022 }, { "epoch": 5.268297494574867, "grad_norm": 248.81074162374006, "learning_rate": 1.8985539000876427e-06, "loss": 7.8768, "step": 3340, "true_loss": 1.1491 }, { "epoch": 5.284079700138094, "grad_norm": 218.34786738976797, "learning_rate": 1.8875985977212973e-06, "loss": 7.6919, "step": 3350, "true_loss": 1.0007 }, { "epoch": 5.284079700138094, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.691500186920166, "eval_runtime": 23.5086, "eval_samples_per_second": 38.369, "eval_steps_per_second": 4.807, "step": 3350 }, { "epoch": 5.299861905701322, "grad_norm": 258.1853336569247, "learning_rate": 1.876643295354952e-06, "loss": 7.9807, "step": 3360, "true_loss": 1.0429 }, { "epoch": 5.315644111264549, "grad_norm": 269.3533525110865, "learning_rate": 1.8656879929886065e-06, "loss": 7.2861, "step": 3370, "true_loss": 1.0275 }, { "epoch": 5.331426316827777, "grad_norm": 262.01914816307266, "learning_rate": 1.8547326906222613e-06, "loss": 7.6446, "step": 3380, "true_loss": 0.9375 }, { "epoch": 5.347208522391004, "grad_norm": 633.8389744849534, "learning_rate": 1.843777388255916e-06, "loss": 8.1658, "step": 3390, "true_loss": 1.1047 }, { "epoch": 5.362990727954232, "grad_norm": 244.45401858212358, "learning_rate": 1.8328220858895707e-06, "loss": 8.337, "step": 3400, "true_loss": 1.0639 }, { "epoch": 5.362990727954232, "eval_accuracy": 0.23059866962305986, "eval_loss": 3.6856296062469482, "eval_runtime": 23.531, "eval_samples_per_second": 38.332, "eval_steps_per_second": 4.802, "step": 3400 }, { "epoch": 5.378772933517459, "grad_norm": 243.76292162752466, "learning_rate": 1.8218667835232254e-06, "loss": 8.7476, "step": 3410, "true_loss": 0.9285 }, { "epoch": 5.394555139080687, "grad_norm": 212.24261332252615, "learning_rate": 1.8109114811568802e-06, "loss": 7.9304, "step": 3420, "true_loss": 0.8972 }, { "epoch": 5.410337344643914, "grad_norm": 263.00942595070404, "learning_rate": 1.7999561787905348e-06, "loss": 8.1546, "step": 3430, "true_loss": 1.1745 }, { "epoch": 5.426119550207141, "grad_norm": 211.72496204574992, "learning_rate": 1.7890008764241896e-06, "loss": 8.0793, "step": 3440, "true_loss": 1.2823 }, { "epoch": 5.441901755770369, "grad_norm": 273.13965306529917, "learning_rate": 1.7780455740578442e-06, "loss": 8.2127, "step": 3450, "true_loss": 0.8455 }, { "epoch": 5.441901755770369, "eval_accuracy": 0.22172949002217296, "eval_loss": 3.6701295375823975, "eval_runtime": 23.5843, "eval_samples_per_second": 38.246, "eval_steps_per_second": 4.791, "step": 3450 }, { "epoch": 5.457683961333596, "grad_norm": 261.99980534155185, "learning_rate": 1.767090271691499e-06, "loss": 7.9949, "step": 3460, "true_loss": 0.875 }, { "epoch": 5.473466166896824, "grad_norm": 247.35217877512287, "learning_rate": 1.7561349693251536e-06, "loss": 7.9718, "step": 3470, "true_loss": 1.1792 }, { "epoch": 5.489248372460051, "grad_norm": 249.91907254043844, "learning_rate": 1.745179666958808e-06, "loss": 7.8827, "step": 3480, "true_loss": 1.0348 }, { "epoch": 5.505030578023279, "grad_norm": 261.0219814779852, "learning_rate": 1.7342243645924628e-06, "loss": 7.77, "step": 3490, "true_loss": 0.9038 }, { "epoch": 5.520812783586506, "grad_norm": 212.20072368079562, "learning_rate": 1.7232690622261174e-06, "loss": 7.5585, "step": 3500, "true_loss": 0.8129 }, { "epoch": 5.520812783586506, "eval_accuracy": 0.23170731707317074, "eval_loss": 3.6853256225585938, "eval_runtime": 23.5185, "eval_samples_per_second": 38.353, "eval_steps_per_second": 4.805, "step": 3500 }, { "epoch": 5.536594989149734, "grad_norm": 235.9078904033777, "learning_rate": 1.7123137598597723e-06, "loss": 7.5711, "step": 3510, "true_loss": 0.9626 }, { "epoch": 5.552377194712961, "grad_norm": 255.32363674641684, "learning_rate": 1.7013584574934269e-06, "loss": 7.8968, "step": 3520, "true_loss": 1.0482 }, { "epoch": 5.568159400276189, "grad_norm": 243.72615405564525, "learning_rate": 1.6904031551270817e-06, "loss": 8.3098, "step": 3530, "true_loss": 0.8398 }, { "epoch": 5.583941605839416, "grad_norm": 270.2171791584776, "learning_rate": 1.6794478527607363e-06, "loss": 8.1675, "step": 3540, "true_loss": 1.1636 }, { "epoch": 5.599723811402644, "grad_norm": 213.1666532194797, "learning_rate": 1.6684925503943911e-06, "loss": 8.0201, "step": 3550, "true_loss": 0.8318 }, { "epoch": 5.599723811402644, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.705551862716675, "eval_runtime": 24.1239, "eval_samples_per_second": 37.39, "eval_steps_per_second": 4.684, "step": 3550 }, { "epoch": 5.615506016965871, "grad_norm": 242.4749998274103, "learning_rate": 1.6575372480280457e-06, "loss": 8.3001, "step": 3560, "true_loss": 0.8278 }, { "epoch": 5.631288222529099, "grad_norm": 253.5750968615379, "learning_rate": 1.6465819456617005e-06, "loss": 7.5409, "step": 3570, "true_loss": 0.9173 }, { "epoch": 5.647070428092326, "grad_norm": 500.4945703975864, "learning_rate": 1.6356266432953551e-06, "loss": 8.0423, "step": 3580, "true_loss": 1.0754 }, { "epoch": 5.662852633655554, "grad_norm": 217.62071349539866, "learning_rate": 1.62467134092901e-06, "loss": 7.574, "step": 3590, "true_loss": 0.7832 }, { "epoch": 5.67863483921878, "grad_norm": 248.59249549950363, "learning_rate": 1.6137160385626643e-06, "loss": 7.5434, "step": 3600, "true_loss": 0.746 }, { "epoch": 5.67863483921878, "eval_accuracy": 0.23170731707317074, "eval_loss": 3.7329254150390625, "eval_runtime": 23.5059, "eval_samples_per_second": 38.373, "eval_steps_per_second": 4.807, "step": 3600 }, { "epoch": 5.694417044782008, "grad_norm": 249.80555246680507, "learning_rate": 1.602760736196319e-06, "loss": 8.065, "step": 3610, "true_loss": 0.9916 }, { "epoch": 5.7101992503452355, "grad_norm": 265.27144449478527, "learning_rate": 1.5918054338299738e-06, "loss": 7.7567, "step": 3620, "true_loss": 0.9806 }, { "epoch": 5.725981455908463, "grad_norm": 236.59765116745436, "learning_rate": 1.5808501314636284e-06, "loss": 7.7053, "step": 3630, "true_loss": 0.7775 }, { "epoch": 5.7417636614716905, "grad_norm": 240.63011983772756, "learning_rate": 1.5698948290972832e-06, "loss": 7.9619, "step": 3640, "true_loss": 1.0881 }, { "epoch": 5.757545867034918, "grad_norm": 262.01906833228776, "learning_rate": 1.5589395267309378e-06, "loss": 7.8087, "step": 3650, "true_loss": 0.953 }, { "epoch": 5.757545867034918, "eval_accuracy": 0.22172949002217296, "eval_loss": 3.723902463912964, "eval_runtime": 23.5327, "eval_samples_per_second": 38.33, "eval_steps_per_second": 4.802, "step": 3650 }, { "epoch": 5.7733280725981455, "grad_norm": 243.96844563586995, "learning_rate": 1.5479842243645926e-06, "loss": 7.2145, "step": 3660, "true_loss": 1.0771 }, { "epoch": 5.789110278161373, "grad_norm": 257.7877211427621, "learning_rate": 1.5370289219982472e-06, "loss": 8.1113, "step": 3670, "true_loss": 1.0796 }, { "epoch": 5.8048924837246005, "grad_norm": 274.4683712805951, "learning_rate": 1.526073619631902e-06, "loss": 7.7984, "step": 3680, "true_loss": 1.0509 }, { "epoch": 5.820674689287828, "grad_norm": 630.5201526526877, "learning_rate": 1.5151183172655566e-06, "loss": 7.8103, "step": 3690, "true_loss": 1.087 }, { "epoch": 5.836456894851056, "grad_norm": 258.73227179555346, "learning_rate": 1.5041630148992115e-06, "loss": 7.3694, "step": 3700, "true_loss": 0.7589 }, { "epoch": 5.836456894851056, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.7345616817474365, "eval_runtime": 24.226, "eval_samples_per_second": 37.233, "eval_steps_per_second": 4.664, "step": 3700 }, { "epoch": 5.852239100414283, "grad_norm": 221.81132901904544, "learning_rate": 1.4932077125328659e-06, "loss": 7.1529, "step": 3710, "true_loss": 1.0133 }, { "epoch": 5.868021305977511, "grad_norm": 229.85247706805845, "learning_rate": 1.4822524101665207e-06, "loss": 7.7953, "step": 3720, "true_loss": 1.0672 }, { "epoch": 5.883803511540738, "grad_norm": 252.59659973274174, "learning_rate": 1.4712971078001753e-06, "loss": 7.6561, "step": 3730, "true_loss": 1.0122 }, { "epoch": 5.899585717103966, "grad_norm": 351.91668224119803, "learning_rate": 1.46034180543383e-06, "loss": 7.3552, "step": 3740, "true_loss": 1.0605 }, { "epoch": 5.915367922667192, "grad_norm": 255.06396093134663, "learning_rate": 1.4493865030674847e-06, "loss": 7.6849, "step": 3750, "true_loss": 0.9133 }, { "epoch": 5.915367922667192, "eval_accuracy": 0.22172949002217296, "eval_loss": 3.7306151390075684, "eval_runtime": 23.5603, "eval_samples_per_second": 38.285, "eval_steps_per_second": 4.796, "step": 3750 }, { "epoch": 5.931150128230421, "grad_norm": 268.56649505196793, "learning_rate": 1.4384312007011395e-06, "loss": 7.9126, "step": 3760, "true_loss": 0.8575 }, { "epoch": 5.946932333793647, "grad_norm": 268.3016345503555, "learning_rate": 1.4274758983347941e-06, "loss": 7.6727, "step": 3770, "true_loss": 0.8832 }, { "epoch": 5.962714539356875, "grad_norm": 258.1872257046714, "learning_rate": 1.416520595968449e-06, "loss": 8.2187, "step": 3780, "true_loss": 1.0428 }, { "epoch": 5.978496744920102, "grad_norm": 253.34173748742137, "learning_rate": 1.4055652936021036e-06, "loss": 7.8932, "step": 3790, "true_loss": 0.9234 }, { "epoch": 5.99427895048333, "grad_norm": 254.12151921984102, "learning_rate": 1.3946099912357584e-06, "loss": 7.7489, "step": 3800, "true_loss": 1.0681 }, { "epoch": 5.99427895048333, "eval_accuracy": 0.22172949002217296, "eval_loss": 3.7100601196289062, "eval_runtime": 23.696, "eval_samples_per_second": 38.066, "eval_steps_per_second": 4.769, "step": 3800 }, { "epoch": 6.009469323337936, "grad_norm": 157.3623487365359, "learning_rate": 1.383654688869413e-06, "loss": 5.3213, "step": 3810, "true_loss": 0.711 }, { "epoch": 6.025251528901164, "grad_norm": 172.44448673400197, "learning_rate": 1.3726993865030678e-06, "loss": 3.3077, "step": 3820, "true_loss": 0.3847 }, { "epoch": 6.041033734464391, "grad_norm": 152.1857696461382, "learning_rate": 1.3617440841367222e-06, "loss": 3.2034, "step": 3830, "true_loss": 0.3749 }, { "epoch": 6.056815940027619, "grad_norm": 153.47247439067277, "learning_rate": 1.3507887817703768e-06, "loss": 3.3564, "step": 3840, "true_loss": 0.546 }, { "epoch": 6.072598145590846, "grad_norm": 161.22382500282086, "learning_rate": 1.3398334794040316e-06, "loss": 3.3815, "step": 3850, "true_loss": 0.3163 }, { "epoch": 6.072598145590846, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.7402727603912354, "eval_runtime": 24.4647, "eval_samples_per_second": 36.869, "eval_steps_per_second": 4.619, "step": 3850 }, { "epoch": 6.088380351154074, "grad_norm": 148.0159535633642, "learning_rate": 1.3288781770376862e-06, "loss": 3.5165, "step": 3860, "true_loss": 0.416 }, { "epoch": 6.104162556717301, "grad_norm": 145.63663936903177, "learning_rate": 1.317922874671341e-06, "loss": 3.4343, "step": 3870, "true_loss": 0.4403 }, { "epoch": 6.119944762280529, "grad_norm": 122.70380287238012, "learning_rate": 1.3069675723049956e-06, "loss": 2.854, "step": 3880, "true_loss": 0.2925 }, { "epoch": 6.135726967843756, "grad_norm": 161.85973734551766, "learning_rate": 1.2960122699386505e-06, "loss": 3.3474, "step": 3890, "true_loss": 0.3021 }, { "epoch": 6.151509173406984, "grad_norm": 158.6162965446901, "learning_rate": 1.285056967572305e-06, "loss": 3.467, "step": 3900, "true_loss": 0.4418 }, { "epoch": 6.151509173406984, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.7654776573181152, "eval_runtime": 23.6937, "eval_samples_per_second": 38.069, "eval_steps_per_second": 4.769, "step": 3900 }, { "epoch": 6.167291378970211, "grad_norm": 162.07680720061893, "learning_rate": 1.2741016652059599e-06, "loss": 3.1611, "step": 3910, "true_loss": 0.3767 }, { "epoch": 6.183073584533439, "grad_norm": 129.24265717345276, "learning_rate": 1.2631463628396145e-06, "loss": 3.2409, "step": 3920, "true_loss": 0.4173 }, { "epoch": 6.198855790096666, "grad_norm": 179.72224757617286, "learning_rate": 1.2521910604732693e-06, "loss": 3.5711, "step": 3930, "true_loss": 0.3879 }, { "epoch": 6.214637995659894, "grad_norm": 192.96062107384324, "learning_rate": 1.241235758106924e-06, "loss": 3.3694, "step": 3940, "true_loss": 0.4602 }, { "epoch": 6.230420201223121, "grad_norm": 167.03378504595022, "learning_rate": 1.2302804557405785e-06, "loss": 3.561, "step": 3950, "true_loss": 0.4057 }, { "epoch": 6.230420201223121, "eval_accuracy": 0.22062084257206208, "eval_loss": 3.7767574787139893, "eval_runtime": 23.7779, "eval_samples_per_second": 37.934, "eval_steps_per_second": 4.752, "step": 3950 }, { "epoch": 6.246202406786348, "grad_norm": 182.71756052085996, "learning_rate": 1.2193251533742333e-06, "loss": 3.446, "step": 3960, "true_loss": 0.5335 }, { "epoch": 6.2619846123495755, "grad_norm": 146.7327614801931, "learning_rate": 1.208369851007888e-06, "loss": 3.6396, "step": 3970, "true_loss": 0.4966 }, { "epoch": 6.277766817912803, "grad_norm": 176.09635113701836, "learning_rate": 1.1974145486415425e-06, "loss": 3.2624, "step": 3980, "true_loss": 0.3441 }, { "epoch": 6.2935490234760305, "grad_norm": 178.73960701033369, "learning_rate": 1.1864592462751974e-06, "loss": 3.2378, "step": 3990, "true_loss": 0.3249 }, { "epoch": 6.309331229039258, "grad_norm": 197.8582094367491, "learning_rate": 1.175503943908852e-06, "loss": 3.4962, "step": 4000, "true_loss": 0.507 }, { "epoch": 6.309331229039258, "eval_accuracy": 0.23059866962305986, "eval_loss": 3.7738096714019775, "eval_runtime": 24.236, "eval_samples_per_second": 37.217, "eval_steps_per_second": 4.662, "step": 4000 }, { "epoch": 6.3251134346024855, "grad_norm": 192.9193994183117, "learning_rate": 1.1645486415425068e-06, "loss": 3.5811, "step": 4010, "true_loss": 0.4784 }, { "epoch": 6.340895640165713, "grad_norm": 147.08694007767318, "learning_rate": 1.1535933391761614e-06, "loss": 3.4343, "step": 4020, "true_loss": 0.4624 }, { "epoch": 6.356677845728941, "grad_norm": 169.2574091719148, "learning_rate": 1.1426380368098162e-06, "loss": 3.4695, "step": 4030, "true_loss": 0.5002 }, { "epoch": 6.372460051292168, "grad_norm": 155.9095192533325, "learning_rate": 1.1316827344434706e-06, "loss": 3.1402, "step": 4040, "true_loss": 0.3293 }, { "epoch": 6.388242256855396, "grad_norm": 171.6435971775636, "learning_rate": 1.1207274320771254e-06, "loss": 3.5833, "step": 4050, "true_loss": 0.6128 }, { "epoch": 6.388242256855396, "eval_accuracy": 0.24390243902439024, "eval_loss": 3.7884247303009033, "eval_runtime": 23.4729, "eval_samples_per_second": 38.427, "eval_steps_per_second": 4.814, "step": 4050 }, { "epoch": 6.404024462418623, "grad_norm": 161.61504993970277, "learning_rate": 1.10977212971078e-06, "loss": 3.1155, "step": 4060, "true_loss": 0.2752 }, { "epoch": 6.419806667981851, "grad_norm": 161.95667366681636, "learning_rate": 1.0988168273444349e-06, "loss": 3.6741, "step": 4070, "true_loss": 0.5558 }, { "epoch": 6.435588873545078, "grad_norm": 163.99695502410947, "learning_rate": 1.0878615249780895e-06, "loss": 3.406, "step": 4080, "true_loss": 0.4532 }, { "epoch": 6.451371079108306, "grad_norm": 183.7778216432675, "learning_rate": 1.0769062226117443e-06, "loss": 3.264, "step": 4090, "true_loss": 0.4541 }, { "epoch": 6.467153284671533, "grad_norm": 147.59845547410535, "learning_rate": 1.0659509202453989e-06, "loss": 3.448, "step": 4100, "true_loss": 0.5413 }, { "epoch": 6.467153284671533, "eval_accuracy": 0.22838137472283815, "eval_loss": 3.7974653244018555, "eval_runtime": 23.6445, "eval_samples_per_second": 38.148, "eval_steps_per_second": 4.779, "step": 4100 }, { "epoch": 6.482935490234761, "grad_norm": 252.515347272953, "learning_rate": 1.0549956178790535e-06, "loss": 3.2192, "step": 4110, "true_loss": 0.4028 }, { "epoch": 6.498717695797987, "grad_norm": 177.9868241333003, "learning_rate": 1.0440403155127083e-06, "loss": 3.6647, "step": 4120, "true_loss": 0.4441 }, { "epoch": 6.514499901361216, "grad_norm": 220.48440065290902, "learning_rate": 1.033085013146363e-06, "loss": 3.5525, "step": 4130, "true_loss": 0.282 }, { "epoch": 6.530282106924442, "grad_norm": 156.1318289676741, "learning_rate": 1.0221297107800177e-06, "loss": 3.5674, "step": 4140, "true_loss": 0.4388 }, { "epoch": 6.54606431248767, "grad_norm": 163.9167569080248, "learning_rate": 1.0111744084136723e-06, "loss": 3.1716, "step": 4150, "true_loss": 0.3568 }, { "epoch": 6.54606431248767, "eval_accuracy": 0.23392461197339245, "eval_loss": 3.799577236175537, "eval_runtime": 23.7535, "eval_samples_per_second": 37.973, "eval_steps_per_second": 4.757, "step": 4150 }, { "epoch": 6.561846518050897, "grad_norm": 163.4173016109069, "learning_rate": 1.000219106047327e-06, "loss": 3.3789, "step": 4160, "true_loss": 0.5105 }, { "epoch": 6.577628723614125, "grad_norm": 135.21876637653696, "learning_rate": 9.892638036809818e-07, "loss": 3.6663, "step": 4170, "true_loss": 0.3597 }, { "epoch": 6.593410929177352, "grad_norm": 179.33963478315178, "learning_rate": 9.783085013146364e-07, "loss": 3.4816, "step": 4180, "true_loss": 0.3026 }, { "epoch": 6.60919313474058, "grad_norm": 156.94291167613932, "learning_rate": 9.673531989482912e-07, "loss": 3.215, "step": 4190, "true_loss": 0.2668 }, { "epoch": 6.624975340303807, "grad_norm": 185.90720736593772, "learning_rate": 9.563978965819458e-07, "loss": 3.3075, "step": 4200, "true_loss": 0.43 }, { "epoch": 6.624975340303807, "eval_accuracy": 0.24057649667405764, "eval_loss": 3.786198854446411, "eval_runtime": 23.6222, "eval_samples_per_second": 38.184, "eval_steps_per_second": 4.784, "step": 4200 }, { "epoch": 6.640757545867035, "grad_norm": 176.8072773121208, "learning_rate": 9.454425942156004e-07, "loss": 3.1964, "step": 4210, "true_loss": 0.3582 }, { "epoch": 6.656539751430262, "grad_norm": 170.49858529820645, "learning_rate": 9.344872918492551e-07, "loss": 3.3814, "step": 4220, "true_loss": 0.3033 }, { "epoch": 6.67232195699349, "grad_norm": 161.07442898479096, "learning_rate": 9.235319894829098e-07, "loss": 3.9512, "step": 4230, "true_loss": 0.5216 }, { "epoch": 6.6881041625567175, "grad_norm": 203.15138948673695, "learning_rate": 9.125766871165645e-07, "loss": 3.6654, "step": 4240, "true_loss": 0.3439 }, { "epoch": 6.703886368119945, "grad_norm": 152.33545262236234, "learning_rate": 9.016213847502192e-07, "loss": 3.2303, "step": 4250, "true_loss": 0.3756 }, { "epoch": 6.703886368119945, "eval_accuracy": 0.24279379157427938, "eval_loss": 3.773749351501465, "eval_runtime": 23.5339, "eval_samples_per_second": 38.328, "eval_steps_per_second": 4.802, "step": 4250 }, { "epoch": 6.7196685736831725, "grad_norm": 170.13463864140093, "learning_rate": 8.90666082383874e-07, "loss": 3.4239, "step": 4260, "true_loss": 0.4533 }, { "epoch": 6.7354507792464, "grad_norm": 165.34685293283408, "learning_rate": 8.797107800175285e-07, "loss": 3.3423, "step": 4270, "true_loss": 0.4279 }, { "epoch": 6.7512329848096275, "grad_norm": 161.24888867207457, "learning_rate": 8.687554776511832e-07, "loss": 3.1141, "step": 4280, "true_loss": 0.5133 }, { "epoch": 6.767015190372854, "grad_norm": 159.62875630877653, "learning_rate": 8.578001752848379e-07, "loss": 3.3544, "step": 4290, "true_loss": 0.468 }, { "epoch": 6.7827973959360826, "grad_norm": 192.00912562152587, "learning_rate": 8.468448729184926e-07, "loss": 3.2624, "step": 4300, "true_loss": 0.4606 }, { "epoch": 6.7827973959360826, "eval_accuracy": 0.24390243902439024, "eval_loss": 3.7823567390441895, "eval_runtime": 23.4839, "eval_samples_per_second": 38.409, "eval_steps_per_second": 4.812, "step": 4300 }, { "epoch": 6.798579601499309, "grad_norm": 135.73327781821766, "learning_rate": 8.358895705521473e-07, "loss": 3.4532, "step": 4310, "true_loss": 0.4951 }, { "epoch": 6.814361807062537, "grad_norm": 180.50294844857802, "learning_rate": 8.24934268185802e-07, "loss": 3.8061, "step": 4320, "true_loss": 0.3461 }, { "epoch": 6.830144012625764, "grad_norm": 142.4101162894937, "learning_rate": 8.139789658194566e-07, "loss": 3.4039, "step": 4330, "true_loss": 0.4718 }, { "epoch": 6.845926218188992, "grad_norm": 157.02605462115628, "learning_rate": 8.030236634531113e-07, "loss": 3.4394, "step": 4340, "true_loss": 0.4215 }, { "epoch": 6.861708423752219, "grad_norm": 181.77916083266524, "learning_rate": 7.92068361086766e-07, "loss": 3.7555, "step": 4350, "true_loss": 0.5889 }, { "epoch": 6.861708423752219, "eval_accuracy": 0.24722838137472283, "eval_loss": 3.7913777828216553, "eval_runtime": 23.4108, "eval_samples_per_second": 38.529, "eval_steps_per_second": 4.827, "step": 4350 }, { "epoch": 6.877490629315447, "grad_norm": 180.3595004444777, "learning_rate": 7.811130587204208e-07, "loss": 3.39, "step": 4360, "true_loss": 0.4151 }, { "epoch": 6.893272834878674, "grad_norm": 157.3305218882265, "learning_rate": 7.701577563540755e-07, "loss": 3.1018, "step": 4370, "true_loss": 0.3628 }, { "epoch": 6.909055040441902, "grad_norm": 166.8979472220896, "learning_rate": 7.592024539877302e-07, "loss": 3.4239, "step": 4380, "true_loss": 0.4116 }, { "epoch": 6.924837246005129, "grad_norm": 191.81471291197857, "learning_rate": 7.482471516213848e-07, "loss": 2.975, "step": 4390, "true_loss": 0.4336 }, { "epoch": 6.940619451568357, "grad_norm": 177.9235704204187, "learning_rate": 7.372918492550395e-07, "loss": 3.4614, "step": 4400, "true_loss": 0.3441 }, { "epoch": 6.940619451568357, "eval_accuracy": 0.23725055432372505, "eval_loss": 3.7932116985321045, "eval_runtime": 24.2554, "eval_samples_per_second": 37.188, "eval_steps_per_second": 4.659, "step": 4400 }, { "epoch": 6.956401657131584, "grad_norm": 184.81203868931837, "learning_rate": 7.263365468886942e-07, "loss": 3.4846, "step": 4410, "true_loss": 0.3435 }, { "epoch": 6.972183862694812, "grad_norm": 187.587079475093, "learning_rate": 7.153812445223489e-07, "loss": 4.0774, "step": 4420, "true_loss": 0.2932 }, { "epoch": 6.987966068258039, "grad_norm": 199.2449543984309, "learning_rate": 7.044259421560036e-07, "loss": 3.9163, "step": 4430, "true_loss": 0.3897 }, { "epoch": 7.003156441112646, "grad_norm": 105.53295145788066, "learning_rate": 6.934706397896583e-07, "loss": 2.5755, "step": 4440, "true_loss": 0.3192 }, { "epoch": 7.018938646675873, "grad_norm": 103.81118404439054, "learning_rate": 6.825153374233128e-07, "loss": 1.7792, "step": 4450, "true_loss": 0.195 }, { "epoch": 7.018938646675873, "eval_accuracy": 0.24390243902439024, "eval_loss": 3.791168451309204, "eval_runtime": 23.5252, "eval_samples_per_second": 38.342, "eval_steps_per_second": 4.803, "step": 4450 }, { "epoch": 7.034720852239101, "grad_norm": 79.64682013766931, "learning_rate": 6.715600350569676e-07, "loss": 1.571, "step": 4460, "true_loss": 0.146 }, { "epoch": 7.050503057802328, "grad_norm": 105.9942408058766, "learning_rate": 6.606047326906223e-07, "loss": 1.8884, "step": 4470, "true_loss": 0.2065 }, { "epoch": 7.066285263365556, "grad_norm": 101.85501386180424, "learning_rate": 6.49649430324277e-07, "loss": 1.5474, "step": 4480, "true_loss": 0.2419 }, { "epoch": 7.082067468928782, "grad_norm": 93.2404029331028, "learning_rate": 6.386941279579317e-07, "loss": 1.5579, "step": 4490, "true_loss": 0.1367 }, { "epoch": 7.09784967449201, "grad_norm": 83.89556113955376, "learning_rate": 6.277388255915864e-07, "loss": 1.5726, "step": 4500, "true_loss": 0.1513 }, { "epoch": 7.09784967449201, "eval_accuracy": 0.23392461197339245, "eval_loss": 3.8132660388946533, "eval_runtime": 23.4155, "eval_samples_per_second": 38.521, "eval_steps_per_second": 4.826, "step": 4500 }, { "epoch": 7.113631880055237, "grad_norm": 101.07194872691511, "learning_rate": 6.167835232252411e-07, "loss": 1.5201, "step": 4510, "true_loss": 0.1443 }, { "epoch": 7.129414085618465, "grad_norm": 113.04758011318795, "learning_rate": 6.058282208588957e-07, "loss": 1.6384, "step": 4520, "true_loss": 0.2383 }, { "epoch": 7.145196291181692, "grad_norm": 100.69279349775708, "learning_rate": 5.948729184925504e-07, "loss": 1.6024, "step": 4530, "true_loss": 0.2174 }, { "epoch": 7.16097849674492, "grad_norm": 103.8247309105772, "learning_rate": 5.839176161262051e-07, "loss": 1.5384, "step": 4540, "true_loss": 0.1352 }, { "epoch": 7.176760702308147, "grad_norm": 107.24611946542777, "learning_rate": 5.729623137598597e-07, "loss": 1.4723, "step": 4550, "true_loss": 0.1874 }, { "epoch": 7.176760702308147, "eval_accuracy": 0.23392461197339245, "eval_loss": 3.821075677871704, "eval_runtime": 24.1146, "eval_samples_per_second": 37.405, "eval_steps_per_second": 4.686, "step": 4550 }, { "epoch": 7.192542907871375, "grad_norm": 114.78873597171163, "learning_rate": 5.620070113935145e-07, "loss": 1.6084, "step": 4560, "true_loss": 0.1477 }, { "epoch": 7.2083251134346025, "grad_norm": 120.4342772568361, "learning_rate": 5.510517090271692e-07, "loss": 1.7631, "step": 4570, "true_loss": 0.2004 }, { "epoch": 7.22410731899783, "grad_norm": 116.63172355588391, "learning_rate": 5.400964066608239e-07, "loss": 1.7131, "step": 4580, "true_loss": 0.2328 }, { "epoch": 7.2398895245610575, "grad_norm": 100.05574584352458, "learning_rate": 5.291411042944786e-07, "loss": 1.818, "step": 4590, "true_loss": 0.2573 }, { "epoch": 7.255671730124285, "grad_norm": 116.06315149406957, "learning_rate": 5.181858019281333e-07, "loss": 1.7221, "step": 4600, "true_loss": 0.1953 }, { "epoch": 7.255671730124285, "eval_accuracy": 0.24279379157427938, "eval_loss": 3.8259878158569336, "eval_runtime": 23.6453, "eval_samples_per_second": 38.147, "eval_steps_per_second": 4.779, "step": 4600 }, { "epoch": 7.2714539356875125, "grad_norm": 111.82918698975686, "learning_rate": 5.072304995617879e-07, "loss": 1.692, "step": 4610, "true_loss": 0.2322 }, { "epoch": 7.28723614125074, "grad_norm": 100.22374119200155, "learning_rate": 4.962751971954426e-07, "loss": 1.2305, "step": 4620, "true_loss": 0.095 }, { "epoch": 7.3030183468139676, "grad_norm": 116.26152938429104, "learning_rate": 4.853198948290973e-07, "loss": 1.8399, "step": 4630, "true_loss": 0.2352 }, { "epoch": 7.318800552377195, "grad_norm": 120.46776375337953, "learning_rate": 4.74364592462752e-07, "loss": 1.6735, "step": 4640, "true_loss": 0.3253 }, { "epoch": 7.334582757940423, "grad_norm": 105.75603353452355, "learning_rate": 4.634092900964067e-07, "loss": 1.8944, "step": 4650, "true_loss": 0.2131 }, { "epoch": 7.334582757940423, "eval_accuracy": 0.24390243902439024, "eval_loss": 3.8275279998779297, "eval_runtime": 23.5722, "eval_samples_per_second": 38.265, "eval_steps_per_second": 4.794, "step": 4650 }, { "epoch": 7.350364963503649, "grad_norm": 112.24669038955847, "learning_rate": 4.524539877300614e-07, "loss": 1.824, "step": 4660, "true_loss": 0.1964 }, { "epoch": 7.366147169066877, "grad_norm": 90.24016827031272, "learning_rate": 4.414986853637161e-07, "loss": 1.5728, "step": 4670, "true_loss": 0.1985 }, { "epoch": 7.381929374630104, "grad_norm": 121.65452107360474, "learning_rate": 4.305433829973708e-07, "loss": 1.7557, "step": 4680, "true_loss": 0.1516 }, { "epoch": 7.397711580193332, "grad_norm": 87.69595116385209, "learning_rate": 4.195880806310255e-07, "loss": 1.2686, "step": 4690, "true_loss": 0.1029 }, { "epoch": 7.413493785756559, "grad_norm": 111.75004729818835, "learning_rate": 4.086327782646801e-07, "loss": 1.9696, "step": 4700, "true_loss": 0.292 }, { "epoch": 7.413493785756559, "eval_accuracy": 0.24279379157427938, "eval_loss": 3.831446409225464, "eval_runtime": 23.6876, "eval_samples_per_second": 38.079, "eval_steps_per_second": 4.77, "step": 4700 }, { "epoch": 7.429275991319787, "grad_norm": 108.03730107191782, "learning_rate": 3.976774758983348e-07, "loss": 1.6106, "step": 4710, "true_loss": 0.1283 }, { "epoch": 7.445058196883014, "grad_norm": 121.7081275730794, "learning_rate": 3.8672217353198953e-07, "loss": 1.5908, "step": 4720, "true_loss": 0.2813 }, { "epoch": 7.460840402446242, "grad_norm": 123.87313217986515, "learning_rate": 3.757668711656442e-07, "loss": 1.7509, "step": 4730, "true_loss": 0.1369 }, { "epoch": 7.476622608009469, "grad_norm": 106.6646988661937, "learning_rate": 3.648115687992989e-07, "loss": 1.4201, "step": 4740, "true_loss": 0.1805 }, { "epoch": 7.492404813572697, "grad_norm": 82.98799294932141, "learning_rate": 3.538562664329536e-07, "loss": 1.4546, "step": 4750, "true_loss": 0.1447 }, { "epoch": 7.492404813572697, "eval_accuracy": 0.24833702882483372, "eval_loss": 3.8366105556488037, "eval_runtime": 23.7374, "eval_samples_per_second": 37.999, "eval_steps_per_second": 4.76, "step": 4750 }, { "epoch": 7.508187019135924, "grad_norm": 73.28584420695034, "learning_rate": 3.4290096406660827e-07, "loss": 1.7633, "step": 4760, "true_loss": 0.2418 }, { "epoch": 7.523969224699152, "grad_norm": 98.7172416100943, "learning_rate": 3.31945661700263e-07, "loss": 1.7587, "step": 4770, "true_loss": 0.3002 }, { "epoch": 7.539751430262379, "grad_norm": 57.5506849585333, "learning_rate": 3.209903593339177e-07, "loss": 1.885, "step": 4780, "true_loss": 0.2346 }, { "epoch": 7.555533635825607, "grad_norm": 80.20524586507351, "learning_rate": 3.100350569675723e-07, "loss": 1.7866, "step": 4790, "true_loss": 0.1811 }, { "epoch": 7.571315841388834, "grad_norm": 85.61705765996145, "learning_rate": 2.99079754601227e-07, "loss": 1.6021, "step": 4800, "true_loss": 0.0998 }, { "epoch": 7.571315841388834, "eval_accuracy": 0.24501108647450112, "eval_loss": 3.826876163482666, "eval_runtime": 24.4037, "eval_samples_per_second": 36.962, "eval_steps_per_second": 4.63, "step": 4800 }, { "epoch": 7.587098046952062, "grad_norm": 84.05951816424748, "learning_rate": 2.881244522348817e-07, "loss": 1.3573, "step": 4810, "true_loss": 0.1803 }, { "epoch": 7.602880252515289, "grad_norm": 98.67187916600565, "learning_rate": 2.771691498685364e-07, "loss": 1.6579, "step": 4820, "true_loss": 0.17 }, { "epoch": 7.618662458078516, "grad_norm": 122.95403174294823, "learning_rate": 2.662138475021911e-07, "loss": 1.7085, "step": 4830, "true_loss": 0.1443 }, { "epoch": 7.634444663641744, "grad_norm": 99.55082244849568, "learning_rate": 2.5525854513584575e-07, "loss": 1.7022, "step": 4840, "true_loss": 0.2159 }, { "epoch": 7.650226869204971, "grad_norm": 95.43848705075885, "learning_rate": 2.4430324276950047e-07, "loss": 1.7045, "step": 4850, "true_loss": 0.1982 }, { "epoch": 7.650226869204971, "eval_accuracy": 0.24833702882483372, "eval_loss": 3.827522039413452, "eval_runtime": 23.7644, "eval_samples_per_second": 37.956, "eval_steps_per_second": 4.755, "step": 4850 }, { "epoch": 7.666009074768199, "grad_norm": 153.60641735085687, "learning_rate": 2.3334794040315515e-07, "loss": 1.8336, "step": 4860, "true_loss": 0.2564 }, { "epoch": 7.681791280331426, "grad_norm": 72.95642409819222, "learning_rate": 2.2239263803680984e-07, "loss": 1.512, "step": 4870, "true_loss": 0.2407 }, { "epoch": 7.697573485894654, "grad_norm": 101.56743386015985, "learning_rate": 2.1143733567046452e-07, "loss": 1.6632, "step": 4880, "true_loss": 0.203 }, { "epoch": 7.713355691457881, "grad_norm": 112.67818055510378, "learning_rate": 2.0048203330411923e-07, "loss": 1.5799, "step": 4890, "true_loss": 0.1725 }, { "epoch": 7.729137897021109, "grad_norm": 103.18275378051146, "learning_rate": 1.895267309377739e-07, "loss": 1.8377, "step": 4900, "true_loss": 0.2486 }, { "epoch": 7.729137897021109, "eval_accuracy": 0.24944567627494457, "eval_loss": 3.8336145877838135, "eval_runtime": 23.743, "eval_samples_per_second": 37.99, "eval_steps_per_second": 4.759, "step": 4900 }, { "epoch": 7.744920102584336, "grad_norm": 100.02459829967228, "learning_rate": 1.7857142857142858e-07, "loss": 1.6563, "step": 4910, "true_loss": 0.1448 }, { "epoch": 7.760702308147564, "grad_norm": 75.93446519999665, "learning_rate": 1.676161262050833e-07, "loss": 1.8641, "step": 4920, "true_loss": 0.1784 }, { "epoch": 7.776484513710791, "grad_norm": 101.91950391162042, "learning_rate": 1.5666082383873798e-07, "loss": 1.4482, "step": 4930, "true_loss": 0.2067 }, { "epoch": 7.792266719274019, "grad_norm": 80.87765122704334, "learning_rate": 1.4570552147239263e-07, "loss": 1.6518, "step": 4940, "true_loss": 0.1788 }, { "epoch": 7.808048924837246, "grad_norm": 103.05839887788406, "learning_rate": 1.3475021910604735e-07, "loss": 1.7937, "step": 4950, "true_loss": 0.1388 }, { "epoch": 7.808048924837246, "eval_accuracy": 0.24611973392461198, "eval_loss": 3.829207181930542, "eval_runtime": 23.2892, "eval_samples_per_second": 38.73, "eval_steps_per_second": 4.852, "step": 4950 }, { "epoch": 7.823831130400474, "grad_norm": 89.03688845040247, "learning_rate": 1.2379491673970203e-07, "loss": 1.7303, "step": 4960, "true_loss": 0.2164 }, { "epoch": 7.839613335963701, "grad_norm": 73.16071893147557, "learning_rate": 1.128396143733567e-07, "loss": 1.802, "step": 4970, "true_loss": 0.2847 }, { "epoch": 7.855395541526929, "grad_norm": 69.79887151991092, "learning_rate": 1.018843120070114e-07, "loss": 1.7562, "step": 4980, "true_loss": 0.1797 }, { "epoch": 7.871177747090156, "grad_norm": 101.31703211159908, "learning_rate": 9.09290096406661e-08, "loss": 1.6354, "step": 4990, "true_loss": 0.2285 }, { "epoch": 7.886959952653383, "grad_norm": 112.1723402367008, "learning_rate": 7.997370727432077e-08, "loss": 2.0249, "step": 5000, "true_loss": 0.3296 }, { "epoch": 7.886959952653383, "eval_accuracy": 0.25055432372505543, "eval_loss": 3.8272743225097656, "eval_runtime": 23.8119, "eval_samples_per_second": 37.88, "eval_steps_per_second": 4.746, "step": 5000 } ], "logging_steps": 10, "max_steps": 5072, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }