ncbateman committed
Commit 84f7f42 (verified) · 1 Parent(s): be6bf95

Training in progress, step 1000, checkpoint

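This commit is a routine checkpoint push: only the four files under last-checkpoint/ listed below change (each through its Git LFS pointer), plus trainer_state.json. To pull exactly this revision later, a minimal sketch with huggingface_hub is shown here; the repo_id is a placeholder, since the repository name is not part of this diff.

# Hedged sketch: pin this exact checkpoint commit when downloading.
# The repo_id below is a placeholder -- substitute the actual repository.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="ncbateman/placeholder-repo",   # hypothetical repo id
    revision="84f7f42",                      # commit shown above; the full hash also works
    allow_patterns=["last-checkpoint/*"],    # only fetch the checkpoint files
)
print(local_dir)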
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ed8e46de5ca3b16b7d4946fe6cf5403d0cd32c3c3f502e1f414e8507dc3bf50
+oid sha256:995b27b879c7b0c136c4b26a6ff6d99d1336153e978506224b7adb7687eeedb6
 size 2264640
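Each entry like the one above is a Git LFS pointer rather than the binary itself: the oid is the SHA-256 of the stored blob, and only the oid changes between be6bf95 and 84f7f42 while the size stays 2264640 bytes. A minimal sketch for checking a locally downloaded copy against the new pointer (path and expected values copied from the pointer above; the script itself is illustrative):

# Minimal sketch: verify a downloaded file against its Git LFS pointer.
# Path and expected values mirror the pointer above; adjust as needed.
import hashlib
import os

path = "last-checkpoint/adapter_model.safetensors"
expected_oid = "995b27b879c7b0c136c4b26a6ff6d99d1336153e978506224b7adb7687eeedb6"
expected_size = 2264640

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("pointer matches file")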
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057901fd6d7593948715109075c12bc6c8bcda4fe4cb163a0db2068774e58cad
+oid sha256:fef7346f36b906aad9f1617baee5297e2cf0e70d4d1ea5de0dc4c973364b2240
 size 1183674
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e577caed341d63f36ac89f4cdc452526772e9dd6718e0facb5410b7e4008eeda
+oid sha256:e29c007d609a592b9d9b4d096992334f09ca211d70d03d461df92b792f8cae36
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0428512ada8c2471b2f37ecbdd4efa5f13e3ba0e777fddbfec0396eebc36c01a
+oid sha256:4077036d99500a708f700f75da24d51b5300e184ad35fda49dc5a4df5596cca2
 size 1064
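Together, adapter_model.safetensors, optimizer.pt, rng_state.pth, and scheduler.pt are the pieces that transformers' Trainer writes so a run can resume deterministically from this step. A hedged sketch of resuming from the downloaded directory, assuming `trainer` is an already-configured transformers.Trainer for the same model, adapter, and training arguments:

# Hedged sketch: resume training from the step-1000 checkpoint directory.
# `trainer` is assumed to be a transformers.Trainer configured exactly as in
# the original run (same model/adapter, TrainingArguments, datasets).
trainer.train(resume_from_checkpoint="last-checkpoint")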
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5120910384068279,
5
  "eval_steps": 250,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6339,6 +6339,714 @@
6339
  "learning_rate": 2.7091379149682685e-06,
6340
  "loss": 1.598,
6341
  "step": 900
6342
  }
6343
  ],
6344
  "logging_steps": 1,
@@ -6353,12 +7061,12 @@
6353
  "should_evaluate": false,
6354
  "should_log": false,
6355
  "should_save": true,
6356
- "should_training_stop": false
6357
  },
6358
  "attributes": {}
6359
  }
6360
  },
6361
- "total_flos": 8440376419418112.0,
6362
  "train_batch_size": 2,
6363
  "trial_name": null,
6364
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5689900426742532,
5
  "eval_steps": 250,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6339
  "learning_rate": 2.7091379149682685e-06,
6340
  "loss": 1.598,
6341
  "step": 900
6342
+ },
6343
+ {
6344
+ "epoch": 0.5126600284495021,
6345
+ "grad_norm": 6.939188480377197,
6346
+ "learning_rate": 2.6557085182532582e-06,
6347
+ "loss": 1.4069,
6348
+ "step": 901
6349
+ },
6350
+ {
6351
+ "epoch": 0.5132290184921764,
6352
+ "grad_norm": 6.613868236541748,
6353
+ "learning_rate": 2.602796871124663e-06,
6354
+ "loss": 2.6769,
6355
+ "step": 902
6356
+ },
6357
+ {
6358
+ "epoch": 0.5137980085348507,
6359
+ "grad_norm": 4.967800617218018,
6360
+ "learning_rate": 2.5504035522157854e-06,
6361
+ "loss": 1.7265,
6362
+ "step": 903
6363
+ },
6364
+ {
6365
+ "epoch": 0.5143669985775249,
6366
+ "grad_norm": 7.544163227081299,
6367
+ "learning_rate": 2.4985291344915674e-06,
6368
+ "loss": 3.8019,
6369
+ "step": 904
6370
+ },
6371
+ {
6372
+ "epoch": 0.5149359886201992,
6373
+ "grad_norm": 6.729382514953613,
6374
+ "learning_rate": 2.4471741852423237e-06,
6375
+ "loss": 1.1223,
6376
+ "step": 905
6377
+ },
6378
+ {
6379
+ "epoch": 0.5155049786628734,
6380
+ "grad_norm": 5.194341659545898,
6381
+ "learning_rate": 2.3963392660775575e-06,
6382
+ "loss": 2.6233,
6383
+ "step": 906
6384
+ },
6385
+ {
6386
+ "epoch": 0.5160739687055477,
6387
+ "grad_norm": 5.60222864151001,
6388
+ "learning_rate": 2.3460249329197824e-06,
6389
+ "loss": 1.6183,
6390
+ "step": 907
6391
+ },
6392
+ {
6393
+ "epoch": 0.5166429587482219,
6394
+ "grad_norm": 5.169523239135742,
6395
+ "learning_rate": 2.296231735998511e-06,
6396
+ "loss": 1.4671,
6397
+ "step": 908
6398
+ },
6399
+ {
6400
+ "epoch": 0.5172119487908962,
6401
+ "grad_norm": 8.426031112670898,
6402
+ "learning_rate": 2.2469602198441573e-06,
6403
+ "loss": 2.8763,
6404
+ "step": 909
6405
+ },
6406
+ {
6407
+ "epoch": 0.5177809388335705,
6408
+ "grad_norm": 6.08635139465332,
6409
+ "learning_rate": 2.1982109232821178e-06,
6410
+ "loss": 2.1274,
6411
+ "step": 910
6412
+ },
6413
+ {
6414
+ "epoch": 0.5183499288762446,
6415
+ "grad_norm": 8.61262321472168,
6416
+ "learning_rate": 2.149984379426906e-06,
6417
+ "loss": 1.795,
6418
+ "step": 911
6419
+ },
6420
+ {
6421
+ "epoch": 0.518918918918919,
6422
+ "grad_norm": 12.403318405151367,
6423
+ "learning_rate": 2.102281115676258e-06,
6424
+ "loss": 1.608,
6425
+ "step": 912
6426
+ },
6427
+ {
6428
+ "epoch": 0.5194879089615931,
6429
+ "grad_norm": 8.934358596801758,
6430
+ "learning_rate": 2.0551016537054493e-06,
6431
+ "loss": 3.1318,
6432
+ "step": 913
6433
+ },
6434
+ {
6435
+ "epoch": 0.5200568990042674,
6436
+ "grad_norm": 7.507859230041504,
6437
+ "learning_rate": 2.008446509461498e-06,
6438
+ "loss": 1.3386,
6439
+ "step": 914
6440
+ },
6441
+ {
6442
+ "epoch": 0.5206258890469416,
6443
+ "grad_norm": 10.2330904006958,
6444
+ "learning_rate": 1.962316193157593e-06,
6445
+ "loss": 1.6304,
6446
+ "step": 915
6447
+ },
6448
+ {
6449
+ "epoch": 0.5211948790896159,
6450
+ "grad_norm": 10.44956111907959,
6451
+ "learning_rate": 1.91671120926748e-06,
6452
+ "loss": 2.102,
6453
+ "step": 916
6454
+ },
6455
+ {
6456
+ "epoch": 0.5217638691322902,
6457
+ "grad_norm": 5.6211652755737305,
6458
+ "learning_rate": 1.8716320565199618e-06,
6459
+ "loss": 2.347,
6460
+ "step": 917
6461
+ },
6462
+ {
6463
+ "epoch": 0.5223328591749644,
6464
+ "grad_norm": 6.046701431274414,
6465
+ "learning_rate": 1.8270792278934302e-06,
6466
+ "loss": 1.8603,
6467
+ "step": 918
6468
+ },
6469
+ {
6470
+ "epoch": 0.5229018492176387,
6471
+ "grad_norm": 6.489639759063721,
6472
+ "learning_rate": 1.7830532106104747e-06,
6473
+ "loss": 1.5991,
6474
+ "step": 919
6475
+ },
6476
+ {
6477
+ "epoch": 0.5234708392603129,
6478
+ "grad_norm": 8.459025382995605,
6479
+ "learning_rate": 1.7395544861325718e-06,
6480
+ "loss": 2.8278,
6481
+ "step": 920
6482
+ },
6483
+ {
6484
+ "epoch": 0.5240398293029872,
6485
+ "grad_norm": 4.396022796630859,
6486
+ "learning_rate": 1.696583530154794e-06,
6487
+ "loss": 1.2431,
6488
+ "step": 921
6489
+ },
6490
+ {
6491
+ "epoch": 0.5246088193456615,
6492
+ "grad_norm": 6.775049686431885,
6493
+ "learning_rate": 1.6541408126006463e-06,
6494
+ "loss": 2.4385,
6495
+ "step": 922
6496
+ },
6497
+ {
6498
+ "epoch": 0.5251778093883357,
6499
+ "grad_norm": 4.970674514770508,
6500
+ "learning_rate": 1.6122267976168781e-06,
6501
+ "loss": 1.9198,
6502
+ "step": 923
6503
+ },
6504
+ {
6505
+ "epoch": 0.52574679943101,
6506
+ "grad_norm": 4.357985019683838,
6507
+ "learning_rate": 1.5708419435684462e-06,
6508
+ "loss": 1.8314,
6509
+ "step": 924
6510
+ },
6511
+ {
6512
+ "epoch": 0.5263157894736842,
6513
+ "grad_norm": 4.673070430755615,
6514
+ "learning_rate": 1.5299867030334814e-06,
6515
+ "loss": 1.6432,
6516
+ "step": 925
6517
+ },
6518
+ {
6519
+ "epoch": 0.5268847795163585,
6520
+ "grad_norm": 5.390334129333496,
6521
+ "learning_rate": 1.4896615227983468e-06,
6522
+ "loss": 3.1497,
6523
+ "step": 926
6524
+ },
6525
+ {
6526
+ "epoch": 0.5274537695590327,
6527
+ "grad_norm": 7.182600021362305,
6528
+ "learning_rate": 1.4498668438527597e-06,
6529
+ "loss": 1.9414,
6530
+ "step": 927
6531
+ },
6532
+ {
6533
+ "epoch": 0.528022759601707,
6534
+ "grad_norm": 4.548110485076904,
6535
+ "learning_rate": 1.4106031013849496e-06,
6536
+ "loss": 1.3904,
6537
+ "step": 928
6538
+ },
6539
+ {
6540
+ "epoch": 0.5285917496443813,
6541
+ "grad_norm": 7.250638961791992,
6542
+ "learning_rate": 1.3718707247769135e-06,
6543
+ "loss": 1.5528,
6544
+ "step": 929
6545
+ },
6546
+ {
6547
+ "epoch": 0.5291607396870555,
6548
+ "grad_norm": 5.74232816696167,
6549
+ "learning_rate": 1.333670137599713e-06,
6550
+ "loss": 1.868,
6551
+ "step": 930
6552
+ },
6553
+ {
6554
+ "epoch": 0.5297297297297298,
6555
+ "grad_norm": 9.61638069152832,
6556
+ "learning_rate": 1.2960017576088446e-06,
6557
+ "loss": 1.9816,
6558
+ "step": 931
6559
+ },
6560
+ {
6561
+ "epoch": 0.530298719772404,
6562
+ "grad_norm": 4.908766746520996,
6563
+ "learning_rate": 1.2588659967397e-06,
6564
+ "loss": 2.0533,
6565
+ "step": 932
6566
+ },
6567
+ {
6568
+ "epoch": 0.5308677098150782,
6569
+ "grad_norm": 4.330400466918945,
6570
+ "learning_rate": 1.222263261102985e-06,
6571
+ "loss": 2.0314,
6572
+ "step": 933
6573
+ },
6574
+ {
6575
+ "epoch": 0.5314366998577524,
6576
+ "grad_norm": 3.70294189453125,
6577
+ "learning_rate": 1.1861939509803687e-06,
6578
+ "loss": 2.0148,
6579
+ "step": 934
6580
+ },
6581
+ {
6582
+ "epoch": 0.5320056899004267,
6583
+ "grad_norm": 5.016382217407227,
6584
+ "learning_rate": 1.1506584608200367e-06,
6585
+ "loss": 2.3735,
6586
+ "step": 935
6587
+ },
6588
+ {
6589
+ "epoch": 0.532574679943101,
6590
+ "grad_norm": 10.169556617736816,
6591
+ "learning_rate": 1.1156571792324211e-06,
6592
+ "loss": 1.8206,
6593
+ "step": 936
6594
+ },
6595
+ {
6596
+ "epoch": 0.5331436699857752,
6597
+ "grad_norm": 6.3581085205078125,
6598
+ "learning_rate": 1.0811904889859336e-06,
6599
+ "loss": 2.533,
6600
+ "step": 937
6601
+ },
6602
+ {
6603
+ "epoch": 0.5337126600284495,
6604
+ "grad_norm": 6.080915451049805,
6605
+ "learning_rate": 1.0472587670027678e-06,
6606
+ "loss": 1.598,
6607
+ "step": 938
6608
+ },
6609
+ {
6610
+ "epoch": 0.5342816500711237,
6611
+ "grad_norm": 6.210170745849609,
6612
+ "learning_rate": 1.0138623843548078e-06,
6613
+ "loss": 1.828,
6614
+ "step": 939
6615
+ },
6616
+ {
6617
+ "epoch": 0.534850640113798,
6618
+ "grad_norm": 6.622677803039551,
6619
+ "learning_rate": 9.810017062595322e-07,
6620
+ "loss": 1.7117,
6621
+ "step": 940
6622
+ },
6623
+ {
6624
+ "epoch": 0.5354196301564723,
6625
+ "grad_norm": 5.1109819412231445,
6626
+ "learning_rate": 9.486770920760668e-07,
6627
+ "loss": 1.8321,
6628
+ "step": 941
6629
+ },
6630
+ {
6631
+ "epoch": 0.5359886201991465,
6632
+ "grad_norm": 6.059126853942871,
6633
+ "learning_rate": 9.168888953011989e-07,
6634
+ "loss": 2.9244,
6635
+ "step": 942
6636
+ },
6637
+ {
6638
+ "epoch": 0.5365576102418208,
6639
+ "grad_norm": 6.728851318359375,
6640
+ "learning_rate": 8.856374635655695e-07,
6641
+ "loss": 1.5337,
6642
+ "step": 943
6643
+ },
6644
+ {
6645
+ "epoch": 0.537126600284495,
6646
+ "grad_norm": 6.851590156555176,
6647
+ "learning_rate": 8.549231386298151e-07,
6648
+ "loss": 2.2449,
6649
+ "step": 944
6650
+ },
6651
+ {
6652
+ "epoch": 0.5376955903271693,
6653
+ "grad_norm": 5.476071357727051,
6654
+ "learning_rate": 8.247462563808817e-07,
6655
+ "loss": 1.999,
6656
+ "step": 945
6657
+ },
6658
+ {
6659
+ "epoch": 0.5382645803698435,
6660
+ "grad_norm": 6.682757377624512,
6661
+ "learning_rate": 7.951071468283167e-07,
6662
+ "loss": 1.5688,
6663
+ "step": 946
6664
+ },
6665
+ {
6666
+ "epoch": 0.5388335704125178,
6667
+ "grad_norm": 6.973927021026611,
6668
+ "learning_rate": 7.66006134100672e-07,
6669
+ "loss": 1.7305,
6670
+ "step": 947
6671
+ },
6672
+ {
6673
+ "epoch": 0.5394025604551921,
6674
+ "grad_norm": 6.132145404815674,
6675
+ "learning_rate": 7.374435364419674e-07,
6676
+ "loss": 2.6093,
6677
+ "step": 948
6678
+ },
6679
+ {
6680
+ "epoch": 0.5399715504978663,
6681
+ "grad_norm": 3.4302048683166504,
6682
+ "learning_rate": 7.094196662081831e-07,
6683
+ "loss": 1.5475,
6684
+ "step": 949
6685
+ },
6686
+ {
6687
+ "epoch": 0.5405405405405406,
6688
+ "grad_norm": 6.01040506362915,
6689
+ "learning_rate": 6.819348298638839e-07,
6690
+ "loss": 1.9851,
6691
+ "step": 950
6692
+ },
6693
+ {
6694
+ "epoch": 0.5411095305832148,
6695
+ "grad_norm": 5.79357385635376,
6696
+ "learning_rate": 6.549893279788277e-07,
6697
+ "loss": 1.2785,
6698
+ "step": 951
6699
+ },
6700
+ {
6701
+ "epoch": 0.5416785206258891,
6702
+ "grad_norm": 6.42232084274292,
6703
+ "learning_rate": 6.285834552247128e-07,
6704
+ "loss": 1.6813,
6705
+ "step": 952
6706
+ },
6707
+ {
6708
+ "epoch": 0.5422475106685632,
6709
+ "grad_norm": 6.847588539123535,
6710
+ "learning_rate": 6.027175003719354e-07,
6711
+ "loss": 1.7042,
6712
+ "step": 953
6713
+ },
6714
+ {
6715
+ "epoch": 0.5428165007112375,
6716
+ "grad_norm": 8.760133743286133,
6717
+ "learning_rate": 5.773917462864264e-07,
6718
+ "loss": 2.6909,
6719
+ "step": 954
6720
+ },
6721
+ {
6722
+ "epoch": 0.5433854907539118,
6723
+ "grad_norm": 6.681099891662598,
6724
+ "learning_rate": 5.526064699265753e-07,
6725
+ "loss": 2.5284,
6726
+ "step": 955
6727
+ },
6728
+ {
6729
+ "epoch": 0.543954480796586,
6730
+ "grad_norm": 8.253862380981445,
6731
+ "learning_rate": 5.283619423401998e-07,
6732
+ "loss": 2.5049,
6733
+ "step": 956
6734
+ },
6735
+ {
6736
+ "epoch": 0.5445234708392603,
6737
+ "grad_norm": 6.301835536956787,
6738
+ "learning_rate": 5.046584286615697e-07,
6739
+ "loss": 2.0345,
6740
+ "step": 957
6741
+ },
6742
+ {
6743
+ "epoch": 0.5450924608819345,
6744
+ "grad_norm": 8.807053565979004,
6745
+ "learning_rate": 4.814961881085045e-07,
6746
+ "loss": 2.1812,
6747
+ "step": 958
6748
+ },
6749
+ {
6750
+ "epoch": 0.5456614509246088,
6751
+ "grad_norm": 6.5801897048950195,
6752
+ "learning_rate": 4.5887547397955864e-07,
6753
+ "loss": 2.3601,
6754
+ "step": 959
6755
+ },
6756
+ {
6757
+ "epoch": 0.5462304409672831,
6758
+ "grad_norm": 6.495395660400391,
6759
+ "learning_rate": 4.367965336512403e-07,
6760
+ "loss": 2.0076,
6761
+ "step": 960
6762
+ },
6763
+ {
6764
+ "epoch": 0.5467994310099573,
6765
+ "grad_norm": 11.701818466186523,
6766
+ "learning_rate": 4.1525960857530243e-07,
6767
+ "loss": 2.6091,
6768
+ "step": 961
6769
+ },
6770
+ {
6771
+ "epoch": 0.5473684210526316,
6772
+ "grad_norm": 10.334817886352539,
6773
+ "learning_rate": 3.9426493427611177e-07,
6774
+ "loss": 1.5223,
6775
+ "step": 962
6776
+ },
6777
+ {
6778
+ "epoch": 0.5479374110953058,
6779
+ "grad_norm": 3.968838691711426,
6780
+ "learning_rate": 3.738127403480507e-07,
6781
+ "loss": 1.8091,
6782
+ "step": 963
6783
+ },
6784
+ {
6785
+ "epoch": 0.5485064011379801,
6786
+ "grad_norm": 5.9291486740112305,
6787
+ "learning_rate": 3.5390325045304706e-07,
6788
+ "loss": 1.5164,
6789
+ "step": 964
6790
+ },
6791
+ {
6792
+ "epoch": 0.5490753911806543,
6793
+ "grad_norm": 5.5870842933654785,
6794
+ "learning_rate": 3.3453668231809286e-07,
6795
+ "loss": 1.7174,
6796
+ "step": 965
6797
+ },
6798
+ {
6799
+ "epoch": 0.5496443812233286,
6800
+ "grad_norm": 7.495179653167725,
6801
+ "learning_rate": 3.157132477328628e-07,
6802
+ "loss": 1.5558,
6803
+ "step": 966
6804
+ },
6805
+ {
6806
+ "epoch": 0.5502133712660029,
6807
+ "grad_norm": 8.937362670898438,
6808
+ "learning_rate": 2.9743315254743833e-07,
6809
+ "loss": 2.2747,
6810
+ "step": 967
6811
+ },
6812
+ {
6813
+ "epoch": 0.5507823613086771,
6814
+ "grad_norm": 4.983654975891113,
6815
+ "learning_rate": 2.796965966699927e-07,
6816
+ "loss": 2.3196,
6817
+ "step": 968
6818
+ },
6819
+ {
6820
+ "epoch": 0.5513513513513514,
6821
+ "grad_norm": 5.2556915283203125,
6822
+ "learning_rate": 2.625037740646763e-07,
6823
+ "loss": 1.9528,
6824
+ "step": 969
6825
+ },
6826
+ {
6827
+ "epoch": 0.5519203413940256,
6828
+ "grad_norm": 5.226326942443848,
6829
+ "learning_rate": 2.458548727494292e-07,
6830
+ "loss": 2.6639,
6831
+ "step": 970
6832
+ },
6833
+ {
6834
+ "epoch": 0.5524893314366999,
6835
+ "grad_norm": 7.9373345375061035,
6836
+ "learning_rate": 2.2975007479397738e-07,
6837
+ "loss": 1.8186,
6838
+ "step": 971
6839
+ },
6840
+ {
6841
+ "epoch": 0.5530583214793741,
6842
+ "grad_norm": 4.213002681732178,
6843
+ "learning_rate": 2.1418955631781202e-07,
6844
+ "loss": 1.3597,
6845
+ "step": 972
6846
+ },
6847
+ {
6848
+ "epoch": 0.5536273115220484,
6849
+ "grad_norm": 7.157125949859619,
6850
+ "learning_rate": 1.9917348748826335e-07,
6851
+ "loss": 2.8583,
6852
+ "step": 973
6853
+ },
6854
+ {
6855
+ "epoch": 0.5541963015647227,
6856
+ "grad_norm": 6.626620292663574,
6857
+ "learning_rate": 1.847020325186577e-07,
6858
+ "loss": 2.1275,
6859
+ "step": 974
6860
+ },
6861
+ {
6862
+ "epoch": 0.5547652916073968,
6863
+ "grad_norm": 7.9781317710876465,
6864
+ "learning_rate": 1.7077534966650766e-07,
6865
+ "loss": 1.6131,
6866
+ "step": 975
6867
+ },
6868
+ {
6869
+ "epoch": 0.5553342816500711,
6870
+ "grad_norm": 4.986821174621582,
6871
+ "learning_rate": 1.5739359123178587e-07,
6872
+ "loss": 2.0745,
6873
+ "step": 976
6874
+ },
6875
+ {
6876
+ "epoch": 0.5559032716927453,
6877
+ "grad_norm": 6.511902809143066,
6878
+ "learning_rate": 1.4455690355525964e-07,
6879
+ "loss": 2.6664,
6880
+ "step": 977
6881
+ },
6882
+ {
6883
+ "epoch": 0.5564722617354196,
6884
+ "grad_norm": 3.586979627609253,
6885
+ "learning_rate": 1.3226542701689215e-07,
6886
+ "loss": 2.1587,
6887
+ "step": 978
6888
+ },
6889
+ {
6890
+ "epoch": 0.5570412517780939,
6891
+ "grad_norm": 6.518825054168701,
6892
+ "learning_rate": 1.2051929603428825e-07,
6893
+ "loss": 2.3117,
6894
+ "step": 979
6895
+ },
6896
+ {
6897
+ "epoch": 0.5576102418207681,
6898
+ "grad_norm": 6.538543224334717,
6899
+ "learning_rate": 1.0931863906127327e-07,
6900
+ "loss": 1.6998,
6901
+ "step": 980
6902
+ },
6903
+ {
6904
+ "epoch": 0.5581792318634424,
6905
+ "grad_norm": 5.692102432250977,
6906
+ "learning_rate": 9.866357858642205e-08,
6907
+ "loss": 2.5719,
6908
+ "step": 981
6909
+ },
6910
+ {
6911
+ "epoch": 0.5587482219061166,
6912
+ "grad_norm": 7.857847690582275,
6913
+ "learning_rate": 8.855423113177664e-08,
6914
+ "loss": 2.4416,
6915
+ "step": 982
6916
+ },
6917
+ {
6918
+ "epoch": 0.5593172119487909,
6919
+ "grad_norm": 4.545175552368164,
6920
+ "learning_rate": 7.899070725153613e-08,
6921
+ "loss": 2.098,
6922
+ "step": 983
6923
+ },
6924
+ {
6925
+ "epoch": 0.5598862019914651,
6926
+ "grad_norm": 6.184070110321045,
6927
+ "learning_rate": 6.997311153086883e-08,
6928
+ "loss": 2.8567,
6929
+ "step": 984
6930
+ },
6931
+ {
6932
+ "epoch": 0.5604551920341394,
6933
+ "grad_norm": 6.378092288970947,
6934
+ "learning_rate": 6.150154258476315e-08,
6935
+ "loss": 1.4213,
6936
+ "step": 985
6937
+ },
6938
+ {
6939
+ "epoch": 0.5610241820768137,
6940
+ "grad_norm": 6.844626426696777,
6941
+ "learning_rate": 5.3576093056922906e-08,
6942
+ "loss": 2.7032,
6943
+ "step": 986
6944
+ },
6945
+ {
6946
+ "epoch": 0.5615931721194879,
6947
+ "grad_norm": 7.550478935241699,
6948
+ "learning_rate": 4.619684961881254e-08,
6949
+ "loss": 1.3663,
6950
+ "step": 987
6951
+ },
6952
+ {
6953
+ "epoch": 0.5621621621621622,
6954
+ "grad_norm": 8.875478744506836,
6955
+ "learning_rate": 3.936389296864129e-08,
6956
+ "loss": 3.3068,
6957
+ "step": 988
6958
+ },
6959
+ {
6960
+ "epoch": 0.5627311522048364,
6961
+ "grad_norm": 4.764933109283447,
6962
+ "learning_rate": 3.3077297830541584e-08,
6963
+ "loss": 1.3713,
6964
+ "step": 989
6965
+ },
6966
+ {
6967
+ "epoch": 0.5633001422475107,
6968
+ "grad_norm": 7.228865146636963,
6969
+ "learning_rate": 2.7337132953697554e-08,
6970
+ "loss": 2.1043,
6971
+ "step": 990
6972
+ },
6973
+ {
6974
+ "epoch": 0.5638691322901849,
6975
+ "grad_norm": 7.009644031524658,
6976
+ "learning_rate": 2.214346111164556e-08,
6977
+ "loss": 1.6978,
6978
+ "step": 991
6979
+ },
6980
+ {
6981
+ "epoch": 0.5644381223328592,
6982
+ "grad_norm": 8.948090553283691,
6983
+ "learning_rate": 1.749633910153592e-08,
6984
+ "loss": 2.9902,
6985
+ "step": 992
6986
+ },
6987
+ {
6988
+ "epoch": 0.5650071123755335,
6989
+ "grad_norm": 8.406102180480957,
6990
+ "learning_rate": 1.3395817743561134e-08,
6991
+ "loss": 2.3606,
6992
+ "step": 993
6993
+ },
6994
+ {
6995
+ "epoch": 0.5655761024182077,
6996
+ "grad_norm": 6.994669437408447,
6997
+ "learning_rate": 9.841941880361916e-09,
6998
+ "loss": 2.2296,
6999
+ "step": 994
7000
+ },
7001
+ {
7002
+ "epoch": 0.566145092460882,
7003
+ "grad_norm": 8.03478717803955,
7004
+ "learning_rate": 6.834750376549792e-09,
7005
+ "loss": 1.2956,
7006
+ "step": 995
7007
+ },
7008
+ {
7009
+ "epoch": 0.5667140825035561,
7010
+ "grad_norm": 4.596794128417969,
7011
+ "learning_rate": 4.3742761183018784e-09,
7012
+ "loss": 1.3163,
7013
+ "step": 996
7014
+ },
7015
+ {
7016
+ "epoch": 0.5672830725462304,
7017
+ "grad_norm": 5.452564716339111,
7018
+ "learning_rate": 2.4605460129556445e-09,
7019
+ "loss": 1.6505,
7020
+ "step": 997
7021
+ },
7022
+ {
7023
+ "epoch": 0.5678520625889047,
7024
+ "grad_norm": 8.177964210510254,
7025
+ "learning_rate": 1.0935809887702154e-09,
7026
+ "loss": 1.8445,
7027
+ "step": 998
7028
+ },
7029
+ {
7030
+ "epoch": 0.5684210526315789,
7031
+ "grad_norm": 6.218096733093262,
7032
+ "learning_rate": 2.7339599464326627e-10,
7033
+ "loss": 1.7874,
7034
+ "step": 999
7035
+ },
7036
+ {
7037
+ "epoch": 0.5689900426742532,
7038
+ "grad_norm": 5.743197917938232,
7039
+ "learning_rate": 0.0,
7040
+ "loss": 2.6632,
7041
+ "step": 1000
7042
+ },
7043
+ {
7044
+ "epoch": 0.5689900426742532,
7045
+ "eval_loss": 2.037670850753784,
7046
+ "eval_runtime": 13.4751,
7047
+ "eval_samples_per_second": 54.916,
7048
+ "eval_steps_per_second": 27.458,
7049
+ "step": 1000
7050
  }
7051
  ],
7052
  "logging_steps": 1,
 
7061
  "should_evaluate": false,
7062
  "should_log": false,
7063
  "should_save": true,
7064
+ "should_training_stop": true
7065
  },
7066
  "attributes": {}
7067
  }
7068
  },
7069
+ "total_flos": 9388052623392768.0,
7070
  "train_batch_size": 2,
7071
  "trial_name": null,
7072
  "trial_params": null