ahmedaali commited on
Commit
0ec18f6
·
verified ·
1 Parent(s): d0fb1d1

Training in progress, step 1350, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:152dbb08fdc5ac415b1aef8cdb064f7a2af23ec037d9f6e1faf797526933b886
3
  size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d58394aa961cbd926c561b5aff779db006a0066e23a13e85c327c0e4c1545a
3
  size 528550256
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ea7a39b7f029aabed418f7134bd4793c70721221732e7926264b2029a522e56
3
  size 1057390522
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e766854e476f43601e5416bc84190938026e3bac15055777e1067753193a6440
3
  size 1057390522
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7682299c566684ea51cf26f0c86b6ffaa3c0bc63cbdf84674b29a2c62ac72143
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cca5856c568bc52c683b690919168fa27bfbdfefc6e0a62355afa6011157c3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bbf814912876811928d96d5682a1ea23a461d2600cea80dc2fd25c5ed5ec5ef
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e199c66e0e47121b15fb9bae86bf70568a0203a8dfc619658b4b2491bc6b8472
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.2223457476375765,
6
  "eval_steps": 100,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -788,6 +788,275 @@
788
  "eval_samples_per_second": 4.15,
789
  "eval_steps_per_second": 4.15,
790
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
791
  }
792
  ],
793
  "logging_steps": 10,
@@ -802,12 +1071,12 @@
802
  "should_evaluate": false,
803
  "should_log": false,
804
  "should_save": true,
805
- "should_training_stop": false
806
  },
807
  "attributes": {}
808
  }
809
  },
810
- "total_flos": 5.995091809512653e+16,
811
  "train_batch_size": 1,
812
  "trial_name": null,
813
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 100,
7
+ "global_step": 1350,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
788
  "eval_samples_per_second": 4.15,
789
  "eval_steps_per_second": 4.15,
790
  "step": 1000
791
+ },
792
+ {
793
+ "epoch": 2.244580322401334,
794
+ "grad_norm": 0.266984224319458,
795
+ "learning_rate": 1.8208560452235625e-05,
796
+ "loss": 0.0614,
797
+ "step": 1010
798
+ },
799
+ {
800
+ "epoch": 2.2668148971650917,
801
+ "grad_norm": 0.24567271769046783,
802
+ "learning_rate": 1.7221448304223327e-05,
803
+ "loss": 0.0385,
804
+ "step": 1020
805
+ },
806
+ {
807
+ "epoch": 2.289049471928849,
808
+ "grad_norm": 0.15998658537864685,
809
+ "learning_rate": 1.6256249706943628e-05,
810
+ "loss": 0.0482,
811
+ "step": 1030
812
+ },
813
+ {
814
+ "epoch": 2.311284046692607,
815
+ "grad_norm": 0.2101755291223526,
816
+ "learning_rate": 1.5313609927723332e-05,
817
+ "loss": 0.0419,
818
+ "step": 1040
819
+ },
820
+ {
821
+ "epoch": 2.333518621456365,
822
+ "grad_norm": 0.10369472205638885,
823
+ "learning_rate": 1.4394159152569903e-05,
824
+ "loss": 0.0248,
825
+ "step": 1050
826
+ },
827
+ {
828
+ "epoch": 2.3557531962201224,
829
+ "grad_norm": 0.3291586637496948,
830
+ "learning_rate": 1.3498512064871271e-05,
831
+ "loss": 0.0611,
832
+ "step": 1060
833
+ },
834
+ {
835
+ "epoch": 2.37798777098388,
836
+ "grad_norm": 0.5122426748275757,
837
+ "learning_rate": 1.262726743445954e-05,
838
+ "loss": 0.0628,
839
+ "step": 1070
840
+ },
841
+ {
842
+ "epoch": 2.4002223457476375,
843
+ "grad_norm": 0.1757289469242096,
844
+ "learning_rate": 1.178100771731339e-05,
845
+ "loss": 0.0414,
846
+ "step": 1080
847
+ },
848
+ {
849
+ "epoch": 2.422456920511395,
850
+ "grad_norm": 0.3590919077396393,
851
+ "learning_rate": 1.096029866616704e-05,
852
+ "loss": 0.0349,
853
+ "step": 1090
854
+ },
855
+ {
856
+ "epoch": 2.444691495275153,
857
+ "grad_norm": 0.21179239451885223,
858
+ "learning_rate": 1.0165688952285651e-05,
859
+ "loss": 0.0318,
860
+ "step": 1100
861
+ },
862
+ {
863
+ "epoch": 2.444691495275153,
864
+ "eval_loss": 0.048208702355623245,
865
+ "eval_runtime": 48.1747,
866
+ "eval_samples_per_second": 4.152,
867
+ "eval_steps_per_second": 4.152,
868
+ "step": 1100
869
+ },
870
+ {
871
+ "epoch": 2.4669260700389106,
872
+ "grad_norm": 0.2429758608341217,
873
+ "learning_rate": 9.397709798660359e-06,
874
+ "loss": 0.0389,
875
+ "step": 1110
876
+ },
877
+ {
878
+ "epoch": 2.489160644802668,
879
+ "grad_norm": 0.3247833251953125,
880
+ "learning_rate": 8.656874624868134e-06,
881
+ "loss": 0.0474,
882
+ "step": 1120
883
+ },
884
+ {
885
+ "epoch": 2.5113952195664258,
886
+ "grad_norm": 0.20058025419712067,
887
+ "learning_rate": 7.943678703833657e-06,
888
+ "loss": 0.0446,
889
+ "step": 1130
890
+ },
891
+ {
892
+ "epoch": 2.5336297943301833,
893
+ "grad_norm": 0.22172123193740845,
894
+ "learning_rate": 7.258598830722946e-06,
895
+ "loss": 0.0429,
896
+ "step": 1140
897
+ },
898
+ {
899
+ "epoch": 2.555864369093941,
900
+ "grad_norm": 0.3664150834083557,
901
+ "learning_rate": 6.6020930041899635e-06,
902
+ "loss": 0.0487,
903
+ "step": 1150
904
+ },
905
+ {
906
+ "epoch": 2.5780989438576984,
907
+ "grad_norm": 0.13659419119358063,
908
+ "learning_rate": 5.974600120189289e-06,
909
+ "loss": 0.0438,
910
+ "step": 1160
911
+ },
912
+ {
913
+ "epoch": 2.6003335186214565,
914
+ "grad_norm": 0.18766269087791443,
915
+ "learning_rate": 5.376539678559567e-06,
916
+ "loss": 0.0385,
917
+ "step": 1170
918
+ },
919
+ {
920
+ "epoch": 2.622568093385214,
921
+ "grad_norm": 0.24047650396823883,
922
+ "learning_rate": 4.8083115025739756e-06,
923
+ "loss": 0.0413,
924
+ "step": 1180
925
+ },
926
+ {
927
+ "epoch": 2.6448026681489716,
928
+ "grad_norm": 0.10476374626159668,
929
+ "learning_rate": 4.270295471645064e-06,
930
+ "loss": 0.0426,
931
+ "step": 1190
932
+ },
933
+ {
934
+ "epoch": 2.667037242912729,
935
+ "grad_norm": 0.3628266155719757,
936
+ "learning_rate": 3.7628512673627215e-06,
937
+ "loss": 0.0527,
938
+ "step": 1200
939
+ },
940
+ {
941
+ "epoch": 2.667037242912729,
942
+ "eval_loss": 0.04677248001098633,
943
+ "eval_runtime": 48.3089,
944
+ "eval_samples_per_second": 4.14,
945
+ "eval_steps_per_second": 4.14,
946
+ "step": 1200
947
+ },
948
+ {
949
+ "epoch": 2.689271817676487,
950
+ "grad_norm": 0.2052982747554779,
951
+ "learning_rate": 3.286318133035132e-06,
952
+ "loss": 0.0394,
953
+ "step": 1210
954
+ },
955
+ {
956
+ "epoch": 2.7115063924402447,
957
+ "grad_norm": 0.14959484338760376,
958
+ "learning_rate": 2.8410146468933364e-06,
959
+ "loss": 0.0351,
960
+ "step": 1220
961
+ },
962
+ {
963
+ "epoch": 2.7337409672040023,
964
+ "grad_norm": 0.17867030203342438,
965
+ "learning_rate": 2.4272385091110516e-06,
966
+ "loss": 0.0465,
967
+ "step": 1230
968
+ },
969
+ {
970
+ "epoch": 2.75597554196776,
971
+ "grad_norm": 0.2831536531448364,
972
+ "learning_rate": 2.0452663427823093e-06,
973
+ "loss": 0.0487,
974
+ "step": 1240
975
+ },
976
+ {
977
+ "epoch": 2.7782101167315174,
978
+ "grad_norm": 0.16684742271900177,
979
+ "learning_rate": 1.6953535089896555e-06,
980
+ "loss": 0.0335,
981
+ "step": 1250
982
+ },
983
+ {
984
+ "epoch": 2.800444691495275,
985
+ "grad_norm": 0.12368661165237427,
986
+ "learning_rate": 1.3777339360867836e-06,
987
+ "loss": 0.0317,
988
+ "step": 1260
989
+ },
990
+ {
991
+ "epoch": 2.8226792662590325,
992
+ "grad_norm": 0.13758961856365204,
993
+ "learning_rate": 1.0926199633097157e-06,
994
+ "loss": 0.0305,
995
+ "step": 1270
996
+ },
997
+ {
998
+ "epoch": 2.8449138410227905,
999
+ "grad_norm": 0.3133557438850403,
1000
+ "learning_rate": 8.402021988209218e-07,
1001
+ "loss": 0.0488,
1002
+ "step": 1280
1003
+ },
1004
+ {
1005
+ "epoch": 2.867148415786548,
1006
+ "grad_norm": 0.13724081218242645,
1007
+ "learning_rate": 6.20649392281425e-07,
1008
+ "loss": 0.0406,
1009
+ "step": 1290
1010
+ },
1011
+ {
1012
+ "epoch": 2.8893829905503057,
1013
+ "grad_norm": 0.286937952041626,
1014
+ "learning_rate": 4.341083220360864e-07,
1015
+ "loss": 0.049,
1016
+ "step": 1300
1017
+ },
1018
+ {
1019
+ "epoch": 2.8893829905503057,
1020
+ "eval_loss": 0.0463690422475338,
1021
+ "eval_runtime": 48.3097,
1022
+ "eval_samples_per_second": 4.14,
1023
+ "eval_steps_per_second": 4.14,
1024
+ "step": 1300
1025
+ },
1026
+ {
1027
+ "epoch": 2.9116175653140632,
1028
+ "grad_norm": 0.06224232539534569,
1029
+ "learning_rate": 2.807036969873722e-07,
1030
+ "loss": 0.0565,
1031
+ "step": 1310
1032
+ },
1033
+ {
1034
+ "epoch": 2.9338521400778212,
1035
+ "grad_norm": 0.08071974664926529,
1036
+ "learning_rate": 1.6053807322333191e-07,
1037
+ "loss": 0.0369,
1038
+ "step": 1320
1039
+ },
1040
+ {
1041
+ "epoch": 2.956086714841579,
1042
+ "grad_norm": 0.22806085646152496,
1043
+ "learning_rate": 7.369178545542088e-08,
1044
+ "loss": 0.0376,
1045
+ "step": 1330
1046
+ },
1047
+ {
1048
+ "epoch": 2.9783212896053364,
1049
+ "grad_norm": 0.20774702727794647,
1050
+ "learning_rate": 2.022289331209959e-08,
1051
+ "loss": 0.0467,
1052
+ "step": 1340
1053
+ },
1054
+ {
1055
+ "epoch": 3.0,
1056
+ "grad_norm": 0.19576112926006317,
1057
+ "learning_rate": 1.671425240434843e-10,
1058
+ "loss": 0.0531,
1059
+ "step": 1350
1060
  }
1061
  ],
1062
  "logging_steps": 10,
 
1071
  "should_evaluate": false,
1072
  "should_log": false,
1073
  "should_save": true,
1074
+ "should_training_stop": true
1075
  },
1076
  "attributes": {}
1077
  }
1078
  },
1079
+ "total_flos": 8.093314295223091e+16,
1080
  "train_batch_size": 1,
1081
  "trial_name": null,
1082
  "trial_params": null