jimnoneill commited on
Commit
7ede6b3
·
verified ·
1 Parent(s): 1d44aa7

Update model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-1135/config.json +51 -0
  2. checkpoint-1135/model.safetensors +3 -0
  3. checkpoint-1135/optimizer.pt +3 -0
  4. checkpoint-1135/rng_state.pth +3 -0
  5. checkpoint-1135/scaler.pt +3 -0
  6. checkpoint-1135/scheduler.pt +3 -0
  7. checkpoint-1135/special_tokens_map.json +7 -0
  8. checkpoint-1135/tokenizer.json +0 -0
  9. checkpoint-1135/tokenizer_config.json +62 -0
  10. checkpoint-1135/trainer_state.json +1635 -0
  11. checkpoint-1135/training_args.bin +3 -0
  12. checkpoint-1135/vocab.txt +0 -0
  13. checkpoint-2270/config.json +51 -0
  14. checkpoint-2270/model.safetensors +3 -0
  15. checkpoint-2270/optimizer.pt +3 -0
  16. checkpoint-2270/rng_state.pth +3 -0
  17. checkpoint-2270/scaler.pt +3 -0
  18. checkpoint-2270/scheduler.pt +3 -0
  19. checkpoint-2270/special_tokens_map.json +7 -0
  20. checkpoint-2270/tokenizer.json +0 -0
  21. checkpoint-2270/tokenizer_config.json +62 -0
  22. checkpoint-2270/trainer_state.json +3236 -0
  23. checkpoint-2270/training_args.bin +3 -0
  24. checkpoint-2270/vocab.txt +0 -0
  25. checkpoint-3405/config.json +51 -0
  26. checkpoint-3405/model.safetensors +3 -0
  27. checkpoint-3405/optimizer.pt +3 -0
  28. checkpoint-3405/rng_state.pth +3 -0
  29. checkpoint-3405/scaler.pt +3 -0
  30. checkpoint-3405/scheduler.pt +3 -0
  31. checkpoint-3405/special_tokens_map.json +7 -0
  32. checkpoint-3405/tokenizer.json +0 -0
  33. checkpoint-3405/tokenizer_config.json +62 -0
  34. checkpoint-3405/trainer_state.json +0 -0
  35. checkpoint-3405/training_args.bin +3 -0
  36. checkpoint-3405/vocab.txt +0 -0
  37. checkpoint-4540/config.json +51 -0
  38. checkpoint-4540/model.safetensors +3 -0
  39. checkpoint-4540/optimizer.pt +3 -0
  40. checkpoint-4540/rng_state.pth +3 -0
  41. checkpoint-4540/scaler.pt +3 -0
  42. checkpoint-4540/scheduler.pt +3 -0
  43. checkpoint-4540/special_tokens_map.json +7 -0
  44. checkpoint-4540/tokenizer.json +0 -0
  45. checkpoint-4540/tokenizer_config.json +62 -0
  46. checkpoint-4540/trainer_state.json +0 -0
  47. checkpoint-4540/training_args.bin +3 -0
  48. checkpoint-4540/vocab.txt +0 -0
  49. checkpoint-5675/config.json +51 -0
  50. checkpoint-5675/model.safetensors +3 -0
checkpoint-1135/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ElectraForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "embedding_size": 1024,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "id2label": {
12
+ "0": "O",
13
+ "1": "B-antineoplastic",
14
+ "2": "B-cancertype",
15
+ "3": "B-carcinogen",
16
+ "4": "B-negative",
17
+ "5": "I-antineoplastic",
18
+ "6": "I-cancertype",
19
+ "7": "I-carcinogen",
20
+ "8": "I-negative"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "label2id": {
25
+ "B-antineoplastic": 1,
26
+ "B-cancertype": 2,
27
+ "B-carcinogen": 3,
28
+ "B-negative": 4,
29
+ "I-antineoplastic": 5,
30
+ "I-cancertype": 6,
31
+ "I-carcinogen": 7,
32
+ "I-negative": 8,
33
+ "O": 0
34
+ },
35
+ "layer_norm_eps": 1e-12,
36
+ "max_position_embeddings": 512,
37
+ "model_type": "electra",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 24,
40
+ "pad_token_id": 0,
41
+ "position_embedding_type": "absolute",
42
+ "summary_activation": "gelu",
43
+ "summary_last_dropout": 0.1,
44
+ "summary_type": "first",
45
+ "summary_use_proj": true,
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.50.0",
48
+ "type_vocab_size": 2,
49
+ "use_cache": true,
50
+ "vocab_size": 28895
51
+ }
checkpoint-1135/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebe8f6af30cbf911567c0c5d25a15747e99840e1e8f43ef2ba2b056a47617916
3
+ size 1329789812
checkpoint-1135/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a0b564a59893bdc50a30d04a220442b68b248af0e7321e51454547793f5df7f
3
+ size 2659810855
checkpoint-1135/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1561d22169695d4291a5a19cbe50e28a61ebcffec43748f00ccea0bb9382191b
3
+ size 14244
checkpoint-1135/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
3
+ size 988
checkpoint-1135/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85379799357e52b0fefd2c70aeb83a283830b4ba4f471a1650a917563ee7f73
3
+ size 1064
checkpoint-1135/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-1135/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1135/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "ignore_mismatched_sizes": true,
50
+ "mask_token": "[MASK]",
51
+ "max_len": 512,
52
+ "model_max_length": 512,
53
+ "never_split": null,
54
+ "pad_token": "[PAD]",
55
+ "padding": true,
56
+ "sep_token": "[SEP]",
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "ElectraTokenizer",
60
+ "truncation": true,
61
+ "unk_token": "[UNK]"
62
+ }
checkpoint-1135/trainer_state.json ADDED
@@ -0,0 +1,1635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1135,
3
+ "best_metric": 0.07397506386041641,
4
+ "best_model_checkpoint": "/storage/BioM-ELECTRA-Large-SQuAD2_5e_update_lg/checkpoint-1135",
5
+ "epoch": 1.0,
6
+ "eval_steps": 5,
7
+ "global_step": 1135,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.004405286343612335,
14
+ "grad_norm": 463300.3125,
15
+ "learning_rate": 1.9982378854625554e-05,
16
+ "loss": 1.3393,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.00881057268722467,
21
+ "grad_norm": 123929.53125,
22
+ "learning_rate": 1.9964757709251103e-05,
23
+ "loss": 0.7964,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.013215859030837005,
28
+ "grad_norm": 103963.2109375,
29
+ "learning_rate": 1.9947136563876656e-05,
30
+ "loss": 0.6685,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.01762114537444934,
35
+ "grad_norm": 85550.5859375,
36
+ "learning_rate": 1.9929515418502202e-05,
37
+ "loss": 0.5623,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.022026431718061675,
42
+ "grad_norm": 122556.59375,
43
+ "learning_rate": 1.9911894273127754e-05,
44
+ "loss": 0.4474,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.02643171806167401,
49
+ "grad_norm": 122631.59375,
50
+ "learning_rate": 1.9894273127753304e-05,
51
+ "loss": 0.4729,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.030837004405286344,
56
+ "grad_norm": 84448.40625,
57
+ "learning_rate": 1.9876651982378856e-05,
58
+ "loss": 0.375,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.03524229074889868,
63
+ "grad_norm": 90985.546875,
64
+ "learning_rate": 1.985903083700441e-05,
65
+ "loss": 0.4128,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.039647577092511016,
70
+ "grad_norm": 83418.0,
71
+ "learning_rate": 1.9841409691629958e-05,
72
+ "loss": 0.3251,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.04405286343612335,
77
+ "grad_norm": 175418.140625,
78
+ "learning_rate": 1.982378854625551e-05,
79
+ "loss": 0.3585,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.048458149779735685,
84
+ "grad_norm": 136878.40625,
85
+ "learning_rate": 1.9806167400881056e-05,
86
+ "loss": 0.296,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.05286343612334802,
91
+ "grad_norm": 98344.71875,
92
+ "learning_rate": 1.978854625550661e-05,
93
+ "loss": 0.32,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.05726872246696035,
98
+ "grad_norm": 108601.3359375,
99
+ "learning_rate": 1.977092511013216e-05,
100
+ "loss": 0.2895,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.06167400881057269,
105
+ "grad_norm": 117893.046875,
106
+ "learning_rate": 1.975330396475771e-05,
107
+ "loss": 0.3008,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.06607929515418502,
112
+ "grad_norm": 92026.25,
113
+ "learning_rate": 1.9735682819383263e-05,
114
+ "loss": 0.2413,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.07048458149779736,
119
+ "grad_norm": 139038.234375,
120
+ "learning_rate": 1.9718061674008812e-05,
121
+ "loss": 0.2645,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.07488986784140969,
126
+ "grad_norm": 156623.421875,
127
+ "learning_rate": 1.9700440528634365e-05,
128
+ "loss": 0.2314,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.07929515418502203,
133
+ "grad_norm": 194062.578125,
134
+ "learning_rate": 1.9682819383259914e-05,
135
+ "loss": 0.2854,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.08370044052863436,
140
+ "grad_norm": 150748.421875,
141
+ "learning_rate": 1.9665198237885463e-05,
142
+ "loss": 0.2485,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.0881057268722467,
147
+ "grad_norm": 160828.71875,
148
+ "learning_rate": 1.9647577092511016e-05,
149
+ "loss": 0.2135,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.09251101321585903,
154
+ "grad_norm": 106817.6640625,
155
+ "learning_rate": 1.9629955947136565e-05,
156
+ "loss": 0.2011,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.09691629955947137,
161
+ "grad_norm": 268202.65625,
162
+ "learning_rate": 1.9612334801762118e-05,
163
+ "loss": 0.2114,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.1013215859030837,
168
+ "grad_norm": 139443.75,
169
+ "learning_rate": 1.9594713656387667e-05,
170
+ "loss": 0.2436,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.10572687224669604,
175
+ "grad_norm": 117417.0234375,
176
+ "learning_rate": 1.957709251101322e-05,
177
+ "loss": 0.1895,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.11013215859030837,
182
+ "grad_norm": 126792.234375,
183
+ "learning_rate": 1.955947136563877e-05,
184
+ "loss": 0.1937,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.1145374449339207,
189
+ "grad_norm": 167020.0625,
190
+ "learning_rate": 1.9541850220264318e-05,
191
+ "loss": 0.203,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.11894273127753303,
196
+ "grad_norm": 220364.109375,
197
+ "learning_rate": 1.952422907488987e-05,
198
+ "loss": 0.1573,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.12334801762114538,
203
+ "grad_norm": 233084.5,
204
+ "learning_rate": 1.950660792951542e-05,
205
+ "loss": 0.2019,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.1277533039647577,
210
+ "grad_norm": 125416.90625,
211
+ "learning_rate": 1.9488986784140972e-05,
212
+ "loss": 0.163,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.13215859030837004,
217
+ "grad_norm": 104147.4453125,
218
+ "learning_rate": 1.947136563876652e-05,
219
+ "loss": 0.1286,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.13656387665198239,
224
+ "grad_norm": 120634.4765625,
225
+ "learning_rate": 1.9453744493392074e-05,
226
+ "loss": 0.1525,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.14096916299559473,
231
+ "grad_norm": 141004.78125,
232
+ "learning_rate": 1.9436123348017623e-05,
233
+ "loss": 0.2162,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.14537444933920704,
238
+ "grad_norm": 229382.46875,
239
+ "learning_rate": 1.9418502202643172e-05,
240
+ "loss": 0.1294,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.14977973568281938,
245
+ "grad_norm": 213832.84375,
246
+ "learning_rate": 1.9400881057268725e-05,
247
+ "loss": 0.136,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.15418502202643172,
252
+ "grad_norm": 241189.96875,
253
+ "learning_rate": 1.9383259911894274e-05,
254
+ "loss": 0.1442,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.15859030837004406,
259
+ "grad_norm": 68183.6484375,
260
+ "learning_rate": 1.9365638766519827e-05,
261
+ "loss": 0.1559,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.16299559471365638,
266
+ "grad_norm": 83166.6796875,
267
+ "learning_rate": 1.9348017621145376e-05,
268
+ "loss": 0.1747,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.16740088105726872,
273
+ "grad_norm": 158394.59375,
274
+ "learning_rate": 1.933039647577093e-05,
275
+ "loss": 0.137,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.17180616740088106,
280
+ "grad_norm": 227525.71875,
281
+ "learning_rate": 1.9312775330396478e-05,
282
+ "loss": 0.1493,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.1762114537444934,
287
+ "grad_norm": 189944.125,
288
+ "learning_rate": 1.9295154185022027e-05,
289
+ "loss": 0.1682,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.18061674008810572,
294
+ "grad_norm": 109924.9921875,
295
+ "learning_rate": 1.927753303964758e-05,
296
+ "loss": 0.1332,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.18502202643171806,
301
+ "grad_norm": 110553.6328125,
302
+ "learning_rate": 1.925991189427313e-05,
303
+ "loss": 0.156,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.1894273127753304,
308
+ "grad_norm": 174909.78125,
309
+ "learning_rate": 1.924229074889868e-05,
310
+ "loss": 0.1264,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.19383259911894274,
315
+ "grad_norm": 93096.015625,
316
+ "learning_rate": 1.922466960352423e-05,
317
+ "loss": 0.1483,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.19823788546255505,
322
+ "grad_norm": 148515.625,
323
+ "learning_rate": 1.9207048458149783e-05,
324
+ "loss": 0.2345,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.2026431718061674,
329
+ "grad_norm": 147129.25,
330
+ "learning_rate": 1.9189427312775332e-05,
331
+ "loss": 0.1201,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.20704845814977973,
336
+ "grad_norm": 187686.65625,
337
+ "learning_rate": 1.917180616740088e-05,
338
+ "loss": 0.1236,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.21145374449339208,
343
+ "grad_norm": 197271.859375,
344
+ "learning_rate": 1.9154185022026434e-05,
345
+ "loss": 0.1518,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.21585903083700442,
350
+ "grad_norm": 113562.8828125,
351
+ "learning_rate": 1.9136563876651983e-05,
352
+ "loss": 0.1016,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.22026431718061673,
357
+ "grad_norm": 161111.78125,
358
+ "learning_rate": 1.9118942731277536e-05,
359
+ "loss": 0.1265,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.22466960352422907,
364
+ "grad_norm": 118850.125,
365
+ "learning_rate": 1.9101321585903085e-05,
366
+ "loss": 0.141,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.2290748898678414,
371
+ "grad_norm": 125884.6015625,
372
+ "learning_rate": 1.9083700440528637e-05,
373
+ "loss": 0.1376,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.23348017621145375,
378
+ "grad_norm": 148464.078125,
379
+ "learning_rate": 1.9066079295154187e-05,
380
+ "loss": 0.1456,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.23788546255506607,
385
+ "grad_norm": 77475.3046875,
386
+ "learning_rate": 1.904845814977974e-05,
387
+ "loss": 0.1144,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.2422907488986784,
392
+ "grad_norm": 140790.203125,
393
+ "learning_rate": 1.9030837004405288e-05,
394
+ "loss": 0.1432,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.24669603524229075,
399
+ "grad_norm": 116215.90625,
400
+ "learning_rate": 1.9013215859030837e-05,
401
+ "loss": 0.1661,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.2511013215859031,
406
+ "grad_norm": 141195.609375,
407
+ "learning_rate": 1.899559471365639e-05,
408
+ "loss": 0.1013,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.2555066079295154,
413
+ "grad_norm": 109915.46875,
414
+ "learning_rate": 1.897797356828194e-05,
415
+ "loss": 0.1296,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.2599118942731278,
420
+ "grad_norm": 80954.6796875,
421
+ "learning_rate": 1.8960352422907492e-05,
422
+ "loss": 0.1072,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.2643171806167401,
427
+ "grad_norm": 160752.921875,
428
+ "learning_rate": 1.894273127753304e-05,
429
+ "loss": 0.0952,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.2687224669603524,
434
+ "grad_norm": 181410.125,
435
+ "learning_rate": 1.8925110132158594e-05,
436
+ "loss": 0.1439,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.27312775330396477,
441
+ "grad_norm": 88055.2265625,
442
+ "learning_rate": 1.8907488986784143e-05,
443
+ "loss": 0.126,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.2775330396475771,
448
+ "grad_norm": 295895.375,
449
+ "learning_rate": 1.8889867841409692e-05,
450
+ "loss": 0.1468,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.28193832599118945,
455
+ "grad_norm": 100781.8671875,
456
+ "learning_rate": 1.8872246696035245e-05,
457
+ "loss": 0.1115,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.28634361233480177,
462
+ "grad_norm": 81134.4375,
463
+ "learning_rate": 1.8854625550660794e-05,
464
+ "loss": 0.1283,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.2907488986784141,
469
+ "grad_norm": 113947.65625,
470
+ "learning_rate": 1.8837004405286346e-05,
471
+ "loss": 0.149,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.29515418502202645,
476
+ "grad_norm": 93833.7265625,
477
+ "learning_rate": 1.8819383259911895e-05,
478
+ "loss": 0.1112,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.29955947136563876,
483
+ "grad_norm": 154168.203125,
484
+ "learning_rate": 1.8801762114537448e-05,
485
+ "loss": 0.1433,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.3039647577092511,
490
+ "grad_norm": 160026.171875,
491
+ "learning_rate": 1.8784140969162997e-05,
492
+ "loss": 0.0963,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.30837004405286345,
497
+ "grad_norm": 318680.5,
498
+ "learning_rate": 1.8766519823788546e-05,
499
+ "loss": 0.1234,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.31277533039647576,
504
+ "grad_norm": 105961.140625,
505
+ "learning_rate": 1.87488986784141e-05,
506
+ "loss": 0.0938,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.31718061674008813,
511
+ "grad_norm": 98187.046875,
512
+ "learning_rate": 1.8731277533039648e-05,
513
+ "loss": 0.1424,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.32158590308370044,
518
+ "grad_norm": 84516.953125,
519
+ "learning_rate": 1.87136563876652e-05,
520
+ "loss": 0.12,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.32599118942731276,
525
+ "grad_norm": 224001.703125,
526
+ "learning_rate": 1.869603524229075e-05,
527
+ "loss": 0.1175,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.3303964757709251,
532
+ "grad_norm": 155620.953125,
533
+ "learning_rate": 1.8678414096916303e-05,
534
+ "loss": 0.1075,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.33480176211453744,
539
+ "grad_norm": 117204.7421875,
540
+ "learning_rate": 1.8660792951541852e-05,
541
+ "loss": 0.0985,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.3392070484581498,
546
+ "grad_norm": 113135.15625,
547
+ "learning_rate": 1.86431718061674e-05,
548
+ "loss": 0.107,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.3436123348017621,
553
+ "grad_norm": 52815.0625,
554
+ "learning_rate": 1.8625550660792953e-05,
555
+ "loss": 0.101,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.34801762114537443,
560
+ "grad_norm": 90544.8828125,
561
+ "learning_rate": 1.8607929515418503e-05,
562
+ "loss": 0.117,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.3524229074889868,
567
+ "grad_norm": 69778.9921875,
568
+ "learning_rate": 1.8590308370044055e-05,
569
+ "loss": 0.1292,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.3568281938325991,
574
+ "grad_norm": 119399.484375,
575
+ "learning_rate": 1.8572687224669604e-05,
576
+ "loss": 0.0989,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.36123348017621143,
581
+ "grad_norm": 91506.125,
582
+ "learning_rate": 1.8555066079295157e-05,
583
+ "loss": 0.0897,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.3656387665198238,
588
+ "grad_norm": 82248.2265625,
589
+ "learning_rate": 1.8537444933920706e-05,
590
+ "loss": 0.0867,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.3700440528634361,
595
+ "grad_norm": 99124.671875,
596
+ "learning_rate": 1.8519823788546255e-05,
597
+ "loss": 0.0877,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.3744493392070485,
602
+ "grad_norm": 99704.7265625,
603
+ "learning_rate": 1.8502202643171808e-05,
604
+ "loss": 0.0996,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.3788546255506608,
609
+ "grad_norm": 153517.046875,
610
+ "learning_rate": 1.8484581497797357e-05,
611
+ "loss": 0.1302,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.3832599118942731,
616
+ "grad_norm": 91006.6640625,
617
+ "learning_rate": 1.846696035242291e-05,
618
+ "loss": 0.0861,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.3876651982378855,
623
+ "grad_norm": 327581.59375,
624
+ "learning_rate": 1.844933920704846e-05,
625
+ "loss": 0.1338,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.3920704845814978,
630
+ "grad_norm": 124674.0390625,
631
+ "learning_rate": 1.843171806167401e-05,
632
+ "loss": 0.0835,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.3964757709251101,
637
+ "grad_norm": 192952.09375,
638
+ "learning_rate": 1.841409691629956e-05,
639
+ "loss": 0.113,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.4008810572687225,
644
+ "grad_norm": 111717.0390625,
645
+ "learning_rate": 1.839647577092511e-05,
646
+ "loss": 0.1334,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.4052863436123348,
651
+ "grad_norm": 132410.96875,
652
+ "learning_rate": 1.8378854625550662e-05,
653
+ "loss": 0.1233,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.40969162995594716,
658
+ "grad_norm": 120118.8359375,
659
+ "learning_rate": 1.836123348017621e-05,
660
+ "loss": 0.1272,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.41409691629955947,
665
+ "grad_norm": 89015.53125,
666
+ "learning_rate": 1.8343612334801764e-05,
667
+ "loss": 0.0746,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.4185022026431718,
672
+ "grad_norm": 146248.03125,
673
+ "learning_rate": 1.8325991189427313e-05,
674
+ "loss": 0.1042,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.42290748898678415,
679
+ "grad_norm": 71374.6171875,
680
+ "learning_rate": 1.8308370044052866e-05,
681
+ "loss": 0.1077,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.42731277533039647,
686
+ "grad_norm": 81359.4453125,
687
+ "learning_rate": 1.8290748898678415e-05,
688
+ "loss": 0.1554,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.43171806167400884,
693
+ "grad_norm": 154972.984375,
694
+ "learning_rate": 1.8273127753303964e-05,
695
+ "loss": 0.1039,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.43612334801762115,
700
+ "grad_norm": 62776.6484375,
701
+ "learning_rate": 1.8255506607929517e-05,
702
+ "loss": 0.08,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.44052863436123346,
707
+ "grad_norm": 129083.4140625,
708
+ "learning_rate": 1.8237885462555066e-05,
709
+ "loss": 0.1517,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.44493392070484583,
714
+ "grad_norm": 128397.796875,
715
+ "learning_rate": 1.822026431718062e-05,
716
+ "loss": 0.1039,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.44933920704845814,
721
+ "grad_norm": 175972.28125,
722
+ "learning_rate": 1.8202643171806168e-05,
723
+ "loss": 0.1032,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.45374449339207046,
728
+ "grad_norm": 106463.9296875,
729
+ "learning_rate": 1.818502202643172e-05,
730
+ "loss": 0.1073,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.4581497797356828,
735
+ "grad_norm": 109430.453125,
736
+ "learning_rate": 1.816740088105727e-05,
737
+ "loss": 0.0928,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.46255506607929514,
742
+ "grad_norm": 138073.25,
743
+ "learning_rate": 1.814977973568282e-05,
744
+ "loss": 0.0627,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.4669603524229075,
749
+ "grad_norm": 104565.8046875,
750
+ "learning_rate": 1.813215859030837e-05,
751
+ "loss": 0.1008,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.4713656387665198,
756
+ "grad_norm": 118428.40625,
757
+ "learning_rate": 1.811453744493392e-05,
758
+ "loss": 0.1062,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.47577092511013214,
763
+ "grad_norm": 98266.734375,
764
+ "learning_rate": 1.8096916299559473e-05,
765
+ "loss": 0.1242,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.4801762114537445,
770
+ "grad_norm": 142845.421875,
771
+ "learning_rate": 1.8079295154185022e-05,
772
+ "loss": 0.1271,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.4845814977973568,
777
+ "grad_norm": 151209.21875,
778
+ "learning_rate": 1.8061674008810575e-05,
779
+ "loss": 0.0677,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.4889867841409692,
784
+ "grad_norm": 97798.6796875,
785
+ "learning_rate": 1.8044052863436124e-05,
786
+ "loss": 0.0931,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.4933920704845815,
791
+ "grad_norm": 80213.59375,
792
+ "learning_rate": 1.8026431718061677e-05,
793
+ "loss": 0.072,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.4977973568281938,
798
+ "grad_norm": 33982.3515625,
799
+ "learning_rate": 1.8008810572687226e-05,
800
+ "loss": 0.0699,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.5022026431718062,
805
+ "grad_norm": 130265.4296875,
806
+ "learning_rate": 1.7991189427312775e-05,
807
+ "loss": 0.1226,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.5066079295154186,
812
+ "grad_norm": 49779.00390625,
813
+ "learning_rate": 1.7973568281938328e-05,
814
+ "loss": 0.1363,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.5110132158590308,
819
+ "grad_norm": 128675.6953125,
820
+ "learning_rate": 1.7955947136563877e-05,
821
+ "loss": 0.1344,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.5154185022026432,
826
+ "grad_norm": 123242.515625,
827
+ "learning_rate": 1.793832599118943e-05,
828
+ "loss": 0.0954,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.5198237885462555,
833
+ "grad_norm": 112552.0859375,
834
+ "learning_rate": 1.792070484581498e-05,
835
+ "loss": 0.0919,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.5242290748898678,
840
+ "grad_norm": 75417.375,
841
+ "learning_rate": 1.790308370044053e-05,
842
+ "loss": 0.0993,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.5286343612334802,
847
+ "grad_norm": 127870.3671875,
848
+ "learning_rate": 1.788546255506608e-05,
849
+ "loss": 0.1006,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.5330396475770925,
854
+ "grad_norm": 108290.8203125,
855
+ "learning_rate": 1.786784140969163e-05,
856
+ "loss": 0.1095,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.5374449339207048,
861
+ "grad_norm": 131361.828125,
862
+ "learning_rate": 1.7850220264317182e-05,
863
+ "loss": 0.0967,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.5418502202643172,
868
+ "grad_norm": 164738.09375,
869
+ "learning_rate": 1.783259911894273e-05,
870
+ "loss": 0.1096,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.5462555066079295,
875
+ "grad_norm": 155269.90625,
876
+ "learning_rate": 1.7814977973568284e-05,
877
+ "loss": 0.0969,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.5506607929515418,
882
+ "grad_norm": 129207.3046875,
883
+ "learning_rate": 1.7797356828193833e-05,
884
+ "loss": 0.1261,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.5550660792951542,
889
+ "grad_norm": 297050.8125,
890
+ "learning_rate": 1.7779735682819386e-05,
891
+ "loss": 0.0901,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.5594713656387665,
896
+ "grad_norm": 110518.3359375,
897
+ "learning_rate": 1.7762114537444935e-05,
898
+ "loss": 0.0647,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.5638766519823789,
903
+ "grad_norm": 73083.71875,
904
+ "learning_rate": 1.7744493392070484e-05,
905
+ "loss": 0.0966,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.5682819383259912,
910
+ "grad_norm": 120506.2265625,
911
+ "learning_rate": 1.7726872246696037e-05,
912
+ "loss": 0.0903,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.5726872246696035,
917
+ "grad_norm": 81893.8515625,
918
+ "learning_rate": 1.7709251101321586e-05,
919
+ "loss": 0.1315,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.5770925110132159,
924
+ "grad_norm": 38761.71484375,
925
+ "learning_rate": 1.769162995594714e-05,
926
+ "loss": 0.0828,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.5814977973568282,
931
+ "grad_norm": 174716.234375,
932
+ "learning_rate": 1.767400881057269e-05,
933
+ "loss": 0.134,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.5859030837004405,
938
+ "grad_norm": 72472.8671875,
939
+ "learning_rate": 1.765638766519824e-05,
940
+ "loss": 0.1047,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.5903083700440529,
945
+ "grad_norm": 106439.15625,
946
+ "learning_rate": 1.763876651982379e-05,
947
+ "loss": 0.0769,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.5947136563876652,
952
+ "grad_norm": 181449.84375,
953
+ "learning_rate": 1.762114537444934e-05,
954
+ "loss": 0.1306,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.5991189427312775,
959
+ "grad_norm": 89675.6171875,
960
+ "learning_rate": 1.760352422907489e-05,
961
+ "loss": 0.111,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.6035242290748899,
966
+ "grad_norm": 99200.21875,
967
+ "learning_rate": 1.758590308370044e-05,
968
+ "loss": 0.1079,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.6079295154185022,
973
+ "grad_norm": 173344.875,
974
+ "learning_rate": 1.7568281938325993e-05,
975
+ "loss": 0.0785,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.6123348017621145,
980
+ "grad_norm": 98200.265625,
981
+ "learning_rate": 1.7550660792951545e-05,
982
+ "loss": 0.0901,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.6167400881057269,
987
+ "grad_norm": 77085.3359375,
988
+ "learning_rate": 1.7533039647577095e-05,
989
+ "loss": 0.0612,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.6211453744493393,
994
+ "grad_norm": 96442.0234375,
995
+ "learning_rate": 1.7515418502202644e-05,
996
+ "loss": 0.1305,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.6255506607929515,
1001
+ "grad_norm": 391359.71875,
1002
+ "learning_rate": 1.7497797356828193e-05,
1003
+ "loss": 0.13,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.6299559471365639,
1008
+ "grad_norm": 116567.65625,
1009
+ "learning_rate": 1.7480176211453746e-05,
1010
+ "loss": 0.0964,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.6343612334801763,
1015
+ "grad_norm": 143271.65625,
1016
+ "learning_rate": 1.7462555066079295e-05,
1017
+ "loss": 0.1161,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.6387665198237885,
1022
+ "grad_norm": 286173.40625,
1023
+ "learning_rate": 1.7444933920704847e-05,
1024
+ "loss": 0.0977,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.6431718061674009,
1029
+ "grad_norm": 93874.2578125,
1030
+ "learning_rate": 1.74273127753304e-05,
1031
+ "loss": 0.0932,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.6475770925110133,
1036
+ "grad_norm": 124057.8828125,
1037
+ "learning_rate": 1.740969162995595e-05,
1038
+ "loss": 0.118,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.6519823788546255,
1043
+ "grad_norm": 144435.03125,
1044
+ "learning_rate": 1.73920704845815e-05,
1045
+ "loss": 0.1053,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.6563876651982379,
1050
+ "grad_norm": 72584.375,
1051
+ "learning_rate": 1.7374449339207047e-05,
1052
+ "loss": 0.0731,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.6607929515418502,
1057
+ "grad_norm": 116053.421875,
1058
+ "learning_rate": 1.73568281938326e-05,
1059
+ "loss": 0.0847,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.6651982378854625,
1064
+ "grad_norm": 148674.921875,
1065
+ "learning_rate": 1.7339207048458153e-05,
1066
+ "loss": 0.1276,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.6696035242290749,
1071
+ "grad_norm": 84335.3515625,
1072
+ "learning_rate": 1.7321585903083702e-05,
1073
+ "loss": 0.1314,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.6740088105726872,
1078
+ "grad_norm": 187770.640625,
1079
+ "learning_rate": 1.7303964757709254e-05,
1080
+ "loss": 0.0957,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.6784140969162996,
1085
+ "grad_norm": 191359.46875,
1086
+ "learning_rate": 1.7286343612334804e-05,
1087
+ "loss": 0.1358,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.6828193832599119,
1092
+ "grad_norm": 58638.65625,
1093
+ "learning_rate": 1.7268722466960356e-05,
1094
+ "loss": 0.077,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.6872246696035242,
1099
+ "grad_norm": 166379.5625,
1100
+ "learning_rate": 1.7251101321585902e-05,
1101
+ "loss": 0.0536,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.6916299559471366,
1106
+ "grad_norm": 161942.0625,
1107
+ "learning_rate": 1.7233480176211454e-05,
1108
+ "loss": 0.0919,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.6960352422907489,
1113
+ "grad_norm": 149369.109375,
1114
+ "learning_rate": 1.7215859030837007e-05,
1115
+ "loss": 0.113,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.7004405286343612,
1120
+ "grad_norm": 98025.09375,
1121
+ "learning_rate": 1.7198237885462556e-05,
1122
+ "loss": 0.0557,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.7048458149779736,
1127
+ "grad_norm": 77845.296875,
1128
+ "learning_rate": 1.718061674008811e-05,
1129
+ "loss": 0.0858,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 0.7092511013215859,
1134
+ "grad_norm": 229359.421875,
1135
+ "learning_rate": 1.7162995594713658e-05,
1136
+ "loss": 0.0905,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 0.7136563876651982,
1141
+ "grad_norm": 114946.9453125,
1142
+ "learning_rate": 1.714537444933921e-05,
1143
+ "loss": 0.0855,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 0.7180616740088106,
1148
+ "grad_norm": 105548.078125,
1149
+ "learning_rate": 1.712775330396476e-05,
1150
+ "loss": 0.0878,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 0.7224669603524229,
1155
+ "grad_norm": 84087.8203125,
1156
+ "learning_rate": 1.711013215859031e-05,
1157
+ "loss": 0.0718,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 0.7268722466960352,
1162
+ "grad_norm": 63263.57421875,
1163
+ "learning_rate": 1.709251101321586e-05,
1164
+ "loss": 0.0656,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 0.7312775330396476,
1169
+ "grad_norm": 155470.1875,
1170
+ "learning_rate": 1.707488986784141e-05,
1171
+ "loss": 0.1277,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 0.73568281938326,
1176
+ "grad_norm": 113818.9140625,
1177
+ "learning_rate": 1.7057268722466963e-05,
1178
+ "loss": 0.1506,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 0.7400881057268722,
1183
+ "grad_norm": 207417.890625,
1184
+ "learning_rate": 1.7039647577092512e-05,
1185
+ "loss": 0.1078,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 0.7444933920704846,
1190
+ "grad_norm": 96532.390625,
1191
+ "learning_rate": 1.7022026431718065e-05,
1192
+ "loss": 0.1288,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 0.748898678414097,
1197
+ "grad_norm": 97353.796875,
1198
+ "learning_rate": 1.7004405286343614e-05,
1199
+ "loss": 0.0624,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 0.7533039647577092,
1204
+ "grad_norm": 152348.625,
1205
+ "learning_rate": 1.6986784140969163e-05,
1206
+ "loss": 0.0837,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 0.7577092511013216,
1211
+ "grad_norm": 92298.265625,
1212
+ "learning_rate": 1.6969162995594716e-05,
1213
+ "loss": 0.0918,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 0.762114537444934,
1218
+ "grad_norm": 97720.921875,
1219
+ "learning_rate": 1.6951541850220265e-05,
1220
+ "loss": 0.1112,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 0.7665198237885462,
1225
+ "grad_norm": 88505.6640625,
1226
+ "learning_rate": 1.6933920704845818e-05,
1227
+ "loss": 0.1022,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 0.7709251101321586,
1232
+ "grad_norm": 58780.78125,
1233
+ "learning_rate": 1.6916299559471367e-05,
1234
+ "loss": 0.1064,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 0.775330396475771,
1239
+ "grad_norm": 102623.4140625,
1240
+ "learning_rate": 1.689867841409692e-05,
1241
+ "loss": 0.077,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 0.7797356828193832,
1246
+ "grad_norm": 57006.04296875,
1247
+ "learning_rate": 1.688105726872247e-05,
1248
+ "loss": 0.1078,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 0.7841409691629956,
1253
+ "grad_norm": 110325.6171875,
1254
+ "learning_rate": 1.6863436123348018e-05,
1255
+ "loss": 0.0716,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 0.788546255506608,
1260
+ "grad_norm": 114105.796875,
1261
+ "learning_rate": 1.684581497797357e-05,
1262
+ "loss": 0.0678,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 0.7929515418502202,
1267
+ "grad_norm": 65852.2734375,
1268
+ "learning_rate": 1.682819383259912e-05,
1269
+ "loss": 0.0711,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 0.7973568281938326,
1274
+ "grad_norm": 72375.2265625,
1275
+ "learning_rate": 1.6810572687224672e-05,
1276
+ "loss": 0.1311,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 0.801762114537445,
1281
+ "grad_norm": 122892.6328125,
1282
+ "learning_rate": 1.679295154185022e-05,
1283
+ "loss": 0.0975,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 0.8061674008810573,
1288
+ "grad_norm": 194237.671875,
1289
+ "learning_rate": 1.6775330396475774e-05,
1290
+ "loss": 0.1006,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 0.8105726872246696,
1295
+ "grad_norm": 129793.3671875,
1296
+ "learning_rate": 1.6757709251101323e-05,
1297
+ "loss": 0.1123,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 0.8149779735682819,
1302
+ "grad_norm": 82357.6640625,
1303
+ "learning_rate": 1.6740088105726872e-05,
1304
+ "loss": 0.0773,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 0.8193832599118943,
1309
+ "grad_norm": 67192.2578125,
1310
+ "learning_rate": 1.6722466960352425e-05,
1311
+ "loss": 0.0768,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 0.8237885462555066,
1316
+ "grad_norm": 104339.375,
1317
+ "learning_rate": 1.6704845814977974e-05,
1318
+ "loss": 0.0934,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 0.8281938325991189,
1323
+ "grad_norm": 141351.65625,
1324
+ "learning_rate": 1.6687224669603527e-05,
1325
+ "loss": 0.0798,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 0.8325991189427313,
1330
+ "grad_norm": 76349.3671875,
1331
+ "learning_rate": 1.6669603524229076e-05,
1332
+ "loss": 0.073,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 0.8370044052863436,
1337
+ "grad_norm": 78456.09375,
1338
+ "learning_rate": 1.665198237885463e-05,
1339
+ "loss": 0.0569,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 0.8414096916299559,
1344
+ "grad_norm": 99650.015625,
1345
+ "learning_rate": 1.6634361233480178e-05,
1346
+ "loss": 0.0957,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 0.8458149779735683,
1351
+ "grad_norm": 91759.796875,
1352
+ "learning_rate": 1.6616740088105727e-05,
1353
+ "loss": 0.0595,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 0.8502202643171806,
1358
+ "grad_norm": 120993.03125,
1359
+ "learning_rate": 1.659911894273128e-05,
1360
+ "loss": 0.0693,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 0.8546255506607929,
1365
+ "grad_norm": 78032.421875,
1366
+ "learning_rate": 1.658149779735683e-05,
1367
+ "loss": 0.0719,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 0.8590308370044053,
1372
+ "grad_norm": 233544.96875,
1373
+ "learning_rate": 1.656387665198238e-05,
1374
+ "loss": 0.0661,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 0.8634361233480177,
1379
+ "grad_norm": 48958.75390625,
1380
+ "learning_rate": 1.654625550660793e-05,
1381
+ "loss": 0.0961,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 0.8678414096916299,
1386
+ "grad_norm": 109371.53125,
1387
+ "learning_rate": 1.6528634361233483e-05,
1388
+ "loss": 0.0789,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 0.8722466960352423,
1393
+ "grad_norm": 193694.890625,
1394
+ "learning_rate": 1.6511013215859032e-05,
1395
+ "loss": 0.1096,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 0.8766519823788547,
1400
+ "grad_norm": 37518.26171875,
1401
+ "learning_rate": 1.649339207048458e-05,
1402
+ "loss": 0.1039,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 0.8810572687224669,
1407
+ "grad_norm": 133068.5625,
1408
+ "learning_rate": 1.6475770925110134e-05,
1409
+ "loss": 0.0726,
1410
+ "step": 1000
1411
+ },
1412
+ {
1413
+ "epoch": 0.8854625550660793,
1414
+ "grad_norm": 93989.75,
1415
+ "learning_rate": 1.6458149779735683e-05,
1416
+ "loss": 0.0908,
1417
+ "step": 1005
1418
+ },
1419
+ {
1420
+ "epoch": 0.8898678414096917,
1421
+ "grad_norm": 106104.7734375,
1422
+ "learning_rate": 1.6440528634361236e-05,
1423
+ "loss": 0.0891,
1424
+ "step": 1010
1425
+ },
1426
+ {
1427
+ "epoch": 0.8942731277533039,
1428
+ "grad_norm": 57476.76171875,
1429
+ "learning_rate": 1.6422907488986785e-05,
1430
+ "loss": 0.105,
1431
+ "step": 1015
1432
+ },
1433
+ {
1434
+ "epoch": 0.8986784140969163,
1435
+ "grad_norm": 95132.3984375,
1436
+ "learning_rate": 1.6405286343612337e-05,
1437
+ "loss": 0.0783,
1438
+ "step": 1020
1439
+ },
1440
+ {
1441
+ "epoch": 0.9030837004405287,
1442
+ "grad_norm": 132834.140625,
1443
+ "learning_rate": 1.6387665198237887e-05,
1444
+ "loss": 0.08,
1445
+ "step": 1025
1446
+ },
1447
+ {
1448
+ "epoch": 0.9074889867841409,
1449
+ "grad_norm": 76089.6484375,
1450
+ "learning_rate": 1.637004405286344e-05,
1451
+ "loss": 0.1149,
1452
+ "step": 1030
1453
+ },
1454
+ {
1455
+ "epoch": 0.9118942731277533,
1456
+ "grad_norm": 63661.88671875,
1457
+ "learning_rate": 1.635242290748899e-05,
1458
+ "loss": 0.0672,
1459
+ "step": 1035
1460
+ },
1461
+ {
1462
+ "epoch": 0.9162995594713657,
1463
+ "grad_norm": 72133.03125,
1464
+ "learning_rate": 1.6334801762114538e-05,
1465
+ "loss": 0.0728,
1466
+ "step": 1040
1467
+ },
1468
+ {
1469
+ "epoch": 0.920704845814978,
1470
+ "grad_norm": 100398.6015625,
1471
+ "learning_rate": 1.631718061674009e-05,
1472
+ "loss": 0.1223,
1473
+ "step": 1045
1474
+ },
1475
+ {
1476
+ "epoch": 0.9251101321585903,
1477
+ "grad_norm": 279089.8125,
1478
+ "learning_rate": 1.629955947136564e-05,
1479
+ "loss": 0.1427,
1480
+ "step": 1050
1481
+ },
1482
+ {
1483
+ "epoch": 0.9295154185022027,
1484
+ "grad_norm": 62917.9765625,
1485
+ "learning_rate": 1.6281938325991192e-05,
1486
+ "loss": 0.1203,
1487
+ "step": 1055
1488
+ },
1489
+ {
1490
+ "epoch": 0.933920704845815,
1491
+ "grad_norm": 140988.796875,
1492
+ "learning_rate": 1.626431718061674e-05,
1493
+ "loss": 0.0967,
1494
+ "step": 1060
1495
+ },
1496
+ {
1497
+ "epoch": 0.9383259911894273,
1498
+ "grad_norm": 259548.109375,
1499
+ "learning_rate": 1.6246696035242294e-05,
1500
+ "loss": 0.0839,
1501
+ "step": 1065
1502
+ },
1503
+ {
1504
+ "epoch": 0.9427312775330396,
1505
+ "grad_norm": 67071.7265625,
1506
+ "learning_rate": 1.6229074889867843e-05,
1507
+ "loss": 0.0556,
1508
+ "step": 1070
1509
+ },
1510
+ {
1511
+ "epoch": 0.947136563876652,
1512
+ "grad_norm": 92416.890625,
1513
+ "learning_rate": 1.6211453744493392e-05,
1514
+ "loss": 0.0858,
1515
+ "step": 1075
1516
+ },
1517
+ {
1518
+ "epoch": 0.9515418502202643,
1519
+ "grad_norm": 44402.7578125,
1520
+ "learning_rate": 1.6193832599118945e-05,
1521
+ "loss": 0.0548,
1522
+ "step": 1080
1523
+ },
1524
+ {
1525
+ "epoch": 0.9559471365638766,
1526
+ "grad_norm": 167386.671875,
1527
+ "learning_rate": 1.6176211453744494e-05,
1528
+ "loss": 0.094,
1529
+ "step": 1085
1530
+ },
1531
+ {
1532
+ "epoch": 0.960352422907489,
1533
+ "grad_norm": 47520.0546875,
1534
+ "learning_rate": 1.6158590308370046e-05,
1535
+ "loss": 0.0758,
1536
+ "step": 1090
1537
+ },
1538
+ {
1539
+ "epoch": 0.9647577092511013,
1540
+ "grad_norm": 131369.015625,
1541
+ "learning_rate": 1.6140969162995596e-05,
1542
+ "loss": 0.0675,
1543
+ "step": 1095
1544
+ },
1545
+ {
1546
+ "epoch": 0.9691629955947136,
1547
+ "grad_norm": 52430.921875,
1548
+ "learning_rate": 1.6123348017621148e-05,
1549
+ "loss": 0.091,
1550
+ "step": 1100
1551
+ },
1552
+ {
1553
+ "epoch": 0.973568281938326,
1554
+ "grad_norm": 114321.75,
1555
+ "learning_rate": 1.6105726872246697e-05,
1556
+ "loss": 0.0827,
1557
+ "step": 1105
1558
+ },
1559
+ {
1560
+ "epoch": 0.9779735682819384,
1561
+ "grad_norm": 184216.453125,
1562
+ "learning_rate": 1.6088105726872247e-05,
1563
+ "loss": 0.1047,
1564
+ "step": 1110
1565
+ },
1566
+ {
1567
+ "epoch": 0.9823788546255506,
1568
+ "grad_norm": 147871.734375,
1569
+ "learning_rate": 1.60704845814978e-05,
1570
+ "loss": 0.0991,
1571
+ "step": 1115
1572
+ },
1573
+ {
1574
+ "epoch": 0.986784140969163,
1575
+ "grad_norm": 144256.390625,
1576
+ "learning_rate": 1.605286343612335e-05,
1577
+ "loss": 0.084,
1578
+ "step": 1120
1579
+ },
1580
+ {
1581
+ "epoch": 0.9911894273127754,
1582
+ "grad_norm": 57534.3359375,
1583
+ "learning_rate": 1.60352422907489e-05,
1584
+ "loss": 0.097,
1585
+ "step": 1125
1586
+ },
1587
+ {
1588
+ "epoch": 0.9955947136563876,
1589
+ "grad_norm": 77203.0546875,
1590
+ "learning_rate": 1.601762114537445e-05,
1591
+ "loss": 0.0529,
1592
+ "step": 1130
1593
+ },
1594
+ {
1595
+ "epoch": 1.0,
1596
+ "grad_norm": 103384.796875,
1597
+ "learning_rate": 1.6000000000000003e-05,
1598
+ "loss": 0.0537,
1599
+ "step": 1135
1600
+ },
1601
+ {
1602
+ "epoch": 1.0,
1603
+ "eval_accuracy_score": 0.9745103887066309,
1604
+ "eval_f1": 0.8684953610326744,
1605
+ "eval_loss": 0.07397506386041641,
1606
+ "eval_precision": 0.8514929800276844,
1607
+ "eval_recall": 0.8861905741922206,
1608
+ "eval_runtime": 56.8324,
1609
+ "eval_samples_per_second": 35.508,
1610
+ "eval_steps_per_second": 4.452,
1611
+ "step": 1135
1612
+ }
1613
+ ],
1614
+ "logging_steps": 5,
1615
+ "max_steps": 5675,
1616
+ "num_input_tokens_seen": 0,
1617
+ "num_train_epochs": 5,
1618
+ "save_steps": 500,
1619
+ "stateful_callbacks": {
1620
+ "TrainerControl": {
1621
+ "args": {
1622
+ "should_epoch_stop": false,
1623
+ "should_evaluate": false,
1624
+ "should_log": false,
1625
+ "should_save": true,
1626
+ "should_training_stop": false
1627
+ },
1628
+ "attributes": {}
1629
+ }
1630
+ },
1631
+ "total_flos": 2692060776400446.0,
1632
+ "train_batch_size": 8,
1633
+ "trial_name": null,
1634
+ "trial_params": null
1635
+ }
checkpoint-1135/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbf6ee382bdf4fc7a704ade5bbd728f43b83dd5b0a0779f6ffdaf51f9e4d6d2
3
+ size 5432
checkpoint-1135/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2270/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ElectraForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "embedding_size": 1024,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "id2label": {
12
+ "0": "O",
13
+ "1": "B-antineoplastic",
14
+ "2": "B-cancertype",
15
+ "3": "B-carcinogen",
16
+ "4": "B-negative",
17
+ "5": "I-antineoplastic",
18
+ "6": "I-cancertype",
19
+ "7": "I-carcinogen",
20
+ "8": "I-negative"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "label2id": {
25
+ "B-antineoplastic": 1,
26
+ "B-cancertype": 2,
27
+ "B-carcinogen": 3,
28
+ "B-negative": 4,
29
+ "I-antineoplastic": 5,
30
+ "I-cancertype": 6,
31
+ "I-carcinogen": 7,
32
+ "I-negative": 8,
33
+ "O": 0
34
+ },
35
+ "layer_norm_eps": 1e-12,
36
+ "max_position_embeddings": 512,
37
+ "model_type": "electra",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 24,
40
+ "pad_token_id": 0,
41
+ "position_embedding_type": "absolute",
42
+ "summary_activation": "gelu",
43
+ "summary_last_dropout": 0.1,
44
+ "summary_type": "first",
45
+ "summary_use_proj": true,
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.50.0",
48
+ "type_vocab_size": 2,
49
+ "use_cache": true,
50
+ "vocab_size": 28895
51
+ }
checkpoint-2270/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf6526ebf147fd4ef0dddc7d4c5533ee6d5e6be9f1fa5aad26986efa84a890c4
3
+ size 1329789812
checkpoint-2270/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51b7bb4f7fc0b09598b9220552a8e62a58d7c835c4163b1b43345e2a2747f20b
3
+ size 2659810855
checkpoint-2270/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ecc23c91f4df4244eb00f61c33025019466979df67ad933deb79112c5f00f08
3
+ size 14244
checkpoint-2270/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
3
+ size 988
checkpoint-2270/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83ce0b2ce0b468ea7b08fece0f4c1f21aae09fe7de4a0cda09bcd937913a9b4b
3
+ size 1064
checkpoint-2270/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-2270/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2270/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "ignore_mismatched_sizes": true,
50
+ "mask_token": "[MASK]",
51
+ "max_len": 512,
52
+ "model_max_length": 512,
53
+ "never_split": null,
54
+ "pad_token": "[PAD]",
55
+ "padding": true,
56
+ "sep_token": "[SEP]",
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "ElectraTokenizer",
60
+ "truncation": true,
61
+ "unk_token": "[UNK]"
62
+ }
checkpoint-2270/trainer_state.json ADDED
@@ -0,0 +1,3236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2270,
3
+ "best_metric": 0.06808020919561386,
4
+ "best_model_checkpoint": "/storage/BioM-ELECTRA-Large-SQuAD2_5e_update_lg/checkpoint-2270",
5
+ "epoch": 2.0,
6
+ "eval_steps": 5,
7
+ "global_step": 2270,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.004405286343612335,
14
+ "grad_norm": 463300.3125,
15
+ "learning_rate": 1.9982378854625554e-05,
16
+ "loss": 1.3393,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.00881057268722467,
21
+ "grad_norm": 123929.53125,
22
+ "learning_rate": 1.9964757709251103e-05,
23
+ "loss": 0.7964,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.013215859030837005,
28
+ "grad_norm": 103963.2109375,
29
+ "learning_rate": 1.9947136563876656e-05,
30
+ "loss": 0.6685,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.01762114537444934,
35
+ "grad_norm": 85550.5859375,
36
+ "learning_rate": 1.9929515418502202e-05,
37
+ "loss": 0.5623,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.022026431718061675,
42
+ "grad_norm": 122556.59375,
43
+ "learning_rate": 1.9911894273127754e-05,
44
+ "loss": 0.4474,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.02643171806167401,
49
+ "grad_norm": 122631.59375,
50
+ "learning_rate": 1.9894273127753304e-05,
51
+ "loss": 0.4729,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.030837004405286344,
56
+ "grad_norm": 84448.40625,
57
+ "learning_rate": 1.9876651982378856e-05,
58
+ "loss": 0.375,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.03524229074889868,
63
+ "grad_norm": 90985.546875,
64
+ "learning_rate": 1.985903083700441e-05,
65
+ "loss": 0.4128,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.039647577092511016,
70
+ "grad_norm": 83418.0,
71
+ "learning_rate": 1.9841409691629958e-05,
72
+ "loss": 0.3251,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.04405286343612335,
77
+ "grad_norm": 175418.140625,
78
+ "learning_rate": 1.982378854625551e-05,
79
+ "loss": 0.3585,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.048458149779735685,
84
+ "grad_norm": 136878.40625,
85
+ "learning_rate": 1.9806167400881056e-05,
86
+ "loss": 0.296,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.05286343612334802,
91
+ "grad_norm": 98344.71875,
92
+ "learning_rate": 1.978854625550661e-05,
93
+ "loss": 0.32,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.05726872246696035,
98
+ "grad_norm": 108601.3359375,
99
+ "learning_rate": 1.977092511013216e-05,
100
+ "loss": 0.2895,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.06167400881057269,
105
+ "grad_norm": 117893.046875,
106
+ "learning_rate": 1.975330396475771e-05,
107
+ "loss": 0.3008,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.06607929515418502,
112
+ "grad_norm": 92026.25,
113
+ "learning_rate": 1.9735682819383263e-05,
114
+ "loss": 0.2413,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.07048458149779736,
119
+ "grad_norm": 139038.234375,
120
+ "learning_rate": 1.9718061674008812e-05,
121
+ "loss": 0.2645,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.07488986784140969,
126
+ "grad_norm": 156623.421875,
127
+ "learning_rate": 1.9700440528634365e-05,
128
+ "loss": 0.2314,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.07929515418502203,
133
+ "grad_norm": 194062.578125,
134
+ "learning_rate": 1.9682819383259914e-05,
135
+ "loss": 0.2854,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.08370044052863436,
140
+ "grad_norm": 150748.421875,
141
+ "learning_rate": 1.9665198237885463e-05,
142
+ "loss": 0.2485,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.0881057268722467,
147
+ "grad_norm": 160828.71875,
148
+ "learning_rate": 1.9647577092511016e-05,
149
+ "loss": 0.2135,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.09251101321585903,
154
+ "grad_norm": 106817.6640625,
155
+ "learning_rate": 1.9629955947136565e-05,
156
+ "loss": 0.2011,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.09691629955947137,
161
+ "grad_norm": 268202.65625,
162
+ "learning_rate": 1.9612334801762118e-05,
163
+ "loss": 0.2114,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.1013215859030837,
168
+ "grad_norm": 139443.75,
169
+ "learning_rate": 1.9594713656387667e-05,
170
+ "loss": 0.2436,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.10572687224669604,
175
+ "grad_norm": 117417.0234375,
176
+ "learning_rate": 1.957709251101322e-05,
177
+ "loss": 0.1895,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.11013215859030837,
182
+ "grad_norm": 126792.234375,
183
+ "learning_rate": 1.955947136563877e-05,
184
+ "loss": 0.1937,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.1145374449339207,
189
+ "grad_norm": 167020.0625,
190
+ "learning_rate": 1.9541850220264318e-05,
191
+ "loss": 0.203,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.11894273127753303,
196
+ "grad_norm": 220364.109375,
197
+ "learning_rate": 1.952422907488987e-05,
198
+ "loss": 0.1573,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.12334801762114538,
203
+ "grad_norm": 233084.5,
204
+ "learning_rate": 1.950660792951542e-05,
205
+ "loss": 0.2019,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.1277533039647577,
210
+ "grad_norm": 125416.90625,
211
+ "learning_rate": 1.9488986784140972e-05,
212
+ "loss": 0.163,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.13215859030837004,
217
+ "grad_norm": 104147.4453125,
218
+ "learning_rate": 1.947136563876652e-05,
219
+ "loss": 0.1286,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.13656387665198239,
224
+ "grad_norm": 120634.4765625,
225
+ "learning_rate": 1.9453744493392074e-05,
226
+ "loss": 0.1525,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.14096916299559473,
231
+ "grad_norm": 141004.78125,
232
+ "learning_rate": 1.9436123348017623e-05,
233
+ "loss": 0.2162,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.14537444933920704,
238
+ "grad_norm": 229382.46875,
239
+ "learning_rate": 1.9418502202643172e-05,
240
+ "loss": 0.1294,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.14977973568281938,
245
+ "grad_norm": 213832.84375,
246
+ "learning_rate": 1.9400881057268725e-05,
247
+ "loss": 0.136,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.15418502202643172,
252
+ "grad_norm": 241189.96875,
253
+ "learning_rate": 1.9383259911894274e-05,
254
+ "loss": 0.1442,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.15859030837004406,
259
+ "grad_norm": 68183.6484375,
260
+ "learning_rate": 1.9365638766519827e-05,
261
+ "loss": 0.1559,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.16299559471365638,
266
+ "grad_norm": 83166.6796875,
267
+ "learning_rate": 1.9348017621145376e-05,
268
+ "loss": 0.1747,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.16740088105726872,
273
+ "grad_norm": 158394.59375,
274
+ "learning_rate": 1.933039647577093e-05,
275
+ "loss": 0.137,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.17180616740088106,
280
+ "grad_norm": 227525.71875,
281
+ "learning_rate": 1.9312775330396478e-05,
282
+ "loss": 0.1493,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.1762114537444934,
287
+ "grad_norm": 189944.125,
288
+ "learning_rate": 1.9295154185022027e-05,
289
+ "loss": 0.1682,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.18061674008810572,
294
+ "grad_norm": 109924.9921875,
295
+ "learning_rate": 1.927753303964758e-05,
296
+ "loss": 0.1332,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.18502202643171806,
301
+ "grad_norm": 110553.6328125,
302
+ "learning_rate": 1.925991189427313e-05,
303
+ "loss": 0.156,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.1894273127753304,
308
+ "grad_norm": 174909.78125,
309
+ "learning_rate": 1.924229074889868e-05,
310
+ "loss": 0.1264,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.19383259911894274,
315
+ "grad_norm": 93096.015625,
316
+ "learning_rate": 1.922466960352423e-05,
317
+ "loss": 0.1483,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.19823788546255505,
322
+ "grad_norm": 148515.625,
323
+ "learning_rate": 1.9207048458149783e-05,
324
+ "loss": 0.2345,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.2026431718061674,
329
+ "grad_norm": 147129.25,
330
+ "learning_rate": 1.9189427312775332e-05,
331
+ "loss": 0.1201,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.20704845814977973,
336
+ "grad_norm": 187686.65625,
337
+ "learning_rate": 1.917180616740088e-05,
338
+ "loss": 0.1236,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.21145374449339208,
343
+ "grad_norm": 197271.859375,
344
+ "learning_rate": 1.9154185022026434e-05,
345
+ "loss": 0.1518,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.21585903083700442,
350
+ "grad_norm": 113562.8828125,
351
+ "learning_rate": 1.9136563876651983e-05,
352
+ "loss": 0.1016,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.22026431718061673,
357
+ "grad_norm": 161111.78125,
358
+ "learning_rate": 1.9118942731277536e-05,
359
+ "loss": 0.1265,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.22466960352422907,
364
+ "grad_norm": 118850.125,
365
+ "learning_rate": 1.9101321585903085e-05,
366
+ "loss": 0.141,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.2290748898678414,
371
+ "grad_norm": 125884.6015625,
372
+ "learning_rate": 1.9083700440528637e-05,
373
+ "loss": 0.1376,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.23348017621145375,
378
+ "grad_norm": 148464.078125,
379
+ "learning_rate": 1.9066079295154187e-05,
380
+ "loss": 0.1456,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.23788546255506607,
385
+ "grad_norm": 77475.3046875,
386
+ "learning_rate": 1.904845814977974e-05,
387
+ "loss": 0.1144,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.2422907488986784,
392
+ "grad_norm": 140790.203125,
393
+ "learning_rate": 1.9030837004405288e-05,
394
+ "loss": 0.1432,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.24669603524229075,
399
+ "grad_norm": 116215.90625,
400
+ "learning_rate": 1.9013215859030837e-05,
401
+ "loss": 0.1661,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.2511013215859031,
406
+ "grad_norm": 141195.609375,
407
+ "learning_rate": 1.899559471365639e-05,
408
+ "loss": 0.1013,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.2555066079295154,
413
+ "grad_norm": 109915.46875,
414
+ "learning_rate": 1.897797356828194e-05,
415
+ "loss": 0.1296,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.2599118942731278,
420
+ "grad_norm": 80954.6796875,
421
+ "learning_rate": 1.8960352422907492e-05,
422
+ "loss": 0.1072,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.2643171806167401,
427
+ "grad_norm": 160752.921875,
428
+ "learning_rate": 1.894273127753304e-05,
429
+ "loss": 0.0952,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.2687224669603524,
434
+ "grad_norm": 181410.125,
435
+ "learning_rate": 1.8925110132158594e-05,
436
+ "loss": 0.1439,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.27312775330396477,
441
+ "grad_norm": 88055.2265625,
442
+ "learning_rate": 1.8907488986784143e-05,
443
+ "loss": 0.126,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.2775330396475771,
448
+ "grad_norm": 295895.375,
449
+ "learning_rate": 1.8889867841409692e-05,
450
+ "loss": 0.1468,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.28193832599118945,
455
+ "grad_norm": 100781.8671875,
456
+ "learning_rate": 1.8872246696035245e-05,
457
+ "loss": 0.1115,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.28634361233480177,
462
+ "grad_norm": 81134.4375,
463
+ "learning_rate": 1.8854625550660794e-05,
464
+ "loss": 0.1283,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.2907488986784141,
469
+ "grad_norm": 113947.65625,
470
+ "learning_rate": 1.8837004405286346e-05,
471
+ "loss": 0.149,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.29515418502202645,
476
+ "grad_norm": 93833.7265625,
477
+ "learning_rate": 1.8819383259911895e-05,
478
+ "loss": 0.1112,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.29955947136563876,
483
+ "grad_norm": 154168.203125,
484
+ "learning_rate": 1.8801762114537448e-05,
485
+ "loss": 0.1433,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.3039647577092511,
490
+ "grad_norm": 160026.171875,
491
+ "learning_rate": 1.8784140969162997e-05,
492
+ "loss": 0.0963,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.30837004405286345,
497
+ "grad_norm": 318680.5,
498
+ "learning_rate": 1.8766519823788546e-05,
499
+ "loss": 0.1234,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.31277533039647576,
504
+ "grad_norm": 105961.140625,
505
+ "learning_rate": 1.87488986784141e-05,
506
+ "loss": 0.0938,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.31718061674008813,
511
+ "grad_norm": 98187.046875,
512
+ "learning_rate": 1.8731277533039648e-05,
513
+ "loss": 0.1424,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.32158590308370044,
518
+ "grad_norm": 84516.953125,
519
+ "learning_rate": 1.87136563876652e-05,
520
+ "loss": 0.12,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.32599118942731276,
525
+ "grad_norm": 224001.703125,
526
+ "learning_rate": 1.869603524229075e-05,
527
+ "loss": 0.1175,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.3303964757709251,
532
+ "grad_norm": 155620.953125,
533
+ "learning_rate": 1.8678414096916303e-05,
534
+ "loss": 0.1075,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.33480176211453744,
539
+ "grad_norm": 117204.7421875,
540
+ "learning_rate": 1.8660792951541852e-05,
541
+ "loss": 0.0985,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.3392070484581498,
546
+ "grad_norm": 113135.15625,
547
+ "learning_rate": 1.86431718061674e-05,
548
+ "loss": 0.107,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.3436123348017621,
553
+ "grad_norm": 52815.0625,
554
+ "learning_rate": 1.8625550660792953e-05,
555
+ "loss": 0.101,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.34801762114537443,
560
+ "grad_norm": 90544.8828125,
561
+ "learning_rate": 1.8607929515418503e-05,
562
+ "loss": 0.117,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.3524229074889868,
567
+ "grad_norm": 69778.9921875,
568
+ "learning_rate": 1.8590308370044055e-05,
569
+ "loss": 0.1292,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.3568281938325991,
574
+ "grad_norm": 119399.484375,
575
+ "learning_rate": 1.8572687224669604e-05,
576
+ "loss": 0.0989,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.36123348017621143,
581
+ "grad_norm": 91506.125,
582
+ "learning_rate": 1.8555066079295157e-05,
583
+ "loss": 0.0897,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.3656387665198238,
588
+ "grad_norm": 82248.2265625,
589
+ "learning_rate": 1.8537444933920706e-05,
590
+ "loss": 0.0867,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.3700440528634361,
595
+ "grad_norm": 99124.671875,
596
+ "learning_rate": 1.8519823788546255e-05,
597
+ "loss": 0.0877,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.3744493392070485,
602
+ "grad_norm": 99704.7265625,
603
+ "learning_rate": 1.8502202643171808e-05,
604
+ "loss": 0.0996,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.3788546255506608,
609
+ "grad_norm": 153517.046875,
610
+ "learning_rate": 1.8484581497797357e-05,
611
+ "loss": 0.1302,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.3832599118942731,
616
+ "grad_norm": 91006.6640625,
617
+ "learning_rate": 1.846696035242291e-05,
618
+ "loss": 0.0861,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.3876651982378855,
623
+ "grad_norm": 327581.59375,
624
+ "learning_rate": 1.844933920704846e-05,
625
+ "loss": 0.1338,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.3920704845814978,
630
+ "grad_norm": 124674.0390625,
631
+ "learning_rate": 1.843171806167401e-05,
632
+ "loss": 0.0835,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.3964757709251101,
637
+ "grad_norm": 192952.09375,
638
+ "learning_rate": 1.841409691629956e-05,
639
+ "loss": 0.113,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.4008810572687225,
644
+ "grad_norm": 111717.0390625,
645
+ "learning_rate": 1.839647577092511e-05,
646
+ "loss": 0.1334,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.4052863436123348,
651
+ "grad_norm": 132410.96875,
652
+ "learning_rate": 1.8378854625550662e-05,
653
+ "loss": 0.1233,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.40969162995594716,
658
+ "grad_norm": 120118.8359375,
659
+ "learning_rate": 1.836123348017621e-05,
660
+ "loss": 0.1272,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.41409691629955947,
665
+ "grad_norm": 89015.53125,
666
+ "learning_rate": 1.8343612334801764e-05,
667
+ "loss": 0.0746,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.4185022026431718,
672
+ "grad_norm": 146248.03125,
673
+ "learning_rate": 1.8325991189427313e-05,
674
+ "loss": 0.1042,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.42290748898678415,
679
+ "grad_norm": 71374.6171875,
680
+ "learning_rate": 1.8308370044052866e-05,
681
+ "loss": 0.1077,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.42731277533039647,
686
+ "grad_norm": 81359.4453125,
687
+ "learning_rate": 1.8290748898678415e-05,
688
+ "loss": 0.1554,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.43171806167400884,
693
+ "grad_norm": 154972.984375,
694
+ "learning_rate": 1.8273127753303964e-05,
695
+ "loss": 0.1039,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.43612334801762115,
700
+ "grad_norm": 62776.6484375,
701
+ "learning_rate": 1.8255506607929517e-05,
702
+ "loss": 0.08,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.44052863436123346,
707
+ "grad_norm": 129083.4140625,
708
+ "learning_rate": 1.8237885462555066e-05,
709
+ "loss": 0.1517,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.44493392070484583,
714
+ "grad_norm": 128397.796875,
715
+ "learning_rate": 1.822026431718062e-05,
716
+ "loss": 0.1039,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.44933920704845814,
721
+ "grad_norm": 175972.28125,
722
+ "learning_rate": 1.8202643171806168e-05,
723
+ "loss": 0.1032,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.45374449339207046,
728
+ "grad_norm": 106463.9296875,
729
+ "learning_rate": 1.818502202643172e-05,
730
+ "loss": 0.1073,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.4581497797356828,
735
+ "grad_norm": 109430.453125,
736
+ "learning_rate": 1.816740088105727e-05,
737
+ "loss": 0.0928,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.46255506607929514,
742
+ "grad_norm": 138073.25,
743
+ "learning_rate": 1.814977973568282e-05,
744
+ "loss": 0.0627,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.4669603524229075,
749
+ "grad_norm": 104565.8046875,
750
+ "learning_rate": 1.813215859030837e-05,
751
+ "loss": 0.1008,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.4713656387665198,
756
+ "grad_norm": 118428.40625,
757
+ "learning_rate": 1.811453744493392e-05,
758
+ "loss": 0.1062,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.47577092511013214,
763
+ "grad_norm": 98266.734375,
764
+ "learning_rate": 1.8096916299559473e-05,
765
+ "loss": 0.1242,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.4801762114537445,
770
+ "grad_norm": 142845.421875,
771
+ "learning_rate": 1.8079295154185022e-05,
772
+ "loss": 0.1271,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.4845814977973568,
777
+ "grad_norm": 151209.21875,
778
+ "learning_rate": 1.8061674008810575e-05,
779
+ "loss": 0.0677,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.4889867841409692,
784
+ "grad_norm": 97798.6796875,
785
+ "learning_rate": 1.8044052863436124e-05,
786
+ "loss": 0.0931,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.4933920704845815,
791
+ "grad_norm": 80213.59375,
792
+ "learning_rate": 1.8026431718061677e-05,
793
+ "loss": 0.072,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.4977973568281938,
798
+ "grad_norm": 33982.3515625,
799
+ "learning_rate": 1.8008810572687226e-05,
800
+ "loss": 0.0699,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.5022026431718062,
805
+ "grad_norm": 130265.4296875,
806
+ "learning_rate": 1.7991189427312775e-05,
807
+ "loss": 0.1226,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.5066079295154186,
812
+ "grad_norm": 49779.00390625,
813
+ "learning_rate": 1.7973568281938328e-05,
814
+ "loss": 0.1363,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.5110132158590308,
819
+ "grad_norm": 128675.6953125,
820
+ "learning_rate": 1.7955947136563877e-05,
821
+ "loss": 0.1344,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.5154185022026432,
826
+ "grad_norm": 123242.515625,
827
+ "learning_rate": 1.793832599118943e-05,
828
+ "loss": 0.0954,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.5198237885462555,
833
+ "grad_norm": 112552.0859375,
834
+ "learning_rate": 1.792070484581498e-05,
835
+ "loss": 0.0919,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.5242290748898678,
840
+ "grad_norm": 75417.375,
841
+ "learning_rate": 1.790308370044053e-05,
842
+ "loss": 0.0993,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.5286343612334802,
847
+ "grad_norm": 127870.3671875,
848
+ "learning_rate": 1.788546255506608e-05,
849
+ "loss": 0.1006,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.5330396475770925,
854
+ "grad_norm": 108290.8203125,
855
+ "learning_rate": 1.786784140969163e-05,
856
+ "loss": 0.1095,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.5374449339207048,
861
+ "grad_norm": 131361.828125,
862
+ "learning_rate": 1.7850220264317182e-05,
863
+ "loss": 0.0967,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.5418502202643172,
868
+ "grad_norm": 164738.09375,
869
+ "learning_rate": 1.783259911894273e-05,
870
+ "loss": 0.1096,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.5462555066079295,
875
+ "grad_norm": 155269.90625,
876
+ "learning_rate": 1.7814977973568284e-05,
877
+ "loss": 0.0969,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.5506607929515418,
882
+ "grad_norm": 129207.3046875,
883
+ "learning_rate": 1.7797356828193833e-05,
884
+ "loss": 0.1261,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.5550660792951542,
889
+ "grad_norm": 297050.8125,
890
+ "learning_rate": 1.7779735682819386e-05,
891
+ "loss": 0.0901,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.5594713656387665,
896
+ "grad_norm": 110518.3359375,
897
+ "learning_rate": 1.7762114537444935e-05,
898
+ "loss": 0.0647,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.5638766519823789,
903
+ "grad_norm": 73083.71875,
904
+ "learning_rate": 1.7744493392070484e-05,
905
+ "loss": 0.0966,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.5682819383259912,
910
+ "grad_norm": 120506.2265625,
911
+ "learning_rate": 1.7726872246696037e-05,
912
+ "loss": 0.0903,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.5726872246696035,
917
+ "grad_norm": 81893.8515625,
918
+ "learning_rate": 1.7709251101321586e-05,
919
+ "loss": 0.1315,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.5770925110132159,
924
+ "grad_norm": 38761.71484375,
925
+ "learning_rate": 1.769162995594714e-05,
926
+ "loss": 0.0828,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.5814977973568282,
931
+ "grad_norm": 174716.234375,
932
+ "learning_rate": 1.767400881057269e-05,
933
+ "loss": 0.134,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.5859030837004405,
938
+ "grad_norm": 72472.8671875,
939
+ "learning_rate": 1.765638766519824e-05,
940
+ "loss": 0.1047,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.5903083700440529,
945
+ "grad_norm": 106439.15625,
946
+ "learning_rate": 1.763876651982379e-05,
947
+ "loss": 0.0769,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.5947136563876652,
952
+ "grad_norm": 181449.84375,
953
+ "learning_rate": 1.762114537444934e-05,
954
+ "loss": 0.1306,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.5991189427312775,
959
+ "grad_norm": 89675.6171875,
960
+ "learning_rate": 1.760352422907489e-05,
961
+ "loss": 0.111,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.6035242290748899,
966
+ "grad_norm": 99200.21875,
967
+ "learning_rate": 1.758590308370044e-05,
968
+ "loss": 0.1079,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.6079295154185022,
973
+ "grad_norm": 173344.875,
974
+ "learning_rate": 1.7568281938325993e-05,
975
+ "loss": 0.0785,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.6123348017621145,
980
+ "grad_norm": 98200.265625,
981
+ "learning_rate": 1.7550660792951545e-05,
982
+ "loss": 0.0901,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.6167400881057269,
987
+ "grad_norm": 77085.3359375,
988
+ "learning_rate": 1.7533039647577095e-05,
989
+ "loss": 0.0612,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.6211453744493393,
994
+ "grad_norm": 96442.0234375,
995
+ "learning_rate": 1.7515418502202644e-05,
996
+ "loss": 0.1305,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.6255506607929515,
1001
+ "grad_norm": 391359.71875,
1002
+ "learning_rate": 1.7497797356828193e-05,
1003
+ "loss": 0.13,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.6299559471365639,
1008
+ "grad_norm": 116567.65625,
1009
+ "learning_rate": 1.7480176211453746e-05,
1010
+ "loss": 0.0964,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.6343612334801763,
1015
+ "grad_norm": 143271.65625,
1016
+ "learning_rate": 1.7462555066079295e-05,
1017
+ "loss": 0.1161,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.6387665198237885,
1022
+ "grad_norm": 286173.40625,
1023
+ "learning_rate": 1.7444933920704847e-05,
1024
+ "loss": 0.0977,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.6431718061674009,
1029
+ "grad_norm": 93874.2578125,
1030
+ "learning_rate": 1.74273127753304e-05,
1031
+ "loss": 0.0932,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.6475770925110133,
1036
+ "grad_norm": 124057.8828125,
1037
+ "learning_rate": 1.740969162995595e-05,
1038
+ "loss": 0.118,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.6519823788546255,
1043
+ "grad_norm": 144435.03125,
1044
+ "learning_rate": 1.73920704845815e-05,
1045
+ "loss": 0.1053,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.6563876651982379,
1050
+ "grad_norm": 72584.375,
1051
+ "learning_rate": 1.7374449339207047e-05,
1052
+ "loss": 0.0731,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.6607929515418502,
1057
+ "grad_norm": 116053.421875,
1058
+ "learning_rate": 1.73568281938326e-05,
1059
+ "loss": 0.0847,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.6651982378854625,
1064
+ "grad_norm": 148674.921875,
1065
+ "learning_rate": 1.7339207048458153e-05,
1066
+ "loss": 0.1276,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.6696035242290749,
1071
+ "grad_norm": 84335.3515625,
1072
+ "learning_rate": 1.7321585903083702e-05,
1073
+ "loss": 0.1314,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.6740088105726872,
1078
+ "grad_norm": 187770.640625,
1079
+ "learning_rate": 1.7303964757709254e-05,
1080
+ "loss": 0.0957,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.6784140969162996,
1085
+ "grad_norm": 191359.46875,
1086
+ "learning_rate": 1.7286343612334804e-05,
1087
+ "loss": 0.1358,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.6828193832599119,
1092
+ "grad_norm": 58638.65625,
1093
+ "learning_rate": 1.7268722466960356e-05,
1094
+ "loss": 0.077,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.6872246696035242,
1099
+ "grad_norm": 166379.5625,
1100
+ "learning_rate": 1.7251101321585902e-05,
1101
+ "loss": 0.0536,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.6916299559471366,
1106
+ "grad_norm": 161942.0625,
1107
+ "learning_rate": 1.7233480176211454e-05,
1108
+ "loss": 0.0919,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.6960352422907489,
1113
+ "grad_norm": 149369.109375,
1114
+ "learning_rate": 1.7215859030837007e-05,
1115
+ "loss": 0.113,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.7004405286343612,
1120
+ "grad_norm": 98025.09375,
1121
+ "learning_rate": 1.7198237885462556e-05,
1122
+ "loss": 0.0557,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.7048458149779736,
1127
+ "grad_norm": 77845.296875,
1128
+ "learning_rate": 1.718061674008811e-05,
1129
+ "loss": 0.0858,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 0.7092511013215859,
1134
+ "grad_norm": 229359.421875,
1135
+ "learning_rate": 1.7162995594713658e-05,
1136
+ "loss": 0.0905,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 0.7136563876651982,
1141
+ "grad_norm": 114946.9453125,
1142
+ "learning_rate": 1.714537444933921e-05,
1143
+ "loss": 0.0855,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 0.7180616740088106,
1148
+ "grad_norm": 105548.078125,
1149
+ "learning_rate": 1.712775330396476e-05,
1150
+ "loss": 0.0878,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 0.7224669603524229,
1155
+ "grad_norm": 84087.8203125,
1156
+ "learning_rate": 1.711013215859031e-05,
1157
+ "loss": 0.0718,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 0.7268722466960352,
1162
+ "grad_norm": 63263.57421875,
1163
+ "learning_rate": 1.709251101321586e-05,
1164
+ "loss": 0.0656,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 0.7312775330396476,
1169
+ "grad_norm": 155470.1875,
1170
+ "learning_rate": 1.707488986784141e-05,
1171
+ "loss": 0.1277,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 0.73568281938326,
1176
+ "grad_norm": 113818.9140625,
1177
+ "learning_rate": 1.7057268722466963e-05,
1178
+ "loss": 0.1506,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 0.7400881057268722,
1183
+ "grad_norm": 207417.890625,
1184
+ "learning_rate": 1.7039647577092512e-05,
1185
+ "loss": 0.1078,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 0.7444933920704846,
1190
+ "grad_norm": 96532.390625,
1191
+ "learning_rate": 1.7022026431718065e-05,
1192
+ "loss": 0.1288,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 0.748898678414097,
1197
+ "grad_norm": 97353.796875,
1198
+ "learning_rate": 1.7004405286343614e-05,
1199
+ "loss": 0.0624,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 0.7533039647577092,
1204
+ "grad_norm": 152348.625,
1205
+ "learning_rate": 1.6986784140969163e-05,
1206
+ "loss": 0.0837,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 0.7577092511013216,
1211
+ "grad_norm": 92298.265625,
1212
+ "learning_rate": 1.6969162995594716e-05,
1213
+ "loss": 0.0918,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 0.762114537444934,
1218
+ "grad_norm": 97720.921875,
1219
+ "learning_rate": 1.6951541850220265e-05,
1220
+ "loss": 0.1112,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 0.7665198237885462,
1225
+ "grad_norm": 88505.6640625,
1226
+ "learning_rate": 1.6933920704845818e-05,
1227
+ "loss": 0.1022,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 0.7709251101321586,
1232
+ "grad_norm": 58780.78125,
1233
+ "learning_rate": 1.6916299559471367e-05,
1234
+ "loss": 0.1064,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 0.775330396475771,
1239
+ "grad_norm": 102623.4140625,
1240
+ "learning_rate": 1.689867841409692e-05,
1241
+ "loss": 0.077,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 0.7797356828193832,
1246
+ "grad_norm": 57006.04296875,
1247
+ "learning_rate": 1.688105726872247e-05,
1248
+ "loss": 0.1078,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 0.7841409691629956,
1253
+ "grad_norm": 110325.6171875,
1254
+ "learning_rate": 1.6863436123348018e-05,
1255
+ "loss": 0.0716,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 0.788546255506608,
1260
+ "grad_norm": 114105.796875,
1261
+ "learning_rate": 1.684581497797357e-05,
1262
+ "loss": 0.0678,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 0.7929515418502202,
1267
+ "grad_norm": 65852.2734375,
1268
+ "learning_rate": 1.682819383259912e-05,
1269
+ "loss": 0.0711,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 0.7973568281938326,
1274
+ "grad_norm": 72375.2265625,
1275
+ "learning_rate": 1.6810572687224672e-05,
1276
+ "loss": 0.1311,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 0.801762114537445,
1281
+ "grad_norm": 122892.6328125,
1282
+ "learning_rate": 1.679295154185022e-05,
1283
+ "loss": 0.0975,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 0.8061674008810573,
1288
+ "grad_norm": 194237.671875,
1289
+ "learning_rate": 1.6775330396475774e-05,
1290
+ "loss": 0.1006,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 0.8105726872246696,
1295
+ "grad_norm": 129793.3671875,
1296
+ "learning_rate": 1.6757709251101323e-05,
1297
+ "loss": 0.1123,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 0.8149779735682819,
1302
+ "grad_norm": 82357.6640625,
1303
+ "learning_rate": 1.6740088105726872e-05,
1304
+ "loss": 0.0773,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 0.8193832599118943,
1309
+ "grad_norm": 67192.2578125,
1310
+ "learning_rate": 1.6722466960352425e-05,
1311
+ "loss": 0.0768,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 0.8237885462555066,
1316
+ "grad_norm": 104339.375,
1317
+ "learning_rate": 1.6704845814977974e-05,
1318
+ "loss": 0.0934,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 0.8281938325991189,
1323
+ "grad_norm": 141351.65625,
1324
+ "learning_rate": 1.6687224669603527e-05,
1325
+ "loss": 0.0798,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 0.8325991189427313,
1330
+ "grad_norm": 76349.3671875,
1331
+ "learning_rate": 1.6669603524229076e-05,
1332
+ "loss": 0.073,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 0.8370044052863436,
1337
+ "grad_norm": 78456.09375,
1338
+ "learning_rate": 1.665198237885463e-05,
1339
+ "loss": 0.0569,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 0.8414096916299559,
1344
+ "grad_norm": 99650.015625,
1345
+ "learning_rate": 1.6634361233480178e-05,
1346
+ "loss": 0.0957,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 0.8458149779735683,
1351
+ "grad_norm": 91759.796875,
1352
+ "learning_rate": 1.6616740088105727e-05,
1353
+ "loss": 0.0595,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 0.8502202643171806,
1358
+ "grad_norm": 120993.03125,
1359
+ "learning_rate": 1.659911894273128e-05,
1360
+ "loss": 0.0693,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 0.8546255506607929,
1365
+ "grad_norm": 78032.421875,
1366
+ "learning_rate": 1.658149779735683e-05,
1367
+ "loss": 0.0719,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 0.8590308370044053,
1372
+ "grad_norm": 233544.96875,
1373
+ "learning_rate": 1.656387665198238e-05,
1374
+ "loss": 0.0661,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 0.8634361233480177,
1379
+ "grad_norm": 48958.75390625,
1380
+ "learning_rate": 1.654625550660793e-05,
1381
+ "loss": 0.0961,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 0.8678414096916299,
1386
+ "grad_norm": 109371.53125,
1387
+ "learning_rate": 1.6528634361233483e-05,
1388
+ "loss": 0.0789,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 0.8722466960352423,
1393
+ "grad_norm": 193694.890625,
1394
+ "learning_rate": 1.6511013215859032e-05,
1395
+ "loss": 0.1096,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 0.8766519823788547,
1400
+ "grad_norm": 37518.26171875,
1401
+ "learning_rate": 1.649339207048458e-05,
1402
+ "loss": 0.1039,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 0.8810572687224669,
1407
+ "grad_norm": 133068.5625,
1408
+ "learning_rate": 1.6475770925110134e-05,
1409
+ "loss": 0.0726,
1410
+ "step": 1000
1411
+ },
1412
+ {
1413
+ "epoch": 0.8854625550660793,
1414
+ "grad_norm": 93989.75,
1415
+ "learning_rate": 1.6458149779735683e-05,
1416
+ "loss": 0.0908,
1417
+ "step": 1005
1418
+ },
1419
+ {
1420
+ "epoch": 0.8898678414096917,
1421
+ "grad_norm": 106104.7734375,
1422
+ "learning_rate": 1.6440528634361236e-05,
1423
+ "loss": 0.0891,
1424
+ "step": 1010
1425
+ },
1426
+ {
1427
+ "epoch": 0.8942731277533039,
1428
+ "grad_norm": 57476.76171875,
1429
+ "learning_rate": 1.6422907488986785e-05,
1430
+ "loss": 0.105,
1431
+ "step": 1015
1432
+ },
1433
+ {
1434
+ "epoch": 0.8986784140969163,
1435
+ "grad_norm": 95132.3984375,
1436
+ "learning_rate": 1.6405286343612337e-05,
1437
+ "loss": 0.0783,
1438
+ "step": 1020
1439
+ },
1440
+ {
1441
+ "epoch": 0.9030837004405287,
1442
+ "grad_norm": 132834.140625,
1443
+ "learning_rate": 1.6387665198237887e-05,
1444
+ "loss": 0.08,
1445
+ "step": 1025
1446
+ },
1447
+ {
1448
+ "epoch": 0.9074889867841409,
1449
+ "grad_norm": 76089.6484375,
1450
+ "learning_rate": 1.637004405286344e-05,
1451
+ "loss": 0.1149,
1452
+ "step": 1030
1453
+ },
1454
+ {
1455
+ "epoch": 0.9118942731277533,
1456
+ "grad_norm": 63661.88671875,
1457
+ "learning_rate": 1.635242290748899e-05,
1458
+ "loss": 0.0672,
1459
+ "step": 1035
1460
+ },
1461
+ {
1462
+ "epoch": 0.9162995594713657,
1463
+ "grad_norm": 72133.03125,
1464
+ "learning_rate": 1.6334801762114538e-05,
1465
+ "loss": 0.0728,
1466
+ "step": 1040
1467
+ },
1468
+ {
1469
+ "epoch": 0.920704845814978,
1470
+ "grad_norm": 100398.6015625,
1471
+ "learning_rate": 1.631718061674009e-05,
1472
+ "loss": 0.1223,
1473
+ "step": 1045
1474
+ },
1475
+ {
1476
+ "epoch": 0.9251101321585903,
1477
+ "grad_norm": 279089.8125,
1478
+ "learning_rate": 1.629955947136564e-05,
1479
+ "loss": 0.1427,
1480
+ "step": 1050
1481
+ },
1482
+ {
1483
+ "epoch": 0.9295154185022027,
1484
+ "grad_norm": 62917.9765625,
1485
+ "learning_rate": 1.6281938325991192e-05,
1486
+ "loss": 0.1203,
1487
+ "step": 1055
1488
+ },
1489
+ {
1490
+ "epoch": 0.933920704845815,
1491
+ "grad_norm": 140988.796875,
1492
+ "learning_rate": 1.626431718061674e-05,
1493
+ "loss": 0.0967,
1494
+ "step": 1060
1495
+ },
1496
+ {
1497
+ "epoch": 0.9383259911894273,
1498
+ "grad_norm": 259548.109375,
1499
+ "learning_rate": 1.6246696035242294e-05,
1500
+ "loss": 0.0839,
1501
+ "step": 1065
1502
+ },
1503
+ {
1504
+ "epoch": 0.9427312775330396,
1505
+ "grad_norm": 67071.7265625,
1506
+ "learning_rate": 1.6229074889867843e-05,
1507
+ "loss": 0.0556,
1508
+ "step": 1070
1509
+ },
1510
+ {
1511
+ "epoch": 0.947136563876652,
1512
+ "grad_norm": 92416.890625,
1513
+ "learning_rate": 1.6211453744493392e-05,
1514
+ "loss": 0.0858,
1515
+ "step": 1075
1516
+ },
1517
+ {
1518
+ "epoch": 0.9515418502202643,
1519
+ "grad_norm": 44402.7578125,
1520
+ "learning_rate": 1.6193832599118945e-05,
1521
+ "loss": 0.0548,
1522
+ "step": 1080
1523
+ },
1524
+ {
1525
+ "epoch": 0.9559471365638766,
1526
+ "grad_norm": 167386.671875,
1527
+ "learning_rate": 1.6176211453744494e-05,
1528
+ "loss": 0.094,
1529
+ "step": 1085
1530
+ },
1531
+ {
1532
+ "epoch": 0.960352422907489,
1533
+ "grad_norm": 47520.0546875,
1534
+ "learning_rate": 1.6158590308370046e-05,
1535
+ "loss": 0.0758,
1536
+ "step": 1090
1537
+ },
1538
+ {
1539
+ "epoch": 0.9647577092511013,
1540
+ "grad_norm": 131369.015625,
1541
+ "learning_rate": 1.6140969162995596e-05,
1542
+ "loss": 0.0675,
1543
+ "step": 1095
1544
+ },
1545
+ {
1546
+ "epoch": 0.9691629955947136,
1547
+ "grad_norm": 52430.921875,
1548
+ "learning_rate": 1.6123348017621148e-05,
1549
+ "loss": 0.091,
1550
+ "step": 1100
1551
+ },
1552
+ {
1553
+ "epoch": 0.973568281938326,
1554
+ "grad_norm": 114321.75,
1555
+ "learning_rate": 1.6105726872246697e-05,
1556
+ "loss": 0.0827,
1557
+ "step": 1105
1558
+ },
1559
+ {
1560
+ "epoch": 0.9779735682819384,
1561
+ "grad_norm": 184216.453125,
1562
+ "learning_rate": 1.6088105726872247e-05,
1563
+ "loss": 0.1047,
1564
+ "step": 1110
1565
+ },
1566
+ {
1567
+ "epoch": 0.9823788546255506,
1568
+ "grad_norm": 147871.734375,
1569
+ "learning_rate": 1.60704845814978e-05,
1570
+ "loss": 0.0991,
1571
+ "step": 1115
1572
+ },
1573
+ {
1574
+ "epoch": 0.986784140969163,
1575
+ "grad_norm": 144256.390625,
1576
+ "learning_rate": 1.605286343612335e-05,
1577
+ "loss": 0.084,
1578
+ "step": 1120
1579
+ },
1580
+ {
1581
+ "epoch": 0.9911894273127754,
1582
+ "grad_norm": 57534.3359375,
1583
+ "learning_rate": 1.60352422907489e-05,
1584
+ "loss": 0.097,
1585
+ "step": 1125
1586
+ },
1587
+ {
1588
+ "epoch": 0.9955947136563876,
1589
+ "grad_norm": 77203.0546875,
1590
+ "learning_rate": 1.601762114537445e-05,
1591
+ "loss": 0.0529,
1592
+ "step": 1130
1593
+ },
1594
+ {
1595
+ "epoch": 1.0,
1596
+ "grad_norm": 103384.796875,
1597
+ "learning_rate": 1.6000000000000003e-05,
1598
+ "loss": 0.0537,
1599
+ "step": 1135
1600
+ },
1601
+ {
1602
+ "epoch": 1.0,
1603
+ "eval_accuracy_score": 0.9745103887066309,
1604
+ "eval_f1": 0.8684953610326744,
1605
+ "eval_loss": 0.07397506386041641,
1606
+ "eval_precision": 0.8514929800276844,
1607
+ "eval_recall": 0.8861905741922206,
1608
+ "eval_runtime": 56.8324,
1609
+ "eval_samples_per_second": 35.508,
1610
+ "eval_steps_per_second": 4.452,
1611
+ "step": 1135
1612
+ },
1613
+ {
1614
+ "epoch": 1.0044052863436124,
1615
+ "grad_norm": 68579.359375,
1616
+ "learning_rate": 1.5982378854625552e-05,
1617
+ "loss": 0.0724,
1618
+ "step": 1140
1619
+ },
1620
+ {
1621
+ "epoch": 1.0088105726872247,
1622
+ "grad_norm": 59411.6875,
1623
+ "learning_rate": 1.59647577092511e-05,
1624
+ "loss": 0.0375,
1625
+ "step": 1145
1626
+ },
1627
+ {
1628
+ "epoch": 1.013215859030837,
1629
+ "grad_norm": 198824.046875,
1630
+ "learning_rate": 1.5947136563876654e-05,
1631
+ "loss": 0.0503,
1632
+ "step": 1150
1633
+ },
1634
+ {
1635
+ "epoch": 1.0176211453744493,
1636
+ "grad_norm": 90052.5078125,
1637
+ "learning_rate": 1.5929515418502203e-05,
1638
+ "loss": 0.0596,
1639
+ "step": 1155
1640
+ },
1641
+ {
1642
+ "epoch": 1.0220264317180616,
1643
+ "grad_norm": 173568.34375,
1644
+ "learning_rate": 1.5911894273127755e-05,
1645
+ "loss": 0.0908,
1646
+ "step": 1160
1647
+ },
1648
+ {
1649
+ "epoch": 1.026431718061674,
1650
+ "grad_norm": 59891.09375,
1651
+ "learning_rate": 1.5894273127753305e-05,
1652
+ "loss": 0.0689,
1653
+ "step": 1165
1654
+ },
1655
+ {
1656
+ "epoch": 1.0308370044052864,
1657
+ "grad_norm": 75100.1875,
1658
+ "learning_rate": 1.5876651982378857e-05,
1659
+ "loss": 0.0364,
1660
+ "step": 1170
1661
+ },
1662
+ {
1663
+ "epoch": 1.0352422907488987,
1664
+ "grad_norm": 148419.75,
1665
+ "learning_rate": 1.5859030837004406e-05,
1666
+ "loss": 0.061,
1667
+ "step": 1175
1668
+ },
1669
+ {
1670
+ "epoch": 1.039647577092511,
1671
+ "grad_norm": 99710.6015625,
1672
+ "learning_rate": 1.5841409691629956e-05,
1673
+ "loss": 0.0345,
1674
+ "step": 1180
1675
+ },
1676
+ {
1677
+ "epoch": 1.0440528634361232,
1678
+ "grad_norm": 82176.625,
1679
+ "learning_rate": 1.5823788546255508e-05,
1680
+ "loss": 0.056,
1681
+ "step": 1185
1682
+ },
1683
+ {
1684
+ "epoch": 1.0484581497797356,
1685
+ "grad_norm": 136751.828125,
1686
+ "learning_rate": 1.5806167400881057e-05,
1687
+ "loss": 0.0589,
1688
+ "step": 1190
1689
+ },
1690
+ {
1691
+ "epoch": 1.052863436123348,
1692
+ "grad_norm": 59449.00390625,
1693
+ "learning_rate": 1.578854625550661e-05,
1694
+ "loss": 0.0458,
1695
+ "step": 1195
1696
+ },
1697
+ {
1698
+ "epoch": 1.0572687224669604,
1699
+ "grad_norm": 180705.421875,
1700
+ "learning_rate": 1.577092511013216e-05,
1701
+ "loss": 0.0573,
1702
+ "step": 1200
1703
+ },
1704
+ {
1705
+ "epoch": 1.0616740088105727,
1706
+ "grad_norm": 92330.03125,
1707
+ "learning_rate": 1.575330396475771e-05,
1708
+ "loss": 0.0524,
1709
+ "step": 1205
1710
+ },
1711
+ {
1712
+ "epoch": 1.066079295154185,
1713
+ "grad_norm": 56101.46875,
1714
+ "learning_rate": 1.573568281938326e-05,
1715
+ "loss": 0.0621,
1716
+ "step": 1210
1717
+ },
1718
+ {
1719
+ "epoch": 1.0704845814977975,
1720
+ "grad_norm": 58973.703125,
1721
+ "learning_rate": 1.571806167400881e-05,
1722
+ "loss": 0.0762,
1723
+ "step": 1215
1724
+ },
1725
+ {
1726
+ "epoch": 1.0748898678414096,
1727
+ "grad_norm": 104488.7578125,
1728
+ "learning_rate": 1.5700440528634363e-05,
1729
+ "loss": 0.0435,
1730
+ "step": 1220
1731
+ },
1732
+ {
1733
+ "epoch": 1.079295154185022,
1734
+ "grad_norm": 45256.01953125,
1735
+ "learning_rate": 1.5682819383259912e-05,
1736
+ "loss": 0.0419,
1737
+ "step": 1225
1738
+ },
1739
+ {
1740
+ "epoch": 1.0837004405286343,
1741
+ "grad_norm": 62852.0234375,
1742
+ "learning_rate": 1.5665198237885464e-05,
1743
+ "loss": 0.0764,
1744
+ "step": 1230
1745
+ },
1746
+ {
1747
+ "epoch": 1.0881057268722467,
1748
+ "grad_norm": 45853.28125,
1749
+ "learning_rate": 1.5647577092511014e-05,
1750
+ "loss": 0.0697,
1751
+ "step": 1235
1752
+ },
1753
+ {
1754
+ "epoch": 1.092511013215859,
1755
+ "grad_norm": 48477.66015625,
1756
+ "learning_rate": 1.5629955947136566e-05,
1757
+ "loss": 0.0407,
1758
+ "step": 1240
1759
+ },
1760
+ {
1761
+ "epoch": 1.0969162995594715,
1762
+ "grad_norm": 131279.234375,
1763
+ "learning_rate": 1.5612334801762115e-05,
1764
+ "loss": 0.0656,
1765
+ "step": 1245
1766
+ },
1767
+ {
1768
+ "epoch": 1.1013215859030836,
1769
+ "grad_norm": 156106.46875,
1770
+ "learning_rate": 1.5594713656387664e-05,
1771
+ "loss": 0.0923,
1772
+ "step": 1250
1773
+ },
1774
+ {
1775
+ "epoch": 1.105726872246696,
1776
+ "grad_norm": 62060.60546875,
1777
+ "learning_rate": 1.5577092511013217e-05,
1778
+ "loss": 0.0454,
1779
+ "step": 1255
1780
+ },
1781
+ {
1782
+ "epoch": 1.1101321585903083,
1783
+ "grad_norm": 32545.71875,
1784
+ "learning_rate": 1.5559471365638766e-05,
1785
+ "loss": 0.0352,
1786
+ "step": 1260
1787
+ },
1788
+ {
1789
+ "epoch": 1.1145374449339207,
1790
+ "grad_norm": 165649.1875,
1791
+ "learning_rate": 1.554185022026432e-05,
1792
+ "loss": 0.0568,
1793
+ "step": 1265
1794
+ },
1795
+ {
1796
+ "epoch": 1.118942731277533,
1797
+ "grad_norm": 77680.109375,
1798
+ "learning_rate": 1.5524229074889868e-05,
1799
+ "loss": 0.0446,
1800
+ "step": 1270
1801
+ },
1802
+ {
1803
+ "epoch": 1.1233480176211454,
1804
+ "grad_norm": 147111.0,
1805
+ "learning_rate": 1.550660792951542e-05,
1806
+ "loss": 0.0713,
1807
+ "step": 1275
1808
+ },
1809
+ {
1810
+ "epoch": 1.1277533039647576,
1811
+ "grad_norm": 67580.578125,
1812
+ "learning_rate": 1.548898678414097e-05,
1813
+ "loss": 0.0599,
1814
+ "step": 1280
1815
+ },
1816
+ {
1817
+ "epoch": 1.13215859030837,
1818
+ "grad_norm": 114479.3046875,
1819
+ "learning_rate": 1.547136563876652e-05,
1820
+ "loss": 0.1017,
1821
+ "step": 1285
1822
+ },
1823
+ {
1824
+ "epoch": 1.1365638766519823,
1825
+ "grad_norm": 37997.4765625,
1826
+ "learning_rate": 1.545374449339207e-05,
1827
+ "loss": 0.0747,
1828
+ "step": 1290
1829
+ },
1830
+ {
1831
+ "epoch": 1.1409691629955947,
1832
+ "grad_norm": 76116.6484375,
1833
+ "learning_rate": 1.543612334801762e-05,
1834
+ "loss": 0.0673,
1835
+ "step": 1295
1836
+ },
1837
+ {
1838
+ "epoch": 1.145374449339207,
1839
+ "grad_norm": 54332.19140625,
1840
+ "learning_rate": 1.5418502202643173e-05,
1841
+ "loss": 0.0661,
1842
+ "step": 1300
1843
+ },
1844
+ {
1845
+ "epoch": 1.1497797356828194,
1846
+ "grad_norm": 144475.734375,
1847
+ "learning_rate": 1.5400881057268722e-05,
1848
+ "loss": 0.0583,
1849
+ "step": 1305
1850
+ },
1851
+ {
1852
+ "epoch": 1.1541850220264318,
1853
+ "grad_norm": 222127.890625,
1854
+ "learning_rate": 1.5383259911894275e-05,
1855
+ "loss": 0.0375,
1856
+ "step": 1310
1857
+ },
1858
+ {
1859
+ "epoch": 1.158590308370044,
1860
+ "grad_norm": 137634.546875,
1861
+ "learning_rate": 1.5365638766519824e-05,
1862
+ "loss": 0.068,
1863
+ "step": 1315
1864
+ },
1865
+ {
1866
+ "epoch": 1.1629955947136563,
1867
+ "grad_norm": 88347.578125,
1868
+ "learning_rate": 1.5348017621145373e-05,
1869
+ "loss": 0.0432,
1870
+ "step": 1320
1871
+ },
1872
+ {
1873
+ "epoch": 1.1674008810572687,
1874
+ "grad_norm": 45639.54296875,
1875
+ "learning_rate": 1.5330396475770926e-05,
1876
+ "loss": 0.0452,
1877
+ "step": 1325
1878
+ },
1879
+ {
1880
+ "epoch": 1.171806167400881,
1881
+ "grad_norm": 150095.0625,
1882
+ "learning_rate": 1.5312775330396475e-05,
1883
+ "loss": 0.0416,
1884
+ "step": 1330
1885
+ },
1886
+ {
1887
+ "epoch": 1.1762114537444934,
1888
+ "grad_norm": 268485.375,
1889
+ "learning_rate": 1.5295154185022028e-05,
1890
+ "loss": 0.0668,
1891
+ "step": 1335
1892
+ },
1893
+ {
1894
+ "epoch": 1.1806167400881058,
1895
+ "grad_norm": 436466.5625,
1896
+ "learning_rate": 1.5277533039647577e-05,
1897
+ "loss": 0.0622,
1898
+ "step": 1340
1899
+ },
1900
+ {
1901
+ "epoch": 1.1850220264317182,
1902
+ "grad_norm": 97963.7890625,
1903
+ "learning_rate": 1.525991189427313e-05,
1904
+ "loss": 0.0437,
1905
+ "step": 1345
1906
+ },
1907
+ {
1908
+ "epoch": 1.1894273127753303,
1909
+ "grad_norm": 84959.671875,
1910
+ "learning_rate": 1.524229074889868e-05,
1911
+ "loss": 0.0661,
1912
+ "step": 1350
1913
+ },
1914
+ {
1915
+ "epoch": 1.1938325991189427,
1916
+ "grad_norm": 143916.921875,
1917
+ "learning_rate": 1.5224669603524231e-05,
1918
+ "loss": 0.0203,
1919
+ "step": 1355
1920
+ },
1921
+ {
1922
+ "epoch": 1.198237885462555,
1923
+ "grad_norm": 156699.5,
1924
+ "learning_rate": 1.520704845814978e-05,
1925
+ "loss": 0.0606,
1926
+ "step": 1360
1927
+ },
1928
+ {
1929
+ "epoch": 1.2026431718061674,
1930
+ "grad_norm": 107392.8828125,
1931
+ "learning_rate": 1.5189427312775331e-05,
1932
+ "loss": 0.0567,
1933
+ "step": 1365
1934
+ },
1935
+ {
1936
+ "epoch": 1.2070484581497798,
1937
+ "grad_norm": 97381.4609375,
1938
+ "learning_rate": 1.5171806167400882e-05,
1939
+ "loss": 0.0592,
1940
+ "step": 1370
1941
+ },
1942
+ {
1943
+ "epoch": 1.2114537444933922,
1944
+ "grad_norm": 117461.0,
1945
+ "learning_rate": 1.5154185022026433e-05,
1946
+ "loss": 0.0571,
1947
+ "step": 1375
1948
+ },
1949
+ {
1950
+ "epoch": 1.2158590308370045,
1951
+ "grad_norm": 67447.71875,
1952
+ "learning_rate": 1.5136563876651984e-05,
1953
+ "loss": 0.0491,
1954
+ "step": 1380
1955
+ },
1956
+ {
1957
+ "epoch": 1.2202643171806167,
1958
+ "grad_norm": 74018.828125,
1959
+ "learning_rate": 1.5118942731277535e-05,
1960
+ "loss": 0.0268,
1961
+ "step": 1385
1962
+ },
1963
+ {
1964
+ "epoch": 1.224669603524229,
1965
+ "grad_norm": 143559.828125,
1966
+ "learning_rate": 1.5101321585903086e-05,
1967
+ "loss": 0.0498,
1968
+ "step": 1390
1969
+ },
1970
+ {
1971
+ "epoch": 1.2290748898678414,
1972
+ "grad_norm": 53218.4296875,
1973
+ "learning_rate": 1.5083700440528635e-05,
1974
+ "loss": 0.0685,
1975
+ "step": 1395
1976
+ },
1977
+ {
1978
+ "epoch": 1.2334801762114538,
1979
+ "grad_norm": 91495.9609375,
1980
+ "learning_rate": 1.5066079295154186e-05,
1981
+ "loss": 0.0468,
1982
+ "step": 1400
1983
+ },
1984
+ {
1985
+ "epoch": 1.2378854625550662,
1986
+ "grad_norm": 74750.8828125,
1987
+ "learning_rate": 1.5048458149779737e-05,
1988
+ "loss": 0.048,
1989
+ "step": 1405
1990
+ },
1991
+ {
1992
+ "epoch": 1.2422907488986783,
1993
+ "grad_norm": 50501.01953125,
1994
+ "learning_rate": 1.5030837004405288e-05,
1995
+ "loss": 0.0795,
1996
+ "step": 1410
1997
+ },
1998
+ {
1999
+ "epoch": 1.2466960352422907,
2000
+ "grad_norm": 53710.2890625,
2001
+ "learning_rate": 1.5013215859030838e-05,
2002
+ "loss": 0.067,
2003
+ "step": 1415
2004
+ },
2005
+ {
2006
+ "epoch": 1.251101321585903,
2007
+ "grad_norm": 121764.3046875,
2008
+ "learning_rate": 1.499559471365639e-05,
2009
+ "loss": 0.0605,
2010
+ "step": 1420
2011
+ },
2012
+ {
2013
+ "epoch": 1.2555066079295154,
2014
+ "grad_norm": 81982.640625,
2015
+ "learning_rate": 1.497797356828194e-05,
2016
+ "loss": 0.0866,
2017
+ "step": 1425
2018
+ },
2019
+ {
2020
+ "epoch": 1.2599118942731278,
2021
+ "grad_norm": 70611.1875,
2022
+ "learning_rate": 1.496035242290749e-05,
2023
+ "loss": 0.0498,
2024
+ "step": 1430
2025
+ },
2026
+ {
2027
+ "epoch": 1.2643171806167401,
2028
+ "grad_norm": 480180.84375,
2029
+ "learning_rate": 1.494273127753304e-05,
2030
+ "loss": 0.0749,
2031
+ "step": 1435
2032
+ },
2033
+ {
2034
+ "epoch": 1.2687224669603525,
2035
+ "grad_norm": 169216.40625,
2036
+ "learning_rate": 1.4925110132158591e-05,
2037
+ "loss": 0.0854,
2038
+ "step": 1440
2039
+ },
2040
+ {
2041
+ "epoch": 1.2731277533039647,
2042
+ "grad_norm": 136009.609375,
2043
+ "learning_rate": 1.4907488986784142e-05,
2044
+ "loss": 0.0592,
2045
+ "step": 1445
2046
+ },
2047
+ {
2048
+ "epoch": 1.277533039647577,
2049
+ "grad_norm": 64462.30859375,
2050
+ "learning_rate": 1.4889867841409693e-05,
2051
+ "loss": 0.0689,
2052
+ "step": 1450
2053
+ },
2054
+ {
2055
+ "epoch": 1.2819383259911894,
2056
+ "grad_norm": 64193.625,
2057
+ "learning_rate": 1.4872246696035244e-05,
2058
+ "loss": 0.0631,
2059
+ "step": 1455
2060
+ },
2061
+ {
2062
+ "epoch": 1.2863436123348018,
2063
+ "grad_norm": 167001.625,
2064
+ "learning_rate": 1.4854625550660795e-05,
2065
+ "loss": 0.0423,
2066
+ "step": 1460
2067
+ },
2068
+ {
2069
+ "epoch": 1.2907488986784141,
2070
+ "grad_norm": 78108.1484375,
2071
+ "learning_rate": 1.4837004405286344e-05,
2072
+ "loss": 0.0637,
2073
+ "step": 1465
2074
+ },
2075
+ {
2076
+ "epoch": 1.2951541850220265,
2077
+ "grad_norm": 208587.71875,
2078
+ "learning_rate": 1.4819383259911895e-05,
2079
+ "loss": 0.0818,
2080
+ "step": 1470
2081
+ },
2082
+ {
2083
+ "epoch": 1.2995594713656389,
2084
+ "grad_norm": 178567.4375,
2085
+ "learning_rate": 1.4801762114537446e-05,
2086
+ "loss": 0.0755,
2087
+ "step": 1475
2088
+ },
2089
+ {
2090
+ "epoch": 1.303964757709251,
2091
+ "grad_norm": 79987.8984375,
2092
+ "learning_rate": 1.4784140969162997e-05,
2093
+ "loss": 0.0684,
2094
+ "step": 1480
2095
+ },
2096
+ {
2097
+ "epoch": 1.3083700440528634,
2098
+ "grad_norm": 642718.875,
2099
+ "learning_rate": 1.4766519823788547e-05,
2100
+ "loss": 0.0883,
2101
+ "step": 1485
2102
+ },
2103
+ {
2104
+ "epoch": 1.3127753303964758,
2105
+ "grad_norm": 129405.8203125,
2106
+ "learning_rate": 1.4748898678414098e-05,
2107
+ "loss": 0.0574,
2108
+ "step": 1490
2109
+ },
2110
+ {
2111
+ "epoch": 1.3171806167400881,
2112
+ "grad_norm": 61089.08984375,
2113
+ "learning_rate": 1.473127753303965e-05,
2114
+ "loss": 0.0511,
2115
+ "step": 1495
2116
+ },
2117
+ {
2118
+ "epoch": 1.3215859030837005,
2119
+ "grad_norm": 102123.171875,
2120
+ "learning_rate": 1.47136563876652e-05,
2121
+ "loss": 0.0356,
2122
+ "step": 1500
2123
+ },
2124
+ {
2125
+ "epoch": 1.3259911894273126,
2126
+ "grad_norm": 119410.8828125,
2127
+ "learning_rate": 1.469603524229075e-05,
2128
+ "loss": 0.049,
2129
+ "step": 1505
2130
+ },
2131
+ {
2132
+ "epoch": 1.3303964757709252,
2133
+ "grad_norm": 67464.9296875,
2134
+ "learning_rate": 1.46784140969163e-05,
2135
+ "loss": 0.0702,
2136
+ "step": 1510
2137
+ },
2138
+ {
2139
+ "epoch": 1.3348017621145374,
2140
+ "grad_norm": 38646.78515625,
2141
+ "learning_rate": 1.4660792951541851e-05,
2142
+ "loss": 0.0305,
2143
+ "step": 1515
2144
+ },
2145
+ {
2146
+ "epoch": 1.3392070484581498,
2147
+ "grad_norm": 81752.4140625,
2148
+ "learning_rate": 1.4643171806167402e-05,
2149
+ "loss": 0.0539,
2150
+ "step": 1520
2151
+ },
2152
+ {
2153
+ "epoch": 1.3436123348017621,
2154
+ "grad_norm": 111795.2265625,
2155
+ "learning_rate": 1.4625550660792953e-05,
2156
+ "loss": 0.0567,
2157
+ "step": 1525
2158
+ },
2159
+ {
2160
+ "epoch": 1.3480176211453745,
2161
+ "grad_norm": 204786.171875,
2162
+ "learning_rate": 1.4607929515418504e-05,
2163
+ "loss": 0.0577,
2164
+ "step": 1530
2165
+ },
2166
+ {
2167
+ "epoch": 1.3524229074889869,
2168
+ "grad_norm": 43613.78125,
2169
+ "learning_rate": 1.4590308370044055e-05,
2170
+ "loss": 0.0618,
2171
+ "step": 1535
2172
+ },
2173
+ {
2174
+ "epoch": 1.356828193832599,
2175
+ "grad_norm": 66957.2109375,
2176
+ "learning_rate": 1.4572687224669604e-05,
2177
+ "loss": 0.0522,
2178
+ "step": 1540
2179
+ },
2180
+ {
2181
+ "epoch": 1.3612334801762114,
2182
+ "grad_norm": 67816.5078125,
2183
+ "learning_rate": 1.4555066079295155e-05,
2184
+ "loss": 0.0457,
2185
+ "step": 1545
2186
+ },
2187
+ {
2188
+ "epoch": 1.3656387665198237,
2189
+ "grad_norm": 70392.7421875,
2190
+ "learning_rate": 1.4537444933920706e-05,
2191
+ "loss": 0.0787,
2192
+ "step": 1550
2193
+ },
2194
+ {
2195
+ "epoch": 1.3700440528634361,
2196
+ "grad_norm": 82702.4765625,
2197
+ "learning_rate": 1.4519823788546256e-05,
2198
+ "loss": 0.0709,
2199
+ "step": 1555
2200
+ },
2201
+ {
2202
+ "epoch": 1.3744493392070485,
2203
+ "grad_norm": 52236.546875,
2204
+ "learning_rate": 1.4502202643171807e-05,
2205
+ "loss": 0.0458,
2206
+ "step": 1560
2207
+ },
2208
+ {
2209
+ "epoch": 1.3788546255506609,
2210
+ "grad_norm": 74592.1953125,
2211
+ "learning_rate": 1.4484581497797358e-05,
2212
+ "loss": 0.0406,
2213
+ "step": 1565
2214
+ },
2215
+ {
2216
+ "epoch": 1.3832599118942732,
2217
+ "grad_norm": 96819.328125,
2218
+ "learning_rate": 1.4466960352422909e-05,
2219
+ "loss": 0.0526,
2220
+ "step": 1570
2221
+ },
2222
+ {
2223
+ "epoch": 1.3876651982378854,
2224
+ "grad_norm": 77327.8671875,
2225
+ "learning_rate": 1.4449339207048458e-05,
2226
+ "loss": 0.042,
2227
+ "step": 1575
2228
+ },
2229
+ {
2230
+ "epoch": 1.3920704845814977,
2231
+ "grad_norm": 82270.8984375,
2232
+ "learning_rate": 1.4431718061674009e-05,
2233
+ "loss": 0.0634,
2234
+ "step": 1580
2235
+ },
2236
+ {
2237
+ "epoch": 1.39647577092511,
2238
+ "grad_norm": 93413.078125,
2239
+ "learning_rate": 1.441409691629956e-05,
2240
+ "loss": 0.0806,
2241
+ "step": 1585
2242
+ },
2243
+ {
2244
+ "epoch": 1.4008810572687225,
2245
+ "grad_norm": 135822.9375,
2246
+ "learning_rate": 1.4396475770925111e-05,
2247
+ "loss": 0.0542,
2248
+ "step": 1590
2249
+ },
2250
+ {
2251
+ "epoch": 1.4052863436123348,
2252
+ "grad_norm": 50764.67578125,
2253
+ "learning_rate": 1.4378854625550662e-05,
2254
+ "loss": 0.0509,
2255
+ "step": 1595
2256
+ },
2257
+ {
2258
+ "epoch": 1.4096916299559472,
2259
+ "grad_norm": 99821.6875,
2260
+ "learning_rate": 1.4361233480176213e-05,
2261
+ "loss": 0.0376,
2262
+ "step": 1600
2263
+ },
2264
+ {
2265
+ "epoch": 1.4140969162995596,
2266
+ "grad_norm": 205982.78125,
2267
+ "learning_rate": 1.4343612334801764e-05,
2268
+ "loss": 0.0758,
2269
+ "step": 1605
2270
+ },
2271
+ {
2272
+ "epoch": 1.4185022026431717,
2273
+ "grad_norm": 120500.5703125,
2274
+ "learning_rate": 1.4325991189427313e-05,
2275
+ "loss": 0.0629,
2276
+ "step": 1610
2277
+ },
2278
+ {
2279
+ "epoch": 1.422907488986784,
2280
+ "grad_norm": 132275.53125,
2281
+ "learning_rate": 1.4308370044052864e-05,
2282
+ "loss": 0.0508,
2283
+ "step": 1615
2284
+ },
2285
+ {
2286
+ "epoch": 1.4273127753303965,
2287
+ "grad_norm": 60844.3515625,
2288
+ "learning_rate": 1.4290748898678414e-05,
2289
+ "loss": 0.0372,
2290
+ "step": 1620
2291
+ },
2292
+ {
2293
+ "epoch": 1.4317180616740088,
2294
+ "grad_norm": 59511.27734375,
2295
+ "learning_rate": 1.4273127753303965e-05,
2296
+ "loss": 0.0462,
2297
+ "step": 1625
2298
+ },
2299
+ {
2300
+ "epoch": 1.4361233480176212,
2301
+ "grad_norm": 17155.3203125,
2302
+ "learning_rate": 1.4255506607929516e-05,
2303
+ "loss": 0.0412,
2304
+ "step": 1630
2305
+ },
2306
+ {
2307
+ "epoch": 1.4405286343612334,
2308
+ "grad_norm": 80875.5546875,
2309
+ "learning_rate": 1.4237885462555067e-05,
2310
+ "loss": 0.0538,
2311
+ "step": 1635
2312
+ },
2313
+ {
2314
+ "epoch": 1.444933920704846,
2315
+ "grad_norm": 378728.5625,
2316
+ "learning_rate": 1.4220264317180618e-05,
2317
+ "loss": 0.0758,
2318
+ "step": 1640
2319
+ },
2320
+ {
2321
+ "epoch": 1.449339207048458,
2322
+ "grad_norm": 95268.5,
2323
+ "learning_rate": 1.4202643171806167e-05,
2324
+ "loss": 0.0536,
2325
+ "step": 1645
2326
+ },
2327
+ {
2328
+ "epoch": 1.4537444933920705,
2329
+ "grad_norm": 235169.375,
2330
+ "learning_rate": 1.4185022026431718e-05,
2331
+ "loss": 0.0725,
2332
+ "step": 1650
2333
+ },
2334
+ {
2335
+ "epoch": 1.4581497797356828,
2336
+ "grad_norm": 81580.4375,
2337
+ "learning_rate": 1.4167400881057269e-05,
2338
+ "loss": 0.1212,
2339
+ "step": 1655
2340
+ },
2341
+ {
2342
+ "epoch": 1.4625550660792952,
2343
+ "grad_norm": 114061.8671875,
2344
+ "learning_rate": 1.414977973568282e-05,
2345
+ "loss": 0.048,
2346
+ "step": 1660
2347
+ },
2348
+ {
2349
+ "epoch": 1.4669603524229076,
2350
+ "grad_norm": 82516.2421875,
2351
+ "learning_rate": 1.413215859030837e-05,
2352
+ "loss": 0.0557,
2353
+ "step": 1665
2354
+ },
2355
+ {
2356
+ "epoch": 1.4713656387665197,
2357
+ "grad_norm": 111354.8359375,
2358
+ "learning_rate": 1.4114537444933922e-05,
2359
+ "loss": 0.0404,
2360
+ "step": 1670
2361
+ },
2362
+ {
2363
+ "epoch": 1.475770925110132,
2364
+ "grad_norm": 71242.171875,
2365
+ "learning_rate": 1.4096916299559472e-05,
2366
+ "loss": 0.0848,
2367
+ "step": 1675
2368
+ },
2369
+ {
2370
+ "epoch": 1.4801762114537445,
2371
+ "grad_norm": 121225.9609375,
2372
+ "learning_rate": 1.4079295154185025e-05,
2373
+ "loss": 0.0423,
2374
+ "step": 1680
2375
+ },
2376
+ {
2377
+ "epoch": 1.4845814977973568,
2378
+ "grad_norm": 95902.4765625,
2379
+ "learning_rate": 1.4061674008810573e-05,
2380
+ "loss": 0.0451,
2381
+ "step": 1685
2382
+ },
2383
+ {
2384
+ "epoch": 1.4889867841409692,
2385
+ "grad_norm": 96145.765625,
2386
+ "learning_rate": 1.4044052863436123e-05,
2387
+ "loss": 0.0434,
2388
+ "step": 1690
2389
+ },
2390
+ {
2391
+ "epoch": 1.4933920704845816,
2392
+ "grad_norm": 54513.20703125,
2393
+ "learning_rate": 1.4026431718061674e-05,
2394
+ "loss": 0.0743,
2395
+ "step": 1695
2396
+ },
2397
+ {
2398
+ "epoch": 1.497797356828194,
2399
+ "grad_norm": 110735.2890625,
2400
+ "learning_rate": 1.4008810572687225e-05,
2401
+ "loss": 0.0364,
2402
+ "step": 1700
2403
+ },
2404
+ {
2405
+ "epoch": 1.502202643171806,
2406
+ "grad_norm": 53126.44921875,
2407
+ "learning_rate": 1.3991189427312776e-05,
2408
+ "loss": 0.0673,
2409
+ "step": 1705
2410
+ },
2411
+ {
2412
+ "epoch": 1.5066079295154187,
2413
+ "grad_norm": 95195.265625,
2414
+ "learning_rate": 1.3973568281938329e-05,
2415
+ "loss": 0.0561,
2416
+ "step": 1710
2417
+ },
2418
+ {
2419
+ "epoch": 1.5110132158590308,
2420
+ "grad_norm": 89720.8046875,
2421
+ "learning_rate": 1.395594713656388e-05,
2422
+ "loss": 0.0426,
2423
+ "step": 1715
2424
+ },
2425
+ {
2426
+ "epoch": 1.5154185022026432,
2427
+ "grad_norm": 113429.3515625,
2428
+ "learning_rate": 1.3938325991189427e-05,
2429
+ "loss": 0.0484,
2430
+ "step": 1720
2431
+ },
2432
+ {
2433
+ "epoch": 1.5198237885462555,
2434
+ "grad_norm": 61384.51953125,
2435
+ "learning_rate": 1.3920704845814978e-05,
2436
+ "loss": 0.0469,
2437
+ "step": 1725
2438
+ },
2439
+ {
2440
+ "epoch": 1.5242290748898677,
2441
+ "grad_norm": 117991.0546875,
2442
+ "learning_rate": 1.3903083700440529e-05,
2443
+ "loss": 0.0472,
2444
+ "step": 1730
2445
+ },
2446
+ {
2447
+ "epoch": 1.5286343612334803,
2448
+ "grad_norm": 88646.40625,
2449
+ "learning_rate": 1.388546255506608e-05,
2450
+ "loss": 0.0504,
2451
+ "step": 1735
2452
+ },
2453
+ {
2454
+ "epoch": 1.5330396475770924,
2455
+ "grad_norm": 69307.2578125,
2456
+ "learning_rate": 1.386784140969163e-05,
2457
+ "loss": 0.0438,
2458
+ "step": 1740
2459
+ },
2460
+ {
2461
+ "epoch": 1.5374449339207048,
2462
+ "grad_norm": 54503.0546875,
2463
+ "learning_rate": 1.3850220264317183e-05,
2464
+ "loss": 0.0763,
2465
+ "step": 1745
2466
+ },
2467
+ {
2468
+ "epoch": 1.5418502202643172,
2469
+ "grad_norm": 98683.953125,
2470
+ "learning_rate": 1.3832599118942734e-05,
2471
+ "loss": 0.0548,
2472
+ "step": 1750
2473
+ },
2474
+ {
2475
+ "epoch": 1.5462555066079295,
2476
+ "grad_norm": 42818.6640625,
2477
+ "learning_rate": 1.3814977973568282e-05,
2478
+ "loss": 0.0361,
2479
+ "step": 1755
2480
+ },
2481
+ {
2482
+ "epoch": 1.550660792951542,
2483
+ "grad_norm": 132435.703125,
2484
+ "learning_rate": 1.3797356828193832e-05,
2485
+ "loss": 0.066,
2486
+ "step": 1760
2487
+ },
2488
+ {
2489
+ "epoch": 1.555066079295154,
2490
+ "grad_norm": 174812.21875,
2491
+ "learning_rate": 1.3779735682819383e-05,
2492
+ "loss": 0.067,
2493
+ "step": 1765
2494
+ },
2495
+ {
2496
+ "epoch": 1.5594713656387666,
2497
+ "grad_norm": 93800.578125,
2498
+ "learning_rate": 1.3762114537444934e-05,
2499
+ "loss": 0.0593,
2500
+ "step": 1770
2501
+ },
2502
+ {
2503
+ "epoch": 1.5638766519823788,
2504
+ "grad_norm": 81683.9609375,
2505
+ "learning_rate": 1.3744493392070487e-05,
2506
+ "loss": 0.0612,
2507
+ "step": 1775
2508
+ },
2509
+ {
2510
+ "epoch": 1.5682819383259912,
2511
+ "grad_norm": 112240.7734375,
2512
+ "learning_rate": 1.3726872246696038e-05,
2513
+ "loss": 0.0545,
2514
+ "step": 1780
2515
+ },
2516
+ {
2517
+ "epoch": 1.5726872246696035,
2518
+ "grad_norm": 137638.65625,
2519
+ "learning_rate": 1.3709251101321588e-05,
2520
+ "loss": 0.0471,
2521
+ "step": 1785
2522
+ },
2523
+ {
2524
+ "epoch": 1.577092511013216,
2525
+ "grad_norm": 123304.8359375,
2526
+ "learning_rate": 1.3691629955947136e-05,
2527
+ "loss": 0.0488,
2528
+ "step": 1790
2529
+ },
2530
+ {
2531
+ "epoch": 1.5814977973568283,
2532
+ "grad_norm": 126164.453125,
2533
+ "learning_rate": 1.3674008810572687e-05,
2534
+ "loss": 0.0499,
2535
+ "step": 1795
2536
+ },
2537
+ {
2538
+ "epoch": 1.5859030837004404,
2539
+ "grad_norm": 107429.15625,
2540
+ "learning_rate": 1.3656387665198238e-05,
2541
+ "loss": 0.0619,
2542
+ "step": 1800
2543
+ },
2544
+ {
2545
+ "epoch": 1.590308370044053,
2546
+ "grad_norm": 119239.5390625,
2547
+ "learning_rate": 1.363876651982379e-05,
2548
+ "loss": 0.0923,
2549
+ "step": 1805
2550
+ },
2551
+ {
2552
+ "epoch": 1.5947136563876652,
2553
+ "grad_norm": 70938.0,
2554
+ "learning_rate": 1.3621145374449341e-05,
2555
+ "loss": 0.0508,
2556
+ "step": 1810
2557
+ },
2558
+ {
2559
+ "epoch": 1.5991189427312775,
2560
+ "grad_norm": 78114.5625,
2561
+ "learning_rate": 1.3603524229074892e-05,
2562
+ "loss": 0.0447,
2563
+ "step": 1815
2564
+ },
2565
+ {
2566
+ "epoch": 1.60352422907489,
2567
+ "grad_norm": 73220.28125,
2568
+ "learning_rate": 1.3585903083700443e-05,
2569
+ "loss": 0.0813,
2570
+ "step": 1820
2571
+ },
2572
+ {
2573
+ "epoch": 1.607929515418502,
2574
+ "grad_norm": 75684.3671875,
2575
+ "learning_rate": 1.3568281938325994e-05,
2576
+ "loss": 0.0758,
2577
+ "step": 1825
2578
+ },
2579
+ {
2580
+ "epoch": 1.6123348017621146,
2581
+ "grad_norm": 127801.0390625,
2582
+ "learning_rate": 1.3550660792951541e-05,
2583
+ "loss": 0.0626,
2584
+ "step": 1830
2585
+ },
2586
+ {
2587
+ "epoch": 1.6167400881057268,
2588
+ "grad_norm": 254675.78125,
2589
+ "learning_rate": 1.3533039647577094e-05,
2590
+ "loss": 0.0836,
2591
+ "step": 1835
2592
+ },
2593
+ {
2594
+ "epoch": 1.6211453744493394,
2595
+ "grad_norm": 125710.7421875,
2596
+ "learning_rate": 1.3515418502202645e-05,
2597
+ "loss": 0.0629,
2598
+ "step": 1840
2599
+ },
2600
+ {
2601
+ "epoch": 1.6255506607929515,
2602
+ "grad_norm": 80626.09375,
2603
+ "learning_rate": 1.3497797356828196e-05,
2604
+ "loss": 0.079,
2605
+ "step": 1845
2606
+ },
2607
+ {
2608
+ "epoch": 1.6299559471365639,
2609
+ "grad_norm": 82203.9140625,
2610
+ "learning_rate": 1.3480176211453747e-05,
2611
+ "loss": 0.0763,
2612
+ "step": 1850
2613
+ },
2614
+ {
2615
+ "epoch": 1.6343612334801763,
2616
+ "grad_norm": 102255.546875,
2617
+ "learning_rate": 1.3462555066079297e-05,
2618
+ "loss": 0.0733,
2619
+ "step": 1855
2620
+ },
2621
+ {
2622
+ "epoch": 1.6387665198237884,
2623
+ "grad_norm": 81746.6875,
2624
+ "learning_rate": 1.3444933920704848e-05,
2625
+ "loss": 0.0373,
2626
+ "step": 1860
2627
+ },
2628
+ {
2629
+ "epoch": 1.643171806167401,
2630
+ "grad_norm": 93679.203125,
2631
+ "learning_rate": 1.3427312775330398e-05,
2632
+ "loss": 0.0395,
2633
+ "step": 1865
2634
+ },
2635
+ {
2636
+ "epoch": 1.6475770925110131,
2637
+ "grad_norm": 58704.046875,
2638
+ "learning_rate": 1.3409691629955948e-05,
2639
+ "loss": 0.0743,
2640
+ "step": 1870
2641
+ },
2642
+ {
2643
+ "epoch": 1.6519823788546255,
2644
+ "grad_norm": 49662.890625,
2645
+ "learning_rate": 1.33920704845815e-05,
2646
+ "loss": 0.0506,
2647
+ "step": 1875
2648
+ },
2649
+ {
2650
+ "epoch": 1.6563876651982379,
2651
+ "grad_norm": 122799.9140625,
2652
+ "learning_rate": 1.337444933920705e-05,
2653
+ "loss": 0.0613,
2654
+ "step": 1880
2655
+ },
2656
+ {
2657
+ "epoch": 1.6607929515418502,
2658
+ "grad_norm": 80782.875,
2659
+ "learning_rate": 1.3356828193832601e-05,
2660
+ "loss": 0.0432,
2661
+ "step": 1885
2662
+ },
2663
+ {
2664
+ "epoch": 1.6651982378854626,
2665
+ "grad_norm": 151278.734375,
2666
+ "learning_rate": 1.3339207048458152e-05,
2667
+ "loss": 0.0538,
2668
+ "step": 1890
2669
+ },
2670
+ {
2671
+ "epoch": 1.6696035242290748,
2672
+ "grad_norm": 148539.515625,
2673
+ "learning_rate": 1.3321585903083703e-05,
2674
+ "loss": 0.0466,
2675
+ "step": 1895
2676
+ },
2677
+ {
2678
+ "epoch": 1.6740088105726874,
2679
+ "grad_norm": 22102.578125,
2680
+ "learning_rate": 1.3303964757709252e-05,
2681
+ "loss": 0.0353,
2682
+ "step": 1900
2683
+ },
2684
+ {
2685
+ "epoch": 1.6784140969162995,
2686
+ "grad_norm": 96858.3203125,
2687
+ "learning_rate": 1.3286343612334803e-05,
2688
+ "loss": 0.0726,
2689
+ "step": 1905
2690
+ },
2691
+ {
2692
+ "epoch": 1.6828193832599119,
2693
+ "grad_norm": 172830.28125,
2694
+ "learning_rate": 1.3268722466960354e-05,
2695
+ "loss": 0.0666,
2696
+ "step": 1910
2697
+ },
2698
+ {
2699
+ "epoch": 1.6872246696035242,
2700
+ "grad_norm": 142388.375,
2701
+ "learning_rate": 1.3251101321585905e-05,
2702
+ "loss": 0.0415,
2703
+ "step": 1915
2704
+ },
2705
+ {
2706
+ "epoch": 1.6916299559471366,
2707
+ "grad_norm": 70996.0234375,
2708
+ "learning_rate": 1.3233480176211456e-05,
2709
+ "loss": 0.0669,
2710
+ "step": 1920
2711
+ },
2712
+ {
2713
+ "epoch": 1.696035242290749,
2714
+ "grad_norm": 170509.53125,
2715
+ "learning_rate": 1.3215859030837006e-05,
2716
+ "loss": 0.0597,
2717
+ "step": 1925
2718
+ },
2719
+ {
2720
+ "epoch": 1.7004405286343611,
2721
+ "grad_norm": 36728.19921875,
2722
+ "learning_rate": 1.3198237885462557e-05,
2723
+ "loss": 0.0391,
2724
+ "step": 1930
2725
+ },
2726
+ {
2727
+ "epoch": 1.7048458149779737,
2728
+ "grad_norm": 94136.2265625,
2729
+ "learning_rate": 1.3180616740088106e-05,
2730
+ "loss": 0.0449,
2731
+ "step": 1935
2732
+ },
2733
+ {
2734
+ "epoch": 1.7092511013215859,
2735
+ "grad_norm": 60395.50390625,
2736
+ "learning_rate": 1.3162995594713657e-05,
2737
+ "loss": 0.0518,
2738
+ "step": 1940
2739
+ },
2740
+ {
2741
+ "epoch": 1.7136563876651982,
2742
+ "grad_norm": 52403.046875,
2743
+ "learning_rate": 1.3145374449339208e-05,
2744
+ "loss": 0.0704,
2745
+ "step": 1945
2746
+ },
2747
+ {
2748
+ "epoch": 1.7180616740088106,
2749
+ "grad_norm": 52815.953125,
2750
+ "learning_rate": 1.3127753303964759e-05,
2751
+ "loss": 0.0694,
2752
+ "step": 1950
2753
+ },
2754
+ {
2755
+ "epoch": 1.7224669603524227,
2756
+ "grad_norm": 83290.625,
2757
+ "learning_rate": 1.311013215859031e-05,
2758
+ "loss": 0.0492,
2759
+ "step": 1955
2760
+ },
2761
+ {
2762
+ "epoch": 1.7268722466960353,
2763
+ "grad_norm": 120697.515625,
2764
+ "learning_rate": 1.3092511013215861e-05,
2765
+ "loss": 0.0474,
2766
+ "step": 1960
2767
+ },
2768
+ {
2769
+ "epoch": 1.7312775330396475,
2770
+ "grad_norm": 105623.4375,
2771
+ "learning_rate": 1.3074889867841412e-05,
2772
+ "loss": 0.0485,
2773
+ "step": 1965
2774
+ },
2775
+ {
2776
+ "epoch": 1.73568281938326,
2777
+ "grad_norm": 96892.4140625,
2778
+ "learning_rate": 1.3057268722466961e-05,
2779
+ "loss": 0.06,
2780
+ "step": 1970
2781
+ },
2782
+ {
2783
+ "epoch": 1.7400881057268722,
2784
+ "grad_norm": 143312.1875,
2785
+ "learning_rate": 1.3039647577092512e-05,
2786
+ "loss": 0.056,
2787
+ "step": 1975
2788
+ },
2789
+ {
2790
+ "epoch": 1.7444933920704846,
2791
+ "grad_norm": 95480.8125,
2792
+ "learning_rate": 1.3022026431718063e-05,
2793
+ "loss": 0.0886,
2794
+ "step": 1980
2795
+ },
2796
+ {
2797
+ "epoch": 1.748898678414097,
2798
+ "grad_norm": 154313.875,
2799
+ "learning_rate": 1.3004405286343614e-05,
2800
+ "loss": 0.0663,
2801
+ "step": 1985
2802
+ },
2803
+ {
2804
+ "epoch": 1.753303964757709,
2805
+ "grad_norm": 75262.078125,
2806
+ "learning_rate": 1.2986784140969164e-05,
2807
+ "loss": 0.0423,
2808
+ "step": 1990
2809
+ },
2810
+ {
2811
+ "epoch": 1.7577092511013217,
2812
+ "grad_norm": 37457.72265625,
2813
+ "learning_rate": 1.2969162995594715e-05,
2814
+ "loss": 0.0428,
2815
+ "step": 1995
2816
+ },
2817
+ {
2818
+ "epoch": 1.7621145374449338,
2819
+ "grad_norm": 164677.90625,
2820
+ "learning_rate": 1.2951541850220266e-05,
2821
+ "loss": 0.0678,
2822
+ "step": 2000
2823
+ },
2824
+ {
2825
+ "epoch": 1.7665198237885462,
2826
+ "grad_norm": 57418.01953125,
2827
+ "learning_rate": 1.2933920704845817e-05,
2828
+ "loss": 0.064,
2829
+ "step": 2005
2830
+ },
2831
+ {
2832
+ "epoch": 1.7709251101321586,
2833
+ "grad_norm": 115902.359375,
2834
+ "learning_rate": 1.2916299559471366e-05,
2835
+ "loss": 0.0869,
2836
+ "step": 2010
2837
+ },
2838
+ {
2839
+ "epoch": 1.775330396475771,
2840
+ "grad_norm": 153639.390625,
2841
+ "learning_rate": 1.2898678414096917e-05,
2842
+ "loss": 0.0524,
2843
+ "step": 2015
2844
+ },
2845
+ {
2846
+ "epoch": 1.7797356828193833,
2847
+ "grad_norm": 116208.609375,
2848
+ "learning_rate": 1.2881057268722468e-05,
2849
+ "loss": 0.1127,
2850
+ "step": 2020
2851
+ },
2852
+ {
2853
+ "epoch": 1.7841409691629955,
2854
+ "grad_norm": 175014.640625,
2855
+ "learning_rate": 1.2863436123348019e-05,
2856
+ "loss": 0.073,
2857
+ "step": 2025
2858
+ },
2859
+ {
2860
+ "epoch": 1.788546255506608,
2861
+ "grad_norm": 154037.015625,
2862
+ "learning_rate": 1.284581497797357e-05,
2863
+ "loss": 0.0292,
2864
+ "step": 2030
2865
+ },
2866
+ {
2867
+ "epoch": 1.7929515418502202,
2868
+ "grad_norm": 62813.921875,
2869
+ "learning_rate": 1.282819383259912e-05,
2870
+ "loss": 0.0302,
2871
+ "step": 2035
2872
+ },
2873
+ {
2874
+ "epoch": 1.7973568281938326,
2875
+ "grad_norm": 120037.15625,
2876
+ "learning_rate": 1.2810572687224672e-05,
2877
+ "loss": 0.0431,
2878
+ "step": 2040
2879
+ },
2880
+ {
2881
+ "epoch": 1.801762114537445,
2882
+ "grad_norm": 131998.609375,
2883
+ "learning_rate": 1.279295154185022e-05,
2884
+ "loss": 0.0476,
2885
+ "step": 2045
2886
+ },
2887
+ {
2888
+ "epoch": 1.8061674008810573,
2889
+ "grad_norm": 99164.234375,
2890
+ "learning_rate": 1.2775330396475772e-05,
2891
+ "loss": 0.0297,
2892
+ "step": 2050
2893
+ },
2894
+ {
2895
+ "epoch": 1.8105726872246697,
2896
+ "grad_norm": 142317.390625,
2897
+ "learning_rate": 1.2757709251101323e-05,
2898
+ "loss": 0.0434,
2899
+ "step": 2055
2900
+ },
2901
+ {
2902
+ "epoch": 1.8149779735682818,
2903
+ "grad_norm": 46120.3671875,
2904
+ "learning_rate": 1.2740088105726873e-05,
2905
+ "loss": 0.0815,
2906
+ "step": 2060
2907
+ },
2908
+ {
2909
+ "epoch": 1.8193832599118944,
2910
+ "grad_norm": 104930.0703125,
2911
+ "learning_rate": 1.2722466960352424e-05,
2912
+ "loss": 0.0523,
2913
+ "step": 2065
2914
+ },
2915
+ {
2916
+ "epoch": 1.8237885462555066,
2917
+ "grad_norm": 76672.9140625,
2918
+ "learning_rate": 1.2704845814977975e-05,
2919
+ "loss": 0.0869,
2920
+ "step": 2070
2921
+ },
2922
+ {
2923
+ "epoch": 1.828193832599119,
2924
+ "grad_norm": 144851.765625,
2925
+ "learning_rate": 1.2687224669603526e-05,
2926
+ "loss": 0.0721,
2927
+ "step": 2075
2928
+ },
2929
+ {
2930
+ "epoch": 1.8325991189427313,
2931
+ "grad_norm": 69859.546875,
2932
+ "learning_rate": 1.2669603524229075e-05,
2933
+ "loss": 0.0556,
2934
+ "step": 2080
2935
+ },
2936
+ {
2937
+ "epoch": 1.8370044052863435,
2938
+ "grad_norm": 55571.99609375,
2939
+ "learning_rate": 1.2651982378854626e-05,
2940
+ "loss": 0.0721,
2941
+ "step": 2085
2942
+ },
2943
+ {
2944
+ "epoch": 1.841409691629956,
2945
+ "grad_norm": 147866.203125,
2946
+ "learning_rate": 1.2634361233480177e-05,
2947
+ "loss": 0.051,
2948
+ "step": 2090
2949
+ },
2950
+ {
2951
+ "epoch": 1.8458149779735682,
2952
+ "grad_norm": 70289.234375,
2953
+ "learning_rate": 1.2616740088105728e-05,
2954
+ "loss": 0.0809,
2955
+ "step": 2095
2956
+ },
2957
+ {
2958
+ "epoch": 1.8502202643171806,
2959
+ "grad_norm": 90098.3203125,
2960
+ "learning_rate": 1.2599118942731279e-05,
2961
+ "loss": 0.0645,
2962
+ "step": 2100
2963
+ },
2964
+ {
2965
+ "epoch": 1.854625550660793,
2966
+ "grad_norm": 151529.96875,
2967
+ "learning_rate": 1.258149779735683e-05,
2968
+ "loss": 0.0589,
2969
+ "step": 2105
2970
+ },
2971
+ {
2972
+ "epoch": 1.8590308370044053,
2973
+ "grad_norm": 102989.0625,
2974
+ "learning_rate": 1.256387665198238e-05,
2975
+ "loss": 0.0416,
2976
+ "step": 2110
2977
+ },
2978
+ {
2979
+ "epoch": 1.8634361233480177,
2980
+ "grad_norm": 26888.615234375,
2981
+ "learning_rate": 1.254625550660793e-05,
2982
+ "loss": 0.066,
2983
+ "step": 2115
2984
+ },
2985
+ {
2986
+ "epoch": 1.8678414096916298,
2987
+ "grad_norm": 127484.2265625,
2988
+ "learning_rate": 1.252863436123348e-05,
2989
+ "loss": 0.0615,
2990
+ "step": 2120
2991
+ },
2992
+ {
2993
+ "epoch": 1.8722466960352424,
2994
+ "grad_norm": 78853.7265625,
2995
+ "learning_rate": 1.2511013215859032e-05,
2996
+ "loss": 0.0576,
2997
+ "step": 2125
2998
+ },
2999
+ {
3000
+ "epoch": 1.8766519823788546,
3001
+ "grad_norm": 75856.0625,
3002
+ "learning_rate": 1.2493392070484582e-05,
3003
+ "loss": 0.0464,
3004
+ "step": 2130
3005
+ },
3006
+ {
3007
+ "epoch": 1.881057268722467,
3008
+ "grad_norm": 107549.515625,
3009
+ "learning_rate": 1.2475770925110133e-05,
3010
+ "loss": 0.0696,
3011
+ "step": 2135
3012
+ },
3013
+ {
3014
+ "epoch": 1.8854625550660793,
3015
+ "grad_norm": 93227.1953125,
3016
+ "learning_rate": 1.2458149779735684e-05,
3017
+ "loss": 0.0491,
3018
+ "step": 2140
3019
+ },
3020
+ {
3021
+ "epoch": 1.8898678414096917,
3022
+ "grad_norm": 63623.6171875,
3023
+ "learning_rate": 1.2440528634361235e-05,
3024
+ "loss": 0.0665,
3025
+ "step": 2145
3026
+ },
3027
+ {
3028
+ "epoch": 1.894273127753304,
3029
+ "grad_norm": 111105.1484375,
3030
+ "learning_rate": 1.2422907488986786e-05,
3031
+ "loss": 0.0594,
3032
+ "step": 2150
3033
+ },
3034
+ {
3035
+ "epoch": 1.8986784140969162,
3036
+ "grad_norm": 187963.828125,
3037
+ "learning_rate": 1.2405286343612335e-05,
3038
+ "loss": 0.07,
3039
+ "step": 2155
3040
+ },
3041
+ {
3042
+ "epoch": 1.9030837004405288,
3043
+ "grad_norm": 113976.90625,
3044
+ "learning_rate": 1.2387665198237886e-05,
3045
+ "loss": 0.0763,
3046
+ "step": 2160
3047
+ },
3048
+ {
3049
+ "epoch": 1.907488986784141,
3050
+ "grad_norm": 108388.859375,
3051
+ "learning_rate": 1.2370044052863437e-05,
3052
+ "loss": 0.0587,
3053
+ "step": 2165
3054
+ },
3055
+ {
3056
+ "epoch": 1.9118942731277533,
3057
+ "grad_norm": 134882.953125,
3058
+ "learning_rate": 1.2352422907488988e-05,
3059
+ "loss": 0.0541,
3060
+ "step": 2170
3061
+ },
3062
+ {
3063
+ "epoch": 1.9162995594713657,
3064
+ "grad_norm": 56433.33203125,
3065
+ "learning_rate": 1.2334801762114539e-05,
3066
+ "loss": 0.058,
3067
+ "step": 2175
3068
+ },
3069
+ {
3070
+ "epoch": 1.920704845814978,
3071
+ "grad_norm": 127455.4453125,
3072
+ "learning_rate": 1.231718061674009e-05,
3073
+ "loss": 0.042,
3074
+ "step": 2180
3075
+ },
3076
+ {
3077
+ "epoch": 1.9251101321585904,
3078
+ "grad_norm": 78662.1875,
3079
+ "learning_rate": 1.229955947136564e-05,
3080
+ "loss": 0.035,
3081
+ "step": 2185
3082
+ },
3083
+ {
3084
+ "epoch": 1.9295154185022025,
3085
+ "grad_norm": 124117.0,
3086
+ "learning_rate": 1.228193832599119e-05,
3087
+ "loss": 0.0738,
3088
+ "step": 2190
3089
+ },
3090
+ {
3091
+ "epoch": 1.9339207048458151,
3092
+ "grad_norm": 27490.91015625,
3093
+ "learning_rate": 1.226431718061674e-05,
3094
+ "loss": 0.0402,
3095
+ "step": 2195
3096
+ },
3097
+ {
3098
+ "epoch": 1.9383259911894273,
3099
+ "grad_norm": 106559.34375,
3100
+ "learning_rate": 1.2246696035242291e-05,
3101
+ "loss": 0.0631,
3102
+ "step": 2200
3103
+ },
3104
+ {
3105
+ "epoch": 1.9427312775330396,
3106
+ "grad_norm": 144464.453125,
3107
+ "learning_rate": 1.2229074889867842e-05,
3108
+ "loss": 0.0552,
3109
+ "step": 2205
3110
+ },
3111
+ {
3112
+ "epoch": 1.947136563876652,
3113
+ "grad_norm": 141480.140625,
3114
+ "learning_rate": 1.2211453744493393e-05,
3115
+ "loss": 0.0796,
3116
+ "step": 2210
3117
+ },
3118
+ {
3119
+ "epoch": 1.9515418502202642,
3120
+ "grad_norm": 72824.8203125,
3121
+ "learning_rate": 1.2193832599118944e-05,
3122
+ "loss": 0.0312,
3123
+ "step": 2215
3124
+ },
3125
+ {
3126
+ "epoch": 1.9559471365638768,
3127
+ "grad_norm": 57519.6328125,
3128
+ "learning_rate": 1.2176211453744495e-05,
3129
+ "loss": 0.041,
3130
+ "step": 2220
3131
+ },
3132
+ {
3133
+ "epoch": 1.960352422907489,
3134
+ "grad_norm": 70551.2421875,
3135
+ "learning_rate": 1.2158590308370044e-05,
3136
+ "loss": 0.0492,
3137
+ "step": 2225
3138
+ },
3139
+ {
3140
+ "epoch": 1.9647577092511013,
3141
+ "grad_norm": 169272.984375,
3142
+ "learning_rate": 1.2140969162995595e-05,
3143
+ "loss": 0.0637,
3144
+ "step": 2230
3145
+ },
3146
+ {
3147
+ "epoch": 1.9691629955947136,
3148
+ "grad_norm": 103776.7265625,
3149
+ "learning_rate": 1.2123348017621146e-05,
3150
+ "loss": 0.0527,
3151
+ "step": 2235
3152
+ },
3153
+ {
3154
+ "epoch": 1.973568281938326,
3155
+ "grad_norm": 35581.64453125,
3156
+ "learning_rate": 1.2105726872246697e-05,
3157
+ "loss": 0.1226,
3158
+ "step": 2240
3159
+ },
3160
+ {
3161
+ "epoch": 1.9779735682819384,
3162
+ "grad_norm": 115400.7421875,
3163
+ "learning_rate": 1.2088105726872248e-05,
3164
+ "loss": 0.0509,
3165
+ "step": 2245
3166
+ },
3167
+ {
3168
+ "epoch": 1.9823788546255505,
3169
+ "grad_norm": 121133.8125,
3170
+ "learning_rate": 1.2070484581497798e-05,
3171
+ "loss": 0.0676,
3172
+ "step": 2250
3173
+ },
3174
+ {
3175
+ "epoch": 1.9867841409691631,
3176
+ "grad_norm": 123540.25,
3177
+ "learning_rate": 1.205286343612335e-05,
3178
+ "loss": 0.0396,
3179
+ "step": 2255
3180
+ },
3181
+ {
3182
+ "epoch": 1.9911894273127753,
3183
+ "grad_norm": 116588.140625,
3184
+ "learning_rate": 1.2035242290748899e-05,
3185
+ "loss": 0.098,
3186
+ "step": 2260
3187
+ },
3188
+ {
3189
+ "epoch": 1.9955947136563876,
3190
+ "grad_norm": 77203.84375,
3191
+ "learning_rate": 1.201762114537445e-05,
3192
+ "loss": 0.046,
3193
+ "step": 2265
3194
+ },
3195
+ {
3196
+ "epoch": 2.0,
3197
+ "grad_norm": 97453.390625,
3198
+ "learning_rate": 1.2e-05,
3199
+ "loss": 0.0379,
3200
+ "step": 2270
3201
+ },
3202
+ {
3203
+ "epoch": 2.0,
3204
+ "eval_accuracy_score": 0.9774530271398747,
3205
+ "eval_f1": 0.884310291542186,
3206
+ "eval_loss": 0.06808020919561386,
3207
+ "eval_precision": 0.8853135313531353,
3208
+ "eval_recall": 0.8833093229059478,
3209
+ "eval_runtime": 56.8181,
3210
+ "eval_samples_per_second": 35.517,
3211
+ "eval_steps_per_second": 4.453,
3212
+ "step": 2270
3213
+ }
3214
+ ],
3215
+ "logging_steps": 5,
3216
+ "max_steps": 5675,
3217
+ "num_input_tokens_seen": 0,
3218
+ "num_train_epochs": 5,
3219
+ "save_steps": 500,
3220
+ "stateful_callbacks": {
3221
+ "TrainerControl": {
3222
+ "args": {
3223
+ "should_epoch_stop": false,
3224
+ "should_evaluate": false,
3225
+ "should_log": false,
3226
+ "should_save": true,
3227
+ "should_training_stop": false
3228
+ },
3229
+ "attributes": {}
3230
+ }
3231
+ },
3232
+ "total_flos": 5387250571518042.0,
3233
+ "train_batch_size": 8,
3234
+ "trial_name": null,
3235
+ "trial_params": null
3236
+ }
checkpoint-2270/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbf6ee382bdf4fc7a704ade5bbd728f43b83dd5b0a0779f6ffdaf51f9e4d6d2
3
+ size 5432
checkpoint-2270/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3405/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ElectraForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "embedding_size": 1024,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "id2label": {
12
+ "0": "O",
13
+ "1": "B-antineoplastic",
14
+ "2": "B-cancertype",
15
+ "3": "B-carcinogen",
16
+ "4": "B-negative",
17
+ "5": "I-antineoplastic",
18
+ "6": "I-cancertype",
19
+ "7": "I-carcinogen",
20
+ "8": "I-negative"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "label2id": {
25
+ "B-antineoplastic": 1,
26
+ "B-cancertype": 2,
27
+ "B-carcinogen": 3,
28
+ "B-negative": 4,
29
+ "I-antineoplastic": 5,
30
+ "I-cancertype": 6,
31
+ "I-carcinogen": 7,
32
+ "I-negative": 8,
33
+ "O": 0
34
+ },
35
+ "layer_norm_eps": 1e-12,
36
+ "max_position_embeddings": 512,
37
+ "model_type": "electra",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 24,
40
+ "pad_token_id": 0,
41
+ "position_embedding_type": "absolute",
42
+ "summary_activation": "gelu",
43
+ "summary_last_dropout": 0.1,
44
+ "summary_type": "first",
45
+ "summary_use_proj": true,
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.50.0",
48
+ "type_vocab_size": 2,
49
+ "use_cache": true,
50
+ "vocab_size": 28895
51
+ }
checkpoint-3405/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bd24c2a71923ecdaf81aee12d68a24ab4412ea3128ede713af303d8a17a6372
3
+ size 1329789812
checkpoint-3405/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02e3466dc83c3be17c5a0f2b6c6bf13f136dd1a4acc828e56dc6158fb3039c1
3
+ size 2659810855
checkpoint-3405/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9fb0e75bb5f3a7cd9306885ba9c94f400761810c6873fc6802046229b208cc3
3
+ size 14244
checkpoint-3405/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
3
+ size 988
checkpoint-3405/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdda42c56439ac97097ac88cc2da2af8c10a9b8a13dc974c952552608aaaaf94
3
+ size 1064
checkpoint-3405/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-3405/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3405/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "ignore_mismatched_sizes": true,
50
+ "mask_token": "[MASK]",
51
+ "max_len": 512,
52
+ "model_max_length": 512,
53
+ "never_split": null,
54
+ "pad_token": "[PAD]",
55
+ "padding": true,
56
+ "sep_token": "[SEP]",
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "ElectraTokenizer",
60
+ "truncation": true,
61
+ "unk_token": "[UNK]"
62
+ }
checkpoint-3405/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3405/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbf6ee382bdf4fc7a704ade5bbd728f43b83dd5b0a0779f6ffdaf51f9e4d6d2
3
+ size 5432
checkpoint-3405/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4540/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ElectraForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "embedding_size": 1024,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "id2label": {
12
+ "0": "O",
13
+ "1": "B-antineoplastic",
14
+ "2": "B-cancertype",
15
+ "3": "B-carcinogen",
16
+ "4": "B-negative",
17
+ "5": "I-antineoplastic",
18
+ "6": "I-cancertype",
19
+ "7": "I-carcinogen",
20
+ "8": "I-negative"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "label2id": {
25
+ "B-antineoplastic": 1,
26
+ "B-cancertype": 2,
27
+ "B-carcinogen": 3,
28
+ "B-negative": 4,
29
+ "I-antineoplastic": 5,
30
+ "I-cancertype": 6,
31
+ "I-carcinogen": 7,
32
+ "I-negative": 8,
33
+ "O": 0
34
+ },
35
+ "layer_norm_eps": 1e-12,
36
+ "max_position_embeddings": 512,
37
+ "model_type": "electra",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 24,
40
+ "pad_token_id": 0,
41
+ "position_embedding_type": "absolute",
42
+ "summary_activation": "gelu",
43
+ "summary_last_dropout": 0.1,
44
+ "summary_type": "first",
45
+ "summary_use_proj": true,
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.50.0",
48
+ "type_vocab_size": 2,
49
+ "use_cache": true,
50
+ "vocab_size": 28895
51
+ }
checkpoint-4540/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2e8fa132790c87f4f9328ae76fc8ed6e003060b08437ea1cf9a553df4837e2f
3
+ size 1329789812
checkpoint-4540/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:437fb6a15b1f409dfad59217134683e7054fb987810a24662f3c22bc658bc2f2
3
+ size 2659810855
checkpoint-4540/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68fc9e549ec0f7be8188e169402333b571c00a5e954277de0dba77cc99d0135a
3
+ size 14244
checkpoint-4540/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
3
+ size 988
checkpoint-4540/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe112390f0814f80e25d53156f6d164a32467ecf2899f65b9348bf59b822d8de
3
+ size 1064
checkpoint-4540/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-4540/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4540/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "ignore_mismatched_sizes": true,
50
+ "mask_token": "[MASK]",
51
+ "max_len": 512,
52
+ "model_max_length": 512,
53
+ "never_split": null,
54
+ "pad_token": "[PAD]",
55
+ "padding": true,
56
+ "sep_token": "[SEP]",
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "ElectraTokenizer",
60
+ "truncation": true,
61
+ "unk_token": "[UNK]"
62
+ }
checkpoint-4540/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4540/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbf6ee382bdf4fc7a704ade5bbd728f43b83dd5b0a0779f6ffdaf51f9e4d6d2
3
+ size 5432
checkpoint-4540/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5675/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ElectraForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "embedding_size": 1024,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "id2label": {
12
+ "0": "O",
13
+ "1": "B-antineoplastic",
14
+ "2": "B-cancertype",
15
+ "3": "B-carcinogen",
16
+ "4": "B-negative",
17
+ "5": "I-antineoplastic",
18
+ "6": "I-cancertype",
19
+ "7": "I-carcinogen",
20
+ "8": "I-negative"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "label2id": {
25
+ "B-antineoplastic": 1,
26
+ "B-cancertype": 2,
27
+ "B-carcinogen": 3,
28
+ "B-negative": 4,
29
+ "I-antineoplastic": 5,
30
+ "I-cancertype": 6,
31
+ "I-carcinogen": 7,
32
+ "I-negative": 8,
33
+ "O": 0
34
+ },
35
+ "layer_norm_eps": 1e-12,
36
+ "max_position_embeddings": 512,
37
+ "model_type": "electra",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 24,
40
+ "pad_token_id": 0,
41
+ "position_embedding_type": "absolute",
42
+ "summary_activation": "gelu",
43
+ "summary_last_dropout": 0.1,
44
+ "summary_type": "first",
45
+ "summary_use_proj": true,
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.50.0",
48
+ "type_vocab_size": 2,
49
+ "use_cache": true,
50
+ "vocab_size": 28895
51
+ }
checkpoint-5675/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17489b4518b84cf7daf8ea4faf85d554e35a988b1322d56e011dda0e1e258b2
3
+ size 1329789812