davda54 committed
Commit a3cac50 · verified · 1 Parent(s): 6eebeac

FlashAttention support

Files changed (1)
  1. config.json +18 -29
config.json CHANGED
@@ -12,38 +12,27 @@
  "AutoModelForQuestionAnswering": "modeling_gptbert.GptBertForQuestionAnswering",
  "AutoModelForMultipleChoice": "modeling_gptbert.GptBertForMultipleChoice"
  },
- "attention_dropout": 0.0,
- "attention_output_dropout_p": 0.0,
- "attention_inter_norm_affine": false,
- "attention_inter_norm_eps": 1e-07,
- "attention_pre_norm_affine": false,
- "attention_pre_norm_eps": 1e-07,
- "attention_probabilities_dropout_p": 0.0,
- "classifier_post_norm_affine": false,
- "classifier_post_norm_eps": 1e-07,
- "classifier_pre_norm_affine": false,
- "classifier_pre_norm_eps": 1e-07,
- "d_qk": 64,
- "d_v": 64,
- "embedding_dropout_p": 0.1,
- "feed_forward_dropout_p": 0.0,
- "feed_forward_inter_norm_affine": false,
- "feed_forward_inter_norm_eps": 1e-07,
- "feed_forward_pre_norm_affine": false,
- "feed_forward_pre_norm_eps": 1e-07,
+ "unk_token_id": 0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 3,
+ "mask_token_id": 4,
  "hidden_size": 192,
  "intermediate_size": 512,
  "max_sequence_length": 16384,
- "num_attention_heads": 3,
- "num_kv_heads": 3,
  "num_layers": 16,
+ "attention_dropout": 0.0,
+ "hidden_dropout": 0.0,
+ "embedding_dropout": 0.1,
+ "classifier_dropout": 0.2,
+ "layer_norm_eps": 1e-07,
+ "query_key_head_size": 64,
+ "value_head_size": 64,
+ "num_attention_heads": 3,
  "rope_theta": 160000,
  "vocab_size": 51200,
- "word_norm_affine": true,
- "word_norm_eps": 1e-07,
- "short_long_ratio": 4,
- "window_length": 8192,
- "is_decoder": false,
- "not_flex": true,
- "hidden_dropout_prob": 0.2
- }
+ "local_global_ratio": 4,
+ "global_window_length": 8192,
+ "local_window_length": 256,
+ "deterministic_flash_attn": false
+ }
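
The new keys describe a mixed local/global sliding-window attention layout (local_window_length, global_window_length, local_global_ratio) alongside a deterministic_flash_attn toggle. As a rough, non-authoritative sketch of how such a layout could be expanded into a per-layer window schedule: the values below are copied from the updated config.json, but the "every local_global_ratio-th layer uses the global window" rule is an assumption for illustration, not taken from modeling_gptbert.py.

    # Illustrative sketch only: config values come from the updated config.json,
    # the layer schedule rule is an assumption, not documented behaviour.
    num_layers = 16
    local_global_ratio = 4
    local_window_length = 256
    global_window_length = 8192

    # Assumed rule: every local_global_ratio-th layer attends over the global
    # window, the remaining layers use the short local window.
    window_per_layer = [
        global_window_length if (layer + 1) % local_global_ratio == 0 else local_window_length
        for layer in range(num_layers)
    ]
    print(window_per_layer)
    # -> [256, 256, 256, 8192, 256, 256, 256, 8192, ...] under this assumed schedule

Under the same reading, deterministic_flash_attn: false would let the FlashAttention backward pass use its faster non-deterministic kernel, and setting it to true would trade throughput for bit-exact gradients; whether and how the flag is forwarded to the kernel is defined in modeling_gptbert.py, not here.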