sunbaby committed
Commit 0ae52ab · verified · 1 Parent(s): 82fea91

Upload tokenizer

Files changed (3):
1. special_tokens_map.json +2 -11
2. tokenizer.json +0 -72
3. tokenizer_config.json +5 -76
special_tokens_map.json CHANGED
@@ -1,13 +1,4 @@
 {
-  "additional_special_tokens": [
-    "[INST]",
-    " [/INST]",
-    "[SYS]",
-    "[/SYS]",
-    "[ASST]",
-    "[/ASST]",
-    "[end_of_turn]"
-  ],
   "bos_token": {
     "content": "<|begin_of_text|>",
     "lstrip": false,
@@ -16,14 +7,14 @@
     "single_word": false
   },
   "eos_token": {
-    "content": "[/ASST]",
+    "content": "<|eot_id|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<|pad_of_token|>",
+    "content": "<|eot_id|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -2306,78 +2306,6 @@
       "rstrip": false,
       "normalized": false,
       "special": true
-    },
-    {
-      "id": 128256,
-      "content": "<|pad_of_token|>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128257,
-      "content": "[INST]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128258,
-      "content": " [/INST]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128259,
-      "content": "[SYS]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128260,
-      "content": "[/SYS]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128261,
-      "content": "[ASST]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128262,
-      "content": "[/ASST]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 128263,
-      "content": "[end_of_turn]",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
     }
   ],
   "normalizer": null,
tokenizer_config.json CHANGED
@@ -2047,90 +2047,19 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "128256": {
-      "content": "<|pad_of_token|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128257": {
-      "content": "[INST]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128258": {
-      "content": " [/INST]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128259": {
-      "content": "[SYS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128260": {
-      "content": "[/SYS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128261": {
-      "content": "[ASST]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128262": {
-      "content": "[/ASST]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "128263": {
-      "content": "[end_of_turn]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
-  "additional_special_tokens": [
-    "[INST]",
-    " [/INST]",
-    "[SYS]",
-    "[/SYS]",
-    "[ASST]",
-    "[/ASST]",
-    "[end_of_turn]"
-  ],
   "bos_token": "<|begin_of_text|>",
-  "chat_template": "{{- bos_token }} {%- for message in messages %} {%- if message['from'] == 'human' %} {{- '[INST] ' + message['value'].strip() + ' [/INST]' }} {%- elif message['from'] == 'system' %} {{- '[SYS]' + message['value'].strip() + '[/SYS]' }} {%- elif message['from'] == 'gpt' %} {{- '[ASST] ' + message['value'] + ' [/ASST]' }} {%- if message.last and add_generate_prompt %} {{- '[ASST] '}} {%- endif %} {%- endif %} {%- endfor %}",
+  "chat_template": "{{ '<|begin_of_text|>' }}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "[/ASST]",
+  "eos_token": "<|eot_id|>",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|pad_of_token|>",
+  "pad_token": "<|eot_id|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
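The replacement template is the standard Llama-3 header format, keyed on role/content rather than the ShareGPT-style from/value keys the old template read (the old one also referenced a non-standard add_generate_prompt flag). It appends the assistant header after every user turn itself, so no separate generation prompt is needed. A sketch of it in use, again with a placeholder repo id:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("sunbaby/some-model")  # placeholder id

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]
    print(tok.apply_chat_template(messages, tokenize=False))
    # <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    #
    # You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
    #
    # Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>
    #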