Upload tokenizer
Browse files
- special_tokens_map.json +2 -2
- tokenizer.py +216 -0
- tokenizer_config.json +45 -6
- vocab.json +37 -1
special_tokens_map.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "bos_token": "/s",
   "eos_token": "s",
-  "
-  "
+  "pad_token": "pad",
+  "unk_token": "unk"
 }
tokenizer.py
ADDED
@@ -0,0 +1,216 @@
import re
import os
import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from transformers import PreTrainedTokenizer
from transformers.utils import logging
from huggingface_hub import hf_hub_download

logger = logging.get_logger(__name__)


def load_json(path, repo_id=None):
    """Load a JSON file, downloading it from the Hub first when `repo_id` is given."""
    if repo_id:
        path = hf_hub_download(repo_id, path)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def load_json_old(path: str) -> Union[Dict, List]:
    """
    Legacy loader: reads a JSON file located next to this module.

    Args:
        path (str): The path to the JSON file to be loaded.

    Returns:
        Union[Dict, List]: The parsed content of the JSON file, which can be a dictionary or a list.
    """
    full_path = Path(__file__).parent / path
    with open(full_path, "r", encoding="utf-8") as f:
        return json.load(f)


class STLTokenizer(PreTrainedTokenizer):
    """
    A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
    This tokenizer can load a vocabulary from a JSON file, tokenize text, convert tokens to IDs,
    and handle padding and special tokens.
    """

    def __init__(self, vocab_path: str = "vocab.json", unk_token: str = "unk", pad_token: str = "pad",
                 bos_token: str = "/s", eos_token: str = "s", model_max_length=512, **kwargs):
        """
        Initializes the STLTokenizer with a given vocabulary and special tokens.

        Args:
            vocab_path (str): The path to the JSON file containing the vocabulary.
            unk_token (str, optional): The token used for unknown words. Defaults to "unk".
            pad_token (str, optional): The token used for padding. Defaults to "pad".
            bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
            eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
            model_max_length (int, optional): Maximum sequence length. Defaults to 512.
        """
        # The vocabulary is always downloaded from the Hub repo below;
        # the `vocab_path` argument is currently unused.
        self.vocab = load_json("vocab.json", repo_id="saracandu/stldec_random_32")
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.model_max_length = model_max_length
        self.id_to_token = {v: k for k, v in self.vocab.items()}  # Reverse mapping

        super().__init__(
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            model_max_length=model_max_length,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        """
        Returns the size of the vocabulary.

        Returns:
            int: The number of tokens in the vocabulary.
        """
        return len(self.vocab)

    def prepad_sequence(self, sequence, space_token=' ', new_space_token='@', undo=False):
        """
        Replaces spaces in the input sequence with a dedicated space token (and back).

        Args:
            sequence (str): The input sequence.
            undo (bool): If True, turns the space token back into spaces. Defaults to False,
                which replaces spaces with the space token.

        Returns:
            str: The sequence with spaces or space tokens replaced.
        """
        if undo:
            return sequence.replace(new_space_token, space_token)
        else:
            return sequence.replace(space_token, new_space_token)

    def add_bos_eos(self, sequence: str) -> str:
        """
        Adds the BOS token at the beginning and the EOS token at the end of the sequence.

        Args:
            sequence (str): The input sequence.

        Returns:
            str: The sequence with BOS and EOS tokens.
        """
        return f'{self.bos_token} {sequence} {self.eos_token}'

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes the input text into a list of tokens.
        The method preprocesses the input text by replacing spaces with the space token, then
        greedily matches the longest substring found in the vocabulary at each position.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            List[str]: A list of tokens representing the tokenized text.
        """
        text = self.add_bos_eos(text)
        text = self.prepad_sequence(text)

        tokens = []
        i = 0
        while i < len(text):
            best_match = None
            for j in range(len(text), i, -1):  # Try matching substrings of decreasing length
                subtoken = text[i:j]
                if subtoken in self.vocab:
                    best_match = subtoken
                    break
            if best_match:
                tokens.append(best_match)
                i += len(best_match)
            else:
                tokens.append(self.unk_token)
                i += 1
        return tokens

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """
        Converts a list of tokens into a list of token IDs.

        Args:
            tokens (List[str]): A list of tokens to be converted into IDs.

        Returns:
            List[int]: A list of corresponding token IDs.
        """
        return [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]

    def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
        """
        Converts a list of token IDs into a list of tokens.

        Args:
            ids (List[int]): A list of token IDs to be converted into tokens.

        Returns:
            List[str]: A list of corresponding tokens.
        """
        return [self.id_to_token.get(i, self.unk_token) for i in ids]

    def encode(self, sequence: str) -> List[int]:
        """
        Encodes a string sequence into a list of token IDs.

        This method tokenizes the input sequence using the `tokenize` method,
        and then converts the resulting tokens into their corresponding token IDs
        using the `convert_tokens_to_ids` method.

        Args:
            sequence (str): The input sequence (text) to be encoded.

        Returns:
            List[int]: A list of token IDs corresponding to the input sequence.
        """
        splitted_sequence = self.tokenize(sequence)
        return self.convert_tokens_to_ids(splitted_sequence)

    def postpad_sequence(self, sequence, pad_token_id):
        """
        Pads the sequence with `pad_token_id` up to `model_max_length` (minus one position).
        """
        num_extra_elements = self.model_max_length - len(sequence) - 1
        if num_extra_elements > 0:
            sequence.extend([pad_token_id] * num_extra_elements)
        return sequence

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes a list of token IDs into a string of text.
        The method converts the IDs to tokens, joins them into a string,
        and restores the original spaces in place of the space token.

        Args:
            token_ids (List[int]): A list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        tokens = self.convert_ids_to_tokens(token_ids)
        decoded = "".join(tokens)
        return self.prepad_sequence(decoded, undo=True)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Saves the tokenizer's vocabulary to a file.
        Only needed when the vocabulary has to be rebuilt rather than shipped with the repo
        (not the case here; kept as a hook for future SentencePiece-based improvements).
        This method saves the vocabulary to a JSON file in the specified directory.

        Args:
            save_directory (str): The directory where the vocabulary file will be saved.
            filename_prefix (Optional[str]): An optional prefix for the filename.

        Returns:
            Tuple[str]: A tuple containing the path to the saved vocabulary file.
        """
        vocab_file = f"{save_directory}/{filename_prefix + '-' if filename_prefix else ''}vocab.json"
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2, ensure_ascii=False)
        return (vocab_file,)

    def get_vocab(self) -> dict:
        """
        Retrieves the vocabulary used by the tokenizer.

        Returns:
            dict: The vocabulary as a dictionary.
        """
        return self.vocab
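
A minimal usage sketch for the class above (not part of the upload), assuming `tokenizer.py` is importable from the working directory and that the hard-coded `saracandu/stldec_random_32` repo is reachable so `__init__` can fetch `vocab.json`; the STL formula is an illustrative input:

# Hedged sketch: exercise the greedy longest-match tokenizer defined above.
from tokenizer import STLTokenizer

tok = STLTokenizer()                    # downloads vocab.json from the Hub repo in __init__
formula = "always ( x_1 >= 0.5 )"       # illustrative STL formula
ids = tok.encode(formula)               # BOS/EOS added, spaces mapped to "@", then longest match
print(tok.convert_ids_to_tokens(ids))   # ['/s', '@', 'always', '@', '(', '@', 'x_', '1', ...]
print(tok.decode(ids))                  # "/s always ( x_1 >= 0.5 ) s" once '@' is mapped back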
tokenizer_config.json
CHANGED
@@ -1,11 +1,50 @@
 {
-  "
-
-
-
+  "added_tokens_decoder": {
+    "0": {
+      "content": "unk",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "pad",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "/s",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "s",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer.STLTokenizer",
+      null
+    ]
+  },
   "bos_token": "/s",
+  "clean_up_tokenization_spaces": false,
   "eos_token": "s",
-  "
+  "extra_special_tokens": {},
+  "model_max_length": 512,
   "pad_token": "pad",
-  "
+  "tokenizer_class": "STLTokenizer",
+  "unk_token": "unk"
 }
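
The `auto_map` entry is what ties this config to the `STLTokenizer` class in `tokenizer.py`. A loading sketch under the assumption that these files live in the `saracandu/stldec_random_32` repo referenced by `tokenizer.py`; `trust_remote_code=True` is required because the class is defined in repo code rather than inside transformers:

# Hedged sketch: resolve the custom class through auto_map.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "saracandu/stldec_random_32",  # repo id assumed from the hard-coded path in tokenizer.py
    trust_remote_code=True,        # lets transformers import tokenizer.STLTokenizer from the repo
)
print(type(tok).__name__)          # expected: STLTokenizer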
vocab.json
CHANGED
@@ -1 +1,37 @@
-{
+{
+  "unk": 0,
+  "pad": 1,
+  "/s": 2,
+  "s": 3,
+  "(": 4,
+  ")": 5,
+  "always": 6,
+  "eventually": 7,
+  "until": 8,
+  "and": 9,
+  "or": 10,
+  "not": 11,
+  ">=": 12,
+  "<=": 13,
+  ">": 14,
+  "<": 15,
+  "=": 16,
+  "x_": 17,
+  "[": 18,
+  "]": 19,
+  ",": 20,
+  "inf": 21,
+  "-": 22,
+  ".": 23,
+  "0": 24,
+  "1": 25,
+  "2": 26,
+  "3": 27,
+  "4": 28,
+  "5": 29,
+  "6": 30,
+  "7": 31,
+  "8": 32,
+  "9": 33,
+  "@": 34
+}
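
With this vocabulary, the greedy longest-match in `tokenize` splits an STL formula into temporal operators, comparison operators, interval brackets, and single digits, with `@` standing in for spaces. A small illustrative check (the formula and the local import are assumptions, reusing the sketch setup above):

# Hedged sketch: tokenize an interval-bounded formula against the vocabulary above.
from tokenizer import STLTokenizer

tok = STLTokenizer()
print(tok.tokenize("eventually [ 0 , inf ] ( x_2 <= -1.5 )"))
# expected, given the vocab above:
# ['/s', '@', 'eventually', '@', '[', '@', '0', '@', ',', '@', 'inf', '@', ']', '@',
#  '(', '@', 'x_', '2', '@', '<=', '@', '-', '1', '.', '5', '@', ')', '@', 's']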