saracandu committed on
Commit 8348ea6 · verified · 1 Parent(s): 8241eca

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +2 -2
  2. tokenizer.py +216 -0
  3. tokenizer_config.json +45 -6
  4. vocab.json +37 -1
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
 {
   "bos_token": "/s",
   "eos_token": "s",
-  "unk_token": "unk",
-  "pad_token": "pad"
+  "pad_token": "pad",
+  "unk_token": "unk"
 }
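
The change above only reorders the keys; the tokens themselves are unchanged and still resolve to the first four IDs in vocab.json. A minimal consistency check (a sketch, assuming both JSON files from this commit are in the working directory):

import json

# Hedged sketch: verify the special tokens map onto the expected vocabulary IDs.
with open("special_tokens_map.json", encoding="utf-8") as f:
    special = json.load(f)
with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)

for name, token in special.items():
    print(f"{name}: {token!r} -> id {vocab[token]}")
# Expected: bos_token '/s' -> 2, eos_token 's' -> 3, pad_token 'pad' -> 1, unk_token 'unk' -> 0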
tokenizer.py ADDED
@@ -0,0 +1,216 @@
+ import json
+ import os
+ import re
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from huggingface_hub import hf_hub_download
+ from transformers import PreTrainedTokenizer
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ def load_json(path, repo_id=None):
+     """Load a JSON file from disk, downloading it from the Hub first when a repo_id is given."""
+     if repo_id:
+         path = hf_hub_download(repo_id, path)
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def load_json_old(path: str) -> Union[Dict, List]:
+     """
+     Load a JSON file from the given path.
+     Args:
+         path (str): The path to the JSON file to be loaded.
+
+     Returns:
+         Union[Dict, List]: The parsed content of the JSON file, which can be a dictionary or a list.
+     """
+     full_path = Path(__file__).parent / path
+     with open(full_path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ class STLTokenizer(PreTrainedTokenizer):
+     """
+     A custom tokenizer class that extends `PreTrainedTokenizer` to handle a specific vocabulary and tokenization process.
+     This tokenizer loads its vocabulary from a JSON file, tokenizes text by greedy longest match against that
+     vocabulary, converts tokens to IDs and back, and handles padding and special tokens.
+     """
+     def __init__(self, vocab_path: str = "vocab.json", unk_token: str = "unk", pad_token: str = "pad",
+                  bos_token: str = "/s", eos_token: str = "s", model_max_length=512, **kwargs):
+         """
+         Initializes the STLTokenizer with a given vocabulary and special tokens.
+         Args:
+             vocab_path (str): The path to the JSON file containing the vocabulary.
+             unk_token (str, optional): The token used for unknown words. Defaults to "unk".
+             pad_token (str, optional): The token used for padding. Defaults to "pad".
+             bos_token (str, optional): The token used for the beginning of a sequence. Defaults to "/s".
+             eos_token (str, optional): The token used for the end of a sequence. Defaults to "s".
+             model_max_length (int, optional): The maximum sequence length. Defaults to 512.
+         """
+         # The vocabulary is fetched from the Hub repository rather than read from `vocab_path`.
+         self.vocab = load_json("vocab.json", repo_id="saracandu/stldec_random_32")
+         self.unk_token = unk_token
+         self.pad_token = pad_token
+         self.bos_token = bos_token
+         self.eos_token = eos_token
+         self.model_max_length = model_max_length
+         self.id_to_token = {v: k for k, v in self.vocab.items()}  # Reverse mapping
+
+         super().__init__(
+             unk_token=unk_token,
+             pad_token=pad_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             model_max_length=model_max_length,
+             **kwargs
+         )
+
+     @property
+     def vocab_size(self) -> int:
+         """
+         Returns the size of the vocabulary.
+         Returns:
+             int: The number of tokens in the vocabulary.
+         """
+         return len(self.vocab)
+
+     def prepad_sequence(self, sequence, space_token=' ', new_space_token='@', undo=False):
+         """
+         Replaces spaces in the input sequence with a dedicated space token.
+         Args:
+             sequence (str): The input sequence.
+             space_token (str): The character treated as a space. Defaults to ' '.
+             new_space_token (str): The token that stands in for a space. Defaults to '@'.
+             undo (bool): If True, replaces the space token back with spaces. Defaults to False, which replaces the spaces.
+         Returns:
+             str: The preprocessed sequence with spaces or space tokens replaced.
+         """
+         if undo:
+             return sequence.replace(new_space_token, space_token)
+         else:
+             return sequence.replace(space_token, new_space_token)
+
+     def add_bos_eos(self, sequence: str) -> str:
+         """
+         Adds the BOS token at the beginning and the EOS token at the end of the sequence.
+         Args:
+             sequence (str): The input sequence.
+         Returns:
+             str: The sequence with BOS and EOS tokens added.
+         """
+         return f'{self.bos_token} {sequence} {self.eos_token}'
+
+     def tokenize(self, text: str) -> List[str]:
+         """
+         Tokenizes the input text into a list of tokens.
+         The method preprocesses the input text by replacing spaces with the space token and then tries to
+         find the longest possible match for each substring in the vocabulary.
+         Args:
+             text (str): The input text to be tokenized.
+         Returns:
+             List[str]: A list of tokens representing the tokenized text.
+         """
+         text = self.add_bos_eos(text)
+         text = self.prepad_sequence(text)
+
+         tokens = []
+         i = 0
+         while i < len(text):
+             best_match = None
+             for j in range(len(text), i, -1):  # Try matching substrings of decreasing length
+                 subtoken = text[i:j]
+                 if subtoken in self.vocab:
+                     best_match = subtoken
+                     break
+             if best_match:
+                 tokens.append(best_match)
+                 i += len(best_match)
+             else:
+                 tokens.append(self.unk_token)
+                 i += 1
+         return tokens
+
+     def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
+         """
+         Converts a list of tokens into a list of token IDs.
+         Args:
+             tokens (List[str]): A list of tokens to be converted into IDs.
+         Returns:
+             List[int]: A list of corresponding token IDs.
+         """
+         return [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]
+
+     def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
+         """
+         Converts a list of token IDs into a list of tokens.
+         Args:
+             ids (List[int]): A list of token IDs to be converted into tokens.
+         Returns:
+             List[str]: A list of corresponding tokens.
+         """
+         return [self.id_to_token.get(i, self.unk_token) for i in ids]
+
+     def encode(self, sequence: str) -> List[int]:
+         """
+         Encodes a string sequence into a list of token IDs.
+
+         This method tokenizes the input sequence using the `tokenize` method,
+         and then converts the resulting tokens into their corresponding token IDs
+         using the `convert_tokens_to_ids` method.
+
+         Args:
+             sequence (str): The input sequence (text) to be encoded.
+
+         Returns:
+             List[int]: A list of token IDs corresponding to the input sequence.
+         """
+         splitted_sequence = self.tokenize(sequence)
+         return self.convert_tokens_to_ids(splitted_sequence)
+
+     def postpad_sequence(self, sequence, pad_token_id):
+         """
+         Pads the sequence with `pad_token_id` up to `model_max_length - 1` elements.
+         """
+         num_extra_elements = self.model_max_length - len(sequence) - 1
+         if num_extra_elements > 0:
+             sequence.extend([pad_token_id] * num_extra_elements)
+         return sequence
+
+     def decode(self, token_ids: List[int]) -> str:
+         """
+         Decodes a list of token IDs into a string of text.
+         The method converts the IDs to tokens, joins them into a string,
+         and restores the original spaces that the space token replaced.
+         Args:
+             token_ids (List[int]): A list of token IDs to be decoded.
+         Returns:
+             str: The decoded string.
+         """
+         tokens = self.convert_ids_to_tokens(token_ids)
+         decoded = "".join(tokens)
+         return self.prepad_sequence(decoded, undo=True)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Saves the tokenizer's vocabulary to a file.
+         Useful only when the vocabulary has to be retrieved and is not given
+         (which is not the case here; kept for possible further improvements with sentencepiece).
+         This method saves the vocabulary to a JSON file in the specified directory.
+         Args:
+             save_directory (str): The directory where the vocabulary file will be saved.
+             filename_prefix (Optional[str]): An optional prefix for the filename.
+         Returns:
+             Tuple[str]: A tuple containing the path to the saved vocabulary file.
+         """
+         vocab_file = f"{save_directory}/{filename_prefix + '-' if filename_prefix else ''}vocab.json"
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             json.dump(self.vocab, f, indent=2, ensure_ascii=False)
+         return (vocab_file,)
+
+     def get_vocab(self) -> dict:
+         """
+         Retrieves the vocabulary used by the tokenizer.
+         Returns:
+             dict: The vocabulary as a dictionary.
+         """
+         return self.vocab
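
As a usage sketch (not part of the commit), the class can be instantiated directly once tokenizer.py is importable; the constructor pulls vocab.json from saracandu/stldec_random_32, and the STL formula below is an arbitrary example:

from tokenizer import STLTokenizer

tok = STLTokenizer()                      # downloads vocab.json from the Hub repo
formula = "always ( x_1 >= 0.5 )"         # hypothetical STL formula
ids = tok.encode(formula)                 # adds /s ... s, maps spaces to '@', then greedy longest-match
print(tok.convert_ids_to_tokens(ids))     # ['/s', '@', 'always', '@', '(', '@', 'x_', '1', ...]
print(tok.decode(ids))                    # '/s always ( x_1 >= 0.5 ) s'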
tokenizer_config.json CHANGED
@@ -1,11 +1,50 @@
 {
-  "name_or_path": "temporal_logic_tokenizer",
-  "special_tokens_map_file": "special_tokens_map.json",
-  "do_lower_case": false,
-  "model_max_length": 500,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "unk",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "pad",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "/s",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "s",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer.STLTokenizer",
+      null
+    ]
+  },
   "bos_token": "/s",
+  "clean_up_tokenization_spaces": false,
   "eos_token": "s",
-  "unk_token": "unk",
+  "extra_special_tokens": {},
+  "model_max_length": 512,
   "pad_token": "pad",
-  "added_tokens": []
+  "tokenizer_class": "STLTokenizer",
+  "unk_token": "unk"
 }
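
The new auto_map entry wires AutoTokenizer to the custom class, so the tokenizer should also be loadable straight from the Hub with remote code enabled; a minimal sketch, assuming the repo id hard-coded in tokenizer.py:

from transformers import AutoTokenizer

# trust_remote_code=True is required because tokenizer_class/auto_map point at custom code (tokenizer.STLTokenizer)
tok = AutoTokenizer.from_pretrained("saracandu/stldec_random_32", trust_remote_code=True)
print(tok.model_max_length)                   # 512, per the updated config
print(tok.encode("eventually ( x_2 <= 1 )"))  # arbitrary example formula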
vocab.json CHANGED
@@ -1 +1,37 @@
-{"unk": 0, "pad": 1, "/s": 2, "s": 3, "(": 4, ")": 5, "always": 6, "eventually": 7, "until": 8, "and": 9, "or": 10, "not": 11, ">=": 12, "<=": 13, ">": 14, "<": 15, "=": 16, "x_": 17, "[": 18, "]": 19, ",": 20, "inf": 21, "-": 22, ".": 23, "0": 24, "1": 25, "2": 26, "3": 27, "4": 28, "5": 29, "6": 30, "7": 31, "8": 32, "9": 33, "@": 34}
+{
+  "unk": 0,
+  "pad": 1,
+  "/s": 2,
+  "s": 3,
+  "(": 4,
+  ")": 5,
+  "always": 6,
+  "eventually": 7,
+  "until": 8,
+  "and": 9,
+  "or": 10,
+  "not": 11,
+  ">=": 12,
+  "<=": 13,
+  ">": 14,
+  "<": 15,
+  "=": 16,
+  "x_": 17,
+  "[": 18,
+  "]": 19,
+  ",": 20,
+  "inf": 21,
+  "-": 22,
+  ".": 23,
+  "0": 24,
+  "1": 25,
+  "2": 26,
+  "3": 27,
+  "4": 28,
+  "5": 29,
+  "6": 30,
+  "7": 31,
+  "8": 32,
+  "9": 33,
+  "@": 34
+}
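
Since the vocabulary mixes multi-character tokens ("always", "eventually", ">=", "x_", "inf", ...) with single characters, tokenizer.py resolves overlaps by greedy longest-match. A self-contained sketch of that matching over this vocabulary, skipping the BOS/EOS and '@' preprocessing and using an arbitrary formula fragment:

import json

with open("vocab.json", encoding="utf-8") as f:   # the file shown above
    vocab = json.load(f)

def greedy_tokenize(text):
    # Mirrors STLTokenizer.tokenize: the longest vocabulary entry wins at each position.
    tokens, i = [], 0
    while i < len(text):
        match = next((text[i:j] for j in range(len(text), i, -1) if text[i:j] in vocab), None)
        if match is None:
            tokens.append("unk")
            i += 1
        else:
            tokens.append(match)
            i += len(match)
    return tokens

print(greedy_tokenize("x_12>=-0.5"))
# ['x_', '1', '2', '>=', '-', '0', '.', '5']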