saracandu committed on
Commit
c1d8d29
·
verified ·
1 Parent(s): 5335906

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.py +3 -1
  2. tokenizer_config.json +6 -0
tokenizer.py CHANGED
@@ -137,7 +137,9 @@ class STLTokenizer(PreTrainedTokenizer):
137
  Returns:
138
  List[int]: A list of corresponding token IDs.
139
  """
140
- return [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]
 
 
141
 
142
  def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
143
  """
 
137
  Returns:
138
  List[int]: A list of corresponding token IDs.
139
  """
140
+ unk_token_str = str(self.unk_token)
141
+ unk_token_id = self.vocab.get(unk_token_str)
142
+ return [self.vocab.get(token, unk_token_id) for token in tokens]
143
 
144
  def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
145
  """
tokenizer_config.json CHANGED
@@ -33,6 +33,12 @@
33
  "special": true
34
  }
35
  },
 
 
 
 
 
 
36
  "bos_token": "/s",
37
  "clean_up_tokenization_spaces": false,
38
  "eos_token": "s",
 
33
  "special": true
34
  }
35
  },
36
+ "auto_map": {
37
+ "AutoTokenizer": [
38
+ "tokenizer.STLTokenizer",
39
+ null
40
+ ]
41
+ },
42
  "bos_token": "/s",
43
  "clean_up_tokenization_spaces": false,
44
  "eos_token": "s",