import os
from typing import List, Optional, Union

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer

MASK = "#"
MSA_PAD = "!"
UL_ALPHABET_PLUS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-*#@!/[]{}"
MSA_AAS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-"
GAP = "-"
START = "@"
STOP = "*"
SEP = "/"
END_AL = "]"
END_UL = "}"
START_AL = "["
START_UL = "{"


class ProteinTokenizer(PreTrainedTokenizer):
    def __init__(
        self,
        protein_alphabet: str = UL_ALPHABET_PLUS,
        model_max_length: int = 2048,
        pad_token=MSA_PAD,
        mask_token=MASK,
        all_aas=MSA_AAS,
        gap_token=GAP,
        bos_token=START,
        eos_token=STOP,
        sep_token=SEP,
        **kwargs
    ):
        """Character-level tokenizer for Hugging Face transformers.

        Args:
            protein_alphabet (str): Full vocabulary; each character is one token.
            model_max_length (int): Maximum sequence length the model accepts.
            all_aas (str): Amino-acid characters plus the gap symbol.
            pad_token, mask_token, gap_token, bos_token, eos_token, sep_token:
                Characters used as the corresponding special tokens.
        """
        # Each character in the alphabet is its own token; build both lookup maps.
        self.alphabet = list("".join(protein_alphabet))
        self.all_aas = list("".join(all_aas))
        self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
        self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
        self.gap_token = gap_token

        # Wrap string special tokens as AddedToken so surrounding whitespace is preserved.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
        mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token

        super().__init__(
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            bos_token=bos_token,
            sep_token=sep_token,
            model_max_length=model_max_length,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        return len(self.alphabet)

    @property
    def gap_token_id(self) -> int:
        return self.convert_tokens_to_ids(self.gap_token)

    def get_vocab(self):
        return self.a_to_i

    def _tokenize(self, text: str) -> List[str]:
        # One token per character.
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self.a_to_i[token]

    def _convert_id_to_token(self, index: int) -> str:
        return self.i_to_a[index]

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # No special tokens are added automatically; the sequence is returned unchanged.
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")
        return token_ids_0

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")
        return [0] * len(token_ids_0)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return token type ids: 0 for the first sequence, 1 for the second if it exists."""
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")
        return len(token_ids_0) * [0]

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        super().save_pretrained(save_directory, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        # The vocabulary is defined entirely in code, so no vocabulary file is written.
        return ()
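

# Minimal usage sketch: encodes and decodes an arbitrary example sequence to
# show the character-level round trip. Assumes `transformers` is installed;
# the sequence "ACDEFGHIK" and the printed fields are illustrative only.
if __name__ == "__main__":
    tokenizer = ProteinTokenizer()

    # Each character maps to one token id from the alphabet.
    encoded = tokenizer("ACDEFGHIK")
    print(encoded["input_ids"])

    # Decoding joins the per-character tokens back into the original string.
    print(tokenizer.decode(encoded["input_ids"]))

    # The alignment gap character "-" has its own id.
    print(tokenizer.gap_token_id)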