# Dayhoff-170m-UR90 / tokenizers.py
import os
from typing import List, Optional, Union

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer

# Special characters used by the tokenizer.
MASK = "#"      # mask token
MSA_PAD = "!"   # padding token
UL_ALPHABET_PLUS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-*#@!/[]{}"  # full vocabulary: amino acids, gap, and control tokens
MSA_AAS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-"  # amino-acid characters plus the gap character
GAP = "-"       # alignment gap
START = "@"     # beginning-of-sequence token
STOP = "*"      # end-of-sequence token
SEP = "/"       # sequence separator
END_AL = "]"    # closes an "AL" (aligned) region
END_UL = "}"    # closes a "UL" (unaligned) region
START_AL = "["  # opens an "AL" (aligned) region
START_UL = "{"  # opens a "UL" (unaligned) region
class ProteinTokenizer(PreTrainedTokenizer):
def __init__(
self,
protein_alphabet: str = UL_ALPHABET_PLUS,
model_max_length: int = 2048,
pad_token=MSA_PAD,
mask_token=MASK,
all_aas=MSA_AAS,
gap_token=GAP,
bos_token=START,
eos_token=STOP,
sep_token=SEP,
**kwargs
):
"""Character tokenizer for Hugging Face transformers.
model_max_length (int): Model maximum sequence length.
"""
        # Accept either a string or an iterable of single characters.
        self.alphabet = list("".join(protein_alphabet))
        self.all_aas = list("".join(all_aas))
        # Token <-> id lookup tables; ids follow the order of ``protein_alphabet``.
        self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
        self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
        # Keep the gap character around for ``gap_token_id``; it is part of the
        # alphabet but not registered as a Hugging Face special token.
        self.gap_token = gap_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
super().__init__(
pad_token=pad_token,
mask_token=mask_token,
eos_token=eos_token,
bos_token=bos_token,
sep_token=sep_token,
model_max_length=model_max_length,
**kwargs
)
@property
def vocab_size(self):
return len(self.alphabet)
@property
def gap_token_id(self):
return self.convert_tokens_to_ids(self.gap_token)
def get_vocab(self):
return self.a_to_i
def _tokenize(self, text: str) -> List[str]:
return list(text)
def _convert_token_to_id(self, token) -> int:
return self.a_to_i[token]
def _convert_id_to_token(self, index) -> str:
return self.i_to_a[index]
def convert_tokens_to_string(self, tokens):
return "".join(tokens)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
result = token_ids_0
if token_ids_1 is not None:
raise NotImplementedError("This tokenizer does not support two sequences")
return result
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False,
) -> List[int]:
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True,
)
result = [0] * len(token_ids_0)
if token_ids_1 is not None:
raise NotImplementedError("This tokenizer does not support two sequences")
return result
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Identifies the type of token. 0 for the first sentence, 1 for the second sentence if it exists
"""
result = len(token_ids_0) * [0]
if token_ids_1 is not None:
raise NotImplementedError("This tokenizer does not support two sequences")
return result
def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
super().save_pretrained(save_directory, **kwargs)
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        # The vocabulary is derived from the alphabet string given at construction
        # time, so there is no separate vocabulary file to write.
        return ()
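
# Minimal usage sketch (illustrative, not part of the model code): builds the
# tokenizer with its defaults and round-trips a toy sequence. "MKTAYIAKQR" is
# an arbitrary example string, not a sequence from the training data.
if __name__ == "__main__":
    tokenizer = ProteinTokenizer()

    # One token id per character; the start/stop characters are part of the alphabet.
    sequence = START + "MKTAYIAKQR" + STOP
    ids = tokenizer(sequence, add_special_tokens=False)["input_ids"]
    print(ids)

    # Convert the ids back to characters and reassemble the original string.
    tokens = tokenizer.convert_ids_to_tokens(ids)
    print(tokenizer.convert_tokens_to_string(tokens))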