gbyuvd commited on
Commit
1c27d17
·
verified ·
1 Parent(s): 2c5df2a

Upload HF wrapper and smitok_core without tails

Browse files
Files changed (2) hide show
  1. FastChemTokenizerHF.py +769 -0
  2. smitok_core/vocab.json +1137 -0
FastChemTokenizerHF.py ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import os
4
+ from typing import List, Union, Optional, Tuple, Dict, Any
5
+ from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
6
+ from transformers.utils import PaddingStrategy, TensorType
7
+ from functools import lru_cache
8
+
9
+
10
class TrieNode:
    """One node of the character trie used for longest-match tokenization.

    Attributes:
        children: mapping of next character -> child TrieNode.
        token_id: vocabulary id if the root-to-here path spells a complete
            token, otherwise None (interior node only).
    """
    __slots__ = ['children', 'token_id']

    def __init__(self):
        # Starts with no children; becomes terminal only when token_id is set.
        self.children = dict()
        self.token_id = None
15
+
16
+
17
+ class FastChemTokenizer(PreTrainedTokenizerBase):
18
+ """
19
+ Fully HuggingFace API compatible tokenizer for chemical representations.
20
+ """
21
+
22
+ vocab_files_names = {"vocab_file": "vocab.json"}
23
+
24
    def __init__(
        self,
        token_to_id=None,
        vocab_file=None,
        model_max_length=512,
        padding_side="right",
        truncation_side="right",
        chat_template=None,
        **kwargs
    ):
        """Build the tokenizer from an in-memory vocab or a vocab.json file.

        Args:
            token_to_id: mapping of token string -> integer id. If
                `vocab_file` is also given, the file contents overwrite this.
            vocab_file: path to a JSON file holding the token -> id mapping.
            model_max_length: maximum sequence length reported to HF.
            padding_side: "right" or "left".
            truncation_side: "right" or "left".
            chat_template: optional HF chat template, forwarded to the base.

        Raises:
            ValueError: if neither `token_to_id` nor `vocab_file` is given.
            KeyError: if any required special token is missing from the vocab.
        """
        # Handle vocab loading
        if token_to_id is None and vocab_file is None:
            raise ValueError("Either token_to_id or vocab_file must be provided")

        if vocab_file is not None:
            with open(vocab_file, "r", encoding="utf-8") as f:
                token_to_id = json.load(f)
            # JSON round-trips keys as strings; coerce ids back to int.
            token_to_id = {str(k): int(v) for k, v in token_to_id.items()}

        self.token_to_id = token_to_id
        self.id_to_token = {v: k for k, v in token_to_id.items()}

        # Precompute max token length for possible use & clarity
        self.max_token_len = max(len(t) for t in token_to_id.keys()) if token_to_id else 0

        # Build trie for fast longest-match lookup
        self.trie_root = self._build_trie(token_to_id)

        # Validate required special tokens
        required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
        for tok in required_special_tokens:
            if tok not in token_to_id:
                raise KeyError(f"Required special token '{tok}' not found in vocab.")

        # ✅ Assign special token IDs explicitly
        # NOTE(review): ids are set before super().__init__; presumably so
        # anything the base-class setup calls can already use them — confirm.
        self.bos_token_id = token_to_id["<s>"]
        self.eos_token_id = token_to_id["</s>"]
        self.pad_token_id = token_to_id["<pad>"]
        self.unk_token_id = token_to_id["<unk>"]
        self.mask_token_id = token_to_id["<mask>"]

        # Special tokens
        bos_token = "<s>"
        eos_token = "</s>"
        pad_token = "<pad>"
        unk_token = "<unk>"
        mask_token = "<mask>"

        # Initialize parent class with all required parameters
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=None,
            pad_token=pad_token,
            cls_token=None,
            mask_token=mask_token,
            additional_special_tokens=[],
            model_max_length=model_max_length,
            padding_side=padding_side,
            truncation_side=truncation_side,
            chat_template=chat_template,
            **kwargs,
        )
88
+
89
+ def _build_trie(self, token_to_id):
90
+ root = TrieNode()
91
+ for token, tid in token_to_id.items():
92
+ node = root
93
+ for char in token:
94
+ if char not in node.children:
95
+ node.children[char] = TrieNode()
96
+ node = node.children[char]
97
+ node.token_id = tid
98
+ return root
99
+
100
+ @property
101
+ def vocab_size(self):
102
+ return len(self.token_to_id)
103
+
104
+ def __len__(self):
105
+ return len(self.token_to_id)
106
+
107
+ def get_vocab(self) -> Dict[str, int]:
108
+ return self.token_to_id.copy()
109
+
110
+ @lru_cache(maxsize=10000)
111
+ def _cached_encode_str(self, s: str) -> Tuple[int, ...]:
112
+ return tuple(self._encode_core(s))
113
+
114
+ def _encode_core(self, text: str) -> List[int]:
115
+ """Core encoding logic using Trie — no caching."""
116
+ tokens = text
117
+ result_ids = []
118
+ i = 0
119
+ n = len(tokens)
120
+
121
+ while i < n:
122
+ node = self.trie_root
123
+ j = i
124
+ last_match_id = None
125
+ last_match_end = i
126
+
127
+ while j < n and tokens[j] in node.children:
128
+ node = node.children[tokens[j]]
129
+ j += 1
130
+ if node.token_id is not None:
131
+ last_match_id = node.token_id
132
+ last_match_end = j
133
+
134
+ if last_match_id is not None:
135
+ result_ids.append(last_match_id)
136
+ i = last_match_end
137
+ else:
138
+ tok = tokens[i]
139
+ result_ids.append(self.token_to_id.get(tok, self.unk_token_id))
140
+ i += 1
141
+
142
+ return result_ids
143
+
144
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
145
+ token_ids = self._encode_core(text.strip())
146
+ return [self.id_to_token[tid] for tid in token_ids]
147
+
148
+ def _convert_token_to_id(self, token: str) -> int:
149
+ return self.token_to_id.get(token, self.unk_token_id)
150
+
151
+ def _convert_id_to_token(self, index: int) -> str:
152
+ return self.id_to_token.get(index, self.unk_token)
153
+
154
+ # ✅ Public methods
155
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
156
+ if isinstance(tokens, str):
157
+ return self._convert_token_to_id(tokens)
158
+ return [self._convert_token_to_id(tok) for tok in tokens]
159
+
160
+ def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
161
+ if isinstance(ids, int):
162
+ return self._convert_id_to_token(ids)
163
+ return [self._convert_id_to_token(i) for i in ids]
164
+
165
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
166
+ """SMILES-style decoding: no spaces between tokens."""
167
+ return "".join(tokens)
168
+
169
+ def encode(
170
+ self,
171
+ text: str,
172
+ text_pair: Optional[str] = None,
173
+ add_special_tokens: bool = True,
174
+ padding: bool = False,
175
+ truncation: bool = False,
176
+ max_length: Optional[int] = None,
177
+ return_tensors: Optional[str] = None,
178
+ ) -> List[int]:
179
+ encoded = self.encode_plus(
180
+ text=text,
181
+ text_pair=text_pair,
182
+ add_special_tokens=add_special_tokens,
183
+ padding=padding,
184
+ truncation=truncation,
185
+ max_length=max_length,
186
+ return_tensors=return_tensors,
187
+ )
188
+
189
+ input_ids = encoded["input_ids"]
190
+ if isinstance(input_ids, torch.Tensor):
191
+ if input_ids.dim() > 1:
192
+ input_ids = input_ids.squeeze(0)
193
+ input_ids = input_ids.tolist()
194
+
195
+ return input_ids
196
+
197
+ def decode(
198
+ self,
199
+ token_ids: Union[List[int], torch.Tensor],
200
+ skip_special_tokens: bool = False,
201
+ clean_up_tokenization_spaces: bool = None,
202
+ **kwargs
203
+ ) -> str:
204
+ if isinstance(token_ids, torch.Tensor):
205
+ token_ids = token_ids.tolist()
206
+
207
+ if skip_special_tokens:
208
+ special_ids = {
209
+ self.bos_token_id,
210
+ self.eos_token_id,
211
+ self.pad_token_id,
212
+ self.mask_token_id,
213
+ }
214
+ else:
215
+ special_ids = set()
216
+
217
+ tokens = []
218
+ for tid in token_ids:
219
+ if tid in special_ids:
220
+ continue
221
+ token = self.id_to_token.get(tid, self.unk_token)
222
+ tokens.append(token)
223
+
224
+ return "".join(tokens)
225
+
226
+ def batch_decode(
227
+ self,
228
+ sequences: Union[List[List[int]], torch.Tensor],
229
+ skip_special_tokens: bool = False,
230
+ clean_up_tokenization_spaces: bool = None,
231
+ **kwargs
232
+ ) -> List[str]:
233
+ """Batch decode sequences."""
234
+ if isinstance(sequences, torch.Tensor):
235
+ sequences = sequences.tolist()
236
+
237
+ return [
238
+ self.decode(
239
+ seq,
240
+ skip_special_tokens=skip_special_tokens,
241
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
242
+ **kwargs
243
+ )
244
+ for seq in sequences
245
+ ]
246
+
247
+ def decode_with_trace(self, token_ids: List[int]) -> None:
248
+ print(f"\n🔍 Decoding {len(token_ids)} tokens:")
249
+ for i, tid in enumerate(token_ids):
250
+ token = self.id_to_token.get(tid, self.unk_token)
251
+ print(f" [{i:03d}] ID={tid:5d} → '{token}'")
252
+
253
+ def __call__(
254
+ self,
255
+ text: Union[str, List[str]],
256
+ text_pair: Optional[Union[str, List[str]]] = None,
257
+ add_special_tokens: bool = True,
258
+ padding: Union[bool, str, PaddingStrategy] = False,
259
+ truncation: Union[bool, str] = False,
260
+ max_length: Optional[int] = None,
261
+ stride: int = 0,
262
+ is_split_into_words: bool = False,
263
+ pad_to_multiple_of: Optional[int] = None,
264
+ return_tensors: Optional[Union[str, TensorType]] = None,
265
+ return_token_type_ids: Optional[bool] = None,
266
+ return_attention_mask: Optional[bool] = None,
267
+ return_overflowing_tokens: bool = False,
268
+ return_special_tokens_mask: bool = False,
269
+ return_offsets_mapping: bool = False,
270
+ return_length: bool = False,
271
+ verbose: bool = True,
272
+ **kwargs
273
+ ) -> BatchEncoding:
274
+ """
275
+ Main callable method that handles both single and batch inputs.
276
+ """
277
+ # Handle defaults
278
+ if return_token_type_ids is None:
279
+ return_token_type_ids = True
280
+ if return_attention_mask is None:
281
+ return_attention_mask = True
282
+
283
+ if isinstance(text, list):
284
+ if text_pair is not None:
285
+ batch = [(t, p) for t, p in zip(text, text_pair)]
286
+ else:
287
+ batch = text
288
+ return self.batch_encode_plus(
289
+ batch,
290
+ add_special_tokens=add_special_tokens,
291
+ padding=padding,
292
+ truncation=truncation,
293
+ max_length=max_length,
294
+ stride=stride,
295
+ is_split_into_words=is_split_into_words,
296
+ pad_to_multiple_of=pad_to_multiple_of,
297
+ return_tensors=return_tensors,
298
+ return_token_type_ids=return_token_type_ids,
299
+ return_attention_mask=return_attention_mask,
300
+ return_overflowing_tokens=return_overflowing_tokens,
301
+ return_special_tokens_mask=return_special_tokens_mask,
302
+ return_offsets_mapping=return_offsets_mapping,
303
+ return_length=return_length,
304
+ verbose=verbose,
305
+ **kwargs
306
+ )
307
+ else:
308
+ return self.encode_plus(
309
+ text=text,
310
+ text_pair=text_pair,
311
+ add_special_tokens=add_special_tokens,
312
+ padding=padding,
313
+ truncation=truncation,
314
+ max_length=max_length,
315
+ stride=stride,
316
+ is_split_into_words=is_split_into_words,
317
+ pad_to_multiple_of=pad_to_multiple_of,
318
+ return_tensors=return_tensors,
319
+ return_token_type_ids=return_token_type_ids,
320
+ return_attention_mask=return_attention_mask,
321
+ return_overflowing_tokens=return_overflowing_tokens,
322
+ return_special_tokens_mask=return_special_tokens_mask,
323
+ return_offsets_mapping=return_offsets_mapping,
324
+ return_length=return_length,
325
+ verbose=verbose,
326
+ **kwargs
327
+ )
328
+
329
    def encode_plus(
        self,
        text: str,
        text_pair: Optional[str] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = True,
        return_attention_mask: Optional[bool] = True,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """Encode one text (and optional pair) into a BatchEncoding.

        Layout with special tokens: <s> a </s> for a single text and
        <s> a </s> b </s> for a pair (type ids 0 for the first segment,
        1 for the second). Truncation is a plain tail cut to `max_length`;
        padding here only supports True/"max_length" (pad-to-longest is a
        batch-level concept handled in batch_encode_plus).

        Note: stride, is_split_into_words, pad_to_multiple_of,
        return_overflowing_tokens and return_offsets_mapping are accepted
        for API compatibility but not acted on in this implementation.
        """
        if max_length is None:
            max_length = self.model_max_length

        # Cached trie encoding; returns a tuple, so copy into a list.
        ids_a = list(self._cached_encode_str(text.strip()))

        if text_pair is not None:
            ids_b = list(self._cached_encode_str(text_pair.strip()))
        else:
            ids_b = None

        input_ids = []
        token_type_ids = []

        if add_special_tokens:
            input_ids.append(self.bos_token_id)
            token_type_ids.append(0)
            if ids_b is not None:
                # First segment: <s> a </s>, all type 0.
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)

                # Second segment: b </s>, all type 1.
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(1)
            else:
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)
        else:
            # No specials: raw ids, pair (if any) appended with type 1.
            input_ids = ids_a.copy()
            token_type_ids = [0] * len(input_ids)
            if ids_b is not None:
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))

        # Handle truncation
        # NOTE(review): a tail cut can drop the trailing </s>; presumably
        # acceptable for this model — confirm.
        if truncation and len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            token_type_ids = token_type_ids[:max_length]

        # Handle padding
        # NOTE(review): `padding == True` also matches padding == 1;
        # "longest" is deliberately not handled for a single sequence.
        if padding == True or padding == "max_length":
            pad_len = max_length - len(input_ids)
            if pad_len > 0:
                if self.padding_side == "right":
                    input_ids.extend([self.pad_token_id] * pad_len)
                    token_type_ids.extend([0] * pad_len)
                else:
                    input_ids = [self.pad_token_id] * pad_len + input_ids
                    token_type_ids = [0] * pad_len + token_type_ids

        # Mask is derived from the pad id, so any legitimate occurrence of
        # the pad token in content would also be masked out.
        attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]

        encoded_dict = {
            "input_ids": input_ids,
        }

        if return_attention_mask:
            encoded_dict["attention_mask"] = attention_mask

        if return_token_type_ids:
            encoded_dict["token_type_ids"] = token_type_ids

        if return_special_tokens_mask:
            special_tokens_mask = [
                1 if tid in {self.bos_token_id, self.eos_token_id, self.pad_token_id, self.mask_token_id} else 0
                for tid in input_ids
            ]
            encoded_dict["special_tokens_mask"] = special_tokens_mask

        if return_length:
            # Length excludes padding.
            encoded_dict["length"] = len([tid for tid in input_ids if tid != self.pad_token_id])

        if return_tensors == "pt":
            output = {}
            for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)
                if tensor.ndim == 1:
                    # Add a batch dimension: (seq_len,) -> (1, seq_len).
                    tensor = tensor.unsqueeze(0)
                output[k] = tensor
        else:
            output = encoded_dict

        return BatchEncoding(output, tensor_type=return_tensors)
437
+
438
+ def batch_encode_plus(
439
+ self,
440
+ batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
441
+ add_special_tokens: bool = True,
442
+ padding: Union[bool, str, PaddingStrategy] = False,
443
+ truncation: Union[bool, str] = False,
444
+ max_length: Optional[int] = None,
445
+ stride: int = 0,
446
+ is_split_into_words: bool = False,
447
+ pad_to_multiple_of: Optional[int] = None,
448
+ return_tensors: Optional[Union[str, TensorType]] = None,
449
+ return_token_type_ids: Optional[bool] = True,
450
+ return_attention_mask: Optional[bool] = True,
451
+ return_overflowing_tokens: bool = False,
452
+ return_special_tokens_mask: bool = False,
453
+ return_offsets_mapping: bool = False,
454
+ return_length: bool = False,
455
+ verbose: bool = True,
456
+ **kwargs
457
+ ) -> BatchEncoding:
458
+ all_input_ids = []
459
+ all_attention_masks = []
460
+ all_token_type_ids = []
461
+ all_special_tokens_masks = []
462
+ all_lengths = []
463
+
464
+ for item in batch_text_or_text_pairs:
465
+ if isinstance(item, tuple):
466
+ text, text_pair = item
467
+ else:
468
+ text, text_pair = item, None
469
+
470
+ encoded = self.encode_plus(
471
+ text=text,
472
+ text_pair=text_pair,
473
+ add_special_tokens=add_special_tokens,
474
+ padding=False, # We'll handle batch padding later
475
+ truncation=truncation,
476
+ max_length=max_length,
477
+ stride=stride,
478
+ is_split_into_words=is_split_into_words,
479
+ pad_to_multiple_of=pad_to_multiple_of,
480
+ return_tensors=None, # Don't convert to tensors yet
481
+ return_token_type_ids=return_token_type_ids,
482
+ return_attention_mask=return_attention_mask,
483
+ return_overflowing_tokens=return_overflowing_tokens,
484
+ return_special_tokens_mask=return_special_tokens_mask,
485
+ return_offsets_mapping=return_offsets_mapping,
486
+ return_length=return_length,
487
+ verbose=verbose,
488
+ **kwargs
489
+ )
490
+
491
+ all_input_ids.append(encoded["input_ids"])
492
+ if "attention_mask" in encoded:
493
+ all_attention_masks.append(encoded["attention_mask"])
494
+ if "token_type_ids" in encoded:
495
+ all_token_type_ids.append(encoded["token_type_ids"])
496
+ if "special_tokens_mask" in encoded:
497
+ all_special_tokens_masks.append(encoded["special_tokens_mask"])
498
+ if "length" in encoded:
499
+ all_lengths.append(encoded["length"])
500
+
501
+ batched = {
502
+ "input_ids": all_input_ids,
503
+ }
504
+
505
+ if all_attention_masks:
506
+ batched["attention_mask"] = all_attention_masks
507
+ if all_token_type_ids:
508
+ batched["token_type_ids"] = all_token_type_ids
509
+ if all_special_tokens_masks:
510
+ batched["special_tokens_mask"] = all_special_tokens_masks
511
+ if all_lengths:
512
+ batched["length"] = all_lengths
513
+
514
+ # Handle batch padding
515
+ if padding == True or padding == "longest":
516
+ max_len = max(len(ids) for ids in all_input_ids)
517
+ for key in batched:
518
+ if key in ["input_ids", "attention_mask", "token_type_ids", "special_tokens_mask"]:
519
+ padded_seqs = []
520
+ for seq in batched[key]:
521
+ pad_len = max_len - len(seq)
522
+ if pad_len > 0:
523
+ if key == "input_ids":
524
+ padding_value = self.pad_token_id
525
+ else:
526
+ padding_value = 0
527
+
528
+ if self.padding_side == "right":
529
+ padded_seq = seq + [padding_value] * pad_len
530
+ else:
531
+ padded_seq = [padding_value] * pad_len + seq
532
+ else:
533
+ padded_seq = seq
534
+ padded_seqs.append(padded_seq)
535
+ batched[key] = padded_seqs
536
+
537
+ if return_tensors == "pt":
538
+ def to_tensor_list(lst):
539
+ return [torch.tensor(item, dtype=torch.long) for item in lst]
540
+
541
+ for key in ["input_ids", "attention_mask", "token_type_ids", "special_tokens_mask"]:
542
+ if key in batched:
543
+ batched[key] = torch.nn.utils.rnn.pad_sequence(
544
+ to_tensor_list(batched[key]),
545
+ batch_first=True,
546
+ padding_value=self.pad_token_id if key == "input_ids" else 0
547
+ )
548
+
549
+ # Handle non-sequence data
550
+ if "length" in batched:
551
+ batched["length"] = torch.tensor(batched["length"], dtype=torch.long)
552
+
553
+ return BatchEncoding(batched, tensor_type=return_tensors)
554
+
555
+ def pad(
556
+ self,
557
+ encoded_inputs,
558
+ padding: Union[bool, str, PaddingStrategy] = True,
559
+ max_length: Optional[int] = None,
560
+ pad_to_multiple_of: Optional[int] = None,
561
+ return_attention_mask: Optional[bool] = None,
562
+ return_tensors: Optional[Union[str, TensorType]] = None,
563
+ verbose: bool = True,
564
+ ) -> BatchEncoding:
565
+ """Pad encoded inputs."""
566
+ # This is a simplified version - full implementation would be more complex
567
+ return encoded_inputs
568
+
569
+ # Save/Load methods
570
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
571
+ """Save vocabulary to files."""
572
+ if not os.path.isdir(save_directory):
573
+ os.makedirs(save_directory)
574
+
575
+ vocab_file = os.path.join(
576
+ save_directory,
577
+ (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
578
+ )
579
+
580
+ with open(vocab_file, "w", encoding="utf-8") as f:
581
+ json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
582
+
583
+ return (vocab_file,)
584
+
585
+ def save_pretrained(
586
+ self,
587
+ save_directory: Union[str, os.PathLike],
588
+ legacy_format: bool = True,
589
+ filename_prefix: Optional[str] = None,
590
+ push_to_hub: bool = False,
591
+ **kwargs
592
+ ):
593
+ """Save tokenizer to directory."""
594
+ if not os.path.exists(save_directory):
595
+ os.makedirs(save_directory)
596
+
597
+ # Save vocabulary
598
+ vocab_files = self.save_vocabulary(save_directory, filename_prefix)
599
+
600
+ # Save tokenizer config
601
+ tokenizer_config = {
602
+ "tokenizer_class": self.__class__.__name__,
603
+ "model_max_length": self.model_max_length,
604
+ "padding_side": self.padding_side,
605
+ "truncation_side": self.truncation_side,
606
+ "special_tokens": {
607
+ "bos_token": self.bos_token,
608
+ "eos_token": self.eos_token,
609
+ "pad_token": self.pad_token,
610
+ "unk_token": self.unk_token,
611
+ "mask_token": self.mask_token,
612
+ }
613
+ }
614
+
615
+ config_file = os.path.join(save_directory, "tokenizer_config.json")
616
+ with open(config_file, "w", encoding="utf-8") as f:
617
+ json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
618
+
619
+ print(f"✅ Tokenizer saved to: {save_directory}")
620
+
621
+ return (save_directory,)
622
+
623
+ @classmethod
624
+ def from_pretrained(
625
+ cls,
626
+ pretrained_model_name_or_path: Union[str, os.PathLike],
627
+ *init_inputs,
628
+ **kwargs
629
+ ):
630
+ """Load tokenizer from pretrained directory or hub."""
631
+ if os.path.isdir(pretrained_model_name_or_path):
632
+ vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
633
+ config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
634
+
635
+ # Load config if available
636
+ config = {}
637
+ if os.path.exists(config_file):
638
+ with open(config_file, "r", encoding="utf-8") as f:
639
+ config = json.load(f)
640
+
641
+ # Merge config with kwargs
642
+ merged_config = {**config, **kwargs}
643
+
644
+ return cls(vocab_file=vocab_file, **merged_config)
645
+ else:
646
+ raise NotImplementedError("Loading from HuggingFace Hub not implemented yet")
647
+
648
+ def get_special_tokens_mask(
649
+ self,
650
+ token_ids_0: List[int],
651
+ token_ids_1: Optional[List[int]] = None,
652
+ already_has_special_tokens: bool = False
653
+ ) -> List[int]:
654
+ """Get special tokens mask."""
655
+ if already_has_special_tokens:
656
+ return [
657
+ 1 if tid in {self.bos_token_id, self.eos_token_id, self.pad_token_id, self.mask_token_id}
658
+ else 0 for tid in token_ids_0
659
+ ]
660
+
661
+ mask = [1] # BOS
662
+ mask.extend([0] * len(token_ids_0)) # Token sequence
663
+ mask.append(1) # EOS
664
+
665
+ if token_ids_1 is not None:
666
+ mask.extend([0] * len(token_ids_1)) # Second sequence
667
+ mask.append(1) # EOS
668
+
669
+ return mask
670
+
671
+ def create_token_type_ids_from_sequences(
672
+ self,
673
+ token_ids_0: List[int],
674
+ token_ids_1: Optional[List[int]] = None
675
+ ) -> List[int]:
676
+ """Create token type IDs for sequences."""
677
+ sep = [self.eos_token_id]
678
+ cls = [self.bos_token_id]
679
+
680
+ if token_ids_1 is None:
681
+ return len(cls + token_ids_0 + sep) * [0]
682
+
683
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
684
+
685
+ def build_inputs_with_special_tokens(
686
+ self,
687
+ token_ids_0: List[int],
688
+ token_ids_1: Optional[List[int]] = None
689
+ ) -> List[int]:
690
+ """Build inputs with special tokens."""
691
+ if token_ids_1 is None:
692
+ return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
693
+
694
+ return ([self.bos_token_id] + token_ids_0 + [self.eos_token_id] +
695
+ token_ids_1 + [self.eos_token_id])
696
+
697
+
698
class FastChemTokenizerSelfies(FastChemTokenizer):
    """
    SELFIES variant: input tokens are whitespace-separated, and decoding
    re-joins tokens with single spaces. Encoding uses the same trie-based
    longest match as the base class, except literal whitespace between
    tokens is skipped.
    """

    def _encode_core(self, text: str) -> List[int]:
        """Longest-match trie encoding, ignoring inter-token whitespace."""
        ids = []
        pos = 0
        length = len(text)

        while pos < length:
            if text[pos].isspace():  # skip literal whitespace
                pos += 1
                continue

            # Walk the trie (spaces inside a vocab key still match here).
            node = self.trie_root
            scan = pos
            best = None  # (token_id, end index) of the longest match so far
            while scan < length:
                node = node.children.get(text[scan])
                if node is None:
                    break
                scan += 1
                if node.token_id is not None:
                    best = (node.token_id, scan)

            if best is None:
                # Fallback: emit one character (or <unk> if unknown).
                ids.append(self.token_to_id.get(text[pos], self.unk_token_id))
                pos += 1
            else:
                ids.append(best[0])
                pos = best[1]

        return ids

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """SELFIES detokenization: tokens joined by single spaces."""
        return " ".join(tokens)

    def decode(
        self,
        token_ids: Union[List[int], torch.Tensor],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs
    ) -> str:
        """Decode ids to a space-separated SELFIES string."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        skip = set()
        if skip_special_tokens:
            skip = {
                self.bos_token_id,
                self.eos_token_id,
                self.pad_token_id,
                self.mask_token_id,
            }

        # ✅ preserve spaces between tokens
        return " ".join(
            self.id_to_token.get(tid, self.unk_token)
            for tid in token_ids
            if tid not in skip
        )
smitok_core/vocab.json ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<s>": 0,
3
+ "</s>": 2,
4
+ "<pad>": 1,
5
+ "<unk>": 3,
6
+ "<mask>": 4,
7
+ "COc1ccc(-c2coc3": 5,
8
+ "COc1ccc(-c2ccc": 6,
9
+ "COc1ccc(NC(=O)": 7,
10
+ "O=[N+]([O-])c1": 8,
11
+ "COc1ccc2[nH]c": 9,
12
+ "COc1ccc(/C=C/": 10,
13
+ "COc1ccc(-c2cc": 11,
14
+ "COc1ccc(C(=O)": 12,
15
+ "COc1ccc(-c2nc": 13,
16
+ "COc1cc2c(cc1": 14,
17
+ "COc1ccc2c(c1": 15,
18
+ "CCCCCCCCCCCC": 16,
19
+ "CC(=O)OC1CC2": 17,
20
+ "CC(=O)Nc1ccc": 18,
21
+ "CCOC(=O)c1cc": 19,
22
+ "COc1ccc(/C=C": 20,
23
+ "COc1cc(C(=O)": 21,
24
+ "COc1ccc2[nH]": 22,
25
+ "COC(=O)c1ccc": 23,
26
+ "Cc1ccc(C(=O)": 24,
27
+ "COc1cccc(-c2": 25,
28
+ "COC(=O)[C@@H": 26,
29
+ "COc1ccc(OC)c": 27,
30
+ "COc1cc(/C=C/": 28,
31
+ "CC(=O)OC1CC": 29,
32
+ "O=C(Nc1ccc(": 30,
33
+ "Cc1cc(C(=O)": 31,
34
+ "COc1cc(OC)c": 32,
35
+ "C[C@]12CC[C": 33,
36
+ "CCOC(=O)c1c": 34,
37
+ "COc1cc(/C=C": 35,
38
+ "C=CC(=O)Nc1": 36,
39
+ "O=C(Nc1cccc": 37,
40
+ "O=C(COc1ccc": 38,
41
+ "COc1ccccc1C": 39,
42
+ "COc1ccc(-c2": 40,
43
+ "COC(=O)c1cc": 41,
44
+ "COc1ccc(-n2": 42,
45
+ "COC(=O)[C@H": 43,
46
+ "COc1ccc(/C=": 44,
47
+ "CCS(=O)(=O)": 45,
48
+ "CC(=O)OCC1": 46,
49
+ "CC(C)[C@@H": 47,
50
+ "CC(C)c1ccc": 48,
51
+ "CCOC(=O)c1": 49,
52
+ "COc1ccc(CC": 50,
53
+ "CCOC(=O)[C": 51,
54
+ "COc1cc(OC)": 52,
55
+ "CCOC(=O)C1": 53,
56
+ "O=c1cc(-c2": 54,
57
+ "Nc1ncnc2c1": 55,
58
+ "COc1cc2c(c": 56,
59
+ "Cc1ccc(-c2": 57,
60
+ "C[C@@H]1CN": 58,
61
+ "COc1cc(O)c": 59,
62
+ "O=C(c1ccc(": 60,
63
+ "O=C(Nc1ccc": 61,
64
+ "COc1cccc(-": 62,
65
+ "CC(C)(C)c1": 63,
66
+ "O=S(=O)(c1": 64,
67
+ "COc1cc(/C=": 65,
68
+ "CC(C)=CCc1": 66,
69
+ "COc1ccc(CN": 67,
70
+ "COc1cc(-c2": 68,
71
+ "FC(F)(F)c1": 69,
72
+ "CN(C)c1ccc": 70,
73
+ "COc1ccc(N2": 71,
74
+ "CS(=O)(=O)": 72,
75
+ "COc1cccc(C": 73,
76
+ "COc1ccccc1": 74,
77
+ "COc1ccc(C=": 75,
78
+ "CC(=O)c1c(": 76,
79
+ "COc1ccc([C": 77,
80
+ "COc1ccc2nc": 78,
81
+ "COc1ccc(C2": 79,
82
+ "CC(=O)OC1C": 80,
83
+ "COc1cc(C2": 81,
84
+ "COc1cccc(": 82,
85
+ "CC1(C)CCC": 83,
86
+ "C=C1C(=O)": 84,
87
+ "COc1ccc(N": 85,
88
+ "O=C(c1ccc": 86,
89
+ "COC(=O)CC": 87,
90
+ "Cc1cccc(C": 88,
91
+ "COc1cc2c(": 89,
92
+ "CCOc1ccc(": 90,
93
+ "CC(=O)Oc1": 91,
94
+ "N#Cc1cccc": 92,
95
+ "COc1ccc(-": 93,
96
+ "COC(=O)[C": 94,
97
+ "Cc1cc(O)c": 95,
98
+ "Cc1cc(=O)": 96,
99
+ "COC(=O)C(": 97,
100
+ "CC(=O)N[C": 98,
101
+ "COc1ccc2c": 99,
102
+ "COC(=O)c1": 100,
103
+ "COC(=O)C1": 101,
104
+ "CC(=O)OC1": 102,
105
+ "Cc1cc(C)c": 103,
106
+ "COc1ccc(C": 104,
107
+ "COc1cccc2": 105,
108
+ "O=c1[nH]c": 106,
109
+ "CC(C)(C)C": 107,
110
+ "CC(=O)O[C": 108,
111
+ "COc1cc(O)": 109,
112
+ "Cc1ccccc1": 110,
113
+ "CC(C)[C@H": 111,
114
+ "CNC(=O)c1": 112,
115
+ "CCOC(=O)C": 113,
116
+ "CC(=O)OCC": 114,
117
+ "CC(=O)Nc1": 115,
118
+ "CC1(C)[C": 116,
119
+ "CCCCCCCC": 117,
120
+ "CCOC(=O)": 118,
121
+ "C=CC(=O)": 119,
122
+ "CC1(C)CC": 120,
123
+ "O=C(c1cc": 121,
124
+ "CCC(=O)N": 122,
125
+ "O=C(O)CC": 123,
126
+ "O=C(O)[C": 124,
127
+ "O=C(CCc1": 125,
128
+ "NC(=O)c1": 126,
129
+ "O=C(NCc1": 127,
130
+ "COC(=O)C": 128,
131
+ "CN1C(=O)": 129,
132
+ "CC(C)(C)": 130,
133
+ "CC(C)C[C": 131,
134
+ "C[C@@H]1": 132,
135
+ "COc1cccc": 133,
136
+ "CC(=O)OC": 134,
137
+ "O=C(COc1": 135,
138
+ "C=C(C)C1": 136,
139
+ "CC(=O)N1": 137,
140
+ "N#Cc1ccc": 138,
141
+ "Cc1cc2c(": 139,
142
+ "C[C@@H](": 140,
143
+ "Cc1cc(C)": 141,
144
+ "C=C1C(=O": 142,
145
+ "CC(=O)c1": 143,
146
+ "COc1ccc(": 144,
147
+ "COc1ccc2": 145,
148
+ "Cc1ccc(-": 146,
149
+ "CCNC(=O)": 147,
150
+ "COc1cc(C": 148,
151
+ "COC(=O)N": 149,
152
+ "Cc1ccc(C": 150,
153
+ "Cc1cccc(": 151,
154
+ "O=S(=O)(": 152,
155
+ "COc1c2c(": 153,
156
+ "CC1CCC2(": 154,
157
+ "COc1c(O)": 155,
158
+ "O=C(CSc1": 156,
159
+ "O=C(O)c1": 157,
160
+ "CCCC(=O)": 158,
161
+ "CC(=O)NC": 159,
162
+ "CCc1ccc(": 160,
163
+ "CCOc1ccc": 161,
164
+ "O=C1NC(=": 162,
165
+ "Cc1ccc(N": 163,
166
+ "Cc1cc(O)": 164,
167
+ "O=C1Nc2": 165,
168
+ "Cc1nc(N": 166,
169
+ "CC(C)CN": 167,
170
+ "Cc1cc(C": 168,
171
+ "CCOc1cc": 169,
172
+ "COc1cc2": 170,
173
+ "O=C1C(=": 171,
174
+ "CC12CCC": 172,
175
+ "Clc1ccc": 173,
176
+ "Cc1cc(N": 174,
177
+ "CC(C)c1": 175,
178
+ "CC(C)CC": 176,
179
+ "CC1CCC2": 177,
180
+ "CC[C@@H": 178,
181
+ "CCCCCCC": 179,
182
+ "O=C(NC1": 180,
183
+ "C=C1CCC": 181,
184
+ "Cc1cccc": 182,
185
+ "C[C@]12": 183,
186
+ "O=C(O)C": 184,
187
+ "C=C(C)C": 185,
188
+ "CC1=C(C": 186,
189
+ "CCC(C)C": 187,
190
+ "O=C(CN1": 188,
191
+ "Cc1ccc(": 189,
192
+ "CNC(=O)": 190,
193
+ "CCOC(=O": 191,
194
+ "CC(C)C(": 192,
195
+ "COC(=O)": 193,
196
+ "O=C(Nc1": 194,
197
+ "CN1C(=O": 195,
198
+ "COc1ccc": 196,
199
+ "CC1(C)C": 197,
200
+ "CCNC(=O": 198,
201
+ "CN(C)c1": 199,
202
+ "CC(=O)N": 200,
203
+ "CCc1ccc": 201,
204
+ "COc1cc(": 202,
205
+ "Cc1ccc2": 203,
206
+ "O=C(NCC": 204,
207
+ "CC(C)[C": 205,
208
+ "C[C@H]1": 206,
209
+ "CCC(=O)": 207,
210
+ "CN1CCN(": 208,
211
+ "O=C(N[C": 209,
212
+ "O=C(Cc1": 210,
213
+ "C[C@H](": 211,
214
+ "CCCc1cc": 212,
215
+ "CCN(CC)": 213,
216
+ "NC(=O)": 214,
217
+ "CC[C@H": 215,
218
+ "CC1=CC": 216,
219
+ "O=C1C=": 217,
220
+ "C[C@@H": 218,
221
+ "Cc1nc2": 219,
222
+ "Cc1cc(": 220,
223
+ "Cc1cc2": 221,
224
+ "CC(C)C": 222,
225
+ "N=C(N)": 223,
226
+ "Oc1ccc": 224,
227
+ "Cc1c(C": 225,
228
+ "CCCCc1": 226,
229
+ "Cc1ccc": 227,
230
+ "COc1cc": 228,
231
+ "CCc1nc": 229,
232
+ "Fc1ccc": 230,
233
+ "CC1OC(": 231,
234
+ "C=CCc1": 232,
235
+ "O=c1c2": 233,
236
+ "CC1(C)": 234,
237
+ "Cc1oc2": 235,
238
+ "CC1CCC": 236,
239
+ "COc1c(": 237,
240
+ "CCc1cc": 238,
241
+ "CC(C)N": 239,
242
+ "CC(C)(": 240,
243
+ "COC(=O": 241,
244
+ "c1ccc(": 242,
245
+ "CNC(=O": 243,
246
+ "CC(C)=": 244,
247
+ "O=C(CC": 245,
248
+ "Cc1nnc": 246,
249
+ "CCCCCC": 247,
250
+ "CCCCN(": 248,
251
+ "C=C1CC": 249,
252
+ "O=c1cc": 250,
253
+ "O=C(NC": 251,
254
+ "N#Cc1c": 252,
255
+ "CN(C)C": 253,
256
+ "O=C1c2": 254,
257
+ "Nc1ccc": 255,
258
+ "CC(=O)": 256,
259
+ "O=C1CC": 257,
260
+ "Cc1nc(": 258,
261
+ "CCCC[C": 259,
262
+ "C[C@@]": 260,
263
+ "CN(Cc1": 261,
264
+ "O=C1NC": 262,
265
+ "C=C(C)": 263,
266
+ "O=C(c1": 264,
267
+ "O=C(O)": 265,
268
+ "COCCN": 266,
269
+ "CCC(C": 267,
270
+ "COC1C": 268,
271
+ "COC1=": 269,
272
+ "CC1CC": 270,
273
+ "CN1CC": 271,
274
+ "O=C1C": 272,
275
+ "Cc1c(": 273,
276
+ "N#CC1": 274,
277
+ "C=CCN": 275,
278
+ "N[C@H": 276,
279
+ "c1ccc": 277,
280
+ "O=C(O": 278,
281
+ "Cc1cn": 279,
282
+ "COC[C": 280,
283
+ "O=c1c": 281,
284
+ "C=C1C": 282,
285
+ "CCn1c": 283,
286
+ "C[C@]": 284,
287
+ "CCCCN": 285,
288
+ "CC(C)": 286,
289
+ "CCOc1": 287,
290
+ "CCC[C": 288,
291
+ "Cc1nn": 289,
292
+ "O=C(N": 290,
293
+ "C/C=C": 291,
294
+ "C=C(C": 292,
295
+ "C=C[C": 293,
296
+ "CCCN(": 294,
297
+ "COc1c": 295,
298
+ "O=C(C": 296,
299
+ "Cc1nc": 297,
300
+ "Nc1nc": 298,
301
+ "CC1(C": 299,
302
+ "C[C@H": 300,
303
+ "CC(O)": 301,
304
+ "CCCCC": 302,
305
+ "Cc1cc": 303,
306
+ "CCCc1": 304,
307
+ "CCN(C": 305,
308
+ "Cc1oc": 306,
309
+ "CN(C)": 307,
310
+ "O=C([": 308,
311
+ "Cn1nc": 309,
312
+ "CCC1(": 310,
313
+ "C=CC1": 311,
314
+ "Cn1cc": 312,
315
+ "C/C(=": 313,
316
+ "CC1=C": 314,
317
+ "N#Cc1": 315,
318
+ "CC1=": 316,
319
+ "COc1": 317,
320
+ "c1cc": 318,
321
+ "CNCC": 319,
322
+ "CNc1": 320,
323
+ "CN1C": 321,
324
+ "O=c1": 322,
325
+ "C=C(": 323,
326
+ "Cc1c": 324,
327
+ "CC(O": 325,
328
+ "N#CC": 326,
329
+ "CSc1": 327,
330
+ "CC[C": 328,
331
+ "CCN1": 329,
332
+ "CCC1": 330,
333
+ "CC/C": 331,
334
+ "COC1": 332,
335
+ "Clc1": 333,
336
+ "COCC": 334,
337
+ "CC12": 335,
338
+ "O=C1": 336,
339
+ "CN[C": 337,
340
+ "CC1C": 338,
341
+ "CCc1": 339,
342
+ "CCN(": 340,
343
+ "CC(C": 341,
344
+ "OC[C": 342,
345
+ "CC=C": 343,
346
+ "CC1(": 344,
347
+ "CCCC": 345,
348
+ "Cn1c": 346,
349
+ "O=C(": 347,
350
+ "CCn1": 348,
351
+ "C=C1": 349,
352
+ "CN(C": 350,
353
+ "CCCN": 351,
354
+ "CO[C": 352,
355
+ "CCC(": 353,
356
+ "C=CC": 354,
357
+ "N[C": 355,
358
+ "O=C": 356,
359
+ "C[C": 357,
360
+ "Fc1": 358,
361
+ "CCO": 359,
362
+ "CN(": 360,
363
+ "COC": 361,
364
+ "CNC": 362,
365
+ "CC(": 363,
366
+ "CN1": 364,
367
+ "Cc1": 365,
368
+ "C/C": 366,
369
+ "Cn1": 367,
370
+ "O=S": 368,
371
+ "Cl.": 369,
372
+ "CCN": 370,
373
+ "C=C": 371,
374
+ "CC1": 372,
375
+ "N#C": 373,
376
+ "CC=": 374,
377
+ "Nc1": 375,
378
+ "NC(": 376,
379
+ "Oc1": 377,
380
+ "O[C": 378,
381
+ "CCC": 379,
382
+ "CO/": 380,
383
+ "C[N": 381,
384
+ "c1": 382,
385
+ "O=": 383,
386
+ "C[": 384,
387
+ "CC": 385,
388
+ "C#": 386,
389
+ "OC": 387,
390
+ "CN": 388,
391
+ "CO": 389,
392
+ "N#": 390,
393
+ "C=": 391,
394
+ "NC": 392,
395
+ "[": 393,
396
+ "N": 394,
397
+ "C": 395,
398
+ "OCCCNC": 396,
399
+ "BrCCC": 397,
400
+ "+]": 398,
401
+ "NOc": 399,
402
+ "CSc": 400,
403
+ "nH": 401,
404
+ "OCCN": 402,
405
+ "\u0100": 403,
406
+ "\u00b6": 404,
407
+ ">": 405,
408
+ "COCCn": 406,
409
+ "\u00e5": 407,
410
+ "(-[": 408,
411
+ "nncs": 409,
412
+ "CCCCCSc": 410,
413
+ "OCOC": 411,
414
+ "CCCCCn": 412,
415
+ "OCN": 413,
416
+ "CCCCCCNC": 414,
417
+ "OCCCCC": 415,
418
+ "NCCOCCO": 416,
419
+ "\u00cd": 417,
420
+ "NCCCCn": 418,
421
+ ")\\": 419,
422
+ "\u00c7": 420,
423
+ "CCNCC": 421,
424
+ "nccn": 422,
425
+ "CCCCNc": 423,
426
+ "\u00e4": 424,
427
+ "NCCS": 425,
428
+ "NCCCCC": 426,
429
+ "snc": 427,
430
+ "COCCSCc": 428,
431
+ "NCCOCc": 429,
432
+ "NCC": 430,
433
+ "\u00c1": 431,
434
+ "CONC": 432,
435
+ "\u00e6": 433,
436
+ "\u00ce": 434,
437
+ "CCl": 435,
438
+ "+": 436,
439
+ "cncn": 437,
440
+ "\u012f": 438,
441
+ "(": 439,
442
+ "\u0105": 440,
443
+ "SC": 441,
444
+ "OCCS": 442,
445
+ "\u00b9": 443,
446
+ "\u00ac": 444,
447
+ "+])": 445,
448
+ "!": 446,
449
+ "-])(": 447,
450
+ "CCCOCC": 448,
451
+ "Fc": 449,
452
+ "SCCCC": 450,
453
+ "&": 451,
454
+ "CCCF": 452,
455
+ "\u0142": 453,
456
+ "{": 454,
457
+ "CH": 455,
458
+ "\u0110": 456,
459
+ "\u0111": 457,
460
+ "SCN": 458,
461
+ "\u00f5": 459,
462
+ "COCCO": 460,
463
+ "\u0122": 461,
464
+ "NCCCSC": 462,
465
+ "NCCCn": 463,
466
+ "BrCc": 464,
467
+ "\u00a5": 465,
468
+ "CCOCCC": 466,
469
+ "](/[": 467,
470
+ "\u00af": 468,
471
+ "SN": 469,
472
+ "OCCO": 470,
473
+ "\u00c6": 471,
474
+ "CCCCCN": 472,
475
+ "CSCCc": 473,
476
+ "@](=": 474,
477
+ "CCCCCCS": 475,
478
+ "cccnc": 476,
479
+ "\u0112": 477,
480
+ "CSCCNC": 478,
481
+ "COCCc": 479,
482
+ "([": 480,
483
+ "P": 481,
484
+ "NCCO": 482,
485
+ "=[": 483,
486
+ "ncc": 484,
487
+ "nccs": 485,
488
+ "\u00f7": 486,
489
+ "123": 487,
490
+ "-][": 488,
491
+ "\u00f8": 489,
492
+ "t": 490,
493
+ "ncccn": 491,
494
+ "\u00ec": 492,
495
+ "(=": 493,
496
+ "+](-": 494,
497
+ "@@]([": 495,
498
+ "CSCCCCNC": 496,
499
+ "NNS": 497,
500
+ "/": 498,
501
+ "O": 499,
502
+ "\u00ee": 500,
503
+ "coc": 501,
504
+ "CCCNC": 502,
505
+ ")(=": 503,
506
+ "r": 504,
507
+ "NO": 505,
508
+ "co": 506,
509
+ "-]/": 507,
510
+ "OCCCn": 508,
511
+ "CNCCc": 509,
512
+ "\u00c4": 510,
513
+ "ccn": 511,
514
+ "cc": 512,
515
+ "CSCN": 513,
516
+ "\u00d5": 514,
517
+ "\u00b1": 515,
518
+ "snnc": 516,
519
+ "ncccc": 517,
520
+ "\u013e": 518,
521
+ "OCCCc": 519,
522
+ "\u00bd": 520,
523
+ "NCCNC": 521,
524
+ "8": 522,
525
+ "][": 523,
526
+ "CCc": 524,
527
+ "COCCOCCNC": 525,
528
+ "onc": 526,
529
+ "\u010d": 527,
530
+ "COCCCC": 528,
531
+ "\u00fe": 529,
532
+ "(/[": 530,
533
+ "\u00e3": 531,
534
+ "CCCCCNC": 532,
535
+ "J": 533,
536
+ "CCOCCCc": 534,
537
+ "OCCNS": 535,
538
+ "cccnn": 536,
539
+ "CCCNCc": 537,
540
+ "\u0124": 538,
541
+ "\u0137": 539,
542
+ "5": 540,
543
+ "COCCCN": 541,
544
+ "CCn": 542,
545
+ "CSCCOc": 543,
546
+ "A": 544,
547
+ "ns": 545,
548
+ "cnc": 546,
549
+ "NCCCOc": 547,
550
+ "CCCl": 548,
551
+ "\u0101": 549,
552
+ "CCCO": 550,
553
+ "OCCCO": 551,
554
+ "OCCCN": 552,
555
+ "CCCOC": 553,
556
+ "\u00df": 554,
557
+ "SCn": 555,
558
+ "ssc": 556,
559
+ "\u0127": 557,
560
+ "n": 558,
561
+ "OCCOCCOCCOCCO": 559,
562
+ "w": 560,
563
+ "CCCCOC": 561,
564
+ "ClCc": 562,
565
+ "41": 563,
566
+ "u": 564,
567
+ "\u0104": 565,
568
+ "\u010e": 566,
569
+ "conc": 567,
570
+ "\u00d3": 568,
571
+ "$": 569,
572
+ "OCCCNc": 570,
573
+ "\u0130": 571,
574
+ "\u00ca": 572,
575
+ "\u012c": 573,
576
+ "cccs": 574,
577
+ "\u0106": 575,
578
+ "CNCCN": 576,
579
+ "\u00ed": 577,
580
+ "nsnc": 578,
581
+ ",": 579,
582
+ "\u00c8": 580,
583
+ "\u00be": 581,
584
+ "d": 582,
585
+ "COCO": 583,
586
+ "SCCNC": 584,
587
+ "ccnnc": 585,
588
+ "(\\[": 586,
589
+ "\u00a1": 587,
590
+ "SCCN": 588,
591
+ "-])\\": 589,
592
+ "NCCCC": 590,
593
+ "NCN": 591,
594
+ "#": 592,
595
+ "-])=[": 593,
596
+ "OCCOCCOCCO": 594,
597
+ "\u00d9": 595,
598
+ "NCCSCc": 596,
599
+ "nncc": 597,
600
+ "\u00ab": 598,
601
+ "sccc": 599,
602
+ "\u00fb": 600,
603
+ ")(/": 601,
604
+ "\u00ff": 602,
605
+ "Cl": 603,
606
+ "SH": 604,
607
+ "CCCCOCc": 605,
608
+ "9": 606,
609
+ "NCCCN": 607,
610
+ "-])/": 608,
611
+ "CSCCS": 609,
612
+ "CCCNCC": 610,
613
+ "nc": 611,
614
+ "NOCCc": 612,
615
+ "p": 613,
616
+ "CCOC": 614,
617
+ "COCCS": 615,
618
+ "NCCCS": 616,
619
+ "Oc": 617,
620
+ "\u010f": 618,
621
+ "cnnc": 619,
622
+ "Y": 620,
623
+ "0": 621,
624
+ "NCCCCl": 622,
625
+ "\u011e": 623,
626
+ "OCCOCC": 624,
627
+ "SCCc": 625,
628
+ "ncoc": 626,
629
+ "OCCCCN": 627,
630
+ "OCO": 628,
631
+ "OCCc": 629,
632
+ "\u0131": 630,
633
+ "z": 631,
634
+ "+]([": 632,
635
+ "OCCCS": 633,
636
+ ")/": 634,
637
+ "ccsc": 635,
638
+ "312": 636,
639
+ "nnnn": 637,
640
+ "ssnc": 638,
641
+ "COCCCn": 639,
642
+ "ocnc": 640,
643
+ "cnco": 641,
644
+ "\u0134": 642,
645
+ "COCc": 643,
646
+ "nccnc": 644,
647
+ "@]": 645,
648
+ "OCCCCn": 646,
649
+ "Nn": 647,
650
+ "nncn": 648,
651
+ "\u0102": 649,
652
+ "CNCCC": 650,
653
+ "NCCNS": 651,
654
+ ":": 652,
655
+ "\u00ea": 653,
656
+ "CCONC": 654,
657
+ "CCCCNC": 655,
658
+ "CCCNc": 656,
659
+ "\u00b8": 657,
660
+ "CCSCc": 658,
661
+ "CNS": 659,
662
+ "\u00b4": 660,
663
+ "NCCCOCC": 661,
664
+ "54": 662,
665
+ "\u00cc": 663,
666
+ "COCCNc": 664,
667
+ "CCOCC": 665,
668
+ "ccnc": 666,
669
+ "\u0109": 667,
670
+ "\u00d6": 668,
671
+ "COCCOCCN": 669,
672
+ "\"": 670,
673
+ "cnoc": 671,
674
+ "cncc": 672,
675
+ "\u00e7": 673,
676
+ "@@](": 674,
677
+ "D": 675,
678
+ "\u00b0": 676,
679
+ "SCCCc": 677,
680
+ "\u00dc": 678,
681
+ "\u00b3": 679,
682
+ "o": 680,
683
+ "CSCCSC": 681,
684
+ "CCCCCCO": 682,
685
+ ".": 683,
686
+ "NCCCNS": 684,
687
+ "j": 685,
688
+ "\u0129": 686,
689
+ "CCOCCCn": 687,
690
+ "SCCO": 688,
691
+ "CCCn": 689,
692
+ "\u0140": 690,
693
+ "%": 691,
694
+ "CCCOCc": 692,
695
+ "m": 693,
696
+ "\u013f": 694,
697
+ "+](/[": 695,
698
+ "](/": 696,
699
+ ")[": 697,
700
+ "\u00bc": 698,
701
+ "\u0132": 699,
702
+ "CCCCSC": 700,
703
+ "NCCCOC": 701,
704
+ "x": 702,
705
+ "SCC": 703,
706
+ "~": 704,
707
+ ")=": 705,
708
+ "cscc": 706,
709
+ "occc": 707,
710
+ "NCCNc": 708,
711
+ "\u012e": 709,
712
+ "NCCCCOc": 710,
713
+ "\u0123": 711,
714
+ "ccon": 712,
715
+ "*": 713,
716
+ "E": 714,
717
+ "SCCOC": 715,
718
+ "+]=[": 716,
719
+ "COCCC": 717,
720
+ "\u011a": 718,
721
+ "Sc": 719,
722
+ "COCCOc": 720,
723
+ "+]\\": 721,
724
+ "CNn": 722,
725
+ "NCCCO": 723,
726
+ "NCCN": 724,
727
+ "CSCc": 725,
728
+ "\u00f1": 726,
729
+ "b": 727,
730
+ "CCOCCOc": 728,
731
+ ";": 729,
732
+ "4": 730,
733
+ "CCCCCc": 731,
734
+ "\u0133": 732,
735
+ "\u00d7": 733,
736
+ "CBr": 734,
737
+ "+](=": 735,
738
+ "OCCCOc": 736,
739
+ "\u0120": 737,
740
+ "CCCNS": 738,
741
+ "OS": 739,
742
+ "CCSCCOc": 740,
743
+ "OCCSC": 741,
744
+ "ccncc": 742,
745
+ "CCSc": 743,
746
+ "\u013c": 744,
747
+ "7": 745,
748
+ "COCCNC": 746,
749
+ "\u00c9": 747,
750
+ "CCCCOCC": 748,
751
+ "NS": 749,
752
+ "CCCCCS": 750,
753
+ "32": 751,
754
+ "U": 752,
755
+ "\u00a8": 753,
756
+ "SCCCO": 754,
757
+ "\u00b2": 755,
758
+ "COCCCCC": 756,
759
+ "CCOCCSc": 757,
760
+ "COCOc": 758,
761
+ "CCCCCOC": 759,
762
+ "s": 760,
763
+ "CSCCn": 761,
764
+ "NCCCNC": 762,
765
+ "OCCn": 763,
766
+ "ncco": 764,
767
+ "ClC": 765,
768
+ "(/": 766,
769
+ "R": 767,
770
+ "ccc": 768,
771
+ "ccco": 769,
772
+ "\u0108": 770,
773
+ "-])": 771,
774
+ "OP": 772,
775
+ "'": 773,
776
+ "12": 774,
777
+ "COS": 775,
778
+ "34": 776,
779
+ "CCCCn": 777,
780
+ "Z": 778,
781
+ "CCSCCOC": 779,
782
+ "\u00e8": 780,
783
+ "ncon": 781,
784
+ "\u00a2": 782,
785
+ "CCOCCOC": 783,
786
+ "ccccn": 784,
787
+ "CSCCC": 785,
788
+ "NCCCSc": 786,
789
+ "sc": 787,
790
+ "CNc": 788,
791
+ "-])([": 789,
792
+ "OCCCSC": 790,
793
+ "45": 791,
794
+ "NOCc": 792,
795
+ "35": 793,
796
+ "\u00de": 794,
797
+ "\u010c": 795,
798
+ "ncsc": 796,
799
+ "3": 797,
800
+ "NCCn": 798,
801
+ "COCCOCC": 799,
802
+ "@](": 800,
803
+ "NCCC": 801,
804
+ "_": 802,
805
+ "\u00ae": 803,
806
+ "ncnn": 804,
807
+ "nonc": 805,
808
+ "CCNS": 806,
809
+ "CCCCl": 807,
810
+ "OCCNC": 808,
811
+ "CCOCc": 809,
812
+ "NCCOc": 810,
813
+ "ccs": 811,
814
+ "43": 812,
815
+ "+][": 813,
816
+ "g": 814,
817
+ "CCCCCCN": 815,
818
+ "\u00a4": 816,
819
+ "CCOCCO": 817,
820
+ "CSCCCNC": 818,
821
+ "ccncn": 819,
822
+ "\u00dd": 820,
823
+ "SCCOc": 821,
824
+ "Brc": 822,
825
+ "\u0136": 823,
826
+ "\u0114": 824,
827
+ "\u0125": 825,
828
+ "SCCCOc": 826,
829
+ "}": 827,
830
+ "CNCc": 828,
831
+ "ClCC": 829,
832
+ "Cc": 830,
833
+ "nsc": 831,
834
+ "\u0143": 832,
835
+ "\u0116": 833,
836
+ "I": 834,
837
+ "CCBr": 835,
838
+ "CCOP": 836,
839
+ "6": 837,
840
+ "OCCOC": 838,
841
+ "scc": 839,
842
+ "CSCCOC": 840,
843
+ "CCOCCCNc": 841,
844
+ "NNc": 842,
845
+ "CCCSc": 843,
846
+ "\u011c": 844,
847
+ "NCCCCN": 845,
848
+ "H": 846,
849
+ "IC": 847,
850
+ "cnsc": 848,
851
+ "\u0107": 849,
852
+ "CCOCCOCC": 850,
853
+ "\u0138": 851,
854
+ "e": 852,
855
+ "\u010b": 853,
856
+ "\u012b": 854,
857
+ "CCCc": 855,
858
+ "nnco": 856,
859
+ "CCCSCC": 857,
860
+ "\u00f2": 858,
861
+ "SCCC": 859,
862
+ "\u013a": 860,
863
+ "cnccn": 861,
864
+ "OCCSCc": 862,
865
+ "\u0121": 863,
866
+ "\u0113": 864,
867
+ "CCOCCCC": 865,
868
+ "\u00da": 866,
869
+ "\u00a6": 867,
870
+ "l": 868,
871
+ "On": 869,
872
+ "CCOc": 870,
873
+ "\u00e0": 871,
874
+ "21": 872,
875
+ "+])(": 873,
876
+ "2": 874,
877
+ "](": 875,
878
+ "CSCCCN": 876,
879
+ "OCn": 877,
880
+ "CNN": 878,
881
+ "\u00fd": 879,
882
+ "NCCSCC": 880,
883
+ "@@]": 881,
884
+ "csnn": 882,
885
+ "CCCS": 883,
886
+ "K": 884,
887
+ "-[": 885,
888
+ "OCc": 886,
889
+ "SCCn": 887,
890
+ "\u00fc": 888,
891
+ "COCCOC": 889,
892
+ "COCCCOC": 890,
893
+ "/[": 891,
894
+ "OCC": 892,
895
+ "+](": 893,
896
+ "cnccc": 894,
897
+ "V": 895,
898
+ "`": 896,
899
+ "NCCOCC": 897,
900
+ ")=[": 898,
901
+ "<": 899,
902
+ "OCCSc": 900,
903
+ "cnn": 901,
904
+ "NOC": 902,
905
+ "\u00d8": 903,
906
+ "ONC": 904,
907
+ "\u0139": 905,
908
+ "Nc": 906,
909
+ "Ic": 907,
910
+ "nnn": 908,
911
+ "\u00cb": 909,
912
+ "NCCSc": 910,
913
+ "OH": 911,
914
+ "CCCCCO": 912,
915
+ "OCCC": 913,
916
+ "]([": 914,
917
+ "COCCNS": 915,
918
+ "\u00db": 916,
919
+ "CCCBr": 917,
920
+ "CSC": 918,
921
+ "CCOCCOCc": 919,
922
+ "NCCCNc": 920,
923
+ "@@](=": 921,
924
+ "occ": 922,
925
+ "@]([": 923,
926
+ "\u011b": 924,
927
+ "]/": 925,
928
+ "OCCBr": 926,
929
+ "\u011d": 927,
930
+ "CCOCCn": 928,
931
+ "BrCC": 929,
932
+ "sn": 930,
933
+ "COCCSc": 931,
934
+ "\u013b": 932,
935
+ ")/[": 933,
936
+ "\u00a9": 934,
937
+ "\u00c2": 935,
938
+ "nscc": 936,
939
+ "CCCCS": 937,
940
+ "NNC": 938,
941
+ "no": 939,
942
+ "ON": 940,
943
+ "ccno": 941,
944
+ "NCCc": 942,
945
+ "ccnn": 943,
946
+ "Br": 944,
947
+ "ncn": 945,
948
+ "nn": 946,
949
+ "noc": 947,
950
+ "S": 948,
951
+ "nccc": 949,
952
+ "scnc": 950,
953
+ "OCCF": 951,
954
+ "cccc": 952,
955
+ "+](/": 953,
956
+ "nnccc": 954,
957
+ "cnnn": 955,
958
+ "i": 956,
959
+ "\u00aa": 957,
960
+ "CCNCc": 958,
961
+ "\u00eb": 959,
962
+ "NCc": 960,
963
+ "B": 961,
964
+ "CCOCCS": 962,
965
+ "(-": 963,
966
+ "\u00d1": 964,
967
+ "\u00e2": 965,
968
+ "\u0119": 966,
969
+ "q": 967,
970
+ "-]": 968,
971
+ "NOCC": 969,
972
+ "\u013d": 970,
973
+ "ncncc": 971,
974
+ "NCCCc": 972,
975
+ "@@": 973,
976
+ ")(": 974,
977
+ "L": 975,
978
+ "SCCSc": 976,
979
+ "NNN": 977,
980
+ "13": 978,
981
+ "OCCNc": 979,
982
+ "COCCNCc": 980,
983
+ "F": 981,
984
+ "-]=[": 982,
985
+ "CON": 983,
986
+ "COCCCOc": 984,
987
+ "-])[": 985,
988
+ "CCCCSc": 986,
989
+ "y": 987,
990
+ "Cn": 988,
991
+ "\u010a": 989,
992
+ "oncc": 990,
993
+ ")-": 991,
994
+ "\u012a": 992,
995
+ "COCCCS": 993,
996
+ "OCCCC": 994,
997
+ "^": 995,
998
+ "X": 996,
999
+ "\u0118": 997,
1000
+ "CCOCCN": 998,
1001
+ "CCCOc": 999,
1002
+ "nnnc": 1000,
1003
+ "SCc": 1001,
1004
+ "k": 1002,
1005
+ "CCCCCCCCCCC": 1003,
1006
+ "COP": 1004,
1007
+ "@": 1005,
1008
+ "(#": 1006,
1009
+ "nnc": 1007,
1010
+ "cnns": 1008,
1011
+ "ccoc": 1009,
1012
+ "\u012d": 1010,
1013
+ "1": 1011,
1014
+ "\\": 1012,
1015
+ "CCCCO": 1013,
1016
+ "CCCCNS": 1014,
1017
+ "NSC": 1015,
1018
+ "\u0126": 1016,
1019
+ "+]/": 1017,
1020
+ "23": 1018,
1021
+ "\u00bb": 1019,
1022
+ "31": 1020,
1023
+ "CCCCOc": 1021,
1024
+ "OCCSCC": 1022,
1025
+ "+])([": 1023,
1026
+ "\u00fa": 1024,
1027
+ "W": 1025,
1028
+ "\u00b7": 1026,
1029
+ "\u0135": 1027,
1030
+ "FC": 1028,
1031
+ "\u0115": 1029,
1032
+ "\u00f9": 1030,
1033
+ "SCCS": 1031,
1034
+ "\u00bf": 1032,
1035
+ "\u011f": 1033,
1036
+ "CS": 1034,
1037
+ "\u00ef": 1035,
1038
+ "+])[": 1036,
1039
+ "\u00e1": 1037,
1040
+ "\u00cf": 1038,
1041
+ "on": 1039,
1042
+ "+]=": 1040,
1043
+ "COc": 1041,
1044
+ "\u00c3": 1042,
1045
+ "CCOCCCN": 1043,
1046
+ "cnsn": 1044,
1047
+ "CCOCCNc": 1045,
1048
+ "\u00b5": 1046,
1049
+ "CSCCCNc": 1047,
1050
+ "CCSS": 1048,
1051
+ "cccn": 1049,
1052
+ "cncnc": 1050,
1053
+ "-": 1051,
1054
+ "OCCOc": 1052,
1055
+ "NH": 1053,
1056
+ "\u00d2": 1054,
1057
+ "Clc": 1055,
1058
+ "G": 1056,
1059
+ "-])=": 1057,
1060
+ "(\\": 1058,
1061
+ "CSCCO": 1059,
1062
+ "\u00c0": 1060,
1063
+ "COCCCNC": 1061,
1064
+ "CCSC": 1062,
1065
+ "=": 1063,
1066
+ "CSCC": 1064,
1067
+ "\u0117": 1065,
1068
+ "]": 1066,
1069
+ "CCOCCNC": 1067,
1070
+ "OCCCSc": 1068,
1071
+ "CCS": 1069,
1072
+ "oc": 1070,
1073
+ "csc": 1071,
1074
+ "OCCCl": 1072,
1075
+ "\u0103": 1073,
1076
+ "CCOCCOCCOCC": 1074,
1077
+ "CCCSCc": 1075,
1078
+ "\u00e9": 1076,
1079
+ "CSCCCC": 1077,
1080
+ "SCCCS": 1078,
1081
+ "CI": 1079,
1082
+ "\u00f6": 1080,
1083
+ "\u00ba": 1081,
1084
+ "42": 1082,
1085
+ "Q": 1083,
1086
+ "c": 1084,
1087
+ "?": 1085,
1088
+ "f": 1086,
1089
+ "M": 1087,
1090
+ "\u0141": 1088,
1091
+ "COCCCCN": 1089,
1092
+ "FCCC": 1090,
1093
+ "OCCOCCS": 1091,
1094
+ "cs": 1092,
1095
+ "\u00f3": 1093,
1096
+ "NCCCCCC": 1094,
1097
+ "(=[": 1095,
1098
+ "\u00c5": 1096,
1099
+ "NN": 1097,
1100
+ "\u00f0": 1098,
1101
+ "a": 1099,
1102
+ "|": 1100,
1103
+ "COCCCNc": 1101,
1104
+ "CCSCCC": 1102,
1105
+ "CCCCCOc": 1103,
1106
+ "v": 1104,
1107
+ "\u0128": 1105,
1108
+ "cn": 1106,
1109
+ "CCCCc": 1107,
1110
+ "CCNc": 1108,
1111
+ "\u00a3": 1109,
1112
+ "CCNC": 1110,
1113
+ "24": 1111,
1114
+ "CCCSC": 1112,
1115
+ "T": 1113,
1116
+ "NCCSC": 1114,
1117
+ "CSCCN": 1115,
1118
+ "cscn": 1116,
1119
+ "COn": 1117,
1120
+ "ccccc": 1118,
1121
+ "ClCCCSc": 1119,
1122
+ "CCSCC": 1120,
1123
+ "ncnc": 1121,
1124
+ "\u00a7": 1122,
1125
+ "h": 1123,
1126
+ "cncs": 1124,
1127
+ "\u00d4": 1125,
1128
+ "sccn": 1126,
1129
+ ")([": 1127,
1130
+ ")": 1128,
1131
+ "nnsc": 1129,
1132
+ "\\[": 1130,
1133
+ "CCOCCCNC": 1131,
1134
+ "\u00d0": 1132,
1135
+ "NCCOC": 1133,
1136
+ "\u00f4": 1134
1137
+ }