gbyuvd commited on
Commit
1c27d17
·
verified ·
1 Parent(s): 2c5df2a

Upload HF wrapper and smitok_core without tails

Browse files
Files changed (2) hide show
  1. FastChemTokenizerHF.py +769 -0
  2. smitok_core/vocab.json +1137 -0
FastChemTokenizerHF.py ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import os
4
+ from typing import List, Union, Optional, Tuple, Dict, Any
5
+ from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
6
+ from transformers.utils import PaddingStrategy, TensorType
7
+ from functools import lru_cache
8
+
9
+
10
class TrieNode:
    """One node of the character trie used for longest-match tokenization.

    Attributes:
        children: mapping of next character -> child TrieNode.
        token_id: vocabulary id if the root-to-here path spells a complete
            token, otherwise None (interior node only).
    """
    __slots__ = ['children', 'token_id']

    def __init__(self):
        # Starts with no children; becomes terminal only when token_id is set.
        self.children = dict()
        self.token_id = None
15
+
16
+
17
+ class FastChemTokenizer(PreTrainedTokenizerBase):
18
+ """
19
+ Fully HuggingFace API compatible tokenizer for chemical representations.
20
+ """
21
+
22
+ vocab_files_names = {"vocab_file": "vocab.json"}
23
+
24
    def __init__(
        self,
        token_to_id=None,
        vocab_file=None,
        model_max_length=512,
        padding_side="right",
        truncation_side="right",
        chat_template=None,
        **kwargs
    ):
        """Build the tokenizer from an in-memory vocab or a vocab.json file.

        Args:
            token_to_id: mapping of token string -> integer id. If
                `vocab_file` is also given, the file contents overwrite this.
            vocab_file: path to a JSON file holding the token -> id mapping.
            model_max_length: maximum sequence length reported to HF.
            padding_side: "right" or "left".
            truncation_side: "right" or "left".
            chat_template: optional HF chat template, forwarded to the base.

        Raises:
            ValueError: if neither `token_to_id` nor `vocab_file` is given.
            KeyError: if any required special token is missing from the vocab.
        """
        # Handle vocab loading
        if token_to_id is None and vocab_file is None:
            raise ValueError("Either token_to_id or vocab_file must be provided")

        if vocab_file is not None:
            with open(vocab_file, "r", encoding="utf-8") as f:
                token_to_id = json.load(f)
            # JSON round-trips keys as strings; coerce ids back to int.
            token_to_id = {str(k): int(v) for k, v in token_to_id.items()}

        self.token_to_id = token_to_id
        self.id_to_token = {v: k for k, v in token_to_id.items()}

        # Precompute max token length for possible use & clarity
        self.max_token_len = max(len(t) for t in token_to_id.keys()) if token_to_id else 0

        # Build trie for fast longest-match lookup
        self.trie_root = self._build_trie(token_to_id)

        # Validate required special tokens
        required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
        for tok in required_special_tokens:
            if tok not in token_to_id:
                raise KeyError(f"Required special token '{tok}' not found in vocab.")

        # ✅ Assign special token IDs explicitly
        # NOTE(review): ids are set before super().__init__; presumably so
        # anything the base-class setup calls can already use them — confirm.
        self.bos_token_id = token_to_id["<s>"]
        self.eos_token_id = token_to_id["</s>"]
        self.pad_token_id = token_to_id["<pad>"]
        self.unk_token_id = token_to_id["<unk>"]
        self.mask_token_id = token_to_id["<mask>"]

        # Special tokens
        bos_token = "<s>"
        eos_token = "</s>"
        pad_token = "<pad>"
        unk_token = "<unk>"
        mask_token = "<mask>"

        # Initialize parent class with all required parameters
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=None,
            pad_token=pad_token,
            cls_token=None,
            mask_token=mask_token,
            additional_special_tokens=[],
            model_max_length=model_max_length,
            padding_side=padding_side,
            truncation_side=truncation_side,
            chat_template=chat_template,
            **kwargs,
        )
88
+
89
+ def _build_trie(self, token_to_id):
90
+ root = TrieNode()
91
+ for token, tid in token_to_id.items():
92
+ node = root
93
+ for char in token:
94
+ if char not in node.children:
95
+ node.children[char] = TrieNode()
96
+ node = node.children[char]
97
+ node.token_id = tid
98
+ return root
99
+
100
+ @property
101
+ def vocab_size(self):
102
+ return len(self.token_to_id)
103
+
104
+ def __len__(self):
105
+ return len(self.token_to_id)
106
+
107
+ def get_vocab(self) -> Dict[str, int]:
108
+ return self.token_to_id.copy()
109
+
110
+ @lru_cache(maxsize=10000)
111
+ def _cached_encode_str(self, s: str) -> Tuple[int, ...]:
112
+ return tuple(self._encode_core(s))
113
+
114
+ def _encode_core(self, text: str) -> List[int]:
115
+ """Core encoding logic using Trie — no caching."""
116
+ tokens = text
117
+ result_ids = []
118
+ i = 0
119
+ n = len(tokens)
120
+
121
+ while i < n:
122
+ node = self.trie_root
123
+ j = i
124
+ last_match_id = None
125
+ last_match_end = i
126
+
127
+ while j < n and tokens[j] in node.children:
128
+ node = node.children[tokens[j]]
129
+ j += 1
130
+ if node.token_id is not None:
131
+ last_match_id = node.token_id
132
+ last_match_end = j
133
+
134
+ if last_match_id is not None:
135
+ result_ids.append(last_match_id)
136
+ i = last_match_end
137
+ else:
138
+ tok = tokens[i]
139
+ result_ids.append(self.token_to_id.get(tok, self.unk_token_id))
140
+ i += 1
141
+
142
+ return result_ids
143
+
144
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
145
+ token_ids = self._encode_core(text.strip())
146
+ return [self.id_to_token[tid] for tid in token_ids]
147
+
148
+ def _convert_token_to_id(self, token: str) -> int:
149
+ return self.token_to_id.get(token, self.unk_token_id)
150
+
151
+ def _convert_id_to_token(self, index: int) -> str:
152
+ return self.id_to_token.get(index, self.unk_token)
153
+
154
+ # ✅ Public methods
155
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
156
+ if isinstance(tokens, str):
157
+ return self._convert_token_to_id(tokens)
158
+ return [self._convert_token_to_id(tok) for tok in tokens]
159
+
160
+ def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
161
+ if isinstance(ids, int):
162
+ return self._convert_id_to_token(ids)
163
+ return [self._convert_id_to_token(i) for i in ids]
164
+
165
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
166
+ """SMILES-style decoding: no spaces between tokens."""
167
+ return "".join(tokens)
168
+
169
+ def encode(
170
+ self,
171
+ text: str,
172
+ text_pair: Optional[str] = None,
173
+ add_special_tokens: bool = True,
174
+ padding: bool = False,
175
+ truncation: bool = False,
176
+ max_length: Optional[int] = None,
177
+ return_tensors: Optional[str] = None,
178
+ ) -> List[int]:
179
+ encoded = self.encode_plus(
180
+ text=text,
181
+ text_pair=text_pair,
182
+ add_special_tokens=add_special_tokens,
183
+ padding=padding,
184
+ truncation=truncation,
185
+ max_length=max_length,
186
+ return_tensors=return_tensors,
187
+ )
188
+
189
+ input_ids = encoded["input_ids"]
190
+ if isinstance(input_ids, torch.Tensor):
191
+ if input_ids.dim() > 1:
192
+ input_ids = input_ids.squeeze(0)
193
+ input_ids = input_ids.tolist()
194
+
195
+ return input_ids
196
+
197
+ def decode(
198
+ self,
199
+ token_ids: Union[List[int], torch.Tensor],
200
+ skip_special_tokens: bool = False,
201
+ clean_up_tokenization_spaces: bool = None,
202
+ **kwargs
203
+ ) -> str:
204
+ if isinstance(token_ids, torch.Tensor):
205
+ token_ids = token_ids.tolist()
206
+
207
+ if skip_special_tokens:
208
+ special_ids = {
209
+ self.bos_token_id,
210
+ self.eos_token_id,
211
+ self.pad_token_id,
212
+ self.mask_token_id,
213
+ }
214
+ else:
215
+ special_ids = set()
216
+
217
+ tokens = []
218
+ for tid in token_ids:
219
+ if tid in special_ids:
220
+ continue
221
+ token = self.id_to_token.get(tid, self.unk_token)
222
+ tokens.append(token)
223
+
224
+ return "".join(tokens)
225
+
226
+ def batch_decode(
227
+ self,
228
+ sequences: Union[List[List[int]], torch.Tensor],
229
+ skip_special_tokens: bool = False,
230
+ clean_up_tokenization_spaces: bool = None,
231
+ **kwargs
232
+ ) -> List[str]:
233
+ """Batch decode sequences."""
234
+ if isinstance(sequences, torch.Tensor):
235
+ sequences = sequences.tolist()
236
+
237
+ return [
238
+ self.decode(
239
+ seq,
240
+ skip_special_tokens=skip_special_tokens,
241
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
242
+ **kwargs
243
+ )
244
+ for seq in sequences
245
+ ]
246
+
247
+ def decode_with_trace(self, token_ids: List[int]) -> None:
248
+ print(f"\n🔍 Decoding {len(token_ids)} tokens:")
249
+ for i, tid in enumerate(token_ids):
250
+ token = self.id_to_token.get(tid, self.unk_token)
251
+ print(f" [{i:03d}] ID={tid:5d} → '{token}'")
252
+
253
+ def __call__(
254
+ self,
255
+ text: Union[str, List[str]],
256
+ text_pair: Optional[Union[str, List[str]]] = None,
257
+ add_special_tokens: bool = True,
258
+ padding: Union[bool, str, PaddingStrategy] = False,
259
+ truncation: Union[bool, str] = False,
260
+ max_length: Optional[int] = None,
261
+ stride: int = 0,
262
+ is_split_into_words: bool = False,
263
+ pad_to_multiple_of: Optional[int] = None,
264
+ return_tensors: Optional[Union[str, TensorType]] = None,
265
+ return_token_type_ids: Optional[bool] = None,
266
+ return_attention_mask: Optional[bool] = None,
267
+ return_overflowing_tokens: bool = False,
268
+ return_special_tokens_mask: bool = False,
269
+ return_offsets_mapping: bool = False,
270
+ return_length: bool = False,
271
+ verbose: bool = True,
272
+ **kwargs
273
+ ) -> BatchEncoding:
274
+ """
275
+ Main callable method that handles both single and batch inputs.
276
+ """
277
+ # Handle defaults
278
+ if return_token_type_ids is None:
279
+ return_token_type_ids = True
280
+ if return_attention_mask is None:
281
+ return_attention_mask = True
282
+
283
+ if isinstance(text, list):
284
+ if text_pair is not None:
285
+ batch = [(t, p) for t, p in zip(text, text_pair)]
286
+ else:
287
+ batch = text
288
+ return self.batch_encode_plus(
289
+ batch,
290
+ add_special_tokens=add_special_tokens,
291
+ padding=padding,
292
+ truncation=truncation,
293
+ max_length=max_length,
294
+ stride=stride,
295
+ is_split_into_words=is_split_into_words,
296
+ pad_to_multiple_of=pad_to_multiple_of,
297
+ return_tensors=return_tensors,
298
+ return_token_type_ids=return_token_type_ids,
299
+ return_attention_mask=return_attention_mask,
300
+ return_overflowing_tokens=return_overflowing_tokens,
301
+ return_special_tokens_mask=return_special_tokens_mask,
302
+ return_offsets_mapping=return_offsets_mapping,
303
+ return_length=return_length,
304
+ verbose=verbose,
305
+ **kwargs
306
+ )
307
+ else:
308
+ return self.encode_plus(
309
+ text=text,
310
+ text_pair=text_pair,
311
+ add_special_tokens=add_special_tokens,
312
+ padding=padding,
313
+ truncation=truncation,
314
+ max_length=max_length,
315
+ stride=stride,
316
+ is_split_into_words=is_split_into_words,
317
+ pad_to_multiple_of=pad_to_multiple_of,
318
+ return_tensors=return_tensors,
319
+ return_token_type_ids=return_token_type_ids,
320
+ return_attention_mask=return_attention_mask,
321
+ return_overflowing_tokens=return_overflowing_tokens,
322
+ return_special_tokens_mask=return_special_tokens_mask,
323
+ return_offsets_mapping=return_offsets_mapping,
324
+ return_length=return_length,
325
+ verbose=verbose,
326
+ **kwargs
327
+ )
328
+
329
    def encode_plus(
        self,
        text: str,
        text_pair: Optional[str] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = True,
        return_attention_mask: Optional[bool] = True,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """Encode one text (and optional pair) into a BatchEncoding.

        Layout with special tokens: <s> a </s> for a single text and
        <s> a </s> b </s> for a pair (type ids 0 for the first segment,
        1 for the second). Truncation is a plain tail cut to `max_length`;
        padding here only supports True/"max_length" (pad-to-longest is a
        batch-level concept handled in batch_encode_plus).

        Note: stride, is_split_into_words, pad_to_multiple_of,
        return_overflowing_tokens and return_offsets_mapping are accepted
        for API compatibility but not acted on in this implementation.
        """
        if max_length is None:
            max_length = self.model_max_length

        # Cached trie encoding; returns a tuple, so copy into a list.
        ids_a = list(self._cached_encode_str(text.strip()))

        if text_pair is not None:
            ids_b = list(self._cached_encode_str(text_pair.strip()))
        else:
            ids_b = None

        input_ids = []
        token_type_ids = []

        if add_special_tokens:
            input_ids.append(self.bos_token_id)
            token_type_ids.append(0)
            if ids_b is not None:
                # First segment: <s> a </s>, all type 0.
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)

                # Second segment: b </s>, all type 1.
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(1)
            else:
                input_ids.extend(ids_a)
                token_type_ids.extend([0] * len(ids_a))
                input_ids.append(self.eos_token_id)
                token_type_ids.append(0)
        else:
            # No specials: raw ids, pair (if any) appended with type 1.
            input_ids = ids_a.copy()
            token_type_ids = [0] * len(input_ids)
            if ids_b is not None:
                input_ids.extend(ids_b)
                token_type_ids.extend([1] * len(ids_b))

        # Handle truncation
        # NOTE(review): a tail cut can drop the trailing </s>; presumably
        # acceptable for this model — confirm.
        if truncation and len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            token_type_ids = token_type_ids[:max_length]

        # Handle padding
        # NOTE(review): `padding == True` also matches padding == 1;
        # "longest" is deliberately not handled for a single sequence.
        if padding == True or padding == "max_length":
            pad_len = max_length - len(input_ids)
            if pad_len > 0:
                if self.padding_side == "right":
                    input_ids.extend([self.pad_token_id] * pad_len)
                    token_type_ids.extend([0] * pad_len)
                else:
                    input_ids = [self.pad_token_id] * pad_len + input_ids
                    token_type_ids = [0] * pad_len + token_type_ids

        # Mask is derived from the pad id, so any legitimate occurrence of
        # the pad token in content would also be masked out.
        attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]

        encoded_dict = {
            "input_ids": input_ids,
        }

        if return_attention_mask:
            encoded_dict["attention_mask"] = attention_mask

        if return_token_type_ids:
            encoded_dict["token_type_ids"] = token_type_ids

        if return_special_tokens_mask:
            special_tokens_mask = [
                1 if tid in {self.bos_token_id, self.eos_token_id, self.pad_token_id, self.mask_token_id} else 0
                for tid in input_ids
            ]
            encoded_dict["special_tokens_mask"] = special_tokens_mask

        if return_length:
            # Length excludes padding.
            encoded_dict["length"] = len([tid for tid in input_ids if tid != self.pad_token_id])

        if return_tensors == "pt":
            output = {}
            for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)
                if tensor.ndim == 1:
                    # Add a batch dimension: (seq_len,) -> (1, seq_len).
                    tensor = tensor.unsqueeze(0)
                output[k] = tensor
        else:
            output = encoded_dict

        return BatchEncoding(output, tensor_type=return_tensors)
437
+
438
+ def batch_encode_plus(
439
+ self,
440
+ batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
441
+ add_special_tokens: bool = True,
442
+ padding: Union[bool, str, PaddingStrategy] = False,
443
+ truncation: Union[bool, str] = False,
444
+ max_length: Optional[int] = None,
445
+ stride: int = 0,
446
+ is_split_into_words: bool = False,
447
+ pad_to_multiple_of: Optional[int] = None,
448
+ return_tensors: Optional[Union[str, TensorType]] = None,
449
+ return_token_type_ids: Optional[bool] = True,
450
+ return_attention_mask: Optional[bool] = True,
451
+ return_overflowing_tokens: bool = False,
452
+ return_special_tokens_mask: bool = False,
453
+ return_offsets_mapping: bool = False,
454
+ return_length: bool = False,
455
+ verbose: bool = True,
456
+ **kwargs
457
+ ) -> BatchEncoding:
458
+ all_input_ids = []
459
+ all_attention_masks = []
460
+ all_token_type_ids = []
461
+ all_special_tokens_masks = []
462
+ all_lengths = []
463
+
464
+ for item in batch_text_or_text_pairs:
465
+ if isinstance(item, tuple):
466
+ text, text_pair = item
467
+ else:
468
+ text, text_pair = item, None
469
+
470
+ encoded = self.encode_plus(
471
+ text=text,
472
+ text_pair=text_pair,
473
+ add_special_tokens=add_special_tokens,
474
+ padding=False, # We'll handle batch padding later
475
+ truncation=truncation,
476
+ max_length=max_length,
477
+ stride=stride,
478
+ is_split_into_words=is_split_into_words,
479
+ pad_to_multiple_of=pad_to_multiple_of,
480
+ return_tensors=None, # Don't convert to tensors yet
481
+ return_token_type_ids=return_token_type_ids,
482
+ return_attention_mask=return_attention_mask,
483
+ return_overflowing_tokens=return_overflowing_tokens,
484
+ return_special_tokens_mask=return_special_tokens_mask,
485
+ return_offsets_mapping=return_offsets_mapping,
486
+ return_length=return_length,
487
+ verbose=verbose,
488
+ **kwargs
489
+ )
490
+
491
+ all_input_ids.append(encoded["input_ids"])
492
+ if "attention_mask" in encoded:
493
+ all_attention_masks.append(encoded["attention_mask"])
494
+ if "token_type_ids" in encoded:
495
+ all_token_type_ids.append(encoded["token_type_ids"])
496
+ if "special_tokens_mask" in encoded:
497
+ all_special_tokens_masks.append(encoded["special_tokens_mask"])
498
+ if "length" in encoded:
499
+ all_lengths.append(encoded["length"])
500
+
501
+ batched = {
502
+ "input_ids": all_input_ids,
503
+ }
504
+
505
+ if all_attention_masks:
506
+ batched["attention_mask"] = all_attention_masks
507
+ if all_token_type_ids:
508
+ batched["token_type_ids"] = all_token_type_ids
509
+ if all_special_tokens_masks:
510
+ batched["special_tokens_mask"] = all_special_tokens_masks
511
+ if all_lengths:
512
+ batched["length"] = all_lengths
513
+
514
+ # Handle batch padding
515
+ if padding == True or padding == "longest":
516
+ max_len = max(len(ids) for ids in all_input_ids)
517
+ for key in batched:
518
+ if key in ["input_ids", "attention_mask", "token_type_ids", "special_tokens_mask"]:
519
+ padded_seqs = []
520
+ for seq in batched[key]:
521
+ pad_len = max_len - len(seq)
522
+ if pad_len > 0:
523
+ if key == "input_ids":
524
+ padding_value = self.pad_token_id
525
+ else:
526
+ padding_value = 0
527
+
528
+ if self.padding_side == "right":
529
+ padded_seq = seq + [padding_value] * pad_len
530
+ else:
531
+ padded_seq = [padding_value] * pad_len + seq
532
+ else:
533
+ padded_seq = seq
534
+ padded_seqs.append(padded_seq)
535
+ batched[key] = padded_seqs
536
+
537
+ if return_tensors == "pt":
538
+ def to_tensor_list(lst):
539
+ return [torch.tensor(item, dtype=torch.long) for item in lst]
540
+
541
+ for key in ["input_ids", "attention_mask", "token_type_ids", "special_tokens_mask"]:
542
+ if key in batched:
543
+ batched[key] = torch.nn.utils.rnn.pad_sequence(
544
+ to_tensor_list(batched[key]),
545
+ batch_first=True,
546
+ padding_value=self.pad_token_id if key == "input_ids" else 0
547
+ )
548
+
549
+ # Handle non-sequence data
550
+ if "length" in batched:
551
+ batched["length"] = torch.tensor(batched["length"], dtype=torch.long)
552
+
553
+ return BatchEncoding(batched, tensor_type=return_tensors)
554
+
555
+ def pad(
556
+ self,
557
+ encoded_inputs,
558
+ padding: Union[bool, str, PaddingStrategy] = True,
559
+ max_length: Optional[int] = None,
560
+ pad_to_multiple_of: Optional[int] = None,
561
+ return_attention_mask: Optional[bool] = None,
562
+ return_tensors: Optional[Union[str, TensorType]] = None,
563
+ verbose: bool = True,
564
+ ) -> BatchEncoding:
565
+ """Pad encoded inputs."""
566
+ # This is a simplified version - full implementation would be more complex
567
+ return encoded_inputs
568
+
569
+ # Save/Load methods
570
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
571
+ """Save vocabulary to files."""
572
+ if not os.path.isdir(save_directory):
573
+ os.makedirs(save_directory)
574
+
575
+ vocab_file = os.path.join(
576
+ save_directory,
577
+ (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
578
+ )
579
+
580
+ with open(vocab_file, "w", encoding="utf-8") as f:
581
+ json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
582
+
583
+ return (vocab_file,)
584
+
585
+ def save_pretrained(
586
+ self,
587
+ save_directory: Union[str, os.PathLike],
588
+ legacy_format: bool = True,
589
+ filename_prefix: Optional[str] = None,
590
+ push_to_hub: bool = False,
591
+ **kwargs
592
+ ):
593
+ """Save tokenizer to directory."""
594
+ if not os.path.exists(save_directory):
595
+ os.makedirs(save_directory)
596
+
597
+ # Save vocabulary
598
+ vocab_files = self.save_vocabulary(save_directory, filename_prefix)
599
+
600
+ # Save tokenizer config
601
+ tokenizer_config = {
602
+ "tokenizer_class": self.__class__.__name__,
603
+ "model_max_length": self.model_max_length,
604
+ "padding_side": self.padding_side,
605
+ "truncation_side": self.truncation_side,
606
+ "special_tokens": {
607
+ "bos_token": self.bos_token,
608
+ "eos_token": self.eos_token,
609
+ "pad_token": self.pad_token,
610
+ "unk_token": self.unk_token,
611
+ "mask_token": self.mask_token,
612
+ }
613
+ }
614
+
615
+ config_file = os.path.join(save_directory, "tokenizer_config.json")
616
+ with open(config_file, "w", encoding="utf-8") as f:
617
+ json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
618
+
619
+ print(f"✅ Tokenizer saved to: {save_directory}")
620
+
621
+ return (save_directory,)
622
+
623
+ @classmethod
624
+ def from_pretrained(
625
+ cls,
626
+ pretrained_model_name_or_path: Union[str, os.PathLike],
627
+ *init_inputs,
628
+ **kwargs
629
+ ):
630
+ """Load tokenizer from pretrained directory or hub."""
631
+ if os.path.isdir(pretrained_model_name_or_path):
632
+ vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
633
+ config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
634
+
635
+ # Load config if available
636
+ config = {}
637
+ if os.path.exists(config_file):
638
+ with open(config_file, "r", encoding="utf-8") as f:
639
+ config = json.load(f)
640
+
641
+ # Merge config with kwargs
642
+ merged_config = {**config, **kwargs}
643
+
644
+ return cls(vocab_file=vocab_file, **merged_config)
645
+ else:
646
+ raise NotImplementedError("Loading from HuggingFace Hub not implemented yet")
647
+
648
+ def get_special_tokens_mask(
649
+ self,
650
+ token_ids_0: List[int],
651
+ token_ids_1: Optional[List[int]] = None,
652
+ already_has_special_tokens: bool = False
653
+ ) -> List[int]:
654
+ """Get special tokens mask."""
655
+ if already_has_special_tokens:
656
+ return [
657
+ 1 if tid in {self.bos_token_id, self.eos_token_id, self.pad_token_id, self.mask_token_id}
658
+ else 0 for tid in token_ids_0
659
+ ]
660
+
661
+ mask = [1] # BOS
662
+ mask.extend([0] * len(token_ids_0)) # Token sequence
663
+ mask.append(1) # EOS
664
+
665
+ if token_ids_1 is not None:
666
+ mask.extend([0] * len(token_ids_1)) # Second sequence
667
+ mask.append(1) # EOS
668
+
669
+ return mask
670
+
671
+ def create_token_type_ids_from_sequences(
672
+ self,
673
+ token_ids_0: List[int],
674
+ token_ids_1: Optional[List[int]] = None
675
+ ) -> List[int]:
676
+ """Create token type IDs for sequences."""
677
+ sep = [self.eos_token_id]
678
+ cls = [self.bos_token_id]
679
+
680
+ if token_ids_1 is None:
681
+ return len(cls + token_ids_0 + sep) * [0]
682
+
683
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
684
+
685
+ def build_inputs_with_special_tokens(
686
+ self,
687
+ token_ids_0: List[int],
688
+ token_ids_1: Optional[List[int]] = None
689
+ ) -> List[int]:
690
+ """Build inputs with special tokens."""
691
+ if token_ids_1 is None:
692
+ return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
693
+
694
+ return ([self.bos_token_id] + token_ids_0 + [self.eos_token_id] +
695
+ token_ids_1 + [self.eos_token_id])
696
+
697
+
698
class FastChemTokenizerSelfies(FastChemTokenizer):
    """
    SELFIES variant: input tokens are whitespace-separated, and decoding
    re-joins tokens with single spaces. Encoding uses the same trie-based
    longest match as the base class, except literal whitespace between
    tokens is skipped.
    """

    def _encode_core(self, text: str) -> List[int]:
        """Longest-match trie encoding, ignoring inter-token whitespace."""
        ids = []
        pos = 0
        length = len(text)

        while pos < length:
            if text[pos].isspace():  # skip literal whitespace
                pos += 1
                continue

            # Walk the trie (spaces inside a vocab key still match here).
            node = self.trie_root
            scan = pos
            best = None  # (token_id, end index) of the longest match so far
            while scan < length:
                node = node.children.get(text[scan])
                if node is None:
                    break
                scan += 1
                if node.token_id is not None:
                    best = (node.token_id, scan)

            if best is None:
                # Fallback: emit one character (or <unk> if unknown).
                ids.append(self.token_to_id.get(text[pos], self.unk_token_id))
                pos += 1
            else:
                ids.append(best[0])
                pos = best[1]

        return ids

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """SELFIES detokenization: tokens joined by single spaces."""
        return " ".join(tokens)

    def decode(
        self,
        token_ids: Union[List[int], torch.Tensor],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs
    ) -> str:
        """Decode ids to a space-separated SELFIES string."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        skip = set()
        if skip_special_tokens:
            skip = {
                self.bos_token_id,
                self.eos_token_id,
                self.pad_token_id,
                self.mask_token_id,
            }

        # ✅ preserve spaces between tokens
        return " ".join(
            self.id_to_token.get(tid, self.unk_token)
            for tid in token_ids
            if tid not in skip
        )
smitok_core/vocab.json ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<s>": 0,
3
+ "</s>": 2,
4
+ "<pad>": 1,
5
+ "<unk>": 3,
6
+ "<mask>": 4,
7
+ "COc1ccc(-c2coc3": 5,
8
+ "COc1ccc(-c2ccc": 6,
9
+ "COc1ccc(NC(=O)": 7,
10
+ "O=[N+]([O-])c1": 8,
11
+ "COc1ccc2[nH]c": 9,
12
+ "COc1ccc(/C=C/": 10,
13
+ "COc1ccc(-c2cc": 11,
14
+ "COc1ccc(C(=O)": 12,
15
+ "COc1ccc(-c2nc": 13,
16
+ "COc1cc2c(cc1": 14,
17
+ "COc1ccc2c(c1": 15,
18
+ "CCCCCCCCCCCC": 16,
19
+ "CC(=O)OC1CC2": 17,
20
+ "CC(=O)Nc1ccc": 18,
21
+ "CCOC(=O)c1cc": 19,
22
+ "COc1ccc(/C=C": 20,
23
+ "COc1cc(C(=O)": 21,
24
+ "COc1ccc2[nH]": 22,
25
+ "COC(=O)c1ccc": 23,
26
+ "Cc1ccc(C(=O)": 24,
27
+ "COc1cccc(-c2": 25,
28
+ "COC(=O)[C@@H": 26,
29
+ "COc1ccc(OC)c": 27,
30
+ "COc1cc(/C=C/": 28,
31
+ "CC(=O)OC1CC": 29,
32
+ "O=C(Nc1ccc(": 30,
33
+ "Cc1cc(C(=O)": 31,
34
+ "COc1cc(OC)c": 32,
35
+ "C[C@]12CC[C": 33,
36
+ "CCOC(=O)c1c": 34,
37
+ "COc1cc(/C=C": 35,
38
+ "C=CC(=O)Nc1": 36,
39
+ "O=C(Nc1cccc": 37,
40
+ "O=C(COc1ccc": 38,
41
+ "COc1ccccc1C": 39,
42
+ "COc1ccc(-c2": 40,
43
+ "COC(=O)c1cc": 41,
44
+ "COc1ccc(-n2": 42,
45
+ "COC(=O)[C@H": 43,
46
+ "COc1ccc(/C=": 44,
47
+ "CCS(=O)(=O)": 45,
48
+ "CC(=O)OCC1": 46,
49
+ "CC(C)[C@@H": 47,
50
+ "CC(C)c1ccc": 48,
51
+ "CCOC(=O)c1": 49,
52
+ "COc1ccc(CC": 50,
53
+ "CCOC(=O)[C": 51,
54
+ "COc1cc(OC)": 52,
55
+ "CCOC(=O)C1": 53,
56
+ "O=c1cc(-c2": 54,
57
+ "Nc1ncnc2c1": 55,
58
+ "COc1cc2c(c": 56,
59
+ "Cc1ccc(-c2": 57,
60
+ "C[C@@H]1CN": 58,
61
+ "COc1cc(O)c": 59,
62
+ "O=C(c1ccc(": 60,
63
+ "O=C(Nc1ccc": 61,
64
+ "COc1cccc(-": 62,
65
+ "CC(C)(C)c1": 63,
66
+ "O=S(=O)(c1": 64,
67
+ "COc1cc(/C=": 65,
68
+ "CC(C)=CCc1": 66,
69
+ "COc1ccc(CN": 67,
70
+ "COc1cc(-c2": 68,
71
+ "FC(F)(F)c1": 69,
72
+ "CN(C)c1ccc": 70,
73
+ "COc1ccc(N2": 71,
74
+ "CS(=O)(=O)": 72,
75
+ "COc1cccc(C": 73,
76
+ "COc1ccccc1": 74,
77
+ "COc1ccc(C=": 75,
78
+ "CC(=O)c1c(": 76,
79
+ "COc1ccc([C": 77,
80
+ "COc1ccc2nc": 78,
81
+ "COc1ccc(C2": 79,
82
+ "CC(=O)OC1C": 80,
83
+ "COc1cc(C2": 81,
84
+ "COc1cccc(": 82,
85
+ "CC1(C)CCC": 83,
86
+ "C=C1C(=O)": 84,
87
+ "COc1ccc(N": 85,
88
+ "O=C(c1ccc": 86,
89
+ "COC(=O)CC": 87,
90
+ "Cc1cccc(C": 88,
91
+ "COc1cc2c(": 89,
92
+ "CCOc1ccc(": 90,
93
+ "CC(=O)Oc1": 91,
94
+ "N#Cc1cccc": 92,
95
+ "COc1ccc(-": 93,
96
+ "COC(=O)[C": 94,
97
+ "Cc1cc(O)c": 95,
98
+ "Cc1cc(=O)": 96,
99
+ "COC(=O)C(": 97,
100
+ "CC(=O)N[C": 98,
101
+ "COc1ccc2c": 99,
102
+ "COC(=O)c1": 100,
103
+ "COC(=O)C1": 101,
104
+ "CC(=O)OC1": 102,
105
+ "Cc1cc(C)c": 103,
106
+ "COc1ccc(C": 104,
107
+ "COc1cccc2": 105,
108
+ "O=c1[nH]c": 106,
109
+ "CC(C)(C)C": 107,
110
+ "CC(=O)O[C": 108,
111
+ "COc1cc(O)": 109,
112
+ "Cc1ccccc1": 110,
113
+ "CC(C)[C@H": 111,
114
+ "CNC(=O)c1": 112,
115
+ "CCOC(=O)C": 113,
116
+ "CC(=O)OCC": 114,
117
+ "CC(=O)Nc1": 115,
118
+ "CC1(C)[C": 116,
119
+ "CCCCCCCC": 117,
120
+ "CCOC(=O)": 118,
121
+ "C=CC(=O)": 119,
122
+ "CC1(C)CC": 120,
123
+ "O=C(c1cc": 121,
124
+ "CCC(=O)N": 122,
125
+ "O=C(O)CC": 123,
126
+ "O=C(O)[C": 124,
127
+ "O=C(CCc1": 125,
128
+ "NC(=O)c1": 126,
129
+ "O=C(NCc1": 127,
130
+ "COC(=O)C": 128,
131
+ "CN1C(=O)": 129,
132
+ "CC(C)(C)": 130,
133
+ "CC(C)C[C": 131,
134
+ "C[C@@H]1": 132,
135
+ "COc1cccc": 133,
136
+ "CC(=O)OC": 134,
137
+ "O=C(COc1": 135,
138
+ "C=C(C)C1": 136,
139
+ "CC(=O)N1": 137,
140
+ "N#Cc1ccc": 138,
141
+ "Cc1cc2c(": 139,
142
+ "C[C@@H](": 140,
143
+ "Cc1cc(C)": 141,
144
+ "C=C1C(=O": 142,
145
+ "CC(=O)c1": 143,
146
+ "COc1ccc(": 144,
147
+ "COc1ccc2": 145,
148
+ "Cc1ccc(-": 146,
149
+ "CCNC(=O)": 147,
150
+ "COc1cc(C": 148,
151
+ "COC(=O)N": 149,
152
+ "Cc1ccc(C": 150,
153
+ "Cc1cccc(": 151,
154
+ "O=S(=O)(": 152,
155
+ "COc1c2c(": 153,
156
+ "CC1CCC2(": 154,
157
+ "COc1c(O)": 155,
158
+ "O=C(CSc1": 156,
159
+ "O=C(O)c1": 157,
160
+ "CCCC(=O)": 158,
161
+ "CC(=O)NC": 159,
162
+ "CCc1ccc(": 160,
163
+ "CCOc1ccc": 161,
164
+ "O=C1NC(=": 162,
165
+ "Cc1ccc(N": 163,
166
+ "Cc1cc(O)": 164,
167
+ "O=C1Nc2": 165,
168
+ "Cc1nc(N": 166,
169
+ "CC(C)CN": 167,
170
+ "Cc1cc(C": 168,
171
+ "CCOc1cc": 169,
172
+ "COc1cc2": 170,
173
+ "O=C1C(=": 171,
174
+ "CC12CCC": 172,
175
+ "Clc1ccc": 173,
176
+ "Cc1cc(N": 174,
177
+ "CC(C)c1": 175,
178
+ "CC(C)CC": 176,
179
+ "CC1CCC2": 177,
180
+ "CC[C@@H": 178,
181
+ "CCCCCCC": 179,
182
+ "O=C(NC1": 180,
183
+ "C=C1CCC": 181,
184
+ "Cc1cccc": 182,
185
+ "C[C@]12": 183,
186
+ "O=C(O)C": 184,
187
+ "C=C(C)C": 185,
188
+ "CC1=C(C": 186,
189
+ "CCC(C)C": 187,
190
+ "O=C(CN1": 188,
191
+ "Cc1ccc(": 189,
192
+ "CNC(=O)": 190,
193
+ "CCOC(=O": 191,
194
+ "CC(C)C(": 192,
195
+ "COC(=O)": 193,
196
+ "O=C(Nc1": 194,
197
+ "CN1C(=O": 195,
198
+ "COc1ccc": 196,
199
+ "CC1(C)C": 197,
200
+ "CCNC(=O": 198,
201
+ "CN(C)c1": 199,
202
+ "CC(=O)N": 200,
203
+ "CCc1ccc": 201,
204
+ "COc1cc(": 202,
205
+ "Cc1ccc2": 203,
206
+ "O=C(NCC": 204,
207
+ "CC(C)[C": 205,
208
+ "C[C@H]1": 206,
209
+ "CCC(=O)": 207,
210
+ "CN1CCN(": 208,
211
+ "O=C(N[C": 209,
212
+ "O=C(Cc1": 210,
213
+ "C[C@H](": 211,
214
+ "CCCc1cc": 212,
215
+ "CCN(CC)": 213,
216
+ "NC(=O)": 214,
217
+ "CC[C@H": 215,
218
+ "CC1=CC": 216,
219
+ "O=C1C=": 217,
220
+ "C[C@@H": 218,
221
+ "Cc1nc2": 219,
222
+ "Cc1cc(": 220,
223
+ "Cc1cc2": 221,
224
+ "CC(C)C": 222,
225
+ "N=C(N)": 223,
226
+ "Oc1ccc": 224,
227
+ "Cc1c(C": 225,
228
+ "CCCCc1": 226,
229
+ "Cc1ccc": 227,
230
+ "COc1cc": 228,
231
+ "CCc1nc": 229,
232
+ "Fc1ccc": 230,
233
+ "CC1OC(": 231,
234
+ "C=CCc1": 232,
235
+ "O=c1c2": 233,
236
+ "CC1(C)": 234,
237
+ "Cc1oc2": 235,
238
+ "CC1CCC": 236,
239
+ "COc1c(": 237,
240
+ "CCc1cc": 238,
241
+ "CC(C)N": 239,
242
+ "CC(C)(": 240,
243
+ "COC(=O": 241,
244
+ "c1ccc(": 242,
245
+ "CNC(=O": 243,
246
+ "CC(C)=": 244,
247
+ "O=C(CC": 245,
248
+ "Cc1nnc": 246,
249
+ "CCCCCC": 247,
250
+ "CCCCN(": 248,
251
+ "C=C1CC": 249,
252
+ "O=c1cc": 250,
253
+ "O=C(NC": 251,
254
+ "N#Cc1c": 252,
255
+ "CN(C)C": 253,
256
+ "O=C1c2": 254,
257
+ "Nc1ccc": 255,
258
+ "CC(=O)": 256,
259
+ "O=C1CC": 257,
260
+ "Cc1nc(": 258,
261
+ "CCCC[C": 259,
262
+ "C[C@@]": 260,
263
+ "CN(Cc1": 261,
264
+ "O=C1NC": 262,
265
+ "C=C(C)": 263,
266
+ "O=C(c1": 264,
267
+ "O=C(O)": 265,
268
+ "COCCN": 266,
269
+ "CCC(C": 267,
270
+ "COC1C": 268,
271
+ "COC1=": 269,
272
+ "CC1CC": 270,
273
+ "CN1CC": 271,
274
+ "O=C1C": 272,
275
+ "Cc1c(": 273,
276
+ "N#CC1": 274,
277
+ "C=CCN": 275,
278
+ "N[C@H": 276,
279
+ "c1ccc": 277,
280
+ "O=C(O": 278,
281
+ "Cc1cn": 279,
282
+ "COC[C": 280,
283
+ "O=c1c": 281,
284
+ "C=C1C": 282,
285
+ "CCn1c": 283,
286
+ "C[C@]": 284,
287
+ "CCCCN": 285,
288
+ "CC(C)": 286,
289
+ "CCOc1": 287,
290
+ "CCC[C": 288,
291
+ "Cc1nn": 289,
292
+ "O=C(N": 290,
293
+ "C/C=C": 291,
294
+ "C=C(C": 292,
295
+ "C=C[C": 293,
296
+ "CCCN(": 294,
297
+ "COc1c": 295,
298
+ "O=C(C": 296,
299
+ "Cc1nc": 297,
300
+ "Nc1nc": 298,
301
+ "CC1(C": 299,
302
+ "C[C@H": 300,
303
+ "CC(O)": 301,
304
+ "CCCCC": 302,
305
+ "Cc1cc": 303,
306
+ "CCCc1": 304,
307
+ "CCN(C": 305,
308
+ "Cc1oc": 306,
309
+ "CN(C)": 307,
310
+ "O=C([": 308,
311
+ "Cn1nc": 309,
312
+ "CCC1(": 310,
313
+ "C=CC1": 311,
314
+ "Cn1cc": 312,
315
+ "C/C(=": 313,
316
+ "CC1=C": 314,
317
+ "N#Cc1": 315,
318
+ "CC1=": 316,
319
+ "COc1": 317,
320
+ "c1cc": 318,
321
+ "CNCC": 319,
322
+ "CNc1": 320,
323
+ "CN1C": 321,
324
+ "O=c1": 322,
325
+ "C=C(": 323,
326
+ "Cc1c": 324,
327
+ "CC(O": 325,
328
+ "N#CC": 326,
329
+ "CSc1": 327,
330
+ "CC[C": 328,
331
+ "CCN1": 329,
332
+ "CCC1": 330,
333
+ "CC/C": 331,
334
+ "COC1": 332,
335
+ "Clc1": 333,
336
+ "COCC": 334,
337
+ "CC12": 335,
338
+ "O=C1": 336,
339
+ "CN[C": 337,
340
+ "CC1C": 338,
341
+ "CCc1": 339,
342
+ "CCN(": 340,
343
+ "CC(C": 341,
344
+ "OC[C": 342,
345
+ "CC=C": 343,
346
+ "CC1(": 344,
347
+ "CCCC": 345,
348
+ "Cn1c": 346,
349
+ "O=C(": 347,
350
+ "CCn1": 348,
351
+ "C=C1": 349,
352
+ "CN(C": 350,
353
+ "CCCN": 351,
354
+ "CO[C": 352,
355
+ "CCC(": 353,
356
+ "C=CC": 354,
357
+ "N[C": 355,
358
+ "O=C": 356,
359
+ "C[C": 357,
360
+ "Fc1": 358,
361
+ "CCO": 359,
362
+ "CN(": 360,
363
+ "COC": 361,
364
+ "CNC": 362,
365
+ "CC(": 363,
366
+ "CN1": 364,
367
+ "Cc1": 365,
368
+ "C/C": 366,
369
+ "Cn1": 367,
370
+ "O=S": 368,
371
+ "Cl.": 369,
372
+ "CCN": 370,
373
+ "C=C": 371,
374
+ "CC1": 372,
375
+ "N#C": 373,
376
+ "CC=": 374,
377
+ "Nc1": 375,
378
+ "NC(": 376,
379
+ "Oc1": 377,
380
+ "O[C": 378,
381
+ "CCC": 379,
382
+ "CO/": 380,
383
+ "C[N": 381,
384
+ "c1": 382,
385
+ "O=": 383,
386
+ "C[": 384,
387
+ "CC": 385,
388
+ "C#": 386,
389
+ "OC": 387,
390
+ "CN": 388,
391
+ "CO": 389,
392
+ "N#": 390,
393
+ "C=": 391,
394
+ "NC": 392,
395
+ "[": 393,
396
+ "N": 394,
397
+ "C": 395,
398
+ "OCCCNC": 396,
399
+ "BrCCC": 397,
400
+ "+]": 398,
401
+ "NOc": 399,
402
+ "CSc": 400,
403
+ "nH": 401,
404
+ "OCCN": 402,
405
+ "\u0100": 403,
406
+ "\u00b6": 404,
407
+ ">": 405,
408
+ "COCCn": 406,
409
+ "\u00e5": 407,
410
+ "(-[": 408,
411
+ "nncs": 409,
412
+ "CCCCCSc": 410,
413
+ "OCOC": 411,
414
+ "CCCCCn": 412,
415
+ "OCN": 413,
416
+ "CCCCCCNC": 414,
417
+ "OCCCCC": 415,
418
+ "NCCOCCO": 416,
419
+ "\u00cd": 417,
420
+ "NCCCCn": 418,
421
+ ")\\": 419,
422
+ "\u00c7": 420,
423
+ "CCNCC": 421,
424
+ "nccn": 422,
425
+ "CCCCNc": 423,
426
+ "\u00e4": 424,
427
+ "NCCS": 425,
428
+ "NCCCCC": 426,
429
+ "snc": 427,
430
+ "COCCSCc": 428,
431
+ "NCCOCc": 429,
432
+ "NCC": 430,
433
+ "\u00c1": 431,
434
+ "CONC": 432,
435
+ "\u00e6": 433,
436
+ "\u00ce": 434,
437
+ "CCl": 435,
438
+ "+": 436,
439
+ "cncn": 437,
440
+ "\u012f": 438,
441
+ "(": 439,
442
+ "\u0105": 440,
443
+ "SC": 441,
444
+ "OCCS": 442,
445
+ "\u00b9": 443,
446
+ "\u00ac": 444,
447
+ "+])": 445,
448
+ "!": 446,
449
+ "-])(": 447,
450
+ "CCCOCC": 448,
451
+ "Fc": 449,
452
+ "SCCCC": 450,
453
+ "&": 451,
454
+ "CCCF": 452,
455
+ "\u0142": 453,
456
+ "{": 454,
457
+ "CH": 455,
458
+ "\u0110": 456,
459
+ "\u0111": 457,
460
+ "SCN": 458,
461
+ "\u00f5": 459,
462
+ "COCCO": 460,
463
+ "\u0122": 461,
464
+ "NCCCSC": 462,
465
+ "NCCCn": 463,
466
+ "BrCc": 464,
467
+ "\u00a5": 465,
468
+ "CCOCCC": 466,
469
+ "](/[": 467,
470
+ "\u00af": 468,
471
+ "SN": 469,
472
+ "OCCO": 470,
473
+ "\u00c6": 471,
474
+ "CCCCCN": 472,
475
+ "CSCCc": 473,
476
+ "@](=": 474,
477
+ "CCCCCCS": 475,
478
+ "cccnc": 476,
479
+ "\u0112": 477,
480
+ "CSCCNC": 478,
481
+ "COCCc": 479,
482
+ "([": 480,
483
+ "P": 481,
484
+ "NCCO": 482,
485
+ "=[": 483,
486
+ "ncc": 484,
487
+ "nccs": 485,
488
+ "\u00f7": 486,
489
+ "123": 487,
490
+ "-][": 488,
491
+ "\u00f8": 489,
492
+ "t": 490,
493
+ "ncccn": 491,
494
+ "\u00ec": 492,
495
+ "(=": 493,
496
+ "+](-": 494,
497
+ "@@]([": 495,
498
+ "CSCCCCNC": 496,
499
+ "NNS": 497,
500
+ "/": 498,
501
+ "O": 499,
502
+ "\u00ee": 500,
503
+ "coc": 501,
504
+ "CCCNC": 502,
505
+ ")(=": 503,
506
+ "r": 504,
507
+ "NO": 505,
508
+ "co": 506,
509
+ "-]/": 507,
510
+ "OCCCn": 508,
511
+ "CNCCc": 509,
512
+ "\u00c4": 510,
513
+ "ccn": 511,
514
+ "cc": 512,
515
+ "CSCN": 513,
516
+ "\u00d5": 514,
517
+ "\u00b1": 515,
518
+ "snnc": 516,
519
+ "ncccc": 517,
520
+ "\u013e": 518,
521
+ "OCCCc": 519,
522
+ "\u00bd": 520,
523
+ "NCCNC": 521,
524
+ "8": 522,
525
+ "][": 523,
526
+ "CCc": 524,
527
+ "COCCOCCNC": 525,
528
+ "onc": 526,
529
+ "\u010d": 527,
530
+ "COCCCC": 528,
531
+ "\u00fe": 529,
532
+ "(/[": 530,
533
+ "\u00e3": 531,
534
+ "CCCCCNC": 532,
535
+ "J": 533,
536
+ "CCOCCCc": 534,
537
+ "OCCNS": 535,
538
+ "cccnn": 536,
539
+ "CCCNCc": 537,
540
+ "\u0124": 538,
541
+ "\u0137": 539,
542
+ "5": 540,
543
+ "COCCCN": 541,
544
+ "CCn": 542,
545
+ "CSCCOc": 543,
546
+ "A": 544,
547
+ "ns": 545,
548
+ "cnc": 546,
549
+ "NCCCOc": 547,
550
+ "CCCl": 548,
551
+ "\u0101": 549,
552
+ "CCCO": 550,
553
+ "OCCCO": 551,
554
+ "OCCCN": 552,
555
+ "CCCOC": 553,
556
+ "\u00df": 554,
557
+ "SCn": 555,
558
+ "ssc": 556,
559
+ "\u0127": 557,
560
+ "n": 558,
561
+ "OCCOCCOCCOCCO": 559,
562
+ "w": 560,
563
+ "CCCCOC": 561,
564
+ "ClCc": 562,
565
+ "41": 563,
566
+ "u": 564,
567
+ "\u0104": 565,
568
+ "\u010e": 566,
569
+ "conc": 567,
570
+ "\u00d3": 568,
571
+ "$": 569,
572
+ "OCCCNc": 570,
573
+ "\u0130": 571,
574
+ "\u00ca": 572,
575
+ "\u012c": 573,
576
+ "cccs": 574,
577
+ "\u0106": 575,
578
+ "CNCCN": 576,
579
+ "\u00ed": 577,
580
+ "nsnc": 578,
581
+ ",": 579,
582
+ "\u00c8": 580,
583
+ "\u00be": 581,
584
+ "d": 582,
585
+ "COCO": 583,
586
+ "SCCNC": 584,
587
+ "ccnnc": 585,
588
+ "(\\[": 586,
589
+ "\u00a1": 587,
590
+ "SCCN": 588,
591
+ "-])\\": 589,
592
+ "NCCCC": 590,
593
+ "NCN": 591,
594
+ "#": 592,
595
+ "-])=[": 593,
596
+ "OCCOCCOCCO": 594,
597
+ "\u00d9": 595,
598
+ "NCCSCc": 596,
599
+ "nncc": 597,
600
+ "\u00ab": 598,
601
+ "sccc": 599,
602
+ "\u00fb": 600,
603
+ ")(/": 601,
604
+ "\u00ff": 602,
605
+ "Cl": 603,
606
+ "SH": 604,
607
+ "CCCCOCc": 605,
608
+ "9": 606,
609
+ "NCCCN": 607,
610
+ "-])/": 608,
611
+ "CSCCS": 609,
612
+ "CCCNCC": 610,
613
+ "nc": 611,
614
+ "NOCCc": 612,
615
+ "p": 613,
616
+ "CCOC": 614,
617
+ "COCCS": 615,
618
+ "NCCCS": 616,
619
+ "Oc": 617,
620
+ "\u010f": 618,
621
+ "cnnc": 619,
622
+ "Y": 620,
623
+ "0": 621,
624
+ "NCCCCl": 622,
625
+ "\u011e": 623,
626
+ "OCCOCC": 624,
627
+ "SCCc": 625,
628
+ "ncoc": 626,
629
+ "OCCCCN": 627,
630
+ "OCO": 628,
631
+ "OCCc": 629,
632
+ "\u0131": 630,
633
+ "z": 631,
634
+ "+]([": 632,
635
+ "OCCCS": 633,
636
+ ")/": 634,
637
+ "ccsc": 635,
638
+ "312": 636,
639
+ "nnnn": 637,
640
+ "ssnc": 638,
641
+ "COCCCn": 639,
642
+ "ocnc": 640,
643
+ "cnco": 641,
644
+ "\u0134": 642,
645
+ "COCc": 643,
646
+ "nccnc": 644,
647
+ "@]": 645,
648
+ "OCCCCn": 646,
649
+ "Nn": 647,
650
+ "nncn": 648,
651
+ "\u0102": 649,
652
+ "CNCCC": 650,
653
+ "NCCNS": 651,
654
+ ":": 652,
655
+ "\u00ea": 653,
656
+ "CCONC": 654,
657
+ "CCCCNC": 655,
658
+ "CCCNc": 656,
659
+ "\u00b8": 657,
660
+ "CCSCc": 658,
661
+ "CNS": 659,
662
+ "\u00b4": 660,
663
+ "NCCCOCC": 661,
664
+ "54": 662,
665
+ "\u00cc": 663,
666
+ "COCCNc": 664,
667
+ "CCOCC": 665,
668
+ "ccnc": 666,
669
+ "\u0109": 667,
670
+ "\u00d6": 668,
671
+ "COCCOCCN": 669,
672
+ "\"": 670,
673
+ "cnoc": 671,
674
+ "cncc": 672,
675
+ "\u00e7": 673,
676
+ "@@](": 674,
677
+ "D": 675,
678
+ "\u00b0": 676,
679
+ "SCCCc": 677,
680
+ "\u00dc": 678,
681
+ "\u00b3": 679,
682
+ "o": 680,
683
+ "CSCCSC": 681,
684
+ "CCCCCCO": 682,
685
+ ".": 683,
686
+ "NCCCNS": 684,
687
+ "j": 685,
688
+ "\u0129": 686,
689
+ "CCOCCCn": 687,
690
+ "SCCO": 688,
691
+ "CCCn": 689,
692
+ "\u0140": 690,
693
+ "%": 691,
694
+ "CCCOCc": 692,
695
+ "m": 693,
696
+ "\u013f": 694,
697
+ "+](/[": 695,
698
+ "](/": 696,
699
+ ")[": 697,
700
+ "\u00bc": 698,
701
+ "\u0132": 699,
702
+ "CCCCSC": 700,
703
+ "NCCCOC": 701,
704
+ "x": 702,
705
+ "SCC": 703,
706
+ "~": 704,
707
+ ")=": 705,
708
+ "cscc": 706,
709
+ "occc": 707,
710
+ "NCCNc": 708,
711
+ "\u012e": 709,
712
+ "NCCCCOc": 710,
713
+ "\u0123": 711,
714
+ "ccon": 712,
715
+ "*": 713,
716
+ "E": 714,
717
+ "SCCOC": 715,
718
+ "+]=[": 716,
719
+ "COCCC": 717,
720
+ "\u011a": 718,
721
+ "Sc": 719,
722
+ "COCCOc": 720,
723
+ "+]\\": 721,
724
+ "CNn": 722,
725
+ "NCCCO": 723,
726
+ "NCCN": 724,
727
+ "CSCc": 725,
728
+ "\u00f1": 726,
729
+ "b": 727,
730
+ "CCOCCOc": 728,
731
+ ";": 729,
732
+ "4": 730,
733
+ "CCCCCc": 731,
734
+ "\u0133": 732,
735
+ "\u00d7": 733,
736
+ "CBr": 734,
737
+ "+](=": 735,
738
+ "OCCCOc": 736,
739
+ "\u0120": 737,
740
+ "CCCNS": 738,
741
+ "OS": 739,
742
+ "CCSCCOc": 740,
743
+ "OCCSC": 741,
744
+ "ccncc": 742,
745
+ "CCSc": 743,
746
+ "\u013c": 744,
747
+ "7": 745,
748
+ "COCCNC": 746,
749
+ "\u00c9": 747,
750
+ "CCCCOCC": 748,
751
+ "NS": 749,
752
+ "CCCCCS": 750,
753
+ "32": 751,
754
+ "U": 752,
755
+ "\u00a8": 753,
756
+ "SCCCO": 754,
757
+ "\u00b2": 755,
758
+ "COCCCCC": 756,
759
+ "CCOCCSc": 757,
760
+ "COCOc": 758,
761
+ "CCCCCOC": 759,
762
+ "s": 760,
763
+ "CSCCn": 761,
764
+ "NCCCNC": 762,
765
+ "OCCn": 763,
766
+ "ncco": 764,
767
+ "ClC": 765,
768
+ "(/": 766,
769
+ "R": 767,
770
+ "ccc": 768,
771
+ "ccco": 769,
772
+ "\u0108": 770,
773
+ "-])": 771,
774
+ "OP": 772,
775
+ "'": 773,
776
+ "12": 774,
777
+ "COS": 775,
778
+ "34": 776,
779
+ "CCCCn": 777,
780
+ "Z": 778,
781
+ "CCSCCOC": 779,
782
+ "\u00e8": 780,
783
+ "ncon": 781,
784
+ "\u00a2": 782,
785
+ "CCOCCOC": 783,
786
+ "ccccn": 784,
787
+ "CSCCC": 785,
788
+ "NCCCSc": 786,
789
+ "sc": 787,
790
+ "CNc": 788,
791
+ "-])([": 789,
792
+ "OCCCSC": 790,
793
+ "45": 791,
794
+ "NOCc": 792,
795
+ "35": 793,
796
+ "\u00de": 794,
797
+ "\u010c": 795,
798
+ "ncsc": 796,
799
+ "3": 797,
800
+ "NCCn": 798,
801
+ "COCCOCC": 799,
802
+ "@](": 800,
803
+ "NCCC": 801,
804
+ "_": 802,
805
+ "\u00ae": 803,
806
+ "ncnn": 804,
807
+ "nonc": 805,
808
+ "CCNS": 806,
809
+ "CCCCl": 807,
810
+ "OCCNC": 808,
811
+ "CCOCc": 809,
812
+ "NCCOc": 810,
813
+ "ccs": 811,
814
+ "43": 812,
815
+ "+][": 813,
816
+ "g": 814,
817
+ "CCCCCCN": 815,
818
+ "\u00a4": 816,
819
+ "CCOCCO": 817,
820
+ "CSCCCNC": 818,
821
+ "ccncn": 819,
822
+ "\u00dd": 820,
823
+ "SCCOc": 821,
824
+ "Brc": 822,
825
+ "\u0136": 823,
826
+ "\u0114": 824,
827
+ "\u0125": 825,
828
+ "SCCCOc": 826,
829
+ "}": 827,
830
+ "CNCc": 828,
831
+ "ClCC": 829,
832
+ "Cc": 830,
833
+ "nsc": 831,
834
+ "\u0143": 832,
835
+ "\u0116": 833,
836
+ "I": 834,
837
+ "CCBr": 835,
838
+ "CCOP": 836,
839
+ "6": 837,
840
+ "OCCOC": 838,
841
+ "scc": 839,
842
+ "CSCCOC": 840,
843
+ "CCOCCCNc": 841,
844
+ "NNc": 842,
845
+ "CCCSc": 843,
846
+ "\u011c": 844,
847
+ "NCCCCN": 845,
848
+ "H": 846,
849
+ "IC": 847,
850
+ "cnsc": 848,
851
+ "\u0107": 849,
852
+ "CCOCCOCC": 850,
853
+ "\u0138": 851,
854
+ "e": 852,
855
+ "\u010b": 853,
856
+ "\u012b": 854,
857
+ "CCCc": 855,
858
+ "nnco": 856,
859
+ "CCCSCC": 857,
860
+ "\u00f2": 858,
861
+ "SCCC": 859,
862
+ "\u013a": 860,
863
+ "cnccn": 861,
864
+ "OCCSCc": 862,
865
+ "\u0121": 863,
866
+ "\u0113": 864,
867
+ "CCOCCCC": 865,
868
+ "\u00da": 866,
869
+ "\u00a6": 867,
870
+ "l": 868,
871
+ "On": 869,
872
+ "CCOc": 870,
873
+ "\u00e0": 871,
874
+ "21": 872,
875
+ "+])(": 873,
876
+ "2": 874,
877
+ "](": 875,
878
+ "CSCCCN": 876,
879
+ "OCn": 877,
880
+ "CNN": 878,
881
+ "\u00fd": 879,
882
+ "NCCSCC": 880,
883
+ "@@]": 881,
884
+ "csnn": 882,
885
+ "CCCS": 883,
886
+ "K": 884,
887
+ "-[": 885,
888
+ "OCc": 886,
889
+ "SCCn": 887,
890
+ "\u00fc": 888,
891
+ "COCCOC": 889,
892
+ "COCCCOC": 890,
893
+ "/[": 891,
894
+ "OCC": 892,
895
+ "+](": 893,
896
+ "cnccc": 894,
897
+ "V": 895,
898
+ "`": 896,
899
+ "NCCOCC": 897,
900
+ ")=[": 898,
901
+ "<": 899,
902
+ "OCCSc": 900,
903
+ "cnn": 901,
904
+ "NOC": 902,
905
+ "\u00d8": 903,
906
+ "ONC": 904,
907
+ "\u0139": 905,
908
+ "Nc": 906,
909
+ "Ic": 907,
910
+ "nnn": 908,
911
+ "\u00cb": 909,
912
+ "NCCSc": 910,
913
+ "OH": 911,
914
+ "CCCCCO": 912,
915
+ "OCCC": 913,
916
+ "]([": 914,
917
+ "COCCNS": 915,
918
+ "\u00db": 916,
919
+ "CCCBr": 917,
920
+ "CSC": 918,
921
+ "CCOCCOCc": 919,
922
+ "NCCCNc": 920,
923
+ "@@](=": 921,
924
+ "occ": 922,
925
+ "@]([": 923,
926
+ "\u011b": 924,
927
+ "]/": 925,
928
+ "OCCBr": 926,
929
+ "\u011d": 927,
930
+ "CCOCCn": 928,
931
+ "BrCC": 929,
932
+ "sn": 930,
933
+ "COCCSc": 931,
934
+ "\u013b": 932,
935
+ ")/[": 933,
936
+ "\u00a9": 934,
937
+ "\u00c2": 935,
938
+ "nscc": 936,
939
+ "CCCCS": 937,
940
+ "NNC": 938,
941
+ "no": 939,
942
+ "ON": 940,
943
+ "ccno": 941,
944
+ "NCCc": 942,
945
+ "ccnn": 943,
946
+ "Br": 944,
947
+ "ncn": 945,
948
+ "nn": 946,
949
+ "noc": 947,
950
+ "S": 948,
951
+ "nccc": 949,
952
+ "scnc": 950,
953
+ "OCCF": 951,
954
+ "cccc": 952,
955
+ "+](/": 953,
956
+ "nnccc": 954,
957
+ "cnnn": 955,
958
+ "i": 956,
959
+ "\u00aa": 957,
960
+ "CCNCc": 958,
961
+ "\u00eb": 959,
962
+ "NCc": 960,
963
+ "B": 961,
964
+ "CCOCCS": 962,
965
+ "(-": 963,
966
+ "\u00d1": 964,
967
+ "\u00e2": 965,
968
+ "\u0119": 966,
969
+ "q": 967,
970
+ "-]": 968,
971
+ "NOCC": 969,
972
+ "\u013d": 970,
973
+ "ncncc": 971,
974
+ "NCCCc": 972,
975
+ "@@": 973,
976
+ ")(": 974,
977
+ "L": 975,
978
+ "SCCSc": 976,
979
+ "NNN": 977,
980
+ "13": 978,
981
+ "OCCNc": 979,
982
+ "COCCNCc": 980,
983
+ "F": 981,
984
+ "-]=[": 982,
985
+ "CON": 983,
986
+ "COCCCOc": 984,
987
+ "-])[": 985,
988
+ "CCCCSc": 986,
989
+ "y": 987,
990
+ "Cn": 988,
991
+ "\u010a": 989,
992
+ "oncc": 990,
993
+ ")-": 991,
994
+ "\u012a": 992,
995
+ "COCCCS": 993,
996
+ "OCCCC": 994,
997
+ "^": 995,
998
+ "X": 996,
999
+ "\u0118": 997,
1000
+ "CCOCCN": 998,
1001
+ "CCCOc": 999,
1002
+ "nnnc": 1000,
1003
+ "SCc": 1001,
1004
+ "k": 1002,
1005
+ "CCCCCCCCCCC": 1003,
1006
+ "COP": 1004,
1007
+ "@": 1005,
1008
+ "(#": 1006,
1009
+ "nnc": 1007,
1010
+ "cnns": 1008,
1011
+ "ccoc": 1009,
1012
+ "\u012d": 1010,
1013
+ "1": 1011,
1014
+ "\\": 1012,
1015
+ "CCCCO": 1013,
1016
+ "CCCCNS": 1014,
1017
+ "NSC": 1015,
1018
+ "\u0126": 1016,
1019
+ "+]/": 1017,
1020
+ "23": 1018,
1021
+ "\u00bb": 1019,
1022
+ "31": 1020,
1023
+ "CCCCOc": 1021,
1024
+ "OCCSCC": 1022,
1025
+ "+])([": 1023,
1026
+ "\u00fa": 1024,
1027
+ "W": 1025,
1028
+ "\u00b7": 1026,
1029
+ "\u0135": 1027,
1030
+ "FC": 1028,
1031
+ "\u0115": 1029,
1032
+ "\u00f9": 1030,
1033
+ "SCCS": 1031,
1034
+ "\u00bf": 1032,
1035
+ "\u011f": 1033,
1036
+ "CS": 1034,
1037
+ "\u00ef": 1035,
1038
+ "+])[": 1036,
1039
+ "\u00e1": 1037,
1040
+ "\u00cf": 1038,
1041
+ "on": 1039,
1042
+ "+]=": 1040,
1043
+ "COc": 1041,
1044
+ "\u00c3": 1042,
1045
+ "CCOCCCN": 1043,
1046
+ "cnsn": 1044,
1047
+ "CCOCCNc": 1045,
1048
+ "\u00b5": 1046,
1049
+ "CSCCCNc": 1047,
1050
+ "CCSS": 1048,
1051
+ "cccn": 1049,
1052
+ "cncnc": 1050,
1053
+ "-": 1051,
1054
+ "OCCOc": 1052,
1055
+ "NH": 1053,
1056
+ "\u00d2": 1054,
1057
+ "Clc": 1055,
1058
+ "G": 1056,
1059
+ "-])=": 1057,
1060
+ "(\\": 1058,
1061
+ "CSCCO": 1059,
1062
+ "\u00c0": 1060,
1063
+ "COCCCNC": 1061,
1064
+ "CCSC": 1062,
1065
+ "=": 1063,
1066
+ "CSCC": 1064,
1067
+ "\u0117": 1065,
1068
+ "]": 1066,
1069
+ "CCOCCNC": 1067,
1070
+ "OCCCSc": 1068,
1071
+ "CCS": 1069,
1072
+ "oc": 1070,
1073
+ "csc": 1071,
1074
+ "OCCCl": 1072,
1075
+ "\u0103": 1073,
1076
+ "CCOCCOCCOCC": 1074,
1077
+ "CCCSCc": 1075,
1078
+ "\u00e9": 1076,
1079
+ "CSCCCC": 1077,
1080
+ "SCCCS": 1078,
1081
+ "CI": 1079,
1082
+ "\u00f6": 1080,
1083
+ "\u00ba": 1081,
1084
+ "42": 1082,
1085
+ "Q": 1083,
1086
+ "c": 1084,
1087
+ "?": 1085,
1088
+ "f": 1086,
1089
+ "M": 1087,
1090
+ "\u0141": 1088,
1091
+ "COCCCCN": 1089,
1092
+ "FCCC": 1090,
1093
+ "OCCOCCS": 1091,
1094
+ "cs": 1092,
1095
+ "\u00f3": 1093,
1096
+ "NCCCCCC": 1094,
1097
+ "(=[": 1095,
1098
+ "\u00c5": 1096,
1099
+ "NN": 1097,
1100
+ "\u00f0": 1098,
1101
+ "a": 1099,
1102
+ "|": 1100,
1103
+ "COCCCNc": 1101,
1104
+ "CCSCCC": 1102,
1105
+ "CCCCCOc": 1103,
1106
+ "v": 1104,
1107
+ "\u0128": 1105,
1108
+ "cn": 1106,
1109
+ "CCCCc": 1107,
1110
+ "CCNc": 1108,
1111
+ "\u00a3": 1109,
1112
+ "CCNC": 1110,
1113
+ "24": 1111,
1114
+ "CCCSC": 1112,
1115
+ "T": 1113,
1116
+ "NCCSC": 1114,
1117
+ "CSCCN": 1115,
1118
+ "cscn": 1116,
1119
+ "COn": 1117,
1120
+ "ccccc": 1118,
1121
+ "ClCCCSc": 1119,
1122
+ "CCSCC": 1120,
1123
+ "ncnc": 1121,
1124
+ "\u00a7": 1122,
1125
+ "h": 1123,
1126
+ "cncs": 1124,
1127
+ "\u00d4": 1125,
1128
+ "sccn": 1126,
1129
+ ")([": 1127,
1130
+ ")": 1128,
1131
+ "nnsc": 1129,
1132
+ "\\[": 1130,
1133
+ "CCOCCCNC": 1131,
1134
+ "\u00d0": 1132,
1135
+ "NCCOC": 1133,
1136
+ "\u00f4": 1134
1137
+ }