SVECTOR-OFFICIAL committed
Commit f23df61 · verified · 1 parent: c7a90a1

Update tessar_tokenizer.py

Files changed (1)
  1. tessar_tokenizer.py  +22 -1
tessar_tokenizer.py CHANGED
@@ -8,6 +8,9 @@ from transformers import PreTrainedTokenizerFast
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
     Tessar Tokenizer implementation for Hugging Face Transformers
+
+    This custom tokenizer extends the PreTrainedTokenizerFast with specialized
+    configuration and tokenization methods for the Tessar model.
     """
 
     model_input_names = ['input_ids', 'attention_mask']
@@ -74,6 +77,9 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         Returns:
             tuple: Paths to the saved files
         """
+        # Ensure the save directory exists
+        os.makedirs(save_directory, exist_ok=True)
+
         # Prepare file paths
         vocab_file = os.path.join(
             save_directory,
@@ -161,4 +167,19 @@ def load_tessar_tokenizer(pretrained_model_name_or_path: str):
     Returns:
         TessarTokenizer: Initialized tokenizer
     """
-    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+
+
+# Optionally, add some example usage
+if __name__ == "__main__":
+    # Example of loading a pretrained tokenizer
+    try:
+        tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
+        print("Tokenizer loaded successfully!")
+
+        # Basic tokenization example
+        text = "Hello, how are you doing today?"
+        encoded = tokenizer(text, return_tensors="pt")
+        print("Encoded Input:", encoded)
+    except Exception as e:
+        print(f"Error loading tokenizer: {e}")