Update tessar_tokenizer.py
tessar_tokenizer.py  +22 -1
CHANGED
@@ -8,6 +8,9 @@ from transformers import PreTrainedTokenizerFast
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
     Tessar Tokenizer implementation for Hugging Face Transformers
+
+    This custom tokenizer extends the PreTrainedTokenizerFast with specialized
+    configuration and tokenization methods for the Tessar model.
     """
 
     model_input_names = ['input_ids', 'attention_mask']
@@ -74,6 +77,9 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         Returns:
             tuple: Paths to the saved files
         """
+        # Ensure the save directory exists
+        os.makedirs(save_directory, exist_ok=True)
+
         # Prepare file paths
         vocab_file = os.path.join(
             save_directory,
@@ -161,4 +167,19 @@ def load_tessar_tokenizer(pretrained_model_name_or_path: str):
     Returns:
         TessarTokenizer: Initialized tokenizer
     """
-    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+
+
+# Optionally, add some example usage
+if __name__ == "__main__":
+    # Example of loading a pretrained tokenizer
+    try:
+        tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
+        print("Tokenizer loaded successfully!")
+
+        # Basic tokenization example
+        text = "Hello, how are you doing today?"
+        encoded = tokenizer(text, return_tensors="pt")
+        print("Encoded Input:", encoded)
+    except Exception as e:
+        print(f"Error loading tokenizer: {e}")
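For context, a minimal sketch of the call path the second hunk hardens. It assumes the patched method is the tokenizer's standard save_vocabulary() hook (the method name is not visible in this diff) and uses an illustrative local export path; neither detail comes from the commit itself.

import os
from tessar_tokenizer import TessarTokenizer

# Load the published checkpoint, then save the vocabulary into a directory
# that does not exist yet. With the os.makedirs(save_directory, exist_ok=True)
# call added above, the directory is created on demand instead of the file
# write failing; the return value is the tuple of saved file paths.
tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")
export_dir = "./tessar-vocab-export"  # illustrative path, not from the commit
saved_files = tokenizer.save_vocabulary(export_dir)
print("Saved files:", saved_files, "| directory exists:", os.path.isdir(export_dir))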