Update tokenization_chatglm.py

Support setting self.padding_side to "right"; previously _pad() only accepted left padding.
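As an illustration of the change (editor's sketch, not part of the commit): with this patch the tokenizer no longer rejects right padding. The checkpoint id and the AutoTokenizer / trust_remote_code loading path below are assumptions about how this tokenizer is typically used, not something stated in the commit.

from transformers import AutoTokenizer

# Hypothetical checkpoint id; any ChatGLM checkpoint shipping this tokenization_chatglm.py behaves the same.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

# Before this commit, _pad() contained `assert self.padding_side == "left"`,
# so the next line would have caused batch padding to fail.
tokenizer.padding_side = "right"

batch = tokenizer(["Hello", "A somewhat longer example sentence"], padding=True)
# The shorter sample is now padded at the end (trailing pad_token_id) rather than at the front.
print(batch["input_ids"])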
tokenization_chatglm.py  CHANGED  (+31 -17)
@@ -380,8 +380,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         # Load from model defaults
         bos_token_id = self.sp_tokenizer[self.bos_token]
         mask_token_id = self.sp_tokenizer[self.mask_token]
-        gmask_token_id = self.sp_tokenizer[self.gmask_token]
-        assert self.padding_side == "left"
+        gmask_token_id = self.sp_tokenizer[self.gmask_token]
 
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
@@ -424,20 +423,35 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            [15 lines removed: the previous left-padding-only implementation, whose contents are not shown in the rendered diff]
+            if self.padding_side == "left":
+                if "attention_mask" in encoded_inputs:
+                    encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                              pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                                                              mode='constant', constant_values=True)
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                if "position_ids" in encoded_inputs:
+                    encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                            pad_width=[(0, 0), (difference, 0)])
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            elif self.padding_side == "right":
+                if "attention_mask" in encoded_inputs:
+                    encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                              pad_width=[(0, 0), (0, difference), (0, difference)],
+                                                              mode='constant', constant_values=True)
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                if "position_ids" in encoded_inputs:
+                    encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                            pad_width=[(0, 0), (0, difference)])
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            else:
+                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
 
         return encoded_inputs
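A toy numpy example (editor's addition, not from the commit) of what the two np.pad calls in the new "right" branch do. The array shapes, (1, seq_len, seq_len) for the per-sample attention_mask and (2, seq_len) for the 2D position_ids, are inferred from the pad_width arguments in the diff.

import numpy as np

difference = 2                                    # number of positions to pad
attention_mask = np.zeros((1, 3, 3), dtype=bool)  # toy (1, seq_len, seq_len) mask
position_ids = np.arange(6).reshape(2, 3)         # toy (2, seq_len) position ids

# Right padding grows the two trailing mask axes at the end, filling the new
# entries with True, exactly as constant_values=True does in the diff.
padded_mask = np.pad(attention_mask,
                     pad_width=[(0, 0), (0, difference), (0, difference)],
                     mode='constant', constant_values=True)

# position_ids are extended with zeros on the right along the sequence axis.
padded_position_ids = np.pad(position_ids, pad_width=[(0, 0), (0, difference)])

print(padded_mask.shape)          # (1, 5, 5)
print(padded_position_ids.shape)  # (2, 5)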