Taizo Kaneko committed
Commit · 76b4794
Parent(s): 0bb74b8

commit files to HF hub
Browse files
- .gitattributes +1 -0
- config.json +14 -0
- fasttext_jp_embedding.py +30 -0
- fasttext_jp_tokenizer.py +90 -0
- mecab_tokenizer.py +87 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +9 -0
- vocab.txt +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vocab.txt filter=lfs diff=lfs merge=lfs -text
config.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "architectures": [
+    "FastTextJpModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "fasttext_jp_embedding.FastTextJpConfig",
+    "AutoModel": "fasttext_jp_embedding.FastTextJpModel"
+  },
+  "hidden_size": 300,
+  "model_type": "fast_text_jp",
+  "torch_dtype": "float32",
+  "transformers_version": "4.23.1",
+  "vocab_size": 10000
+}
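The `auto_map` block tells `transformers` to resolve `AutoConfig` and `AutoModel` to the custom classes defined in `fasttext_jp_embedding.py` inside this repository, which requires `trust_remote_code=True` at load time. A minimal loading sketch (the repo id is a placeholder for wherever this commit is hosted, not part of the commit itself):

```python
from transformers import AutoConfig, AutoModel

repo_id = "<user>/<repo>"  # placeholder repo id

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

print(type(model).__name__)                   # FastTextJpModel
print(config.hidden_size, config.vocab_size)  # 300 10000
```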
fasttext_jp_embedding.py
ADDED
@@ -0,0 +1,30 @@
+from __future__ import annotations
+from transformers import PretrainedConfig
+from transformers import PreTrainedModel
+from torch import nn
+import torch
+
+
+class FastTextJpConfig(PretrainedConfig):
+    model_type = "fast_text_jp"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+class FastTextJpModel(PreTrainedModel):
+    """Performs FastText embedding.
+    """
+    config_class = FastTextJpConfig
+
+    def __init__(self, config: FastTextJpConfig):
+        super().__init__(config)
+        self.word_embeddings = nn.Embedding(config.vocab_size,
+                                            config.hidden_size)
+
+    def forward(self, input_ids, **kwargs):
+        return self.word_embeddings(torch.tensor([0]))  # NOTE: looks up index 0, ignoring input_ids
+
+
+FastTextJpConfig.register_for_auto_class()
+FastTextJpModel.register_for_auto_class("AutoModel")
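A quick local sketch of what this module produces, assuming `fasttext_jp_embedding.py` is on the Python path. As committed, `forward` returns the embedding of index 0 regardless of `input_ids`, so the output is a single 300-dimensional vector:

```python
import torch
from fasttext_jp_embedding import FastTextJpConfig, FastTextJpModel

config = FastTextJpConfig(vocab_size=10000, hidden_size=300)
model = FastTextJpModel(config)

out = model(input_ids=torch.tensor([[1, 2, 3]]))
print(out.shape)  # torch.Size([1, 300]) -- embedding of index 0, not of input_ids
```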
fasttext_jp_tokenizer.py
ADDED
@@ -0,0 +1,90 @@
+from __future__ import annotations
+from .mecab_tokenizer import MeCabTokenizer
+import os
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def save_stoi(stoi: dict[str, int], vocab_file: str):
+    with open(vocab_file, "w", encoding="utf-8") as writer:
+        index = 0
+        for token, token_index in sorted(stoi.items(), key=lambda kv: kv[1]):
+            if index != token_index:
+                raise ValueError(
+                    f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                    " Please check that the vocabulary is not corrupted!")
+            writer.write(token + "\n")
+            index += 1
+
+
+def load_stoi(vocab_file: str) -> dict[str, int]:
+    stoi: dict[str, int] = {}
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        stoi[token] = index
+    return stoi
+
+
+class FastTextJpTokenizer(MeCabTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(self,
+                 vocab_file: str,
+                 hinshi: list[str] | None = None,
+                 mecab_dicdir: str | None = None,
+                 **kwargs):
+        """Initialization.
+
+        Args:
+            vocab_file (str): path to the vocab file
+            hinshi (list[str] | None, optional): parts of speech to extract
+            mecab_dicdir (str | None, optional): directory containing dicrc
+        """
+        super().__init__(hinshi, mecab_dicdir, **kwargs)
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a pretrained"
+                " model use `tokenizer = FastTextJpTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.stoi = load_stoi(vocab_file)
+        self.itos = dict([(ids, tok) for tok, ids in self.stoi.items()])
+        self.v_size = len(self.stoi)
+
+        # self._auto_map = {
+        #     "AutoTokenizer": ["modeling.FastTextMeCabTokenizer", None]
+        # }
+        # self.init_inputs = ["vocab.txt"]
+
+    @property
+    def vocab_size(self) -> int:
+        """
+        `int`: Size of the base vocabulary (without the added tokens).
+        """
+        return self.v_size
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.stoi[token]
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.itos[index]
+
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: str | None = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") +
+                "vocab.txt")
+        else:
+            vocab_file = (filename_prefix +
+                          "-" if filename_prefix else "") + save_directory
+        save_stoi(self.stoi, vocab_file)
+        return (vocab_file, )
+
+
+FastTextJpTokenizer.register_for_auto_class("AutoTokenizer")
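`vocab.txt` is a plain list of tokens, one per line, where the line number is the token id; `save_stoi` enforces that the ids are consecutive and `load_stoi` rebuilds the mapping by enumerating lines. A self-contained sketch of that format (file name and tokens are illustrative only):

```python
# Write a tiny vocab file in the same one-token-per-line format save_stoi produces.
tokens = ["<unk>", "日本", "語"]
with open("vocab_sample.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(tokens) + "\n")

# Rebuild the token -> id mapping the same way load_stoi does.
with open("vocab_sample.txt", encoding="utf-8") as f:
    stoi = {line.rstrip("\n"): i for i, line in enumerate(f)}

print(stoi)  # {'<unk>': 0, '日本': 1, '語': 2}
```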
mecab_tokenizer.py
ADDED
@@ -0,0 +1,87 @@
+from __future__ import annotations
+from typing import NamedTuple
+import MeCab
+from transformers import PreTrainedTokenizer
+
+
+class MeCabResult(NamedTuple):
+    hyosokei: str  # surface form
+    hinshi: str  # part of speech
+    hinshi_saibunrui_1: str  # part-of-speech subcategory 1
+    hinshi_saibunrui_2: str  # part-of-speech subcategory 2
+    hinshi_saibunrui_3: str  # part-of-speech subcategory 3
+    katsuyokei_1: str  # conjugation type
+    katsuyokei_2: str  # conjugation form
+    genkei: str  # base form
+    yomi: str  # reading
+    hatsuon: str  # pronunciation
+
+
+class MeCabTokenizer(PreTrainedTokenizer):
+
+    def __init__(self,
+                 hinshi: list[str] | None = None,
+                 mecab_dicdir: str | None = None,
+                 **kwargs):
+        """Initialization.
+
+        Args:
+            hinshi (list[str] | None): parts of speech to extract
+            mecab_dicdir (str | None, optional): directory containing dicrc
+        """
+
+        self.target_hinshi = hinshi
+        if mecab_dicdir is not None:
+            self.mecab = MeCab.Tagger(f"-d {mecab_dicdir}")
+        else:
+            self.mecab = MeCab.Tagger()
+
+        super().__init__(**kwargs)
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Returns the words of the specified parts of speech from a sentence.
+
+        Args:
+            text (str): sentence
+
+        Returns:
+            list[str]: words of the specified parts of speech
+        """
+
+        out = []
+        # Analyze with MeCab.
+        result_words = self.mecab_analyze(text)
+        for result_word in result_words:
+            # The first and last nodes (BOS/EOS) have an empty surface form.
+            if result_word.hyosokei == "":
+                continue
+            if (self.target_hinshi is not None
+                    and result_word.hinshi not in self.target_hinshi):
+                # Keep only the specified parts of speech.
+                continue
+            out.append(result_word.hyosokei)
+        return out
+
+    def mecab_analyze(self, text: str) -> list[MeCabResult]:
+        """Analyzes a sentence with MeCab.
+
+        Args:
+            text (str): sentence
+
+        Returns:
+            list[MeCabResult]: MeCab analysis results
+        """
+        node = self.mecab.parseToNode(text)
+        # Process the morphemes one by one.
+        out = []
+        while node:
+            args = []
+            args.append(node.surface)
+            feature = node.feature.split(",")
+            args.extend(feature)
+            mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
+                                       args[4], args[5], args[6], args[7],
+                                       args[8], args[9])
+            out.append(mecab_result)
+            node = node.next  # the loop ends after the trailing EOS node
+        return out
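A hedged usage sketch for the MeCab-based tokenizer, assuming `mecab-python3` and a system dictionary such as IPADIC are installed; the exact segmentation depends on the dictionary. Calling `_tokenize` directly just exercises the part-of-speech filter:

```python
from mecab_tokenizer import MeCabTokenizer  # assumes the file is importable as a top-level module

tokenizer = MeCabTokenizer(hinshi=["名詞"])  # keep nouns only
print(tokenizer._tokenize("吾輩は猫である。"))  # e.g. ['吾輩', '猫'] with IPADIC
```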
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16c44d91478fe733c856779a82ff9a9da10fd8da41f594b4088b0c3d3a783003
+size 12000829
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{}
tokenizer_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "auto_map": {
+    "AutoTokenizer": [
+      "fasttext_jp_tokenizer.FastTextJpTokenizer",
+      null
+    ]
+  },
+  "tokenizer_class": "FastTextJpTokenizer"
+}
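With this `auto_map`, `AutoTokenizer` resolves to the custom `FastTextJpTokenizer` shipped in the repository (the `null` slot means there is no fast-tokenizer variant). A minimal sketch, again with a placeholder repo id and `trust_remote_code=True`:

```python
from transformers import AutoTokenizer

repo_id = "<user>/<repo>"  # placeholder repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

print(type(tokenizer).__name__)                 # FastTextJpTokenizer
print(tokenizer.tokenize("今日はいい天気です。"))  # MeCab word segmentation
```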
vocab.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a1770ed0a47f44e882afc3f56271a16bc8dba675f18dd61e2cffac276b49acc
+size 29910902