import pathlib
import re
import sys
import unicodedata
from dataclasses import dataclass

import fugashi
import jaconv
import mojimoji

from .mapping import *

# small kana (sutegana) that combine with the preceding kana
SUTEGANA = 'ゃゅょぁぃぅぇぉ'
# ASCII punctuation
PUNCT = '\'".!?(),;:-'
# iteration marks (odoriji); the last is ヾ, the voiced katakana mark
ODORI = '々〃ゝゞヽヾ'

# character type values assigned by MeCab, from UniDic's char.def
CHAR_ALPHA = 5
CHAR_HIRAGANA = 6
CHAR_KATAKANA = 7

SYSTEMS = {
    'hepburn': HEPBURN,
    'kunrei': KUNREISHIKI,
    'nihon': NIHONSHIKI,
}


def has_foreign_lemma(word):
    """Check if a word (node) has a foreign lemma.

    In UniDic, these lemmas don't get their own field; instead, the lemma
    field is overloaded. There are also cases where the lemma field is
    overloaded with non-foreign-lemma information.
    """
    if '-' in word.surface:
        # a dash in the surface means the dash in the lemma is not
        # necessarily the foreign-spelling separator
        return False
    if not word.feature.lemma:
        # this happens with some special tokens
        return False
    lemma = word.feature.lemma
    if '-' not in lemma:
        return False

    cand = lemma.split('-', 1)[-1]
    if cand.isascii():
        return True
    return False
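
# For example, under UniDic カツ carries a lemma like カツ-cutlet; the text
# after the dash is ASCII, so it's treated as a foreign spelling.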


def normalize_text(text):
    """Given text, normalize variations in Japanese.

    This specifically removes variations that are meaningless for romaji
    conversion, using the following steps:

    - Unicode NFKC normalization
    - Full-width Latin to half-width
    - Half-width katakana to full-width
    """
    text = unicodedata.normalize('NFKC', text)
    # full-width Latin and digits to half-width
    text = mojimoji.zen_to_han(text, kana=False)
    # half-width katakana to full-width
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)
    return text
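
# Illustrative example: normalize_text('ｶﾂＡＢＣ１２３') returns 'カツABC123';
# the half-width katakana are widened, and the full-width Latin letters and
# digits are narrowed.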


def load_exceptions():
    """Load a dict of exceptions from the included data file."""
    cdir = pathlib.Path(__file__).parent.absolute()
    exceptions = {}
    with open(cdir / 'exceptions.tsv', encoding='utf-8') as exceptions_file:
        for line in exceptions_file:
            line = line.strip()
            # check for empty lines first to avoid indexing into them
            if not line or line[0] == '#':
                continue
            key, val = line.split('\t')
            exceptions[key] = val
    return exceptions
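
# The bundled exceptions.tsv holds one tab-separated surface/romaji pair per
# line; blank lines and lines starting with # are ignored.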


@dataclass
class Token:
    surface: str
    # whether a space should follow this token in output
    space: bool
    # whether the surface comes from a foreign spelling in the lemma
    foreign: bool = False

    def __str__(self):
        sp = ' ' if self.space else ''
        return f'{self.surface}{sp}'
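
# Tokens stringify with their trailing space, so a sentence is just the
# concatenation of its tokens; str(Token('neko', True)) == 'neko '.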


class Cutlet:
    def __init__(
        self,
        system='hepburn',
        use_foreign_spelling=True,
        ensure_ascii=True,
        mecab_args='',
    ):
        """Create a Cutlet object, which holds configuration as well as
        tokenizer state.

        `system` is `hepburn` by default, and may also be `kunrei` or
        `nihon`. `nippon` is permitted as a synonym for `nihon`.

        If `use_foreign_spelling` is true, output will use the foreign
        spelling provided in a UniDic lemma when available. For example,
        "カツ" will become "cutlet" instead of "katsu".

        If `ensure_ascii` is true, any non-ASCII characters that can't be
        romanized will be replaced with `?`. If false, they will be passed
        through.

        Typical usage:

        ```python
        katsu = Cutlet()
        roma = katsu.romaji("カツカレーを食べた")
        # "Cutlet curry wo tabeta"
        ```
        """
        if system == 'nippon':
            system = 'nihon'
        self.system = system
        try:
            # take a copy so the table can be modified without changing the
            # shared system definitions
            self.table = dict(SYSTEMS[system])
        except KeyError:
            print(f'unknown system: {system}', file=sys.stderr)
            raise

        self.tagger = fugashi.Tagger(mecab_args)
        self.exceptions = load_exceptions()

        # flags for the most common per-system differences
        self.use_tch = self.system in ('hepburn',)  # っち -> tchi
        self.use_wa = self.system in ('hepburn', 'kunrei')  # particle は -> wa
        self.use_he = self.system in ('nihon',)  # particle へ -> he
        self.use_wo = self.system in ('hepburn', 'nihon')  # particle を -> wo

        self.use_foreign_spelling = use_foreign_spelling
        self.ensure_ascii = ensure_ascii

    def add_exception(self, key, val):
        """Add an exception to the internal list.

        An exception overrides a whole token, for example to replace
        "Toukyou" with "Tokyo". Note that it must match the tokenizer
        output and be a single token to work. To replace longer phrases,
        you'll need to use a different strategy, like string replacement.
        """
        self.exceptions[key] = val
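
    # Hypothetical usage, assuming 東京 is tokenized as a single token:
    #   katsu.add_exception('東京', 'Tokyo')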

    def update_mapping(self, key, val):
        """Update the mapping table for a single kana.

        This can be used to mix common systems, or to modify particular
        details. For example, you can use `update_mapping("ぢ", "di")` to
        differentiate ぢ and じ in Hepburn.

        Example usage:

        ```python
        cut = Cutlet()
        cut.romaji("お茶漬け")  # Ochazuke
        cut.update_mapping("づ", "du")
        cut.romaji("お茶漬け")  # Ochaduke
        ```
        """
        self.table[key] = val

    def slug(self, text):
        """Generate a URL-friendly slug.

        After converting the input to romaji using `Cutlet.romaji` and
        making the result lower-case, any runs of non-alphanumeric
        characters are replaced with a single hyphen. Any leading or
        trailing hyphens are stripped.
        """
        roma = self.romaji(text).lower()
        slug = re.sub(r'[^a-z0-9]+', '-', roma).strip('-')
        return slug
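
    # For example, following the romaji example in the class docstring,
    # slug('カツカレーを食べた') would give 'cutlet-curry-wo-tabeta'.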

    def romaji_tokens(self, words, capitalize=True, title=False):
        """Build a list of tokens from input nodes.

        If `capitalize` is true, then the first letter of the first token
        will be capitalized. This is typically the desired behavior if the
        input is a complete sentence.

        If `title` is true, then words will be capitalized as in a book
        title. This means most words will be capitalized, but some parts of
        speech (particles, endings) will not.

        If the text was not normalized before being tokenized, the output
        is undefined. For details of normalization, see `normalize_text`.

        The number of output tokens will equal the number of input nodes.
        """
        out = []

        for wi, word in enumerate(words):
            po = out[-1] if out else None
            pw = words[wi - 1] if wi > 0 else None
            nw = words[wi + 1] if wi < len(words) - 1 else None

            # an apostrophe glued to a following ASCII word, as in
            # contractions like "don't", joins its neighbors directly
            if (word.surface == "'" and
                    (nw and nw.char_type == CHAR_ALPHA and not nw.white_space) and
                    not word.white_space):
                if po:
                    po.space = False
                out.append(Token(word.surface, False))
                continue

            roma = self.romaji_word(word)

            # a sokuon left at the end of the previous token is resolved
            # using the first letter of this token
            if roma and po and po.surface and po.surface[-1] == 'っ':
                po.surface = po.surface[:-1] + roma[0]
            # capitalize proper nouns
            if word.feature.pos2 == '固有名詞':
                roma = roma.title()
            # in title mode, capitalize everything except particles,
            # auxiliaries, suffixes, and words following a prefix
            if (title and
                    word.feature.pos1 not in ('助詞', '助動詞', '接尾辞') and
                    not (pw and pw.feature.pos1 == '接頭辞')):
                roma = roma.title()

            foreign = self.use_foreign_spelling and has_foreign_lemma(word)
            tok = Token(roma, False, foreign)

            # opening quotes and brackets get a space before, not after
            if word.surface in '「『':
                if po:
                    po.space = True
                out.append(tok)
                continue
            if roma in ('(', '['):
                if po:
                    po.space = True
                out.append(tok)
                continue
            # no space on either side of a slash
            if roma == '/':
                out.append(tok)
                continue

            # for runs of ASCII, preserve the input text and spacing
            if (word.surface.isascii() and
                    nw and nw.surface.isascii()):
                use_space = bool(nw.white_space)
                out.append(Token(word.surface, use_space))
                continue

            out.append(tok)

            # the checks below decide whether a space follows this token;
            # `continue` means no space

            # no space after a prefix
            if word.feature.pos1 == '接頭辞':
                continue
            # no space before supplementary symbols or suffixes
            if nw and nw.feature.pos1 in ('補助記号', '接尾辞'):
                continue
            # no space before a comma
            if nw and nw.surface == ',':
                continue
            # a foreign spelling ending in a dash joins the next token
            if foreign and roma[-1] == '-':
                continue
            # no space before a conjunctive particle
            if nw and nw.feature.pos2 in ('接続助詞',):
                continue
            # no space within runs of digits
            if (word.surface.isdigit() and
                    nw and nw.surface.isdigit()):
                continue
            # no space between a verb or adjective and a following
            # auxiliary, except です
            if (nw and word.feature.pos1 in ('動詞', '助動詞', '形容詞')
                    and nw.feature.pos1 == '助動詞'
                    and nw.surface != 'です'):
                continue

            tok.space = True

        # remove any sokuon that never found a following consonant
        for tok in out:
            tok.surface = tok.surface.replace('っ', '')

        if capitalize and out and out[0].surface:
            ss = out[0].surface
            out[0].surface = ss[0].capitalize() + ss[1:]
        return out

    def romaji(self, text, capitalize=True, title=False):
        """Build a complete string from input text.

        If `capitalize` is true, then the first letter of the text will be
        capitalized. This is typically the desired behavior if the input
        is a complete sentence.

        If `title` is true, then words will be capitalized as in a book
        title. This means most words will be capitalized, but some parts
        of speech (particles, endings) will not.
        """
        if not text:
            return ''

        text = normalize_text(text)
        words = self.tagger(text)

        tokens = self.romaji_tokens(words, capitalize, title)
        out = ''.join([str(tok) for tok in tokens]).strip()
        return out
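
    # For example, following the class docstring:
    #   katsu = Cutlet()
    #   katsu.romaji('カツカレーを食べた')  # 'Cutlet curry wo tabeta'
    #   katsu.romaji('カツカレーを食べた', capitalize=False)  # 'cutlet curry wo tabeta'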

    def romaji_word(self, word):
        """Return the romaji for a single word (node)."""
        if word.surface in self.exceptions:
            return self.exceptions[word.surface]

        if word.surface.isdigit():
            return word.surface

        if word.surface.isascii():
            return word.surface

        # handle unknown words first
        if word.is_unk:
            # check the character type using the values from char.def
            if word.char_type in (CHAR_HIRAGANA, CHAR_KATAKANA):
                kana = jaconv.kata2hira(word.surface)
                return self.map_kana(kana)

            # this is an unknown word that isn't kana, such as a rare
            # kanji or an unusual symbol
            if self.ensure_ascii:
                out = '?' * len(word.surface)
                return out
            else:
                return word.surface

        if word.feature.pos1 == '補助記号':
            # supplementary symbols; unmapped ones are dropped
            return self.table.get(word.surface, '')
        elif (self.use_wa and
                word.feature.pos1 == '助詞' and word.feature.pron == 'ワ'):
            return 'wa'
        elif (not self.use_he and
                word.feature.pos1 == '助詞' and word.feature.pron == 'エ'):
            return 'e'
        elif (not self.use_wo and
                word.feature.pos1 == '助詞' and word.feature.pron == 'オ'):
            return 'o'
        elif (self.use_foreign_spelling and
                has_foreign_lemma(word)):
            # use the foreign spelling from the lemma
            return word.feature.lemma.split('-', 1)[-1]
        elif word.feature.kana:
            # for known words, use the dictionary reading
            kana = jaconv.kata2hira(word.feature.kana)
            return self.map_kana(kana)
        else:
            # fall back to the surface form
            return word.surface

    def map_kana(self, kana):
        """Given a string of kana, convert it to romaji.

        The romaji for a given kana can depend on the preceding or
        following kana, so this handles that context-dependent conversion.
        """
        out = ''
        for ki, char in enumerate(kana):
            nk = kana[ki + 1] if ki < len(kana) - 1 else None
            pk = kana[ki - 1] if ki > 0 else None
            out += self.get_single_mapping(pk, char, nk)
        return out
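
    # For example, map_kana('きゃっと') would give 'kyatto' with the default
    # tables: きゃ maps as a digraph, っ doubles the following consonant, and
    # と maps directly.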

    def get_single_mapping(self, pk, kk, nk):
        """Given a single kana and its neighbors, return the mapped romaji."""
        # handle iteration marks (odoriji) by repeating the previous kana,
        # with or without voicing
        if kk in ODORI:
            if kk in 'ゝヽ':
                if pk:
                    return pk
                else:
                    # an iteration mark with nothing to repeat; drop it
                    return ''
            if kk in 'ゞヾ':
                if not pk:
                    return ''
                vv = add_dakuten(pk)
                if vv:
                    return self.table[vv]
                else:
                    return ''
            # remaining marks (々〃) repeat kanji or symbols, which can't
            # be romanized from the kana alone
            return ''

        # digraphs like きゃ are in the mapping table; the first kana
        # outputs nothing, and the pair is output at the second kana
        if pk and (pk + kk) in self.table:
            return self.table[pk + kk]
        if nk and (kk + nk) in self.table:
            return ''

        # handle sutegana combinations not listed in the table
        if nk and nk in SUTEGANA:
            if kk == 'っ':
                # っゃ and similar sequences are not valid kana; drop them
                return ''
            return self.table[kk][:-1] + self.table[nk]
        if kk in SUTEGANA:
            return ''

        # the long vowel mark repeats the last vowel of the previous kana
        if kk == 'ー':
            if pk and pk in self.table:
                return self.table[pk][-1]
            else:
                return '-'

        # sokuon doubles the following consonant
        if kk == 'っ':
            if nk:
                if self.use_tch and nk == 'ち':
                    return 't'
                elif nk in 'あいうえおっ':
                    return '-'
                else:
                    return self.table[nk][0]
            else:
                # unusual, but words like 言った tokenize as 言っ|た, so
                # pass the sokuon through for the next token to resolve
                return 'っ'

        # ん takes an apostrophe before vowels and y to avoid ambiguity
        if kk == 'ん':
            if nk and nk in 'あいうえおやゆよ':
                return "n'"
            else:
                return 'n'

        return self.table[kk]
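
    # Illustrative results, assuming the default mapping tables:
    #   get_single_mapping(None, 'ん', 'や')  # "n'" (apostrophe before y)
    #   get_single_mapping(None, 'っ', 'か')  # 'k' (doubles the consonant)
    #   get_single_mapping('か', 'ー', None)  # 'a' (repeats the last vowel)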