Spaces:

polyglot-tagger
/

language-extractor-demo

Running

App Files Files Community

DerivedFunction1 committed on 3 days ago

Commit

e6c77d8

1 Parent(s): 7117091

add

Browse files

Files changed (3) hide show

fleurs_cache.py +7 -2
language.py +74 -0
sib200_cache.py +3 -1

fleurs_cache.py CHANGED Viewed

@@ -9,7 +9,7 @@ from typing import Any
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download
-from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
 from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
@@ -33,7 +33,10 @@ FLEURS_LEAN_COLUMNS = ["id", "text", "source_lang", "model_lang", "split"]
 def _normalize_model_lang(source_lang: str) -> str:
     """Map a FLEURS locale like `am_et` to the model language code."""
     base_lang = source_lang.split("_", 1)[0].strip().lower()
-    return canonical_lang(base_lang)
 def _discover_tsv_files() -> list[str]:
@@ -137,6 +140,8 @@ def _frame_from_tsv(tsv_path: Path, source_lang: str) -> pd.DataFrame:
     frame["source"] = "fleurs"
     frame["source_lang"] = source_lang
     frame["model_lang"] = _normalize_model_lang(source_lang)
     frame["split"] = _normalize_split_name(tsv_path.name)
     frame["lang_iso3"] = frame["model_lang"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, ""))
     frame["language_name"] = source_lang

 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download
+from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang, is_latin_script_compatible
 from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
 def _normalize_model_lang(source_lang: str) -> str:
     """Map a FLEURS locale like `am_et` to the model language code."""
     base_lang = source_lang.split("_", 1)[0].strip().lower()
+    normalized = canonical_lang(base_lang)
+    if not is_latin_script_compatible(normalized, source_lang):
+        return ""
+    return normalized
 def _discover_tsv_files() -> list[str]:
     frame["source"] = "fleurs"
     frame["source_lang"] = source_lang
     frame["model_lang"] = _normalize_model_lang(source_lang)
+    if not frame["model_lang"].astype(str).str.strip().any():
+        return pd.DataFrame()
     frame["split"] = _normalize_split_name(tsv_path.name)
     frame["lang_iso3"] = frame["model_lang"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, ""))
     frame["language_name"] = source_lang

language.py CHANGED Viewed

@@ -28,6 +28,61 @@ def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tupl
 LANGUAGE_ALIASES = _load_language_aliases()
 ALL_LANGS = list(LANGUAGE_ALIASES.keys())
 LANG_ISO2_TO_ISO3 = {
     lang: (
         getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
@@ -44,3 +99,22 @@ LANG_ALIASES = {
 def canonical_lang(lang: str) -> str:
     return LANG_ALIASES.get(lang, lang)

 LANGUAGE_ALIASES = _load_language_aliases()
 ALL_LANGS = list(LANGUAGE_ALIASES.keys())
+LATIN_ONLY_LANGS = {
+    lang
+    for lang in ALL_LANGS
+    if lang
+    not in {
+        "ar",
+        "fa",
+        "ps",
+        "sd",
+        "ug",
+        "ur",
+        "ckb",
+        "he",
+        "yi",
+        "ru",
+        "uk",
+        "be",
+        "bg",
+        "mk",
+        "kk",
+        "mn",
+        "tt",
+        "ky",
+        "tg",
+        "ba",
+        "ce",
+        "el",
+        "hy",
+        "ka",
+        "am",
+        "ti",
+        "dv",
+        "km",
+        "lo",
+        "my",
+        "th",
+        "si",
+        "bo",
+        "hi",
+        "mr",
+        "ne",
+        "bn",
+        "as",
+        "ta",
+        "te",
+        "gu",
+        "kn",
+        "ml",
+        "pa",
+        "or",
+        "ja",
+        "zh",
+        "ko",
+    }
+}
 LANG_ISO2_TO_ISO3 = {
     lang: (
         getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
 def canonical_lang(lang: str) -> str:
     return LANG_ALIASES.get(lang, lang)
+def label_script_suffix(label: str) -> str | None:
+    label = (label or "").strip()
+    if "_" not in label:
+        return None
+    suffix = label.rsplit("_", 1)[1].strip()
+    return suffix or None
+def is_latin_script_label(label: str) -> bool:
+    return label_script_suffix(label) == "Latn"
+def is_latin_script_compatible(lang: str, label: str) -> bool:
+    """Return False when a `_Latn` label is used for a non-Latin language."""
+    if not is_latin_script_label(label):
+        return True
+    return canonical_lang(lang) in LATIN_ONLY_LANGS

sib200_cache.py CHANGED Viewed

@@ -11,7 +11,7 @@ from datasets import get_dataset_config_names, load_dataset
 import pycountry
 from tqdm.auto import tqdm
-from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
 from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
@@ -38,6 +38,8 @@ def _normalize_source_lang(config_name: str) -> str:
             if alpha_2:
                 return canonical_lang(alpha_2.lower())
     language = canonical_lang(base)
     return language if language in ALL_LANGS else base

 import pycountry
 from tqdm.auto import tqdm
+from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang, is_latin_script_compatible
 from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
             if alpha_2:
                 return canonical_lang(alpha_2.lower())
     language = canonical_lang(base)
+    if not is_latin_script_compatible(language, config_name):
+        return ""
     return language if language in ALL_LANGS else base