DerivedFunction1 committed on
Commit
e6c77d8
·
1 Parent(s): 7117091
Files changed (3) hide show
  1. fleurs_cache.py +7 -2
  2. language.py +74 -0
  3. sib200_cache.py +3 -1
fleurs_cache.py CHANGED
@@ -9,7 +9,7 @@ from typing import Any
9
  import pandas as pd
10
  from huggingface_hub import HfApi, hf_hub_download
11
 
12
- from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
13
  from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
14
 
15
 
@@ -33,7 +33,10 @@ FLEURS_LEAN_COLUMNS = ["id", "text", "source_lang", "model_lang", "split"]
33
  def _normalize_model_lang(source_lang: str) -> str:
34
  """Map a FLEURS locale like `am_et` to the model language code."""
35
  base_lang = source_lang.split("_", 1)[0].strip().lower()
36
- return canonical_lang(base_lang)
 
 
 
37
 
38
 
39
  def _discover_tsv_files() -> list[str]:
@@ -137,6 +140,8 @@ def _frame_from_tsv(tsv_path: Path, source_lang: str) -> pd.DataFrame:
137
  frame["source"] = "fleurs"
138
  frame["source_lang"] = source_lang
139
  frame["model_lang"] = _normalize_model_lang(source_lang)
 
 
140
  frame["split"] = _normalize_split_name(tsv_path.name)
141
  frame["lang_iso3"] = frame["model_lang"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, ""))
142
  frame["language_name"] = source_lang
 
9
  import pandas as pd
10
  from huggingface_hub import HfApi, hf_hub_download
11
 
12
+ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang, is_latin_script_compatible
13
  from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
14
 
15
 
 
33
def _normalize_model_lang(source_lang: str) -> str:
    """Map a FLEURS locale like `am_et` to the model language code."""
    base = source_lang.split("_", 1)[0].strip().lower()
    lang = canonical_lang(base)
    # A `_Latn` label on a language whose script is not Latin is rejected;
    # the empty string is the sentinel callers use to skip the locale.
    if not is_latin_script_compatible(lang, source_lang):
        return ""
    return lang
40
 
41
 
42
  def _discover_tsv_files() -> list[str]:
 
140
  frame["source"] = "fleurs"
141
  frame["source_lang"] = source_lang
142
  frame["model_lang"] = _normalize_model_lang(source_lang)
143
+ if not frame["model_lang"].astype(str).str.strip().any():
144
+ return pd.DataFrame()
145
  frame["split"] = _normalize_split_name(tsv_path.name)
146
  frame["lang_iso3"] = frame["model_lang"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, ""))
147
  frame["language_name"] = source_lang
language.py CHANGED
@@ -28,6 +28,61 @@ def _load_language_aliases(path: Path = LANGUAGE_ALIASES_JSON) -> dict[str, tupl
28
 
29
  LANGUAGE_ALIASES = _load_language_aliases()
30
  ALL_LANGS = list(LANGUAGE_ALIASES.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  LANG_ISO2_TO_ISO3 = {
32
  lang: (
33
  getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
@@ -44,3 +99,22 @@ LANG_ALIASES = {
44
 
45
  def canonical_lang(lang: str) -> str:
46
  return LANG_ALIASES.get(lang, lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  LANGUAGE_ALIASES = _load_language_aliases()
30
  ALL_LANGS = list(LANGUAGE_ALIASES.keys())
31
# Codes excluded from LATIN_ONLY_LANGS: languages whose primary writing
# system is not Latin-based (Arabic, Hebrew, Cyrillic, Greek, Caucasian,
# Ethiopic, Indic, Southeast-Asian and CJK scripts, per the original list).
_NON_LATIN_LANGS = frozenset({
    "ar", "fa", "ps", "sd", "ug", "ur", "ckb", "he", "yi",
    "ru", "uk", "be", "bg", "mk", "kk", "mn", "tt", "ky", "tg", "ba", "ce",
    "el", "hy", "ka", "am", "ti", "dv",
    "km", "lo", "my", "th", "si", "bo",
    "hi", "mr", "ne", "bn", "as", "ta", "te", "gu", "kn", "ml", "pa", "or",
    "ja", "zh", "ko",
})

# Every supported language that may legitimately carry a `_Latn` suffix.
LATIN_ONLY_LANGS = {lang for lang in ALL_LANGS if lang not in _NON_LATIN_LANGS}
86
  LANG_ISO2_TO_ISO3 = {
87
  lang: (
88
  getattr(pycountry.languages.get(alpha_2=lang) or pycountry.languages.get(alpha_3=lang), "alpha_3", None)
 
99
 
100
def canonical_lang(lang: str) -> str:
    """Resolve a code through LANG_ALIASES; unknown codes pass through unchanged."""
    try:
        return LANG_ALIASES[lang]
    except KeyError:
        return lang
102
+
103
+
104
+ def label_script_suffix(label: str) -> str | None:
105
+ label = (label or "").strip()
106
+ if "_" not in label:
107
+ return None
108
+ suffix = label.rsplit("_", 1)[1].strip()
109
+ return suffix or None
110
+
111
+
112
def is_latin_script_label(label: str) -> bool:
    """True when `label` carries an explicit `_Latn` script suffix."""
    suffix = label_script_suffix(label)
    return suffix == "Latn"
114
+
115
+
116
def is_latin_script_compatible(lang: str, label: str) -> bool:
    """Return False when a `_Latn` label is used for a non-Latin language."""
    if is_latin_script_label(label):
        # A `_Latn` label is only valid for languages written in Latin script.
        return canonical_lang(lang) in LATIN_ONLY_LANGS
    # Labels without a `_Latn` suffix never conflict.
    return True
sib200_cache.py CHANGED
@@ -11,7 +11,7 @@ from datasets import get_dataset_config_names, load_dataset
11
  import pycountry
12
  from tqdm.auto import tqdm
13
 
14
- from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang
15
  from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
16
 
17
 
@@ -38,6 +38,8 @@ def _normalize_source_lang(config_name: str) -> str:
38
  if alpha_2:
39
  return canonical_lang(alpha_2.lower())
40
  language = canonical_lang(base)
 
 
41
  return language if language in ALL_LANGS else base
42
 
43
 
 
11
  import pycountry
12
  from tqdm.auto import tqdm
13
 
14
+ from language import ALL_LANGS, LANG_ISO2_TO_ISO3, canonical_lang, is_latin_script_compatible
15
  from sentence_sampling import sample_multi_group_bundle, sample_single_group_bundle
16
 
17
 
 
38
  if alpha_2:
39
  return canonical_lang(alpha_2.lower())
40
  language = canonical_lang(base)
41
+ if not is_latin_script_compatible(language, config_name):
42
+ return ""
43
  return language if language in ALL_LANGS else base
44
 
45