Spaces:

polyglot-tagger
/

language-extractor-demo

Sleeping

App Files Community

DerivedFunction1 committed on Apr 18

Commit

ed7fe38

1 Parent(s): 8a63f11

update

Browse files

Files changed (1) hide show

adverse_mix.py +45 -59

adverse_mix.py CHANGED Viewed

@@ -35,69 +35,54 @@ def _normalize_text_key(text: str) -> str:
     return " ".join(str(text or "").split()).casefold().strip()
-def _source_frame_to_records(frame: pd.DataFrame, *, source: str) -> list[dict[str, Any]]:
     if frame.empty:
-        return []
-    records: list[dict[str, Any]] = []
     if source == "fleurs":
-        for _, row in frame.iterrows():
-            lang_iso2 = str(row.get("model_lang", "")).strip()
-            text = str(row.get("text", "")).strip()
-            if not lang_iso2 or not text:
-                continue
-            records.append(
-                {
-                    "source": source,
-                    "source_lang": str(row.get("source_lang", "")).strip(),
-                    "lang_iso2": lang_iso2,
-                    "lang_iso3": str(row.get("lang_iso3", "")).strip() or LANG_ISO2_TO_ISO3.get(lang_iso2, ""),
-                    "text": text,
-                    "split": str(row.get("split", "")).strip(),
-                    "example_id": int(row.get("id", -1)) if str(row.get("id", "-1")).strip().lstrip("-").isdigit() else -1,
-                }
-            )
-        return records
     if source == "tatoeba":
-        for _, row in frame.iterrows():
-            lang_iso2 = str(row.get("source_lang", "")).strip()
-            text = str(row.get("text", "")).strip()
-            if not lang_iso2 or not text:
-                continue
-            records.append(
-                {
-                    "source": source,
-                    "source_lang": lang_iso2,
-                    "lang_iso2": lang_iso2,
-                    "lang_iso3": str(row.get("lang_iso3", "")).strip() or LANG_ISO2_TO_ISO3.get(lang_iso2, ""),
-                    "text": text,
-                    "split": "",
-                    "example_id": int(row.get("id", -1)) if str(row.get("id", "-1")).strip().lstrip("-").isdigit() else -1,
-                }
-            )
-        return records
     if source == "sib200":
-        for _, row in frame.iterrows():
-            lang_iso2 = str(row.get("lang_iso2", "")).strip()
-            text = str(row.get("text", "")).strip()
-            if not lang_iso2 or not text:
-                continue
-            records.append(
-                {
-                    "source": source,
-                    "source_lang": str(row.get("source_lang", "")).strip(),
-                    "lang_iso2": lang_iso2,
-                    "lang_iso3": str(row.get("lang_iso3", "")).strip() or LANG_ISO2_TO_ISO3.get(lang_iso2, ""),
-                    "text": text,
-                    "split": str(row.get("split", "")).strip(),
-                    "example_id": int(row.get("index_id", -1)) if str(row.get("index_id", "-1")).strip().lstrip("-").isdigit() else -1,
-                    "topic": str(row.get("topic", "")).strip(),
-                    "label": int(row.get("label", -1)) if str(row.get("label", "-1")).strip().lstrip("-").isdigit() else -1,
-                }
-            )
-        return records
     raise RuntimeError(f"Unsupported source: {source}")
@@ -115,14 +100,15 @@ def _load_adverse_pool() -> pd.DataFrame:
             source_frame = loader()
         except FileNotFoundError:
             continue
-        records = _source_frame_to_records(source_frame, source=source)
-        if records:
-            frames.append(pd.DataFrame.from_records(records))
     if not frames:
         raise RuntimeError("No cached sources were available for adverse mixes.")
     combined = pd.concat(frames, ignore_index=True)
     combined = combined[combined["lang_iso2"].isin(ALL_LANGS)]
     combined["text_key"] = combined["text"].astype(str).map(_normalize_text_key)
     combined = combined[combined["text_key"].ne("")].drop_duplicates(subset=["lang_iso2", "text_key"], keep="first")

     return " ".join(str(text or "").split()).casefold().strip()
+def _column_or_default(frame: pd.DataFrame, column: str, default: Any) -> pd.Series:
+    if column in frame.columns:
+        return frame[column]
+    return pd.Series([default] * len(frame), index=frame.index)
+def _standardize_frame(frame: pd.DataFrame, *, source: str) -> pd.DataFrame:
     if frame.empty:
+        return pd.DataFrame()
     if source == "fleurs":
+        standardized = frame.copy()
+        standardized["source"] = source
+        standardized["lang_iso2"] = _column_or_default(standardized, "model_lang", "").astype(str).str.strip()
+        standardized["source_lang"] = _column_or_default(standardized, "source_lang", "").astype(str).str.strip()
+        standardized["lang_iso3"] = _column_or_default(standardized, "lang_iso3", "").astype(str).str.strip()
+        standardized["lang_iso3"] = standardized["lang_iso3"].where(standardized["lang_iso3"].ne(""), standardized["lang_iso2"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, "")))
+        standardized["split"] = _column_or_default(standardized, "split", "").astype(str).str.strip()
+        standardized["example_id"] = pd.to_numeric(_column_or_default(standardized, "id", -1), errors="coerce").fillna(-1).astype(int)
+        standardized["topic"] = ""
+        standardized["label"] = -1
+        return standardized.loc[:, ["source", "source_lang", "lang_iso2", "lang_iso3", "text", "split", "example_id", "topic", "label"]].copy()
     if source == "tatoeba":
+        standardized = frame.copy()
+        standardized["source"] = source
+        standardized["source_lang"] = _column_or_default(standardized, "source_lang", "").astype(str).str.strip()
+        standardized["lang_iso2"] = standardized["source_lang"]
+        standardized["lang_iso3"] = _column_or_default(standardized, "lang_iso3", "").astype(str).str.strip()
+        standardized["lang_iso3"] = standardized["lang_iso3"].where(standardized["lang_iso3"].ne(""), standardized["lang_iso2"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, "")))
+        standardized["split"] = ""
+        standardized["example_id"] = pd.to_numeric(_column_or_default(standardized, "id", -1), errors="coerce").fillna(-1).astype(int)
+        standardized["topic"] = ""
+        standardized["label"] = -1
+        return standardized.loc[:, ["source", "source_lang", "lang_iso2", "lang_iso3", "text", "split", "example_id", "topic", "label"]].copy()
     if source == "sib200":
+        standardized = frame.copy()
+        standardized["source"] = source
+        standardized["source_lang"] = _column_or_default(standardized, "source_lang", "").astype(str).str.strip()
+        standardized["lang_iso2"] = _column_or_default(standardized, "lang_iso2", "").astype(str).str.strip()
+        standardized["lang_iso3"] = _column_or_default(standardized, "lang_iso3", "").astype(str).str.strip()
+        standardized["lang_iso3"] = standardized["lang_iso3"].where(standardized["lang_iso3"].ne(""), standardized["lang_iso2"].map(lambda lang: LANG_ISO2_TO_ISO3.get(lang, "")))
+        standardized["split"] = _column_or_default(standardized, "split", "").astype(str).str.strip()
+        standardized["example_id"] = pd.to_numeric(_column_or_default(standardized, "index_id", -1), errors="coerce").fillna(-1).astype(int)
+        standardized["topic"] = _column_or_default(standardized, "topic", "").astype(str).str.strip()
+        standardized["label"] = pd.to_numeric(_column_or_default(standardized, "label", -1), errors="coerce").fillna(-1).astype(int)
+        return standardized.loc[:, ["source", "source_lang", "lang_iso2", "lang_iso3", "text", "split", "example_id", "topic", "label"]].copy()
     raise RuntimeError(f"Unsupported source: {source}")
             source_frame = loader()
         except FileNotFoundError:
             continue
+        standardized = _standardize_frame(source_frame, source=source)
+        if not standardized.empty:
+            frames.append(standardized)
     if not frames:
         raise RuntimeError("No cached sources were available for adverse mixes.")
     combined = pd.concat(frames, ignore_index=True)
+    combined = combined[combined["text"].astype(str).str.strip().ne("")]
     combined = combined[combined["lang_iso2"].isin(ALL_LANGS)]
     combined["text_key"] = combined["text"].astype(str).map(_normalize_text_key)
     combined = combined[combined["text_key"].ne("")].drop_duplicates(subset=["lang_iso2", "text_key"], keep="first")