|
|
import json
import os
import tempfile

import gradio as gr
from morphseg import MorphemeSegmenter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Segmenters that have already been constructed, keyed by language code.
# Filled lazily by get_segmenter() so each model is built at most once.
LOADED_MODELS = dict()

# Language name shown in the UI -> language code handed to MorphemeSegmenter.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "Russian": "ru",
    "French": "fr",
    "Italian": "it",
    "Czech": "cs",
    "Hungarian": "hu",
    "Mongolian": "mn",
    "Latin": "la",
}

# One sample sentence per language, in the same order as LANGUAGES.
_EXAMPLE_SENTENCES = {
    "English": (
        "The unbelievably disagreeable preprocessor unsuccessfully "
        "reprocessed the unquestionably irreversible decontextualization"
    ),
    "Spanish": "desafortunadamente reescribieron rápidamente",
    "Russian": "неизбежность переработки неисправима",
    "French": "incompréhensible prétraitement irréversiblement",
    "Italian": "incredibilmente preprocessarono inevitabilmente",
    "Czech": "nepochopitelně přepracování nevratně",
    "Hungarian": "visszafordíthatatlan átdolgozást végrehajtottak",
    "Mongolian": "боломжгүй дахин боловсруулах ажиллагаа",
    "Latin": (
        "philosophica sapientia irreprehensibilis et compositionis "
        "cūrātissimē perfectae"
    ),
}

# Rows for gr.Examples: [language name, input text, delimiter].
EXAMPLES = [
    [language, sentence, "+"]
    for language, sentence in _EXAMPLE_SENTENCES.items()
]
|
|
|
|
|
|
|
|
def get_segmenter(lang_code):
    """Return the segmenter for ``lang_code``, building and caching it on first use.

    Args:
        lang_code: Language code passed to ``MorphemeSegmenter(lang=...)``.

    Returns:
        The ``MorphemeSegmenter`` stored in ``LOADED_MODELS`` for ``lang_code``.

    Raises:
        gr.Error: If constructing the segmenter fails. The original exception
            is chained as the cause so its traceback is not lost.
    """
    if lang_code not in LOADED_MODELS:
        print(f"Loading model for {lang_code}...")
        try:
            LOADED_MODELS[lang_code] = MorphemeSegmenter(lang=lang_code)
        except Exception as e:
            # Re-raise as gr.Error but keep the underlying failure attached
            # (``from e``) so server logs still show the real cause.
            raise gr.Error(f"Failed to load model for {lang_code}: {e}") from e
    return LOADED_MODELS[lang_code]
|
|
|
|
|
|
|
|
def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment the supplied text and produce on-screen and downloadable output.

    Args:
        language_name: Display name of the language; must be a key of
            ``LANGUAGES``.
        text_input: Text typed into the UI. Used only when ``file_input`` is
            None.
        file_input: Uploaded file, or None. May be a path string or an object
            whose ``name`` attribute holds the path. When given, it takes
            precedence over ``text_input``.
        delimiter: Passed to the segmenter as the morpheme delimiter.
        output_format: ``"String"`` asks the segmenter for delimited text; any
            other value asks for the structured result, which is rendered as
            JSON.

    Returns:
        A ``(display_text, download_path)`` tuple. When validation, file
        reading or segmentation fails, ``display_text`` is a message for the
        user and ``download_path`` is None.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model cannot be
            loaded.
    """
    # --- 1. Collect the input text (an uploaded file wins over the textbox).
    if file_input is not None:
        # Accept both a plain path string and a file-like wrapper with ``.name``.
        file_path = getattr(file_input, "name", file_input)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError as e:
            return f"Error: Could not read the uploaded file: {e}", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # --- 2. Resolve the language and fetch its (cached) model.
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None

    segmenter = get_segmenter(lang_code)

    # --- 3. Run the segmentation.
    is_output_string = output_format == "String"
    try:
        result = segmenter.segment(
            content, output_string=is_output_string, delimiter=delimiter
        )
    except Exception as e:
        return f"Error during segmentation: {e}", None

    # --- 4. Render the result for display.
    if is_output_string:
        display_output = result
    else:
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # --- 5. Write the downloadable copy.
    # Each request gets its own temporary directory: a fixed path in the
    # working directory would let concurrent requests overwrite each other's
    # download. The base name is kept so the downloaded file is still called
    # segmented_output.*. The directory is left for the OS temp cleanup.
    extension = ".txt" if is_output_string else ".json"
    output_dir = tempfile.mkdtemp(prefix="morphseg_")
    output_filename = os.path.join(output_dir, "segmented_output" + extension)
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(display_output)

    return display_output, output_filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UI definition: inputs and options on the left, results on the right.
with gr.Blocks(title="MorphSeg Demo") as demo:
    gr.Markdown(
        """
        # 🧩 MorphSeg: Canonical Morpheme Segmentation

        **MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).

        *Select a language, enter text, and see the morphemes!*
        """
    )

    with gr.Row():
        # Left column: language, input source, options and the submit button.
        with gr.Column(scale=1):
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text."
            )

            # The text is either typed or uploaded; process_segmentation
            # prefers the uploaded file when both are present.
            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
                        label="Input Text"
                    )
                with gr.TabItem("📂 File Upload"):
                    file_input = gr.File(
                        # Label lists every extension accepted by file_types.
                        label="Upload Text File (.txt, .csv, .tsv)",
                        file_types=[".txt", ".csv", ".tsv"]
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value="+",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., '+', '|', ' @@')."
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure."
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        # Right column: segmentation result and the downloadable file.
        with gr.Column(scale=1):
            output_area = gr.Textbox(
                label="Segmented Output",
                lines=10,
                show_label=True
            )
            download_btn = gr.File(label="Download Result")

    # Input order must match the parameters of process_segmentation;
    # its two return values fill the text area and the download slot.
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn]
    )

    gr.Markdown("### Examples")
    # Each EXAMPLES row is [language, text, delimiter], matching these inputs.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:"
    )

    gr.Markdown(
        """
        ---
        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/) submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
        """
    )
|
|
|
|
|
if __name__ == "__main__":
    # Start the app only when this file is executed directly, not on import.
    soft_theme = gr.themes.Soft()
    demo.launch(theme=soft_theme)