# Donald Winkelman
# Adding default value to input textbox
# 0154f98
import json
import os
import tempfile

import gradio as gr
from morphseg import MorphemeSegmenter
# --- Global Cache for Models ---
# Maps a language code (e.g. "en") to the MorphemeSegmenter loaded for it.
# Models are loaded lazily (only when a user requests a specific language)
# to prevent the app from timing out during startup; get_segmenter() fills
# this dict on first use of each language.
LOADED_MODELS = {}
# Display name shown in the language dropdown -> language code handed to
# MorphemeSegmenter. Insertion order is the order shown in the UI.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "Russian": "ru",
    "French": "fr",
    "Italian": "it",
    "Czech": "cs",
    "Hungarian": "hu",
    "Mongolian": "mn",
    "Latin": "la",
}
# One sample per supported language for the "Examples" widget.
# Each row is [language display name, input text, morpheme delimiter],
# matching the order of the inputs wired to gr.Examples.
EXAMPLES = [
    [
        "English",
        "The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
        "+",
    ],
    [
        "Spanish",
        "desafortunadamente reescribieron rápidamente",
        "+",
    ],
    [
        "Russian",
        "неизбежность переработки неисправима",
        "+",
    ],
    [
        "French",
        "incompréhensible prétraitement irréversiblement",
        "+",
    ],
    [
        "Italian",
        "incredibilmente preprocessarono inevitabilmente",
        "+",
    ],
    [
        "Czech",
        "nepochopitelně přepracování nevratně",
        "+",
    ],
    [
        "Hungarian",
        "visszafordíthatatlan átdolgozást végrehajtottak",
        "+",
    ],
    [
        "Mongolian",
        "боломжгүй дахин боловсруулах ажиллагаа",
        "+",
    ],
    [
        "Latin",
        "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae",
        "+",
    ],
]
def get_segmenter(lang_code):
    """Return the segmenter for *lang_code*, loading and caching it on first use.

    Args:
        lang_code: Language code passed to ``MorphemeSegmenter(lang=...)``
            and used as the key in ``LOADED_MODELS``.

    Returns:
        The cached ``MorphemeSegmenter`` instance for ``lang_code``.

    Raises:
        gr.Error: If constructing the segmenter fails for any reason. The
            original exception is attached as the cause.
    """
    # Fast path: the model was already loaded by an earlier request.
    if lang_code in LOADED_MODELS:
        return LOADED_MODELS[lang_code]

    print(f"Loading model for {lang_code}...")
    try:
        LOADED_MODELS[lang_code] = MorphemeSegmenter(lang=lang_code)
    except Exception as e:
        # Surface the failure as gr.Error while keeping the underlying
        # traceback linked via explicit chaining.
        raise gr.Error(f"Failed to load model for {lang_code}: {str(e)}") from e
    return LOADED_MODELS[lang_code]
def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment the submitted text and prepare display and download output.

    Args:
        language_name: Language display name; must be a key of ``LANGUAGES``.
        text_input: Text typed into the textbox. Only used when no file is
            supplied.
        file_input: Uploaded file, or ``None``. May be a filesystem path
            string or an object exposing the path as ``.name``.
        delimiter: Morpheme separator forwarded to the segmenter.
        output_format: ``"String"`` requests delimited text; any other value
            requests the list structure, which is rendered as JSON.

    Returns:
        A ``(display_text, download_path)`` tuple. When a problem is handled
        here (unreadable file, empty input, unknown language, segmentation
        failure) the first element is a human-readable message and the
        second is ``None``.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model cannot be
            loaded.
    """
    # 1. Determine input source: an uploaded file takes priority over the textbox.
    if file_input is not None:
        # Accept a plain path string as well as an object carrying the path
        # in `.name`, so the handler works with either upload representation.
        file_path = getattr(file_input, "name", file_input)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError as e:
            # Missing/unreadable upload: report it like the other input errors
            # instead of letting the handler crash.
            return f"Error: Could not read the uploaded file: {e}", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # 2. Resolve the language code and fetch (or lazily load) its model.
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None
    segmenter = get_segmenter(lang_code)

    # 3. Determine output format.
    is_output_string = output_format == "String"

    # 4. Run segmentation. Any library failure is reported as a message.
    try:
        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
    except Exception as e:
        return f"Error during segmentation: {str(e)}", None

    # 5. Format output for display: strings are shown as-is, the list
    #    structure is pretty-printed as JSON (non-ASCII kept readable).
    if is_output_string:
        display_output = result
    else:
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # 6. Create the downloadable file. Each request writes into its own
    #    temporary directory so concurrent users cannot overwrite (or be
    #    served) each other's results, while the download keeps the same
    #    base filename as before (.txt for strings, .json for lists).
    extension = "txt" if is_output_string else "json"
    output_dir = tempfile.mkdtemp(prefix="morphseg_")
    output_filename = os.path.join(output_dir, f"segmented_output.{extension}")
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(display_output)

    return display_output, output_filename
# --- Gradio UI Construction ---
# Layout: a header, then a two-column row (inputs on the left, results on the
# right), followed by clickable examples and a footer with attribution.
with gr.Blocks(title="MorphSeg Demo") as demo:
    gr.Markdown(
        """
# 🧩 MorphSeg: Canonical Morpheme Segmentation
**MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).
*Select a language, enter text, and see the morphemes!*
"""
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Input controls.
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text.",
            )

            # The user either types text or uploads a file; both values are
            # sent to the handler on every click.
            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
                        label="Input Text",
                    )
                # Tab label previously read "mb File Upload" (a garbled icon);
                # it now carries an emoji like the other labels.
                with gr.TabItem("📁 File Upload"):
                    file_input = gr.File(
                        # Label lists every extension accepted by file_types.
                        label="Upload Text File (.txt, .csv, .tsv)",
                        file_types=[".txt", ".csv", ".tsv"],
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value="+",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., '+', '|', ' @@').",
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure.",
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Result widgets: on-screen text plus a downloadable file.
            output_area = gr.Textbox(
                label="Segmented Output",
                lines=10,
                show_label=True,
            )
            download_btn = gr.File(label="Download Result")

    # Event wiring: input order must match process_segmentation's parameters
    # (language, text, file, delimiter, output format).
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn],
    )

    gr.Markdown("### Examples")
    # Each EXAMPLES row fills language, text and delimiter, in that order.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:",
    )

    gr.Markdown(
        """
---
Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/) submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
"""
    )
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    # NOTE(review): the Soft theme is passed to launch() here; confirm the
    # installed Gradio version accepts `theme` on launch() rather than on
    # gr.Blocks().
    demo.launch(theme=gr.themes.Soft())