"""Gradio demo for MorphSeg canonical morpheme segmentation."""

import json
import os
import tempfile

import gradio as gr
from morphseg import MorphemeSegmenter

# --- Global Cache for Models ---
# We load models lazily (only when a user requests a specific language)
# to prevent the app from timing out during startup.
LOADED_MODELS = {}

# Display name shown in the dropdown -> language code passed to MorphemeSegmenter.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "Russian": "ru",
    "French": "fr",
    "Italian": "it",
    "Czech": "cs",
    "Hungarian": "hu",
    "Mongolian": "mn",
    "Latin": "la",
}

# Each row populates [language dropdown, text input, delimiter] in the UI.
EXAMPLES = [
    ["English", "The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization", "+"],
    ["Spanish", "desafortunadamente reescribieron rápidamente", "+"],
    ["Russian", "неизбежность переработки неисправима", "+"],
    ["French", "incompréhensible prétraitement irréversiblement", "+"],
    ["Italian", "incredibilmente preprocessarono inevitabilmente", "+"],
    ["Czech", "nepochopitelně přepracování nevratně", "+"],
    ["Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak", "+"],
    ["Mongolian", "боломжгүй дахин боловсруулах ажиллагаа", "+"],
    ["Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae", "+"],
]


def get_segmenter(lang_code):
    """Return the cached segmenter for ``lang_code``, loading it on first use.

    Args:
        lang_code: Language code, one of the values of ``LANGUAGES``.

    Returns:
        The ``MorphemeSegmenter`` instance stored in ``LOADED_MODELS``.

    Raises:
        gr.Error: If constructing the segmenter fails. The original exception
            is chained so the server log keeps the full traceback.
    """
    if lang_code not in LOADED_MODELS:
        print(f"Loading model for {lang_code}...")
        try:
            LOADED_MODELS[lang_code] = MorphemeSegmenter(lang=lang_code)
        except Exception as e:
            raise gr.Error(f"Failed to load model for {lang_code}: {str(e)}") from e
    return LOADED_MODELS[lang_code]


def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment the supplied text and prepare both display and download outputs.

    An uploaded file takes precedence over the text box when both are given.

    Args:
        language_name: Key of ``LANGUAGES`` chosen in the dropdown.
        text_input: Contents of the text box (may be empty or ``None``).
        file_input: Value of the ``gr.File`` component, or ``None`` when no
            file was uploaded.
        delimiter: String placed between morphemes.
        output_format: ``"String"`` for delimited text; any other value
            produces the list structure rendered as JSON.

    Returns:
        A ``(display_text, download_path)`` tuple. On a user-facing error the
        first element is the message and the second is ``None``.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model cannot be
            loaded.
    """
    # 1. Determine Input Source
    if file_input is not None:
        # Accept both a plain path string and an object exposing the path as
        # ``.name``, so the handler does not depend on which one gr.File sends.
        file_path = getattr(file_input, "name", file_input)
        try:
            # utf-8-sig also reads plain UTF-8, but strips a leading BOM so it
            # does not end up glued to the first word.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError as e:
            return f"Error: Could not read the uploaded file: {e}", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # 2. Get Language Code and Model
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None

    segmenter = get_segmenter(lang_code)

    # 3. Determine Output Format
    is_output_string = output_format == "String"

    # 4. Run Segmentation
    # Note: The library segment() method handles the empty string check internally
    try:
        result = segmenter.segment(
            content,
            output_string=is_output_string,
            delimiter=delimiter,
        )
    except Exception as e:
        return f"Error during segmentation: {str(e)}", None

    # 5. Format Output for Display and File Generation
    if is_output_string:
        display_output = result
    else:
        # Pretty-print the list structure as JSON for readability;
        # ensure_ascii=False keeps non-Latin scripts human-readable.
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # 6. Create Downloadable File
    # Each request gets its own temporary directory so concurrent users never
    # overwrite one another's result, while the download keeps a stable name.
    extension = ".txt" if is_output_string else ".json"
    output_dir = tempfile.mkdtemp(prefix="morphseg_")
    output_path = os.path.join(output_dir, f"segmented_output{extension}")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(display_output)

    return display_output, output_path


# --- Gradio UI Construction ---
with gr.Blocks(title="MorphSeg Demo") as demo:
    gr.Markdown(
        """
        # 🧩 MorphSeg: Canonical Morpheme Segmentation

        **MorphSeg** provides linguistically aware segmentation.
        Unlike standard tokenizers (BPE) which split words based on frequency statistics,
        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).

        *Select a language, enter text, and see the morphemes!*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Controls
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text.",
            )

            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
                        label="Input Text",
                    )
                # The label previously read "mb File Upload", which looks like
                # a mangled emoji; restored to match the sibling labels.
                with gr.TabItem("📂 File Upload"):
                    file_input = gr.File(
                        label="Upload Text File (.txt)",
                        file_types=[".txt", ".csv", ".tsv"],
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value="+",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., '+', '|', ' @@').",
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure.",
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Outputs
            output_area = gr.Textbox(
                label="Segmented Output",
                lines=10,
                show_label=True,
            )
            download_btn = gr.File(label="Download Result")

    # Event Listeners
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn],
    )

    gr.Markdown("### Examples")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:",
    )

    gr.Markdown(
        """
        ---
        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) |
        Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/)
        submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
        """
    )

if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())