Spaces:

Morphological-Segmentation
/

Morpheme_Segmentation_Demo

Running

Donald Winkelman

Adding default value to input textbox

0154f98 8 days ago

6.89 kB

	import gradio as gr
	import os
	import json
	from morphseg import MorphemeSegmenter

	# --- Lazily populated model cache ---
	# Nothing is loaded at import time: a segmenter is created the first time
	# its language is requested, so the app does not time out during startup.
	# Keys are language codes; values are the loaded segmenter objects.
	LOADED_MODELS = {}

	# UI display name -> language code handed to the segmenter.
	LANGUAGES = {
	    "English": "en",
	    "Spanish": "es",
	    "Russian": "ru",
	    "French": "fr",
	    "Italian": "it",
	    "Czech": "cs",
	    "Hungarian": "hu",
	    "Mongolian": "mn",
	    "Latin": "la",
	}

	# Rows for the examples widget, each shaped [language name, text, delimiter].
	# Every example uses the same "+" delimiter, so only the language and the
	# sample sentence are spelled out per row.
	EXAMPLES = [
	    [language, sentence, "+"]
	    for language, sentence in (
	        ("English", "The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization"),
	        ("Spanish", "desafortunadamente reescribieron rápidamente"),
	        ("Russian", "неизбежность переработки неисправима"),
	        ("French", "incompréhensible prétraitement irréversiblement"),
	        ("Italian", "incredibilmente preprocessarono inevitabilmente"),
	        ("Czech", "nepochopitelně přepracování nevratně"),
	        ("Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak"),
	        ("Mongolian", "боломжгүй дахин боловсруулах ажиллагаа"),
	        ("Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae"),
	    )
	]


	def get_segmenter(lang_code):
	    """Return the segmenter for ``lang_code``, loading and caching it on first use.

	    Args:
	        lang_code: Language code used both as the cache key in
	            ``LOADED_MODELS`` and as the ``lang`` argument of
	            ``MorphemeSegmenter``.

	    Returns:
	        The cached ``MorphemeSegmenter`` instance for that language.

	    Raises:
	        gr.Error: If constructing the segmenter fails for any reason.
	    """
	    # Fast path: this language was already loaded by an earlier request.
	    if lang_code in LOADED_MODELS:
	        return LOADED_MODELS[lang_code]

	    print(f"Loading model for {lang_code}...")
	    try:
	        model = MorphemeSegmenter(lang=lang_code)
	    except Exception as load_error:
	        # Surface the failure through Gradio's own error type.
	        raise gr.Error(f"Failed to load model for {lang_code}: {str(load_error)}")

	    LOADED_MODELS[lang_code] = model
	    return model


	def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
	    """Segment the submitted text and prepare it for display and download.

	    Args:
	        language_name: Display name of the language; must be a key of
	            ``LANGUAGES``.
	        text_input: Text from the textbox; used only when no file is uploaded.
	        file_input: The uploaded file, or None. May be an object exposing the
	            file path as ``.name`` or a plain path string.
	        delimiter: Passed to the segmenter as the morpheme delimiter.
	        output_format: ``"String"`` requests delimited text; any other value
	            requests the list structure, which is rendered as JSON.

	    Returns:
	        A ``(display_text, download_path)`` tuple. When a handled problem
	        occurs (unreadable file, empty input, unknown language, segmentation
	        failure) ``display_text`` carries the message and ``download_path``
	        is None.

	    Raises:
	        gr.Error: Propagated from ``get_segmenter`` when the model cannot be
	            loaded.
	    """
	    # Function-scope imports keep this block runnable on its own.
	    import os
	    import tempfile

	    # 1. Determine input source: an uploaded file wins over the textbox.
	    if file_input is not None:
	        # Accept file-like objects (path in ``.name``) as well as bare paths.
	        upload_path = getattr(file_input, "name", file_input)
	        try:
	            with open(upload_path, "r", encoding="utf-8") as f:
	                content = f.read()
	        except UnicodeDecodeError:
	            return "Error: File must be a text file (UTF-8).", None
	        except OSError as e:
	            # Missing/unreadable upload: report it instead of crashing.
	            return f"Error: Could not read uploaded file: {e}", None
	    else:
	        content = text_input

	    if not content or not content.strip():
	        return "Please enter text or upload a file.", None

	    # 2. Get language code and model.
	    lang_code = LANGUAGES.get(language_name)
	    if not lang_code:
	        return "Error: Invalid language selection.", None

	    segmenter = get_segmenter(lang_code)

	    # 3. Determine output format.
	    is_output_string = output_format == "String"

	    # 4. Run segmentation; any library failure becomes a visible message.
	    try:
	        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
	    except Exception as e:
	        return f"Error during segmentation: {str(e)}", None

	    # 5. Format output for display and pick the matching file extension.
	    if is_output_string:
	        display_output = result
	        output_filename = "segmented_output.txt"
	    else:
	        # Pretty-print the list structure; keep non-ASCII characters readable.
	        display_output = json.dumps(result, ensure_ascii=False, indent=2)
	        output_filename = "segmented_output.json"

	    # 6. Create the downloadable file in a fresh per-request directory.
	    # A fixed path in the working directory would be shared by every
	    # request, letting concurrent users overwrite each other's results.
	    output_dir = tempfile.mkdtemp(prefix="morphseg_")
	    output_path = os.path.join(output_dir, output_filename)
	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write(display_output)

	    return display_output, output_path


	# --- Gradio UI Construction ---
	# Layout: an intro text, then a row with two equal columns (controls on the
	# left, results on the right), followed by clickable examples and a footer.

	with gr.Blocks(title="MorphSeg Demo") as demo:
	    gr.Markdown(
	        """
	        # 🧩 MorphSeg: Canonical Morpheme Segmentation

	        MorphSeg provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
	        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).

	        Select a language, enter text, and see the morphemes!
	        """
	    )

	    with gr.Row():
	        with gr.Column(scale=1):
	            # Controls
	            lang_dropdown = gr.Dropdown(
	                choices=list(LANGUAGES.keys()),
	                value="English",
	                label="Language",
	                info="Select the language of your text."
	            )

	            # Two alternative input sources; process_segmentation prefers
	            # the uploaded file whenever one is present.
	            with gr.Tabs():
	                with gr.TabItem("📝 Text Input"):
	                    txt_input = gr.Textbox(
	                        lines=5,
	                        placeholder="Type word or sentence here...",
	                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
	                        label="Input Text"
	                    )
	                # NOTE(review): the "mb" prefix looks like a garbled emoji;
	                # confirm the intended tab label.
	                with gr.TabItem("mb File Upload"):
	                    file_input = gr.File(
	                        label="Upload Text File (.txt)",
	                        file_types=[".txt", ".csv", ".tsv"]
	                    )

	            with gr.Accordion("⚙️ Advanced Options", open=False):
	                delimiter_input = gr.Textbox(
	                    value="+",
	                    label="Morpheme Delimiter",
	                    # Plain '|' here: '\|' is not a valid Python escape sequence.
	                    info="The string used to separate morphemes (e.g., '+', '|', ' @@')."
	                )
	                format_radio = gr.Radio(
	                    choices=["String", "List"],
	                    value="String",
	                    label="Output Format",
	                    info="String returns text with delimiters. List returns a Python list structure."
	                )

	            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

	        with gr.Column(scale=1):
	            # Outputs
	            output_area = gr.Textbox(
	                label="Segmented Output",
	                lines=10,
	                show_label=True
	            )
	            download_btn = gr.File(label="Download Result")

	    # Event Listeners
	    # The order of `inputs` must match process_segmentation's parameters and
	    # the order of `outputs` must match its (text, file path) return tuple.
	    submit_btn.click(
	        fn=process_segmentation,
	        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
	        outputs=[output_area, download_btn]
	    )

	    gr.Markdown("### Examples")
	    # Each EXAMPLES row fills these three inputs, in this order.
	    gr.Examples(
	        examples=EXAMPLES,
	        inputs=[lang_dropdown, txt_input, delimiter_input],
	        label="Click on an example to populate:"
	    )

	    gr.Markdown(
	        """
	        ---
	        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/) submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
	        """
	    )

	if __name__ == "__main__":
	    # Launch the app only when this file is executed as a script, not when
	    # it is imported.
	    soft_theme = gr.themes.Soft()
	    demo.launch(theme=soft_theme)