Spaces:

impresso-project
/

solr-normalization-demo

Running

maslionok

change

fc5ce2f 7 minutes ago

3.51 kB

	import os

	# Redirect cache to a writable path inside container.
	# NOTE(review): this assignment sits before the third-party imports below,
	# presumably so that any library resolving its cache directory at import
	# time already sees the override -- keep this ordering unless confirmed otherwise.
	os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

	import gradio as gr
	from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

	# Single pipeline instance created at import time; normalize() calls this
	# same object on every request.
	pipeline = SolrNormalizationPipeline()

	# Language codes offered in the language dropdown, listed after the
	# "Auto-detect" entry.
	LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

	# Example text and default language: initial values of the text box and
	# of the language dropdown.
	EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
	DEFAULT_LANGUAGE = "en"

	def normalize(text, lang_choice):
	    """Run the normalization pipeline on *text* and format the result as a report.

	    Args:
	        text: Raw text to normalize.
	        lang_choice: A language code, or "Auto-detect", in which case
	            ``lang=None`` is passed to the pipeline.

	    Returns:
	        A multi-section string (language, tokens, detected stopwords,
	        analyzer steps), or ``"Error: <message>"`` if any step raised.
	    """
	    try:
	        if lang_choice == "Auto-detect":
	            lang = None
	        else:
	            lang = lang_choice
	        result = pipeline(text, lang=lang, diagnostics=True)

	        # Only read the analyzer steps when the key exists and is non-empty.
	        if 'analyzer_pipeline' in result and result['analyzer_pipeline']:
	            raw_steps = result['analyzer_pipeline']
	        else:
	            raw_steps = ()

	        # One numbered line per step; missing keys fall back to placeholders.
	        step_lines = [
	            f" {position}. {entry.get('type', 'unknown')}: {entry.get('name', 'unnamed')}"
	            for position, entry in enumerate(raw_steps, 1)
	        ]
	        if step_lines:
	            analyzer_display = "\n".join(step_lines)
	        else:
	            analyzer_display = " No analyzer steps found"

	        # Sections are separated by a blank line in the final report.
	        sections = [
	            f"🌍 Language: {result['language']}",
	            f"🔤 Tokens:\n{result['tokens']}",
	            f"🚫 Detected stopwords:\n{result['stopwords_detected']}",
	            f"⚙️ Analyzer pipeline:\n{analyzer_display}",
	        ]
	        return "\n\n".join(sections)
	    except Exception as exc:
	        # UI boundary: log the failure and return it as text instead of raising.
	        print("❌ Pipeline error:", exc)
	        return f"Error: {exc}"

	# Static Markdown shown above the widgets (introduction) ...
	_INTRO_MARKDOWN = """
	# 🔥 Solr Normalization Pipeline Demo

	Solr normalization is meant to demonstrate how text is normalized in the Impresso project.
	This pipeline replicates Solr's text processing functionality, showing how text goes through various
	analyzers including tokenization, stopword removal, and language-specific transformations.

	Try the example below or enter your own text to see how it gets processed!
	"""

	# ... and below them (summary of the pipeline stages).
	_ABOUT_MARKDOWN = """
	### 📝 About the Pipeline
	- Tokenization: Splits text into individual tokens
	- Stopword Removal: Identifies and removes common words
	- Language Detection: Automatically detects text language
	- Normalization: Applies language-specific text transformations
	"""

	# Page layout: logo, introduction, a two-column row (inputs | output),
	# then the about section.
	with gr.Blocks(title="Solr Normalization Demo") as demo:
	    # Logo at the top of the page.
	    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)

	    gr.Markdown(_INTRO_MARKDOWN)

	    with gr.Row():
	        # Left column: text, language selection and the trigger button.
	        with gr.Column():
	            text_input = gr.Textbox(label="Enter Text", value=EXAMPLE_TEXT, lines=3, placeholder="Enter your text here...")
	            lang_dropdown = gr.Dropdown(choices=["Auto-detect", *LANGUAGES], value=DEFAULT_LANGUAGE, label="Language")
	            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")

	        # Right column: the formatted report returned by normalize().
	        with gr.Column():
	            output = gr.Textbox(label="Normalized Output", lines=15, placeholder="Results will appear here...")

	    # Clicking the button feeds both inputs to normalize() and shows its
	    # return value in the output box.
	    submit_btn.click(normalize, inputs=[text_input, lang_dropdown], outputs=output)

	    gr.Markdown(_ABOUT_MARKDOWN)

	# Serve the app on all network interfaces, port 7860.
	demo.launch(
	    server_name="0.0.0.0",
	    server_port=7860,
	)