# maslionok
# change
# fc5ce2f
import os
# Redirect cache to a writable path inside container
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline
# Single pipeline instance, created once at import time and reused by every
# normalize() call.
pipeline = SolrNormalizationPipeline()
# Language codes listed in the UI dropdown (after the "Auto-detect" entry).
LANGUAGES = [
    "de",
    "fr",
    "es",
    "it",
    "pt",
    "nl",
    "en",
    "general",
]

# Text and language pre-filled in the form.
EXAMPLE_TEXT = (
    "The quick brown fox jumps over the lazy dog. "
    "This is a sample text for demonstration purposes."
)
DEFAULT_LANGUAGE = "en"
def normalize(text, lang_choice):
    """Run the normalization pipeline on ``text`` and format the result for display.

    Args:
        text: Text entered by the user.
        lang_choice: A language code from the dropdown, or the literal
            ``"Auto-detect"``, which is forwarded to the pipeline as
            ``lang=None``.

    Returns:
        A multi-line report with the language, the tokens, the detected
        stopwords and a numbered list of analyzer steps. If anything raises
        while the pipeline runs or the report is built, the exception is
        printed and an ``"Error: ..."`` string is returned instead, so this
        UI callback never propagates an exception.
    """
    try:
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)

        # One numbered line per analyzer step; steps lacking a 'type' or
        # 'name' key get a placeholder instead of failing.
        analyzer_steps = []
        if "analyzer_pipeline" in result and result["analyzer_pipeline"]:
            analyzer_steps = [
                f" {i}. {step.get('type', 'unknown')}: {step.get('name', 'unnamed')}"
                for i, step in enumerate(result["analyzer_pipeline"], 1)
            ]
        analyzer_display = (
            "\n".join(analyzer_steps) if analyzer_steps else " No analyzer steps found"
        )

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        # Top-level UI boundary: report the failure in the output box rather
        # than letting the callback raise.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
# Markdown shown above the form. Kept at module level so the text does not
# pick up the indentation of the layout code below.
INTRO_MARKDOWN = """
# 🔥 Solr Normalization Pipeline Demo
**Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project.
This pipeline replicates Solr's text processing functionality, showing how text goes through various
analyzers including tokenization, stopword removal, and language-specific transformations.
Try the example below or enter your own text to see how it gets processed!
"""

# Markdown shown below the form.
ABOUT_MARKDOWN = """
### 📝 About the Pipeline
- **Tokenization**: Splits text into individual tokens
- **Stopword Removal**: Identifies and removes common words
- **Language Detection**: Automatically detects text language
- **Normalization**: Applies language-specific text transformations
"""

# Create the interface with logo and improved description
with gr.Blocks(title="Solr Normalization Demo") as demo:
    # Logo at the top of the page
    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # Left column: text, language choice and the submit button
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here...",
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value=DEFAULT_LANGUAGE,
                label="Language",
            )
            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")

        # Right column: formatted report returned by normalize()
        with gr.Column():
            output = gr.Textbox(
                label="Normalized Output",
                lines=15,
                placeholder="Results will appear here...",
            )

    # Event listeners must be registered inside the Blocks context.
    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )

    gr.Markdown(ABOUT_MARKDOWN)

# Listen on all network interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)