File size: 3,509 Bytes
327bd85
 
 
 
 
b09d94b
 
42c4e1a
 
 
8c7a402
42c4e1a
8e796ef
 
 
 
42c4e1a
e36aaa8
 
 
527919e
 
 
 
 
 
 
 
 
 
 
 
e36aaa8
 
 
 
8e796ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc5ce2f
 
 
 
 
 
 
 
 
42c4e1a
66d1427
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os

# Redirect cache to a writable path inside container
# NOTE: must be set BEFORE importing libraries that read XDG_CACHE_HOME at
# import time (model/tokenizer downloads land under this directory).
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

# Single shared pipeline instance, reused across all requests.
pipeline = SolrNormalizationPipeline()

# Languages offered in the dropdown; "general" presumably selects a
# language-agnostic analyzer chain — TODO confirm against the pipeline docs.
LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

# Example text and default language pre-filled in the UI
EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
DEFAULT_LANGUAGE = "en"

def normalize(text, lang_choice):
    """Run the Solr normalization pipeline on *text* and format the result.

    Args:
        text: Raw input text to normalize.
        lang_choice: Language code from the dropdown, or "Auto-detect" to
            let the pipeline pick the language itself.

    Returns:
        A human-readable report string (language, tokens, stopwords and the
        analyzer steps), or an "Error: ..." string if the pipeline raised.
    """
    try:
        # "Auto-detect" in the UI maps to lang=None for the pipeline.
        selected = lang_choice if lang_choice != "Auto-detect" else None
        result = pipeline(text, lang=selected, diagnostics=True)

        # Render each analyzer step as "  N. type: name" for readability.
        formatted_steps = []
        if 'analyzer_pipeline' in result and result['analyzer_pipeline']:
            formatted_steps = [
                f"  {idx}. {entry.get('type', 'unknown')}: {entry.get('name', 'unnamed')}"
                for idx, entry in enumerate(result['analyzer_pipeline'], 1)
            ]

        if formatted_steps:
            analyzer_section = "\n".join(formatted_steps)
        else:
            analyzer_section = "  No analyzer steps found"

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"πŸ”€ Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"βš™οΈ Analyzer pipeline:\n{analyzer_section}"
        )
    except Exception as exc:
        # Surface the failure in the UI instead of crashing the app.
        print("❌ Pipeline error:", exc)
        return f"Error: {exc}"

# Build the Gradio UI: logo, description, a two-column layout (input controls
# on the left, formatted output on the right), and a footer with pipeline notes.
with gr.Blocks(title="Solr Normalization Demo") as demo:
    # Add logo at the top (expects logo.jpeg next to this script)
    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
    
    gr.Markdown(
        """
        # πŸ”₯ Solr Normalization Pipeline Demo
        
        **Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project. 
        This pipeline replicates Solr's text processing functionality, showing how text goes through various 
        analyzers including tokenization, stopword removal, and language-specific transformations.
        
        Try the example below or enter your own text to see how it gets processed!
        """
    )
    
    with gr.Row():
        # Left column: text input, language selector, submit button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text", 
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here..."
            )
            # "Auto-detect" is prepended to the supported languages; it maps
            # to lang=None inside normalize().
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES, 
                value=DEFAULT_LANGUAGE, 
                label="Language"
            )
            submit_btn = gr.Button("πŸš€ Normalize Text", variant="primary")
        
        # Right column: read-only result display filled by normalize().
        with gr.Column():
            output = gr.Textbox(
                label="Normalized Output", 
                lines=15,
                placeholder="Results will appear here..."
            )
    
    # Wire the button click to the normalize() handler.
    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output
    )
    
    gr.Markdown(
        """
        ### πŸ“ About the Pipeline
        - **Tokenization**: Splits text into individual tokens
        - **Stopword Removal**: Identifies and removes common words
        - **Language Detection**: Automatically detects text language
        - **Normalization**: Applies language-specific text transformations
        """
    )

# Bind to all interfaces on port 7860 (standard for containerized Gradio apps).
demo.launch(server_name="0.0.0.0", server_port=7860)