"""Gradio demo for the Impresso Solr normalization pipeline.

Builds a small web UI in which a user enters text, picks a language (or
"Auto-detect"), and sees the language, tokens, detected stopwords and
analyzer steps reported by ``SolrNormalizationPipeline``.
"""

import os

# Redirect cache to a writable path inside container.
# This must run before the imports below so that they see the new location.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

pipeline = SolrNormalizationPipeline()

LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

# Example text and default language
EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
DEFAULT_LANGUAGE = "en"


def _format_analyzer_steps(steps) -> str:
    """Render analyzer steps as a numbered, one-step-per-line listing.

    Args:
        steps: Sequence of mapping-like step descriptions read via
            ``.get('type')`` and ``.get('name')``; may be empty or ``None``.

    Returns:
        The numbered listing, or a placeholder line when there are no steps.
    """
    if not steps:
        return " No analyzer steps found"
    return "\n".join(
        f" {i}. {step.get('type', 'unknown')}: {step.get('name', 'unnamed')}"
        for i, step in enumerate(steps, 1)
    )


def normalize(text: str, lang_choice: str) -> str:
    """Run the normalization pipeline on *text* and format the result for display.

    Args:
        text: Text to normalize.
        lang_choice: A language code from ``LANGUAGES`` or ``"Auto-detect"``;
            the latter is passed to the pipeline as ``lang=None``.

    Returns:
        A multi-section report (language, tokens, detected stopwords, analyzer
        pipeline), or ``"Error: <message>"`` if anything fails.
    """
    try:
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)

        # Format analyzer pipeline for better readability
        steps = result['analyzer_pipeline'] if 'analyzer_pipeline' in result else None
        analyzer_display = _format_analyzer_steps(steps)

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising
        # into the Gradio event handler.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"


# Create the interface with logo and improved description
with gr.Blocks(title="Solr Normalization Demo") as demo:
    # Add logo at the top
    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)

    gr.Markdown(
        """
        # 🔄 Solr Normalization Pipeline Demo

        **Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project.
        This pipeline replicates Solr's text processing functionality, showing how text goes through various analyzers
        including tokenization, stopword removal, and language-specific transformations.

        Try the example below or enter your own text to see how it gets processed!
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here...",
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value=DEFAULT_LANGUAGE,
                label="Language",
            )
            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")

        with gr.Column():
            with gr.Row():
                output = gr.Textbox(
                    label="Normalized Output",
                    lines=15,
                    placeholder="Results will appear here...",
                    scale=10,
                )
                info_btn = gr.Button("Analyzer Pipeline Explanation", size="sm", scale=1)

            # Info accordion for pipeline details; hidden until info_btn is clicked
            with gr.Accordion("📝 About the Pipeline", open=False, visible=False) as info_accordion:
                gr.Markdown(
                    """
                    - **Tokenization**: Splits text into individual tokens
                    - **Stopword Removal**: Identifies and removes common words
                    - **Normalization**: Applies language-specific text transformations
                    """
                )

    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )

    # Reveal and expand the info accordion when the info button is clicked.
    # This is one-way: clicking again does not hide it.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion,
    )

demo.launch(server_name="0.0.0.0", server_port=7860)