# Gradio demo app for the Impresso Solr normalization pipeline.
import os
# Redirect cache to a writable path inside container
# (must happen before importing libraries that resolve XDG_CACHE_HOME at import time)
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline
# Single shared pipeline instance, reused across all UI requests.
pipeline = SolrNormalizationPipeline()
# Language codes offered in the dropdown; "general" selects a
# language-agnostic analyzer configuration.
LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]
# Example text and default language
EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
DEFAULT_LANGUAGE = "en"
def normalize(text, lang_choice):
    """Run the Solr normalization pipeline on *text* and format the result.

    Args:
        text: Raw input text to normalize.
        lang_choice: Language code selected in the dropdown, or the literal
            string "Auto-detect" to let the pipeline infer the language.

    Returns:
        A human-readable multi-line summary (detected language, tokens,
        stopwords, analyzer steps), or an ``"Error: ..."`` message if the
        pipeline raises.
    """
    try:
        # "Auto-detect" maps to lang=None so the pipeline runs its own detection.
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)
        # Format analyzer pipeline as a numbered list for better readability.
        analyzer_steps = [
            f"  {i}. {step.get('type', 'unknown')}: {step.get('name', 'unnamed')}"
            for i, step in enumerate(result.get('analyzer_pipeline') or [], 1)
        ]
        analyzer_display = "\n".join(analyzer_steps) if analyzer_steps else "  No analyzer steps found"
        # NOTE(review): emoji below restore mojibake-garbled originals — confirm intended glyphs.
        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        # Surface the failure both in the server log and in the UI output box.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
# Create the interface with logo and improved description
# Build the Gradio interface: logo, description, input/output panels, wiring.
with gr.Blocks(title="Solr Normalization Demo") as demo:
    # Project logo at the top of the page.
    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
    # Intro text. NOTE(review): emoji restore mojibake-garbled originals — confirm glyphs.
    gr.Markdown(
        """
        # 🔥 Solr Normalization Pipeline Demo
        **Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project.
        This pipeline replicates Solr's text processing functionality, showing how text goes through various
        analyzers including tokenization, stopword removal, and language-specific transformations.
        Try the example below or enter your own text to see how it gets processed!
        """
    )
    with gr.Row():
        # Left column: user input controls.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here...",
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value=DEFAULT_LANGUAGE,
                label="Language",
            )
            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")
        # Right column: formatted pipeline output.
        with gr.Column():
            output = gr.Textbox(
                label="Normalized Output",
                lines=15,
                placeholder="Results will appear here...",
            )
    # Wire the button to the normalization callback.
    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )
    gr.Markdown(
        """
        ### 📚 About the Pipeline
        - **Tokenization**: Splits text into individual tokens
        - **Stopword Removal**: Identifies and removes common words
        - **Language Detection**: Automatically detects text language
        - **Normalization**: Applies language-specific text transformations
        """
    )
# Bind to all interfaces so the app is reachable from outside the container.
demo.launch(server_name="0.0.0.0", server_port=7860)