|
import os |
|
|
|
|
|
# Point the XDG cache directory at /tmp/.cache. This runs before the
# third-party imports below -- presumably so that anything they cache at
# import time lands in a writable location (TODO confirm).
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
|
|
|
import gradio as gr |
|
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline |
|
|
|
# Build the pipeline once at import time; normalize() reuses this single
# module-level instance for every request.
pipeline = SolrNormalizationPipeline()
|
|
|
# Language choices offered in the dropdown (in addition to "Auto-detect").
LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

# Text pre-filled in the input box when the demo loads.
EXAMPLE_TEXT = (
    "The quick brown fox jumps over the lazy dog. "
    "This is a sample text for demonstration purposes."
)

# Dropdown value selected by default.
DEFAULT_LANGUAGE = "en"
|
|
|
def normalize(text, lang_choice):
    """Run the Solr normalization pipeline on ``text`` and format a report.

    Args:
        text: Text to normalize, as entered in the input box.
        lang_choice: Selected language. ``"Auto-detect"`` is translated to
            ``lang=None`` for the pipeline; any other value is passed through
            unchanged.

    Returns:
        A multi-line string listing the language, tokens, detected stopwords
        and analyzer steps reported by the pipeline, or an ``"Error: ..."``
        string when the input is blank or the pipeline raises.
    """
    # Nothing to normalize: answer directly instead of invoking the pipeline.
    if not text or not text.strip():
        return "Error: Please enter some text to normalize."

    try:
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)

        # Number the analyzer steps from 1; steps lacking a 'type' or 'name'
        # fall back to placeholder labels.
        analyzer_steps = []
        if 'analyzer_pipeline' in result and result['analyzer_pipeline']:
            analyzer_steps = [
                f" {i}. {step.get('type', 'unknown')}: {step.get('name', 'unnamed')}"
                for i, step in enumerate(result['analyzer_pipeline'], 1)
            ]

        analyzer_display = (
            "\n".join(analyzer_steps) if analyzer_steps else " No analyzer steps found"
        )

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        # UI callback boundary: any failure (pipeline error or a missing key
        # in the result) is logged to stdout and shown in the output box
        # rather than propagating to the caller.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
|
|
|
|
|
# UI definition: a logo and intro text on top, then a two-column row (inputs
# on the left, result box on the right), followed by a short description of
# the pipeline stages.
with gr.Blocks(title="Solr Normalization Demo") as demo:

    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)

    gr.Markdown(
        """
        # 🔥 Solr Normalization Pipeline Demo

        **Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project.
        This pipeline replicates Solr's text processing functionality, showing how text goes through various
        analyzers including tokenization, stopword removal, and language-specific transformations.

        Try the example below or enter your own text to see how it gets processed!
        """
    )

    with gr.Row():
        # Left column: text, language selection and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here...",
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value=DEFAULT_LANGUAGE,
                label="Language",
            )
            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")

        # Right column: formatted report returned by normalize().
        with gr.Column():
            output = gr.Textbox(
                label="Normalized Output",
                lines=15,
                placeholder="Results will appear here...",
            )

    # Clicking the button sends (text, language) to normalize() and writes
    # the returned string into the output box.
    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )

    gr.Markdown(
        """
        ### 📝 About the Pipeline
        - **Tokenization**: Splits text into individual tokens
        - **Stopword Removal**: Identifies and removes common words
        - **Language Detection**: Automatically detects text language
        - **Normalization**: Applies language-specific text transformations
        """
    )

# Serve on all network interfaces (0.0.0.0) at port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)