maslionok committed on
Commit
8e796ef
·
1 Parent(s): ac40fb1
Files changed (3) hide show
  1. README.md +14 -1
  2. app.py +58 -11
  3. logo.jpeg +0 -0
README.md CHANGED
@@ -5,7 +5,20 @@ colorFrom: blue
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
- short_description: Solr Normalization Demo
9
  ---
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ short_description: Demonstrate text normalization in the Impresso project using Solr functionality
9
  ---
10
 
11
+ # Solr Normalization Demo
12
+
13
+ This space demonstrates how text is normalized in the **Impresso** project, replicating Solr's text processing functionality.
14
+
15
+ Solr normalization is meant to demonstrate how text is normalized in the Impresso project. The pipeline processes text through various analyzers including tokenization, stopword removal, and language-specific transformations to prepare text for search and analysis.
16
+
17
+ ## Features
18
+ - Multi-language support (German, French, Spanish, Italian, Portuguese, Dutch, English)
19
+ - Automatic language detection
20
+ - Detailed analyzer pipeline visualization
21
+ - Stopword detection and removal
22
+ - Token normalization
23
+
24
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -10,6 +10,10 @@ pipeline = SolrNormalizationPipeline()
10
 
11
  LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]
12
 
 
 
 
 
13
  def normalize(text, lang_choice):
14
  try:
15
  lang = None if lang_choice == "Auto-detect" else lang_choice
@@ -30,16 +34,59 @@ def normalize(text, lang_choice):
30
  print("❌ Pipeline error:", e)
31
  return f"Error: {e}"
32
 
33
- demo = gr.Interface(
34
- fn=normalize,
35
- inputs=[
36
- gr.Textbox(label="Enter Text"),
37
- gr.Dropdown(choices=LANGUAGES, value="Auto-detect", label="Language")
38
- ],
39
- outputs=gr.Textbox(label="Normalized Output"),
40
- title="Solr Normalization Pipeline",
41
- description="Text normalization replicating Solr functionality.",
42
- allow_flagging="never"
43
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
10
 
11
  LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]
12
 
13
+ # Example text and default language
14
+ EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
15
+ DEFAULT_LANGUAGE = "en"
16
+
17
  def normalize(text, lang_choice):
18
  try:
19
  lang = None if lang_choice == "Auto-detect" else lang_choice
 
34
  print("❌ Pipeline error:", e)
35
  return f"Error: {e}"
36
 
37
+ # Create the interface with logo and improved description
38
+ with gr.Blocks(title="Solr Normalization Demo") as demo:
39
+ # Add logo at the top
40
+ gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
41
+
42
+ gr.Markdown(
43
+ """
44
+ # 🔥 Solr Normalization Pipeline Demo
45
+
46
+ **Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project.
47
+ This pipeline replicates Solr's text processing functionality, showing how text goes through various
48
+ analyzers including tokenization, stopword removal, and language-specific transformations.
49
+
50
+ Try the example below or enter your own text to see how it gets processed!
51
+ """
52
+ )
53
+
54
+ with gr.Row():
55
+ with gr.Column():
56
+ text_input = gr.Textbox(
57
+ label="Enter Text",
58
+ value=EXAMPLE_TEXT,
59
+ lines=3,
60
+ placeholder="Enter your text here..."
61
+ )
62
+ lang_dropdown = gr.Dropdown(
63
+ choices=["Auto-detect"] + LANGUAGES,
64
+ value=DEFAULT_LANGUAGE,
65
+ label="Language"
66
+ )
67
+ submit_btn = gr.Button("🚀 Normalize Text", variant="primary")
68
+
69
+ with gr.Column():
70
+ output = gr.Textbox(
71
+ label="Normalized Output",
72
+ lines=15,
73
+ placeholder="Results will appear here..."
74
+ )
75
+
76
+ submit_btn.click(
77
+ fn=normalize,
78
+ inputs=[text_input, lang_dropdown],
79
+ outputs=output
80
+ )
81
+
82
+ gr.Markdown(
83
+ """
84
+ ### 📝 About the Pipeline
85
+ - **Tokenization**: Splits text into individual tokens
86
+ - **Stopword Removal**: Identifies and removes common words
87
+ - **Language Detection**: Automatically detects text language
88
+ - **Normalization**: Applies language-specific text transformations
89
+ """
90
+ )
91
 
92
  demo.launch(server_name="0.0.0.0", server_port=7860)
logo.jpeg ADDED