Spaces:

r-three
/

quick-tokenizer-accuracy

Sleeping

App Files Files Community

Gül Sena Altıntaş committed 12 days ago

Commit

279fdab

1 Parent(s): 93f64e6

Fixed Farsi copy-paste error, Coding [WIP]

Browse files

Files changed (1) hide show

app.py +123 -6

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import gc
 import logging
 import os
 import re
@@ -55,6 +57,7 @@ PREDEFINED_MODELS = [
     "google/byt5-small",
     "gsaltintas/supertoken_models-llama_gpt2",
     "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
 ]
 # Global cache for loaded models
 model_cache = {}
@@ -62,25 +65,93 @@ model_cache = {}
 def parse_dataset(text):
     """Parse the input dataset text into structured questions"""
     if not text.strip():
         return [], "Please enter your dataset"
-    lines = text.strip().split("\n")
-    # Skip header and detect delimiter
-    first_data_line = lines[1] if len(lines) > 1 else lines[0]
-    delimiter = "\t" if "\t" in first_data_line else ","
     questions = []
     errors = []
-    for i, line in enumerate(lines, 1):
         # for i, line in enumerate(lines[1:], 2):  # Start from line 2 (after header)
         line = line.strip()
         if not line:
             continue
-        parts = [part.strip().strip('"') for part in line.split(delimiter)]
         if len(parts) < 5:
             errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
@@ -522,6 +593,23 @@ def create_summary_markdown(summary_stats):
     return "\n".join(lines)
 def create_detailed_results_html(questions, results):
     """Create detailed HTML results for each question"""
     if not questions or not results:
@@ -614,6 +702,7 @@ def create_detailed_results_html(questions, results):
         opacity: 0.7;
         font-family: monospace;
     }
     </style>
     """
     ]
@@ -917,6 +1006,33 @@ css = """
     font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
     font-size: 12px;
 }
 """
 # Create Gradio interface
@@ -958,6 +1074,7 @@ What is 2+2?,4,3,2,5
 What is the capital of France?,Paris,London,Berlin,Paris""",
                 lines=8,
                 max_lines=15,
             )
             gr.Markdown("""

+import csv
 import gc
+import io
 import logging
 import os
 import re
     "google/byt5-small",
     "gsaltintas/supertoken_models-llama_gpt2",
     "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
+    "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
 ]
 # Global cache for loaded models
 model_cache = {}
 def parse_dataset(text):
     """Parse delimited dataset text into structured multiple-choice questions.

     Each record must provide at least five columns: the question, the
     correct answer, and three answer choices. The delimiter is a tab when
     the first non-empty line contains one, otherwise a comma. Quoted cells
     may span several physical lines; embedded newlines are collapsed to
     single spaces.

     Args:
         text: Raw dataset text as entered by the user.

     Returns:
         A ``(questions, error_msg)`` tuple. ``questions`` is a list of dicts
         with the keys ``"question"``, ``"correct_answer"`` and ``"choices"``;
         the correct answer is appended to ``choices`` when it is not already
         one of them. ``error_msg`` joins one message per rejected record
         with newlines and is empty when every record was accepted.
     """

     def clean_cell(s: str) -> str:
         # Flatten multi-line cells and drop stray quotes the csv reader kept
         # (for example when a quoted cell follows a space after the delimiter).
         return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()

     if not text.strip():
         return [], "Please enter your dataset"

     # Normalize line endings
     text = text.replace("\r\n", "\n").replace("\r", "\n")

     # Detect delimiter from first non-empty line
     first_line = next((line for line in text.splitlines() if line.strip()), "")
     delimiter = "\t" if "\t" in first_line else ","

     # Use csv.reader to handle quoted multi-line cells
     reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')

     questions = []
     errors = []
     # Physical line on which the previous record ended. The record index and
     # the line number diverge as soon as one quoted cell spans several lines,
     # so messages are keyed on the line where the offending record starts.
     prev_end = 0
     try:
         for row in reader:
             start_line = prev_end + 1
             prev_end = reader.line_num

             # skip empty rows
             if not any(cell.strip() for cell in row):
                 continue

             parts = [clean_cell(p) for p in row]
             if len(parts) < 5:
                 errors.append(
                     f"Line {start_line}: Not enough columns (need 5, got {len(parts)})"
                 )
                 continue

             choices = [parts[2], parts[3], parts[4]]
             if parts[1] not in choices:
                 choices.append(parts[1])
             questions.append(
                 {
                     "question": parts[0],
                     "correct_answer": parts[1],
                     "choices": choices,
                 }
             )
     except csv.Error as exc:
         # Malformed input (e.g. an unterminated quote that swallows more than
         # the csv field-size limit) is reported to the user instead of
         # crashing; records parsed before the failure are kept.
         errors.append(f"Line {prev_end + 1}: Could not parse row ({exc})")

     return questions, "\n".join(errors)
+def parse_datasetold(text):
+    """Parse the input dataset text into structured questions"""
+    if not text.strip():
+        return [], "Please enter your dataset"
+    # Detect delimiter
+    sample_line = text.splitlines()[0]
+    delimiter = "\t" if "\t" in sample_line else ","
+    # Use csv.reader to correctly parse quotes & newlines
+    reader = csv.reader(io.StringIO(text), delimiter=delimiter)
+    questions = []
+    errors = []
+    for i, row in enumerate(reader, 1):
+        parts = [clean_cell(p) for p in row if p.strip()]
+        if len(parts) < 5:
+            errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
+            continue
+        question = {
+            "question": parts[0],
+            "correct_answer": parts[1],
+            "choices": [parts[2], parts[3], parts[4]],
+        }
+        if question["correct_answer"] not in question["choices"]:
+            question["choices"].append(question["correct_answer"])
+        questions.append(question)
+    error_msg = "\n".join(errors) if errors else ""
+    return questions, error_msg
+    for i, line in enumerate(reader, 1):
         # for i, line in enumerate(lines[1:], 2):  # Start from line 2 (after header)
         line = line.strip()
         if not line:
             continue
+        parts = [clean_text(part) for part in line.split(delimiter)]
         if len(parts) < 5:
             errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
     return "\n".join(lines)
+# CSS for universal text handling
+universal_css = """
+.universal-text textarea {
+    direction: auto !important;
+    text-align: start !important;
+    unicode-bidi: plaintext !important;
+    font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
+                 'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
+}
+/* Better handling for mixed content */
+.universal-text textarea:focus {
+    unicode-bidi: plaintext !important;
+}
+"""
 def create_detailed_results_html(questions, results):
     """Create detailed HTML results for each question"""
     if not questions or not results:
         opacity: 0.7;
         font-family: monospace;
     }
     </style>
     """
     ]
     font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
     font-size: 12px;
 }
+.universal-text textarea {
+    direction: ltr !important;
+    text-align: left !important;
+    unicode-bidi: bidi-override !important;
+    font-family: 'Courier New', monospace !important;
+    white-space: pre !important;
+}
+/* Reset direction after paste */
+.universal-text textarea:focus {
+    direction: auto !important;
+    unicode-bidi: plaintext !important;
+}
+# .universal-text textarea {
+#     direction: auto !important;
+#     text-align: start !important;
+#     unicode-bidi: plaintext !important;
+#     font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
+#                  'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
+# }
+# /* Better handling for mixed content */
+# .universal-text textarea:focus {
+#     unicode-bidi: plaintext !important;
+# }
 """
 # Create Gradio interface
 What is the capital of France?,Paris,London,Berlin,Paris""",
                 lines=8,
                 max_lines=15,
+                elem_classes=["universal-text"],
             )
             gr.Markdown("""