Gül Sena Altıntaş committed on
Commit
279fdab
·
1 Parent(s): 93f64e6

Fixed Farsi copy-paste error, Coding [WIP]

Browse files
Files changed (1) hide show
  1. app.py +123 -6
app.py CHANGED
@@ -1,4 +1,6 @@
 
1
  import gc
 
2
  import logging
3
  import os
4
  import re
@@ -55,6 +57,7 @@ PREDEFINED_MODELS = [
55
  "google/byt5-small",
56
  "gsaltintas/supertoken_models-llama_gpt2",
57
  "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
 
58
  ]
59
  # Global cache for loaded models
60
  model_cache = {}
@@ -62,25 +65,93 @@ model_cache = {}
62
 
63
  def parse_dataset(text):
64
  """Parse the input dataset text into structured questions"""
 
 
 
 
65
  if not text.strip():
66
  return [], "Please enter your dataset"
67
 
68
- lines = text.strip().split("\n")
 
69
 
70
- # Skip header and detect delimiter
71
- first_data_line = lines[1] if len(lines) > 1 else lines[0]
72
- delimiter = "\t" if "\t" in first_data_line else ","
 
 
 
 
 
73
 
74
  questions = []
75
  errors = []
76
 
77
- for i, line in enumerate(lines, 1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  # for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
79
  line = line.strip()
80
  if not line:
81
  continue
82
 
83
- parts = [part.strip().strip('"') for part in line.split(delimiter)]
84
 
85
  if len(parts) < 5:
86
  errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
@@ -522,6 +593,23 @@ def create_summary_markdown(summary_stats):
522
  return "\n".join(lines)
523
 
524
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  def create_detailed_results_html(questions, results):
526
  """Create detailed HTML results for each question"""
527
  if not questions or not results:
@@ -614,6 +702,7 @@ def create_detailed_results_html(questions, results):
614
  opacity: 0.7;
615
  font-family: monospace;
616
  }
 
617
  </style>
618
  """
619
  ]
@@ -917,6 +1006,33 @@ css = """
917
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
918
  font-size: 12px;
919
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
920
  """
921
 
922
  # Create Gradio interface
@@ -958,6 +1074,7 @@ What is 2+2?,4,3,2,5
958
  What is the capital of France?,Paris,London,Berlin,Paris""",
959
  lines=8,
960
  max_lines=15,
 
961
  )
962
 
963
  gr.Markdown("""
 
1
+ import csv
2
  import gc
3
+ import io
4
  import logging
5
  import os
6
  import re
 
57
  "google/byt5-small",
58
  "gsaltintas/supertoken_models-llama_gpt2",
59
  "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
60
+ "gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
61
  ]
62
  # Global cache for loaded models
63
  model_cache = {}
 
65
 
66
  def parse_dataset(text):
67
  """Parse the input dataset text into structured questions"""
68
+
69
+ def clean_cell(s: str) -> str:
70
+ return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()
71
+
72
  if not text.strip():
73
  return [], "Please enter your dataset"
74
 
75
+ # Normalize line endings
76
+ text = text.replace("\r\n", "\n").replace("\r", "\n")
77
 
78
+ # Detect delimiter from first non-empty line
79
+ for line in text.splitlines():
80
+ if line.strip():
81
+ delimiter = "\t" if "\t" in line else ","
82
+ break
83
+
84
+ # Use csv.reader to handle quoted multi-line cells
85
+ reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')
86
 
87
  questions = []
88
  errors = []
89
 
90
+ for i, row in enumerate(reader, 1):
91
+ # skip empty rows
92
+ if not any(cell.strip() for cell in row):
93
+ continue
94
+
95
+ parts = [clean_cell(p) for p in row]
96
+ if len(parts) < 5:
97
+ errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
98
+ continue
99
+
100
+ question = {
101
+ "question": parts[0],
102
+ "correct_answer": parts[1],
103
+ "choices": [parts[2], parts[3], parts[4]],
104
+ }
105
+
106
+ if question["correct_answer"] not in question["choices"]:
107
+ question["choices"].append(question["correct_answer"])
108
+
109
+ questions.append(question)
110
+
111
+ error_msg = "\n".join(errors) if errors else ""
112
+ return questions, error_msg
113
+
114
+
115
+ def parse_datasetold(text):
116
+ """Parse the input dataset text into structured questions"""
117
+ if not text.strip():
118
+ return [], "Please enter your dataset"
119
+
120
+ # Detect delimiter
121
+ sample_line = text.splitlines()[0]
122
+ delimiter = "\t" if "\t" in sample_line else ","
123
+
124
+ # Use csv.reader to correctly parse quotes & newlines
125
+ reader = csv.reader(io.StringIO(text), delimiter=delimiter)
126
+
127
+ questions = []
128
+ errors = []
129
+ for i, row in enumerate(reader, 1):
130
+ parts = [clean_cell(p) for p in row if p.strip()]
131
+ if len(parts) < 5:
132
+ errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
133
+ continue
134
+
135
+ question = {
136
+ "question": parts[0],
137
+ "correct_answer": parts[1],
138
+ "choices": [parts[2], parts[3], parts[4]],
139
+ }
140
+
141
+ if question["correct_answer"] not in question["choices"]:
142
+ question["choices"].append(question["correct_answer"])
143
+
144
+ questions.append(question)
145
+
146
+ error_msg = "\n".join(errors) if errors else ""
147
+ return questions, error_msg
148
+ for i, line in enumerate(reader, 1):
149
  # for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
150
  line = line.strip()
151
  if not line:
152
  continue
153
 
154
+ parts = [clean_text(part) for part in line.split(delimiter)]
155
 
156
  if len(parts) < 5:
157
  errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
 
593
  return "\n".join(lines)
594
 
595
 
596
# CSS for universal text handling.
# `unicode-bidi: plaintext` is the declaration that makes the textarea pick
# its base direction from the text itself. `direction` only accepts
# `ltr`/`rtl`, so the former `direction: auto` declaration was invalid CSS and
# has been removed.
universal_css = """
.universal-text textarea {
    text-align: start !important;
    unicode-bidi: plaintext !important;
    font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
                 'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
}

/* Better handling for mixed content */
.universal-text textarea:focus {
    unicode-bidi: plaintext !important;
}
"""
611
+
612
+
613
  def create_detailed_results_html(questions, results):
614
  """Create detailed HTML results for each question"""
615
  if not questions or not results:
 
702
  opacity: 0.7;
703
  font-family: monospace;
704
  }
705
+
706
  </style>
707
  """
708
  ]
 
1006
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
1007
  font-size: 12px;
1008
  }
1009
+
1010
+ .universal-text textarea {
1011
+ direction: ltr !important;
1012
+ text-align: left !important;
1013
+ unicode-bidi: bidi-override !important;
1014
+ font-family: 'Courier New', monospace !important;
1015
+ white-space: pre !important;
1016
+ }
1017
+
1018
+ /* Reset direction after paste */
1019
+ .universal-text textarea:focus {
1020
+ direction: auto !important;
1021
+ unicode-bidi: plaintext !important;
1022
+ }
1023
+
1024
+ # .universal-text textarea {
1025
+ # direction: auto !important;
1026
+ # text-align: start !important;
1027
+ # unicode-bidi: plaintext !important;
1028
+ # font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
1029
+ # 'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
1030
+ # }
1031
+
1032
+ # /* Better handling for mixed content */
1033
+ # .universal-text textarea:focus {
1034
+ # unicode-bidi: plaintext !important;
1035
+ # }
1036
  """
1037
 
1038
  # Create Gradio interface
 
1074
  What is the capital of France?,Paris,London,Berlin,Paris""",
1075
  lines=8,
1076
  max_lines=15,
1077
+ elem_classes=["universal-text"],
1078
  )
1079
 
1080
  gr.Markdown("""