Spaces:
Sleeping
Sleeping
Gül Sena Altıntaş
committed on
Commit
·
279fdab
1
Parent(s):
93f64e6
Fixed Farsi copy-paste error, Coding [WIP]
Browse files
app.py
CHANGED
@@ -1,4 +1,6 @@
|
|
|
|
1 |
import gc
|
|
|
2 |
import logging
|
3 |
import os
|
4 |
import re
|
@@ -55,6 +57,7 @@ PREDEFINED_MODELS = [
|
|
55 |
"google/byt5-small",
|
56 |
"gsaltintas/supertoken_models-llama_gpt2",
|
57 |
"gsaltintas/supertoken_models-llama_google-gemma-2-2b",
|
|
|
58 |
]
|
59 |
# Global cache for loaded models
|
60 |
model_cache = {}
|
@@ -62,25 +65,93 @@ model_cache = {}
|
|
62 |
|
63 |
def parse_dataset(text):
|
64 |
"""Parse the input dataset text into structured questions"""
|
|
|
|
|
|
|
|
|
65 |
if not text.strip():
|
66 |
return [], "Please enter your dataset"
|
67 |
|
68 |
-
|
|
|
69 |
|
70 |
-
#
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
questions = []
|
75 |
errors = []
|
76 |
|
77 |
-
for i,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
# for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
|
79 |
line = line.strip()
|
80 |
if not line:
|
81 |
continue
|
82 |
|
83 |
-
parts = [part
|
84 |
|
85 |
if len(parts) < 5:
|
86 |
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
@@ -522,6 +593,23 @@ def create_summary_markdown(summary_stats):
|
|
522 |
return "\n".join(lines)
|
523 |
|
524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
def create_detailed_results_html(questions, results):
|
526 |
"""Create detailed HTML results for each question"""
|
527 |
if not questions or not results:
|
@@ -614,6 +702,7 @@ def create_detailed_results_html(questions, results):
|
|
614 |
opacity: 0.7;
|
615 |
font-family: monospace;
|
616 |
}
|
|
|
617 |
</style>
|
618 |
"""
|
619 |
]
|
@@ -917,6 +1006,33 @@ css = """
|
|
917 |
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
918 |
font-size: 12px;
|
919 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
920 |
"""
|
921 |
|
922 |
# Create Gradio interface
|
@@ -958,6 +1074,7 @@ What is 2+2?,4,3,2,5
|
|
958 |
What is the capital of France?,Paris,London,Berlin,Paris""",
|
959 |
lines=8,
|
960 |
max_lines=15,
|
|
|
961 |
)
|
962 |
|
963 |
gr.Markdown("""
|
|
|
1 |
+
import csv
|
2 |
import gc
|
3 |
+
import io
|
4 |
import logging
|
5 |
import os
|
6 |
import re
|
|
|
57 |
"google/byt5-small",
|
58 |
"gsaltintas/supertoken_models-llama_gpt2",
|
59 |
"gsaltintas/supertoken_models-llama_google-gemma-2-2b",
|
60 |
+
"gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
|
61 |
]
|
62 |
# Global cache for loaded models
|
63 |
model_cache = {}
|
|
|
65 |
|
66 |
def parse_dataset(text):
    """Parse pasted CSV/TSV dataset text into structured questions.

    Every non-empty record needs at least five columns, in this order:
    question, correct answer, then three answer choices. Columns beyond the
    fifth are ignored. The delimiter is a tab when the first non-empty line
    contains one, otherwise a comma. Double-quoted cells may span several
    physical lines.

    Args:
        text: Raw dataset text as entered by the user.

    Returns:
        A ``(questions, error_msg)`` tuple. ``questions`` is a list of dicts
        with the keys ``"question"``, ``"correct_answer"`` and ``"choices"``;
        the correct answer is appended to ``choices`` when it is not already
        one of them. ``error_msg`` holds one line per rejected record, or is
        ``""`` when every record parsed. Blank input yields
        ``([], "Please enter your dataset")``.
    """

    def clean_cell(s: str) -> str:
        # Flatten embedded newlines so a multi-line cell becomes one line,
        # then drop stray surrounding quotes and whitespace.
        return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()

    if not text.strip():
        return [], "Please enter your dataset"

    # Normalize line endings so csv sees "\n" only.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Detect the delimiter from the first non-empty line.
    first_line = next((line for line in text.splitlines() if line.strip()), "")
    delimiter = "\t" if "\t" in first_line else ","

    # csv.reader handles quoted cells, including ones containing newlines.
    reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')

    questions = []
    errors = []

    # reader.line_num counts physical lines consumed so far. A record starts
    # on the line after the previous record ended, which keeps the reported
    # line correct even after quoted multi-line cells.
    previous_end = 0
    for row in reader:
        start_line = previous_end + 1
        previous_end = reader.line_num

        # Skip empty rows.
        if not any(cell.strip() for cell in row):
            continue

        parts = [clean_cell(p) for p in row]
        if len(parts) < 5:
            errors.append(
                f"Line {start_line}: Not enough columns (need 5, got {len(parts)})"
            )
            continue

        question = {
            "question": parts[0],
            "correct_answer": parts[1],
            "choices": [parts[2], parts[3], parts[4]],
        }

        # Guarantee the correct answer is always selectable.
        if question["correct_answer"] not in question["choices"]:
            question["choices"].append(question["correct_answer"])

        questions.append(question)

    error_msg = "\n".join(errors) if errors else ""
    return questions, error_msg
|
113 |
+
|
114 |
+
|
115 |
+
def parse_datasetold(text):
|
116 |
+
"""Parse the input dataset text into structured questions"""
|
117 |
+
if not text.strip():
|
118 |
+
return [], "Please enter your dataset"
|
119 |
+
|
120 |
+
# Detect delimiter
|
121 |
+
sample_line = text.splitlines()[0]
|
122 |
+
delimiter = "\t" if "\t" in sample_line else ","
|
123 |
+
|
124 |
+
# Use csv.reader to correctly parse quotes & newlines
|
125 |
+
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
|
126 |
+
|
127 |
+
questions = []
|
128 |
+
errors = []
|
129 |
+
for i, row in enumerate(reader, 1):
|
130 |
+
parts = [clean_cell(p) for p in row if p.strip()]
|
131 |
+
if len(parts) < 5:
|
132 |
+
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
133 |
+
continue
|
134 |
+
|
135 |
+
question = {
|
136 |
+
"question": parts[0],
|
137 |
+
"correct_answer": parts[1],
|
138 |
+
"choices": [parts[2], parts[3], parts[4]],
|
139 |
+
}
|
140 |
+
|
141 |
+
if question["correct_answer"] not in question["choices"]:
|
142 |
+
question["choices"].append(question["correct_answer"])
|
143 |
+
|
144 |
+
questions.append(question)
|
145 |
+
|
146 |
+
error_msg = "\n".join(errors) if errors else ""
|
147 |
+
return questions, error_msg
|
148 |
+
for i, line in enumerate(reader, 1):
|
149 |
# for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
|
150 |
line = line.strip()
|
151 |
if not line:
|
152 |
continue
|
153 |
|
154 |
+
parts = [clean_text(part) for part in line.split(delimiter)]
|
155 |
|
156 |
if len(parts) < 5:
|
157 |
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
|
|
593 |
return "\n".join(lines)
|
594 |
|
595 |
|
596 |
+
# CSS for universal text handling
|
597 |
+
universal_css = """
|
598 |
+
.universal-text textarea {
|
599 |
+
direction: auto !important;
|
600 |
+
text-align: start !important;
|
601 |
+
unicode-bidi: plaintext !important;
|
602 |
+
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
|
603 |
+
'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
|
604 |
+
}
|
605 |
+
|
606 |
+
/* Better handling for mixed content */
|
607 |
+
.universal-text textarea:focus {
|
608 |
+
unicode-bidi: plaintext !important;
|
609 |
+
}
|
610 |
+
"""
|
611 |
+
|
612 |
+
|
613 |
def create_detailed_results_html(questions, results):
|
614 |
"""Create detailed HTML results for each question"""
|
615 |
if not questions or not results:
|
|
|
702 |
opacity: 0.7;
|
703 |
font-family: monospace;
|
704 |
}
|
705 |
+
|
706 |
</style>
|
707 |
"""
|
708 |
]
|
|
|
1006 |
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
1007 |
font-size: 12px;
|
1008 |
}
|
1009 |
+
|
1010 |
+
.universal-text textarea {
|
1011 |
+
direction: ltr !important;
|
1012 |
+
text-align: left !important;
|
1013 |
+
unicode-bidi: bidi-override !important;
|
1014 |
+
font-family: 'Courier New', monospace !important;
|
1015 |
+
white-space: pre !important;
|
1016 |
+
}
|
1017 |
+
|
1018 |
+
/* Reset direction after paste */
|
1019 |
+
.universal-text textarea:focus {
|
1020 |
+
direction: auto !important;
|
1021 |
+
unicode-bidi: plaintext !important;
|
1022 |
+
}
|
1023 |
+
|
1024 |
+
# .universal-text textarea {
|
1025 |
+
# direction: auto !important;
|
1026 |
+
# text-align: start !important;
|
1027 |
+
# unicode-bidi: plaintext !important;
|
1028 |
+
# font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
|
1029 |
+
# 'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
|
1030 |
+
# }
|
1031 |
+
|
1032 |
+
# /* Better handling for mixed content */
|
1033 |
+
# .universal-text textarea:focus {
|
1034 |
+
# unicode-bidi: plaintext !important;
|
1035 |
+
# }
|
1036 |
"""
|
1037 |
|
1038 |
# Create Gradio interface
|
|
|
1074 |
What is the capital of France?,Paris,London,Berlin,Paris""",
|
1075 |
lines=8,
|
1076 |
max_lines=15,
|
1077 |
+
elem_classes=["universal-text"],
|
1078 |
)
|
1079 |
|
1080 |
gr.Markdown("""
|