Gül Sena Altıntaş
committed on
Commit
·
15729bc
1
Parent(s):
279fdab
Now accepts multiline!
Browse files- app.py +48 -87
- serve_on_killarney.sh +2 -2
app.py
CHANGED
@@ -63,8 +63,18 @@ PREDEFINED_MODELS = [
|
|
63 |
model_cache = {}
|
64 |
|
65 |
|
66 |
-
def parse_dataset(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
"""Parse the input dataset text into structured questions"""
|
|
|
68 |
|
69 |
def clean_cell(s: str) -> str:
|
70 |
return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()
|
@@ -75,12 +85,6 @@ def parse_dataset(text):
|
|
75 |
# Normalize line endings
|
76 |
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
77 |
|
78 |
-
# Detect delimiter from first non-empty line
|
79 |
-
for line in text.splitlines():
|
80 |
-
if line.strip():
|
81 |
-
delimiter = "\t" if "\t" in line else ","
|
82 |
-
break
|
83 |
-
|
84 |
# Use csv.reader to handle quoted multi-line cells
|
85 |
reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')
|
86 |
|
@@ -112,67 +116,6 @@ def parse_dataset(text):
|
|
112 |
return questions, error_msg
|
113 |
|
114 |
|
115 |
-
def parse_datasetold(text):
|
116 |
-
"""Parse the input dataset text into structured questions"""
|
117 |
-
if not text.strip():
|
118 |
-
return [], "Please enter your dataset"
|
119 |
-
|
120 |
-
# Detect delimiter
|
121 |
-
sample_line = text.splitlines()[0]
|
122 |
-
delimiter = "\t" if "\t" in sample_line else ","
|
123 |
-
|
124 |
-
# Use csv.reader to correctly parse quotes & newlines
|
125 |
-
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
|
126 |
-
|
127 |
-
questions = []
|
128 |
-
errors = []
|
129 |
-
for i, row in enumerate(reader, 1):
|
130 |
-
parts = [clean_cell(p) for p in row if p.strip()]
|
131 |
-
if len(parts) < 5:
|
132 |
-
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
133 |
-
continue
|
134 |
-
|
135 |
-
question = {
|
136 |
-
"question": parts[0],
|
137 |
-
"correct_answer": parts[1],
|
138 |
-
"choices": [parts[2], parts[3], parts[4]],
|
139 |
-
}
|
140 |
-
|
141 |
-
if question["correct_answer"] not in question["choices"]:
|
142 |
-
question["choices"].append(question["correct_answer"])
|
143 |
-
|
144 |
-
questions.append(question)
|
145 |
-
|
146 |
-
error_msg = "\n".join(errors) if errors else ""
|
147 |
-
return questions, error_msg
|
148 |
-
for i, line in enumerate(reader, 1):
|
149 |
-
# for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
|
150 |
-
line = line.strip()
|
151 |
-
if not line:
|
152 |
-
continue
|
153 |
-
|
154 |
-
parts = [clean_text(part) for part in line.split(delimiter)]
|
155 |
-
|
156 |
-
if len(parts) < 5:
|
157 |
-
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
158 |
-
continue
|
159 |
-
|
160 |
-
question = {
|
161 |
-
"question": parts[0],
|
162 |
-
"correct_answer": parts[1],
|
163 |
-
"choices": [parts[2], parts[3], parts[4]],
|
164 |
-
}
|
165 |
-
|
166 |
-
# Ensure correct answer is in choices
|
167 |
-
if question["correct_answer"] not in question["choices"]:
|
168 |
-
question["choices"].append(question["correct_answer"])
|
169 |
-
|
170 |
-
questions.append(question)
|
171 |
-
|
172 |
-
error_msg = "\n".join(errors) if errors else ""
|
173 |
-
return questions, error_msg
|
174 |
-
|
175 |
-
|
176 |
def setup_tokenizer(model_path):
|
177 |
tokenizer_name = model_path
|
178 |
if "supertoken" in model_path:
|
@@ -403,7 +346,11 @@ def evaluate_model_on_questions(model_path, questions, progress_callback=None):
|
|
403 |
|
404 |
|
405 |
def run_evaluation(
|
406 |
-
dataset_text, selected_predefined, custom_models_text="", progress=gr.Progress()
|
|
|
|
|
|
|
|
|
407 |
):
|
408 |
"""Main evaluation function"""
|
409 |
if not dataset_text.strip():
|
@@ -447,7 +394,7 @@ def run_evaluation(
|
|
447 |
)
|
448 |
|
449 |
# Parse dataset
|
450 |
-
questions, parse_error = parse_dataset(dataset_text)
|
451 |
|
452 |
if parse_error:
|
453 |
return (
|
@@ -976,22 +923,18 @@ def generate_csv_summary(questions, results, summary_stats):
|
|
976 |
# Sample datasets for quick testing
|
977 |
SAMPLE_DATASETS = {
|
978 |
"Custom (enter below)": "",
|
979 |
-
"LP": """
|
980 |
-
In which country is Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch located? Wales Germany France Scotland
|
981 |
In which country is Llanfair pwllgwyngyll located? Wales Germany France Scotland
|
982 |
In which country is Llanfair PG located? Wales Germany France Scotland""",
|
983 |
-
"Simple Math": """
|
984 |
-
What is
|
985 |
-
What is
|
986 |
-
What is
|
987 |
-
|
988 |
-
|
989 |
-
What is the capital of
|
990 |
-
What is the capital of
|
991 |
-
What is the
|
992 |
-
What is the capital of Australia?,Canberra,Sydney,Melbourne,Perth""",
|
993 |
-
"Science Quiz": """Question,Correct Answer,Choice1,Choice2,Choice3
|
994 |
-
What is the chemical symbol for gold?,Au,Ag,Ca,K
|
995 |
Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
|
996 |
What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
|
997 |
What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""",
|
@@ -1035,11 +978,14 @@ css = """
|
|
1035 |
# }
|
1036 |
"""
|
1037 |
|
|
|
1038 |
# Create Gradio interface
|
1039 |
with gr.Blocks(
|
1040 |
title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
|
1041 |
) as demo:
|
1042 |
-
gr.Markdown("""
|
|
|
|
|
1043 |
# 🤖 Model Performance Comparison Tool
|
1044 |
|
1045 |
Compare LLM performance on multiple-choice questions using Hugging Face models.
|
@@ -1052,7 +998,17 @@ with gr.Blocks(
|
|
1052 |
- Detailed question-by-question results
|
1053 |
- Performance charts and statistics
|
1054 |
""")
|
1055 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1056 |
with gr.Row():
|
1057 |
with gr.Column(scale=2):
|
1058 |
# Sample dataset selector
|
@@ -1178,7 +1134,12 @@ bigscience/bloom-560m""",
|
|
1178 |
|
1179 |
evaluate_btn.click(
|
1180 |
fn=run_evaluation,
|
1181 |
-
inputs=[dataset_input, predefined_selector, custom_models_input],
|
|
|
|
|
|
|
|
|
|
|
1182 |
outputs=[
|
1183 |
summary_output,
|
1184 |
detailed_results,
|
|
|
63 |
model_cache = {}
|
64 |
|
65 |
|
66 |
+
def normalize_delimiter(delim: str) -> str:
    """Return the single-character delimiter denoted by *delim*.

    Accepted forms:

    * a lone character (including a real tab or a space), returned unchanged;
    * the two-character escape a user types into a text box, backslash
      followed by ``t``, returned as a real tab;
    * either of the above surrounded by incidental whitespace.

    Args:
        delim: Raw delimiter text, e.g. the value of a text input.

    Returns:
        A one-character string.

    Raises:
        ValueError: If *delim* does not reduce to exactly one character.
    """
    # A lone character is already a valid delimiter. This check must come
    # before stripping: strip() would turn a real tab or space into "" and
    # reject it, which also broke the "\t" default used by callers.
    if len(delim) == 1:
        return delim

    stripped = delim.strip()
    if stripped == "\\t":  # user typed literal \t
        return "\t"
    if len(stripped) != 1:
        raise ValueError(
            f"Delimiter must be a single character, got {stripped!r}"
        )
    return stripped
|
73 |
+
|
74 |
+
|
75 |
+
def parse_dataset(text, delimiter: str = "\t"):
|
76 |
"""Parse the input dataset text into structured questions"""
|
77 |
+
delimiter = normalize_delimiter(delimiter)
|
78 |
|
79 |
def clean_cell(s: str) -> str:
|
80 |
return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()
|
|
|
85 |
# Normalize line endings
|
86 |
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
# Use csv.reader to handle quoted multi-line cells
|
89 |
reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')
|
90 |
|
|
|
116 |
return questions, error_msg
|
117 |
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
def setup_tokenizer(model_path):
|
120 |
tokenizer_name = model_path
|
121 |
if "supertoken" in model_path:
|
|
|
346 |
|
347 |
|
348 |
def run_evaluation(
|
349 |
+
dataset_text,
|
350 |
+
selected_predefined,
|
351 |
+
custom_models_text="",
|
352 |
+
delimiter: str = "\t",
|
353 |
+
progress=gr.Progress(),
|
354 |
):
|
355 |
"""Main evaluation function"""
|
356 |
if not dataset_text.strip():
|
|
|
394 |
)
|
395 |
|
396 |
# Parse dataset
|
397 |
+
questions, parse_error = parse_dataset(dataset_text, delimiter=delimiter)
|
398 |
|
399 |
if parse_error:
|
400 |
return (
|
|
|
923 |
# Sample datasets for quick testing
|
924 |
SAMPLE_DATASETS = {
|
925 |
"Custom (enter below)": "",
|
926 |
+
"LP": """In which country is Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch located? Wales Germany France Scotland
|
|
|
927 |
In which country is Llanfair pwllgwyngyll located? Wales Germany France Scotland
|
928 |
In which country is Llanfair PG located? Wales Germany France Scotland""",
|
929 |
+
"Simple Math": """What is 2+2? 4 3 2 5
|
930 |
+
What is 5*3? 15 12 16 18
|
931 |
+
What is 10-7? 3 7 4 2
|
932 |
+
What is 8/2? 4 3 2 5""",
|
933 |
+
"World Capitals": """What is the capital of France? Paris London Berlin Rome
|
934 |
+
What is the capital of Japan? Tokyo Seoul Beijing Bangkok
|
935 |
+
What is the capital of Brazil? Brasília Rio de Janeiro São Paulo Salvador
|
936 |
+
What is the capital of Australia? Canberra Sydney Melbourne Perth""",
|
937 |
+
"Science Quiz": """What is the chemical symbol for gold?,Au,Ag,Ca,K
|
|
|
|
|
|
|
938 |
Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
|
939 |
What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
|
940 |
What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""",
|
|
|
978 |
# }
|
979 |
"""
|
980 |
|
981 |
+
|
982 |
# Create Gradio interface
|
983 |
with gr.Blocks(
|
984 |
title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
|
985 |
) as demo:
|
986 |
+
with gr.Row():
|
987 |
+
with gr.Column(scale=2):
|
988 |
+
gr.Markdown("""
|
989 |
# 🤖 Model Performance Comparison Tool
|
990 |
|
991 |
Compare LLM performance on multiple-choice questions using Hugging Face models.
|
|
|
998 |
- Detailed question-by-question results
|
999 |
- Performance charts and statistics
|
1000 |
""")
|
1001 |
+
with gr.Column(scale=1):
|
1002 |
+
# with gr.Accordion("Delimiter Options"):
|
1003 |
+
gr.Markdown("""
|
1004 |
+
Enter the delimiter used in your dataset:
|
1005 |
+
""")
|
1006 |
+
delimiter_selector = gr.Textbox(
|
1007 |
+
label="Delimiter",
|
1008 |
+
placeholder="Enter a delimiter, e.g., , or \\t",
|
1009 |
+
value="\\t", # default
|
1010 |
+
lines=1,
|
1011 |
+
)
|
1012 |
with gr.Row():
|
1013 |
with gr.Column(scale=2):
|
1014 |
# Sample dataset selector
|
|
|
1134 |
|
1135 |
evaluate_btn.click(
|
1136 |
fn=run_evaluation,
|
1137 |
+
inputs=[
|
1138 |
+
dataset_input,
|
1139 |
+
predefined_selector,
|
1140 |
+
custom_models_input,
|
1141 |
+
delimiter_selector,
|
1142 |
+
],
|
1143 |
outputs=[
|
1144 |
summary_output,
|
1145 |
detailed_results,
|
serve_on_killarney.sh
CHANGED
@@ -16,8 +16,8 @@ NODES=1
|
|
16 |
NTASKS_PER_NODE=1
|
17 |
CPUS_PER_TASK=4
|
18 |
### request more memory to run on more models
|
19 |
-
MEM="
|
20 |
-
TIME="
|
21 |
GRADIO_PORT=7861
|
22 |
script_location="$APP_DIR/$SCRIPT_NAME"
|
23 |
|
|
|
16 |
NTASKS_PER_NODE=1
|
17 |
CPUS_PER_TASK=4
|
18 |
### request more memory to run on more models
|
19 |
+
MEM="64G"
|
20 |
+
TIME="06:00:00"
|
21 |
GRADIO_PORT=7861
|
22 |
script_location="$APP_DIR/$SCRIPT_NAME"
|
23 |
|