taesiri committed on
Commit
71783c2
·
1 Parent(s): b522908
Files changed (1) hide show
  1. app.py +603 -0
app.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import base64
3
+ import json
4
+ import os
5
+ import shutil
6
+ import uuid
7
+ import shortuuid
8
+ from huggingface_hub import CommitScheduler
9
+
10
# Background uploader: mirrors the local ./data folder into the "data"
# directory of the taesiri/EdgeQuest dataset repo on the Hugging Face Hub.
# `every=1` is the push interval (minutes, per the huggingface_hub docs).
scheduler = CommitScheduler(
    folder_path="./data",
    path_in_repo="data",
    repo_id="taesiri/EdgeQuest",
    repo_type="dataset",
    every=1,
)
17
+
18
+
19
def _text_or_empty(value):
    """Return *value* unchanged, or an empty string when it is None."""
    return "" if value is None else value


def _parse_categories(raw):
    """Turn a comma-separated tag string into a list of stripped, non-empty tags."""
    if not raw:
        return []
    return [tag for tag in (piece.strip() for piece in raw.split(",")) if tag]


def _store_images(labelled_images, request_folder):
    """Save each provided image as ``<label>.png`` inside *request_folder*.

    Returns the ``(label, saved_path)`` pairs of the images that were present,
    in the order given. Entries whose image is None are skipped.
    """
    saved = []
    for label, image in labelled_images:
        if image is None:
            continue
        target = os.path.join(request_folder, f"{label}.png")
        if isinstance(image, str):
            # A string is treated as the path of an already-existing file.
            # NOTE(review): the file is copied verbatim, so a non-PNG upload
            # keeps its original encoding under a .png name - confirm this is
            # acceptable for downstream consumers.
            shutil.copy2(image, target)
        else:
            # Non-path images (the original comment says "numpy array") are
            # handed to Gradio's image writer.
            # NOTE(review): confirm `save_image` exists in the installed
            # Gradio version; the UI only ever passes file paths.
            gr.processing_utils.save_image(image, target)
        saved.append((label, target))
    return saved


def _image_entry(label, url_key, url_value):
    """Build one ``image_url`` content item in the layout both JSON files share."""
    return {
        "type": "image_url",
        "label": label,
        "image_url": {"url": {url_key: url_value}},
    }


def _path_entries(saved_images):
    """Image items that reference each saved file by its ``./<name>`` relative path."""
    return [
        _image_entry(
            label,
            "data:image/png;path",
            os.path.join(".", os.path.basename(path)),
        )
        for label, path in saved_images
    ]


def _base64_entries(saved_images):
    """Image items that embed each saved file's bytes as a base64 string."""
    entries = []
    for label, path in saved_images:
        with open(path, "rb") as handle:
            encoded = base64.b64encode(handle.read()).decode("utf-8")
        entries.append(_image_entry(label, "data:image/png;base64", encoded))
    return entries


def generate_json_files(
    system_message,
    # New fields
    name,
    email_address,
    institution,
    openreview_profile,
    question_categories,
    subquestion_1_text,
    subquestion_1_answer,
    subquestion_2_text,
    subquestion_2_answer,
    # Existing fields
    question,
    final_answer,
    rationale_text,
    # Question images
    image1,
    image2,
    image3,
    image4,
    # Rationale images
    rationale_image1,
    rationale_image2,
):
    """
    Persist one submission under ./data/<uuid>/ and describe it in two JSON files.

    Steps:
      1) Create a unique folder under ./data/
      2) Copy the provided question and rationale images into that folder
      3) Write two single-line JSON files:
         - request_urls.json   (images referenced by relative file path)
         - request_base64.json (images embedded as base64)

    Args:
        system_message: System prompt; falls back to
            "You are a helpful assistant" when empty or None.
        name, email_address, institution, openreview_profile: Author
            metadata; None is stored as "".
        question_categories: Comma-separated tags. Stored as a list of
            stripped tags; empty tags (e.g. from a trailing comma) are dropped.
        subquestion_1_text, subquestion_1_answer, subquestion_2_text,
        subquestion_2_answer: Optional sub-questions; None is stored as "".
        question, final_answer, rationale_text: Main question, its answer and
            the explanation; None is stored as "".
        image1..image4: Question images (file path, image object, or None).
        rationale_image1, rationale_image2: Rationale images (same kinds).

    Returns:
        Tuple ``(urls_json_path, base64_json_path)`` of the two written files.

    Raises:
        Whatever the file operations raise (e.g. OSError). The partially
        written request folder is removed before the error propagates.
    """
    if not system_message:
        system_message = "You are a helpful assistant"

    name = _text_or_empty(name)
    email_address = _text_or_empty(email_address)
    institution = _text_or_empty(institution)
    openreview_profile = _text_or_empty(openreview_profile)
    question_categories = _parse_categories(question_categories)
    question = _text_or_empty(question)
    final_answer = _text_or_empty(final_answer)
    rationale_text = _text_or_empty(rationale_text)
    subquestions = [
        {
            "text": _text_or_empty(subquestion_1_text),
            "answer": _text_or_empty(subquestion_1_answer),
        },
        {
            "text": _text_or_empty(subquestion_2_text),
            "answer": _text_or_empty(subquestion_2_answer),
        },
    ]

    # NOTE(review): every request gets the same custom_id; the per-request
    # uniqueness lives only in the folder name. Kept as-is so the output
    # format does not change - confirm whether consumers need unique ids.
    custom_id = "request______1"

    parent_data_folder = "./data"
    os.makedirs(parent_data_folder, exist_ok=True)
    request_folder = os.path.join(parent_data_folder, str(uuid.uuid4()))
    os.makedirs(request_folder)

    try:
        question_files = _store_images(
            [
                ("question_image_1", image1),
                ("question_image_2", image2),
                ("question_image_3", image3),
                ("question_image_4", image4),
            ],
            request_folder,
        )
        rationale_files = _store_images(
            [
                ("rationale_image_1", rationale_image1),
                ("rationale_image_2", rationale_image2),
            ],
            request_folder,
        )

        # A) Path-referencing record
        item_urls = {
            "custom_id": custom_id,
            "name": name,
            "email_address": email_address,
            "institution": institution,
            "openreview_profile": openreview_profile,
            "question_categories": question_categories,
            "question": {
                "messages": [
                    {"role": "system", "content": system_message},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "label": "question", "value": question},
                            *_path_entries(question_files),
                        ],
                    },
                ],
            },
            "subquestions": subquestions,
            "answer": {
                "final_answer": final_answer,
                "rationale_text": rationale_text,
                "rationale_images": _path_entries(rationale_files),
            },
        }

        # B) Base64-embedding record
        item_base64 = {
            "custom_id": custom_id,
            "name": name,
            "email_address": email_address,
            "institution": institution,
            "openreview_profile": openreview_profile,
            "question_categories": question_categories,
            "subquestions": subquestions,
            "final_answer": final_answer,
            "rationale_text": rationale_text,
            "body": {
                "model": "MODEL_NAME",
                "messages": [
                    {"role": "system", "content": system_message},
                    {
                        "role": "user",
                        "content": [
                            {"type": "field", "label": "question", "value": question},
                            *_base64_entries(question_files),
                        ],
                    },
                    {
                        "role": "assistant",
                        "content": [
                            {"type": "text", "text": rationale_text},
                            {"type": "text", "text": final_answer},
                            *_base64_entries(rationale_files),
                        ],
                    },
                ],
            },
        }

        # Each file holds exactly one JSON document followed by a newline.
        urls_jsonl_path = os.path.join(request_folder, "request_urls.json")
        base64_jsonl_path = os.path.join(request_folder, "request_base64.json")
        with open(urls_jsonl_path, "w", encoding="utf-8") as handle:
            handle.write(json.dumps(item_urls, ensure_ascii=False) + "\n")
        with open(base64_jsonl_path, "w", encoding="utf-8") as handle:
            handle.write(json.dumps(item_base64, ensure_ascii=False) + "\n")
    except Exception:
        # Do not leave a half-built request behind in the uploaded data folder.
        shutil.rmtree(request_folder, ignore_errors=True)
        raise

    return urls_jsonl_path, base64_jsonl_path
297
+
298
+
299
+ # Build the Gradio app
300
# Instruction panel markup, kept at module level so the HTML text stays
# exactly as authored regardless of how the UI code below is indented.
_INSTRUCTIONS_HTML = """
<h3>Instructions:</h3>
<p>Welcome to the Hugging Face space for collecting questions for new benchmark datasets.</p>

<table style="width:100%; border-collapse: collapse; margin: 10px 0;">
<tr>
<th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
Required Fields
</th>
<th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
Optional Fields
</th>
</tr>
<tr>
<td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
<ul style="margin: 0;">
<li>Author Information</li>
<li>At least <b>one question image</b></li>
<li>The <b>question text</b></li>
<li>The <b>final answer</b></li>
</ul>
</td>
<td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
<ul style="margin: 0;">
<li>Up to four question images</li>
<li>Supporting images for your answer</li>
<li><b>Rationale text</b> to explain your reasoning</li>
<li><b>Sub-questions</b> with their answers</li>
</ul>
</td>
</tr>
</table>

<p>While not all fields are mandatory, providing additional context through optional fields will help create a more comprehensive dataset. After submitting a question, you can clear up the form to submit another one.</p>
"""

# Default shown in the "System Message" box and restored by "Clear Form".
_DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"

with gr.Blocks() as demo:
    gr.Markdown("# Dataset Builder")
    with gr.Accordion("Instructions", open=True):
        gr.HTML(_INSTRUCTIONS_HTML)

    gr.Markdown("## Author Information")
    # NOTE(review): widget nesting was reconstructed from a flattened source;
    # confirm that all four author fields are meant to share this row.
    with gr.Row():
        name_input = gr.Textbox(label="Name", lines=1)
        email_address_input = gr.Textbox(label="Email Address", lines=1)
        institution_input = gr.Textbox(
            label="Institution or 'Independent'",
            lines=1,
            placeholder="e.g. MIT, Google, Independent, etc.",
        )
        openreview_profile_input = gr.Textbox(
            label="OpenReview Profile Name",
            lines=1,
            placeholder="Your OpenReview username or profile name",
        )

    gr.Markdown("## Question Information")

    # Question images, one tab each; handlers receive them as file paths.
    with gr.Tabs():
        with gr.Tab("Image 1"):
            image1 = gr.Image(label="Question Image 1", type="filepath")
        with gr.Tab("Image 2 (Optional)"):
            image2 = gr.Image(label="Question Image 2", type="filepath")
        with gr.Tab("Image 3 (Optional)"):
            image3 = gr.Image(label="Question Image 3", type="filepath")
        with gr.Tab("Image 4 (Optional)"):
            image4 = gr.Image(label="Question Image 4", type="filepath")

    question_input = gr.Textbox(
        label="Question", lines=15, placeholder="Type your question here..."
    )

    question_categories_input = gr.Textbox(
        label="Question Categories",
        lines=1,
        placeholder="Comma-separated tags, e.g. math, geometry",
    )

    # Answer section
    gr.Markdown("## Answer ")

    final_answer_input = gr.Textbox(
        label="Final Answer",
        lines=1,
        placeholder="Enter the short/concise final answer...",
    )

    rationale_text_input = gr.Textbox(
        label="Rationale Text",
        lines=5,
        placeholder="Enter the reasoning or explanation for the answer...",
    )

    # Rationale images, one tab each.
    with gr.Tabs():
        with gr.Tab("Rationale 1 (Optional)"):
            rationale_image1 = gr.Image(label="Rationale Image 1", type="filepath")
        with gr.Tab("Rationale 2 (Optional)"):
            rationale_image2 = gr.Image(label="Rationale Image 2", type="filepath")

    # Subquestions section
    gr.Markdown("## Subquestions")
    with gr.Row():
        subquestion_1_text_input = gr.Textbox(
            label="Subquestion 1 Text", lines=2, placeholder="First sub-question..."
        )
        subquestion_1_answer_input = gr.Textbox(
            label="Subquestion 1 Answer",
            lines=2,
            placeholder="Answer to sub-question 1...",
        )

    with gr.Row():
        subquestion_2_text_input = gr.Textbox(
            label="Subquestion 2 Text", lines=2, placeholder="Second sub-question..."
        )
        subquestion_2_answer_input = gr.Textbox(
            label="Subquestion 2 Answer",
            lines=2,
            placeholder="Answer to sub-question 2...",
        )

    system_message_input = gr.Textbox(
        label="System Message",
        value=_DEFAULT_SYSTEM_MESSAGE,
        lines=2,
        placeholder="Enter the system message that defines the AI assistant's role and behavior...",
    )

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Form")

    # Hidden download slots. The submit handler below does not write to them;
    # they are only reset by "Clear Form".
    with gr.Row():
        output_file_urls = gr.File(
            label="Download URLs JSON", interactive=False, visible=False
        )
        output_file_base64 = gr.File(
            label="Download Base64 JSON", interactive=False, visible=False
        )

    # Component groups, listed once so the handler wiring below cannot drift
    # out of step with the handlers' positional arguments / return values.
    author_components = [
        name_input,
        email_address_input,
        institution_input,
        openreview_profile_input,
    ]
    content_components = [
        question_categories_input,
        subquestion_1_text_input,
        subquestion_1_answer_input,
        subquestion_2_text_input,
        subquestion_2_answer_input,
        question_input,
        final_answer_input,
        rationale_text_input,
        image1,
        image2,
        image3,
        image4,
        rationale_image1,
        rationale_image2,
    ]
    download_components = [output_file_urls, output_file_base64]

    def validate_and_generate(
        sys_msg,
        nm,
        em,
        inst,
        orp,
        qcats,
        sq1t,
        sq1a,
        sq2t,
        sq2a,
        q,
        fa,
        rt,
        i1,
        i2,
        i3,
        i4,
        ri1,
        ri2,
    ):
        """Validate the form and, if complete, store the submission.

        Returns an update for the submit button: it stays enabled when
        required fields are missing (a warning toast lists them) and is
        disabled after a successful save, until the form is cleared.
        """
        required_text_fields = (
            ("Name", nm),
            ("Email Address", em),
            ("Institution", inst),
            ("Question", q),
            ("Final Answer", fa),
        )
        missing_fields = [
            label
            for label, value in required_text_fields
            if not value or not value.strip()
        ]
        if not i1:
            missing_fields.append("First Question Image")

        if missing_fields:
            warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
            gr.Warning(warning_msg, duration=5)
            return gr.update(interactive=True)

        # The CommitScheduler uploads ./data in the background; holding its
        # lock while the request folder is written keeps a push from
        # snapshotting a half-written submission.
        with scheduler.lock:
            generate_json_files(
                sys_msg,
                nm,
                em,
                inst,
                orp,
                qcats,
                sq1t,
                sq1a,
                sq2t,
                sq2a,
                q,
                fa,
                rt,
                i1,
                i2,
                i3,
                i4,
                ri1,
                ri2,
            )

        gr.Info(
            "Dataset item created successfully! 🎉, Clear the form to submit a new one"
        )

        return gr.update(interactive=False)

    submit_button.click(
        fn=validate_and_generate,
        inputs=[system_message_input, *author_components, *content_components],
        outputs=[submit_button],
    )

    def clear_form_fields(sys_msg, name, email, inst, openreview, *args):
        """Reset the form for another submission, keeping the author details.

        The returned list lines up with the ``outputs`` of the clear button:
        system message (reset to its default), the four author fields
        (echoed back unchanged), every content and download component
        (cleared), and finally the submit button (re-enabled).
        """
        cleared_count = len(content_components) + len(download_components)
        return [
            _DEFAULT_SYSTEM_MESSAGE,
            name,
            email,
            inst,
            openreview,
            *([None] * cleared_count),
            gr.update(interactive=True),
        ]

    clear_button.click(
        fn=clear_form_fields,
        inputs=[system_message_input, *author_components],
        outputs=[
            system_message_input,
            *author_components,
            *content_components,
            *download_components,
            submit_button,
        ],
    )

demo.launch()