taesiri committed on
Commit
5943a36
·
1 Parent(s): ae2c2e9
Files changed (1) hide show
  1. app.py +267 -27
app.py CHANGED
@@ -4,6 +4,8 @@ import json
4
  import os
5
  import shutil
6
  import uuid
 
 
7
  from huggingface_hub import CommitScheduler, HfApi
8
 
9
  api = HfApi(token=os.environ["HF_TOKEN"])
@@ -17,6 +19,112 @@ scheduler = CommitScheduler(
17
  )
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def generate_json_files(
21
  name,
22
  email_address,
@@ -44,25 +152,41 @@ def generate_json_files(
44
  image4,
45
  rationale_image1,
46
  rationale_image2,
 
47
  ):
48
  """
49
  For each request:
50
- 1) Create a unique folder under ./data/
51
  2) Copy uploaded images (question + rationale) into that folder
52
- 3) Produce two JSON files:
53
- - question.json (local file paths in content)
54
- - request_base64.json (base64-encoded images in content)
55
- 4) Return paths to both files for Gradio to provide as download links
56
  """
57
 
58
- # 1) Create parent data folder if it doesn't exist
 
 
 
59
  parent_data_folder = "./data"
60
  os.makedirs(parent_data_folder, exist_ok=True)
61
 
62
- # 2) Generate a unique request ID and create a subfolder
63
- request_id = str(uuid.uuid4()) # unique ID
64
  request_folder = os.path.join(parent_data_folder, request_id)
65
- os.makedirs(request_folder)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  # Convert None strings
68
  def safe_str(val):
@@ -104,21 +228,71 @@ def generate_json_files(
104
  ("rationale_image_2", rationale_image2),
105
  ]
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  files_list = []
108
  for idx, (img_label, img_obj) in enumerate(all_images):
109
  if img_obj is not None:
110
  temp_path = os.path.join(request_folder, f"{img_label}.png")
111
  if isinstance(img_obj, str):
112
  # If image is a file path
113
- shutil.copy2(img_obj, temp_path)
 
 
 
 
 
114
  else:
115
  # If image is a numpy array
116
  gr.processing_utils.save_image(img_obj, temp_path)
117
- # Keep track of the saved path + label
118
- files_list.append((img_label, temp_path))
119
 
120
  # Build user content in two flavors: local file paths vs base64
121
- # We’ll store text fields as simple dictionaries, and then images separately.
122
  content_list_urls = [
123
  {"type": "field", "label": "name", "value": name},
124
  {"type": "field", "label": "email_address", "value": email_address},
@@ -218,7 +392,7 @@ def generate_json_files(
218
  # Convert each to JSON line format
219
  urls_json_line = json.dumps(item_urls, ensure_ascii=False)
220
 
221
- # 3) Write out two JSON files in request_folder
222
  urls_jsonl_path = os.path.join(request_folder, "question.json")
223
 
224
  with open(urls_jsonl_path, "w", encoding="utf-8") as f:
@@ -230,6 +404,7 @@ def generate_json_files(
230
  # Build the Gradio app
231
  with gr.Blocks() as demo:
232
  gr.Markdown("# Dataset Builder")
 
233
  with gr.Accordion("Instructions", open=True):
234
  gr.HTML(
235
  """
@@ -436,7 +611,59 @@ with gr.Blocks() as demo:
436
  label="Download Base64 JSON", interactive=False, visible=False
437
  )
438
 
439
- # On Submit, we call generate_json_files with all relevant fields
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  def validate_and_generate(
441
  nm,
442
  em,
@@ -464,8 +691,9 @@ with gr.Blocks() as demo:
464
  i4,
465
  ri1,
466
  ri2,
 
467
  ):
468
- # Check all required fields
469
  missing_fields = []
470
  if not nm or not nm.strip():
471
  missing_fields.append("Name")
@@ -492,14 +720,18 @@ with gr.Blocks() as demo:
492
  if not sq5t or not sq5t.strip() or not sq5a or not sq5a.strip():
493
  missing_fields.append("Fifth Sub-question and Answer")
494
 
495
- # If any required fields are missing, return a warning and keep all fields as is
496
  if missing_fields:
497
  warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
498
- # Return all inputs unchanged plus the warning
499
  gr.Warning(warning_msg, duration=5)
500
- return gr.Button(interactive=True)
 
 
 
 
 
 
 
501
 
502
- # Only after successful validation, generate files but keep all fields
503
  results = generate_json_files(
504
  nm,
505
  em,
@@ -527,14 +759,19 @@ with gr.Blocks() as demo:
527
  i4,
528
  ri1,
529
  ri2,
 
530
  )
531
 
 
532
  gr.Info(
533
- "Dataset item created successfully! πŸŽ‰, Clear the form to submit a new one"
534
  )
535
 
536
- return gr.update(interactive=False)
 
 
537
 
 
538
  submit_button.click(
539
  fn=validate_and_generate,
540
  inputs=[
@@ -564,15 +801,14 @@ with gr.Blocks() as demo:
564
  image4,
565
  rationale_image1,
566
  rationale_image2,
 
567
  ],
568
- outputs=[submit_button],
569
  )
570
 
571
- # Clear button functionality
572
  def clear_form_fields(name, email, inst, openreview, authorship, *args):
573
- # Preserve personal info fields and re-enable submit button
574
- gr.Info("Form cleared! Ready for new submission πŸ”„")
575
- return [
576
  name, # Preserve name
577
  email, # Preserve email
578
  inst, # Preserve institution
@@ -601,7 +837,10 @@ with gr.Blocks() as demo:
601
  None, # Clear rationale image2
602
  None, # Clear output file urls
603
  gr.Button(interactive=True), # Re-enable submit button
 
604
  ]
 
 
605
 
606
  clear_button.click(
607
  fn=clear_form_fields,
@@ -641,6 +880,7 @@ with gr.Blocks() as demo:
641
  rationale_image2,
642
  output_file_urls,
643
  submit_button,
 
644
  ],
645
  )
646
 
 
4
  import os
5
  import shutil
6
  import uuid
7
+ import glob
8
+
9
  from huggingface_hub import CommitScheduler, HfApi
10
 
11
  api = HfApi(token=os.environ["HF_TOKEN"])
 
19
  )
20
 
21
 
22
def load_existing_questions():
    """
    Scan ./data for saved questions and build the dropdown choices.

    Every sub-directory of ./data that contains a readable ``question.json``
    contributes one entry. Records that cannot be read or parsed, or that
    lack a usable ``question`` field, are skipped so a single corrupt file
    never hides the others.

    Returns:
        A list of ``(question_id, "question_id: preview")`` tuples sorted by
        the second element. ``preview`` is the question text, truncated to
        100 characters plus "..." when longer. Returns an empty list when
        ./data does not exist.
    """
    # NOTE(review): Gradio treats tuple choices as (displayed name, value), so
    # the dropdown presumably shows the bare ID and passes "id: preview" as the
    # value -- confirm this is the intended presentation.
    data_dir = "./data"
    questions = []
    if not os.path.exists(data_dir):
        return questions

    for question_dir in glob.glob(os.path.join(data_dir, "*")):
        json_path = os.path.join(question_dir, "question.json")
        if not (os.path.isdir(question_dir) and os.path.exists(json_path)):
            continue

        try:
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.loads(f.read().strip())
            question_text = data["question"]
            preview = (
                f"{question_text[:100]}..."
                if len(question_text) > 100
                else question_text
            )
        except (OSError, ValueError, KeyError, TypeError):
            # Unreadable file, invalid JSON/encoding, or unexpected schema:
            # best-effort listing, so skip just this record.
            continue

        question_id = os.path.basename(question_dir)
        questions.append((question_id, f"{question_id}: {preview}"))

    return sorted(questions, key=lambda x: x[1])
50
+
51
+
52
def load_question_data(question_id):
    """
    Load one saved question and return its values in form-field order.

    Args:
        question_id: Either a bare question ID or a dropdown value of the
            form "<id>: <preview>"; only the part before the first colon is
            used as the folder name under ./data.

    Returns:
        A list of 26 values: name, email, institution, OpenReview profile,
        authorship flag, comma-joined categories, five sub-question
        text/answer pairs (missing ones default to "N/A"), question, final
        answer, rationale text, image attribution, four question-image paths
        and two rationale-image paths. Image entries are paths inside the
        question folder, or None when the file is absent. If the ID is empty
        or invalid, the record is missing, or loading fails, every entry is
        None.
    """
    num_fields = 26  # number of values returned for the form fields

    if not question_id:
        return [None] * num_fields

    # Extract the ID part before the colon from the dropdown selection
    question_id = (
        question_id.split(":")[0].strip() if ":" in question_id else question_id
    )

    # The ID becomes a path component: refuse anything that could step
    # outside ./data (separators, "." or "..") instead of opening it.
    if (
        question_id in ("", ".", "..")
        or os.path.basename(question_id) != question_id
    ):
        print(f"Invalid question id: {question_id!r}")
        return [None] * num_fields

    question_dir = os.path.join("./data", question_id)
    json_path = os.path.join(question_dir, "question.json")
    if not os.path.exists(json_path):
        print(f"Question file not found: {json_path}")
        return [None] * num_fields

    def load_image(image_path):
        # Resolve a stored image reference to a file in this question's
        # folder; only the base name of the stored path is trusted.
        if not image_path:
            return None
        full_path = os.path.join(question_dir, os.path.basename(image_path))
        return full_path if os.path.exists(full_path) else None

    def item_at(items, index):
        # items[index] when present, otherwise None.
        return items[index] if len(items) > index else None

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.loads(f.read().strip())

        author = data["author_info"]

        # Convert authorship_interest to boolean if it's a string
        authorship = author.get("authorship_interest", False)
        if isinstance(authorship, str):
            authorship = authorship.lower() == "true"

        categories = data["question_categories"]
        if isinstance(categories, list):
            categories = ",".join(categories)

        subquestion_values = []
        for number in range(1, 6):
            subquestion_values.append(
                data.get(f"subquestions_{number}_text", "N/A")
            )
            subquestion_values.append(
                data.get(f"subquestions_{number}_answer", "N/A")
            )

        question_images = data.get("question_images", [])
        rationale_images = data.get("rationale_images", [])

        return [
            author["name"],
            author["email_address"],
            author["institution"],
            author.get("openreview_profile", ""),
            authorship,
            categories,
            *subquestion_values,
            data["question"],
            data["final_answer"],
            data.get("rationale_text", ""),
            data["image_attribution"],
            *[load_image(item_at(question_images, i)) for i in range(4)],
            *[load_image(item_at(rationale_images, i)) for i in range(2)],
        ]
    except Exception as e:
        # UI callback boundary: log the problem and return an all-None
        # result rather than raising out of the event handler.
        print(f"Error loading question {question_id}: {str(e)}")
        return [None] * num_fields
126
+
127
+
128
  def generate_json_files(
129
  name,
130
  email_address,
 
152
  image4,
153
  rationale_image1,
154
  rationale_image2,
155
+ existing_id=None, # New parameter for updating existing questions
156
  ):
157
  """
158
  For each request:
159
+ 1) Create a unique folder under ./data/ (or use existing if updating)
160
  2) Copy uploaded images (question + rationale) into that folder
161
+ 3) Produce JSON file with question data
162
+ 4) Return path to the JSON file
 
 
163
  """
164
 
165
+ # Use existing ID if updating, otherwise generate new one
166
+ request_id = existing_id if existing_id else str(uuid.uuid4())
167
+
168
+ # Create parent data folder if it doesn't exist
169
  parent_data_folder = "./data"
170
  os.makedirs(parent_data_folder, exist_ok=True)
171
 
172
+ # Create or clean request folder
 
173
  request_folder = os.path.join(parent_data_folder, request_id)
174
+ if os.path.exists(request_folder):
175
+ # If updating, remove old image files but only if new images are provided
176
+ for f in glob.glob(os.path.join(request_folder, "*.png")):
177
+ # Only remove if we have a new image to replace it
178
+ filename = os.path.basename(f)
179
+ if (
180
+ ("question_image_1" in filename and image1)
181
+ or ("question_image_2" in filename and image2)
182
+ or ("question_image_3" in filename and image3)
183
+ or ("question_image_4" in filename and image4)
184
+ or ("rationale_image_1" in filename and rationale_image1)
185
+ or ("rationale_image_2" in filename and rationale_image2)
186
+ ):
187
+ os.remove(f)
188
+ else:
189
+ os.makedirs(request_folder)
190
 
191
  # Convert None strings
192
  def safe_str(val):
 
228
  ("rationale_image_2", rationale_image2),
229
  ]
230
 
231
+ # If updating, load existing images that haven't been replaced
232
+ if existing_id:
233
+ json_path = os.path.join(parent_data_folder, existing_id, "question.json")
234
+ if os.path.exists(json_path):
235
+ try:
236
+ with open(json_path, "r", encoding="utf-8") as f:
237
+ existing_data = json.loads(f.read().strip())
238
+ existing_question_images = existing_data.get("question_images", [])
239
+ existing_rationale_images = existing_data.get(
240
+ "rationale_images", []
241
+ )
242
+
243
+ # Keep existing images if no new ones provided
244
+ if not image1 and existing_question_images:
245
+ all_images[0] = (
246
+ "question_image_1",
247
+ existing_question_images[0],
248
+ )
249
+ if not image2 and len(existing_question_images) > 1:
250
+ all_images[1] = (
251
+ "question_image_2",
252
+ existing_question_images[1],
253
+ )
254
+ if not image3 and len(existing_question_images) > 2:
255
+ all_images[2] = (
256
+ "question_image_3",
257
+ existing_question_images[2],
258
+ )
259
+ if not image4 and len(existing_question_images) > 3:
260
+ all_images[3] = (
261
+ "question_image_4",
262
+ existing_question_images[3],
263
+ )
264
+ if not rationale_image1 and existing_rationale_images:
265
+ all_images[4] = (
266
+ "rationale_image_1",
267
+ existing_rationale_images[0],
268
+ )
269
+ if not rationale_image2 and len(existing_rationale_images) > 1:
270
+ all_images[5] = (
271
+ "rationale_image_2",
272
+ existing_rationale_images[1],
273
+ )
274
+ except:
275
+ pass
276
+
277
  files_list = []
278
  for idx, (img_label, img_obj) in enumerate(all_images):
279
  if img_obj is not None:
280
  temp_path = os.path.join(request_folder, f"{img_label}.png")
281
  if isinstance(img_obj, str):
282
  # If image is a file path
283
+ if os.path.exists(img_obj):
284
+ if (
285
+ img_obj != temp_path
286
+ ): # Only copy if source and destination are different
287
+ shutil.copy2(img_obj, temp_path)
288
+ files_list.append((img_label, temp_path))
289
  else:
290
  # If image is a numpy array
291
  gr.processing_utils.save_image(img_obj, temp_path)
292
+ files_list.append((img_label, temp_path))
 
293
 
294
  # Build user content in two flavors: local file paths vs base64
295
+ # We'll store text fields as simple dictionaries, and then images separately.
296
  content_list_urls = [
297
  {"type": "field", "label": "name", "value": name},
298
  {"type": "field", "label": "email_address", "value": email_address},
 
392
  # Convert each to JSON line format
393
  urls_json_line = json.dumps(item_urls, ensure_ascii=False)
394
 
395
+ # 3) Write out JSON file in request_folder
396
  urls_jsonl_path = os.path.join(request_folder, "question.json")
397
 
398
  with open(urls_jsonl_path, "w", encoding="utf-8") as f:
 
404
  # Build the Gradio app
405
  with gr.Blocks() as demo:
406
  gr.Markdown("# Dataset Builder")
407
+
408
  with gr.Accordion("Instructions", open=True):
409
  gr.HTML(
410
  """
 
611
  label="Download Base64 JSON", interactive=False, visible=False
612
  )
613
 
614
+ with gr.Accordion("Load Existing Question", open=False):
615
+ gr.Markdown("## Load Existing Question")
616
+
617
+ with gr.Row():
618
+ existing_questions = gr.Dropdown(
619
+ label="Load Existing Question",
620
+ choices=load_existing_questions(),
621
+ type="value",
622
+ allow_custom_value=False,
623
+ )
624
+ refresh_button = gr.Button("πŸ”„ Refresh")
625
+ load_button = gr.Button("Load Selected Question")
626
+
627
def refresh_questions():
    """Re-read the saved questions and return a Dropdown carrying the fresh choices."""
    current_choices = load_existing_questions()
    return gr.Dropdown(choices=current_choices)
629
+
630
+ refresh_button.click(fn=refresh_questions, inputs=[], outputs=[existing_questions])
631
+
632
+ # Load button functionality
633
+ load_button.click(
634
+ fn=load_question_data,
635
+ inputs=[existing_questions],
636
+ outputs=[
637
+ name_input,
638
+ email_address_input,
639
+ institution_input,
640
+ openreview_profile_input,
641
+ authorship_input,
642
+ question_categories_input,
643
+ subquestion_1_text_input,
644
+ subquestion_1_answer_input,
645
+ subquestion_2_text_input,
646
+ subquestion_2_answer_input,
647
+ subquestion_3_text_input,
648
+ subquestion_3_answer_input,
649
+ subquestion_4_text_input,
650
+ subquestion_4_answer_input,
651
+ subquestion_5_text_input,
652
+ subquestion_5_answer_input,
653
+ question_input,
654
+ final_answer_input,
655
+ rationale_text_input,
656
+ image_attribution_input,
657
+ image1,
658
+ image2,
659
+ image3,
660
+ image4,
661
+ rationale_image1,
662
+ rationale_image2,
663
+ ],
664
+ )
665
+
666
+ # Modify validate_and_generate to handle updates
667
  def validate_and_generate(
668
  nm,
669
  em,
 
691
  i4,
692
  ri1,
693
  ri2,
694
+ selected_question_id,
695
  ):
696
+ # Validation code remains the same
697
  missing_fields = []
698
  if not nm or not nm.strip():
699
  missing_fields.append("Name")
 
720
  if not sq5t or not sq5t.strip() or not sq5a or not sq5a.strip():
721
  missing_fields.append("Fifth Sub-question and Answer")
722
 
 
723
  if missing_fields:
724
  warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
 
725
  gr.Warning(warning_msg, duration=5)
726
+ return gr.Button(interactive=True), gr.Dropdown(
727
+ choices=load_existing_questions()
728
+ )
729
+
730
+ # Extract question ID if updating existing question
731
+ existing_id = (
732
+ selected_question_id.split(":")[0].strip() if selected_question_id else None
733
+ )
734
 
 
735
  results = generate_json_files(
736
  nm,
737
  em,
 
759
  i4,
760
  ri1,
761
  ri2,
762
+ existing_id,
763
  )
764
 
765
+ action = "updated" if existing_id else "created"
766
  gr.Info(
767
+ f"Dataset item {action} successfully! πŸŽ‰ Clear the form to submit a new one"
768
  )
769
 
770
+ return gr.update(interactive=False), gr.Dropdown(
771
+ choices=load_existing_questions()
772
+ )
773
 
774
+ # Update submit button click handler to include selected question
775
  submit_button.click(
776
  fn=validate_and_generate,
777
  inputs=[
 
801
  image4,
802
  rationale_image1,
803
  rationale_image2,
804
+ existing_questions, # Add selected question to inputs
805
  ],
806
+ outputs=[submit_button, existing_questions], # Update dropdown after submit
807
  )
808
 
809
+ # Update clear button to also clear selected question
810
  def clear_form_fields(name, email, inst, openreview, authorship, *args):
811
+ outputs = [
 
 
812
  name, # Preserve name
813
  email, # Preserve email
814
  inst, # Preserve institution
 
837
  None, # Clear rationale image2
838
  None, # Clear output file urls
839
  gr.Button(interactive=True), # Re-enable submit button
840
+ gr.update(value=None), # Clear selected question
841
  ]
842
+ gr.Info("Form cleared! Ready for new submission πŸ”„")
843
+ return outputs
844
 
845
  clear_button.click(
846
  fn=clear_form_fields,
 
880
  rationale_image2,
881
  output_file_urls,
882
  submit_button,
883
+ existing_questions,
884
  ],
885
  )
886