# Source of a Hugging Face Space (Space status when captured: Paused).
import gradio as gr | |
import base64 | |
import json | |
import os | |
import shutil | |
import uuid | |
from huggingface_hub import CommitScheduler, HfApi | |
# Authenticate against the Hugging Face Hub with the token stored in the
# HF_TOKEN environment variable (a KeyError here means the secret is not set).
# The token is handed to the client directly: huggingface_hub releases that
# provide CommitScheduler no longer offer a token-based ``HfApi.login`` method,
# so calling ``api.login(...)`` fails at start-up.
api = HfApi(token=os.environ["HF_TOKEN"])

# Background scheduler that periodically commits the contents of ./data to the
# dataset repository. ``every`` is the interval between commits, in minutes.
scheduler = CommitScheduler(
    repo_id="taesiri/EdgeQuest",
    repo_type="dataset",
    folder_path="./data",
    path_in_repo="data",
    every=1,
    token=api.token,
)
def _text_or_empty(value):
    """Return ``value`` unchanged, or an empty string when it is ``None``."""
    return value if value is not None else ""


def _parse_categories(raw):
    """Split a comma-separated tag string into a list of stripped, non-empty tags."""
    if not raw:
        return []
    stripped = (part.strip() for part in raw.split(","))
    # Drop blanks produced by trailing commas, doubled commas or whitespace-only input.
    return [tag for tag in stripped if tag]


def _store_images(labelled_images, request_folder):
    """Save every provided image into ``request_folder``; return ``[(label, path), ...]``."""
    stored = []
    for label, image in labelled_images:
        if image is None:
            continue
        # NOTE(review): the file is always named *.png, whatever the format of
        # the uploaded image is — confirm this is acceptable downstream.
        target = os.path.join(request_folder, f"{label}.png")
        if isinstance(image, str):
            # The image was handed over as a path on disk.
            shutil.copy2(image, target)
        else:
            # NOTE(review): fallback for non-path image objects, kept from the
            # original code. Confirm that gr.processing_utils.save_image exists
            # in the installed Gradio version before relying on this branch.
            gr.processing_utils.save_image(image, target)
        stored.append((label, target))
    return stored


def _image_entry(label, url_key, payload):
    """Build one ``image_url`` content entry as stored in the request JSON files."""
    return {
        "type": "image_url",
        "label": label,
        "image_url": {"url": {url_key: payload}},
    }


def generate_json_files(
    system_message,
    # New fields
    name,
    email_address,
    institution,
    openreview_profile,
    question_categories,
    subquestion_1_text,
    subquestion_1_answer,
    subquestion_2_text,
    subquestion_2_answer,
    # Existing fields
    question,
    final_answer,
    rationale_text,
    # Question images
    image1,
    image2,
    image3,
    image4,
    # Rationale images
    rationale_image1,
    rationale_image2,
):
    """
    Persist one submission under ``./data/<uuid>/`` and return its two JSON paths.

    For each request:
      1) Create a unique folder under ./data/
      2) Copy uploaded images (question + rationale) into that folder
      3) Produce two JSON files:
         - request_urls.json   (images referenced by relative file path)
         - request_base64.json (images embedded as base64 strings)
      4) Return the paths of both files

    Args:
        system_message: System prompt; falls back to "You are a helpful
            assistant" when empty or ``None``.
        name, email_address, institution, openreview_profile: Author metadata;
            ``None`` is stored as an empty string.
        question_categories: Comma-separated tags; stored as a list of
            stripped, non-empty strings.
        subquestion_1_text, subquestion_1_answer, subquestion_2_text,
        subquestion_2_answer: Optional sub-questions and their answers.
        question, final_answer, rationale_text: Main question, its answer and
            the explanation; ``None`` is stored as an empty string.
        image1, image2, image3, image4: Question images (file paths) or ``None``.
        rationale_image1, rationale_image2: Rationale images (file paths) or
            ``None``.

    Returns:
        Tuple ``(urls_json_path, base64_json_path)``.

    Raises:
        OSError: If the request folder cannot be created or an image or JSON
            file cannot be copied, read or written.
    """
    # 1) Create the parent data folder if it doesn't exist
    parent_data_folder = "./data"
    os.makedirs(parent_data_folder, exist_ok=True)

    # 2) Generate a unique request ID and create its subfolder
    request_id = str(uuid.uuid4())
    request_folder = os.path.join(parent_data_folder, request_id)
    os.makedirs(request_folder)
    # NOTE(review): the files below are written while a CommitScheduler watches
    # ./data; consider holding the scheduler's lock during the writes so that a
    # half-written request is never committed.

    # Handle defaults
    if not system_message:
        system_message = "You are a helpful assistant"

    name = _text_or_empty(name)
    email_address = _text_or_empty(email_address)
    institution = _text_or_empty(institution)
    openreview_profile = _text_or_empty(openreview_profile)
    question_categories = _parse_categories(question_categories)
    subquestion_1_text = _text_or_empty(subquestion_1_text)
    subquestion_1_answer = _text_or_empty(subquestion_1_answer)
    subquestion_2_text = _text_or_empty(subquestion_2_text)
    subquestion_2_answer = _text_or_empty(subquestion_2_answer)
    question = _text_or_empty(question)
    final_answer = _text_or_empty(final_answer)
    rationale_text = _text_or_empty(rationale_text)

    # Save the images that were provided, keeping their original order.
    stored_images = _store_images(
        [
            ("question_image_1", image1),
            ("question_image_2", image2),
            ("question_image_3", image3),
            ("question_image_4", image4),
            ("rationale_image_1", rationale_image1),
            ("rationale_image_2", rationale_image2),
        ],
        request_folder,
    )

    # Build each image entry in two flavors: relative file path vs base64.
    question_images_urls, rationale_images_urls = [], []
    question_images_base64, rationale_images_base64 = [], []
    for label, file_path in stored_images:
        rel_path = os.path.join(".", os.path.basename(file_path))
        with open(file_path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("utf-8")
        url_entry = _image_entry(label, "data:image/png;path", rel_path)
        base64_entry = _image_entry(label, "data:image/png;base64", encoded)
        if label.startswith("question_image"):
            question_images_urls.append(url_entry)
            question_images_base64.append(base64_entry)
        else:
            rationale_images_urls.append(url_entry)
            rationale_images_base64.append(base64_entry)

    # Every request lives in its own folder, so the id inside the file is the
    # same constant for all of them (kept as-is for existing consumers).
    custom_id = "request______1"
    subquestions = [
        {"text": subquestion_1_text, "answer": subquestion_1_answer},
        {"text": subquestion_2_text, "answer": subquestion_2_answer},
    ]

    # A) URLs JSON
    item_urls = {
        "custom_id": custom_id,
        # Metadata at top level
        "name": name,
        "email_address": email_address,
        "institution": institution,
        "openreview_profile": openreview_profile,
        "question_categories": question_categories,
        "question": {
            "messages": [
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "label": "question", "value": question},
                        *question_images_urls,
                    ],
                },
            ],
        },
        "subquestions": subquestions,
        "answer": {
            "final_answer": final_answer,
            "rationale_text": rationale_text,
            "rationale_images": rationale_images_urls,
        },
    }

    # B) Base64 JSON
    item_base64 = {
        "custom_id": custom_id,
        # Metadata at top level
        "name": name,
        "email_address": email_address,
        "institution": institution,
        "openreview_profile": openreview_profile,
        # Question-related fields at top level
        "question_categories": question_categories,
        "subquestions": subquestions,
        "final_answer": final_answer,
        "rationale_text": rationale_text,
        "body": {
            "model": "MODEL_NAME",
            "messages": [
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "field", "label": "question", "value": question},
                        *question_images_base64,
                    ],
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": rationale_text},
                        {"type": "text", "text": final_answer},
                        *rationale_images_base64,
                    ],
                },
            ],
        },
    }

    # 3) Write each record as a single JSON line inside the request folder
    urls_jsonl_path = os.path.join(request_folder, "request_urls.json")
    base64_jsonl_path = os.path.join(request_folder, "request_base64.json")
    with open(urls_jsonl_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(item_urls, ensure_ascii=False) + "\n")
    with open(base64_jsonl_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(item_base64, ensure_ascii=False) + "\n")

    # 4) Return the two file paths to the caller
    return urls_jsonl_path, base64_jsonl_path
# Build the Gradio app.
# NOTE(review): the nesting of the ``with`` blocks below was reconstructed from
# a source whose indentation had been flattened — confirm the layout (in
# particular which text boxes share a row) against the running Space.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Builder")

    # Collapsible help panel listing required vs optional inputs.
    with gr.Accordion("Instructions", open=True):
        gr.HTML(
            """
            <h3>Instructions:</h3>
            <p>Welcome to the Hugging Face space for collecting questions for new benchmark datasets.</p>
            <table style="width:100%; border-collapse: collapse; margin: 10px 0;">
            <tr>
            <th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
            Required Fields
            </th>
            <th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
            Optional Fields
            </th>
            </tr>
            <tr>
            <td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
            <ul style="margin: 0;">
            <li>Author Information</li>
            <li>At least <b>one question image</b></li>
            <li>The <b>question text</b></li>
            <li>The <b>final answer</b></li>
            </ul>
            </td>
            <td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
            <ul style="margin: 0;">
            <li>Up to four question images</li>
            <li>Supporting images for your answer</li>
            <li><b>Rationale text</b> to explain your reasoning</li>
            <li><b>Sub-questions</b> with their answers</li>
            </ul>
            </td>
            </tr>
            </table>
            <p>While not all fields are mandatory, providing additional context through optional fields will help create a more comprehensive dataset. After submitting a question, you can clear up the form to submit another one.</p>
            """
        )

    # --- Author details -----------------------------------------------------
    gr.Markdown("## Author Information")
    with gr.Row():
        name_input = gr.Textbox(label="Name", lines=1)
        email_address_input = gr.Textbox(label="Email Address", lines=1)
        institution_input = gr.Textbox(
            label="Institution or 'Independent'",
            lines=1,
            placeholder="e.g. MIT, Google, Independent, etc.",
        )
        openreview_profile_input = gr.Textbox(
            label="OpenReview Profile Name",
            lines=1,
            placeholder="Your OpenReview username or profile name",
        )

    # --- Question -----------------------------------------------------------
    gr.Markdown("## Question Information")
    # One tab per question image; each component yields a file path.
    with gr.Tabs():
        with gr.Tab("Image 1"):
            image1 = gr.Image(label="Question Image 1", type="filepath")
        with gr.Tab("Image 2 (Optional)"):
            image2 = gr.Image(label="Question Image 2", type="filepath")
        with gr.Tab("Image 3 (Optional)"):
            image3 = gr.Image(label="Question Image 3", type="filepath")
        with gr.Tab("Image 4 (Optional)"):
            image4 = gr.Image(label="Question Image 4", type="filepath")
    question_input = gr.Textbox(
        label="Question", lines=15, placeholder="Type your question here..."
    )
    question_categories_input = gr.Textbox(
        label="Question Categories",
        lines=1,
        placeholder="Comma-separated tags, e.g. math, geometry",
    )

    # --- Answer -------------------------------------------------------------
    gr.Markdown("## Answer ")
    final_answer_input = gr.Textbox(
        label="Final Answer",
        lines=1,
        placeholder="Enter the short/concise final answer...",
    )
    rationale_text_input = gr.Textbox(
        label="Rationale Text",
        lines=5,
        placeholder="Enter the reasoning or explanation for the answer...",
    )
    # One tab per rationale image.
    with gr.Tabs():
        with gr.Tab("Rationale 1 (Optional)"):
            rationale_image1 = gr.Image(label="Rationale Image 1", type="filepath")
        with gr.Tab("Rationale 2 (Optional)"):
            rationale_image2 = gr.Image(label="Rationale Image 2", type="filepath")

    # --- Sub-questions --------------------------------------------------------
    gr.Markdown("## Subquestions")
    with gr.Row():
        subquestion_1_text_input = gr.Textbox(
            label="Subquestion 1 Text", lines=2, placeholder="First sub-question..."
        )
        subquestion_1_answer_input = gr.Textbox(
            label="Subquestion 1 Answer",
            lines=2,
            placeholder="Answer to sub-question 1...",
        )
    with gr.Row():
        subquestion_2_text_input = gr.Textbox(
            label="Subquestion 2 Text", lines=2, placeholder="Second sub-question..."
        )
        subquestion_2_answer_input = gr.Textbox(
            label="Subquestion 2 Answer",
            lines=2,
            placeholder="Answer to sub-question 2...",
        )

    system_message_input = gr.Textbox(
        label="System Message",
        value="You are a helpful assistant",
        lines=2,
        placeholder="Enter the system message that defines the AI assistant's role and behavior...",
    )

    # --- Actions and (hidden) download slots -----------------------------------
    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Form")
    with gr.Row():
        output_file_urls = gr.File(
            label="Download URLs JSON", interactive=False, visible=False
        )
        output_file_base64 = gr.File(
            label="Download Base64 JSON", interactive=False, visible=False
        )

    def validate_and_generate(
        system_message,
        author_name,
        author_email,
        author_institution,
        author_openreview,
        categories,
        sub1_text,
        sub1_answer,
        sub2_text,
        sub2_answer,
        question_text,
        answer_text,
        rationale,
        question_img1,
        question_img2,
        question_img3,
        question_img4,
        rationale_img1,
        rationale_img2,
    ):
        """Check the required inputs, then write the submission to disk.

        Shows a warning and leaves the submit button enabled when a required
        field is missing; otherwise calls ``generate_json_files``, shows a
        success notice and returns an update that disables the submit button.
        """

        def is_blank(text):
            return not text or not text.strip()

        # (display label, "is it missing?") in the order they are reported.
        required_checks = (
            ("Name", is_blank(author_name)),
            ("Email Address", is_blank(author_email)),
            ("Institution", is_blank(author_institution)),
            ("Question", is_blank(question_text)),
            ("Final Answer", is_blank(answer_text)),
            ("First Question Image", not question_img1),
        )
        missing_fields = [label for label, absent in required_checks if absent]

        if missing_fields:
            gr.Warning(
                f"Required fields missing: {', '.join(missing_fields)} ⛔️",
                duration=5,
            )
            # Keep the button usable so the author can fix the form and retry.
            return gr.Button(interactive=True)

        # Validation passed: persist the item (the returned paths are not used).
        generate_json_files(
            system_message,
            author_name,
            author_email,
            author_institution,
            author_openreview,
            categories,
            sub1_text,
            sub1_answer,
            sub2_text,
            sub2_answer,
            question_text,
            answer_text,
            rationale,
            question_img1,
            question_img2,
            question_img3,
            question_img4,
            rationale_img1,
            rationale_img2,
        )
        gr.Info(
            "Dataset item created successfully! 🎉, Clear the form to submit a new one"
        )
        # Block a second submission of the same item until the form is cleared.
        return gr.update(interactive=False)

    # Every form component, in the positional order validate_and_generate expects.
    form_fields = [
        system_message_input,
        name_input,
        email_address_input,
        institution_input,
        openreview_profile_input,
        question_categories_input,
        subquestion_1_text_input,
        subquestion_1_answer_input,
        subquestion_2_text_input,
        subquestion_2_answer_input,
        question_input,
        final_answer_input,
        rationale_text_input,
        image1,
        image2,
        image3,
        image4,
        rationale_image1,
        rationale_image2,
    ]

    submit_button.click(
        fn=validate_and_generate,
        inputs=form_fields,
        outputs=[submit_button],
    )

    def clear_form_fields(sys_msg, name, email, inst, openreview, *args):
        """Reset the form while keeping the author's personal details.

        Returns one value per component wired as an output of the clear button:
        the default system message, the four preserved author fields, ``None``
        for the 14 remaining inputs and the 2 download slots, and an update
        that re-enables the submit button.
        """
        kept_author_fields = [name, email, inst, openreview]
        # categories, 4 sub-question boxes, question, final answer, rationale,
        # 4 question images, 2 rationale images, 2 output files
        emptied = [None] * 16
        return [
            "You are a helpful assistant",
            *kept_author_fields,
            *emptied,
            gr.update(interactive=True),
        ]

    clear_button.click(
        fn=clear_form_fields,
        # Only the system message and the author fields are read back.
        inputs=form_fields[:5],
        outputs=form_fields + [output_file_urls, output_file_base64, submit_button],
    )

demo.launch()