# SubmitQuestions / app.py
# Hugging Face Space file view: uploader taesiri, commit f9bcc24 ("backup"), 21 kB.
import gradio as gr
import base64
import json
import os
import shutil
import uuid
from huggingface_hub import CommitScheduler, HfApi
# Hub credentials come from the HF_TOKEN secret; a missing secret still fails
# fast with KeyError at import time, exactly as before.
_HF_TOKEN = os.environ["HF_TOKEN"]

# `HfApi` does not offer a token-based `login(token)` method, so the token is
# handed to the client through its constructor instead.
api = HfApi(token=_HF_TOKEN)

# Background job that commits everything under ./data to the dataset repo.
# `every` is expressed in minutes.
scheduler = CommitScheduler(
    repo_id="taesiri/EdgeQuest",
    repo_type="dataset",
    folder_path="./data",
    path_in_repo="data",
    every=1,
    token=_HF_TOKEN,
)
def _text_or_empty(value):
    """Return *value* unchanged, or an empty string when it is ``None``."""
    return "" if value is None else value


def _parse_categories(raw):
    """Split a comma-separated tag string into stripped, non-empty tags."""
    if not raw:
        return []
    stripped = (piece.strip() for piece in raw.split(","))
    # Drop blanks produced by trailing or doubled commas ("math, ,").
    return [tag for tag in stripped if tag]


def _store_images(labelled_images, request_folder):
    """Save each provided image into *request_folder*; return ``(label, path)`` pairs."""
    stored = []
    for label, image in labelled_images:
        if image is None:
            continue
        # NOTE(review): every file is named ``<label>.png`` regardless of the
        # uploaded format; kept as-is so existing consumers see the same names.
        target = os.path.join(request_folder, f"{label}.png")
        if isinstance(image, str):
            # Image delivered as a path on disk: copy it next to the JSON files.
            shutil.copy2(image, target)
        else:
            # Non-path image object (e.g. an array): delegate to Gradio.
            gr.processing_utils.save_image(image, target)
        stored.append((label, target))
    return stored


def _read_base64(path):
    """Return the contents of *path* as a base64-encoded UTF-8 string."""
    with open(path, "rb") as handle:
        return base64.b64encode(handle.read()).decode("utf-8")


def _image_entry(label, url_key, url_value):
    """Build one ``image_url`` content item in the schema used by both outputs."""
    return {
        "type": "image_url",
        "label": label,
        "image_url": {"url": {url_key: url_value}},
    }


def _entries_with_label(entries, fragment):
    """Return the entries whose label contains *fragment*, in original order."""
    return [entry for entry in entries if fragment in entry["label"]]


def _write_json_line(path, payload):
    """Serialize *payload* as a single JSON line (UTF-8, non-ASCII preserved)."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(payload, ensure_ascii=False) + "\n")


def generate_json_files(
    system_message,
    # Author / metadata fields
    name,
    email_address,
    institution,
    openreview_profile,
    question_categories,
    subquestion_1_text,
    subquestion_1_answer,
    subquestion_2_text,
    subquestion_2_answer,
    # Question and answer text
    question,
    final_answer,
    rationale_text,
    # Question images
    image1,
    image2,
    image3,
    image4,
    # Rationale images
    rationale_image1,
    rationale_image2,
):
    """
    Persist one submission under ./data/<uuid>/ and describe it in two JSON files.

    Steps:
    1) Create a fresh, uniquely named folder under ./data/.
    2) Copy (or save) every provided question/rationale image into that folder
       as ``<label>.png``.
    3) Write ``request_urls.json`` (images referenced by relative path) and
       ``request_base64.json`` (images embedded as base64).

    Text arguments may be ``None``; they are stored as empty strings. A falsy
    ``system_message`` falls back to "You are a helpful assistant".
    ``question_categories`` is a comma-separated string that is stored as a
    list of stripped, non-empty tags. Image arguments are either ``None``
    (skipped), a file path string, or another image object handed to Gradio
    for saving.

    Returns:
        A ``(urls_json_path, base64_json_path)`` tuple of the written files.
    """
    # 1) Parent folder may already exist; the per-request folder must not.
    parent_data_folder = "./data"
    os.makedirs(parent_data_folder, exist_ok=True)
    request_folder = os.path.join(parent_data_folder, str(uuid.uuid4()))
    os.makedirs(request_folder)

    # Normalize inputs.
    if not system_message:
        system_message = "You are a helpful assistant"
    name = _text_or_empty(name)
    email_address = _text_or_empty(email_address)
    institution = _text_or_empty(institution)
    openreview_profile = _text_or_empty(openreview_profile)
    question_categories = _parse_categories(question_categories)
    question = _text_or_empty(question)
    final_answer = _text_or_empty(final_answer)
    rationale_text = _text_or_empty(rationale_text)
    subquestions = [
        {
            "text": _text_or_empty(subquestion_1_text),
            "answer": _text_or_empty(subquestion_1_answer),
        },
        {
            "text": _text_or_empty(subquestion_2_text),
            "answer": _text_or_empty(subquestion_2_answer),
        },
    ]

    # 2) Store images, then describe each one in both flavors.
    stored_images = _store_images(
        [
            ("question_image_1", image1),
            ("question_image_2", image2),
            ("question_image_3", image3),
            ("question_image_4", image4),
            ("rationale_image_1", rationale_image1),
            ("rationale_image_2", rationale_image2),
        ],
        request_folder,
    )
    path_entries = [
        _image_entry(
            label,
            "data:image/png;path",
            # Path is relative to the request folder, where the JSON lives too.
            os.path.join(".", os.path.basename(path)),
        )
        for label, path in stored_images
    ]
    base64_entries = [
        _image_entry(label, "data:image/png;base64", _read_base64(path))
        for label, path in stored_images
    ]

    # Every request file holds exactly one item, so the id suffix is fixed.
    custom_id = "request______1"

    # A) Path-referencing JSON
    item_urls = {
        "custom_id": custom_id,
        # Metadata at top level
        "name": name,
        "email_address": email_address,
        "institution": institution,
        "openreview_profile": openreview_profile,
        "question_categories": question_categories,
        "question": {
            "messages": [
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "label": "question", "value": question}
                    ]
                    + _entries_with_label(path_entries, "question_image"),
                },
            ],
        },
        "subquestions": subquestions,
        "answer": {
            "final_answer": final_answer,
            "rationale_text": rationale_text,
            "rationale_images": _entries_with_label(
                path_entries, "rationale_image"
            ),
        },
    }

    # B) Base64-embedding JSON
    item_base64 = {
        "custom_id": custom_id,
        # Metadata at top level
        "name": name,
        "email_address": email_address,
        "institution": institution,
        "openreview_profile": openreview_profile,
        # Question-related fields at top level
        "question_categories": question_categories,
        "subquestions": subquestions,
        "final_answer": final_answer,
        "rationale_text": rationale_text,
        "body": {
            "model": "MODEL_NAME",
            "messages": [
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": [
                        {"type": "field", "label": "question", "value": question}
                    ]
                    + _entries_with_label(base64_entries, "question_image"),
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": rationale_text},
                        {"type": "text", "text": final_answer},
                        *_entries_with_label(base64_entries, "rationale_image"),
                    ],
                },
            ],
        },
    }

    # 3) Write both files next to the stored images.
    urls_jsonl_path = os.path.join(request_folder, "request_urls.json")
    base64_jsonl_path = os.path.join(request_folder, "request_base64.json")
    _write_json_line(urls_jsonl_path, item_urls)
    _write_json_line(base64_jsonl_path, item_base64)

    # Return the two file paths so Gradio can offer them as downloads
    return urls_jsonl_path, base64_jsonl_path
# Build the Gradio app
# NOTE(review): the nesting of rows/tabs below is reconstructed from statement
# order; confirm the layout (in particular which fields share the author row).

# Default shown in the "System Message" box and restored by "Clear Form".
DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"

# Static instructions panel rendered at the top of the form.
INSTRUCTIONS_HTML = """
<h3>Instructions:</h3>
<p>Welcome to the Hugging Face space for collecting questions for new benchmark datasets.</p>
<table style="width:100%; border-collapse: collapse; margin: 10px 0;">
<tr>
<th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
Required Fields
</th>
<th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
Optional Fields
</th>
</tr>
<tr>
<td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
<ul style="margin: 0;">
<li>Author Information</li>
<li>At least <b>one question image</b></li>
<li>The <b>question text</b></li>
<li>The <b>final answer</b></li>
</ul>
</td>
<td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
<ul style="margin: 0;">
<li>Up to four question images</li>
<li>Supporting images for your answer</li>
<li><b>Rationale text</b> to explain your reasoning</li>
<li><b>Sub-questions</b> with their answers</li>
</ul>
</td>
</tr>
</table>
<p>While not all fields are mandatory, providing additional context through optional fields will help create a more comprehensive dataset. After submitting a question, you can clear up the form to submit another one.</p>
"""

with gr.Blocks() as demo:
    gr.Markdown("# Dataset Builder")
    with gr.Accordion("Instructions", open=True):
        gr.HTML(INSTRUCTIONS_HTML)

    gr.Markdown("## Author Information")
    with gr.Row():
        name_input = gr.Textbox(label="Name", lines=1)
        email_address_input = gr.Textbox(label="Email Address", lines=1)
        institution_input = gr.Textbox(
            label="Institution or 'Independent'",
            lines=1,
            placeholder="e.g. MIT, Google, Independent, etc.",
        )
        openreview_profile_input = gr.Textbox(
            label="OpenReview Profile Name",
            lines=1,
            placeholder="Your OpenReview username or profile name",
        )

    gr.Markdown("## Question Information")
    # Question Images - Individual Tabs
    with gr.Tabs():
        with gr.Tab("Image 1"):
            image1 = gr.Image(label="Question Image 1", type="filepath")
        with gr.Tab("Image 2 (Optional)"):
            image2 = gr.Image(label="Question Image 2", type="filepath")
        with gr.Tab("Image 3 (Optional)"):
            image3 = gr.Image(label="Question Image 3", type="filepath")
        with gr.Tab("Image 4 (Optional)"):
            image4 = gr.Image(label="Question Image 4", type="filepath")
    question_input = gr.Textbox(
        label="Question", lines=15, placeholder="Type your question here..."
    )
    question_categories_input = gr.Textbox(
        label="Question Categories",
        lines=1,
        placeholder="Comma-separated tags, e.g. math, geometry",
    )

    # Answer Section
    gr.Markdown("## Answer ")
    final_answer_input = gr.Textbox(
        label="Final Answer",
        lines=1,
        placeholder="Enter the short/concise final answer...",
    )
    rationale_text_input = gr.Textbox(
        label="Rationale Text",
        lines=5,
        placeholder="Enter the reasoning or explanation for the answer...",
    )
    # Rationale Images - Individual Tabs
    with gr.Tabs():
        with gr.Tab("Rationale 1 (Optional)"):
            rationale_image1 = gr.Image(label="Rationale Image 1", type="filepath")
        with gr.Tab("Rationale 2 (Optional)"):
            rationale_image2 = gr.Image(label="Rationale Image 2", type="filepath")

    # Subquestions Section
    gr.Markdown("## Subquestions")
    with gr.Row():
        subquestion_1_text_input = gr.Textbox(
            label="Subquestion 1 Text", lines=2, placeholder="First sub-question..."
        )
        subquestion_1_answer_input = gr.Textbox(
            label="Subquestion 1 Answer",
            lines=2,
            placeholder="Answer to sub-question 1...",
        )
    with gr.Row():
        subquestion_2_text_input = gr.Textbox(
            label="Subquestion 2 Text", lines=2, placeholder="Second sub-question..."
        )
        subquestion_2_answer_input = gr.Textbox(
            label="Subquestion 2 Answer",
            lines=2,
            placeholder="Answer to sub-question 2...",
        )
    system_message_input = gr.Textbox(
        label="System Message",
        value=DEFAULT_SYSTEM_MESSAGE,
        lines=2,
        placeholder="Enter the system message that defines the AI assistant's role and behavior...",
    )

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Form")
    with gr.Row():
        output_file_urls = gr.File(
            label="Download URLs JSON", interactive=False, visible=False
        )
        output_file_base64 = gr.File(
            label="Download Base64 JSON", interactive=False, visible=False
        )

    # Form components in the exact positional order expected by
    # generate_json_files / validate_and_generate; also reused as the leading
    # outputs of the clear handler.
    form_fields = [
        system_message_input,
        name_input,
        email_address_input,
        institution_input,
        openreview_profile_input,
        question_categories_input,
        subquestion_1_text_input,
        subquestion_1_answer_input,
        subquestion_2_text_input,
        subquestion_2_answer_input,
        question_input,
        final_answer_input,
        rationale_text_input,
        image1,
        image2,
        image3,
        image4,
        rationale_image1,
        rationale_image2,
    ]

    # On Submit, we call generate_json_files with all relevant fields
    def validate_and_generate(
        sys_msg,
        nm,
        em,
        inst,
        orp,
        qcats,
        sq1t,
        sq1a,
        sq2t,
        sq2a,
        q,
        fa,
        rt,
        i1,
        i2,
        i3,
        i4,
        ri1,
        ri2,
    ):
        """
        Validate the required fields and, if complete, persist the submission.

        Returns an update for the submit button: it stays enabled (with a
        warning toast) when something required is missing, and is disabled
        after a successful save so the same item is not submitted twice.
        """
        required_text_fields = (
            ("Name", nm),
            ("Email Address", em),
            ("Institution", inst),
            ("Question", q),
            ("Final Answer", fa),
        )
        # A text field counts as missing when it is empty or whitespace-only.
        missing_fields = [
            label
            for label, value in required_text_fields
            if not value or not value.strip()
        ]
        if not i1:
            missing_fields.append("First Question Image")

        if missing_fields:
            warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
            gr.Warning(warning_msg, duration=5)
            return gr.update(interactive=True)

        # Hold the scheduler lock while writing so the background commit never
        # uploads a request folder that is only partially written.
        with scheduler.lock:
            generate_json_files(
                sys_msg,
                nm,
                em,
                inst,
                orp,
                qcats,
                sq1t,
                sq1a,
                sq2t,
                sq2a,
                q,
                fa,
                rt,
                i1,
                i2,
                i3,
                i4,
                ri1,
                ri2,
            )
        gr.Info(
            "Dataset item created successfully! 🎉, Clear the form to submit a new one"
        )
        return gr.update(interactive=False)

    submit_button.click(
        fn=validate_and_generate,
        inputs=form_fields,
        outputs=[submit_button],
    )

    # Clear button functionality
    def clear_form_fields(sys_msg, name, email, inst, openreview, *args):
        """
        Reset the form for the next submission while keeping the author details.

        Returns one value per output component of the clear handler: the
        default system message, the four preserved author fields, ``None`` for
        every question/answer field, image and download slot, and an update
        that re-enables the submit button.
        """
        preserved_author_fields = [name, email, inst, openreview]
        cleared_text_fields = [None] * 8  # categories, 2x2 subquestions, question, answer, rationale
        cleared_images = [None] * 6  # four question images + two rationale images
        cleared_downloads = [None] * 2  # URLs JSON + Base64 JSON
        return [
            DEFAULT_SYSTEM_MESSAGE,
            *preserved_author_fields,
            *cleared_text_fields,
            *cleared_images,
            *cleared_downloads,
            gr.update(interactive=True),  # Re-enable submit button
        ]

    clear_button.click(
        fn=clear_form_fields,
        inputs=[
            system_message_input,
            name_input,
            email_address_input,
            institution_input,
            openreview_profile_input,
        ],
        outputs=[
            *form_fields,
            output_file_urls,
            output_file_base64,
            submit_button,
        ],
    )

demo.launch()