Spaces:

taesiri
/

SubmitQuestions

Paused

App Files Files Community

SubmitQuestions / app.py

taesiri

backup

f9bcc24 8 months ago

raw

history blame

21 kB

	import gradio as gr
	import base64
	import json
	import os
	import shutil
	import uuid
	from huggingface_hub import CommitScheduler, HfApi

	# Hub client for this process.
	api = HfApi()
	# Authenticates with the token from the environment; os.environ[...] raises
	# KeyError at import time when HF_TOKEN is not set.
	# NOTE(review): recent huggingface_hub releases expose login() as a
	# module-level function rather than an HfApi method — confirm this call
	# works with the installed version.
	api.login(os.environ["HF_TOKEN"])

	# Background uploader that pushes the local ./data folder to the "data"
	# directory of the dataset repo. generate_json_files() writes each request
	# into ./data, so new submissions are picked up by this scheduler.
	# NOTE(review): `every` is presumably the commit interval in minutes —
	# confirm against the CommitScheduler documentation.
	scheduler = CommitScheduler(
	repo_id="taesiri/EdgeQuest",
	repo_type="dataset",
	folder_path="./data",
	path_in_repo="data",
	every=1,
	)


	def generate_json_files(
	    system_message,
	    # New fields
	    name,
	    email_address,
	    institution,
	    openreview_profile,
	    question_categories,
	    subquestion_1_text,
	    subquestion_1_answer,
	    subquestion_2_text,
	    subquestion_2_answer,
	    # Existing fields
	    question,
	    final_answer,
	    rationale_text,
	    # Question images
	    image1,
	    image2,
	    image3,
	    image4,
	    # Rationale images
	    rationale_image1,
	    rationale_image2,
	):
	    """Persist one submission under ./data/<uuid>/ and write its JSON records.

	    For each request:
	      1) Create a unique folder under ./data/.
	      2) Store every provided image (question + rationale) in that folder as
	         ``<label>.png``.
	      3) Write two single-line JSON files next to the images:
	         - request_urls.json   (images referenced by relative file path)
	         - request_base64.json (images embedded as base64 text)

	    Args:
	        system_message: System prompt; falls back to
	            "You are a helpful assistant" when empty or None.
	        name, email_address, institution, openreview_profile: Author
	            metadata; None is stored as "".
	        question_categories: Comma-separated tags. Each tag is stripped and
	            empty tags (from stray commas or whitespace-only input) are
	            dropped; None or "" yields an empty list.
	        subquestion_*_text / subquestion_*_answer: Optional sub-questions;
	            None is stored as "".
	        question, final_answer, rationale_text: Main question, its answer and
	            the explanation; None is stored as "".
	        image1..image4: Question images, each a file path, an in-memory image
	            object, or None.
	        rationale_image1, rationale_image2: Rationale images, same forms.

	    Returns:
	        Tuple ``(urls_json_path, base64_json_path)`` of the two files written.
	    """

	    def safe_str(val):
	        # Gradio hands over None for untouched text inputs.
	        return val if val is not None else ""

	    def image_entry(label, url_key, value):
	        # Shape shared by both output flavours; only the url key/value differ.
	        return {
	            "type": "image_url",
	            "label": label,
	            "image_url": {"url": {url_key: value}},
	        }

	    def select(entries, marker):
	        # Split image entries into question / rationale groups by label.
	        return [entry for entry in entries if marker in entry["label"]]

	    # 1) Parent data folder plus a unique sub-folder for this request.
	    parent_data_folder = "./data"
	    os.makedirs(parent_data_folder, exist_ok=True)
	    request_id = str(uuid.uuid4())
	    request_folder = os.path.join(parent_data_folder, request_id)
	    os.makedirs(request_folder)

	    # Defaults and None -> "" normalisation.
	    if not system_message:
	        system_message = "You are a helpful assistant"

	    name = safe_str(name)
	    email_address = safe_str(email_address)
	    institution = safe_str(institution)
	    openreview_profile = safe_str(openreview_profile)
	    # Drop empty tags so "math, , geometry," does not store "" categories.
	    question_categories = [
	        tag.strip()
	        for tag in safe_str(question_categories).split(",")
	        if tag.strip()
	    ]
	    subquestion_1_text = safe_str(subquestion_1_text)
	    subquestion_1_answer = safe_str(subquestion_1_answer)
	    subquestion_2_text = safe_str(subquestion_2_text)
	    subquestion_2_answer = safe_str(subquestion_2_answer)
	    question = safe_str(question)
	    final_answer = safe_str(final_answer)
	    rationale_text = safe_str(rationale_text)

	    # 2) Store the provided images inside the request folder.
	    all_images = [
	        ("question_image_1", image1),
	        ("question_image_2", image2),
	        ("question_image_3", image3),
	        ("question_image_4", image4),
	        ("rationale_image_1", rationale_image1),
	        ("rationale_image_2", rationale_image2),
	    ]

	    saved_images = []
	    for img_label, img_obj in all_images:
	        if img_obj is None:
	            continue
	        saved_path = os.path.join(request_folder, f"{img_label}.png")
	        if isinstance(img_obj, str):
	            # File path: the bytes are copied as-is, so the ".png" name is
	            # nominal when the upload was in another format.
	            shutil.copy2(img_obj, saved_path)
	        else:
	            # In-memory image (e.g. a numpy array).
	            # NOTE(review): confirm gr.processing_utils.save_image exists in
	            # the installed Gradio version.
	            gr.processing_utils.save_image(img_obj, saved_path)
	        saved_images.append((img_label, saved_path))

	    # Describe each stored image twice: by relative path and as base64.
	    url_entries = []
	    base64_entries = []
	    for img_label, saved_path in saved_images:
	        rel_path = os.path.join(".", os.path.basename(saved_path))
	        url_entries.append(
	            image_entry(img_label, "data:image/png;path", rel_path)
	        )

	        with open(saved_path, "rb") as f:
	            encoded = base64.b64encode(f.read()).decode("utf-8")
	        base64_entries.append(
	            image_entry(img_label, "data:image/png;base64", encoded)
	        )

	    # Every request currently carries the same fixed custom_id.
	    custom_id = "request______1"
	    subquestions = [
	        {"text": subquestion_1_text, "answer": subquestion_1_answer},
	        {"text": subquestion_2_text, "answer": subquestion_2_answer},
	    ]

	    # A) URLs JSON
	    item_urls = {
	        "custom_id": custom_id,
	        # Metadata at top level
	        "name": name,
	        "email_address": email_address,
	        "institution": institution,
	        "openreview_profile": openreview_profile,
	        "question_categories": question_categories,
	        "question": {
	            "messages": [
	                {"role": "system", "content": system_message},
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "label": "question", "value": question}
	                    ]
	                    + select(url_entries, "question_image"),
	                },
	            ],
	        },
	        "subquestions": subquestions,
	        "answer": {
	            "final_answer": final_answer,
	            "rationale_text": rationale_text,
	            "rationale_images": select(url_entries, "rationale_image"),
	        },
	    }

	    # B) Base64 JSON
	    item_base64 = {
	        "custom_id": custom_id,
	        # Metadata at top level
	        "name": name,
	        "email_address": email_address,
	        "institution": institution,
	        "openreview_profile": openreview_profile,
	        # Question-related fields at top level
	        "question_categories": question_categories,
	        "subquestions": subquestions,
	        "final_answer": final_answer,
	        "rationale_text": rationale_text,
	        "body": {
	            "model": "MODEL_NAME",
	            "messages": [
	                {"role": "system", "content": system_message},
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "field", "label": "question", "value": question}
	                    ]
	                    + select(base64_entries, "question_image"),
	                },
	                {
	                    "role": "assistant",
	                    "content": [
	                        {"type": "text", "text": rationale_text},
	                        {"type": "text", "text": final_answer},
	                        *select(base64_entries, "rationale_image"),
	                    ],
	                },
	            ],
	        },
	    }

	    # 3) Write both records as single JSON lines in the request folder.
	    # NOTE(review): ./data is the folder uploaded by the module-level
	    # CommitScheduler; consider holding `scheduler.lock` around these writes
	    # so a commit cannot snapshot a half-written request — confirm against
	    # the huggingface_hub documentation.
	    urls_jsonl_path = os.path.join(request_folder, "request_urls.json")
	    base64_jsonl_path = os.path.join(request_folder, "request_base64.json")

	    with open(urls_jsonl_path, "w", encoding="utf-8") as f:
	        f.write(json.dumps(item_urls, ensure_ascii=False) + "\n")
	    with open(base64_jsonl_path, "w", encoding="utf-8") as f:
	        f.write(json.dumps(item_base64, ensure_ascii=False) + "\n")

	    # Return the two file paths so the caller can offer them as downloads.
	    return urls_jsonl_path, base64_jsonl_path


	# Build the Gradio app
	# UI layout: instructions, author information, question (images, text,
	# categories), answer (final answer, rationale text and images),
	# sub-questions, system message, action buttons and output file slots.
	with gr.Blocks() as demo:
	gr.Markdown("# Dataset Builder")
	# Static help panel, expanded by default; lists required vs optional fields.
	with gr.Accordion("Instructions", open=True):
	gr.HTML(
	"""
	<h3>Instructions:</h3>
	<p>Welcome to the Hugging Face space for collecting questions for new benchmark datasets.</p>

	<table style="width:100%; border-collapse: collapse; margin: 10px 0;">
	<tr>
	<th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
	Required Fields
	</th>
	<th style="width:50%; background-color: #3366f0; padding: 8px; text-align: left; border: 1px solid #ddd;">
	Optional Fields
	</th>
	</tr>
	<tr>
	<td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
	<ul style="margin: 0;">
	<li>Author Information</li>
	<li>At least <b>one question image</b></li>
	<li>The <b>question text</b></li>
	<li>The <b>final answer</b></li>
	</ul>
	</td>
	<td style="vertical-align: top; padding: 8px; border: 1px solid #ddd;">
	<ul style="margin: 0;">
	<li>Up to four question images</li>
	<li>Supporting images for your answer</li>
	<li><b>Rationale text</b> to explain your reasoning</li>
	<li><b>Sub-questions</b> with their answers</li>
	</ul>
	</td>
	</tr>
	</table>

	<p>While not all fields are mandatory, providing additional context through optional fields will help create a more comprehensive dataset. After submitting a question, you can clear up the form to submit another one.</p>
	"""
	)
	# Author fields; the clear handler keeps these values between submissions.
	gr.Markdown("## Author Information")
	with gr.Row():
	name_input = gr.Textbox(label="Name", lines=1)
	email_address_input = gr.Textbox(label="Email Address", lines=1)
	institution_input = gr.Textbox(
	label="Institution or 'Independent'",
	lines=1,
	placeholder="e.g. MIT, Google, Independent, etc.",
	)
	openreview_profile_input = gr.Textbox(
	label="OpenReview Profile Name",
	lines=1,
	placeholder="Your OpenReview username or profile name",
	)

	gr.Markdown("## Question Information")

	# Question Images - Individual Tabs
	# type="filepath": each image is handed to the handlers as a file path.
	# Only the first image is checked as required by the submit handler.
	with gr.Tabs():
	with gr.Tab("Image 1"):
	image1 = gr.Image(label="Question Image 1", type="filepath")
	with gr.Tab("Image 2 (Optional)"):
	image2 = gr.Image(label="Question Image 2", type="filepath")
	with gr.Tab("Image 3 (Optional)"):
	image3 = gr.Image(label="Question Image 3", type="filepath")
	with gr.Tab("Image 4 (Optional)"):
	image4 = gr.Image(label="Question Image 4", type="filepath")

	question_input = gr.Textbox(
	label="Question", lines=15, placeholder="Type your question here..."
	)

	# Free-text tags; split on commas by generate_json_files.
	question_categories_input = gr.Textbox(
	label="Question Categories",
	lines=1,
	placeholder="Comma-separated tags, e.g. math, geometry",
	)

	# Answer Section
	gr.Markdown("## Answer ")

	final_answer_input = gr.Textbox(
	label="Final Answer",
	lines=1,
	placeholder="Enter the short/concise final answer...",
	)

	rationale_text_input = gr.Textbox(
	label="Rationale Text",
	lines=5,
	placeholder="Enter the reasoning or explanation for the answer...",
	)

	# Rationale Images - Individual Tabs
	with gr.Tabs():
	with gr.Tab("Rationale 1 (Optional)"):
	rationale_image1 = gr.Image(label="Rationale Image 1", type="filepath")
	with gr.Tab("Rationale 2 (Optional)"):
	rationale_image2 = gr.Image(label="Rationale Image 2", type="filepath")

	# Subquestions Section
	gr.Markdown("## Subquestions")
	with gr.Row():
	subquestion_1_text_input = gr.Textbox(
	label="Subquestion 1 Text", lines=2, placeholder="First sub-question..."
	)
	subquestion_1_answer_input = gr.Textbox(
	label="Subquestion 1 Answer",
	lines=2,
	placeholder="Answer to sub-question 1...",
	)

	with gr.Row():
	subquestion_2_text_input = gr.Textbox(
	label="Subquestion 2 Text", lines=2, placeholder="Second sub-question..."
	)
	subquestion_2_answer_input = gr.Textbox(
	label="Subquestion 2 Answer",
	lines=2,
	placeholder="Answer to sub-question 2...",
	)

	# Pre-filled with the same default that generate_json_files falls back to
	# when the box is left empty.
	system_message_input = gr.Textbox(
	label="System Message",
	value="You are a helpful assistant",
	lines=2,
	placeholder="Enter the system message that defines the AI assistant's role and behavior...",
	)

	with gr.Row():
	submit_button = gr.Button("Submit")
	clear_button = gr.Button("Clear Form")

	# Hidden, read-only file slots. The submit handler's only output is the
	# submit button, so these are never populated; the clear handler resets them.
	with gr.Row():
	output_file_urls = gr.File(
	label="Download URLs JSON", interactive=False, visible=False
	)
	output_file_base64 = gr.File(
	label="Download Base64 JSON", interactive=False, visible=False
	)

	# On Submit, we call generate_json_files with all relevant fields
	def validate_and_generate(
	    sys_msg, nm, em, inst, orp,
	    qcats, sq1t, sq1a, sq2t, sq2a,
	    q, fa, rt,
	    i1, i2, i3, i4,
	    ri1, ri2,
	):
	    """Validate the form and, when complete, store the submission.

	    Required: name, email address, institution, question, final answer
	    (all non-blank) and the first question image. Parameters arrive in the
	    same order as generate_json_files expects them.

	    Returns:
	        A submit-button update: still interactive when validation fails
	        (after showing a warning), disabled after a successful submission.
	        The file paths returned by generate_json_files are not used here.
	    """
	    # Label/value pairs for the text inputs that must not be blank.
	    required_text = (
	        ("Name", nm),
	        ("Email Address", em),
	        ("Institution", inst),
	        ("Question", q),
	        ("Final Answer", fa),
	    )
	    missing_fields = [
	        label
	        for label, value in required_text
	        if not value or not value.strip()
	    ]
	    if not i1:
	        missing_fields.append("First Question Image")

	    # Guard clause: warn and leave the button usable so the user can retry.
	    if missing_fields:
	        warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
	        gr.Warning(warning_msg, duration=5)
	        return gr.Button(interactive=True)

	    # Validation passed: write the request folder and JSON files.
	    generate_json_files(
	        sys_msg, nm, em, inst, orp,
	        qcats, sq1t, sq1a, sq2t, sq2a,
	        q, fa, rt,
	        i1, i2, i3, i4,
	        ri1, ri2,
	    )

	    gr.Info(
	        "Dataset item created successfully! 🎉, Clear the form to submit a new one"
	    )

	    # Disable the button until the form is cleared, preventing resubmission.
	    return gr.update(interactive=False)

	# Wire the Submit button. The inputs are listed in the positional order of
	# validate_and_generate's parameters (system message, author info,
	# categories, sub-questions, question, final answer, rationale text,
	# four question images, two rationale images).
	submit_button.click(
	fn=validate_and_generate,
	inputs=[
	system_message_input,
	name_input,
	email_address_input,
	institution_input,
	openreview_profile_input,
	question_categories_input,
	subquestion_1_text_input,
	subquestion_1_answer_input,
	subquestion_2_text_input,
	subquestion_2_answer_input,
	question_input,
	final_answer_input,
	rationale_text_input,
	image1,
	image2,
	image3,
	image4,
	rationale_image1,
	rationale_image2,
	],
	# The handler only returns an update for the button itself.
	outputs=[submit_button],
	)

	# Clear button functionality
	def clear_form_fields(sys_msg, name, email, inst, openreview, *args):
	    """Reset the form for a new submission while keeping the author details.

	    Returns a 22-item list matching the clear button's outputs: the default
	    system message, the four author values unchanged, sixteen cleared slots,
	    and an update that re-enables the submit button.
	    """
	    # Author identity is carried over so it need not be retyped.
	    kept_author_info = [name, email, inst, openreview]
	    # Categories, 4 sub-question boxes, question, final answer, rationale
	    # text (8) + 4 question images + 2 rationale images + 2 output files.
	    cleared_slots = [None] * 16
	    return [
	        "You are a helpful assistant",
	        *kept_author_info,
	        *cleared_slots,
	        gr.update(interactive=True),
	    ]

	# Wire the Clear button. Only the system message and author fields are read;
	# the outputs list must stay in the same order as the list returned by
	# clear_form_fields (22 entries).
	clear_button.click(
	fn=clear_form_fields,
	inputs=[
	system_message_input,
	name_input,
	email_address_input,
	institution_input,
	openreview_profile_input,
	],
	outputs=[
	system_message_input,
	name_input,
	email_address_input,
	institution_input,
	openreview_profile_input,
	question_categories_input,
	subquestion_1_text_input,
	subquestion_1_answer_input,
	subquestion_2_text_input,
	subquestion_2_answer_input,
	question_input,
	final_answer_input,
	rationale_text_input,
	image1,
	image2,
	image3,
	image4,
	rationale_image1,
	rationale_image2,
	output_file_urls,
	output_file_base64,
	submit_button,
	],
	)

	# Start the Gradio server when the script runs.
	demo.launch()