Spaces:

Gatsby767
/

AbrahamicSolver

Running

App Files Files Community

AbrahamicSolver / question_generate.py

Gatsby767

Rename math.py to app_math.py and update imports to avoid stdlib conflict

84bfc85 3 months ago

raw

history blame contribute delete

5.04 kB

	import vllm
	import torch
	from transformers import AutoTokenizer
	import argparse
	from typing import List
	from vllm.outputs import RequestOutput
	from evaluation.datasets_loader import get_dataset_handler
	import json
	import regex as re
	import os
	# Root directory for generated artifacts, read from the STORAGE_PATH
	# environment variable. os.getenv returns None when the variable is unset.
	STORAGE_PATH = os.getenv("STORAGE_PATH")

	def extract_boxed(text):
	    r"""Return the contents of every complete ``\boxed{...}`` in *text*.

	    Braces inside a box are matched by depth, so nested groups such as
	    ``\boxed{\frac{1}{2}}`` are returned whole. Boxes are reported in order
	    of appearance; a box nested inside another complete box is part of the
	    outer box's content and is not reported separately.

	    A ``\boxed{`` whose closing brace is missing (for example, output cut
	    off mid-answer) is ignored instead of yielding a truncated string.
	    Scanning resumes just past that prefix so that any complete box further
	    on is still found.

	    Args:
	        text: The string to scan.

	    Returns:
	        A list of the inner strings, without the ``\boxed{`` prefix or the
	        closing brace. Empty when no complete box is present.
	    """
	    prefix = r'\boxed{'
	    plen = len(prefix)
	    results = []
	    pos = 0

	    while True:
	        start = text.find(prefix, pos)
	        if start == -1:
	            break  # no more \boxed{…}

	        body_start = start + plen
	        depth = 1
	        j = body_start
	        # Walk forward until the brace opened by the prefix is closed.
	        while j < len(text) and depth:
	            ch = text[j]
	            if ch == '{':
	                depth += 1
	            elif ch == '}':
	                depth -= 1
	            j += 1

	        if depth:
	            # Ran off the end of the text without closing this box. There
	            # is no closing brace to strip, so slicing to j - 1 would drop
	            # a real character. Skip the box and rescan after its prefix.
	            pos = body_start
	            continue

	        # j is one past the closing brace; exclude the brace itself.
	        results.append(text[body_start:j - 1])
	        pos = j

	    return results

	def get_response_mask(response_ids, eos_token_id, dtype):
	    """Build a mask that is 1 before the first EOS token of each row, else 0.

	    For every row of ``response_ids`` the position of the first
	    ``eos_token_id`` and everything after it are set to 0; earlier positions
	    are 1. Rows without an EOS token are all ones.

	    Args:
	        response_ids: 2-D array of token ids; its shape is unpacked as
	            ``(batch_size, seq_len)``.
	        eos_token_id: Token id that marks the end of a response. When None,
	            nothing can match and an all-ones mask is returned.
	        dtype: torch dtype of the returned mask.

	    Returns:
	        A ``(batch_size, seq_len)`` tensor of ``dtype`` created with
	        ``torch.ones`` (i.e. on the default device).
	    """
	    batch_size, seq_len = response_ids.shape
	    mask = torch.ones((batch_size, seq_len), dtype=dtype)
	    if eos_token_id is None:
	        return mask

	    # A running count of EOS hits along each row becomes positive at the
	    # first EOS and stays positive afterwards, which is exactly the region
	    # to zero out. This replaces a per-element Python loop.
	    eos_hits = torch.as_tensor(response_ids) == eos_token_id
	    at_or_after_eos = eos_hits.cumsum(dim=1) > 0
	    mask[at_or_after_eos.to(mask.device)] = 0
	    return mask

	def main(args):
	    """Sample new competition-math questions from a model and save them as JSON.

	    The same chat prompt is submitted ``args.num_samples`` times. Each
	    completion is parsed for a ``<question>...</question>`` block and a
	    ``\\boxed{...}`` answer. Successfully parsed completions are stored with
	    ``score`` 0; anything else is stored as the raw response with an empty
	    answer and ``score`` -1. Results are written to
	    ``{STORAGE_PATH}/generated_question/{save_name}_{suffix}.json``.

	    Args:
	        args: Parsed CLI namespace providing ``model``, ``num_samples``,
	            ``suffix`` and ``save_name``. ``suffix`` doubles as the sampling
	            seed when it is an integer string.

	    Raises:
	        RuntimeError: If the STORAGE_PATH environment variable is not set.
	    """
	    # Resolve and create the output location before loading the model, so a
	    # misconfigured path fails immediately and not after the whole
	    # generation run has completed.
	    if STORAGE_PATH is None:
	        raise RuntimeError(
	            "STORAGE_PATH environment variable is not set; cannot decide where to write results"
	        )
	    output_dir = f"{STORAGE_PATH}/generated_question"
	    os.makedirs(output_dir, exist_ok=True)
	    output_path = f"{output_dir}/{args.save_name}_{args.suffix}.json"

	    # The suffix is a free-form file-name suffix (default ""), so it is not
	    # guaranteed to be numeric. Fall back to seed 0 rather than crashing.
	    try:
	        seed = int(args.suffix)
	    except (TypeError, ValueError):
	        seed = 0

	    tokenizer = AutoTokenizer.from_pretrained(args.model)
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token
	    if tokenizer.pad_token_id is None:
	        tokenizer.pad_token_id = tokenizer.eos_token_id
	    model = vllm.LLM(
	        model=args.model,
	        tokenizer=args.model,
	        seed=seed,
	    )

	    # NOTE(review): the loaded dataset is never used below. The call is kept
	    # because side effects of load_data() (e.g. download/caching) cannot be
	    # ruled out from this file; remove it once confirmed unnecessary.
	    dataset_handler = get_dataset_handler("math")
	    dataset_handler.load_data()

	    chat = [
	        {
	            "role": "system",
	            "content": (
	                "You are an expert competition-math problem setter.\n"
	                "FIRST, in your private scratch-pad, think step-by-step to design a brand-new, non-trivial problem. "
	                "The problem could come from any field of mathematics, including but not limited to algebra, geometry, number theory, combinatorics, prealgebra, probability, statistics, and calculus. "
	                "Aim for a difficulty such that fewer than 30 % of advanced high-school students could solve it. "
	                "Avoid re-using textbook clichés or famous contest problems.\n"
	                "THEN, without revealing any of your private thoughts, output exactly the following two blocks:\n\n"
	                "<question>\n"
	                "{The full problem statement on one or more lines}\n"
	                "</question>\n\n"
	                r"\boxed{final_answer}"
	                "\n\n"
	                "Do NOT output anything else—no explanations, no extra markup."
	            )
	        },
	        {
	            "role": "user",
	            "content": (
	                "Generate one new, challenging reasoning question now. "
	                "Remember to format the output exactly as instructed."
	            )
	        }
	    ]

	    if tokenizer.chat_template:
	        prompt = tokenizer.apply_chat_template(
	            chat,
	            tokenize=False,
	            add_generation_prompt=True,
	            add_special_tokens=True
	        )
	    else:
	        # No chat template available: fall back to a plain-text transcript.
	        prompt = "system: " + chat[0]["content"] + '\n' + "user: " + chat[1]["content"]
	    sample_params = vllm.SamplingParams(
	        max_tokens=4096,
	        temperature=1.0,
	        top_p=0.95,
	        n=1,
	        stop_token_ids=[tokenizer.eos_token_id],
	    )

	    completions: List[RequestOutput] = model.generate([prompt]*args.num_samples, sampling_params=sample_params)
	    results = []
	    for completion in completions:
	        response = completion.outputs[0].text
	        try:
	            found_questions = re.findall(r"<question>(.*?)</question>", response, re.DOTALL)
	            found_answers = extract_boxed(response)
	        except Exception:
	            # Best effort: a completion that cannot be parsed is recorded as
	            # a failed sample below and does not abort the run.
	            found_questions, found_answers = [], []

	        if found_questions and found_answers:
	            # Use the last occurrence of each, in case the model emitted
	            # several question blocks or boxed answers.
	            results.append({
	                "question": found_questions[-1].strip(),
	                "answer": found_answers[-1].strip(),
	                "score": 0,
	            })
	        else:
	            results.append({"question": response, "answer": "", "score": -1})

	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=4)

	if __name__ == "__main__":
	    # Script entry point: declare the CLI options, parse them and run main().
	    cli = argparse.ArgumentParser()
	    option_table = (
	        ("--model", {"type": str, "default": "Qwen/Qwen3-4B"}),
	        ("--num_samples", {"type": int, "default": 1250, "help": "Number of samples to generate"}),
	        ("--suffix", {"type": str, "default": "", "help": "Suffix to add to the output file"}),
	        ("--save_name", {"type": str, "default": "", "help": ""}),
	    )
	    # Register options in table order so the generated --help is unchanged.
	    for flag, settings in option_table:
	        cli.add_argument(flag, **settings)
	    args = cli.parse_args()

	    main(args)