# Recipe-GPT / utils.py
from transformers import GPT2TokenizerFast

# Maximum number of recipe blocks to tokenize in one call (memory constraint).
NUMBER_OF_RECIPE_BATCHES = 50000

def create_tokenizer():
    """Load the GPT-2 fast tokenizer and register recipe-specific special tokens."""
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
    special_tokens = {
        "bos_token": "<start>",
        "eos_token": "<end>",
    }
    tokenizer.add_special_tokens(special_tokens)
    # GPT-2 ships without a pad token; reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
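
# Note (illustrative; not in the original file): registering new special tokens grows the
# vocabulary, so any model trained alongside this tokenizer needs its embedding matrix
# resized to match. A minimal sketch, assuming GPT2LMHeadModel is the model being fine-tuned:
#
#   from transformers import GPT2LMHeadModel
#   model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
#   model.resize_token_embeddings(len(create_tokenizer()))  # account for <start> and <end>
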
def process_recipes(recipes, tokenizer):
    """
    Splits recipes by "<start>", prepends "<start>" back to each block,
    and tokenizes each block.

    Args:
        recipes (str): Raw recipe text containing multiple recipes separated by "<start>".
        tokenizer (callable): Tokenizer to apply to each recipe block.

    Returns:
        list: Tokenized recipe blocks.
    """
    # Split on "<start>" and drop empty fragments.
    recipe_blocks = [block.strip() for block in recipes.split("<start>") if block.strip()]

    # Prepend "<start>" back to each block and tokenize.
    tokenized_recipes = []
    for i, block in enumerate(recipe_blocks):
        # Cap how many blocks are processed in one call (memory constraint).
        if i == NUMBER_OF_RECIPE_BATCHES:
            break
        full_block = "<start>\n" + block
        tokenized_block = tokenizer(full_block, truncation=True, max_length=512, padding=False)
        tokenized_recipes.append(tokenized_block)

    return tokenized_recipes
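
# Hedged usage sketch (not part of the original file). The sample text and the "<end>"
# markers are assumptions about the dataset format, inferred from the special tokens
# registered in create_tokenizer().
if __name__ == "__main__":
    sample = (
        "<start>\nPancakes\nMix flour, milk, and eggs; fry until golden.\n<end>\n"
        "<start>\nLemonade\nStir lemon juice, sugar, and water; serve chilled.\n<end>\n"
    )
    tok = create_tokenizer()
    tokenized = process_recipes(sample, tok)
    print(f"Tokenized {len(tokenized)} recipe blocks")
    print(tok.decode(tokenized[0]["input_ids"]))  # round-trip the first block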