from transformers import GPT2TokenizerFast

NUMBER_OF_RECIPE_BATCHES = 50000

# NOTE: the original special-token strings were lost in formatting (they appear as
# empty strings in the source); "<|startoftext|>" and "<|endoftext|>" are assumed
# placeholder values, not the author's confirmed tokens.
RECIPE_BOS_TOKEN = "<|startoftext|>"
RECIPE_EOS_TOKEN = "<|endoftext|>"


def create_tokenizer():
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
    special_tokens = {
        "bos_token": RECIPE_BOS_TOKEN,
        "eos_token": RECIPE_EOS_TOKEN,
        "additional_special_tokens": []
    }
    tokenizer.add_special_tokens(special_tokens)
    # Reuse the end-of-text token for padding, since GPT-2 has no dedicated pad token
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def process_recipes(recipes, tokenizer):
    """
    Splits recipes on the start-of-recipe token, prepends that token back to
    each block, and tokenizes each block.

    Args:
        recipes (str): Raw recipe text containing multiple recipes separated by
            the start-of-recipe token
        tokenizer (callable): Tokenizer to apply to each recipe block

    Returns:
        list: List of tokenized recipe blocks
    """
    # Split on the start-of-recipe token and drop empty strings
    recipe_blocks = [block.strip() for block in recipes.split(RECIPE_BOS_TOKEN) if block.strip()]

    # Prepend the start-of-recipe token back to each block and tokenize
    tokenized_recipes = []
    for i, block in enumerate(recipe_blocks):
        if i == NUMBER_OF_RECIPE_BATCHES:
            # Only process the first NUMBER_OF_RECIPE_BATCHES blocks to limit memory usage
            break
        full_block = RECIPE_BOS_TOKEN + "\n" + block
        tokenized_block = tokenizer(full_block, truncation=True, max_length=512, padding=False)
        tokenized_recipes.append(tokenized_block)

    return tokenized_recipes
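

# Usage sketch (assumption): a minimal driver showing how the two functions above
# fit together. "recipes_raw.txt" is a hypothetical path to the concatenated
# recipe corpus; it does not appear in the original source.
if __name__ == "__main__":
    tokenizer = create_tokenizer()

    with open("recipes_raw.txt", "r", encoding="utf-8") as f:
        raw_recipes = f.read()

    tokenized = process_recipes(raw_recipes, tokenizer)
    print(f"Tokenized {len(tokenized)} recipe blocks")

    # Each entry is a BatchEncoding with "input_ids" and "attention_mask";
    # print the first few token ids of the first recipe as a sanity check.
    if tokenized:
        print(tokenized[0]["input_ids"][:20])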