import os
from pathlib import Path

from datasets import load_dataset, Dataset

from utils import process_recipes, create_tokenizer

dataset = 'tengomucho/all-recipes-split'
dataset_split = 'train'

tokenizer = create_tokenizer()
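# NOTE: `create_tokenizer` and `process_recipes` live in utils.py and are not
# shown here. From how they are used below, `create_tokenizer()` is assumed to
# return a Hugging Face tokenizer aware of the <start>/<end> markers, and
# `process_recipes(text, tokenizer)` is assumed to return a list of dicts, each
# holding "input_ids" and "attention_mask". A rough sketch under those
# assumptions (not the actual utils implementation) might look like:
#
#   def process_recipes(text, tokenizer, max_length=512):
#       blocks = [b + "<end>" for b in text.split("<end>") if b.strip()]
#       return [tokenizer(block, truncation=True, max_length=max_length,
#                         padding="max_length") for block in blocks]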
def get_dataset(dataset, dataset_split):
    """
    Load the Hugging Face dataset named by `dataset` for the given `dataset_split`.
    """
    data = load_dataset(dataset, split=dataset_split)
    return data
def save_data(dataset, dataset_split):
    """
    Download the dataset and save it locally as a single text file.
    """
    data = get_dataset(dataset, dataset_split)
    data_path = Path('../data/')
    file_path = data_path / "recipes.txt"
    if file_path.exists():
        print("Dataset file already exists. Moving on to tokenization...")
        return
    if data_path.is_dir():
        print("Directory exists, skipping creation...")
    else:
        print("Creating directory...")
        data_path.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for row in data:
            title = row.get("title", "").strip()
            ingredients = row.get("ingredients", "").strip()
            directions = row.get("directions", "").strip()
            recipe_block = "<start>\n"  # Special token marking the start of each recipe
            if title:
                recipe_block += f"Title: {title}\n"
            recipe_block += f"Ingredients:\n{ingredients}\n"
            recipe_block += f"Directions:\n{directions}\n"
            recipe_block += "<end>\n\n"  # Special token marking the end of each recipe
            f.write(recipe_block)
    print("File saved.")
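# For reference, each entry written to recipes.txt follows this layout
# (values elided):
#
#   <start>
#   Title: ...
#   Ingredients:
#   ...
#   Directions:
#   ...
#   <end>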
def create_dataset(processed_recipes):
    """
    Extract input_ids and attention_mask from processed recipes and create a Dataset.

    Args:
        processed_recipes (list): List of tokenized recipe blocks

    Returns:
        Dataset: HuggingFace Dataset with input_ids and attention_mask
    """
    input_ids = []
    attention_mask = []
    for recipe in processed_recipes:
        input_ids.append(recipe['input_ids'])
        attention_mask.append(recipe['attention_mask'])
    dataset = Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })
    return dataset
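# For illustration only (hypothetical token values): two pre-tokenized recipes
#   [{"input_ids": [2, 15, 7, 3], "attention_mask": [1, 1, 1, 1]},
#    {"input_ids": [2, 9, 3, 0], "attention_mask": [1, 1, 1, 0]}]
# would become a two-row Dataset with the columns "input_ids" and "attention_mask".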
def split_and_save_dataset(dataset, train_split=0.9, save_dir="../data/tokenized_recipes"):
    """
    Split tokenized dataset into train/validation and save both to disk.

    Args:
        dataset (Dataset): HuggingFace Dataset to split
        train_split (float): Proportion for training set (default 0.9 = 90%)
        save_dir (str): Directory to save the datasets

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    # Split the dataset
    split_dataset = dataset.train_test_split(test_size=1 - train_split, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']
    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    # Save datasets to disk
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    train_dataset.save_to_disk(train_path)
    val_dataset.save_to_disk(val_path)
    print("Datasets saved successfully!")
    print(f"Train dataset: {train_path} | ({len(train_dataset)} samples)")
    print(f"Validation dataset: {val_path} | ({len(val_dataset)} samples)")
    return train_dataset, val_dataset
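# With the default train_split=0.9, train_test_split(test_size=0.1, seed=42)
# keeps roughly 90% of the rows for training and 10% for validation; the fixed
# seed makes the split reproducible, and each split can later be reloaded with
# Dataset.load_from_disk (see load_saved_datasets below).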
def load_saved_datasets(save_dir="../data/tokenized_recipes"):
    """
    Load previously saved tokenized train/validation datasets from disk.

    Args:
        save_dir (str): Directory where datasets were saved

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    if not os.path.exists(train_path) or not os.path.exists(val_path):
        raise FileNotFoundError(f"Datasets not found in {save_dir}. Please run processing first.")
    train_dataset = Dataset.load_from_disk(train_path)
    val_dataset = Dataset.load_from_disk(val_path)
    print("Datasets loaded successfully!")
    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")
    return train_dataset, val_dataset
if __name__ == "__main__":
    # Download the dataset (if needed) and save it locally as a text file
    save_data(dataset, dataset_split)
    # Create the tokenized dataset from the saved text file
    with open("../data/recipes.txt", "r", encoding="utf-8") as f:
        recipes = f.read()
    processed_recipes = process_recipes(recipes, tokenizer)
    tokenized_dataset = create_dataset(processed_recipes)
    split_and_save_dataset(tokenized_dataset)
    print("Successfully created tokenized dataset.")
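# A downstream training script could then reload the splits with, for example:
#   train_ds, val_ds = load_saved_datasets()
#   train_ds.set_format("torch")  # assumes a PyTorch training loop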