import os
import random
from pathlib import Path
from datasets import load_dataset, Dataset
from utils import process_recipes, create_tokenizer
dataset = 'tengomucho/all-recipes-split'
dataset_split = 'train'
tokenizer = create_tokenizer()

def get_dataset(dataset, dataset_split):
    """
    Function to grab dataset based on `dataset` and `dataset_split`.
    """
    data = load_dataset(dataset, split=dataset_split)
    return data

def save_data(dataset, dataset_split):
    """
    Function to save data locally as a text file.
    """
    data = get_dataset(dataset, dataset_split)
    data_path = Path('../data/')
    file_path = data_path / "recipes.txt"
    if file_path.exists():
        print("Dataset file already exists. Moving on to tokenization...")
        return
    if data_path.is_dir():
        print("Directory exists, skipping...")
    else:
        print("Creating directory...")
        data_path.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for i in range(len(data)):
            title = data[i].get("title", "").strip()
            ingredients = data[i].get("ingredients", "").strip()
            directions = data[i].get("directions", "").strip()
            recipe_block = "<start>\n"  # Add special token to the beginning of each recipe
            if title:
                recipe_block += f"Title: {title}\n"
            recipe_block += f"Ingredients:\n{ingredients}\n"
            recipe_block += f"Directions:\n{directions}\n"
            recipe_block += "<end>\n\n"  # Add a special token to the end of each recipe
            f.write(recipe_block)
    print("File saved.")

def create_dataset(processed_recipes):
    """
    Extract input_ids and attention_mask from processed recipes and create a Dataset.

    Args:
        processed_recipes (list): List of tokenized recipe blocks

    Returns:
        Dataset: HuggingFace Dataset with input_ids and attention_mask
    """
    input_ids = []
    attention_mask = []
    for recipe in processed_recipes:
        input_ids.append(recipe['input_ids'])
        attention_mask.append(recipe['attention_mask'])
    dataset = Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })
    return dataset

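# A minimal sketch of the structure create_dataset() expects. The real output of
# process_recipes() lives in utils; the values below are assumptions for illustration.
#
#   processed_recipes = [
#       {"input_ids": [12, 345, 678], "attention_mask": [1, 1, 1]},
#       {"input_ids": [90, 12, 0],    "attention_mask": [1, 1, 0]},
#   ]
#   create_dataset(processed_recipes)
#   # -> Dataset({features: ['input_ids', 'attention_mask'], num_rows: 2})
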
def split_and_save_dataset(dataset, train_split=0.9, save_dir="../data/tokenized_recipes"):
    """
    Split tokenized dataset into train/validation and save both to disk.

    Args:
        dataset (Dataset): HuggingFace Dataset to split
        train_split (float): Proportion for training set (default 0.9 = 90%)
        save_dir (str): Directory to save the datasets

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    # Split the dataset
    split_dataset = dataset.train_test_split(test_size=1 - train_split, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Save datasets to disk
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    train_dataset.save_to_disk(train_path)
    val_dataset.save_to_disk(val_path)

    print("Datasets saved successfully!")
    print(f"Train dataset: {train_path} | ({len(train_dataset)} samples)")
    print(f"Validation dataset: {val_path} | ({len(val_dataset)} samples)")
    return train_dataset, val_dataset

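# As a worked example (numbers are illustrative): with the default train_split=0.9,
# a dataset of 100,000 tokenized recipes is split with test_size=0.1 and seed=42 into
# roughly 90,000 training rows and 10,000 validation rows, saved under
# ../data/tokenized_recipes/train and ../data/tokenized_recipes/validation.
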
def load_saved_datasets(save_dir="../data/tokenized_recipes"):
    """
    Load previously saved tokenized train/validation datasets from disk.

    Args:
        save_dir (str): Directory where datasets were saved

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    if not os.path.exists(train_path) or not os.path.exists(val_path):
        raise FileNotFoundError(f"Datasets not found in {save_dir}. Please run processing first.")
    train_dataset = Dataset.load_from_disk(train_path)
    val_dataset = Dataset.load_from_disk(val_path)
    print("Datasets loaded successfully!")
    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")
    return train_dataset, val_dataset

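# Sketch of how a training script might consume the saved datasets. This is not part
# of this pipeline; it assumes PyTorch is installed and that the tokenized sequences
# are fixed-length (otherwise a padding collator would be needed for batching).
#
#   from torch.utils.data import DataLoader
#   train_dataset, val_dataset = load_saved_datasets()
#   train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
#   train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
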
if __name__ == "__main__":
    # Download the dataset
    get_dataset(dataset, dataset_split)
    # Save it as a text file
    save_data(dataset, dataset_split)
    # Create the tokenized dataset
    with open("../data/recipes.txt", "r", encoding="utf-8") as f:
        recipes = f.read()
    processed_recipes = process_recipes(recipes, tokenizer)
    dataset = create_dataset(processed_recipes)
    split_and_save_dataset(dataset)
    print("Successfully created tokenized dataset.")