import os
from pathlib import Path

from datasets import load_dataset, Dataset

from utils import process_recipes, create_tokenizer
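
# NOTE (assumption): `create_tokenizer` and `process_recipes` come from the local
# `utils` module, which is not shown here. Based on how they are used below,
# `process_recipes` is expected to return a list of dicts, each holding the
# `input_ids` and `attention_mask` for one recipe block, and the tokenizer is
# expected to handle the `<start>`/`<end>` markers written by `save_data`.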

# Dataset configuration
DATASET_NAME = 'tengomucho/all-recipes-split'
DATASET_SPLIT = 'train'
tokenizer = create_tokenizer()


def get_dataset(dataset, dataset_split):
    """Download and return the requested split of the dataset."""
    return load_dataset(dataset, split=dataset_split)

def save_data(dataset, dataset_split):
    """Download the dataset and save it locally as a single text file."""
    data_path = Path('../data/')
    file_path = data_path / "recipes.txt"

    if file_path.exists():
        print("Dataset file already exists. Moving on to tokenization...")
        return

    if data_path.is_dir():
        print("Directory exists, skipping...")
    else:
        print("Creating directory...")
        data_path.mkdir(parents=True, exist_ok=True)

    # Only download once we know the text file still needs to be written
    data = get_dataset(dataset, dataset_split)

    with open(file_path, "w", encoding="utf-8") as f:
        for i in range(len(data)):
            title = data[i].get("title", "").strip()
            ingredients = data[i].get("ingredients", "").strip()
            directions = data[i].get("directions", "").strip()

            recipe_block = "<start>\n"  # Special token marking the start of each recipe
            if title:
                recipe_block += f"Title: {title}\n"
            recipe_block += f"Ingredients:\n{ingredients}\n"
            recipe_block += f"Directions:\n{directions}\n"
            recipe_block += "<end>\n\n"  # Special token marking the end of each recipe

            f.write(recipe_block)

    print("File saved.")


def create_dataset(processed_recipes):
    """
    Extract input_ids and attention_mask from processed recipes and create a Dataset.

    Args:
        processed_recipes (list): List of tokenized recipe blocks

    Returns:
        Dataset: HuggingFace Dataset with input_ids and attention_mask
    """
    input_ids = []
    attention_mask = []

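    # Collect the token IDs and attention mask from each tokenized recipe block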
    for recipe in processed_recipes:
        input_ids.append(recipe['input_ids'])
        attention_mask.append(recipe['attention_mask'])

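    # Build a HuggingFace Dataset from the collected columns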
    dataset = Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })

    return dataset


def split_and_save_dataset(dataset, train_split=0.9, save_dir="../data/tokenized_recipes"):
    """
    Split the tokenized dataset into train/validation sets and save both to disk.

    Args:
        dataset (Dataset): HuggingFace Dataset to split
        train_split (float): Proportion used for the training set (default 0.9 = 90%)
        save_dir (str): Directory to save the datasets

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    # Split the dataset (seeded for reproducibility)
    split_dataset = dataset.train_test_split(test_size=1 - train_split, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']

    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Save both splits to disk
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")

    train_dataset.save_to_disk(train_path)
    val_dataset.save_to_disk(val_path)

    print("Datasets saved successfully!")
    print(f"Train dataset: {train_path} | ({len(train_dataset)} samples)")
    print(f"Validation dataset: {val_path} | ({len(val_dataset)} samples)")

    return train_dataset, val_dataset


def load_saved_datasets(save_dir="../data/tokenized_recipes"):
    """
    Load previously saved tokenized train/validation datasets from disk.

    Args:
        save_dir (str): Directory where the datasets were saved

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")

    # Both splits must exist before loading
    if not os.path.exists(train_path) or not os.path.exists(val_path):
        raise FileNotFoundError(f"Datasets not found in {save_dir}. Please run processing first.")

    train_dataset = Dataset.load_from_disk(train_path)
    val_dataset = Dataset.load_from_disk(val_path)

    print("Datasets loaded successfully!")
    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")

    return train_dataset, val_dataset


if __name__ == "__main__":
    # Download the dataset (if needed) and save it as a text file;
    # save_data handles the download internally via get_dataset
    save_data(DATASET_NAME, DATASET_SPLIT)

    # Tokenize the saved recipes and build a HuggingFace Dataset
    with open("../data/recipes.txt", "r", encoding="utf-8") as f:
        recipes = f.read()

    processed_recipes = process_recipes(recipes, tokenizer)
    tokenized_dataset = create_dataset(processed_recipes)

    # Split into train/validation sets and save both to disk
    split_and_save_dataset(tokenized_dataset)
    print("Successfully created tokenized dataset.")