import os
import random
from pathlib import Path
from datasets import load_dataset, Dataset
from utils import process_recipes, create_tokenizer
dataset = 'tengomucho/all-recipes-split'
dataset_split = 'train'
tokenizer = create_tokenizer()

def get_dataset(dataset, dataset_split):
    """
    Function to grab dataset based on `dataset` and `dataset_split`.
    """
    data = load_dataset(dataset, split=dataset_split)
    return data

def save_data(dataset, dataset_split):
    """
    Function to save data locally as a text file.
    """
    data = get_dataset(dataset, dataset_split)
    data_path = Path('../data/')
    file_path = data_path / "recipes.txt"
    if file_path.exists():
        print("Dataset file already exists. Moving on to tokenization...")
        return
    if data_path.is_dir():
        print("Directory exists, skipping...")
    else:
        print("Creating directory...")
        data_path.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for i in range(len(data)):
            title = data[i].get("title", "").strip()
            ingredients = data[i].get("ingredients", "").strip()
            directions = data[i].get("directions", "").strip()
            recipe_block = "<start>\n"  # Add special token to the beginning of each recipe
            if title:
                recipe_block += f"Title: {title}\n"
            recipe_block += f"Ingredients:\n{ingredients}\n"
            recipe_block += f"Directions:\n{directions}\n"
            recipe_block += "<end>\n\n"  # Add a special token to the end of each recipe
            f.write(recipe_block)
    print("File saved.")

def create_dataset(processed_recipes):
    """
    Extract input_ids and attention_mask from processed recipes and create a Dataset.

    Args:
        processed_recipes (list): List of tokenized recipe blocks

    Returns:
        Dataset: HuggingFace Dataset with input_ids and attention_mask
    """
    input_ids = []
    attention_mask = []
    for recipe in processed_recipes:
        input_ids.append(recipe['input_ids'])
        attention_mask.append(recipe['attention_mask'])
    dataset = Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask
    })
    return dataset

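# A minimal sketch of the structure create_dataset() expects. The real output of
# process_recipes() lives in utils; the values below are assumptions for illustration.
#
#   processed_recipes = [
#       {"input_ids": [12, 345, 678], "attention_mask": [1, 1, 1]},
#       {"input_ids": [90, 12, 0],    "attention_mask": [1, 1, 0]},
#   ]
#   create_dataset(processed_recipes)
#   # -> Dataset({features: ['input_ids', 'attention_mask'], num_rows: 2})
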
def split_and_save_dataset(dataset, train_split=0.9, save_dir="../data/tokenized_recipes"):
    """
    Split tokenized dataset into train/validation and save both to disk.

    Args:
        dataset (Dataset): HuggingFace Dataset to split
        train_split (float): Proportion for training set (default 0.9 = 90%)
        save_dir (str): Directory to save the datasets

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    # Split the dataset
    split_dataset = dataset.train_test_split(test_size=1 - train_split, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Save datasets to disk
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    train_dataset.save_to_disk(train_path)
    val_dataset.save_to_disk(val_path)

    print("Datasets saved successfully!")
    print(f"Train dataset: {train_path} | ({len(train_dataset)} samples)")
    print(f"Validation dataset: {val_path} | ({len(val_dataset)} samples)")
    return train_dataset, val_dataset

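# As a worked example (numbers are illustrative): with the default train_split=0.9,
# a dataset of 100,000 tokenized recipes is split with test_size=0.1 and seed=42 into
# roughly 90,000 training rows and 10,000 validation rows, saved under
# ../data/tokenized_recipes/train and ../data/tokenized_recipes/validation.
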
def load_saved_datasets(save_dir="../data/tokenized_recipes"):
    """
    Load previously saved tokenized train/validation datasets from disk.

    Args:
        save_dir (str): Directory where datasets were saved

    Returns:
        tuple: (train_dataset, val_dataset)
    """
    train_path = os.path.join(save_dir, "train")
    val_path = os.path.join(save_dir, "validation")
    if not os.path.exists(train_path) or not os.path.exists(val_path):
        raise FileNotFoundError(f"Datasets not found in {save_dir}. Please run processing first.")
    train_dataset = Dataset.load_from_disk(train_path)
    val_dataset = Dataset.load_from_disk(val_path)
    print("Datasets loaded successfully!")
    print(f"Train dataset: {len(train_dataset)} samples")
    print(f"Validation dataset: {len(val_dataset)} samples")
    return train_dataset, val_dataset

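# Sketch of how a training script might consume the saved datasets. This is not part
# of this pipeline; it assumes PyTorch is installed and that the tokenized sequences
# are fixed-length (otherwise a padding collator would be needed for batching).
#
#   from torch.utils.data import DataLoader
#   train_dataset, val_dataset = load_saved_datasets()
#   train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
#   train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
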
if __name__ == "__main__":
    # Download the dataset
    get_dataset(dataset, dataset_split)
    # Save it as a text file
    save_data(dataset, dataset_split)
    # Create the tokenized dataset
    with open("../data/recipes.txt", "r", encoding="utf-8") as f:
        recipes = f.read()
    processed_recipes = process_recipes(recipes, tokenizer)
    dataset = create_dataset(processed_recipes)
    split_and_save_dataset(dataset)
    print("Successfully created tokenized dataset.")