pere committed on
Commit a2ec3ff · 1 Parent(s): 6bc8162
Files changed (3)
  1. norwegian_byt5_ns_base.gin +29 -0
  2. tasks.py +47 -0
  3. train_byt5_ns_base.sh +9 -0
norwegian_byt5_ns_base.gin ADDED
@@ -0,0 +1,29 @@
+ include 't5x/examples/t5/byt5/base.gin'
+ include 'pretrain_cont.gin'
+ #include 't5x/configs/runs/pretrain.gin'
+ #include 't5x/configs/runs/finetune.gin'
+
+
+ # Register necessary SeqIO Tasks/Mixtures.
+ import t5.data.mixtures
+ import tasks
+
+ MIXTURE_OR_TASK_NAME = "byt5_ns_ncc_english_span_corruption_stream"
+ TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
+ TRAIN_STEPS = 1_500_000
+ DROPOUT_RATE = 0.0  # Changed from the default, since T5-1.1 recommends this.
+ INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/byt5/base/model.ckpt-1000000"
+ PjitPartitioner.num_partitions = 1
+
+ # `LOSS_NORMALIZING_FACTOR`: When fine-tuning a model that was pre-trained
+ # using Mesh Tensorflow (e.g. the public T5 / mT5 / ByT5 models), this should be
+ # set to `pretraining batch_size` * `target_token_length`. For T5 and T5.1.1:
+ # `2048 * 114`. For mT5: `1024 * 229`. For ByT5: `1024 * 189`.
+
+ # The instructions above are from T5X; since we continue from the Mesh Tensorflow ByT5 model, this needs to be set.
+ LOSS_NORMALIZING_FACTOR = 193536
+
+
+
+
+
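A quick sanity check of that constant against the recipe quoted in the comment (a sketch, not part of the commit): for ByT5 the pretraining batch size is 1024 and the average target token length is 189.

# LOSS_NORMALIZING_FACTOR = pretraining batch size * target token length
pretraining_batch_size = 1024   # ByT5 values from the T5X note above
target_token_length = 189
assert pretraining_batch_size * target_token_length == 193536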
tasks.py CHANGED
@@ -50,6 +50,25 @@ def dataset_fn(split, shuffle_files, seed=None, dataset_params=None):
      )
  
  
+ def gen_dataset_ns(split, shuffle=False, seed=None, column="text", dataset_params=None):
+     dataset = load_dataset(**dataset_params)
+     if shuffle:
+         if seed:
+             dataset = dataset.shuffle(seed=seed)
+         else:
+             dataset = dataset.shuffle()
+     while True:
+         for item in dataset[str(split)]:
+             yield item[column].replace(" ", "")  # strip all spaces: the "no space" (ns) variant
+
+
+ def dataset_fn_ns(split, shuffle_files, seed=None, dataset_params=None):
+     return tf.data.Dataset.from_generator(
+         functools.partial(gen_dataset_ns, split, shuffle_files, seed, dataset_params=dataset_params),
+         output_signature=tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_name)
+     )
+
+
  @utils.map_over_dataset
  def target_to_key(x, key_map, target_key):
      """Assign the value from the dataset to target_key in key_map"""
@@ -192,6 +211,34 @@ TaskRegistry.add(
      metric_fns=[]
  )
  
+ # Final pretraining task used in Raffel et al., 2019, adapted to NCC.
+ # No-space training.
+ dataset_name = 'NbAiLab/NCC_plus_english'
+ dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
+ dataset_shapes = None
+ TaskRegistry.add(
+     "byt5_ns_ncc_english_span_corruption_stream",
+     source=seqio.FunctionDataSource(
+         dataset_fn=functools.partial(dataset_fn_ns, dataset_params=dataset_params),
+         splits=("train", "validation"),
+         caching_permitted=False,
+         num_input_examples=dataset_shapes,
+     ),
+     preprocessors=[
+         functools.partial(
+             target_to_key, key_map={
+                 "inputs": None,
+                 "targets": None,
+             }, target_key="targets"),
+         seqio.preprocessors.tokenize,
+         # seqio.CacheDatasetPlaceholder(),
+         preprocessors.span_corruption,
+         seqio.preprocessors.append_eos_after_trim,
+     ],
+     output_features={"targets": BYT5_DEFAULT_OUTPUT_FEATURES["targets"]},
+     metric_fns=[]
+ )
+
  
  # Final pretraining task used in Raffel et al., 2019, adapted to NCC
  dataset_name = 'NbAiLab/NCC_plus_english'
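Before a full run, the new registration can be smoke-tested by pulling an example through SeqIO. A minimal sketch, assuming tasks.py is importable and a Hugging Face token is configured (the source loads the dataset with use_auth_token=True):

import seqio
import tasks  # noqa: F401 -- importing registers "byt5_ns_ncc_english_span_corruption_stream"

task = seqio.get_mixture_or_task("byt5_ns_ncc_english_span_corruption_stream")
ds = task.get_dataset(
    sequence_length={"inputs": 512, "targets": 512},  # same as TASK_FEATURE_LENGTHS in the gin file
    split="validation",
    shuffle=False,
)
for ex in ds.take(1):
    # gen_dataset_ns strips spaces upstream, so the space byte
    # (ID 35 = 32 + 3 special tokens in the ByT5 vocabulary) should be absent.
    print(ex["targets"])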
train_byt5_ns_base.sh ADDED
@@ -0,0 +1,9 @@
+ PROJECT_DIR=${HOME}"/models/pk-nb-t5x"
+ T5X_DIR="../../t5x"  # directory where the t5x repo is cloned
+ MODEL_DIR="gs://t5x-training/pretrained_models/norwegian_NCC_plus_English_byt5x_ns_base"
+ export PYTHONPATH=${PROJECT_DIR}
+
+ python3 ${T5X_DIR}/t5x/train.py \
+   --gin_search_paths=${PROJECT_DIR} \
+   --gin_file="norwegian_byt5_ns_base.gin" \
+   --gin.MODEL_DIR="'${MODEL_DIR}'"
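Note the quoting on the final flag: gin override values are parsed as gin expressions, so the inner single quotes in --gin.MODEL_DIR="'${MODEL_DIR}'" make the bucket path a gin string literal; without them, gin would try to resolve the path as a reference and fail. The same pattern is used in the t5x examples.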