pere committed on
Commit
e0e9a43
·
1 Parent(s): 0e11d17

pod changes

Browse files
__pycache__/tasks.cpython-38.pyc CHANGED
Binary files a/__pycache__/tasks.cpython-38.pyc and b/__pycache__/tasks.cpython-38.pyc differ
 
norwegian_base.gin CHANGED
@@ -10,7 +10,7 @@ import tasks
10
 
11
  MIXTURE_OR_TASK_NAME = "ncc_span_corruption_stream"
12
  TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
13
- TRAIN_STEPS = 1_100_000
14
  DROPOUT_RATE = 0.0 # Changed from the default since T5-1.1 recommends this.
15
  INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
16
  PjitPartitioner.num_partitions = 4
 
10
 
11
  MIXTURE_OR_TASK_NAME = "ncc_span_corruption_stream"
12
  TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
13
+ TRAIN_STEPS = 1_500_000
14
  DROPOUT_RATE = 0.0 # Changed from the default since T5-1.1 recommends this.
15
  INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/mt5_base/checkpoint_1000000"
16
  PjitPartitioner.num_partitions = 4
tasks.py CHANGED
@@ -128,3 +128,30 @@ TaskRegistry.add(
128
  output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
129
  metric_fns=[]
130
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
129
  metric_fns=[]
130
  )
131
+
132
+ # Final pretraining task used in Raffel et al., 2019, adapted to NCC
133
+ dataset_name = 'NbAiLab/scandinavian'
134
+ dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
135
+ dataset_shapes = None
136
+ TaskRegistry.add(
137
+ "scandinavian_span_corruption_stream",
138
+ source=seqio.FunctionDataSource(
139
+ dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
140
+ splits=("train", "validation"),
141
+ caching_permitted=False,
142
+ num_input_examples=dataset_shapes,
143
+ ),
144
+ preprocessors=[
145
+ functools.partial(
146
+ target_to_key, key_map={
147
+ "inputs": None,
148
+ "targets": None,
149
+ }, target_key="targets"),
150
+ seqio.preprocessors.tokenize,
151
+ # seqio.CacheDatasetPlaceholder(),
152
+ preprocessors.span_corruption,
153
+ seqio.preprocessors.append_eos_after_trim,
154
+ ],
155
+ output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
156
+ metric_fns=[]
157
+ )
train_base.sh CHANGED
@@ -1,6 +1,6 @@
1
  PROJECT_DIR=${HOME}"/models/pk-nb-t5x"
2
  T5X_DIR="../../t5x" # directory where the t5x is cloned.
3
- MODEL_DIR="gs://nb-t5x-us-central2/pk_nb_t5x_base_run2"
4
  export PYTHONPATH=${PROJECT_DIR}
5
 
6
  python3 ${T5X_DIR}/t5x/train.py \
 
1
  PROJECT_DIR=${HOME}"/models/pk-nb-t5x"
2
  T5X_DIR="../../t5x" # directory where the t5x is cloned.
3
+ MODEL_DIR="gs://nb-t5x-us-central2/norwegian_t5x_base"
4
  export PYTHONPATH=${PROJECT_DIR}
5
 
6
  python3 ${T5X_DIR}/t5x/train.py \