diff --git a/.gitattributes b/.gitattributes index ac481c8eb05e4d2496fbe076a38a7b4835dd733d..5b0b9613c3d3a8b6f6a52f5c455a56e9c79ab7e7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/config.gin b/config.gin new file mode 100644 index 0000000000000000000000000000000000000000..fd09b57c4020c9b950ed4bde0e118d8cb57c9a7a --- /dev/null +++ b/config.gin @@ -0,0 +1,151 @@ +from __gin__ import dynamic_registration +import __main__ as train_script +import seqio +import t5.data.mixtures +from t5x import adafactor +from t5x.examples.t5 import network +from t5x import gin_utils +from t5x import models +from t5x import partitioning +from t5x import trainer +from t5x import utils +import tasks + +# Macros: +# ============================================================================== +BATCH_SIZE = 128 +DROPOUT_RATE = 0.0 +INITIAL_CHECKPOINT_PATH = \ + 'gs://t5-data/pretrained_models/t5x/mt5_xl/checkpoint_1000000' +LABEL_SMOOTHING = 0.0 +LOSS_NORMALIZING_FACTOR = None +MIXTURE_OR_TASK_MODULE = None +MIXTURE_OR_TASK_NAME = 'ncc_english_span_corruption_stream' +MODEL = @models.EncoderDecoderModel() +MODEL_DIR = 'gs://nb-t5x-us-central2/norwegian_NCC_plus_English_t5x_xl' +OPTIMIZER = @adafactor.Adafactor() +RANDOM_SEED = None +SHUFFLE_TRAIN_EXAMPLES = True +TASK_FEATURE_LENGTHS = {'inputs': 512, 'targets': 512} +TRAIN_STEPS = 1500000 +USE_CACHED_TASKS = True +USE_HARDWARE_RNG = False +VOCABULARY = @seqio.SentencePieceVocabulary() +Z_LOSS = 0.0001 + +# Parameters for adafactor.Adafactor: +# ============================================================================== +adafactor.Adafactor.decay_rate = 0.8 +adafactor.Adafactor.logical_factor_rules = \ + @adafactor.standard_logical_factor_rules() +adafactor.Adafactor.step_offset = 0 + +# Parameters for utils.CheckpointConfig: +# ============================================================================== +utils.CheckpointConfig.restore = @utils.RestoreCheckpointConfig() +utils.CheckpointConfig.save = @utils.SaveCheckpointConfig() + +# Parameters for utils.create_learning_rate_scheduler: +# ============================================================================== +utils.create_learning_rate_scheduler.base_learning_rate = 0.5 +utils.create_learning_rate_scheduler.factors = 'constant * rsqrt_decay' +utils.create_learning_rate_scheduler.warmup_steps = 10000 + +# Parameters for train/utils.DatasetConfig: +# ============================================================================== +train/utils.DatasetConfig.batch_size = %BATCH_SIZE +train/utils.DatasetConfig.mixture_or_task_name = %MIXTURE_OR_TASK_NAME +train/utils.DatasetConfig.module = %MIXTURE_OR_TASK_MODULE +train/utils.DatasetConfig.pack = True +train/utils.DatasetConfig.seed = None +train/utils.DatasetConfig.shuffle = %SHUFFLE_TRAIN_EXAMPLES +train/utils.DatasetConfig.split = 'train' +train/utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS +train/utils.DatasetConfig.use_cached = %USE_CACHED_TASKS + +# Parameters for train_eval/utils.DatasetConfig: +# ============================================================================== +train_eval/utils.DatasetConfig.batch_size = %BATCH_SIZE +train_eval/utils.DatasetConfig.mixture_or_task_name = %MIXTURE_OR_TASK_NAME +train_eval/utils.DatasetConfig.module = %MIXTURE_OR_TASK_MODULE +train_eval/utils.DatasetConfig.pack = True +train_eval/utils.DatasetConfig.seed = 42 +train_eval/utils.DatasetConfig.shuffle = False +train_eval/utils.DatasetConfig.split = 'validation' +train_eval/utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS +train_eval/utils.DatasetConfig.use_cached = %USE_CACHED_TASKS + +# Parameters for models.EncoderDecoderModel: +# ============================================================================== +models.EncoderDecoderModel.input_vocabulary = %VOCABULARY +models.EncoderDecoderModel.label_smoothing = %LABEL_SMOOTHING +models.EncoderDecoderModel.loss_normalizing_factor = %LOSS_NORMALIZING_FACTOR +models.EncoderDecoderModel.module = @network.Transformer() +models.EncoderDecoderModel.optimizer_def = %OPTIMIZER +models.EncoderDecoderModel.output_vocabulary = %VOCABULARY +models.EncoderDecoderModel.z_loss = %Z_LOSS + +# Parameters for partitioning.PjitPartitioner: +# ============================================================================== +partitioning.PjitPartitioner.logical_axis_rules = \ + @partitioning.standard_logical_axis_rules() +partitioning.PjitPartitioner.model_parallel_submesh = None +partitioning.PjitPartitioner.num_partitions = 2 + +# Parameters for utils.RestoreCheckpointConfig: +# ============================================================================== +utils.RestoreCheckpointConfig.dtype = 'float32' +utils.RestoreCheckpointConfig.mode = 'specific' +utils.RestoreCheckpointConfig.path = %INITIAL_CHECKPOINT_PATH + +# Parameters for utils.SaveCheckpointConfig: +# ============================================================================== +utils.SaveCheckpointConfig.dtype = 'float32' +utils.SaveCheckpointConfig.keep = 3 +utils.SaveCheckpointConfig.period = 20000 +utils.SaveCheckpointConfig.save_dataset = False + +# Parameters for seqio.SentencePieceVocabulary: +# ============================================================================== +seqio.SentencePieceVocabulary.sentencepiece_model_file = \ + 'gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model' + +# Parameters for network.T5Config: +# ============================================================================== +network.T5Config.dropout_rate = %DROPOUT_RATE +network.T5Config.dtype = 'bfloat16' +network.T5Config.emb_dim = 2048 +network.T5Config.head_dim = 64 +network.T5Config.logits_via_embedding = False +network.T5Config.mlp_activations = ('gelu', 'linear') +network.T5Config.mlp_dim = 5120 +network.T5Config.num_decoder_layers = 24 +network.T5Config.num_encoder_layers = 24 +network.T5Config.num_heads = 32 +network.T5Config.vocab_size = 250112 + +# Parameters for train_script.train: +# ============================================================================== +train_script.train.checkpoint_cfg = @utils.CheckpointConfig() +train_script.train.eval_period = 1000 +train_script.train.eval_steps = 20 +train_script.train.infer_eval_dataset_cfg = None +train_script.train.model = %MODEL +train_script.train.model_dir = %MODEL_DIR +train_script.train.partitioner = @partitioning.PjitPartitioner() +train_script.train.random_seed = %RANDOM_SEED +train_script.train.summarize_config_fn = @gin_utils.summarize_gin_config +train_script.train.total_steps = %TRAIN_STEPS +train_script.train.train_dataset_cfg = @train/utils.DatasetConfig() +train_script.train.train_eval_dataset_cfg = @train_eval/utils.DatasetConfig() +train_script.train.trainer_cls = @trainer.Trainer +train_script.train.use_hardware_rng = %USE_HARDWARE_RNG + +# Parameters for trainer.Trainer: +# ============================================================================== +trainer.Trainer.learning_rate_fn = @utils.create_learning_rate_scheduler() +trainer.Trainer.num_microbatches = None + +# Parameters for network.Transformer: +# ============================================================================== +network.Transformer.config = @network.T5Config() diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b3a91e8da058732da8489382e1513389464e2a2b --- /dev/null +++ b/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "/home/patrick/t5/mt5-xl", + "architectures": [ + "T5ForConditionalGeneration" + ], + "d_ff": 5120, + "d_kv": 64, + "d_model": 2048, + "decoder_start_token_id": 0, + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 32, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "tokenizer_class": "T5Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.19.2", + "use_cache": true, + "vocab_size": 250112 +} diff --git a/flax_model.msgpack b/flax_model.msgpack new file mode 100644 index 0000000000000000000000000000000000000000..f4e7dc53bdb6df5242bf208e15420912b99a55ec --- /dev/null +++ b/flax_model.msgpack @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:429ff72f16d7ab8b36957814093c4adcd1ac8ca3c33a200cc764520af56f3b42 +size 14970502847 diff --git a/model-info.txt b/model-info.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf9d7807998df583a9a80e6fad3823600c903742 --- /dev/null +++ b/model-info.txt @@ -0,0 +1,2793 @@ +Variable decoder/decoder_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_0/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_0/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_0/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_0/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_0/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_0/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_0/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_0/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_0/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_0/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_0/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_0/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_0/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_0/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_1/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_1/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_1/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_1/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_1/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_1/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_1/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_1/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_1/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_1/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_1/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_1/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_1/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_1/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_10/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_10/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_10/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_10/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_10/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_10/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_10/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_10/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_10/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_10/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_10/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_10/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_10/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_10/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_11/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_11/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_11/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_11/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_11/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_11/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_11/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_11/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_11/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_11/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_11/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_11/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_11/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_11/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_12/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_12/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_12/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_12/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_12/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_12/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_12/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_12/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_12/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_12/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_12/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_12/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_12/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_12/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_13/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_13/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_13/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_13/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_13/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_13/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_13/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_13/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_13/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_13/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_13/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_13/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_13/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_13/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_14/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_14/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_14/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_14/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_14/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_14/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_14/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_14/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_14/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_14/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_14/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_14/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_14/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_14/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_15/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_15/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_15/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_15/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_15/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_15/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_15/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_15/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_15/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_15/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_15/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_15/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_15/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_15/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_16/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_16/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_16/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_16/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_16/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_16/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_16/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_16/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_16/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_16/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_16/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_16/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_16/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_16/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_17/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_17/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_17/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_17/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_17/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_17/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_17/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_17/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_17/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_17/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_17/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_17/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_17/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_17/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_18/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_18/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_18/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_18/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_18/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_18/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_18/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_18/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_18/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_18/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_18/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_18/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_18/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_18/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_19/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_19/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_19/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_19/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_19/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_19/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_19/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_19/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_19/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_19/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_19/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_19/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_19/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_19/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_2/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_2/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_2/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_2/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_2/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_2/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_2/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_2/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_2/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_2/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_2/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_2/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_2/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_2/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_20/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_20/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_20/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_20/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_20/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_20/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_20/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_20/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_20/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_20/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_20/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_20/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_20/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_20/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_21/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_21/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_21/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_21/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_21/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_21/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_21/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_21/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_21/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_21/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_21/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_21/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_21/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_21/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_22/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_22/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_22/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_22/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_22/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_22/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_22/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_22/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_22/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_22/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_22/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_22/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_22/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_22/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_23/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_23/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_23/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_23/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_23/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_23/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_23/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_23/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_23/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_23/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_23/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_23/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_23/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_23/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_3/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_3/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_3/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_3/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_3/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_3/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_3/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_3/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_3/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_3/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_3/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_3/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_3/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_3/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_4/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_4/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_4/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_4/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_4/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_4/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_4/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_4/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_4/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_4/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_4/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_4/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_4/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_4/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_5/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_5/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_5/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_5/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_5/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_5/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_5/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_5/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_5/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_5/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_5/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_5/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_5/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_5/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_6/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_6/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_6/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_6/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_6/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_6/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_6/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_6/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_6/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_6/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_6/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_6/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_6/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_6/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_7/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_7/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_7/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_7/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_7/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_7/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_7/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_7/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_7/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_7/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_7/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_7/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_7/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_7/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_8/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_8/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_8/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_8/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_8/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_8/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_8/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_8/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_8/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_8/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_8/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_8/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_8/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_8/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_9/encoder_decoder_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_9/encoder_decoder_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_9/encoder_decoder_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_9/encoder_decoder_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_9/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_9/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable decoder/layers_9/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable decoder/layers_9/pre_cross_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_9/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_9/pre_self_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable decoder/layers_9/self_attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_9/self_attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable decoder/layers_9/self_attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/layers_9/self_attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable decoder/logits_dense/kernel size 512229376 shape (embed=2048, vocab=250112) partition spec (None, 'model') +Variable decoder/relpos_bias/rel_embedding size 1024 shape (heads=32, relpos_buckets=32) partition spec ('model', None) +Variable encoder/encoder_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_0/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_0/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_0/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_0/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_0/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_0/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_0/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_0/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_0/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_1/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_1/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_1/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_1/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_1/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_1/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_1/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_1/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_1/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_10/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_10/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_10/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_10/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_10/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_10/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_10/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_10/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_10/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_11/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_11/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_11/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_11/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_11/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_11/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_11/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_11/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_11/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_12/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_12/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_12/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_12/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_12/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_12/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_12/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_12/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_12/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_13/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_13/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_13/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_13/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_13/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_13/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_13/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_13/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_13/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_14/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_14/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_14/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_14/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_14/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_14/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_14/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_14/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_14/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_15/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_15/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_15/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_15/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_15/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_15/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_15/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_15/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_15/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_16/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_16/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_16/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_16/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_16/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_16/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_16/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_16/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_16/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_17/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_17/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_17/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_17/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_17/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_17/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_17/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_17/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_17/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_18/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_18/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_18/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_18/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_18/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_18/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_18/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_18/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_18/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_19/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_19/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_19/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_19/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_19/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_19/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_19/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_19/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_19/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_2/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_2/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_2/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_2/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_2/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_2/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_2/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_2/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_2/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_20/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_20/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_20/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_20/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_20/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_20/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_20/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_20/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_20/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_21/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_21/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_21/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_21/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_21/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_21/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_21/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_21/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_21/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_22/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_22/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_22/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_22/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_22/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_22/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_22/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_22/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_22/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_23/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_23/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_23/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_23/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_23/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_23/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_23/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_23/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_23/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_3/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_3/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_3/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_3/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_3/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_3/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_3/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_3/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_3/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_4/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_4/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_4/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_4/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_4/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_4/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_4/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_4/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_4/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_5/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_5/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_5/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_5/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_5/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_5/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_5/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_5/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_5/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_6/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_6/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_6/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_6/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_6/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_6/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_6/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_6/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_6/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_7/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_7/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_7/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_7/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_7/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_7/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_7/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_7/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_7/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_8/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_8/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_8/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_8/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_8/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_8/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_8/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_8/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_8/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_9/attention/key/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_9/attention/out/kernel size 4194304 shape (joined_kv=2048, embed=2048) partition spec ('model', None) +Variable encoder/layers_9/attention/query/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_9/attention/value/kernel size 4194304 shape (embed=2048, joined_kv=2048) partition spec (None, 'model') +Variable encoder/layers_9/mlp/wi_0/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_9/mlp/wi_1/kernel size 10485760 shape (embed=2048, mlp=5120) partition spec (None, 'model') +Variable encoder/layers_9/mlp/wo/kernel size 10485760 shape (mlp=5120, embed=2048) partition spec ('model', None) +Variable encoder/layers_9/pre_attention_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/layers_9/pre_mlp_layer_norm/scale size 2048 shape (embed=2048) partition spec (None,) +Variable encoder/relpos_bias/rel_embedding size 1024 shape (heads=32, relpos_buckets=32) partition spec ('model', None) +Variable token_embedder/embedding size 512229376 shape (vocab=250112, embed=2048) partition spec ('model', None) +Total number of parameters: 3742619648 + +Variable param_states/decoder/decoder_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/decoder_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/decoder_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/decoder_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_0/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_0/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_0/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_0/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_0/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_0/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_0/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_1/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_1/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_1/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_1/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_1/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_1/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_1/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_10/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_10/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_10/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_10/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_10/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_10/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_10/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_11/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_11/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_11/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_11/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_11/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_11/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_11/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_12/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_12/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_12/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_12/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_12/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_12/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_12/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_13/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_13/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_13/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_13/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_13/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_13/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_13/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_14/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_14/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_14/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_14/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_14/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_14/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_14/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_15/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_15/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_15/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_15/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_15/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_15/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_15/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_16/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_16/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_16/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_16/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_16/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_16/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_16/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_17/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_17/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_17/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_17/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_17/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_17/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_17/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_18/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_18/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_18/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_18/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_18/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_18/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_18/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_19/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_19/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_19/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_19/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_19/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_19/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_19/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_2/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_2/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_2/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_2/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_2/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_2/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_2/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_20/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_20/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_20/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_20/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_20/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_20/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_20/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_21/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_21/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_21/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_21/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_21/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_21/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_21/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_22/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_22/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_22/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_22/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_22/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_22/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_22/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_23/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_23/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_23/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_23/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_23/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_23/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_23/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_3/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_3/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_3/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_3/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_3/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_3/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_3/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_4/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_4/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_4/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_4/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_4/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_4/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_4/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_5/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_5/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_5/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_5/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_5/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_5/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_5/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_6/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_6/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_6/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_6/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_6/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_6/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_6/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_7/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_7/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_7/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_7/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_7/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_7/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_7/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_8/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_8/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_8/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_8/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_8/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_8/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_8/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/encoder_decoder_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_9/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/decoder/layers_9/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/pre_cross_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_cross_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_9/pre_cross_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_cross_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_9/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_self_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_self_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/decoder/layers_9/pre_self_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/pre_self_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/layers_9/self_attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/decoder/layers_9/self_attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/logits_dense/kernel/m size 1 shape (1,) partition spec None +Variable param_states/decoder/logits_dense/kernel/v size 1 shape (1,) partition spec None +Variable param_states/decoder/logits_dense/kernel/v_col size 250112 shape (250112,) partition spec None +Variable param_states/decoder/logits_dense/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/decoder/relpos_bias/rel_embedding/m size 1 shape (1,) partition spec None +Variable param_states/decoder/relpos_bias/rel_embedding/v size 1024 shape (heads=32, relpos_buckets=32) partition spec ('model', None) +Variable param_states/decoder/relpos_bias/rel_embedding/v_col size 1 shape (1,) partition spec None +Variable param_states/decoder/relpos_bias/rel_embedding/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/encoder_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/encoder_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/encoder_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/encoder_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_0/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_0/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_0/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_0/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_0/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_0/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_1/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_1/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_1/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_1/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_1/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_1/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_10/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_10/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_10/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_10/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_10/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_10/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_11/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_11/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_11/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_11/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_11/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_11/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_12/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_12/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_12/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_12/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_12/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_12/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_13/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_13/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_13/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_13/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_13/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_13/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_14/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_14/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_14/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_14/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_14/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_14/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_15/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_15/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_15/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_15/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_15/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_15/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_16/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_16/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_16/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_16/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_16/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_16/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_17/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_17/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_17/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_17/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_17/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_17/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_18/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_18/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_18/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_18/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_18/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_18/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_19/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_19/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_19/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_19/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_19/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_19/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_2/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_2/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_2/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_2/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_2/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_2/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_20/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_20/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_20/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_20/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_20/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_20/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_21/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_21/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_21/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_21/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_21/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_21/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_22/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_22/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_22/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_22/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_22/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_22/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_23/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_23/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_23/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_23/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_23/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_23/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_3/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_3/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_3/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_3/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_3/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_3/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_4/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_4/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_4/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_4/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_4/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_4/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_5/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_5/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_5/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_5/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_5/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_5/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_6/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_6/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_6/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_6/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_6/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_6/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_7/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_7/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_7/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_7/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_7/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_7/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_8/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_8/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_8/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_8/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_8/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_8/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/key/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/key/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/key/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/key/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/out/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/out/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/out/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/out/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/query/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/query/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/query/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/query/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/value/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/value/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/attention/value/kernel/v_col size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/attention/value/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_0/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_0/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_0/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_0/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_1/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_1/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_1/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_9/mlp/wi_1/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/mlp/wo/kernel/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/mlp/wo/kernel/v size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/mlp/wo/kernel/v_col size 5120 shape (5120,) partition spec None +Variable param_states/encoder/layers_9/mlp/wo/kernel/v_row size 2048 shape (2048,) partition spec None +Variable param_states/encoder/layers_9/pre_attention_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/pre_attention_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_9/pre_attention_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/pre_attention_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/pre_mlp_layer_norm/scale/m size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/pre_mlp_layer_norm/scale/v size 2048 shape (embed=2048) partition spec (None,) +Variable param_states/encoder/layers_9/pre_mlp_layer_norm/scale/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/layers_9/pre_mlp_layer_norm/scale/v_row size 1 shape (1,) partition spec None +Variable param_states/encoder/relpos_bias/rel_embedding/m size 1 shape (1,) partition spec None +Variable param_states/encoder/relpos_bias/rel_embedding/v size 1024 shape (heads=32, relpos_buckets=32) partition spec ('model', None) +Variable param_states/encoder/relpos_bias/rel_embedding/v_col size 1 shape (1,) partition spec None +Variable param_states/encoder/relpos_bias/rel_embedding/v_row size 1 shape (1,) partition spec None +Variable param_states/token_embedder/embedding/m size 1 shape (1,) partition spec None +Variable param_states/token_embedder/embedding/v size 1 shape (1,) partition spec None +Variable param_states/token_embedder/embedding/v_col size 250112 shape (250112,) partition spec None +Variable param_states/token_embedder/embedding/v_row size 2048 shape (2048,) partition spec None +Variable step size 1 shape () partition spec None diff --git a/pytorch_model-00001-of-00003.bin b/pytorch_model-00001-of-00003.bin new file mode 100644 index 0000000000000000000000000000000000000000..f20a78f6d30a44ee511441b4cf00c5c0b7e92a32 --- /dev/null +++ b/pytorch_model-00001-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33c5d6bb3cafeb8e4b4395b24e2cf80929a0392c35e7512421c819211e27968e +size 6679907079 diff --git a/pytorch_model-00002-of-00003.bin b/pytorch_model-00002-of-00003.bin new file mode 100644 index 0000000000000000000000000000000000000000..d4371510dd7f46fb94c1bc6dfaef7147b60c0df1 --- /dev/null +++ b/pytorch_model-00002-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9637711248e208528786fa4ad83dbc2a24195f1b393a651ff42ff607281f6dc3 +size 8290756867 diff --git a/pytorch_model-00003-of-00003.bin b/pytorch_model-00003-of-00003.bin new file mode 100644 index 0000000000000000000000000000000000000000..c77ba17533bcf8875d41d044259a0882dbf52cc9 --- /dev/null +++ b/pytorch_model-00003-of-00003.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f2ac16f65f06bb847ef5691c5c2d10f90bbe5a8d822167be490a06bd2fc0d43 +size 2048918251 diff --git a/pytorch_model.bin.index.json b/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..b5b3d0add02797be3938774ef810c7b0f8937cf2 --- /dev/null +++ b/pytorch_model.bin.index.json @@ -0,0 +1,567 @@ +{ + "metadata": { + "total_size": 19068313600 + }, + "weight_map": { + "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.2.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00003.bin", + "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "decoder.embed_tokens.weight": "pytorch_model-00002-of-00003.bin", + "decoder.final_layer_norm.weight": "pytorch_model-00002-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00003.bin", + "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "encoder.embed_tokens.weight": "pytorch_model-00001-of-00003.bin", + "encoder.final_layer_norm.weight": "pytorch_model-00001-of-00003.bin", + "lm_head.weight": "pytorch_model-00003-of-00003.bin", + "shared.weight": "pytorch_model-00001-of-00003.bin" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6dc4d430ddbd24171268d73da061ce9f0b092911 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1 @@ +{"eos_token": "", "unk_token": "", "pad_token": ""} \ No newline at end of file diff --git a/spiece.model b/spiece.model new file mode 100644 index 0000000000000000000000000000000000000000..e417801865fd66bd40f9d45d46b6d0d0c2aa36b6 --- /dev/null +++ b/spiece.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6 +size 4309802 diff --git a/tf_model.h5 b/tf_model.h5 new file mode 100644 index 0000000000000000000000000000000000000000..a90e96d64ffa119b933bb6d0a32eec59bb2857cc --- /dev/null +++ b/tf_model.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13edf9346263c231557abbd4b42fd847571a957f938e4f48b2d23ec2f1c58acc +size 9760 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..db5a8773175d9750d86374dc47b64f6d55615279 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c3578052e1605d8332eb961bc08d72e246071974e4cc54aa6991826b802aa5 +size 16330369 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c6bdd09d3767260a6a52c91bfb072afbb360c40d --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1 @@ +{"eos_token": "", "unk_token": "", "pad_token": "", "extra_ids": 0, "additional_special_tokens": null, "special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276", "name_or_path": "/home/perk/models/t5_xl_NCC", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"} \ No newline at end of file diff --git a/train/events.out.tfevents.1649576424.t1v-n-94d01c37-w-3.2758548.0.v2 b/train/events.out.tfevents.1649576424.t1v-n-94d01c37-w-3.2758548.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..52df57c91193ffb956d16c6a04d1b378811c90b4 --- /dev/null +++ b/train/events.out.tfevents.1649576424.t1v-n-94d01c37-w-3.2758548.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eaa0614ba50b24dc998f7e7b63243ab9fdfae1f284825cecd15e9fc714e03b6 +size 6403 diff --git a/train/events.out.tfevents.1649578354.t1v-n-94d01c37-w-3.2769526.0.v2 b/train/events.out.tfevents.1649578354.t1v-n-94d01c37-w-3.2769526.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..aa1490310e00c11f0307915f1fcc0f65f6a97891 --- /dev/null +++ b/train/events.out.tfevents.1649578354.t1v-n-94d01c37-w-3.2769526.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2f6cb0d30dd4337e1c29de2f7df28d582514e6dc117511b1a200431039bf243 +size 10707 diff --git a/train/events.out.tfevents.1649585207.t1v-n-94d01c37-w-3.2804417.0.v2 b/train/events.out.tfevents.1649585207.t1v-n-94d01c37-w-3.2804417.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..75df096088488434bb2b31a5821cc8b5c57d33a3 --- /dev/null +++ b/train/events.out.tfevents.1649585207.t1v-n-94d01c37-w-3.2804417.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3974d6a10de120a1346e96e238e32789c497132381ad3edc8c1ac9c821178d0e +size 28263 diff --git a/train/events.out.tfevents.1649604716.t1v-n-94d01c37-w-3.2916105.0.v2 b/train/events.out.tfevents.1649604716.t1v-n-94d01c37-w-3.2916105.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..6ed1d1245a48c00dd3169d4b2361127fc24bfd84 --- /dev/null +++ b/train/events.out.tfevents.1649604716.t1v-n-94d01c37-w-3.2916105.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f6f30f189d95910ac423c65ed57625ded2d0087c11e89244c9a57cde13e9c1 +size 28263 diff --git a/train/events.out.tfevents.1649625471.t1v-n-94d01c37-w-3.3032489.0.v2 b/train/events.out.tfevents.1649625471.t1v-n-94d01c37-w-3.3032489.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..391271040e480d040040b845fa86f1d1ddf4cbaf --- /dev/null +++ b/train/events.out.tfevents.1649625471.t1v-n-94d01c37-w-3.3032489.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0cc134882eb7103245909da57535eece1c57e273ce22762229171b926bbb7f +size 6403 diff --git a/train/events.out.tfevents.1649658143.t1v-n-94d01c37-w-3.3159692.0.v2 b/train/events.out.tfevents.1649658143.t1v-n-94d01c37-w-3.3159692.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..9e3411fb1e732aa4882d66c1aebb8179882cdd1c --- /dev/null +++ b/train/events.out.tfevents.1649658143.t1v-n-94d01c37-w-3.3159692.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c79a36372d4c751ed1b0bfd11e05e835e24612db30a14e2ee10c9158277de31 +size 6317 diff --git a/train/events.out.tfevents.1649659968.t1v-n-94d01c37-w-3.3170227.0.v2 b/train/events.out.tfevents.1649659968.t1v-n-94d01c37-w-3.3170227.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..32611137a72ca51870f367d642b159a0649264a9 --- /dev/null +++ b/train/events.out.tfevents.1649659968.t1v-n-94d01c37-w-3.3170227.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc77584288839071e6f8375cd35ad6eee3ef0232b8428ec81f00b93220ee061e +size 6317 diff --git a/train/events.out.tfevents.1649663312.t1v-n-94d01c37-w-3.3186665.0.v2 b/train/events.out.tfevents.1649663312.t1v-n-94d01c37-w-3.3186665.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..c8d8c280e4a11e92ac42b91e44580c814327200a --- /dev/null +++ b/train/events.out.tfevents.1649663312.t1v-n-94d01c37-w-3.3186665.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3694f46d96522abe7892f510f983a0521fc09566c98dc66f395fa7cdf876eb61 +size 6317 diff --git a/train/events.out.tfevents.1649665806.t1v-n-94d01c37-w-3.3200483.0.v2 b/train/events.out.tfevents.1649665806.t1v-n-94d01c37-w-3.3200483.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0be5dc4dfa7eaae3ba3576749d1363a11b60c3df --- /dev/null +++ b/train/events.out.tfevents.1649665806.t1v-n-94d01c37-w-3.3200483.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b50b7c299d2d2e0896ab637e289f90fee498d48b7101741b87d3802ea9ca214 +size 7866 diff --git a/train/events.out.tfevents.1649677709.t1v-n-94d01c37-w-1.41136.0.v2 b/train/events.out.tfevents.1649677709.t1v-n-94d01c37-w-1.41136.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..976ffd727b279e86b9e333eb892f64e8b747eb72 --- /dev/null +++ b/train/events.out.tfevents.1649677709.t1v-n-94d01c37-w-1.41136.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76925dc40b3244836dc508f5254db7336510ae3129a3a37b37c8ce7ce78771c +size 22411 diff --git a/train/events.out.tfevents.1649693993.t1v-n-94d01c37-w-1.131865.0.v2 b/train/events.out.tfevents.1649693993.t1v-n-94d01c37-w-1.131865.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..20f6f24ab58b4aec1ce3215fa2d2fb24b5b30fc0 --- /dev/null +++ b/train/events.out.tfevents.1649693993.t1v-n-94d01c37-w-1.131865.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c01f9901e7ff5009fde1dccc8a8b82ee794226950316436491cc85f7a863eb +size 75079 diff --git a/train/events.out.tfevents.1649751312.t1v-n-94d01c37-w-1.465030.0.v2 b/train/events.out.tfevents.1649751312.t1v-n-94d01c37-w-1.465030.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..1549cd9b88a106e708ac385b923f156d1c7aa8e8 --- /dev/null +++ b/train/events.out.tfevents.1649751312.t1v-n-94d01c37-w-1.465030.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f98287e1f1a9e25023b263dbeca18bc5870b07526d745d3437ffbbd294dc5e +size 110191 diff --git a/train/events.out.tfevents.1649853499.t1v-n-94d01c37-w-1.1031862.0.v2 b/train/events.out.tfevents.1649853499.t1v-n-94d01c37-w-1.1031862.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..98e92a38399d9164c118c702c438406b341e33ac --- /dev/null +++ b/train/events.out.tfevents.1649853499.t1v-n-94d01c37-w-1.1031862.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dda7643bef2c672627dae470a9b247f8fc7c44513aeed21cf05e480f8740c50 +size 69227 diff --git a/train/events.out.tfevents.1649910223.t1v-n-94d01c37-w-1.1357507.0.v2 b/train/events.out.tfevents.1649910223.t1v-n-94d01c37-w-1.1357507.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..a93b8115745f82500da107b3cd86c40993cacfd0 --- /dev/null +++ b/train/events.out.tfevents.1649910223.t1v-n-94d01c37-w-1.1357507.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adaedf31236ad4eea2176c56799177579f99714714672c4edb7d680c7db20692 +size 15096 diff --git a/train/events.out.tfevents.1649919405.t1v-n-94d01c37-w-1.1411806.0.v2 b/train/events.out.tfevents.1649919405.t1v-n-94d01c37-w-1.1411806.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..70fa53f34f435e22a7b88a620b2229a5f4983b2a --- /dev/null +++ b/train/events.out.tfevents.1649919405.t1v-n-94d01c37-w-1.1411806.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4d2bb2140a669742ef9880407ec2ebef2031ae6957d32ca68a7dc4e374c23f7 +size 10707 diff --git a/train/events.out.tfevents.1649961155.t1v-n-94d01c37-w-1.1579646.0.v2 b/train/events.out.tfevents.1649961155.t1v-n-94d01c37-w-1.1579646.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..9e70c219a553910a1946daf40920cb767bbd3b85 --- /dev/null +++ b/train/events.out.tfevents.1649961155.t1v-n-94d01c37-w-1.1579646.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667fea9a63e25dd71c2c362da69f2837221cea288beb464437c17003a44ae4b1 +size 53134 diff --git a/train/events.out.tfevents.1650012376.t1v-n-94d01c37-w-1.1854026.0.v2 b/train/events.out.tfevents.1650012376.t1v-n-94d01c37-w-1.1854026.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..b5d333c8ecf30899de4e49484c771991abd38f18 --- /dev/null +++ b/train/events.out.tfevents.1650012376.t1v-n-94d01c37-w-1.1854026.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7cf5b908c5ba20aaa10783e3a680c49114fbdeac59aa22cf9794c56344903fe +size 28263 diff --git a/train/events.out.tfevents.1650037350.t1v-n-94d01c37-w-1.1988057.0.v2 b/train/events.out.tfevents.1650037350.t1v-n-94d01c37-w-1.1988057.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..ea916dee610b20b84f968b7a8f495f9b8673ca35 --- /dev/null +++ b/train/events.out.tfevents.1650037350.t1v-n-94d01c37-w-1.1988057.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29ed193fd84b02c439e7b4055e0ed9896909f90f40f4a56329a1d102ad30ec74 +size 26372 diff --git a/train/events.out.tfevents.1650082486.t1v-n-94d01c37-w-1.2198868.0.v2 b/train/events.out.tfevents.1650082486.t1v-n-94d01c37-w-1.2198868.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..3bd3407979231207a547aad0f69ebc7ae4257b80 --- /dev/null +++ b/train/events.out.tfevents.1650082486.t1v-n-94d01c37-w-1.2198868.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5cef4109843f49dfe2b0d49d81f6d43022a5a4209a3845065ecab22ac7d4486 +size 6403 diff --git a/train/events.out.tfevents.1650097888.t1v-n-94d01c37-w-1.2260945.0.v2 b/train/events.out.tfevents.1650097888.t1v-n-94d01c37-w-1.2260945.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..4ff09a80cfa7414d5e6b2a3c2ab7e2d44c66022f --- /dev/null +++ b/train/events.out.tfevents.1650097888.t1v-n-94d01c37-w-1.2260945.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aacbb2467a1dcb8f369c0e2d41ce4c73d9d523b75605b0bbd715fdb423eebcaa +size 32652 diff --git a/train/events.out.tfevents.1650134701.t1v-n-94d01c37-w-1.2449113.0.v2 b/train/events.out.tfevents.1650134701.t1v-n-94d01c37-w-1.2449113.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..516cee3b76b00f436f20d6a326c159d9a682cad5 --- /dev/null +++ b/train/events.out.tfevents.1650134701.t1v-n-94d01c37-w-1.2449113.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4241c13db2a46710277f894245ea41b6a3b77767b20646396427204b3c3a3cde +size 16559 diff --git a/train/events.out.tfevents.1650145105.t1v-n-94d01c37-w-1.2507564.0.v2 b/train/events.out.tfevents.1650145105.t1v-n-94d01c37-w-1.2507564.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..3248e8a2b3d52cc3c3e31454d7a35eec18ece629 --- /dev/null +++ b/train/events.out.tfevents.1650145105.t1v-n-94d01c37-w-1.2507564.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e88c4e07c8b62035b03601d6d951c6f24f7b21c22b5ffab1fb7cab7b251d43 +size 10707 diff --git a/train/events.out.tfevents.1650150928.t1v-n-94d01c37-w-1.2539030.0.v2 b/train/events.out.tfevents.1650150928.t1v-n-94d01c37-w-1.2539030.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..53288495acd8a7f5ea964edbc782b2f102cee3b6 --- /dev/null +++ b/train/events.out.tfevents.1650150928.t1v-n-94d01c37-w-1.2539030.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8acbe6d1f6c4f40f7ce32ecf27ff394d327b8590ac9d6756077b4b9ce5d4d694 +size 16559 diff --git a/train/events.out.tfevents.1650180232.t1v-n-94d01c37-w-1.2669968.0.v2 b/train/events.out.tfevents.1650180232.t1v-n-94d01c37-w-1.2669968.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..4df4d9a0190e759871a914faedbf00816c48c289 --- /dev/null +++ b/train/events.out.tfevents.1650180232.t1v-n-94d01c37-w-1.2669968.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8468b608786eac31b338728da54ed32811f8852ed3c53fd1fc47bef3191195e0 +size 116043 diff --git a/train/events.out.tfevents.1650269484.t1v-n-94d01c37-w-1.3191003.0.v2 b/train/events.out.tfevents.1650269484.t1v-n-94d01c37-w-1.3191003.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0949ab972e1cfe6518ee7bcbc65d3ee9d0f6196e --- /dev/null +++ b/train/events.out.tfevents.1650269484.t1v-n-94d01c37-w-1.3191003.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bcb2d4ce6acc149394e4cd99139e4771a6b7d09af997ada17beca487ced890e +size 7779 diff --git a/train/events.out.tfevents.1650301631.t1v-n-06e40f6a-w-2.144912.0.v2 b/train/events.out.tfevents.1650301631.t1v-n-06e40f6a-w-2.144912.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..27e74d9dd32fefe199a2ecb5ef9a9f8d697fc5d7 --- /dev/null +++ b/train/events.out.tfevents.1650301631.t1v-n-06e40f6a-w-2.144912.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db36ed65485140ddbfd32fcf53d6afb01ba9693556e2209082eff87d11463adc +size 35581 diff --git a/train/events.out.tfevents.1650331210.t1v-n-06e40f6a-w-2.269071.0.v2 b/train/events.out.tfevents.1650331210.t1v-n-06e40f6a-w-2.269071.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..617300ee1c292664dcddd41732e01c6ec730fe1d --- /dev/null +++ b/train/events.out.tfevents.1650331210.t1v-n-06e40f6a-w-2.269071.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e91ebae4f5bc78c13ce30a75effbb2786fa015944412de197ab088cb2ecaa156 +size 35581 diff --git a/train/events.out.tfevents.1650354640.t1v-n-06e40f6a-w-2.418760.0.v2 b/train/events.out.tfevents.1650354640.t1v-n-06e40f6a-w-2.418760.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..b8be3c98f81e0f22df76b607bd59a15e24565318 --- /dev/null +++ b/train/events.out.tfevents.1650354640.t1v-n-06e40f6a-w-2.418760.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed889e112d885dd33c2a50a062293eafc4a5bfb847bd2642034473fb2d2d982 +size 6315 diff --git a/train/events.out.tfevents.1650355218.t1v-n-06e40f6a-w-2.423813.0.v2 b/train/events.out.tfevents.1650355218.t1v-n-06e40f6a-w-2.423813.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..687f84cef77a957b2174c908f58ecfbe452ba201 --- /dev/null +++ b/train/events.out.tfevents.1650355218.t1v-n-06e40f6a-w-2.423813.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925d23a9789d019665a3a0f75b8b401b4d0f4c449d3926a401647e4f61406d01 +size 40043 diff --git a/train/events.out.tfevents.1650439634.t1v-n-da2df89d-w-0.68748.0.v2 b/train/events.out.tfevents.1650439634.t1v-n-da2df89d-w-0.68748.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..f6ce4cc93dfa36595d030018efe83254439a2459 --- /dev/null +++ b/train/events.out.tfevents.1650439634.t1v-n-da2df89d-w-0.68748.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d977f60add19dad7e8526ad9aa7416d73caa003069c077ea5998ab104c8d479 +size 6401 diff --git a/train/events.out.tfevents.1650442527.t1v-n-da2df89d-w-0.83496.0.v2 b/train/events.out.tfevents.1650442527.t1v-n-da2df89d-w-0.83496.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..879cc26fe2c5fd5c963b6e4120a2c5ff39cfbdb2 --- /dev/null +++ b/train/events.out.tfevents.1650442527.t1v-n-da2df89d-w-0.83496.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4c473f11ae8b5a0182b63d1eec23072cd57e45a459c2d0c3a7290a5196a04b +size 7860 diff --git a/train/events.out.tfevents.1650450275.t1v-n-51c2c60a-w-3.119563.0.v2 b/train/events.out.tfevents.1650450275.t1v-n-51c2c60a-w-3.119563.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..70f4658b39bef7b3e40ee7b5736c38bdd33fbae4 --- /dev/null +++ b/train/events.out.tfevents.1650450275.t1v-n-51c2c60a-w-3.119563.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d719be90fafdd1864d812007443cf9a0ff7b4d7a0bf767d91c9d92254e70534 +size 6401 diff --git a/train/events.out.tfevents.1650451918.t1v-n-51c2c60a-w-3.129431.0.v2 b/train/events.out.tfevents.1650451918.t1v-n-51c2c60a-w-3.129431.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..8cddf59dd659ad255a903e2b70bdee06ca25b058 --- /dev/null +++ b/train/events.out.tfevents.1650451918.t1v-n-51c2c60a-w-3.129431.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22f48a97b35dd71a83c61c50c4a76ada4235e09d6f6dff7c533a2b092323fdd +size 38584 diff --git a/train/events.out.tfevents.1650489808.t1v-n-214493c0-w-1.96872.0.v2 b/train/events.out.tfevents.1650489808.t1v-n-214493c0-w-1.96872.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..358df4c553012483cbfe5e3ae5b7fe3869d58cc7 --- /dev/null +++ b/train/events.out.tfevents.1650489808.t1v-n-214493c0-w-1.96872.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29dad258413de9b47cd41acea5c733109736c27d11fcaa573ba98c7b7735807c +size 6315 diff --git a/train/events.out.tfevents.1650490489.t1v-n-214493c0-w-1.105452.0.v2 b/train/events.out.tfevents.1650490489.t1v-n-214493c0-w-1.105452.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..72f4614db6b5042f4ab52f02ac017bf3e1659e0c --- /dev/null +++ b/train/events.out.tfevents.1650490489.t1v-n-214493c0-w-1.105452.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b8f290c760c6da845beffd909e692b0e394e5fcd5e86239de528158194ddc9 +size 35581 diff --git a/train/events.out.tfevents.1650520938.t1v-n-214493c0-w-1.233747.0.v2 b/train/events.out.tfevents.1650520938.t1v-n-214493c0-w-1.233747.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..d508ab0437ddb0fe855ee4d7c17733e1cb58d222 --- /dev/null +++ b/train/events.out.tfevents.1650520938.t1v-n-214493c0-w-1.233747.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300f08ff791c2c31764d1d191f8ce6dedced0fc64cda1fbe5cc7bf4ba77e7040 +size 94111 diff --git a/train/events.out.tfevents.1650601402.t1v-n-214493c0-w-1.620758.0.v2 b/train/events.out.tfevents.1650601402.t1v-n-214493c0-w-1.620758.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..823ce5696e11be837c587bb05824076df05553e0 --- /dev/null +++ b/train/events.out.tfevents.1650601402.t1v-n-214493c0-w-1.620758.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31c10a8fe3877fd62b8721f8aa4b2ddbb1896cd222f6ff8a4895fbdda11ea43 +size 34122 diff --git a/train/events.out.tfevents.1650691171.t1v-n-0474fd26-w-2.66859.0.v2 b/train/events.out.tfevents.1650691171.t1v-n-0474fd26-w-2.66859.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0608c56d2fbec29bd09773f780696bab7e7e39d0 --- /dev/null +++ b/train/events.out.tfevents.1650691171.t1v-n-0474fd26-w-2.66859.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0828cb41ce91aa74ba4a2807fb9314c0c6b3c643490ac328fdc4435102984ad5 +size 81065 diff --git a/train/events.out.tfevents.1650757351.t1v-n-0474fd26-w-2.397514.0.v2 b/train/events.out.tfevents.1650757351.t1v-n-0474fd26-w-2.397514.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..d3be208f6d21353c0011a06107ad8dd4eea08367 --- /dev/null +++ b/train/events.out.tfevents.1650757351.t1v-n-0474fd26-w-2.397514.0.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8b2958ed44760cc6c76e52a43c9ec04a6adc44420f226d8335d0baacceae8a6 +size 6401 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649576425.t1v-n-94d01c37-w-3.2758548.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649576425.t1v-n-94d01c37-w-3.2758548.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..c3ca15af8a9a09af0dc7f209808b624bc90383ed --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649576425.t1v-n-94d01c37-w-3.2758548.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a2f50eb555aa898d22966e9c2d1aa1b26a5b572a8d789145df4e4231d857f6 +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649578354.t1v-n-94d01c37-w-3.2769526.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649578354.t1v-n-94d01c37-w-3.2769526.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..5c9137878442ef68f29430755b5e46dcb40e92fc --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649578354.t1v-n-94d01c37-w-3.2769526.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07657ceb0b756b30642fa31c7596faeb69ff34ece848656834372998b6126fc +size 2736 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649585207.t1v-n-94d01c37-w-3.2804417.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649585207.t1v-n-94d01c37-w-3.2804417.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..4a1a4a86debd9d8a97a46bff078b84a45869c0ff --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649585207.t1v-n-94d01c37-w-3.2804417.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca67896c7081ff07c926eb8c4edd5267ac3757cf21a662a2ae5cb8b78f662d27 +size 18396 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649604716.t1v-n-94d01c37-w-3.2916105.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649604716.t1v-n-94d01c37-w-3.2916105.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..98f17364336bf4756b4e1c6a221101dd762b25cc --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649604716.t1v-n-94d01c37-w-3.2916105.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e3a972bf65012d6307f038c2d4528196e38856542fed7e2b595c37951d410f +size 18396 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649625472.t1v-n-94d01c37-w-3.3032489.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649625472.t1v-n-94d01c37-w-3.3032489.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..8d35f1fbd1154bfc9968eed83c4bcd137572875f --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649625472.t1v-n-94d01c37-w-3.3032489.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73de66ae81b6e6517fcb1855069d4a27a88e4045208ad0f35d921540a9133101 +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649658143.t1v-n-94d01c37-w-3.3159692.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649658143.t1v-n-94d01c37-w-3.3159692.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..3e7b69e01547a278f35ba41b4bc3f71c45cea28a --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649658143.t1v-n-94d01c37-w-3.3159692.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7aaf305f7bc4a7e374edf05d5791811275a2d2deb6567e02ec545365da2eedb +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649659969.t1v-n-94d01c37-w-3.3170227.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649659969.t1v-n-94d01c37-w-3.3170227.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..c3a1d48fd5a2dec5b6d553a0ba6541bc08cd5710 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649659969.t1v-n-94d01c37-w-3.3170227.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09d8dd821466f17ede4c7b1399a183d7b40022a817c389bd1abe548911edef5 +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649663312.t1v-n-94d01c37-w-3.3186665.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649663312.t1v-n-94d01c37-w-3.3186665.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..4c1db1b83b0c83abcaedb2319e42fc5462680ace --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649663312.t1v-n-94d01c37-w-3.3186665.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721a16a33579d6a34ca5d3b9555a3a777e14c72575aea6b21fe08bbf920fa13e +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649665806.t1v-n-94d01c37-w-3.3200483.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649665806.t1v-n-94d01c37-w-3.3200483.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..ec37385d9788eab3e075585cb215aa97716f8f2a --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649665806.t1v-n-94d01c37-w-3.3200483.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82909e7ce902ee604f94c2f23cfe2fd5bd53131133a10f4f5c19db462760923d +size 1431 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649677710.t1v-n-94d01c37-w-1.41136.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649677710.t1v-n-94d01c37-w-1.41136.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..011ee99b998a1e46c55f44f456d454ba3ebec0bf --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649677710.t1v-n-94d01c37-w-1.41136.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8834d274aa80e64c4bef6ad0471bf4020041cc26848a3bcdf24b43582dbf56a6 +size 13176 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649693993.t1v-n-94d01c37-w-1.131865.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649693993.t1v-n-94d01c37-w-1.131865.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..b3c16d32968e99036bceb7565a6123b488e810c4 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649693993.t1v-n-94d01c37-w-1.131865.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87aa15556bd0e83a938f55dc0af1cda1952fc722f2c7275cc4bd077e9829bf4d +size 60156 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649751313.t1v-n-94d01c37-w-1.465030.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649751313.t1v-n-94d01c37-w-1.465030.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..66bd961180ce5cd64cbcef7338c5118c22eac805 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649751313.t1v-n-94d01c37-w-1.465030.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd54e60f3c883c297e09372cd989f0622e1b1753fadd1cb4ccdf8482f2e0934 +size 91476 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649853499.t1v-n-94d01c37-w-1.1031862.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649853499.t1v-n-94d01c37-w-1.1031862.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..8f6f0c6fef97248dfb7caa5f99802f8edfe9b0f9 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649853499.t1v-n-94d01c37-w-1.1031862.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2202338b0f91c6657ce2f63410ee531fe23034cf080c50e4b45a024cacfad77 +size 54936 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649910223.t1v-n-94d01c37-w-1.1357507.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649910223.t1v-n-94d01c37-w-1.1357507.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..80d038f407b51cce0700f09a3f2354c0b0d13746 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649910223.t1v-n-94d01c37-w-1.1357507.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c312cb001a264ab845287b319b6d0350e077f0c1424e659c23b6b5d118754f6 +size 6651 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649919405.t1v-n-94d01c37-w-1.1411806.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649919405.t1v-n-94d01c37-w-1.1411806.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..fd4d924f7afa871c6b3bcaba25b1193e40e6ab56 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649919405.t1v-n-94d01c37-w-1.1411806.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d0fd9e16e3196ae45e192d83d6f9f99e25faa9617723d47c3dab690ccbb97ce +size 2736 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649961155.t1v-n-94d01c37-w-1.1579646.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649961155.t1v-n-94d01c37-w-1.1579646.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..826b7a17ba3a754d65a4314a055de3d142982293 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1649961155.t1v-n-94d01c37-w-1.1579646.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f8f66bad41b289d53b324ce6d087c3911e87d7776ba529cc9b6106101e7c46c +size 40581 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650012376.t1v-n-94d01c37-w-1.1854026.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650012376.t1v-n-94d01c37-w-1.1854026.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..873eb6321fed03d8cbaecedebe14839f7541b7ed --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650012376.t1v-n-94d01c37-w-1.1854026.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f27b14686835017f9dcc8289e9b35f773cc06df1ea96d1e13d52ec5e09ae1bb8 +size 18396 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650037350.t1v-n-94d01c37-w-1.1988057.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650037350.t1v-n-94d01c37-w-1.1988057.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..f6c57769357780cb3a662a0fa6012b97a681ac4f --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650037350.t1v-n-94d01c37-w-1.1988057.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad69c2f106bde93ecfec157b03ced958b967b88087c0378aed468df583195e4 +size 17091 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650082486.t1v-n-94d01c37-w-1.2198868.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650082486.t1v-n-94d01c37-w-1.2198868.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..679fea783adf35a48b6c3d03be83609f868e1d05 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650082486.t1v-n-94d01c37-w-1.2198868.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08cb5a508614b457b895d300f26b6c80c06e3a37534505095e7e7d78a7586fd6 +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650097888.t1v-n-94d01c37-w-1.2260945.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650097888.t1v-n-94d01c37-w-1.2260945.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..234af6582ea45e95a9594539707078e10ca0b16f --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650097888.t1v-n-94d01c37-w-1.2260945.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10c145a507f414f30e4120317ce93597f0461e5f783f0389fa0886c53d4a2b95 +size 22311 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650134701.t1v-n-94d01c37-w-1.2449113.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650134701.t1v-n-94d01c37-w-1.2449113.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..c11edc2278846cd8c972e1e05b4c7f223f2f74a6 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650134701.t1v-n-94d01c37-w-1.2449113.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c188cd2ae0706e3dc9abf3f876bfd75ded942e54870034772d8c44efcacac39d +size 7956 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650145105.t1v-n-94d01c37-w-1.2507564.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650145105.t1v-n-94d01c37-w-1.2507564.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..4c9a29858a5992908f886dd9fff52b09cdced26a --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650145105.t1v-n-94d01c37-w-1.2507564.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd3283b7275341598ddd41bf783c1f79be694db561d64e79e30f407fa8cc4dd1 +size 2736 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650150928.t1v-n-94d01c37-w-1.2539030.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650150928.t1v-n-94d01c37-w-1.2539030.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..7a5c27992cc5543dd1e289cbfaac579ba8a63c3e --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650150928.t1v-n-94d01c37-w-1.2539030.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c175e4bea4cfbe34b76e5144f765b91815cab21eb8060d12770a336f43d7f43 +size 7956 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650180232.t1v-n-94d01c37-w-1.2669968.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650180232.t1v-n-94d01c37-w-1.2669968.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..96742912a743e14097ed5ab7137d3e8cea2b3b91 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650180232.t1v-n-94d01c37-w-1.2669968.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b98ccc21fd51df7282871a54d6b1b4031efb058ae56e92dd28bc37fee8dd287c +size 96696 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650269484.t1v-n-94d01c37-w-1.3191003.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650269484.t1v-n-94d01c37-w-1.3191003.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..8e5e6dc1036546608c349b90a65037506548a327 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650269484.t1v-n-94d01c37-w-1.3191003.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce05e832829f28ec5fa05d7ef6c48a9b560111542b90e2cb6f97e60ceff99e70 +size 1431 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650301632.t1v-n-06e40f6a-w-2.144912.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650301632.t1v-n-06e40f6a-w-2.144912.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0768e26afc4c87ddbe266d750116ee37d394c746 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650301632.t1v-n-06e40f6a-w-2.144912.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:797f8cb323fb39691004e7a9886a2c8db3da62cad3b1013b3d2216093c4ce41c +size 24921 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650331210.t1v-n-06e40f6a-w-2.269071.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650331210.t1v-n-06e40f6a-w-2.269071.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..680b3804e70b4f20056c5d1bd18f445811c5a5fc --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650331210.t1v-n-06e40f6a-w-2.269071.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0610719d72ecab8583139cae2c66f87be5958e24f8a8f4cbde4b98c66c1878df +size 24921 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650354640.t1v-n-06e40f6a-w-2.418760.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650354640.t1v-n-06e40f6a-w-2.418760.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..0b63de634fdde0de31133d96e01c7c42d8e1fd49 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650354640.t1v-n-06e40f6a-w-2.418760.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32c88fc04427b6385c7cfec40cde0da3834743957a344a6d5ff2c5238fe8bb2e +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650355219.t1v-n-06e40f6a-w-2.423813.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650355219.t1v-n-06e40f6a-w-2.423813.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..51857f95b1c9c0e8f33a41e396626c89b8422951 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650355219.t1v-n-06e40f6a-w-2.423813.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d45a732f205c36f4068b478daa02db2d1ab9bc549cabf8d49b6ec64f95427164 +size 30141 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650439634.t1v-n-da2df89d-w-0.68748.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650439634.t1v-n-da2df89d-w-0.68748.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..a23055e851076da123689a91798723cc33f38e91 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650439634.t1v-n-da2df89d-w-0.68748.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6640eee54f56c515fceb59c7d8e769cf575a746cee6863a412026c0b95b357d +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650442528.t1v-n-da2df89d-w-0.83496.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650442528.t1v-n-da2df89d-w-0.83496.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..642bb57fbcdcdfdc483f33e9e32303434d015bb6 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650442528.t1v-n-da2df89d-w-0.83496.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf35bba39e3362258cb6a9b8be42e9b72a9b0cc053f85244876d26f9aa22569f +size 1431 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650450275.t1v-n-51c2c60a-w-3.119563.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650450275.t1v-n-51c2c60a-w-3.119563.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..a0e322bc36e12189f96938338a98e474025b9b2f --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650450275.t1v-n-51c2c60a-w-3.119563.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2615aa72db9df9817413a010a22964ea5eced9f4eff597c7a33dd8da993af098 +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650451918.t1v-n-51c2c60a-w-3.129431.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650451918.t1v-n-51c2c60a-w-3.129431.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..6566efe1169b6f522a5f7e46ac3811e2e338bf7d --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650451918.t1v-n-51c2c60a-w-3.129431.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2276da88dec34498b45b89b8e86a61b393d6c749fb74ebd974b5266cea744b6f +size 28836 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650489808.t1v-n-214493c0-w-1.96872.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650489808.t1v-n-214493c0-w-1.96872.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..c8916cf7512ab6590357587a1ddded40d08d4ae1 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650489808.t1v-n-214493c0-w-1.96872.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3022229cd3ef8682157e7275a9a946fadde8aba5675d593b21dbb10073e02aa +size 40 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650490490.t1v-n-214493c0-w-1.105452.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650490490.t1v-n-214493c0-w-1.105452.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..f7762a588b6720bf027d27e4bf623792a5977aa5 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650490490.t1v-n-214493c0-w-1.105452.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1b7f73344b9386bc0b3949d9b1ac3c8681b52119ffbd610337e33c59dd308fb +size 24921 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650520938.t1v-n-214493c0-w-1.233747.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650520938.t1v-n-214493c0-w-1.233747.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..94947baeb3c8818e7d63bd3b1d8b0d8e56daf4cf --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650520938.t1v-n-214493c0-w-1.233747.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7916f5fd84420188e43e9eabfbb03d736c14e7168344a66afbdebe0b8e8f445d +size 77121 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650601403.t1v-n-214493c0-w-1.620758.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650601403.t1v-n-214493c0-w-1.620758.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..39747c7868dae3973aaedb33a25e9487dda82221 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650601403.t1v-n-214493c0-w-1.620758.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c82b6c05a7498e83a4f98f0fff960ab381f02c352d3fa66eace4b54152f40b +size 24921 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650691171.t1v-n-0474fd26-w-2.66859.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650691171.t1v-n-0474fd26-w-2.66859.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..399502f392636f1fba6fb939eee039f61947eea7 --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650691171.t1v-n-0474fd26-w-2.66859.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:766cc0a11f5b17d1e63b9eb90e653bc525ae5f037984fbb88c0035eb53bd19bb +size 66681 diff --git a/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650757351.t1v-n-0474fd26-w-2.397514.1.v2 b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650757351.t1v-n-0474fd26-w-2.397514.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..adaff34b6dcbff68ce9a6883f4ca95364db5485c --- /dev/null +++ b/training_eval/ncc_english_span_corruption_stream/events.out.tfevents.1650757351.t1v-n-0474fd26-w-2.397514.1.v2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f3f74c1fbc2ad778e59896e9f3e6a89cd40fa5d50cf8cd2e84f908d2eee7921 +size 40