Upload folder using huggingface_hub (#1)
Browse files
- Upload folder using huggingface_hub (d193feb86c853434135b9b9573481ac83de38ffb)
- .gitattributes +2 -0
 - mesh_tensorflow_checkpoints/model.ckpt-1460784.data-00000-of-00002 +0 -0
 - mesh_tensorflow_checkpoints/model.ckpt-1460784.data-00001-of-00002 +3 -0
 - mesh_tensorflow_checkpoints/model.ckpt-1460784.index +0 -0
 - mesh_tensorflow_checkpoints/model.ckpt-1460784.meta +3 -0
 - mesh_tensorflow_checkpoints/operative_config.gin +245 -0
 
    	
        .gitattributes
    CHANGED
    
    | 
         @@ -6,3 +6,5 @@ 
     | 
|
| 6 | 
         
             
            *.tar.gz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 7 | 
         
             
            *.ot filter=lfs diff=lfs merge=lfs -text
         
     | 
| 8 | 
         
             
            *.onnx filter=lfs diff=lfs merge=lfs -text
         
     | 
| 
         | 
|
| 
         | 
| 
         | 
|
| 6 | 
         
             
            *.tar.gz filter=lfs diff=lfs merge=lfs -text
         
     | 
| 7 | 
         
             
            *.ot filter=lfs diff=lfs merge=lfs -text
         
     | 
| 8 | 
         
             
            *.onnx filter=lfs diff=lfs merge=lfs -text
         
     | 
| 9 | 
         
            +
            mesh_tensorflow_checkpoints/model.ckpt-1460784.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
         
     | 
| 10 | 
         
            +
            mesh_tensorflow_checkpoints/model.ckpt-1460784.meta filter=lfs diff=lfs merge=lfs -text
         
     | 
    	
        mesh_tensorflow_checkpoints/model.ckpt-1460784.data-00000-of-00002
    ADDED
    
    | 
         Binary file (8 Bytes). View file 
     | 
| 
         | 
    	
        mesh_tensorflow_checkpoints/model.ckpt-1460784.data-00001-of-00002
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:4e45f3c6d3d0ef5a2678b59ec0da6bccc45dd203a431aa9278eb601104da26cd
         
     | 
| 3 | 
         
            +
            size 1480297984
         
     | 
    	
        mesh_tensorflow_checkpoints/model.ckpt-1460784.index
    ADDED
    
    | 
         Binary file (21 kB). View file 
     | 
| 
         | 
    	
        mesh_tensorflow_checkpoints/model.ckpt-1460784.meta
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:63fbae2b0454bef2790070e657d0431193083bd93865736a4ffd8e1c4df29b36
         
     | 
| 3 | 
         
            +
            size 41753926
         
     | 
    	
        mesh_tensorflow_checkpoints/operative_config.gin
    ADDED
    
    | 
         @@ -0,0 +1,245 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import mesh_tensorflow.optimize
         
     | 
| 2 | 
         
            +
            import mesh_tensorflow.transformer.dataset
         
     | 
| 3 | 
         
            +
            import mesh_tensorflow.transformer.learning_rate_schedules
         
     | 
| 4 | 
         
            +
            import mesh_tensorflow.transformer.t2t_vocabulary
         
     | 
| 5 | 
         
            +
            import mesh_tensorflow.transformer.transformer_layers
         
     | 
| 6 | 
         
            +
            import mesh_tensorflow.transformer.utils
         
     | 
| 7 | 
         
            +
            import t5.data.sentencepiece_vocabulary
         
     | 
| 8 | 
         
            +
            import t5.models.mesh_transformer
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            # Macros:
         
     | 
| 11 | 
         
            +
            # ==============================================================================
         
     | 
| 12 | 
         
            +
            d_ff = 4096
         
     | 
| 13 | 
         
            +
            d_kv = 64
         
     | 
| 14 | 
         
            +
            d_model = 1024
         
     | 
| 15 | 
         
            +
            dropout_rate = 0.1
         
     | 
| 16 | 
         
            +
            num_heads = 16
         
     | 
| 17 | 
         
            +
            num_layers = 24
         
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            # Parameters for AdafactorOptimizer:
         
     | 
| 20 | 
         
            +
            # ==============================================================================
         
     | 
| 21 | 
         
            +
            AdafactorOptimizer.beta1 = 0.0
         
     | 
| 22 | 
         
            +
            AdafactorOptimizer.clipping_threshold = 1.0
         
     | 
| 23 | 
         
            +
            AdafactorOptimizer.decay_rate = None
         
     | 
| 24 | 
         
            +
            AdafactorOptimizer.epsilon1 = 1e-30
         
     | 
| 25 | 
         
            +
            AdafactorOptimizer.epsilon2 = 0.001
         
     | 
| 26 | 
         
            +
            AdafactorOptimizer.factored = True
         
     | 
| 27 | 
         
            +
            AdafactorOptimizer.min_dim_size_to_factor = 128
         
     | 
| 28 | 
         
            +
            AdafactorOptimizer.multiply_by_parameter_scale = True
         
     | 
| 29 | 
         
            +
             
     | 
| 30 | 
         
            +
            # Parameters for Bitransformer:
         
     | 
| 31 | 
         
            +
            # ==============================================================================
         
     | 
| 32 | 
         
            +
            Bitransformer.shared_embedding = True
         
     | 
| 33 | 
         
            +
             
     | 
| 34 | 
         
            +
            # Parameters for denoise:
         
     | 
| 35 | 
         
            +
            # ==============================================================================
         
     | 
| 36 | 
         
            +
            # None.
         
     | 
| 37 | 
         
            +
             
     | 
| 38 | 
         
            +
            # Parameters for decoder/DenseReluDense:
         
     | 
| 39 | 
         
            +
            # ==============================================================================
         
     | 
| 40 | 
         
            +
            decoder/DenseReluDense.activation = 'relu'
         
     | 
| 41 | 
         
            +
            decoder/DenseReluDense.dropout_rate = %dropout_rate
         
     | 
| 42 | 
         
            +
            decoder/DenseReluDense.hidden_size = %d_ff
         
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
            +
            # Parameters for encoder/DenseReluDense:
         
     | 
| 45 | 
         
            +
            # ==============================================================================
         
     | 
| 46 | 
         
            +
            encoder/DenseReluDense.activation = 'relu'
         
     | 
| 47 | 
         
            +
            encoder/DenseReluDense.dropout_rate = %dropout_rate
         
     | 
| 48 | 
         
            +
            encoder/DenseReluDense.hidden_size = %d_ff
         
     | 
| 49 | 
         
            +
             
     | 
| 50 | 
         
            +
            # Parameters for decoder/EncDecAttention:
         
     | 
| 51 | 
         
            +
            # ==============================================================================
         
     | 
| 52 | 
         
            +
            # None.
         
     | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
            +
            # Parameters for get_variable_dtype:
         
     | 
| 55 | 
         
            +
            # ==============================================================================
         
     | 
| 56 | 
         
            +
            get_variable_dtype.activation_dtype = 'bfloat16'
         
     | 
| 57 | 
         
            +
             
     | 
| 58 | 
         
            +
            # Parameters for get_vocab_embedding_cls:
         
     | 
| 59 | 
         
            +
            # ==============================================================================
         
     | 
| 60 | 
         
            +
            # None.
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
            # Parameters for get_vocabulary:
         
     | 
| 63 | 
         
            +
            # ==============================================================================
         
     | 
| 64 | 
         
            +
            # None.
         
     | 
| 65 | 
         
            +
             
     | 
| 66 | 
         
            +
            # Parameters for iid_noise_mask:
         
     | 
| 67 | 
         
            +
            # ==============================================================================
         
     | 
| 68 | 
         
            +
            # None.
         
     | 
| 69 | 
         
            +
             
     | 
| 70 | 
         
            +
            # Parameters for decoder/LayerStack:
         
     | 
| 71 | 
         
            +
            # ==============================================================================
         
     | 
| 72 | 
         
            +
            decoder/LayerStack.dropout_rate = %dropout_rate
         
     | 
| 73 | 
         
            +
            decoder/LayerStack.norm_epsilon = 1e-06
         
     | 
| 74 | 
         
            +
            decoder/LayerStack.recompute_grads = False
         
     | 
| 75 | 
         
            +
             
     | 
| 76 | 
         
            +
            # Parameters for encoder/LayerStack:
         
     | 
| 77 | 
         
            +
            # ==============================================================================
         
     | 
| 78 | 
         
            +
            encoder/LayerStack.dropout_rate = %dropout_rate
         
     | 
| 79 | 
         
            +
            encoder/LayerStack.norm_epsilon = 1e-06
         
     | 
| 80 | 
         
            +
            encoder/LayerStack.recompute_grads = False
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
            # Parameters for make_bitransformer:
         
     | 
| 83 | 
         
            +
            # ==============================================================================
         
     | 
| 84 | 
         
            +
            make_bitransformer.decoder_name = 'decoder'
         
     | 
| 85 | 
         
            +
            make_bitransformer.encoder_name = 'encoder'
         
     | 
| 86 | 
         
            +
             
     | 
| 87 | 
         
            +
            # Parameters for decoder/make_layer_stack:
         
     | 
| 88 | 
         
            +
            # ==============================================================================
         
     | 
| 89 | 
         
            +
            decoder/make_layer_stack.block_scope = True
         
     | 
| 90 | 
         
            +
            decoder/make_layer_stack.layers = \
         
     | 
| 91 | 
         
            +
                [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
         
     | 
| 92 | 
         
            +
                 @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
         
     | 
| 93 | 
         
            +
                 @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
         
     | 
| 94 | 
         
            +
            decoder/make_layer_stack.num_layers = %num_layers
         
     | 
| 95 | 
         
            +
             
     | 
| 96 | 
         
            +
            # Parameters for encoder/make_layer_stack:
         
     | 
| 97 | 
         
            +
            # ==============================================================================
         
     | 
| 98 | 
         
            +
            encoder/make_layer_stack.block_scope = True
         
     | 
| 99 | 
         
            +
            encoder/make_layer_stack.layers = \
         
     | 
| 100 | 
         
            +
                [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
         
     | 
| 101 | 
         
            +
                 @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
         
     | 
| 102 | 
         
            +
            encoder/make_layer_stack.num_layers = %num_layers
         
     | 
| 103 | 
         
            +
             
     | 
| 104 | 
         
            +
            # Parameters for maybe_print_dataset:
         
     | 
| 105 | 
         
            +
            # ==============================================================================
         
     | 
| 106 | 
         
            +
            maybe_print_dataset.should_print = False
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            # Parameters for mesh_train_dataset_fn:
         
     | 
| 109 | 
         
            +
            # ==============================================================================
         
     | 
| 110 | 
         
            +
            mesh_train_dataset_fn.use_cached = False
         
     | 
| 111 | 
         
            +
             
     | 
| 112 | 
         
            +
            # Parameters for MtfModel:
         
     | 
| 113 | 
         
            +
            # ==============================================================================
         
     | 
| 114 | 
         
            +
            MtfModel.autostack = True
         
     | 
| 115 | 
         
            +
            MtfModel.ensemble_inputs = None
         
     | 
| 116 | 
         
            +
            MtfModel.gcp_project = None
         
     | 
| 117 | 
         
            +
            MtfModel.layout_rules = \
         
     | 
| 118 | 
         
            +
                'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
         
     | 
| 119 | 
         
            +
            MtfModel.mesh_devices = None
         
     | 
| 120 | 
         
            +
            MtfModel.mesh_shape = None
         
     | 
| 121 | 
         
            +
            MtfModel.model_type = 'bitransformer'
         
     | 
| 122 | 
         
            +
            MtfModel.optimizer = None
         
     | 
| 123 | 
         
            +
            MtfModel.predict_fn = None
         
     | 
| 124 | 
         
            +
            MtfModel.tpu_job_name = None
         
     | 
| 125 | 
         
            +
            MtfModel.tpu_zone = None
         
     | 
| 126 | 
         
            +
            MtfModel.variable_filter = None
         
     | 
| 127 | 
         
            +
             
     | 
| 128 | 
         
            +
            # Parameters for noise_token_to_sentinel:
         
     | 
| 129 | 
         
            +
            # ==============================================================================
         
     | 
| 130 | 
         
            +
            # None.
         
     | 
| 131 | 
         
            +
             
     | 
| 132 | 
         
            +
            # Parameters for num_parallel_calls:
         
     | 
| 133 | 
         
            +
            # ==============================================================================
         
     | 
| 134 | 
         
            +
            num_parallel_calls.deterministic = False
         
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            # Parameters for pack_dataset:
         
     | 
| 137 | 
         
            +
            # ==============================================================================
         
     | 
| 138 | 
         
            +
            pack_dataset.use_custom_ops = False
         
     | 
| 139 | 
         
            +
             
     | 
| 140 | 
         
            +
            # Parameters for pack_or_pad:
         
     | 
| 141 | 
         
            +
            # ==============================================================================
         
     | 
| 142 | 
         
            +
            # None.
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            # Parameters for decoder/SelfAttention:
         
     | 
| 145 | 
         
            +
            # ==============================================================================
         
     | 
| 146 | 
         
            +
            decoder/SelfAttention.attention_func = None
         
     | 
| 147 | 
         
            +
            decoder/SelfAttention.attention_kwargs = None
         
     | 
| 148 | 
         
            +
            decoder/SelfAttention.combine_dims = True
         
     | 
| 149 | 
         
            +
            decoder/SelfAttention.dropout_rate = %dropout_rate
         
     | 
| 150 | 
         
            +
            decoder/SelfAttention.keep_query_heads_dims = False
         
     | 
| 151 | 
         
            +
            decoder/SelfAttention.key_value_size = %d_kv
         
     | 
| 152 | 
         
            +
            decoder/SelfAttention.num_heads = %num_heads
         
     | 
| 153 | 
         
            +
            decoder/SelfAttention.num_memory_heads = 0
         
     | 
| 154 | 
         
            +
            decoder/SelfAttention.relative_attention_num_buckets = 32
         
     | 
| 155 | 
         
            +
            decoder/SelfAttention.relative_attention_type = 'bias_shared'
         
     | 
| 156 | 
         
            +
            decoder/SelfAttention.shared_kv = False
         
     | 
| 157 | 
         
            +
             
     | 
| 158 | 
         
            +
            # Parameters for encoder/SelfAttention:
         
     | 
| 159 | 
         
            +
            # ==============================================================================
         
     | 
| 160 | 
         
            +
            encoder/SelfAttention.attention_func = None
         
     | 
| 161 | 
         
            +
            encoder/SelfAttention.attention_kwargs = None
         
     | 
| 162 | 
         
            +
            encoder/SelfAttention.combine_dims = True
         
     | 
| 163 | 
         
            +
            encoder/SelfAttention.dropout_rate = %dropout_rate
         
     | 
| 164 | 
         
            +
            encoder/SelfAttention.keep_query_heads_dims = False
         
     | 
| 165 | 
         
            +
            encoder/SelfAttention.key_value_size = %d_kv
         
     | 
| 166 | 
         
            +
            encoder/SelfAttention.num_heads = %num_heads
         
     | 
| 167 | 
         
            +
            encoder/SelfAttention.num_memory_heads = 0
         
     | 
| 168 | 
         
            +
            encoder/SelfAttention.relative_attention_num_buckets = 32
         
     | 
| 169 | 
         
            +
            encoder/SelfAttention.relative_attention_type = 'bias_shared'
         
     | 
| 170 | 
         
            +
            encoder/SelfAttention.shared_kv = False
         
     | 
| 171 | 
         
            +
             
     | 
| 172 | 
         
            +
            # Parameters for SentencePieceVocabulary:
         
     | 
| 173 | 
         
            +
            # ==============================================================================
         
     | 
| 174 | 
         
            +
            # None.
         
     | 
| 175 | 
         
            +
             
     | 
| 176 | 
         
            +
            # Parameters for sentinel_id:
         
     | 
| 177 | 
         
            +
            # ==============================================================================
         
     | 
| 178 | 
         
            +
            sentinel_id.return_value = None
         
     | 
| 179 | 
         
            +
             
     | 
| 180 | 
         
            +
            # Parameters for serialize_num_microbatches:
         
     | 
| 181 | 
         
            +
            # ==============================================================================
         
     | 
| 182 | 
         
            +
            serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
         
     | 
| 183 | 
         
            +
             
     | 
| 184 | 
         
            +
            # Parameters for shift_targets:
         
     | 
| 185 | 
         
            +
            # ==============================================================================
         
     | 
| 186 | 
         
            +
            shift_targets.bos_id = 0
         
     | 
| 187 | 
         
            +
            shift_targets.eos_id = 1
         
     | 
| 188 | 
         
            +
             
     | 
| 189 | 
         
            +
            # Parameters for tpu_estimator_model_fn:
         
     | 
| 190 | 
         
            +
            # ==============================================================================
         
     | 
| 191 | 
         
            +
            tpu_estimator_model_fn.model_info_file = None
         
     | 
| 192 | 
         
            +
            tpu_estimator_model_fn.outer_batch_size = 1
         
     | 
| 193 | 
         
            +
            tpu_estimator_model_fn.tpu_summaries = False
         
     | 
| 194 | 
         
            +
             
     | 
| 195 | 
         
            +
            # Parameters for tpu_mesh_shape:
         
     | 
| 196 | 
         
            +
            # ==============================================================================
         
     | 
| 197 | 
         
            +
            tpu_mesh_shape.ensemble_parallelism = None
         
     | 
| 198 | 
         
            +
             
     | 
| 199 | 
         
            +
            # Parameters for decoder/Unitransformer:
         
     | 
| 200 | 
         
            +
            # ==============================================================================
         
     | 
| 201 | 
         
            +
            decoder/Unitransformer.d_model = %d_model
         
     | 
| 202 | 
         
            +
            decoder/Unitransformer.ensemble = None
         
     | 
| 203 | 
         
            +
            decoder/Unitransformer.input_full_attention = False
         
     | 
| 204 | 
         
            +
            decoder/Unitransformer.label_smoothing = 0.0
         
     | 
| 205 | 
         
            +
            decoder/Unitransformer.loss_denominator = 233472
         
     | 
| 206 | 
         
            +
            decoder/Unitransformer.loss_fn = None
         
     | 
| 207 | 
         
            +
            decoder/Unitransformer.loss_on_targets_only = False
         
     | 
| 208 | 
         
            +
            decoder/Unitransformer.max_length = 512
         
     | 
| 209 | 
         
            +
            decoder/Unitransformer.positional_embedding = False
         
     | 
| 210 | 
         
            +
            decoder/Unitransformer.shared_embedding_and_softmax_weights = True
         
     | 
| 211 | 
         
            +
            decoder/Unitransformer.sinusoid_positional_embedding = False
         
     | 
| 212 | 
         
            +
            decoder/Unitransformer.token_dropout_rate = 0.0
         
     | 
| 213 | 
         
            +
            decoder/Unitransformer.vocab_divisor = 128
         
     | 
| 214 | 
         
            +
            decoder/Unitransformer.z_loss = 0.0001
         
     | 
| 215 | 
         
            +
             
     | 
| 216 | 
         
            +
            # Parameters for encoder/Unitransformer:
         
     | 
| 217 | 
         
            +
            # ==============================================================================
         
     | 
| 218 | 
         
            +
            encoder/Unitransformer.d_model = %d_model
         
     | 
| 219 | 
         
            +
            encoder/Unitransformer.ensemble = None
         
     | 
| 220 | 
         
            +
            encoder/Unitransformer.input_full_attention = False
         
     | 
| 221 | 
         
            +
            encoder/Unitransformer.label_smoothing = 0.0
         
     | 
| 222 | 
         
            +
            encoder/Unitransformer.loss_denominator = None
         
     | 
| 223 | 
         
            +
            encoder/Unitransformer.loss_fn = None
         
     | 
| 224 | 
         
            +
            encoder/Unitransformer.loss_on_targets_only = False
         
     | 
| 225 | 
         
            +
            encoder/Unitransformer.max_length = 512
         
     | 
| 226 | 
         
            +
            encoder/Unitransformer.positional_embedding = False
         
     | 
| 227 | 
         
            +
            encoder/Unitransformer.shared_embedding_and_softmax_weights = True
         
     | 
| 228 | 
         
            +
            encoder/Unitransformer.sinusoid_positional_embedding = False
         
     | 
| 229 | 
         
            +
            encoder/Unitransformer.token_dropout_rate = 0.0
         
     | 
| 230 | 
         
            +
            encoder/Unitransformer.vocab_divisor = 128
         
     | 
| 231 | 
         
            +
            encoder/Unitransformer.z_loss = 0.0001
         
     | 
| 232 | 
         
            +
             
     | 
| 233 | 
         
            +
            # Parameters for VarianceScalingInitializer:
         
     | 
| 234 | 
         
            +
            # ==============================================================================
         
     | 
| 235 | 
         
            +
            VarianceScalingInitializer.distribution = 'normal'
         
     | 
| 236 | 
         
            +
            VarianceScalingInitializer.mode = 'fan_in'
         
     | 
| 237 | 
         
            +
            VarianceScalingInitializer.scale = 1.0
         
     | 
| 238 | 
         
            +
             
     | 
| 239 | 
         
            +
            # Parameters for VocabEmbedding:
         
     | 
| 240 | 
         
            +
            # ==============================================================================
         
     | 
| 241 | 
         
            +
            # None.
         
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
            # Parameters for Vocabulary:
         
     | 
| 244 | 
         
            +
            # ==============================================================================
         
     | 
| 245 | 
         
            +
            # None.
         
     |