| tensor_model_parallel_size: 1 | |
| pipeline_model_parallel_size: 1 | |
| virtual_pipeline_model_parallel_size: null | |
| sequence_parallel: false | |
| context_parallel_size: 1 | |
| expert_model_parallel_size: 1 | |
| moe_extended_tp: false | |
| perform_initialization: true | |
| use_cpu_initialization: false | |
| fp16: false | |
| bf16: false | |
| params_dtype: float32 | |
| timers: null | |
| finalize_model_grads_func: null | |
| grad_scale_func: null | |
| no_sync_func: null | |
| grad_sync_func: null | |
| param_sync_func: null | |
| deterministic_mode: false | |
| enable_autocast: false | |
| autocast_dtype: float32 | |
| num_microbatches_with_partial_activation_checkpoints: null | |
| gradient_accumulation_fusion: false | |
| async_tensor_model_parallel_allreduce: false | |
| use_te_rng_tracker: false | |
| tp_comm_overlap: false | |
| tp_comm_bulk_wgrad: true | |
| tp_comm_bulk_dgrad: true | |
| tp_comm_overlap_ag: true | |
| tp_comm_overlap_rs: true | |
| tp_comm_overlap_rs_dgrad: false | |
| tp_comm_split_ag: true | |
| tp_comm_atomic_ag: false | |
| tp_comm_split_rs: true | |
| tp_comm_atomic_rs: false | |
| pipeline_dtype: null | |
| variable_seq_lengths: false | |
| overlap_p2p_comm: false | |
| batch_p2p_comm: true | |
| batch_p2p_sync: true | |
| use_ring_exchange_p2p: false | |
| deallocate_pipeline_outputs: false | |
| defer_embedding_wgrad_compute: false | |
| pipeline_model_parallel_split_rank: null | |
| cpu_offloading: false | |
| cpu_offloading_num_layers: 0 | |
| _cpu_offloading_context: null | |
| cpu_offloading_activations: true | |
| cpu_offloading_weights: true | |
| barrier_with_L1_time: true | |
| fp16_lm_cross_entropy: false | |
| parallel_output: true | |
| share_embeddings_and_output_weights: false | |
| make_vocab_size_divisible_by: 128 | |
| position_embedding_type: learned_absolute | |
| rotary_base: 10000 | |
| rotary_percent: 1.0 | |
| seq_len_interpolation_factor: null | |
| seq_length: 2048 | |
| optim: | |
| name: fused_adam | |
| sched: null | |
| optimizer_fn: null | |
| tokenizer_filepath: null | |
| num_layers: 4 | |
| hidden_size: 256 | |
| num_attention_heads: 4 | |
| num_query_groups: 4 | |
| ffn_hidden_size: 256 | |
| kv_channels: 64 | |
| hidden_dropout: 0.1 | |
| attention_dropout: 0.1 | |
| fp32_residual_connection: false | |
| apply_residual_connection_post_layernorm: false | |
| layernorm_epsilon: 1.0e-05 | |
| layernorm_zero_centered_gamma: false | |
| add_bias_linear: true | |
| add_qkv_bias: false | |
| gated_linear_unit: false | |
| activation_func: gelu | |
| activation_func_fp8_input_store: false | |
| num_moe_experts: null | |
| rotary_interleaved: false | |
| window_size: null | |
| normalization: LayerNorm | |
| qk_layernorm: false | |
| test_mode: false | |
| calculate_per_token_loss: false | |
| init_method: init_ | |
| output_layer_init_method: init_ | |
| init_method_std: 0.02 | |
| apply_query_key_layer_scaling: false | |
| attention_softmax_in_fp32: true | |
| bias_activation_fusion: false | |
| masked_softmax_fusion: false | |
| persist_layer_norm: false | |
| memory_efficient_layer_norm: false | |
| bias_dropout_fusion: false | |
| apply_rope_fusion: false | |
| recompute_granularity: null | |
| recompute_method: null | |
| recompute_num_layers: null | |
| distribute_saved_activations: null | |
| fp8: null | |
| fp8_margin: 0 | |
| fp8_interval: 1 | |
| fp8_amax_history_len: 1 | |
| fp8_amax_compute_algo: most_recent | |
| fp8_wgrad: true | |
| fp8_dot_product_attention: false | |
| fp8_multi_head_attention: false | |
| moe_router_load_balancing_type: aux_loss | |
| moe_router_topk: 2 | |
| moe_grouped_gemm: false | |
| moe_aux_loss_coeff: 0.0 | |
| moe_z_loss_coeff: null | |
| moe_input_jitter_eps: null | |
| moe_token_dropping: false | |
| moe_token_dispatcher_type: allgather | |
| moe_per_layer_logging: false | |
| moe_expert_capacity_factor: null | |
| moe_pad_expert_input_to_capacity: false | |
| moe_token_drop_policy: probs | |
| moe_layer_recompute: false | |
| clone_scatter_output_in_embedding: true | |
| disable_parameter_transpose_cache: false | |
| enable_cuda_graph: false | |
| target: nemo.collections.llm.gpt.model.base_v2.GPTModelV2 | |
| nemo_version: 2.0.0rc1 | |