Yan Bai committed on
Commit
d2006b6
·
1 Parent(s): 0ffae8a
Dockerfile CHANGED
@@ -8,7 +8,7 @@ RUN pip install --no-cache-dir \
  termcolor \
  ipdb
 # Add Megatron-LM core_v0.12.2
- RUN git clone -b core_v0.13.0rc4 --depth 1 https://github.com/NVIDIA/Megatron-LM.git /opt/Megatron-LM
+ RUN git clone -b core_v0.12.2 --depth 1 https://github.com/NVIDIA/Megatron-LM.git /opt/Megatron-LM

 RUN git clone -b estimator_mcore013 --depth 1 https://github.com/ISEEKYAN/mbridge.git /opt/mbridge

@@ -18,10 +18,7 @@ RUN groupadd -g 1000 user && \

 # Copy the code into the working directory
 WORKDIR $HOME/app
- RUN mkdir -p $HOME/app && mv /opt/mbridge/memory_estimator/* $HOME/app && chown -R user:user $HOME/app
-
- RUN echo " " > $HOME/app/__init__.py
- RUN echo "from webui.main import app" > $HOME/app/app.py
+ COPY --chown=user . $HOME/app

 # HF Spaces injects the serving port via $PORT by default
 ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH
@@ -29,4 +26,4 @@ ENV PORT=7860
 EXPOSE 7860

 # Start the FastAPI service
- CMD ["sh", "-c", "uvicorn app:app --host 0.0.0.0 --port $PORT"]
+ CMD ["sh", "-c", "uvicorn app:app --host 0.0.0.0 --port $PORT"]
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1 @@
1
+ from webui.main import app
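Note: `app.py` exists only so the Dockerfile's `uvicorn app:app` command finds a module named `app` that exposes a FastAPI instance; the real application lives in `webui/main.py`, which is not part of this commit. A minimal sketch of what such a module could look like (the title and route below are illustrative assumptions, not the actual webui code):

```python
# Hypothetical stand-in for webui/main.py -- illustrative only.
from fastapi import FastAPI

app = FastAPI(title="MoE memory estimator")

@app.get("/health")
def health() -> dict:
    # Simple liveness endpoint; HF Spaces routes traffic to the injected $PORT.
    return {"status": "ok"}
```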
estimate_013.py ADDED
@@ -0,0 +1,505 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ """Pretrain GPT."""
3
+ import warnings
4
+
5
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
6
+ warnings.filterwarnings("ignore", category=FutureWarning)
7
+ warnings.filterwarnings("ignore")
8
+ import inspect
9
+ import os
10
+ from contextlib import nullcontext
11
+ from functools import partial
12
+ from typing import List, Optional, Tuple, Union
13
+
14
+ import torch
15
+ from megatron.core import mpu
16
+ from megatron.core.datasets.blended_megatron_dataset_builder import (
17
+ BlendedMegatronDatasetBuilder,
18
+ )
19
+ from megatron.core.datasets.gpt_dataset import (
20
+ GPTDataset,
21
+ GPTDatasetConfig,
22
+ MockGPTDataset,
23
+ )
24
+ from megatron.core.datasets.utils import get_blend_from_list
25
+ from megatron.core.enums import ModelType
26
+ from megatron.core.models.gpt.gpt_layer_specs import (
27
+ get_gpt_decoder_block_spec,
28
+ get_gpt_layer_local_spec,
29
+ get_gpt_layer_with_transformer_engine_spec,
30
+ get_gpt_mtp_block_spec,
31
+ )
32
+ from megatron.core.transformer.spec_utils import import_module
33
+ from megatron.core.utils import StragglerDetector
34
+ from megatron.training import (
35
+ get_args,
36
+ get_timers,
37
+ get_tokenizer,
38
+ pretrain,
39
+ print_rank_0,
40
+ )
41
+ from megatron.training.arguments import core_transformer_config_from_args
42
+ from megatron.training.initialize import initialize_megatron
43
+ from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank
44
+ from megatron.training.yaml_arguments import core_transformer_config_from_yaml
45
+ from moe_mem_estimator.base import (
46
+ get_pipeline_model_parallel_rank,
47
+ get_pipeline_model_parallel_world_size,
48
+ get_virtual_pipeline_model_parallel_world_size,
49
+ is_pipeline_first_stage,
50
+ is_pipeline_last_stage,
51
+ set_global_config,
52
+ set_pipeline_model_parallel_rank,
53
+ )
54
+ from moe_mem_estimator.gpt_model import GPTModel
55
+ from moe_mem_estimator.layers import MLASelfAttention, MoELayer
56
+
57
+ torch.distributed.get_rank = lambda: 0
58
+ torch.cuda.get_device_capability = lambda: [8]
59
+
60
+ def estimate_from_config(config, args):
61
+ """
62
+ Estimate memory usage from a given config and args, instead of global state.
63
+ Now supports virtual pipeline model parallelism for more accurate results.
64
+ """
65
+
66
+ args.moe_grouped_gemm = True
67
+ patch_parallel_states()
68
+ if config is None:
69
+ if args.yaml_cfg is not None:
70
+ config = core_transformer_config_from_yaml(args, "language_model")
71
+ else:
72
+ config = core_transformer_config_from_args(args)
73
+
74
+ input_shape = [args.micro_batch_size, args.seq_length]
75
+
76
+ set_global_config(config)
77
+ print(config)
78
+ # return
79
+ cli_reports = []
80
+
81
+ if config.pipeline_model_parallel_size > 1:
82
+ for pp_rank in range(config.pipeline_model_parallel_size):
83
+ set_pipeline_model_parallel_rank(pp_rank)
84
+ print(
85
+ f"\n------------------------------[Pipeline_Parallelism_Rank={pp_rank}]------------------------------"
86
+ )
87
+ input_shape, rpt = report_memory_usage_one_pp_rank(
88
+ input_shape, args, config, pp_rank, config.pipeline_model_parallel_size
89
+ )
90
+ cli_reports.append(rpt)
91
+ else:
92
+ set_pipeline_model_parallel_rank(0)
93
+ _, rpt = report_memory_usage_one_pp_rank(input_shape, args, config)
94
+ cli_reports.append(rpt)
95
+
96
+ aggregated_reports: list[dict] = cli_reports
97
+
98
+ # Return (aggregated per-PP-rank report list, full raw chunk list)
99
+ return aggregated_reports, cli_reports
100
+
101
+
102
+ def _get_transformer_layer_spec(use_te, config):
103
+ """Get transformer layer specification based on configuration.
104
+
105
+ Args:
106
+ use_te (bool): Whether to use Transformer Engine
107
+ args: Training arguments
108
+ config: Model configuration
109
+
110
+ Returns:
111
+ transformer_layer_spec: The transformer layer specification
112
+ """
113
+ if use_te:
114
+ return get_gpt_layer_with_transformer_engine_spec(
115
+ config.num_moe_experts,
116
+ config.moe_grouped_gemm,
117
+ config.qk_layernorm,
118
+ config.multi_latent_attention,
119
+ config.fp8,
120
+ )
121
+ else:
122
+ return get_gpt_layer_local_spec(
123
+ config.num_moe_experts,
124
+ config.moe_grouped_gemm,
125
+ config.qk_layernorm,
126
+ config.multi_latent_attention,
127
+ )
128
+
129
+
130
+ def model_provider(
131
+ args, config, pre_process=True, post_process=True, vp_stage: Optional[int] = None
132
+ ) -> GPTModel:
133
+ use_te = True
134
+ if args.num_experts:
135
+ # Define the decoder block spec
136
+ transformer_layer_spec = get_gpt_decoder_block_spec(
137
+ config,
138
+ use_transformer_engine=use_te,
139
+ normalization="LayerNorm",
140
+ qk_l2_norm=False,
141
+ vp_stage=vp_stage,
142
+ )
143
+ else:
144
+ # Define the decoder layer spec
145
+ transformer_layer_spec = _get_transformer_layer_spec(use_te, config)
146
+ mtp_block_spec = None
147
+ # TODO fp8
148
+ model = GPTModel(
149
+ config=config,
150
+ transformer_layer_spec=transformer_layer_spec,
151
+ vocab_size=args.padded_vocab_size,
152
+ max_sequence_length=args.max_position_embeddings,
153
+ pre_process=pre_process,
154
+ post_process=post_process,
155
+ fp16_lm_cross_entropy=getattr(config, "fp16_lm_cross_entropy", False),
156
+ parallel_output=True,
157
+ share_embeddings_and_output_weights=False,
158
+ position_embedding_type="rope",
159
+ rotary_percent=getattr(args, "rotary_percent", 1.0),
160
+ rotary_base=getattr(args, "rotary_base", 10000),
161
+ rope_scaling=getattr(config, "use_rope_scaling", False),
162
+ mtp_block_spec=mtp_block_spec,
163
+ vp_stage=vp_stage,
164
+ )
165
+
166
+ return model
167
+
168
+
169
+ def get_model(
170
+ model_provider_func, args, config, model_type=ModelType.encoder_or_decoder
171
+ ):
172
+ """Build the model."""
173
+ # args = get_args()
174
+ # args.model_type = model_type
175
+
176
+ # Build model.
177
+ if not getattr(args, "virtual_pipeline_model_parallel_size", None):
178
+ args.virtual_pipeline_model_parallel_size = None
179
+ if config.pipeline_model_parallel_layout:
180
+ args.virtual_pipeline_model_parallel_size = (
181
+ config.pipeline_model_parallel_layout.virtual_pipeline_model_parallel_size
182
+ )
183
+ config.virtual_pipeline_model_parallel_size = (
184
+ config.pipeline_model_parallel_layout.virtual_pipeline_model_parallel_size
185
+ )
186
+
187
+ def build_model():
188
+ if (
189
+ get_pipeline_model_parallel_world_size() > 1
190
+ and args.virtual_pipeline_model_parallel_size is not None
191
+ ):
192
+ if model_type == ModelType.encoder_and_decoder:
193
+ assert (
194
+ config.encoder_pipeline_model_parallel_size == 0
195
+ ), "Interleaved schedule not supported for model with encoder on separate PP rank"
196
+ model = []
197
+ for i in range(args.virtual_pipeline_model_parallel_size):
198
+ # Set pre_process and post_process only after virtual rank is set.
199
+ pre_process = is_pipeline_first_stage(ignore_virtual=False, vp_stage=i)
200
+ post_process = is_pipeline_last_stage(ignore_virtual=False, vp_stage=i)
201
+
202
+ this_model = model_provider_func(
203
+ args,
204
+ config,
205
+ pre_process=pre_process,
206
+ post_process=post_process,
207
+ vp_stage=i,
208
+ )
209
+ this_model.model_type = model_type
210
+ this_model.vp_stage = i
211
+ model.append(this_model)
212
+ else:
213
+ pre_process = is_pipeline_first_stage()
214
+ post_process = is_pipeline_last_stage()
215
+ if model_type == ModelType.encoder_and_decoder:
216
+ if get_pipeline_model_parallel_world_size() > 1:
217
+ rank = get_pipeline_model_parallel_rank()
218
+ first_decoder_rank = config.encoder_pipeline_model_parallel_size
219
+ world_size = get_pipeline_model_parallel_world_size()
220
+ pre_process = rank == 0 or rank == first_decoder_rank
221
+ post_process = (rank == (first_decoder_rank - 1)) or (
222
+ rank == (world_size - 1)
223
+ )
224
+ model = model_provider_func(
225
+ args,
226
+ config,
227
+ pre_process=pre_process,
228
+ post_process=post_process,
229
+ )
230
+ else:
231
+ model = model_provider_func(
232
+ args, config, pre_process=pre_process, post_process=post_process
233
+ )
234
+ model.model_type = model_type
235
+ return model
236
+
237
+ model = build_model()
238
+
239
+ if not isinstance(model, list):
240
+ model = [model]
241
+ return model
242
+
243
+
244
+ NUM_BYTES_IN_MEGABYTE = 1024 * 1024
245
+ NUM_BYTES_IN_GIGABYTE = 1024 * 1024 * 1024
246
+
247
+
248
+ def patch_parallel_states():
249
+ from megatron.core import parallel_state
250
+
251
+ parallel_state.is_pipeline_first_stage = is_pipeline_first_stage
252
+ parallel_state.is_pipeline_last_stage = is_pipeline_last_stage
253
+ parallel_state.get_pipeline_model_parallel_rank = get_pipeline_model_parallel_rank
254
+ parallel_state.get_pipeline_model_parallel_world_size = (
255
+ get_pipeline_model_parallel_world_size
256
+ )
257
+ parallel_state.get_virtual_pipeline_model_parallel_world_size = (
258
+ get_virtual_pipeline_model_parallel_world_size
259
+ )
260
+ parallel_state.is_inside_encoder = lambda: False
261
+ parallel_state.get_pipeline_model_parallel_decoder_start = lambda: 0
262
+
263
+
264
+ def report_memory_usage(args, config=None):
265
+ args.moe_grouped_gemm = True
266
+ patch_parallel_states()
267
+ if config is None:
268
+ if args.yaml_cfg is not None:
269
+ config = core_transformer_config_from_yaml(args, "language_model")
270
+ else:
271
+ config = core_transformer_config_from_args(args)
272
+
273
+ input_shape = [args.micro_batch_size, args.seq_length]
274
+
275
+ set_global_config(config)
276
+
277
+ cli_reports = []
278
+
279
+ if config.pipeline_model_parallel_size > 1:
280
+ for pp_rank in range(config.pipeline_model_parallel_size):
281
+ set_pipeline_model_parallel_rank(pp_rank)
282
+ print(
283
+ f"\n------------------------------[Pipeline_Parallelism_Rank={pp_rank}]------------------------------"
284
+ )
285
+ input_shape, rpt = report_memory_usage_one_pp_rank(
286
+ input_shape, args, config, pp_rank, config.pipeline_model_parallel_size
287
+ )
288
+ cli_reports.append(rpt)
289
+ else:
290
+ set_pipeline_model_parallel_rank(0)
291
+ _, rpt = report_memory_usage_one_pp_rank(input_shape, args, config)
292
+ cli_reports.append(rpt)
293
+
294
+ # Optionally pretty print summary
295
+ print("\n===== Summary (per PP rank) =====")
296
+ for r in cli_reports:
297
+ print(
298
+ f"PP{r['pp_rank']} total {r['total_gb']} GB (weight_grad {r['weight_grad_gb']} GB weight_grad_optim {r['weight_grad_optim_gb']} GB act {r['activation_gb']} GB)"
299
+ )
300
+
301
+
302
+ def report_memory_usage_one_pp_rank(
303
+ input_shape: list[int], args, config, pp_rank=0, pp_size=1
304
+ ) -> tuple[list[int], dict]:
305
+ print(f"{input_shape=}")
306
+ model: list[GPTModel] = get_model(model_provider, args, config)
307
+ num_parameter_this_shard_all = 0
308
+ num_parameter_this_shard_sparse_all = 0
309
+ num_activation_all = 0
310
+ output_shape = input_shape
311
+ for vpp_rank, one_chunk in enumerate(model):
312
+ num_parameter_this_shard = one_chunk.num_parameter()
313
+ num_activation = one_chunk.num_activation(output_shape)
314
+ output_shape = one_chunk.mock_forward(output_shape)
315
+ print(f"{output_shape=}")
316
+ num_parameter_this_shard_sparse = 0
317
+ for layer in one_chunk.decoder.layers.modules:
318
+ if isinstance(layer.mlp, MoELayer):
319
+ num_parameter_this_shard_sparse += layer.mlp.num_parameter()
320
+ if (
321
+ "shared_experts" in layer.mlp.__dir__()
322
+ and layer.mlp.shared_experts is not None
323
+ ):
324
+ num_parameter_this_shard_sparse -= (
325
+ layer.mlp.shared_experts.num_parameter()
326
+ )
327
+ num_activation_this_shard_mlp = sum(
328
+ [m.mlp.num_activation() for m in one_chunk.decoder.layers.modules]
329
+ )
330
+ if len(model) > 1:
331
+ if vpp_rank >= 1 and vpp_rank < len(model) - 1:
332
+ num_microbatch_this_pp_rank = pp_size
333
+ elif vpp_rank == 0:
334
+ num_microbatch_this_pp_rank = pp_size + max(
335
+ (pp_size - pp_rank) * 2 - 1 - pp_size, 0
336
+ )
337
+ elif vpp_rank == len(model) - 1:
338
+ num_microbatch_this_pp_rank = min((pp_size - pp_rank) * 2 + 1, pp_size)
339
+ else:
340
+ num_microbatch_this_pp_rank = pp_size - pp_rank
341
+
342
+ num_parameter_this_shard_sparse = 0
343
+ for layer in one_chunk.decoder.layers.modules:
344
+ if isinstance(layer.mlp, MoELayer):
345
+ num_parameter_this_shard_sparse += layer.mlp.num_parameter()
346
+ if (
347
+ "shared_experts" in layer.mlp.__dir__()
348
+ and layer.mlp.shared_experts is not None
349
+ ):
350
+ num_parameter_this_shard_sparse -= (
351
+ layer.mlp.shared_experts.num_parameter()
352
+ )
353
+
354
+ one_chunk.__repr__()
355
+ print(one_chunk)
356
+ print(
357
+ f"Number of parameters in every GPU in billions: "
358
+ f"{num_parameter_this_shard / 10**9: .2f} where mlp part is {num_parameter_this_shard_sparse / 10**9: .2f}"
359
+ )
360
+ num_parameter_this_shard_all += num_parameter_this_shard
361
+ num_parameter_this_shard_sparse_all += num_parameter_this_shard_sparse
362
+ # recompute
363
+ if config.recompute_granularity == "full":
364
+ recompute_num_layers = config.recompute_num_layers
365
+ num_layers = one_chunk.num_layers
366
+ common_act = (
367
+ one_chunk.num_act_pre
368
+ + one_chunk.num_act_between_layers
369
+ * num_layers
370
+ * num_microbatch_this_pp_rank
371
+ ) # recompute with pipeline parallel
372
+ info = "With this recomputing setting, the number of activations reaches its peak when "
373
+ if config.recompute_method == "block":
374
+ num_layers_with_loss = num_layers - recompute_num_layers
375
+ if num_layers_with_loss == 0:
376
+ peak1 = common_act + one_chunk.num_act_post
377
+ peak2 = common_act + one_chunk.num_act_per_layer
378
+ if peak1 > peak2:
379
+ info += "calculating loss"
380
+ else:
381
+ info += "back-propagating loss"
382
+ num_activation = max(peak1, peak2)
383
+ else:
384
+ info += f"calculating loss with {num_layers_with_loss} non-recompute layers"
385
+ num_activation = (
386
+ common_act
387
+ + one_chunk.num_act_post
388
+ + one_chunk.num_act_per_layer
389
+ * num_layers_with_loss
390
+ * num_microbatch_this_pp_rank
391
+ )
392
+ elif config.recompute_method == "uniform":
393
+ peak1 = common_act + one_chunk.num_act_post
394
+ peak2 = (
395
+ (common_act + one_chunk.num_act_per_layer)
396
+ if vpp_rank == 0
397
+ else (common_act)
398
+ )
399
+ if peak1 > peak2:
400
+ info += "calculating loss"
401
+ else:
402
+ info += f"back-propagating loss, recomputing every {recompute_num_layers} layers"
403
+ num_activation = max(peak1, peak2)
404
+ if len(one_chunk.decoder.layers.modules) > 0 and isinstance(
405
+ one_chunk.decoder.layers.modules[0].self_attention, MLASelfAttention
406
+ ): # MLA recompute reaches its peak during the backward pass
407
+ num_activation += one_chunk.decoder.layers.modules[
408
+ 0
409
+ ].self_attention.core_attention.num_activation()
410
+ print(info)
411
+
412
+ else:
413
+ num_activation = (
414
+ num_activation - one_chunk.num_act_post
415
+ ) * num_microbatch_this_pp_rank + one_chunk.num_act_post
416
+
417
+ # CP
418
+ num_activation = num_activation / config.context_parallel_size
419
+ if pp_size == 1:
420
+ print(
421
+ f"Number of activation in every GPU in billions: "
422
+ f"{num_activation / 10**9: .2f} where mlp part is {num_activation_this_shard_mlp / 10**9: .2f}"
423
+ )
424
+ else:
425
+ print(
426
+ f"Number of activation per microbatch in every GPU in billions: "
427
+ f"{num_activation / 10**9: .2f} where mlp part is {num_activation_this_shard_mlp / 10**9: .2f}"
428
+ f", {num_microbatch_this_pp_rank=} {vpp_rank=}"
429
+ )
430
+ num_activation_all += num_activation
431
+ num_bytes_per_parameter = (
432
+ 18
433
+ if not args.use_distributed_optimizer
434
+ else 6 + (12 / args.data_parallel_size / config.context_parallel_size)
435
+ )
436
+ if config.expert_model_parallel_size * config.expert_tensor_parallel_size > 1:
437
+ num_bytes_per_parameter_dense = num_bytes_per_parameter
438
+ num_bytes_per_parameter_moe = (
439
+ 18
440
+ if not args.use_distributed_optimizer
441
+ else 6
442
+ + (
443
+ 12
444
+ / (
445
+ args.world_size
446
+ / config.pipeline_model_parallel_size
447
+ / config.expert_model_parallel_size
448
+ / config.expert_tensor_parallel_size
449
+ )
450
+ )
451
+ )
452
+ print(f"{num_bytes_per_parameter_dense=} {num_bytes_per_parameter_moe=}")
453
+ weight_grad_memory = num_parameter_this_shard_all * 6 / NUM_BYTES_IN_GIGABYTE
454
+ weight_grad_optim_memory = (
455
+ (num_parameter_this_shard_all - num_parameter_this_shard_sparse_all)
456
+ * num_bytes_per_parameter_dense
457
+ + num_parameter_this_shard_sparse_all * num_bytes_per_parameter_moe
458
+ ) / NUM_BYTES_IN_GIGABYTE
459
+ else:
460
+ print(f"{num_bytes_per_parameter=}")
461
+ weight_grad_memory = num_parameter_this_shard_all * 6 / NUM_BYTES_IN_GIGABYTE
462
+ weight_grad_optim_memory = (
463
+ num_parameter_this_shard_all
464
+ * num_bytes_per_parameter
465
+ / NUM_BYTES_IN_GIGABYTE
466
+ )
467
+
468
+ activation_memory = (
469
+ num_activation_all * 2 / NUM_BYTES_IN_GIGABYTE
470
+ ) # only support fp16
471
+ total_memory = weight_grad_optim_memory + activation_memory
472
+
473
+ print(
474
+ f"Theoretical memory footprints: weight and optimizer={weight_grad_optim_memory:.2f} GB, "
475
+ f"activation={activation_memory:.2f} GB, total={total_memory:.2f} GB\n"
476
+ )
477
+
478
+ # Build an aggregated report in the same format as estimate_from_config
479
+ model_breakdown_concat = "\n\n".join(
480
+ [f"--- vpp_chunk {i} ---\n{str(m)}" for i, m in enumerate(model)]
481
+ )
482
+
483
+ report = {
484
+ "pp_rank": pp_rank,
485
+ "parameters_b": num_parameter_this_shard_all / 1e9,
486
+ "activation_b": num_activation_all / 1e9,
487
+ "weight_grad_gb": round(weight_grad_memory, 2),
488
+ "weight_grad_optim_gb": round(weight_grad_optim_memory, 2),
489
+ "activation_gb": round(activation_memory, 2),
490
+ "total_gb": round(total_memory, 2),
491
+ "model_breakdown": model_breakdown_concat,
492
+ "details": None,
493
+ }
494
+
495
+ return output_shape, report
496
+
497
+
498
+ if __name__ == "__main__":
499
+ initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
500
+
501
+ import ipdb
502
+
503
+ with ipdb.launch_ipdb_on_exception():
504
+ args = get_args()
505
+ report_memory_usage(args)
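For orientation, the byte accounting in `report_memory_usage_one_pp_rank` uses 6 bytes per parameter for the bf16 weight plus its gradient and another 12 bytes of fp32 master weight and Adam moments, which the distributed optimizer shards over the data-parallel (and context-parallel) ranks; activations are tallied at 2 bytes per element. A self-contained sketch of that formula, with made-up sizes, to make the arithmetic concrete:

```python
# Standalone sketch of the bytes-per-parameter model used above.
# Assumptions: bf16 weight+grad = 6 bytes; fp32 master weight + Adam m/v = 12 bytes,
# sharded across data-parallel * context-parallel ranks with the distributed optimizer.
GIB = 1024 ** 3

def bytes_per_parameter(use_distributed_optimizer: bool, dp: int, cp: int = 1) -> float:
    if not use_distributed_optimizer:
        return 18.0
    return 6.0 + 12.0 / (dp * cp)

num_params = 15e9  # hypothetical per-GPU parameter shard
for dist_opt in (False, True):
    gib = num_params * bytes_per_parameter(dist_opt, dp=8) / GIB
    print(f"distributed_optimizer={dist_opt}: ~{gib:.1f} GiB for weights/grads/optimizer")
```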
moe_mem_estimator/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
moe_mem_estimator/base.py ADDED
@@ -0,0 +1,217 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ from abc import ABC
3
+
4
+ from megatron.core.transformer.transformer_config import TransformerConfig
5
+ from termcolor import colored
6
+ from torch.nn.modules.module import _addindent
7
+
8
+
9
+ def prehook_save_input_shape(func):
10
+ def wrapper(self, *input_shapes, **kw_input_shapes):
11
+ if len(input_shapes) + len(kw_input_shapes) == 0:
12
+ if "_input_shape" in self.__dict__:
13
+ return func(self, *self._input_shape, **self._kw_input_shapes)
14
+ else:
15
+ return 0
16
+ self._input_shape = input_shapes
17
+ self._kw_input_shapes = kw_input_shapes
18
+ return func(self, *self._input_shape, **self._kw_input_shapes)
19
+
20
+ return wrapper
21
+
22
+
23
+ class MetaBase(type):
24
+ def __new__(cls, name, bases, attrs):
25
+ if "num_activation" in attrs:
26
+ attrs["num_activation"] = prehook_save_input_shape(attrs["num_activation"])
27
+
28
+ return super().__new__(cls, name, bases, attrs)
29
+
30
+
31
+ class MemEstimator(metaclass=MetaBase):
32
+ def __init__(self, *args, **kwargs):
33
+ self._modules = {}
34
+ pass
35
+
36
+ def __repr__(self):
37
+ # We treat the extra repr like the sub-module, one item per line
38
+ extra_lines = []
39
+ # extra_repr = self.extra_repr()
40
+ # # empty string will be split into list ['']
41
+ # if extra_repr:
42
+ # extra_lines = extra_repr.split("\n")
43
+ child_lines = []
44
+ for key, module in self._modules.items():
45
+ mod_str = repr(module)
46
+ mod_str = _addindent(mod_str, 2)
47
+ child_lines.append("(" + key + "): " + mod_str)
48
+ lines = extra_lines + child_lines
49
+
50
+ stat = (
51
+ "\t/* n_params="
52
+ + colored(f"{self.num_parameter()/1024/1024:.2f}M", "red")
53
+ + "\tn_act="
54
+ + colored(f"{self.num_activation()/1024/1024:.2f}M", "green")
55
+ + " */"
56
+ )
57
+ main_str = self._get_name() + stat + " ("
58
+ if lines:
59
+ # simple one-liner info, which most builtin Modules will use
60
+ if len(extra_lines) == 1 and not child_lines:
61
+ main_str += extra_lines[0]
62
+ else:
63
+ main_str += "\n " + "\n ".join(lines) + "\n"
64
+
65
+ main_str += ")"
66
+ return main_str
67
+ return f"{self.__class__.__name__} n_param={self.num_parameter()}"
68
+
69
+ def dump(self):
70
+ ret = {}
71
+ ret["name"] = self._get_name()
72
+ ret["n_params"] = self.num_parameter()
73
+ ret["n_act"] = self.num_activation()
74
+ modules = {}
75
+ for key, module in self._modules.items():
76
+ modules[key] = module.dump()
77
+ if len(modules) > 0:
78
+ ret["modules"] = modules
79
+ return ret
80
+
81
+ def _get_name(self):
82
+ return self.__class__.__name__
83
+
84
+ def num_parameter(self):
85
+ """
86
+ Calculate the number of model parameters.
87
+ """
88
+ raise NotImplementedError
89
+
90
+ def num_activation(self, input_shape: list[int]):
91
+ """
92
+ Calculate the number of activations for the given input_shape.
93
+ Args:
94
+ input shape
95
+ """
96
+ raise NotImplementedError
97
+
98
+ def mock_forward(self, input_shape: list[int]):
99
+ """
100
+ Mock the forward.
101
+ Args:
102
+ input shape
103
+ return:
104
+ output shape
105
+ """
106
+ raise NotImplementedError
107
+
108
+ def __setattr__(self, name: str, value) -> None:
109
+ if isinstance(value, MemEstimator):
110
+ modules = self.__dict__.get("_modules")
111
+ modules[name] = value
112
+ else:
113
+ pass
114
+ return super().__setattr__(name, value)
115
+
116
+ def __delattr__(self, name):
117
+ modules = self.__dict__.get("_modules")
118
+ if name in modules:
119
+ del modules[name]
120
+ return super().__delattr__(name)
121
+
122
+
123
+ _global_config: TransformerConfig = None
124
+
125
+
126
+ def set_global_config(cfg):
127
+ global _global_config
128
+ _global_config = cfg
129
+
130
+
131
+ def get_tensor_model_parallel_world_size():
132
+ global _global_config
133
+ return _global_config.tensor_model_parallel_size
134
+
135
+
136
+ def get_tensor_model_parallel_rank():
137
+ return 0
138
+
139
+
140
+ def get_expert_tensor_parallel_world_size():
141
+ global _global_config
142
+ return _global_config.expert_tensor_parallel_size
143
+
144
+
145
+ def get_expert_tensor_parallel_rank():
146
+ return 0
147
+
148
+
149
+ _pp_rank = 0
150
+
151
+
152
+ def set_pipeline_model_parallel_rank(rank):
153
+ global _pp_rank
154
+ _pp_rank = rank
155
+
156
+
157
+ def get_pipeline_model_parallel_rank():
158
+ global _pp_rank
159
+ return _pp_rank
160
+
161
+
162
+ def get_virtual_pipeline_model_parallel_rank():
163
+ return 0
164
+
165
+
166
+ def get_pipeline_model_parallel_world_size():
167
+ global _global_config
168
+ return _global_config.pipeline_model_parallel_size
169
+
170
+
171
+ def get_expert_model_parallel_rank():
172
+ return 0
173
+
174
+
175
+ def get_expert_model_parallel_world_size():
176
+ global _global_config
177
+ return _global_config.expert_model_parallel_size
178
+
179
+
180
+ def get_virtual_pipeline_model_parallel_world_size():
181
+ global _global_config
182
+ return _global_config.virtual_pipeline_model_parallel_size
183
+
184
+
185
+ def is_pipeline_first_stage(ignore_virtual=False, vp_stage=None):
186
+ """Return True if in the first pipeline model-parallel stage, False otherwise."""
187
+ if (
188
+ not ignore_virtual
189
+ and get_virtual_pipeline_model_parallel_world_size() is not None
190
+ ):
191
+ if vp_stage != 0:
192
+ return False
193
+ return get_pipeline_model_parallel_rank() == 0
194
+
195
+
196
+ def is_pipeline_last_stage(ignore_virtual=False, vp_stage=None):
197
+ """Return True if in the last pipeline-model-parallel stage, False otherwise."""
198
+ if (
199
+ not ignore_virtual
200
+ and get_virtual_pipeline_model_parallel_world_size() is not None
201
+ ):
202
+ if vp_stage != (get_virtual_pipeline_model_parallel_world_size() - 1):
203
+ return False
204
+ return get_pipeline_model_parallel_rank() == (
205
+ get_pipeline_model_parallel_world_size() - 1
206
+ )
207
+
208
+
209
+ def cum_mul(l: list):
210
+ try:
211
+ ret = 1
212
+ for one in l:
213
+ ret *= one
214
+ return ret
215
+ except Exception:
216
+ return 0
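`MetaBase` wraps every `num_activation` defined on a subclass with `prehook_save_input_shape`, so the first call records its input shape and later argument-free calls (as issued from `__repr__` and `dump`) replay it. A minimal hypothetical subclass demonstrating the pattern; the layer and its shapes are invented for illustration, and this assumes `moe_mem_estimator.base` is importable:

```python
# Toy estimator built on MemEstimator to show the cached-input-shape behaviour.
from moe_mem_estimator.base import MemEstimator, cum_mul

class ToyLinear(MemEstimator):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = (out_features, in_features)  # shape only, no real tensor

    def num_parameter(self):
        return cum_mul(self.weight)

    def num_activation(self, input_shape: list[int]):
        # Output activations: all leading dims times out_features.
        return cum_mul(input_shape[:-1]) * self.weight[0]

    def mock_forward(self, input_shape: list[int]):
        return input_shape[:-1] + [self.weight[0]]

layer = ToyLinear(4096, 11008)
print(layer.num_activation([1, 4096, 4096]))  # first call caches the input shape
print(layer.num_activation())                 # argument-free call reuses the cache
```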
moe_mem_estimator/gpt_model.py ADDED
@@ -0,0 +1,157 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ from typing import Dict, Literal, Optional, Union
3
+
4
+ from megatron.core.model_parallel_config import ModelParallelConfig
5
+ from megatron.core.tensor_parallel.utils import VocabUtility
6
+ from megatron.core.transformer.enums import ModelType
7
+ from megatron.core.transformer.spec_utils import ModuleSpec
8
+ from megatron.core.transformer.transformer_block import (
9
+ TransformerBlockSubmodules,
10
+ _get_block_submodules,
11
+ )
12
+ from megatron.core.transformer.transformer_config import TransformerConfig
13
+
14
+ from .base import (
15
+ MemEstimator,
16
+ cum_mul,
17
+ get_tensor_model_parallel_rank,
18
+ get_tensor_model_parallel_world_size,
19
+ set_global_config,
20
+ )
21
+ from .layers import ColumnParallelLinear, LanguageModelEmbedding, TransformerBlock
22
+
23
+
24
+ class GPTModel(MemEstimator):
25
+ def __init__(
26
+ self,
27
+ config: TransformerConfig,
28
+ transformer_layer_spec: ModuleSpec,
29
+ vocab_size: int,
30
+ max_sequence_length: int,
31
+ pre_process: bool = True,
32
+ post_process: bool = True,
33
+ fp16_lm_cross_entropy: bool = False,
34
+ parallel_output: bool = True,
35
+ share_embeddings_and_output_weights: bool = False,
36
+ position_embedding_type: Literal[
37
+ "learned_absolute", "rope", "none"
38
+ ] = "learned_absolute",
39
+ rotary_percent: float = 1.0,
40
+ rotary_base: int = 10000,
41
+ rope_scaling: bool = False,
42
+ seq_len_interpolation_factor: Optional[float] = None,
43
+ mtp_block_spec: Optional[ModuleSpec] = None,
44
+ vp_stage: Optional[int] = None,
45
+ ):
46
+ super().__init__()
47
+
48
+ self.config = config
49
+ config.use_cpu_initialization = True
50
+
51
+ self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
52
+ self.vocab_size = vocab_size
53
+ self.max_sequence_length = max_sequence_length
54
+ self.pre_process = pre_process
55
+ self.post_process = post_process
56
+ self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
57
+ self.parallel_output = parallel_output
58
+ self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
59
+ self.position_embedding_type = position_embedding_type
60
+
61
+ # megatron core pipelining currently depends on model type
62
+ # TODO: remove this dependency ?
63
+ self.model_type = ModelType.encoder_or_decoder
64
+
65
+ # These 4 attributes are needed for TensorRT-LLM export.
66
+ self.max_position_embeddings = max_sequence_length
67
+ self.rotary_percent = rotary_percent
68
+ self.rotary_base = rotary_base
69
+ self.rotary_scaling = rope_scaling
70
+
71
+ if self.pre_process:
72
+ self.embedding = LanguageModelEmbedding(
73
+ config=self.config,
74
+ vocab_size=self.vocab_size,
75
+ max_sequence_length=self.max_sequence_length,
76
+ position_embedding_type=position_embedding_type,
77
+ )
78
+
79
+ # remove RotaryEmbedding
80
+
81
+ # Transformer.
82
+ self.decoder = TransformerBlock(
83
+ config=self.config,
84
+ spec=transformer_layer_spec,
85
+ pre_process=self.pre_process,
86
+ post_process=self.post_process,
87
+ vp_stage=vp_stage,
88
+ )
89
+
90
+ # Output
91
+ if post_process:
92
+ if self.config.defer_embedding_wgrad_compute:
93
+ self.embedding_activation_buffer = []
94
+ self.grad_output_buffer = []
95
+ else:
96
+ self.embedding_activation_buffer = None
97
+ self.grad_output_buffer = None
98
+
99
+ self.output_layer = ColumnParallelLinear(
100
+ config.hidden_size,
101
+ self.vocab_size,
102
+ config=config,
103
+ init_method=config.init_method,
104
+ bias=False,
105
+ skip_bias_add=False,
106
+ gather_output=not self.parallel_output,
107
+ skip_weight_param_allocation=self.pre_process
108
+ and self.share_embeddings_and_output_weights,
109
+ embedding_activation_buffer=self.embedding_activation_buffer,
110
+ grad_output_buffer=self.grad_output_buffer,
111
+ )
112
+
113
+ def num_parameter(self):
114
+ ret = 0
115
+ if self.pre_process:
116
+ ret += self.embedding.num_parameter()
117
+ ret += self.decoder.num_parameter()
118
+ if self.post_process:
119
+ ret += self.output_layer.num_parameter()
120
+ return ret
121
+
122
+ def num_activation(self, input_shape: list[int]):
123
+ self._inited = True
124
+ ret = 0
125
+
126
+ self.num_act_pre = 0
127
+ self.num_act_post = 0
128
+ self.num_act_per_layer = 0
129
+ self.num_act_between_layers = 0
130
+ self.num_layers = self.decoder.layers.modules.__len__()
131
+
132
+ if self.pre_process:
133
+ self.num_act_pre = self.embedding.num_activation(input_shape)
134
+ ret += self.num_act_pre
135
+ input_shape = self.embedding.mock_forward(input_shape)
136
+ ret += self.decoder.num_activation(input_shape)
137
+ if self.decoder.layers.modules.__len__() > 0:
138
+ self.num_act_per_layer = self.decoder.layers.modules[0].num_activation()
139
+ input_shape = self.decoder.mock_forward(input_shape)
140
+ self.num_act_between_layers = cum_mul(input_shape)
141
+
142
+ if self.post_process:
143
+ self.num_act_post = self.output_layer.num_activation(input_shape)
144
+ softmax_activation = (
145
+ self.output_layer.num_activation(input_shape) * 2
146
+ ) # because the softmax is computed in fp32
147
+ self.num_act_post += softmax_activation
148
+ ret += self.num_act_post
149
+ return ret
150
+
151
+ def mock_forward(self, input_shape: list[int]):
152
+ if self.pre_process:
153
+ input_shape = self.embedding.mock_forward(input_shape)
154
+ input_shape = self.decoder.mock_forward(input_shape)
155
+ if self.post_process:
156
+ input_shape = self.output_layer.mock_forward(input_shape)
157
+ return input_shape
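In `GPTModel.num_activation`, the post-processing term counts the parallel logits once for the output projection and twice more for the softmax, since the cross-entropy softmax runs in fp32 while everything else is tallied as 2-byte elements. A rough, self-contained check of that term with hypothetical sizes (the vocab, sequence length, and TP degree below are assumptions, not values from this commit):

```python
# Back-of-the-envelope check of the post-process activation term.
micro_batch, seq_len = 1, 4096
padded_vocab, tp_size = 129280, 8  # hypothetical

logits = micro_batch * seq_len * (padded_vocab // tp_size)  # output_layer activations
softmax_fp32 = 2 * logits                                   # fp32 softmax counted twice
num_act_post = logits + softmax_fp32

print(f"post-process activations ~= {num_act_post / 1e9:.2f}B elements "
      f"(~{num_act_post * 2 / 1024**3:.2f} GiB at 2 bytes/element)")
```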
moe_mem_estimator/layers.py ADDED
@@ -0,0 +1,1940 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ import math
3
+ import types
4
+ import warnings
5
+ from copy import deepcopy
6
+ from typing import Dict, Literal, Optional, Union
7
+
8
+ from megatron.core.extensions.transformer_engine import (
9
+ _get_extra_te_kwargs,
10
+ condition_init_method,
11
+ get_expert_parallel_rng_tracker_name,
12
+ )
13
+ from megatron.core.model_parallel_config import ModelParallelConfig
14
+ from megatron.core.models.common.embeddings import (
15
+ _yarn_get_mscale,
16
+ apply_rotary_pos_emb,
17
+ )
18
+ from megatron.core.tensor_parallel.utils import VocabUtility
19
+ from megatron.core.transformer import transformer_layer
20
+ from megatron.core.transformer.enums import AttnMaskType
21
+ from megatron.core.transformer.mlp import MLPSubmodules
22
+ from megatron.core.transformer.spec_utils import ModuleSpec, import_module
23
+ from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
24
+ from megatron.core.transformer.transformer_config import (
25
+ MLATransformerConfig,
26
+ TransformerConfig,
27
+ )
28
+ from megatron.core.utils import divide
29
+
30
+ from .base import (
31
+ MemEstimator,
32
+ _addindent,
33
+ colored,
34
+ cum_mul,
35
+ get_expert_model_parallel_rank,
36
+ get_expert_model_parallel_world_size,
37
+ get_expert_tensor_parallel_rank,
38
+ get_expert_tensor_parallel_world_size,
39
+ get_pipeline_model_parallel_rank,
40
+ get_pipeline_model_parallel_world_size,
41
+ get_tensor_model_parallel_rank,
42
+ get_tensor_model_parallel_world_size,
43
+ is_pipeline_first_stage,
44
+ is_pipeline_last_stage,
45
+ set_global_config,
46
+ )
47
+
48
+
49
+ class LanguageModelEmbedding(MemEstimator):
50
+ def __init__(
51
+ self,
52
+ config: TransformerConfig,
53
+ vocab_size: int,
54
+ max_sequence_length: int,
55
+ position_embedding_type: Literal[
56
+ "learned_absolute", "rope", "none"
57
+ ] = "learned_absolute",
58
+ num_tokentypes: int = 0,
59
+ ):
60
+ super().__init__()
61
+
62
+ self.config: TransformerConfig = config
63
+ self.vocab_size: int = vocab_size
64
+ self.max_sequence_length: int = max_sequence_length
65
+ self.add_position_embedding: bool = (
66
+ position_embedding_type == "learned_absolute"
67
+ )
68
+ self.num_tokentypes = num_tokentypes
69
+ self.reduce_scatter_embeddings = (
70
+ (not self.add_position_embedding)
71
+ and self.num_tokentypes <= 0
72
+ and self.config.sequence_parallel
73
+ )
74
+ # Word embeddings (parallel).
75
+ self.word_embeddings = VocabParallelEmbedding(
76
+ num_embeddings=self.vocab_size,
77
+ embedding_dim=self.config.hidden_size,
78
+ init_method=self.config.init_method,
79
+ reduce_scatter_embeddings=self.reduce_scatter_embeddings,
80
+ config=self.config,
81
+ )
82
+
83
+ # TODO if self.add_position_embedding:
84
+
85
+ # TODO if self.num_tokentypes > 0:
86
+
87
+ self.embedding_dropout = Dropout(self.config.hidden_dropout)
88
+
89
+ def num_parameter(self):
90
+ ret = self.word_embeddings.num_parameter()
91
+ ret += self.embedding_dropout.num_parameter()
92
+ return ret
93
+
94
+ def num_activation(self, input_shape: list[int]):
95
+ ret = self.word_embeddings.num_activation(input_shape)
96
+ input_shape = self.word_embeddings.mock_forward(input_shape)
97
+ ret += self.embedding_dropout.num_activation(input_shape)
98
+ return ret
99
+
100
+ def mock_forward(self, input_shape: list[int]):
101
+ input_shape = self.word_embeddings.mock_forward(input_shape)
102
+ return input_shape
103
+
104
+
105
+ class VocabParallelEmbedding(MemEstimator):
106
+ def __init__(
107
+ self,
108
+ num_embeddings: int,
109
+ embedding_dim: int,
110
+ *,
111
+ init_method,
112
+ reduce_scatter_embeddings: bool = False,
113
+ config: ModelParallelConfig,
114
+ ):
115
+ super().__init__()
116
+ # Keep the input dimensions.
117
+ self.num_embeddings = num_embeddings
118
+ self.embedding_dim = embedding_dim
119
+ self.reduce_scatter_embeddings = reduce_scatter_embeddings
120
+ self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
121
+ # Divide the weight matrix along the vocabulary dimension.
122
+ (self.vocab_start_index, self.vocab_end_index) = (
123
+ VocabUtility.vocab_range_from_global_vocab_size(
124
+ self.num_embeddings,
125
+ get_tensor_model_parallel_rank(),
126
+ self.tensor_model_parallel_size,
127
+ )
128
+ )
129
+ self.num_embeddings_per_partition = (
130
+ self.vocab_end_index - self.vocab_start_index
131
+ )
132
+ self.deterministic_mode = config.deterministic_mode
133
+ self.weight = (self.num_embeddings_per_partition, self.embedding_dim)
134
+
135
+ def num_parameter(self):
136
+ return self.weight[0] * self.weight[1]
137
+
138
+ def num_activation(self, input_shape: list[int]):
139
+ return cum_mul(input_shape) * self.weight[1]
140
+
141
+ def mock_forward(self, input_shape: list[int]):
142
+ return input_shape + [self.weight[1]]
143
+
144
+
145
+ class Dropout(MemEstimator):
146
+ def __init__(self, p=0, *args, **kwargs):
147
+ super().__init__()
148
+ self.p = p
149
+
150
+ def num_parameter(self):
151
+ return 0
152
+
153
+ def num_activation(self, input_shape: list[int]):
154
+ if self.p == 0:
155
+ return 0
156
+ return cum_mul(input_shape[:])
157
+
158
+ def mock_forward(self, input_shape: list[int]):
159
+ return input_shape
160
+
161
+
162
+ class ColumnParallelLinear(MemEstimator):
163
+ def __init__(
164
+ self,
165
+ input_size,
166
+ output_size,
167
+ *,
168
+ config: ModelParallelConfig,
169
+ init_method,
170
+ bias=True,
171
+ gather_output=False,
172
+ stride=1,
173
+ keep_master_weight_for_test=False,
174
+ skip_bias_add=False,
175
+ skip_weight_param_allocation: bool = False,
176
+ embedding_activation_buffer=None,
177
+ grad_output_buffer=None,
178
+ is_expert: bool = False,
179
+ tp_comm_buffer_name: str = None, # Not used
180
+ disable_grad_reduce: bool = False,
181
+ is_mla: bool = False,
182
+ ):
183
+ super().__init__()
184
+
185
+ if is_mla and config.sequence_parallel:
186
+ tp_size = get_tensor_model_parallel_world_size()
187
+ output_size = divide(output_size, tp_size)
188
+ parallel_mode = None
189
+ tp_size = 1
190
+ tp_group = None
191
+ # Keep input parameters
192
+ self.input_size = input_size
193
+ self.output_size = output_size
194
+ self.gather_output = gather_output
195
+ # Divide the weight matrix along the last dimension.
196
+ self.skip_bias_add = skip_bias_add
197
+ self.is_expert = is_expert
198
+ self.expert_parallel = config.expert_model_parallel_size > 1
199
+ self.embedding_activation_buffer = embedding_activation_buffer
200
+ self.grad_output_buffer = grad_output_buffer
201
+ self.config = config
202
+ self.disable_grad_reduce = disable_grad_reduce
203
+
204
+ if is_expert:
205
+ world_size = get_expert_tensor_parallel_world_size()
206
+ rank = get_expert_tensor_parallel_rank()
207
+ else:
208
+ world_size = get_tensor_model_parallel_world_size()
209
+ rank = get_tensor_model_parallel_rank()
210
+
211
+ self.output_size_per_partition = divide(output_size, world_size)
212
+
213
+ # Parameters.
214
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result
215
+ # we allocate the transpose.
216
+ # Initialize weight.
217
+ if not skip_weight_param_allocation:
218
+ self.weight = (self.output_size_per_partition, self.input_size)
219
+ else:
220
+ self.weight = (self.output_size_per_partition, self.input_size)
221
+
222
+ if bias:
223
+ self.bias = [self.output_size_per_partition]
224
+ else:
225
+ self.bias = None
226
+
227
+ self.sequence_parallel = config.sequence_parallel
228
+ if self.sequence_parallel and world_size <= 1:
229
+ warnings.warn(
230
+ "`sequence_parallel` is set to `True`, but tensor model parallel size "
231
+ f"is {world_size}. Disabling sequence parallel."
232
+ )
233
+ self.sequence_parallel = False
234
+
235
+ self.allreduce_dgrad = (
236
+ world_size > 1
237
+ and not self.sequence_parallel
238
+ and not self.disable_grad_reduce
239
+ )
240
+ self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
241
+
242
+ def num_parameter(self):
243
+ ret = cum_mul(self.weight)
244
+ if self.bias is not None:
245
+ ret += self.bias[0]
246
+ return ret
247
+
248
+ def num_activation(self, input_shape: list[int]):
249
+ return cum_mul(input_shape[:-1]) * self.weight[0]
250
+
251
+ def mock_forward(self, input_shape: list[int]):
252
+ try:
253
+ assert self.weight[-1] == input_shape[-1]
254
+ except:
255
+
256
+ print(f"{self.weight=} {input_shape=}")
257
+ __import__("ipdb").set_trace()
258
+ raise
259
+ return input_shape[:-1] + [self.weight[0]]
260
+
261
+
262
+ class RowParallelLinear(MemEstimator):
263
+ def __init__(
264
+ self,
265
+ input_size: int,
266
+ output_size: int,
267
+ *,
268
+ config: ModelParallelConfig,
269
+ init_method,
270
+ bias: bool,
271
+ input_is_parallel: bool,
272
+ skip_bias_add: bool,
273
+ stride: int = 1,
274
+ keep_master_weight_for_test: bool = False,
275
+ is_expert: bool = False,
276
+ tp_comm_buffer_name: str = None, # Not used
277
+ ):
278
+ super().__init__()
279
+
280
+ # Keep input parameters
281
+ self.input_size = input_size
282
+ self.output_size = output_size
283
+ self.input_is_parallel = input_is_parallel
284
+ self.skip_bias_add = skip_bias_add
285
+ self.config = config
286
+ self.is_expert = is_expert
287
+ self.expert_parallel = config.expert_model_parallel_size > 1
288
+ self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
289
+ self.sequence_parallel = config.sequence_parallel
290
+ if self.sequence_parallel and not self.input_is_parallel:
291
+ raise RuntimeError(
292
+ "To enable `sequence_parallel`, `input_is_parallel` must be `True`"
293
+ )
294
+
295
+ # Divide the weight matrix along the last dimension.
296
+ if self.is_expert:
297
+ world_size = get_expert_tensor_parallel_world_size()
298
+ rank = get_expert_tensor_parallel_rank()
299
+ else:
300
+ world_size = get_tensor_model_parallel_world_size()
301
+ rank = get_tensor_model_parallel_rank()
302
+
303
+ self.input_size_per_partition = divide(input_size, world_size)
304
+
305
+ self.weight = (self.output_size, self.input_size_per_partition)
306
+ if bias:
307
+ self.bias = [self.output_size]
308
+ else:
309
+ self.bias = None
310
+
311
+ def num_parameter(self):
312
+ ret = cum_mul(self.weight)
313
+ if self.bias is not None:
314
+ ret += self.bias[0]
315
+ return ret
316
+
317
+ def num_activation(self, input_shape: list[int]):
318
+ return cum_mul(input_shape[:-1]) * self.weight[1]
319
+
320
+ def mock_forward(self, input_shape: list[int]):
321
+ assert self.weight[0] == input_shape[-1]
322
+ return input_shape[:-1] + [self.weight[1]]
323
+
324
+
325
+ class RMSNorm(MemEstimator):
326
+ def __init__(self, hidden_size: int, *args, **kwargs):
327
+ super().__init__()
328
+ self.weight = hidden_size
329
+
330
+ def num_parameter(self):
331
+ return self.weight
332
+
333
+ def num_activation(self, input_shape: list[int]):
334
+ return cum_mul(input_shape[:])
335
+
336
+ def mock_forward(self, input_shape: list[int]):
337
+ return input_shape
338
+
339
+
340
+ class GetBiasDropoutAdd(MemEstimator):
341
+ def __init__(self, *args, **kwargs):
342
+ super().__init__()
343
+
344
+ def num_parameter(self):
345
+ return 0
346
+
347
+ def num_activation(self, input_shape: list[int]):
348
+ return cum_mul(input_shape[:])
349
+
350
+ def mock_forward(self, input_shape: list[int]):
351
+ return input_shape
352
+
353
+
354
+ get_bias_dropout_add = GetBiasDropoutAdd()
355
+
356
+
357
+ class MLP(MemEstimator):
358
+
359
+ def __init__(
360
+ self,
361
+ config: TransformerConfig,
362
+ submodules,
363
+ is_expert: bool = False,
364
+ input_size: int = None,
365
+ ):
366
+ super().__init__()
367
+
368
+ self.config: TransformerConfig = config
369
+
370
+ self.input_size = input_size if input_size != None else self.config.hidden_size
371
+
372
+ # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
373
+ ffn_hidden_size = self.config.ffn_hidden_size
374
+ if self.config.gated_linear_unit:
375
+ ffn_hidden_size *= 2
376
+
377
+ self.linear_fc1 = build_module(
378
+ submodules.linear_fc1,
379
+ self.input_size,
380
+ ffn_hidden_size,
381
+ config=self.config,
382
+ init_method=self.config.init_method,
383
+ gather_output=False,
384
+ bias=self.config.add_bias_linear,
385
+ skip_bias_add=True,
386
+ is_expert=is_expert,
387
+ tp_comm_buffer_name="fc1",
388
+ )
389
+
390
+ self.activation_func = self.config.activation_func
391
+
392
+ self.linear_fc2 = build_module(
393
+ submodules.linear_fc2,
394
+ self.config.ffn_hidden_size,
395
+ self.config.hidden_size,
396
+ config=self.config,
397
+ init_method=self.config.output_layer_init_method,
398
+ bias=self.config.add_bias_linear,
399
+ input_is_parallel=True,
400
+ skip_bias_add=True,
401
+ is_expert=is_expert,
402
+ tp_comm_buffer_name="fc2",
403
+ )
404
+
405
+ def num_parameter(self):
406
+ return self.linear_fc1.num_parameter() + self.linear_fc2.num_parameter()
407
+
408
+ def num_activation(self, input_shape: list[int]):
409
+ result = 0
410
+ result += self.linear_fc1.num_activation(input_shape)
411
+ intermediate_shape = self.linear_fc1.mock_forward(input_shape)
412
+ result += cum_mul(intermediate_shape) / 2 # activation layer
413
+ self.linear_fc2.num_activation(intermediate_shape)
414
+
415
+ return result
416
+
417
+ def mock_forward(self, input_shape: list[int]):
418
+ intermediate_shape = self.linear_fc1.mock_forward(input_shape)
419
+ output_shape = self.linear_fc2.mock_forward(intermediate_shape)
420
+ return output_shape
421
+
422
+
423
+ class ModuleList(MemEstimator):
424
+ def __init__(self, modules: list[MemEstimator] = None):
425
+ super().__init__()
426
+ if modules is None:
427
+ modules = []
428
+ self.modules = modules
429
+
430
+ def __repr__(self):
431
+ """Return a custom repr for ModuleList that compresses repeated module representations."""
432
+ list_of_reprs = [repr(item) for item in self.modules]
433
+ if len(list_of_reprs) == 0:
434
+ return self._get_name() + "()"
435
+
436
+ start_end_indices = [[0, 0]]
437
+ repeated_blocks = [list_of_reprs[0]]
438
+ for i, r in enumerate(list_of_reprs[1:], 1):
439
+ if r == repeated_blocks[-1]:
440
+ start_end_indices[-1][1] += 1
441
+ continue
442
+
443
+ start_end_indices.append([i, i])
444
+ repeated_blocks.append(r)
445
+
446
+ lines = []
447
+ stat = (
448
+ "\t/* n_params="
449
+ + colored(f"{self.num_parameter()/1024/1024:.2f}M", "red")
450
+ + "\tn_act="
451
+ + colored(f"{self.num_activation()/1024/1024:.2f}M", "green")
452
+ + " */"
453
+ )
454
+ main_str = self._get_name() + stat + " ("
455
+ for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
456
+ local_repr = f"({start_id}): {b}" # default repr
457
+
458
+ if start_id != end_id:
459
+ n = end_id - start_id + 1
460
+ local_repr = f"({start_id}-{end_id}): {n} x {b}"
461
+
462
+ local_repr = _addindent(local_repr, 2)
463
+ lines.append(local_repr)
464
+
465
+ main_str += "\n " + "\n ".join(lines) + "\n"
466
+ main_str += ")"
467
+ return main_str
468
+
469
+ def dump(self):
470
+ list_of_reprs = [repr(item) for item in self.modules]
471
+ if len(list_of_reprs) == 0:
472
+ return self._get_name() + "()"
473
+ list_of_dumps = [item.dump() for item in self.modules]
474
+
475
+ start_end_indices = [[0, 0]]
476
+ repeated_blocks = [list_of_reprs[0]]
477
+ repeated_blocks_dump = [list_of_dumps[0]]
478
+ for i, r in enumerate(list_of_reprs[1:], 1):
479
+ if r == repeated_blocks[-1]:
480
+ start_end_indices[-1][1] += 1
481
+ continue
482
+
483
+ start_end_indices.append([i, i])
484
+ repeated_blocks.append(r)
485
+ repeated_blocks_dump.append(list_of_dumps[i])
486
+ modules = {}
487
+ for (start_id, end_id), b in zip(start_end_indices, repeated_blocks_dump):
488
+ key = f"({start_id})"
489
+ if start_id != end_id:
490
+ n = end_id - start_id + 1
491
+ key = f"({start_id}-{end_id}) {n} layers"
492
+ modules[key] = b
493
+
494
+ ret = {}
495
+ ret["name"] = self._get_name()
496
+ ret["n_params"] = self.num_parameter()
497
+ ret["n_act"] = self.num_activation()
498
+ if len(modules) > 0:
499
+ ret["modules"] = modules
500
+ return ret
501
+
502
+ def append(self, m: MemEstimator):
503
+ self.modules.append(m)
504
+
505
+ def __len__(
506
+ self,
507
+ ):
508
+ return self.modules.__len__()
509
+
510
+ def num_parameter(self):
511
+ return sum([x.num_parameter() for x in self.modules])
512
+
513
+ def num_activation(self, input_shape: list[int]):
514
+ result = 0
515
+ for m in self.modules:
516
+ result += m.num_activation(input_shape)
517
+ input_shape = m.mock_forward(input_shape)
518
+
519
+ return result
520
+
521
+ def mock_forward(self, input_shape: list[int]):
522
+ for m in self.modules:
524
+ input_shape = m.mock_forward(input_shape)
525
+ return input_shape
526
+
527
+
528
+ class SequentialMLP(MemEstimator):
529
+ def __init__(self, num_local_experts, config: TransformerConfig, submodules):
530
+ super().__init__()
531
+ self.config = config
532
+ self.add_bias = config.add_bias_linear
533
+ self.moe_extended_tp = config.moe_extended_tp
534
+ self.num_local_experts = num_local_experts
535
+ self.local_experts = ModuleList()
536
+ for _ in range(self.num_local_experts):
537
+ expert = MLP(self.config, submodules, is_expert=True)
538
+ self.local_experts.append(expert)
539
+
540
+ def num_parameter(self):
541
+ return self.local_experts.num_parameter()
542
+
543
+ def num_activation(self, input_shape: list[int], tokens_per_expert=None):
544
+ # assume all the inputs are routed equally
545
+ all_tokens = input_shape[1]
546
+ result = 0
547
+ for m in self.local_experts.modules:
548
+ result += m.num_activation(
549
+ input_shape[:1]
550
+ + [all_tokens // self.num_local_experts]
551
+ + input_shape[2:]
552
+ )
553
+ return result
554
+
555
+ def mock_forward(self, input_shape: list[int], tokens_per_expert=None):
556
+ # assume all the inputs are routed to the first expert
557
+ input_shape = self.local_experts.modules[0].mock_forward(input_shape)
558
+ return input_shape
559
+
560
+
561
+ class TEGroupedMLP(MemEstimator):
562
+ """An efficient implementation of the Experts layer using TE's GroupedLinear.
563
+
564
+ Executes multiple experts in parallel to maximize computational efficiency.
565
+ """
566
+
567
+ def __init__(self, num_local_experts, config: TransformerConfig, submodules):
568
+ super().__init__()
569
+ self.config = config
570
+ self.moe_extended_tp = config.moe_extended_tp
571
+ self.num_local_experts = num_local_experts
572
+ self.input_size = self.config.hidden_size
573
+
574
+ # Double the output width with gated linear unit, see https://arxiv.org/pdf/2002.05202.pdf
575
+ ffn_hidden_size = self.config.moe_ffn_hidden_size
576
+ if self.config.gated_linear_unit:
577
+ ffn_hidden_size *= 2
578
+
579
+ self.linear_fc1 = build_module(
580
+ submodules.linear_fc1,
581
+ self.num_local_experts,
582
+ self.input_size,
583
+ ffn_hidden_size,
584
+ config=self.config,
585
+ init_method=self.config.init_method,
586
+ bias=self.config.add_bias_linear,
587
+ skip_bias_add=True,
588
+ is_expert=True,
589
+ tp_comm_buffer_name="fc1",
590
+ )
591
+
592
+ self.activation_func = self.config.activation_func
593
+
594
+ self.activation_recompute = (
595
+ self.config.recompute_granularity == "selective"
596
+ and "moe_act" in self.config.recompute_modules
597
+ )
598
+ self.linear_fc2 = build_module(
599
+ submodules.linear_fc2,
600
+ self.num_local_experts,
601
+ self.config.moe_ffn_hidden_size,
602
+ self.config.hidden_size,
603
+ config=self.config,
604
+ init_method=self.config.output_layer_init_method,
605
+ bias=self.config.add_bias_linear,
606
+ skip_bias_add=True,
607
+ is_expert=True,
608
+ tp_comm_buffer_name="fc2",
609
+ )
610
+ # TODO if self.config.fp8:
611
+
612
+ def num_parameter(self):
613
+ ret = self.linear_fc1.num_parameter()
614
+ ret += self.linear_fc2.num_parameter()
615
+ return ret
616
+
617
+ def num_activation(self, input_shape: list[int], tokens_per_expert=None):
618
+ ret = 0
619
+ if not self.activation_recompute:
620
+ ret += self.linear_fc1.num_activation(input_shape)
621
+ input_shape = self.linear_fc1.mock_forward(input_shape)
622
+
623
+ # activation
624
+ if not self.activation_recompute:
625
+ ret += cum_mul(input_shape) / 2 # swiglu or gelu
626
+ input_shape = deepcopy(input_shape)
627
+ input_shape[-1] //= 2
628
+
629
+ ret += self.linear_fc2.num_activation(input_shape)
630
+ return ret
631
+
632
+ def mock_forward(self, input_shape: list[int], tokens_per_expert=None):
633
+ # fc1 -> activation -> fc2 preserves the model hidden size, so the shape is unchanged
634
+ return input_shape
636
+
637
+
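A quick width check for the grouped-experts estimate above (numbers are mine, purely illustrative): with hidden size 4096, `moe_ffn_hidden_size=1536` and a gated linear unit, `linear_fc1` maps each routed token to 2*1536 features, the gated activation halves that back, and `linear_fc2` returns to the model width; expert tensor parallelism further divides the intermediate widths.

    # Illustrative values only; etp is the expert tensor-parallel size.
    hidden, moe_ffn, etp = 4096, 1536, 1
    fc1_out = 2 * moe_ffn // etp   # gated linear unit doubles the fc1 width
    act_out = fc1_out // 2         # SwiGLU/GeLU gate halves it back
    fc2_out = hidden               # row-parallel fc2 restores the model width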
638
+ class TEGroupedLinear(MemEstimator):
639
+ def __init__(
640
+ self,
641
+ num_gemms: int,
642
+ input_size: int,
643
+ output_size: int,
644
+ *,
645
+ parallel_mode: str,
646
+ config: ModelParallelConfig,
647
+ init_method,
648
+ bias: bool,
649
+ skip_bias_add: bool,
650
+ is_expert: bool = False,
651
+ tp_comm_buffer_name: str = None,
652
+ ):
653
+ super().__init__()
654
+ self.config = config
655
+
656
+ # TE returns a zero length Tensor when bias=False and
657
+ # return_bias=True, but we prefer None. So in that case we
658
+ # tell TE to not return the bias, and return None
659
+ # ourselves. This way our forward always returns two values
660
+ # and we don't have to deal with the zero length Tensor.
661
+ self.te_return_bias = skip_bias_add and bias
662
+ self.is_first_microbatch = True
663
+ self.disable_parameter_transpose_cache = (
664
+ self.config.disable_parameter_transpose_cache
665
+ )
666
+
667
+ extra_kwargs = _get_extra_te_kwargs(config)
668
+ extra_kwargs["ub_name"] = tp_comm_buffer_name
669
+
670
+ self.expert_parallel = self.config.expert_model_parallel_size > 1
671
+ if self.expert_parallel:
672
+ extra_kwargs["rng_tracker_name"] = get_expert_parallel_rng_tracker_name()
673
+
674
+ # For MoE models, the comms between TP and EP group is explicitly handled by
675
+ # MoE token dispatcher. So we disable comms by making TE agnostic of model parallel.
676
+ self.explicit_expert_comm = is_expert and (
677
+ config.tensor_model_parallel_size > 1 or self.expert_parallel
678
+ )
679
+ if is_expert:
680
+ tp_size = get_expert_tensor_parallel_world_size()
681
+ else:
682
+ tp_size = get_tensor_model_parallel_world_size()
683
+ if self.explicit_expert_comm:
684
+ if parallel_mode == "column":
685
+ output_size = divide(output_size, tp_size)
686
+ elif parallel_mode == "row":
687
+ input_size = divide(input_size, tp_size)
688
+ parallel_mode = None
689
+ tp_size = 1
690
+ assert not bias, "bias is not considered for now"
691
+
692
+ self.num_gemms = num_gemms
693
+ self.input_size = input_size
694
+ self.output_size = output_size
695
+
696
+ def num_parameter(self):
697
+ ret = self.num_gemms * self.input_size * self.output_size
698
+ return ret
699
+
700
+ def num_activation(self, input_shape: list[int], tokens_per_expert=None):
701
+ ret = cum_mul(self.mock_forward(input_shape))
702
+ return ret
703
+
704
+ def mock_forward(self, input_shape: list[int], tokens_per_expert=None):
705
+ return input_shape[:-1] + [self.output_size]
706
+
707
+
708
+ class TEColumnParallelGroupedLinear(TEGroupedLinear):
709
+ def __init__(
710
+ self,
711
+ num_gemms: int,
712
+ input_size: int,
713
+ output_size: int,
714
+ *,
715
+ config: ModelParallelConfig,
716
+ init_method,
717
+ bias: bool,
718
+ skip_bias_add: bool,
719
+ is_expert: bool,
720
+ tp_comm_buffer_name: str = None,
721
+ ):
722
+ super().__init__(
723
+ num_gemms=num_gemms,
724
+ input_size=input_size,
725
+ output_size=output_size,
726
+ parallel_mode="column",
727
+ config=config,
728
+ init_method=condition_init_method(config, init_method),
729
+ bias=bias,
730
+ skip_bias_add=skip_bias_add,
731
+ is_expert=is_expert,
732
+ tp_comm_buffer_name=tp_comm_buffer_name,
733
+ )
734
+
735
+
736
+ class TERowParallelGroupedLinear(TEGroupedLinear):
737
+ def __init__(
738
+ self,
739
+ num_gemms: int,
740
+ input_size: int,
741
+ output_size: int,
742
+ *,
743
+ config: ModelParallelConfig,
744
+ init_method,
745
+ bias: bool,
746
+ skip_bias_add: bool,
747
+ is_expert: bool,
748
+ tp_comm_buffer_name: str = None,
749
+ ):
750
+
751
+ super().__init__(
752
+ num_gemms=num_gemms,
753
+ input_size=input_size,
754
+ output_size=output_size,
755
+ parallel_mode="row",
756
+ config=config,
757
+ init_method=condition_init_method(config, init_method),
758
+ bias=bias,
759
+ skip_bias_add=skip_bias_add,
760
+ is_expert=is_expert,
761
+ tp_comm_buffer_name=tp_comm_buffer_name,
762
+ )
763
+
764
+
765
+ class SharedExpertMLP(MLP):
766
+ """
767
+ MLP layer for Shared Experts.
768
+ """
769
+
770
+ def __init__(self, config: TransformerConfig, spec: ModuleSpec):
771
+ config = deepcopy(config)
772
+ assert (
773
+ config.add_bias_linear == False
774
+ ), "bias is not supported in the shared experts, "
775
+ "please set '--disable-bias-linear' instead."
776
+
777
+ config.ffn_hidden_size = config.moe_shared_expert_intermediate_size
778
+ super().__init__(config=config, submodules=spec.submodules)
779
+
780
+ self.use_shared_expert_gate = spec.params.get("gate", False)
781
+ if self.use_shared_expert_gate:
782
+ assert False, "use_shared_expert_gate is not Implemented"
783
+ # self.gate_weight = torch.nn.Parameter(torch.empty((1, self.config.hidden_size)))
784
+ # if config.perform_initialization:
785
+ # if get_cuda_rng_tracker().is_initialized():
786
+ # with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()):
787
+ # config.init_method(self.gate_weight)
788
+ # else:
789
+ # config.init_method(self.gate_weight)
790
+ # self.gate_weight.data = self.gate_weight.data.to(dtype=config.params_dtype)
791
+ # setattr(self.gate_weight, 'sequence_parallel', self.config.sequence_parallel)
792
+ else:
793
+ self.gate_weight = None
794
+
795
+
796
+ class TransformerBlock(MemEstimator):
797
+ """Transformer class."""
798
+
799
+ def __init__(
800
+ self,
801
+ config: TransformerConfig,
802
+ spec: Union[TransformerBlockSubmodules, ModuleSpec],
803
+ post_layer_norm: bool = True,
804
+ pre_process: bool = True,
805
+ post_process: bool = True,
806
+ vp_stage: Optional[int] = None,
807
+ ):
808
+ super().__init__()
809
+ self.config = config
810
+
811
+ self.submodules = _get_block_submodules(config, spec, vp_stage)
812
+ self.post_layer_norm = post_layer_norm
813
+ self.pre_process = pre_process
814
+ self.post_process = post_process
815
+ self.vp_stage = vp_stage
816
+ self.cuda_graphs = {}
817
+ self.current_microbatch = -1
818
+ self.input_tensor = None
819
+ self.checkpoint_core_attention = (
820
+ self.config.recompute_granularity == "selective"
821
+ and "core_attn" in self.config.recompute_modules
822
+ )
823
+
824
+ self._build_layers()
825
+ self.num_layers_per_pipeline_rank = len(self.layers)
826
+ self.tp_only_amax_red = config.tp_only_amax_red
827
+
828
+ def _build_layers(self):
829
+ def build_layer(layer_spec, layer_number):
830
+ return build_module(
831
+ layer_spec,
832
+ config=self.config,
833
+ layer_number=layer_number,
834
+ vp_stage=self.vp_stage,
835
+ )
836
+
837
+ # offset is implicit in TransformerLayer
838
+ self.layers = ModuleList(
839
+ [
840
+ build_layer(layer_spec, i + 1)
841
+ for i, layer_spec in enumerate(self.submodules.layer_specs)
842
+ ]
843
+ )
844
+
845
+ if self.submodules.layer_norm and self.post_process and self.post_layer_norm:
846
+ self.final_layernorm = build_module(
847
+ self.submodules.layer_norm,
848
+ config=self.config,
849
+ hidden_size=self.config.hidden_size,
850
+ eps=self.config.layernorm_epsilon,
851
+ )
852
+ else:
853
+ self.final_layernorm = None # Either this or nn.Identity
854
+
855
+ def num_parameter(self):
856
+ ret = self.layers.num_parameter()
857
+ if self.final_layernorm is not None:
858
+ ret += self.final_layernorm.num_parameter()
859
+
860
+ return ret
861
+
862
+ def num_activation(self, input_shape: list[int]):
863
+ result = self.layers.num_activation(input_shape)
864
+ if self.final_layernorm is not None:
865
+ result += self.final_layernorm.num_activation(input_shape)
866
+ return result
867
+
868
+ def mock_forward(self, input_shape: list[int]):
869
+ return input_shape
870
+
871
+
872
+ class TopKRouter(MemEstimator):
873
+
874
+ def __init__(self, config: TransformerConfig) -> None:
875
+ super().__init__()
876
+ self.config = config
877
+ self.topk = self.config.moe_router_topk
878
+ self.routing_type = self.config.moe_router_load_balancing_type
879
+ self.input_jitter = None
880
+
881
+ def num_parameter(self):
882
+ return 0
883
+
884
+ def num_activation(self, input_shape: list[int]):
885
+ result = cum_mul(input_shape) * 2 # sinkhorn and sinkhorn activation
886
+ return result
887
+
888
+ def mock_forward(self, input_shape: list[int]):
889
+ return input_shape[:-1] + [self.topk]
890
+
891
+
892
+ class MoELayer(MemEstimator):
893
+
894
+ def __init__(
895
+ self, config: TransformerConfig, submodules=None, layer_number: int = None
896
+ ):
897
+ super().__init__()
898
+ self.config = config
899
+ self.submodules = submodules
900
+ self.moe_layer_recompute = config.moe_layer_recompute
901
+
902
+ self.expert_parallel_size = get_expert_model_parallel_world_size()
903
+ assert (
904
+ self.expert_parallel_size > 0
905
+ ), "Expected non-negative expert parallel size"
906
+
907
+ assert self.config.num_moe_experts % self.expert_parallel_size == 0
908
+ self.num_local_experts = (
909
+ self.config.num_moe_experts // self.expert_parallel_size
910
+ )
911
+ local_expert_indices_offset = (
912
+ get_expert_model_parallel_rank() * self.num_local_experts
913
+ )
914
+
915
+ self.moe_layer_recompute = (
916
+ config.recompute_granularity == "selective"
917
+ and "moe" in config.recompute_modules
918
+ )
919
+
920
+ self.router = TopKRouter(config=self.config)
921
+ self.use_shared_expert = (
922
+ self.config.moe_shared_expert_intermediate_size is not None
923
+ )
924
+ self.shared_expert_overlap = self.config.moe_shared_expert_overlap
925
+
926
+ self.local_expert_indices = [
927
+ local_expert_indices_offset + i for i in range(self.num_local_experts)
928
+ ]
929
+ assert all(
930
+ map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices)
931
+ )
932
+
933
+ self.experts = None
934
+ self.shared_experts = None
935
+ self.token_dispatcher = None
936
+ self.layer_number = layer_number
937
+ # Initialize experts
938
+ self.experts = build_module(
939
+ self.submodules.experts, self.num_local_experts, self.config
940
+ )
941
+
942
+ # Initialize shared experts
943
+ if self.use_shared_expert:
944
+ self.shared_experts = SharedExpertMLP(
945
+ self.config, self.submodules.shared_experts
946
+ )
947
+ # if self.shared_expert_overlap:
948
+ # self.token_dispatcher.set_shared_experts(self.shared_experts)
949
+
950
+ def num_parameter(self):
951
+ ret = self.experts.num_parameter() + self.router.num_parameter()
952
+ if self.use_shared_expert:
953
+ ret += self.shared_experts.num_parameter()
954
+ return ret
955
+
956
+ def num_activation(self, input_shape: list[int]):
957
+ if self.moe_layer_recompute:
958
+ return 0
959
+ tp_size = get_tensor_model_parallel_world_size()
960
+ etp_size = get_expert_tensor_parallel_world_size()
961
+ new_input_shape = deepcopy(input_shape)
962
+ new_input_shape[1] = input_shape[1] // tp_size * etp_size
963
+ input_shape = new_input_shape
964
+
965
+ result = self.router.num_activation(input_shape)
966
+ result += cum_mul(input_shape) * self.router.topk # token dispatcher
967
+ moe_input_shape_average = deepcopy(input_shape)
968
+ moe_input_shape_average[1] = int(moe_input_shape_average[1] * self.router.topk)
969
+
970
+ result += self.experts.num_activation(moe_input_shape_average)
971
+ if self.use_shared_expert:
972
+ result += self.shared_experts.num_activation(input_shape)
973
+
974
+ if self.config.moe_layer_recompute:
975
+ result = cum_mul(input_shape) * 2
976
+ return result
977
+
978
+ def mock_forward(self, input_shape: list[int]):
979
+ return input_shape
980
+
981
+
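A worked example of the scaling in `MoELayer.num_activation` above (assumed numbers, batch size 1 for simplicity): the token count is first rescaled from the TP view to the expert-TP view, the router and the dispatcher copy are charged on that rescaled count, and the experts are charged for `topk` times as many routed tokens.

    # Illustrative only; h is the hidden size seen by this rank.
    tokens, tp, etp, topk, h = 4096, 2, 1, 8, 4096
    tokens_local  = tokens // tp * etp        # dispatcher view after TP -> ETP rescaling
    router_act    = 2 * tokens_local * h      # router estimate (scores + activation)
    dispatch_act  = topk * tokens_local * h   # token-dispatcher copy
    expert_tokens = topk * tokens_local       # tokens charged to the local experts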
982
+ class IdentityOp(MemEstimator):
983
+ def num_parameter(self):
984
+ return 0
985
+
986
+ def num_activation(self, input_shape: list[int]):
987
+ return 0
988
+
989
+ def mock_forward(self, input_shape: list[int]):
990
+ return input_shape
991
+
992
+
993
+ IdentityFuncOp = IdentityOp
994
+ TERowParallelLinear = RowParallelLinear
995
+ TEColumnParallelLinear = ColumnParallelLinear
996
+ TELayerNormColumnParallelLinear = ColumnParallelLinear
997
+
998
+
999
+ class TEDotProductAttention(MemEstimator):
1000
+ def __init__(self, config: TransformerConfig, *args, **kwargs):
1001
+ super().__init__()
1002
+ self.config = config
1003
+
1004
+ def num_parameter(self):
1005
+ return 0
1006
+
1007
+ def num_activation(
1008
+ self, q_shape: list[int], k_shape: list[int], v_shape: list[int]
1009
+ ):
1010
+ bs, seqs, heads, dim = q_shape
1011
+ if self.config.multi_latent_attention and False:  # MLA-specific estimate disabled; fall through to the flash-attention branch
1012
+ result = bs * seqs * seqs * heads
1013
+ else:
1014
+ bs, seqs, heads, dim = k_shape
1015
+ result = (
1016
+ bs * seqs * dim * heads * 2 # * self.config.tensor_model_parallel_size
1017
+ ) # flash attention
1018
+ if self.config.context_parallel_size > 1:
1019
+ result *= 2
1020
+ return result
1021
+
1022
+ def mock_forward(
1023
+ self,
1024
+ hidden_size: int,
1025
+ q_shape: list[int],
1026
+ k_shape: list[int],
1027
+ v_shape: list[int],
1028
+ ):
1029
+ seqs, bs, heads, dim = q_shape
1030
+ return [seqs, bs, hidden_size]
1031
+
1032
+
1033
+ class TransformerLayer(MemEstimator):
1034
+ def __init__(
1035
+ self,
1036
+ config: TransformerConfig,
1037
+ submodules,
1038
+ layer_number: int = 1,
1039
+ hidden_dropout: float = None,
1040
+ vp_stage: Optional[int] = None,
1041
+ ):
1042
+ super().__init__()
1043
+ self.config = config
1044
+
1045
+ if config.enable_cuda_graph and self.training:
1046
+ assert (
1047
+ not config.cpu_offloading and config.recompute_granularity is None
1048
+ ), "Cudagraphs not supported"
1049
+ self.cudagraph_manager = CudaGraphManager()
1050
+
1051
+ self.submodules_config = submodules
1052
+ self.layer_number = layer_number + get_transformer_layer_offset(
1053
+ self.config, vp_stage
1054
+ )
1055
+ self.hidden_dropout = (
1056
+ config.hidden_dropout if hidden_dropout is None else hidden_dropout
1057
+ )
1058
+
1059
+ # [Module 1: Input Layernorm] Optional Layernorm on the input data
1060
+ # TODO: add pytorch only layernorm
1061
+ self.input_layernorm = build_module(
1062
+ submodules.input_layernorm,
1063
+ config=self.config,
1064
+ hidden_size=self.config.hidden_size,
1065
+ eps=self.config.layernorm_epsilon,
1066
+ )
1067
+
1068
+ # [Module 2: SelfAttention]
1069
+ self.self_attention = build_module(
1070
+ submodules.self_attention, config=self.config, layer_number=layer_number
1071
+ )
1072
+
1073
+ # [Module 3: BiasDropoutFusion]
1074
+ self.self_attn_bda = build_module(submodules.self_attn_bda)
1075
+
1076
+ # [Module 4: Post SelfAttention] Optional Layernorm after self-attn
1077
+ self.pre_cross_attn_layernorm = build_module(
1078
+ submodules.pre_cross_attn_layernorm,
1079
+ config=self.config,
1080
+ hidden_size=self.config.hidden_size,
1081
+ eps=self.config.layernorm_epsilon,
1082
+ )
1083
+
1084
+ # [Module 5: CrossAttention]
1085
+ self.cross_attention = build_module(
1086
+ submodules.cross_attention, config=self.config, layer_number=layer_number
1087
+ )
1088
+
1089
+ # [Module 6: BiasDropoutFusion]
1090
+ self.cross_attn_bda = build_module(
1091
+ submodules.cross_attn_bda, config=self.config
1092
+ )
1093
+
1094
+ # [Module 7: Pre MLP] Optional Layernorm before MLP
1095
+ self.pre_mlp_layernorm = build_module(
1096
+ submodules.pre_mlp_layernorm,
1097
+ config=self.config,
1098
+ hidden_size=self.config.hidden_size,
1099
+ eps=self.config.layernorm_epsilon,
1100
+ )
1101
+
1102
+ # [Module 8: MLP block]
1103
+ self.mlp = build_module(submodules.mlp, config=self.config)
1104
+ if hasattr(self.mlp, "set_layer_number"):
1105
+ self.mlp.set_layer_number(self.layer_number)
1106
+
1107
+ # [Module 9: BiasDropoutFusion]
1108
+ self.mlp_bda = build_module(submodules.mlp_bda)
1109
+
1110
+ self.recompute_input_layernorm = False
1111
+ self.recompute_pre_mlp_layernorm = False
1112
+ self.recompute_mlp = False
1113
+ if self.config.recompute_granularity == "selective":
1114
+ if "layernorm" in self.config.recompute_modules:
1115
+ if not isinstance(self.input_layernorm, IdentityOp):
1116
+ self.recompute_input_layernorm = True
1117
+ if not isinstance(self.pre_mlp_layernorm, IdentityOp):
1118
+ self.recompute_pre_mlp_layernorm = True
1119
+ if "mlp" in self.config.recompute_modules:
1120
+
1121
+ if not isinstance(self.mlp, MoELayer):
1122
+ self.recompute_mlp = True
1123
+
1124
+ def num_parameter(self):
1125
+ result = self.input_layernorm.num_parameter()
1126
+ result += self.self_attention.num_parameter()
1127
+ result += self.pre_cross_attn_layernorm.num_parameter()
1128
+ result += self.cross_attention.num_parameter()
1129
+ result += self.cross_attn_bda.num_parameter()
1130
+ result += self.pre_mlp_layernorm.num_parameter()
1131
+ result += self.mlp.num_parameter()
1132
+
1133
+ return result
1134
+
1135
+ def num_activation(self, input_shape: list[int]):
1136
+ result = 0
1137
+ result += self.self_attention.num_activation(input_shape)
1138
+ if not self.recompute_mlp:
1139
+ result += self.mlp.num_activation(input_shape)
1140
+ # __import__('ipdb').set_trace()
1141
+ # sequence parallel
1142
+ if self.config.sequence_parallel and self.config.tensor_model_parallel_size > 1:
1143
+ input_shape = deepcopy(input_shape)
1144
+ input_shape[1] //= self.config.tensor_model_parallel_size
1145
+ if not self.recompute_input_layernorm:
1146
+ result += self.input_layernorm.num_activation(input_shape)
1147
+ if not self.recompute_pre_mlp_layernorm:
1148
+ result += self.pre_mlp_layernorm.num_activation(input_shape)
1149
+ result += self.self_attn_bda.num_activation(input_shape)
1150
+ result += self.mlp_bda.num_activation(input_shape)
1151
+ return result
1152
+
1153
+ def mock_forward(self, input_shape: list[int]):
1154
+ return input_shape
1155
+
1156
+
1157
+ class SelfAttention(MemEstimator):
1158
+
1159
+ def __init__(
1160
+ self,
1161
+ config: TransformerConfig,
1162
+ submodules,
1163
+ layer_number: int,
1164
+ attn_mask_type,
1165
+ ):
1166
+ super().__init__()
1167
+
1168
+ self.config = config
1169
+ self.layer_number = layer_number
1170
+ self.attn_mask_type = attn_mask_type
1171
+ self.attention_type = ""
1172
+
1173
+ # For normal attention without groups, num_query_groups == num_attention_heads,
1174
+ # so these two will be the same
1175
+ self.query_projection_size = (
1176
+ self.config.kv_channels * self.config.num_attention_heads
1177
+ )
1178
+ self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups
1179
+
1180
+ # Per attention head and per partition values.
1181
+ world_size = get_tensor_model_parallel_world_size()
1182
+ self.hidden_size_per_attention_head = divide(
1183
+ self.query_projection_size, self.config.num_attention_heads
1184
+ )
1185
+ self.num_attention_heads_per_partition = divide(
1186
+ self.config.num_attention_heads, world_size
1187
+ )
1188
+ self.num_query_groups_per_partition = divide(
1189
+ self.config.num_query_groups, world_size
1190
+ )
1191
+ self.core_attention = build_module(
1192
+ submodules.core_attention,
1193
+ config=self.config,
1194
+ layer_number=self.layer_number,
1195
+ attn_mask_type=self.attn_mask_type,
1196
+ )
1197
+ self.linear_qkv = build_module(
1198
+ submodules.linear_qkv,
1199
+ self.config.hidden_size,
1200
+ self.query_projection_size + 2 * self.kv_projection_size,
1201
+ config=self.config,
1202
+ init_method=self.config.init_method,
1203
+ gather_output=False,
1204
+ bias=self.config.add_bias_linear or self.config.add_qkv_bias,
1205
+ skip_bias_add=False,
1206
+ is_expert=False,
1207
+ tp_comm_buffer_name="qkv",
1208
+ )
1209
+
1210
+ if submodules.q_layernorm is not None:
1211
+ self.q_layernorm = build_module(
1212
+ submodules.q_layernorm,
1213
+ hidden_size=self.hidden_size_per_attention_head,
1214
+ config=self.config,
1215
+ eps=self.config.layernorm_epsilon,
1216
+ )
1217
+ else:
1218
+ self.q_layernorm = None
1219
+
1220
+ if submodules.k_layernorm is not None:
1221
+ self.k_layernorm = build_module(
1222
+ submodules.k_layernorm,
1223
+ hidden_size=self.hidden_size_per_attention_head,
1224
+ config=self.config,
1225
+ eps=self.config.layernorm_epsilon,
1226
+ )
1227
+ else:
1228
+ self.k_layernorm = None
1229
+ self.linear_proj = build_module(
1230
+ submodules.linear_proj,
1231
+ self.query_projection_size,
1232
+ self.config.hidden_size,
1233
+ config=self.config,
1234
+ init_method=self.config.output_layer_init_method,
1235
+ bias=self.config.add_bias_linear,
1236
+ input_is_parallel=True,
1237
+ skip_bias_add=True,
1238
+ is_expert=False,
1239
+ tp_comm_buffer_name="proj",
1240
+ )
1241
+ self.checkpoint_core_attention = (
1242
+ self.config.recompute_granularity == "selective"
1243
+ )
1244
+
1245
+ def num_parameter(self):
1246
+ result = 0
1247
+ result += self.core_attention.num_parameter()
1248
+ result += self.linear_proj.num_parameter()
1249
+ result += self.linear_qkv.num_parameter()
1250
+ if self.q_layernorm is not None:
1251
+ result += self.q_layernorm.num_parameter()
1252
+ if self.k_layernorm is not None:
1253
+ result += self.k_layernorm.num_parameter()
1254
+
1255
+ return result
1256
+
1257
+ def num_activation(self, input_shape: list[int]):
1258
+ ret = 0
1259
+ ## in estimator: act(linear) = 1.5*cum_mul(input_shape)
1260
+ ## in reality: act(linear) = cum_mul(input_shape), act(rotary) = cum_mul(input_shape), act(attn_forward_func_with_cp) = cum_mul(input_shape)
1261
+ # ret += self.linear_qkv.num_activation(input_shape)
1262
+ mixed_qkv_shape = self.linear_qkv.mock_forward(input_shape)
1263
+ new_tensor_shape = mixed_qkv_shape[:-1] + [
1264
+ self.num_query_groups_per_partition,
1265
+ (
1266
+ (
1267
+ self.num_attention_heads_per_partition
1268
+ // self.num_query_groups_per_partition
1269
+ + 2
1270
+ )
1271
+ * self.hidden_size_per_attention_head
1272
+ ),
1273
+ ]
1274
+ split_arg_list = [
1275
+ (
1276
+ self.num_attention_heads_per_partition
1277
+ // self.num_query_groups_per_partition
1278
+ * self.hidden_size_per_attention_head
1279
+ ),
1280
+ self.hidden_size_per_attention_head,
1281
+ self.hidden_size_per_attention_head,
1282
+ ]
1283
+ # [sq, b, ng, (np/ng + 2) * hn]
1284
+ # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
1285
+ q_shape = new_tensor_shape[:-1] + [split_arg_list[0]]
1286
+ k_shape = new_tensor_shape[:-1] + [split_arg_list[1]]
1287
+ v_shape = new_tensor_shape[:-1] + [split_arg_list[2]]
1288
+ # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
1289
+ q_shape = (
1290
+ q_shape[:2]
1291
+ + [cum_mul(q_shape[-2:]) // self.hidden_size_per_attention_head]
1292
+ + [self.hidden_size_per_attention_head]
1293
+ )
1294
+
1295
+ if not self.checkpoint_core_attention:
1296
+ ret += self.core_attention.num_activation(q_shape, k_shape, v_shape)
1297
+ ret += self.linear_proj.num_activation(input_shape)
1298
+ ## in reality: act(linear) = cum_mul(input_shape), act(rotary) = cum_mul(input_shape), act(attn_forward_func_with_cp) = cum_mul(input_shape)
1299
+ ret += self.linear_proj.num_activation(input_shape) * 3
1300
+
1301
+ return ret
1302
+
1303
+ def mock_forward(self, input_shape: list[int]):
1304
+ return input_shape
1305
+
1306
+
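The fused QKV width used above is `query_projection_size + 2 * kv_projection_size`, and the shape juggling splits it back into per-group `[np/ng*hn, hn, hn]` chunks. A quick numeric check with assumed GQA settings (not taken from the commit):

    # Example only: 32 attention heads, 8 query groups, head dim 128.
    heads, groups, hn = 32, 8, 128
    qkv_width = heads * hn + 2 * groups * hn   # 4096 + 2048 = 6144 features per token
    per_group = (heads // groups + 2) * hn     # 768 per group; 8 groups * 768 = 6144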
1307
+ class Linear(MemEstimator):
1308
+ def __init__(
1309
+ self,
1310
+ in_features: int,
1311
+ out_features: int,
1312
+ bias: bool = True,
1313
+ device=None,
1314
+ dtype=None,
1315
+ ) -> None:
1316
+
1317
+ super().__init__()
1318
+ self.weight = (in_features, out_features)
1319
+
1320
+ def num_parameter(self):
1321
+ return self.weight[0] * self.weight[1]
1322
+
1323
+ def num_activation(self, input_shape: list[int]):
1324
+ return cum_mul(input_shape[:-1]) * self.weight[1]
1325
+
1326
+ def mock_forward(self, input_shape: list[int]):
1327
+ return input_shape[:-1] + [self.weight[1]]
1328
+
1329
+
1330
+ class MLASelfAttention(MemEstimator):
1331
+ """MLA Self-attention layer class
1332
+
1333
+ Self-attention layer takes input with size [s, b, h]
1334
+ and returns output of the same size.
1335
+ """
1336
+
1337
+ def __init__(
1338
+ self,
1339
+ config: MLATransformerConfig,
1340
+ submodules,
1341
+ layer_number: int,
1342
+ attn_mask_type=AttnMaskType.padding,
1343
+ ) -> None:
1344
+
1345
+ super().__init__()
1346
+ self.config = config
1347
+ self.layer_number = layer_number
1348
+ self.attn_mask_type = attn_mask_type
1349
+ self.attention_type = "self"
1350
+ self.world_size = get_tensor_model_parallel_world_size()
1351
+ # assert (
1352
+ # world_size == 1
1353
+ # ), "MLA is not supported with Tensor Parallelism yet, \
1354
+ # use Expert Parallelism and Pipeline Parallelism for better performance."
1355
+
1356
+ self.query_projection_size = (
1357
+ self.config.v_head_dim * self.config.num_attention_heads
1358
+ )
1359
+
1360
+ self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim
1361
+
1362
+ mscale = _yarn_get_mscale(self.config.rotary_scaling_factor, self.config.mscale)
1363
+ self.softmax_scale = mscale * mscale / math.sqrt(self.q_head_dim)
1364
+
1365
+ # Per attention head and per partition values.
1366
+ world_size = get_tensor_model_parallel_world_size()
1367
+ self.hidden_size_per_attention_head = divide(
1368
+ self.query_projection_size, self.config.num_attention_heads
1369
+ )
1370
+ self.num_attention_heads_per_partition = divide(
1371
+ self.config.num_attention_heads, world_size
1372
+ )
1373
+ self.num_query_groups_per_partition = divide(
1374
+ self.config.num_query_groups, world_size
1375
+ )
1376
+ # TODO Rotary Embedding
1377
+ # self.rotary_pos_emb = YarnRotaryEmbedding(
1378
+ # self.config.qk_pos_emb_head_dim,
1379
+ # rotary_base=self.config.rotary_base,
1380
+ # scaling_factor=self.config.rotary_scaling_factor,
1381
+ # original_max_position_embeddings=self.config.max_position_embeddings,
1382
+ # beta_fast=self.config.beta_fast,
1383
+ # beta_slow=self.config.beta_slow,
1384
+ # mscale=self.config.mscale,
1385
+ # mscale_all_dim=self.config.mscale_all_dim,
1386
+ # )
1387
+
1388
+ self.core_attention = build_module(
1389
+ submodules.core_attention,
1390
+ config=self.config,
1391
+ layer_number=self.layer_number,
1392
+ attn_mask_type=self.attn_mask_type,
1393
+ attention_type=self.attention_type,
1394
+ softmax_scale=self.softmax_scale,
1395
+ k_channels=self.q_head_dim,
1396
+ v_channels=self.config.v_head_dim,
1397
+ )
1398
+
1399
+ if self.config.q_lora_rank is None:
1400
+ # Not projecting the query
1401
+ self.linear_q_proj = build_module(
1402
+ submodules.linear_q_proj,
1403
+ self.config.hidden_size,
1404
+ self.config.num_attention_heads * self.q_head_dim,
1405
+ config=self.config,
1406
+ init_method=self.config.init_method,
1407
+ gather_output=False,
1408
+ bias=False,
1409
+ skip_bias_add=False,
1410
+ is_expert=False,
1411
+ is_mla=True,
1412
+ )
1413
+
1414
+ else:
1415
+ self.linear_q_down_proj = Linear(
1416
+ self.config.hidden_size, self.config.q_lora_rank, bias=False
1417
+ )
1418
+
1419
+ self.linear_q_up_proj = build_module(
1420
+ submodules.linear_q_up_proj,
1421
+ self.config.q_lora_rank,
1422
+ self.config.num_attention_heads * self.q_head_dim,
1423
+ config=self.config,
1424
+ init_method=self.config.init_method,
1425
+ gather_output=False,
1426
+ bias=False,
1427
+ skip_bias_add=False,
1428
+ is_expert=False,
1429
+ is_mla=True,
1430
+ )
1431
+ self.linear_kv_down_proj = Linear(
1432
+ self.config.hidden_size,
1433
+ self.config.kv_lora_rank + self.config.qk_pos_emb_head_dim,
1434
+ bias=False,
1435
+ )
1436
+
1437
+ self.linear_kv_up_proj = build_module(
1438
+ submodules.linear_kv_up_proj,
1439
+ self.config.kv_lora_rank,
1440
+ self.config.num_attention_heads
1441
+ * (self.config.qk_head_dim + self.config.v_head_dim),
1442
+ config=self.config,
1443
+ init_method=self.config.init_method,
1444
+ gather_output=False,
1445
+ bias=False,
1446
+ skip_bias_add=False,
1447
+ is_expert=False,
1448
+ is_mla=True,
1449
+ )
1450
+
1451
+ if self.config.q_lora_rank is not None:
1452
+ self.q_layernorm = build_module(
1453
+ submodules.q_layernorm,
1454
+ hidden_size=self.config.q_lora_rank,
1455
+ config=self.config,
1456
+ eps=self.config.layernorm_epsilon,
1457
+ )
1458
+
1459
+ self.kv_layernorm = build_module(
1460
+ submodules.kv_layernorm,
1461
+ hidden_size=self.config.kv_lora_rank,
1462
+ config=self.config,
1463
+ eps=self.config.layernorm_epsilon,
1464
+ )
1465
+
1466
+ # Output.
1467
+ self.linear_proj = build_module(
1468
+ submodules.linear_proj,
1469
+ self.query_projection_size,
1470
+ self.config.hidden_size,
1471
+ config=self.config,
1472
+ init_method=self.config.output_layer_init_method,
1473
+ bias=self.config.add_bias_linear,
1474
+ input_is_parallel=True,
1475
+ skip_bias_add=True,
1476
+ is_expert=False,
1477
+ tp_comm_buffer_name="proj",
1478
+ )
1479
+
1480
+ self.checkpoint_core_attention = (
1481
+ self.config.recompute_granularity == "selective"
1482
+ )
1483
+
1484
+ def num_parameter(self):
1485
+ result = 0
1486
+ result += self.core_attention.num_parameter()
1487
+ result += self.linear_proj.num_parameter()
1488
+ if self.config.q_lora_rank is None:
1489
+ result += self.linear_q_proj.num_parameter()
1490
+ else:
1491
+ result += self.linear_q_down_proj.num_parameter()
1492
+ result += self.linear_q_up_proj.num_parameter()
1493
+ result += self.linear_kv_down_proj.num_parameter()
1494
+ result += self.linear_kv_up_proj.num_parameter()
1495
+ result += self.kv_layernorm.num_parameter()
1496
+ if self.config.q_lora_rank is not None:
1497
+ result += self.q_layernorm.num_parameter()
1498
+
1499
+ return result
1500
+
1501
+ def num_activation(self, input_shape: list[int]):
1502
+ q_len, bsz, _ = input_shape
1503
+ ret = 0
1504
+ if self.config.q_lora_rank is not None:
1505
+ ret += self.linear_q_down_proj.num_activation(input_shape)
1506
+ q_compressed_shape = self.linear_q_down_proj.mock_forward(input_shape)
1507
+ ret += self.q_layernorm.num_activation(q_compressed_shape)
1508
+ ret += self.linear_q_up_proj.num_activation(q_compressed_shape)
1509
+ q_shape = self.linear_q_up_proj.mock_forward(q_compressed_shape)
1510
+ else:
1511
+ # hidden_states:[s, b, 2048], q: [s, b, n * 192]
1512
+ ret += self.linear_q_proj.num_activation(input_shape)
1513
+ q_shape = self.linear_q_proj.mock_forward(input_shape)
1514
+
1515
+ # kv_combined: [s, b, 576]
1516
+ ret += self.linear_kv_down_proj.num_activation(input_shape)
1517
+ kv_combined_shape = self.linear_kv_down_proj.mock_forward(input_shape)
1518
+ # kv_compressed:[s, b, 512], k_pos_emb: [s, b, 64]
1519
+ kv_compressed_shape = kv_combined_shape[:-1] + [self.config.kv_lora_rank]
1520
+
1521
+ # kv: [s, b, 2048]
1522
+ ret += self.kv_layernorm.num_activation(kv_compressed_shape)
1523
+ ret += self.linear_kv_up_proj.num_activation(kv_compressed_shape)
1524
+
1525
+ q_shape = [q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim]
1526
+ k_shape = [q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim]
1527
+ v_shape = [
1528
+ q_len,
1529
+ bsz,
1530
+ self.num_attention_heads_per_partition,
1531
+ self.config.v_head_dim,
1532
+ ]
1533
+
1534
+ if not self.checkpoint_core_attention:
1535
+ ret += self.core_attention.num_activation(q_shape, k_shape, v_shape)
1536
+
1537
+ ret += self.linear_proj.num_activation(input_shape)
1538
+
1539
+ return ret
1540
+
1541
+ def mock_forward(self, input_shape: list[int]):
1542
+ return input_shape
1543
+
1544
+
1545
+ class TENorm:
1546
+ def __new__(cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5):
1547
+ from megatron.core.extensions.transformer_engine import _get_extra_te_kwargs, te
1548
+
1549
+ if config.normalization == "LayerNorm":
1550
+ # TODO: add a plain LayerNorm estimator; raise explicitly instead of leaving `instance` unbound
1551
+ raise NotImplementedError("LayerNorm estimation is not implemented yet")
1552
+ elif config.normalization == "RMSNorm":
1553
+ assert hasattr(
1554
+ te.pytorch, "RMSNorm"
1555
+ ), "Transformer-Engine >= v0.11 required to use this feature"
1556
+ instance = RMSNorm(
1557
+ hidden_size=hidden_size,
1558
+ eps=eps,
1559
+ sequence_parallel=config.sequence_parallel,
1560
+ zero_centered_gamma=config.layernorm_zero_centered_gamma,
1561
+ **_get_extra_te_kwargs(config),
1562
+ )
1563
+ else:
1564
+ raise Exception("Only LayerNorm and RMSNorm are curently supported")
1565
+
1566
+ return instance
1567
+
1568
+
1569
+ def build_module(
1570
+ spec_or_module: Union[ModuleSpec, type], *args, **kwargs
1571
+ ) -> MemEstimator:
1572
+ """replace module with MemEstimators"""
1573
+ if isinstance(spec_or_module, types.FunctionType):
1574
+ return globals()[spec_or_module.__name__]
1575
+
1576
+ if isinstance(spec_or_module, ModuleSpec) and isinstance(
1577
+ spec_or_module.module, types.FunctionType
1578
+ ):
1579
+ assert False
1580
+ return spec_or_module.module
1581
+
1582
+ if isinstance(spec_or_module, type):
1583
+ module = spec_or_module
1584
+ elif hasattr(spec_or_module, "module") and isinstance(spec_or_module.module, type):
1585
+ module = spec_or_module.module
1586
+ else:
1587
+ module = import_module(spec_or_module.module)
1588
+
1589
+ if isinstance(module, types.FunctionType):
1590
+ assert False
1591
+ return module
1592
+
1593
+ if hasattr(spec_or_module, "submodules") and spec_or_module.submodules is not None:
1594
+ kwargs["submodules"] = spec_or_module.submodules
1595
+
1596
+ try:
1597
+ module = globals()[module.__name__]
1598
+ return module(
1599
+ *args,
1600
+ **spec_or_module.params if hasattr(spec_or_module, "params") else {},
1601
+ **kwargs,
1602
+ )
1603
+ except Exception as e:
1604
+ # import ipdb
1605
+
1606
+ # ipdb.set_trace()
1607
+ # improve the error message since we hide the module name in the line above
1608
+ import sys
1609
+
1610
+ raise type(e)(f"{str(e)} when instantiating {module.__name__}").with_traceback(
1611
+ sys.exc_info()[2]
1612
+ )
1613
+
1614
+
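In other words, `build_module` resolves a Megatron `ModuleSpec` and then swaps in the estimator class of the same name from this module's globals, so the estimators are built with the same specs the real model would use. A hedged illustration (`cfg` is assumed to be a valid `TransformerConfig`; nothing here is part of the commit):

    spec = ModuleSpec(module=TopKRouter)      # a spec naming a router module
    router = build_module(spec, config=cfg)   # -> this file's TopKRouter estimator, not the real layer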
1615
+ from megatron.core.transformer.transformer_block import (
1616
+ BaseTransformerLayer,
1617
+ LayerNormImpl,
1618
+ TransformerBlockSubmodules,
1619
+ )
1620
+
1621
+
1622
+ def _get_block_submodules(
1623
+ config: TransformerConfig,
1624
+ spec: Union[TransformerBlockSubmodules, ModuleSpec],
1625
+ vp_stage: Optional[int] = None,
1626
+ ) -> TransformerBlockSubmodules:
1627
+ """
1628
+ Retrieve or construct TransformerBlockSubmodules based on the provided specification.
1629
+
1630
+ Args:
1631
+ config (TransformerConfig): Configuration object for the transformer model.
1632
+ spec (Union[TransformerBlockSubmodules, ModuleSpec]): Specification for the
1633
+ transformer block submodules. Can be either a TransformerBlockSubmodules
1634
+ instance or a ModuleSpec.
1635
+
1636
+ Returns:
1637
+ TransformerBlockSubmodules: The submodules for the transformer block.
1638
+ """
1639
+
1640
+ # Transformer block submodules.
1641
+ if isinstance(spec, TransformerBlockSubmodules):
1642
+ return spec
1643
+
1644
+ # ModuleSpec here is generally assumed to be for a transformer layer that
1645
+ # is implemented in `transformer_layer.py` or if it subclasses
1646
+ # `BaseTransformerLayer` from the `transformer_layer.py` file.
1647
+ elif isinstance(spec, ModuleSpec):
1648
+ if issubclass(spec.module, TransformerBlock):
1649
+ return spec.submodules
1650
+ elif issubclass(spec.module, BaseTransformerLayer):
1651
+ num_layers = get_num_layers_to_build(config, vp_stage)
1652
+ return TransformerBlockSubmodules(
1653
+ layer_specs=[spec] * num_layers, layer_norm=LayerNormImpl
1654
+ )
1655
+ else:
1656
+ raise Exception(f"specialize for {spec.module.__name__}.")
1657
+ else:
1658
+ raise Exception(f"specialize for {type(spec).__name__}.")
1659
+
1660
+
1661
+ from megatron.core.transformer.transformer_block import get_num_layers_to_build
1662
+
1663
+
1664
+ def ___get_num_layers_to_build(config: TransformerConfig) -> int:
1665
+ """
1666
+ Determine the number of transformer layers to build for the current pipeline stage.
1667
+ Args:
1668
+ config (TransformerConfig): Configuration object containing transformer model parameters.
1669
+
1670
+ Returns:
1671
+ int: The number of layers to be built for the current pipeline stage.
1672
+ """
1673
+ if (
1674
+ config.num_layers_in_first_pipeline_stage is not None
1675
+ or config.num_layers_in_last_pipeline_stage is not None
1676
+ ):
1677
+
1678
+ assert not (
1679
+ config.account_for_embedding_in_pipeline_split
1680
+ or config.account_for_loss_in_pipeline_split
1681
+ ), " \
1682
+ Does not support standalone embedding stage and standalone loss stage with uneven pp"
1683
+ # Number of layers to distribute over rest of pipeline stages
1684
+ layers_to_distribute = config.num_layers
1685
+ # Number of pipeline stages left for distributing transformer layers
1686
+ pipeline_stages_left = get_pipeline_model_parallel_world_size()
1687
+
1688
+ # If the uneven first (last) pipeline stage is enabled, remove the specified number
1689
+ # of layers to calculate the number of layers on each middle pipeline stage.
1690
+ if config.num_layers_in_first_pipeline_stage is not None:
1691
+ layers_to_distribute -= config.num_layers_in_first_pipeline_stage
1692
+ pipeline_stages_left -= 1
1693
+
1694
+ if config.num_layers_in_last_pipeline_stage is not None:
1695
+ layers_to_distribute -= config.num_layers_in_last_pipeline_stage
1696
+ pipeline_stages_left -= 1
1697
+
1698
+ assert (
1699
+ layers_to_distribute % pipeline_stages_left == 0
1700
+ ), "With uneven pipelineing the left over layers must be divisible by left over stages"
1701
+ num_layers_per_pipeline_rank = layers_to_distribute // pipeline_stages_left
1702
+
1703
+ # If the uneven first (last) pipeline stage is enabled, return the specified number
1704
+ # of layers for all virtual pipeline parallel stages within the first (last) pipeline
1705
+ # parallel stage.
1706
+ if (
1707
+ is_pipeline_first_stage(ignore_virtual=True)
1708
+ and config.num_layers_in_first_pipeline_stage is not None
1709
+ ):
1710
+ num_layers_per_pipeline_rank = config.num_layers_in_first_pipeline_stage
1711
+
1712
+ if (
1713
+ is_pipeline_last_stage(ignore_virtual=True)
1714
+ and config.num_layers_in_last_pipeline_stage is not None
1715
+ ):
1716
+ num_layers_per_pipeline_rank = config.num_layers_in_last_pipeline_stage
1717
+ else:
1718
+ # Include the embedding layer and loss layer into pipeline parallelism partition
1719
+ num_layers = config.num_layers
1720
+ if config.account_for_embedding_in_pipeline_split:
1721
+ num_layers += 1
1722
+
1723
+ if config.account_for_loss_in_pipeline_split:
1724
+ num_layers += 1
1725
+
1726
+ assert (
1727
+ num_layers % config.pipeline_model_parallel_size == 0
1728
+ ), "num_layers should be divisible by pipeline_model_parallel_size"
1729
+ num_layers_per_pipeline_rank = num_layers // config.pipeline_model_parallel_size
1730
+
1731
+ # if get_virtual_pipeline_model_parallel_world_size() is not None:
1732
+ # # Interleaved pipeline parallelism:
1733
+ # # Number of layers in each model chunk is the number of layers in the stage,
1734
+ # # divided by the number of model chunks in a stage.
1735
+ # # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
1736
+ # # layers to stages like (each list is a model chunk):
1737
+ # # Stage 0: [0] [2] [4] [6]
1738
+ # # Stage 1: [1] [3] [5] [7]
1739
+ # # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
1740
+ # # layers to stages like (each list is a model chunk):
1741
+ # # Stage 0: [0, 1] [4, 5]
1742
+ # # Stage 1: [2, 3] [6, 7]
1743
+ # vp_size = get_virtual_pipeline_model_parallel_world_size()
1744
+
1745
+ # assert (
1746
+ # num_layers_per_pipeline_rank % vp_size == 0
1747
+ # ), "num_layers_per_pipeline_rank should be divisible by vp_size"
1748
+ # num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size
1749
+
1750
+ # num_layers_to_build = num_layers_per_virtual_rank
1751
+
1752
+ # else:
1753
+ # # Non-interleaved pipeline parallelism:
1754
+ # # Each stage gets a contiguous set of layers.
1755
+ # num_layers_to_build = num_layers_per_pipeline_rank
1756
+ num_layers_to_build = num_layers_per_pipeline_rank
1757
+ # The embedding (or loss) layer cannot function as a standalone transformer layer
1758
+ # Reduce the number of layers to construct by 1 on the first (or last) stage if the
1759
+ # embedding (or loss) layer is included in the pipeline parallelism partition and placement.
1760
+ if is_pipeline_first_stage() and config.account_for_embedding_in_pipeline_split:
1761
+ num_layers_to_build -= 1
1762
+ assert (
1763
+ num_layers_to_build >= 0
1764
+ ), "Not enough layers in the first virtual pipeline stage"
1765
+
1766
+ if is_pipeline_last_stage() and config.account_for_loss_in_pipeline_split:
1767
+ num_layers_to_build -= 1
1768
+ assert (
1769
+ num_layers_to_build >= 0
1770
+ ), "Not enough layers in the last virtual pipeline stage"
1771
+
1772
+ return num_layers_to_build
1773
+
1774
+
1775
+ from megatron.core.transformer.enums import LayerType
1776
+
1777
+
1778
+ def get_transformer_layer_offset(
1779
+ config: TransformerConfig, vp_stage: Optional[int] = None
1780
+ ):
1781
+ """Get the index offset of current pipeline stage, given the level of pipelining."""
1782
+ pipeline_rank = get_pipeline_model_parallel_rank()
1783
+
1784
+ if config.pipeline_model_parallel_size > 1:
1785
+
1786
+ if config.pipeline_model_parallel_layout:
1787
+ offset = config.pipeline_model_parallel_layout.get_layer_offset(
1788
+ layer_type=LayerType.decoder, vp_stage=vp_stage
1789
+ )
1790
+ elif (
1791
+ config.num_layers_in_first_pipeline_stage is not None
1792
+ or config.num_layers_in_last_pipeline_stage is not None
1793
+ ):
1794
+ # Calculate number of pipeline stages to distribute the remaining Transformer
1795
+ # layers after deducting the Transformer layers in the first or the last stages
1796
+ middle_pipeline_stages = config.pipeline_model_parallel_size
1797
+ middle_pipeline_stages -= sum(
1798
+ [
1799
+ 1 if x is not None else 0
1800
+ for x in (
1801
+ config.num_layers_in_first_pipeline_stage,
1802
+ config.num_layers_in_last_pipeline_stage,
1803
+ )
1804
+ ]
1805
+ )
1806
+
1807
+ # Calculate layers to distribute in each pipeline stage. If the
1808
+ # num_layers_in_first_pipeline_stage and num_layers_in_last_pipeline_stage
1809
+ # are not set, we will not enable uneven pipeline. All layers will be treated
1810
+ # as middle layers.
1811
+ num_layers_in_first_pipeline_stage = (
1812
+ 0
1813
+ if config.num_layers_in_first_pipeline_stage is None
1814
+ else config.num_layers_in_first_pipeline_stage
1815
+ )
1816
+ num_layers_in_last_pipeline_stage = (
1817
+ 0
1818
+ if config.num_layers_in_last_pipeline_stage is None
1819
+ else config.num_layers_in_last_pipeline_stage
1820
+ )
1821
+
1822
+ middle_num_layers = (
1823
+ config.num_layers
1824
+ - num_layers_in_first_pipeline_stage
1825
+ - num_layers_in_last_pipeline_stage
1826
+ )
1827
+
1828
+ if (vp_size := config.virtual_pipeline_model_parallel_size) is not None:
1829
+ assert (
1830
+ vp_stage is not None
1831
+ ), "vp_stage must be provided if virtual pipeline model parallel size is set"
1832
+
1833
+ # Calculate number of layers in each virtual model chunk
1834
+ # If the num_layers_in_first_pipeline_stage and
1835
+ # num_layers_in_last_pipeline_stage are not set, all pipeline stages
1836
+ # will be treated as middle pipeline stages in the calculation
1837
+ num_layers_per_virtual_model_chunk_in_first_pipeline_stage = (
1838
+ 0
1839
+ if config.num_layers_in_first_pipeline_stage is None
1840
+ else config.num_layers_in_first_pipeline_stage // vp_size
1841
+ )
1842
+
1843
+ num_layers_per_virtual_model_chunk_in_last_pipeline_stage = (
1844
+ 0
1845
+ if config.num_layers_in_last_pipeline_stage is None
1846
+ else config.num_layers_in_last_pipeline_stage // vp_size
1847
+ )
1848
+
1849
+ num_layers_per_vritual_model_chunk_in_middle_pipeline_stage = (
1850
+ middle_num_layers // vp_size
1851
+ )
1852
+
1853
+ # First stage + middle stage + last stage
1854
+ total_virtual_chunks = (
1855
+ num_layers_per_virtual_model_chunk_in_first_pipeline_stage
1856
+ + num_layers_per_vritual_model_chunk_in_middle_pipeline_stage
1857
+ + num_layers_per_virtual_model_chunk_in_last_pipeline_stage
1858
+ )
1859
+
1860
+ # Calculate the layer offset with interleaved uneven pipeline parallelism
1861
+ if pipeline_rank == 0:
1862
+ offset = vp_stage * total_virtual_chunks
1863
+ else:
1864
+ offset = (
1865
+ vp_stage * total_virtual_chunks
1866
+ + num_layers_per_virtual_model_chunk_in_first_pipeline_stage
1867
+ + (pipeline_rank - 1)
1868
+ * (
1869
+ num_layers_per_vritual_model_chunk_in_middle_pipeline_stage
1870
+ // middle_pipeline_stages
1871
+ )
1872
+ )
1873
+ else:
1874
+ if middle_pipeline_stages > 0:
1875
+ num_layers_per_pipeline_rank = (
1876
+ middle_num_layers // middle_pipeline_stages
1877
+ )
1878
+ else:
1879
+ num_layers_per_pipeline_rank = 0
1880
+
1881
+ middle_pipeline_rank = (
1882
+ pipeline_rank
1883
+ if config.num_layers_in_first_pipeline_stage is None
1884
+ else pipeline_rank - 1
1885
+ )
1886
+
1887
+ if pipeline_rank == 0:
1888
+ offset = 0
1889
+ else:
1890
+ offset = (
1891
+ middle_pipeline_rank * num_layers_per_pipeline_rank
1892
+ ) + num_layers_in_first_pipeline_stage
1893
+ else:
1894
+ num_layers = config.num_layers
1895
+
1896
+ # Increase the number of layers by one if we include the embedding (loss)
1897
+ # layer into pipeline parallelism partition and placement
1898
+ if config.account_for_embedding_in_pipeline_split:
1899
+ num_layers += 1
1900
+
1901
+ if config.account_for_loss_in_pipeline_split:
1902
+ num_layers += 1
1903
+
1904
+ num_layers_per_pipeline_rank = (
1905
+ num_layers // config.pipeline_model_parallel_size
1906
+ )
1907
+
1908
+ if (vp_size := config.virtual_pipeline_model_parallel_size) is not None:
1909
+ assert (
1910
+ vp_stage is not None
1911
+ ), "vp_stage must be provided if virtual pipeline model parallel size is set"
1912
+
1913
+ num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size
1914
+ total_virtual_chunks = num_layers // vp_size
1915
+ offset = vp_stage * total_virtual_chunks + (
1916
+ pipeline_rank * num_layers_per_virtual_rank
1917
+ )
1918
+
1919
+ # Reduce the offset of embedding layer from the total layer number
1920
+ if (
1921
+ config.account_for_embedding_in_pipeline_split
1922
+ and not is_pipeline_first_stage(
1923
+ ignore_virtual=False, vp_stage=vp_stage
1924
+ )
1925
+ ):
1926
+ offset -= 1
1927
+ else:
1928
+ offset = pipeline_rank * num_layers_per_pipeline_rank
1929
+
1930
+ # Reduce the offset of embedding layer from the total layer number
1931
+ if (
1932
+ config.account_for_embedding_in_pipeline_split
1933
+ and not is_pipeline_first_stage(
1934
+ ignore_virtual=False, vp_stage=vp_stage
1935
+ )
1936
+ ):
1937
+ offset -= 1
1938
+ else:
1939
+ offset = 0
1940
+ return offset
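For the common even-split case handled by the final else-branches, the offset is just `pipeline_rank * num_layers_per_pipeline_rank`. A small worked example with assumed numbers (layer numbers are 1-based, `layer_number = offset + i + 1`):

    # Assumed even split: 32 layers, pp=4, no virtual pipelining.
    num_layers, pp = 32, 4
    per_rank = num_layers // pp                        # 8 layers per pipeline rank
    offsets = [rank * per_rank for rank in range(pp)]  # [0, 8, 16, 24]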
webui/index.html ADDED
@@ -0,0 +1,208 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Megatron Memory Estimator</title>
7
+ <link rel="stylesheet" href="style.css">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <h1>Megatron Memory Estimator v0.13</h1>
12
+ <div class="disclaimer-banner">
13
+ Note: This estimator only measures the GPU memory directly managed by PyTorch when running Megatron. It does not include extra consumption from NCCL communication buffers, kernel fusion, overlap optimizations, CUDA Graphs, etc. Please use the "Overhead per GPU" option below to account for these additional costs.
14
+ </div>
15
+
16
+ <div class="main-layout">
17
+ <div class="top-section">
18
+ <div class="config-column">
19
+ <form id="config-form">
20
+ <h2>Configuration</h2>
21
+ <p class="config-hint" style="font-size: 0.9em; color: #666; margin-top: -0.5em; margin-bottom: 1em;">
22
+ For detailed explanations of each parameter, please see the&nbsp;<a href="https://github.com/NVIDIA/Megatron-LM/blob/core_r0.13.0/megatron/training/arguments.py#L2266" target="_blank">Megatron-LM arguments documentation</a>.
23
+ </p>
24
+ <div class="form-group">
25
+ <label for="model-select">Select a Local Config:</label>
26
+ <select id="model-select" name="model">
27
+ <option value="">Loading...</option>
28
+ </select>
29
+ </div>
30
+
31
+ <!-- All settings are now in one block -->
32
+ <div class="form-row">
33
+ <div class="form-group">
34
+ <label for="num-gpus">Total GPUs:</label>
35
+ <input type="number" id="num-gpus" name="num_gpus" value="8" step="8" min="8">
36
+ </div>
37
+ <div class="form-group">
38
+ <label for="mbs">micro batch size:</label>
39
+ <input type="number" id="mbs" name="mbs" value="1" min="1">
40
+ </div>
41
+ <div class="form-group">
42
+ <label for="seq-len">SeqLen:</label>
43
+ <input type="number"id="seq-len" name="seq-len" value="4096" min="1">
44
+ </div>
45
+ </div>
46
+
47
+ <div class="form-group">
48
+ <input type="checkbox" id="use-distributed-optimizer" name="use_distributed_optimizer" checked>
49
+ <label for="use-distributed-optimizer" class="inline-label">Use Distributed Optimizer</label>
50
+ </div>
51
+
52
+ <!-- New: Embedding/Loss pipeline split options -->
53
+ <div class="form-group vpp-dependent" style="display: none;">
54
+ <input type="checkbox" id="account_for_embedding_in_pipeline_split" name="account_for_embedding_in_pipeline_split">
55
+ <label for="account_for_embedding_in_pipeline_split" class="inline-label">Account for Embedding in PP Split</label>
56
+ </div>
57
+ <div class="form-group vpp-dependent" style="display: none;">
58
+ <input type="checkbox" id="account_for_loss_in_pipeline_split" name="account_for_loss_in_pipeline_split">
59
+ <label for="account_for_loss_in_pipeline_split" class="inline-label">Account for Loss in PP Split</label>
60
+ </div>
61
+ <!-- End of options -->
62
+
63
+ <div class="form-row">
64
+ <div class="form-group">
65
+ <label for="recompute-granularity">Recomputation:</label>
66
+ <select id="recompute-granularity" name="recompute_granularity">
67
+ <option value="none">None</option>
68
+ <option value="selective">Selective</option>
69
+ <option value="full">Full</option>
70
+ </select>
71
+ </div>
72
+ <div class="form-group recompute-options" style="display: none;">
73
+ <label for="recompute-method">Method:</label>
74
+ <select id="recompute-method" name="recompute_method">
75
+ <option value="uniform">Uniform</option>
76
+ <option value="block">Block</option>
77
+ </select>
78
+ </div>
79
+ <div class="form-group recompute-options" style="display: none;">
80
+ <label for="recompute-num-layers">Layers:</label>
81
+ <input type="number" id="recompute-num-layers" name="recompute_num_layers" value="1" min="1">
82
+ </div>
83
+ </div>
84
+
85
+ <!-- New: selective recompute module selection -->
86
+ <div class="form-row selective-options" style="display: none;">
87
+ <div class="form-group">
88
+ <label><input type="checkbox" name="recompute_modules" value="core_attn"> core_attn</label>
89
+ </div>
90
+ <div class="form-group">
91
+ <label><input type="checkbox" name="recompute_modules" value="moe_act"> moe_act</label>
92
+ </div>
93
+ <div class="form-group">
94
+ <label><input type="checkbox" name="recompute_modules" value="layernorm"> layernorm</label>
95
+ </div>
96
+ <div class="form-group">
97
+ <label><input type="checkbox" name="recompute_modules" value="mla_up_proj"> mla_up_proj</label>
98
+ </div>
99
+ <div class="form-group">
100
+ <label><input type="checkbox" name="recompute_modules" value="mlp"> mlp</label>
101
+ </div>
102
+ <div class="form-group">
103
+ <label><input type="checkbox" name="recompute_modules" value="moe"> moe</label>
104
+ </div>
105
+ </div>
106
+ <!-- End of selective recompute -->
107
+
108
+ <div class="form-row">
109
+ <div class="form-group">
110
+ <label for="tp">TP:</label>
111
+ <select id="tp" name="tp"></select>
112
+ </div>
113
+ <div class="form-group">
114
+ <label for="pp">PP:</label>
115
+ <input type="number" id="pp" name="pp" value="1" min="1">
116
+ </div>
117
+ <div class="form-group">
118
+ <label for="ep">EP:</label>
119
+ <select id="ep" name="ep"></select>
120
+ </div>
121
+ <div class="form-group">
122
+ <label for="cp">CP:</label>
123
+ <select id="cp" name="cp"></select>
124
+ </div>
125
+ </div>
126
+ <div class="form-row">
127
+ <div class="form-group">
128
+ <label for="vpp">VPP:</label>
129
+ <input type="number" id="vpp" name="vpp" placeholder="None" min="1">
130
+ </div>
131
+ <div class="form-group">
132
+ <label for="etp">ETP:</label>
133
+ <input type="number" id="etp" name="etp" placeholder="None" min="1">
134
+ </div>
135
+ </div>
136
+ <div class="form-row">
137
+ <div class="form-group">
138
+ <label for="num_layers_in_first_pipeline_stage">First Stage Layers:</label>
139
+ <input type="number" id="num_layers_in_first_pipeline_stage" name="num_layers_in_first_pipeline_stage" placeholder="None" min="0">
140
+ </div>
141
+ <div class="form-group">
142
+ <label for="num_layers_in_last_pipeline_stage">Last Stage Layers:</label>
143
+ <input type="number" id="num_layers_in_last_pipeline_stage" name="num_layers_in_last_pipeline_stage" placeholder="None" min="0">
144
+ </div>
145
+ </div>
146
+ <div class="form-row">
147
+ <div class="form-group">
148
+ <label for="overhead">Overhead per GPU:</label>
149
+ <select id="overhead" name="overhead">
150
+ <option value="5">5GB</option>
151
+ <option value="10" selected>10GB</option>
152
+ </select>
153
+ </div>
154
+ </div>
155
+ <!-- Pipeline Layout Row Added -->
156
+ <div class="form-row">
157
+ <div class="form-group" style="width: 100%;">
158
+ <label for="pipeline-layout">Pipeline Layout (comma-separated layers per stage):</label>
159
+ <input type="text" id="pipeline-layout" name="pipeline_model_parallel_layout" placeholder="e.g., Et|(tt|)*30L">
160
+ </div>
161
+ </div>
162
+ <!-- End Pipeline Layout Row -->
163
+
164
+ <div id="validation-message" class="error-message" style="display: none;"></div>
165
+ <div class="button-container">
166
+ <button type="submit">Estimate</button>
167
+ </div>
168
+ </form>
169
+ </div>
170
+
171
+ <div class="output-column">
172
+ <div class="config-editor-wrapper">
173
+ <h2>Model Config (Editable)</h2>
174
+ <textarea id="config-editor" rows="20"></textarea>
175
+ </div>
176
+ </div>
177
+ </div>
178
+
179
+ <div class="bottom-section">
180
+ <div id="output-container">
181
+ <div id="loading" style="display: none;">Calculating...</div>
182
+ <div id="history-wrapper">
183
+ <h3>History</h3>
184
+ <table id="history-table">
185
+ <thead>
186
+ <tr>
187
+ <th>Model</th>
188
+ <th>Weights + Gradients + Optimizer (GB)</th>
189
+ <th>Activation (GB)</th>
190
+ <th>Total (GB/GPU)</th>
191
+ <th>Actions</th>
192
+ </tr>
193
+ </thead>
194
+ <tbody>
195
+ </tbody>
196
+ </table>
197
+ <button id="clear-history" style="margin-top: 1em;">Clear History</button>
198
+ </div>
199
+ </div>
200
+ </div>
201
+ </div>
202
+ </div>
203
+ <script src="script.js"></script>
204
+ <footer class="footer">
205
+ <p>&copy; 2025 <a href="https://github.com/ISEEKYAN" target="_blank">ISEEKYAN</a>. Developed at NVIDIA.</p>
206
+ </footer>
207
+ </body>
208
+ </html>
webui/main.py ADDED
@@ -0,0 +1,255 @@
1
+ import argparse
2
+ import glob
3
+ import json
4
+ import os
5
+ import tempfile
6
+ from typing import Optional
7
+
8
+ import requests
9
+ from estimate_013 import estimate_from_config
10
+ from fastapi import Body, FastAPI
11
+ from fastapi.responses import FileResponse
12
+ from fastapi.staticfiles import StaticFiles
13
+ from megatron.core import parallel_state as mpu
14
+ from pydantic import BaseModel, field_validator
15
+
16
+ from mbridge import AutoBridge
17
+
18
+ # The directory of the current script (main.py)
19
+ WEBUI_DIR = os.path.dirname(os.path.abspath(__file__))
20
+
21
+ app = FastAPI()
22
+
23
+ # Mount static files from the webui directory
24
+ app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="static")
25
+
26
+
27
+ @app.get("/")
28
+ async def read_index():
29
+ return FileResponse(os.path.join(WEBUI_DIR, "index.html"))
30
+
31
+
32
+ @app.get("/style.css")
33
+ async def read_css():
34
+ return FileResponse(os.path.join(WEBUI_DIR, "style.css"))
35
+
36
+
37
+ @app.get("/script.js")
38
+ async def read_js():
39
+ return FileResponse(os.path.join(WEBUI_DIR, "script.js"))
40
+
41
+
42
+ SUPPORTED_MODELS = [
43
+ "Qwen/Qwen3-235B-A22B",
44
+ "Qwen/Qwen3-30B-A3B",
45
+ "Qwen/Qwen3-32B",
46
+ "Qwen/Qwen3-14B",
47
+ "Qwen/Qwen3-8B",
48
+ "Qwen/Qwen2.5-7B",
49
+ "Qwen/Qwen2.5-14B",
50
+ "Qwen/Qwen2.5-32B",
51
+ "Qwen/Qwen2.5-72B",
52
+ "moonshotai/Moonlight-16B-A3B",
53
+ "moonshotai/Kimi-K2-Instruct",
54
+ "deepseek-ai/DeepSeek-V3",
55
+ "XiaomiMiMo/MiMo-7B-RL",
56
+ ]
57
+
58
+
59
+ @app.get("/local-hf-configs")
60
+ async def get_supported_models():
61
+ """Return the list of HF model identifiers supported by the UI."""
62
+ return SUPPORTED_MODELS
63
+
64
+
65
+ @app.get("/get-megatron-config/{model_path:path}")
66
+ async def get_remote_hf_config(model_path: str):
67
+ """Fetch the HuggingFace config.json for the given model id."""
68
+ url = f"https://huggingface.co/{model_path}/raw/main/config.json"
69
+ try:
70
+ resp = requests.get(url, timeout=10)
71
+ resp.raise_for_status()
72
+ return resp.json()
73
+ except Exception as e:
74
+ return {"error": f"Failed to fetch config from {url}: {str(e)}"}
75
+
76
+
77
+ class MBridgeEstimateConfig(BaseModel):
78
+ hf_model_path: str
79
+ custom_hf_config: Optional[dict] = None # Renamed for clarity
80
+
81
+ # Hardware & Training
82
+ num_gpus: int = 8
83
+ mbs: int = 1
84
+ seq_len: int = 4096
85
+ use_distributed_optimizer: bool = True
86
+ # Recompute settings are now part of the main config
87
+ recompute_granularity: str = "selective"
88
+ recompute_method: str = "uniform"
89
+ recompute_num_layers: Optional[int] = 1
90
+
91
+ # Selective recompute modules (optional list only used when granularity==selective)
92
+ recompute_modules: Optional[list[str]] = None
93
+
94
+ # New: embedding/loss PP split options
95
+ account_for_embedding_in_pipeline_split: bool = False
96
+ account_for_loss_in_pipeline_split: bool = False
97
+
98
+ # Parallelism
99
+ tp: int = 1
100
+ pp: int = 1
101
+ ep: int = 1
102
+ cp: int = 1
103
+ vpp: Optional[int] = None
104
+ etp: Optional[int] = None
105
+
106
+ # Pipeline stage layer counts
107
+ num_layers_in_first_pipeline_stage: Optional[int] = None
108
+ num_layers_in_last_pipeline_stage: Optional[int] = None
109
+
110
+ # New field: custom pipeline-model-parallel layout
111
+ pipeline_model_parallel_layout: Optional[str] = None # Layout string, e.g. "Et|(tt|)*30L"
112
+
113
+ @field_validator("num_gpus")
114
+ def num_gpus_must_be_multiple_of_8(cls, v):
115
+ if v <= 0 or v % 8 != 0:
116
+ raise ValueError("must be a positive multiple of 8")
117
+ return v
118
+
119
+
120
+ def patch_parallel_states(config: MBridgeEstimateConfig):
121
+ from mbridge.core.parallel_states import ParallelStates
122
+
123
+ ParallelStates.get_default_parallel_states = lambda: ParallelStates(
124
+ tp_size=config.tp,
125
+ pp_size=config.pp,
126
+ ep_size=config.ep,
127
+ cp_size=config.cp,
128
+ vpp_size=config.vpp,
129
+ etp_size=config.etp,
130
+ )
131
+
132
+
133
+ @app.post("/estimate_with_mbridge")
134
+ async def estimate_with_mbridge(config: MBridgeEstimateConfig):
135
+ # Validate Inputs
136
+ if config.num_gpus <= 0 or config.num_gpus % 8 != 0:
137
+ return {"error": "Total number of GPUs must be a positive multiple of 8."}
138
+
139
+ parallel_product = config.tp * config.pp * config.cp
140
+ if parallel_product == 0: # Avoid division by zero
141
+ return {"error": "Parallelism dimensions (TP, PP, CP) cannot be zero."}
142
+
143
+ if config.num_gpus % parallel_product != 0:
144
+ return {
145
+ "error": f"Number of GPUs ({config.num_gpus}) must be divisible by the product of TP*PP*CP ({parallel_product})."
146
+ }
147
+
148
+ patch_parallel_states(config)
149
+
150
+ # If the path is just a filename, assume it's in our local model-configs dir
151
+ hf_model_path = config.hf_model_path
152
+ # The custom config from the UI is an HF config, not a Megatron config.
153
+ # We need to load it via a temporary file.
154
+ if config.custom_hf_config:
155
+ try:
156
+ # Create a temporary file to save the custom HF config
157
+ with tempfile.NamedTemporaryFile(
158
+ mode="w+",
159
+ delete=False,
160
+ suffix=".json",
161
+ dir=os.path.join("/dev/shm"),
162
+ ) as tmp:
163
+ json.dump(config.custom_hf_config, tmp)
164
+ tmp_path = tmp.name
165
+
166
+ # Load the bridge from the temporary config file
167
+ from transformers import AutoConfig
168
+
169
+ AutoConfig.trust_remote_code = True
170
+ bridge = AutoBridge.from_pretrained(tmp_path)
171
+ tf_config = bridge.config
172
+ hf_config = bridge.hf_config
173
+
174
+ finally:
175
+ # Ensure the temporary file is deleted
176
+ if "tmp_path" in locals() and os.path.exists(tmp_path):
177
+ os.remove(tmp_path)
178
+ else:
179
+ # If no custom config, load from the original path
180
+ if not os.path.isabs(hf_model_path) and not hf_model_path.startswith(
181
+ ("http", "./", "../")
182
+ ):
183
+ hf_model_path = os.path.join("/dev/shm", hf_model_path)
184
+ bridge = AutoBridge.from_pretrained(hf_model_path)
185
+ tf_config = bridge.config
186
+ hf_config = bridge.hf_config
187
+
188
+ # --- Configuration Unification ---
189
+ # Update the tf_config with values from the form. This makes tf_config the single source of truth.
190
+ tf_config.tensor_model_parallel_size = config.tp
191
+ tf_config.pipeline_model_parallel_size = config.pp
192
+ tf_config.expert_model_parallel_size = config.ep
193
+ tf_config.context_parallel_size = config.cp
194
+ tf_config.recompute_granularity = config.recompute_granularity
195
+ tf_config.recompute_method = config.recompute_method
196
+ tf_config.recompute_num_layers = config.recompute_num_layers
197
+ # New: module list used only when recompute granularity is "selective"
198
+ tf_config.recompute_modules = config.recompute_modules if config.recompute_modules is not None else []
199
+ # New: embedding/loss pipeline-split accounting
200
+ tf_config.account_for_embedding_in_pipeline_split = config.account_for_embedding_in_pipeline_split
201
+ tf_config.account_for_loss_in_pipeline_split = config.account_for_loss_in_pipeline_split
202
+ tf_config.num_layers_per_virtual_pipeline_stage = (
203
+ config.vpp if config.vpp and config.vpp > 1 else None
204
+ )
205
+
206
+ if config.num_layers_in_first_pipeline_stage is not None:
207
+ tf_config.num_layers_in_first_pipeline_stage = (
208
+ config.num_layers_in_first_pipeline_stage
209
+ )
210
+ if config.num_layers_in_last_pipeline_stage is not None:
211
+ tf_config.num_layers_in_last_pipeline_stage = (
212
+ config.num_layers_in_last_pipeline_stage
213
+ )
214
+
215
+ # Handle custom pipeline layout if provided
216
+ if config.pipeline_model_parallel_layout:
217
+ from megatron.core.transformer.pipeline_parallel_layer_layout import (
218
+ PipelineParallelLayerLayout,
219
+ )
220
+
221
+ tf_config.pipeline_model_parallel_layout = PipelineParallelLayerLayout(
222
+ config.pipeline_model_parallel_layout, config.pp
223
+ )
224
+ # print(tf_config)
225
+
226
+ # Create a minimal 'args' object with parameters not present in TransformerConfig
227
+ args = argparse.Namespace()
228
+ args.micro_batch_size = config.mbs
229
+ args.seq_length = config.seq_len
230
+ args.use_distributed_optimizer = config.use_distributed_optimizer
231
+ args.data_parallel_size = config.num_gpus // parallel_product
232
+ args.expert_tensor_parallel_size = config.etp if config.etp else 1
233
+
234
+ # These are required by the estimator but can be derived or defaulted
235
+ args.transformer_impl = "transformer_engine"
236
+ args.fp8 = False
237
+ args.num_experts = getattr(tf_config, "num_moe_experts", 1) # Needed for layer spec
238
+ args.moe_grouped_gemm = True # Default
239
+ args.qk_layernorm = tf_config.qk_layernorm
240
+ args.multi_latent_attention = "deepseek" in getattr(hf_config, "model_type", "")
241
+ args.padded_vocab_size = getattr(hf_config, "vocab_size")
242
+ args.max_position_embeddings = getattr(hf_config, "max_position_embeddings")
243
+ args.tie_word_embeddings = getattr(hf_config, "tie_word_embeddings", False)
244
+ args.world_size = config.num_gpus
245
+
246
+ # This function now returns (aggregated_pp_reports, raw_chunk_reports)
247
+ aggregated_reports, raw_chunk_reports = estimate_from_config(tf_config, args)
248
+
249
+ processed_reports = []
250
+ for rpt in aggregated_reports:
251
+ p = rpt.copy()
252
+ p.pop("details", None)
253
+ processed_reports.append(p)
254
+
255
+ return {"processed_report": processed_reports, "raw_report": raw_chunk_reports}
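For reference, a minimal client sketch for the /estimate_with_mbridge endpoint defined above. The field names mirror MBridgeEstimateConfig; the host, port, and parallelism values are assumptions for illustration, and omitted fields fall back to the model's defaults.

import requests

# Hypothetical local deployment; adjust the URL to wherever the service runs.
payload = {
    "hf_model_path": "Qwen/Qwen3-30B-A3B",
    "num_gpus": 16,                      # must be a positive multiple of 8
    "mbs": 1,
    "seq_len": 4096,
    "tp": 2,
    "pp": 2,
    "ep": 4,
    "cp": 1,
    "recompute_granularity": "selective",
}
resp = requests.post("http://localhost:8000/estimate_with_mbridge", json=payload, timeout=120)
resp.raise_for_status()
result = resp.json()
print(result["processed_report"])        # aggregated per-PP-rank summary
print(len(result["raw_report"]))         # raw per-chunk reports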
webui/requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ fastapi
2
+ uvicorn[standard]
3
+ mbridge
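The list above only covers the web layer; webui/main.py additionally imports requests, the Megatron/transformers stack, and the estimator module, so those must already be importable in the serving environment. A minimal sketch for serving the app locally under that assumption; the script name and port are arbitrary choices:

# run_local.py -- hypothetical helper, not part of the repository.
import uvicorn

if __name__ == "__main__":
    # webui.main exposes the FastAPI instance as `app` (see webui/main.py above).
    uvicorn.run("webui.main:app", host="0.0.0.0", port=8000)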
webui/script.js ADDED
@@ -0,0 +1,787 @@
1
+ document.addEventListener('DOMContentLoaded', () => {
2
+ // Initial UI setup
3
+ loadLocalConfigs();
4
+ updateHistoryView();
5
+ setupEventListeners();
6
+ updateParallelismOptions();
7
+ validateParallelismLive();
8
+ toggleEpBasedOnConfig(); // Disable EP initially
9
+ toggleVppDependentOptions(); // Initialize visibility of the VPP-dependent checkboxes
10
+ });
11
+
12
+ // Utility: convert ANSI color codes (red 31, green 32) to HTML spans for display
13
+ function ansiToHtml(str) {
14
+ if (!str) return '';
15
+ // Replace known ANSI codes
16
+ return str
17
+ .replace(/\u001b\[31m/g, '<span class="ansi-red">')
18
+ .replace(/\u001b\[32m/g, '<span class="ansi-green">')
19
+ .replace(/\u001b\[33m/g, '<span class="ansi-yellow">')
20
+ .replace(/\u001b\[34m/g, '<span class="ansi-blue">')
21
+ .replace(/\u001b\[35m/g, '<span class="ansi-magenta">')
22
+ .replace(/\u001b\[36m/g, '<span class="ansi-cyan">')
23
+ .replace(/\u001b\[0m/g, '</span>');
24
+ }
25
+
26
+ function setupEventListeners() {
27
+ document.getElementById('config-form').addEventListener('submit', (e) => {
28
+ e.preventDefault();
29
+ submitForm();
30
+ });
31
+
32
+ document.getElementById('model-select').addEventListener('change', loadSelectedModelConfig);
33
+
34
+ document.getElementById('recompute-granularity').addEventListener('change', (e) => {
35
+ const recomputeOptions = document.querySelectorAll('.recompute-options');
36
+ recomputeOptions.forEach(opt => {
37
+ opt.style.display = e.target.value === 'full' ? 'block' : 'none';
38
+ });
39
+
40
+ // New: show the module checkboxes when granularity is "selective"
41
+ const selectiveOptions = document.querySelectorAll('.selective-options');
42
+ selectiveOptions.forEach(opt => {
43
+ opt.style.display = e.target.value === 'selective' ? 'block' : 'none';
44
+ });
45
+ });
46
+
47
+ const liveValidationInputs = ['num-gpus', 'tp', 'pp', 'ep', 'cp', 'etp', 'vpp', 'config-editor', 'pipeline-layout'];
48
+ liveValidationInputs.forEach(id => {
49
+ const input = document.getElementById(id);
50
+ if(input) {
51
+ input.addEventListener('change', validateParallelismLive);
52
+ if (id === 'num-gpus') {
53
+ input.addEventListener('change', updateParallelismOptions);
54
+ }
55
+ if (id === 'vpp') {
56
+ input.addEventListener('change', toggleVppDependentOptions);
57
+ }
58
+ }
59
+ });
60
+
61
+ document.getElementById('config-editor').addEventListener('input', toggleEpBasedOnConfig);
62
+ document.getElementById('history-table').addEventListener('click', handleHistoryAction);
63
+ document.getElementById('clear-history').addEventListener('click', clearHistory);
64
+ }
65
+
66
+
67
+ async function loadLocalConfigs() {
68
+ const modelSelect = document.getElementById('model-select');
69
+ const defaultConfigName = 'Qwen/Qwen3-235B-A22B'; // Updated default model
70
+
71
+ try {
72
+ const response = await fetch('/local-hf-configs');
73
+ const configs = await response.json();
74
+
75
+ modelSelect.innerHTML = '<option value="">Select a model...</option>';
76
+ // Add custom option to allow user supplied configs
77
+ modelSelect.innerHTML += '<option value="__custom__">Custom (paste JSON below)...</option>';
78
+ configs.forEach(config => {
79
+ modelSelect.innerHTML += `<option value="${config}">${config}</option>`;
80
+ });
81
+
82
+ // Check if the default config exists and select it
83
+ if (configs.includes(defaultConfigName)) {
84
+ modelSelect.value = defaultConfigName;
85
+ // Await the loading of the model config to ensure it's ready
86
+ await loadSelectedModelConfig();
87
+ }
88
+
89
+ } catch (error) {
90
+ modelSelect.innerHTML = '<option value="">Error loading configs</option>';
91
+ console.error('Error loading local configs:', error);
92
+ }
93
+ }
94
+
95
+ async function loadSelectedModelConfig() {
96
+ const modelSelect = document.getElementById('model-select');
97
+ const editor = document.getElementById('config-editor');
98
+ const selectedConfig = modelSelect.value;
99
+ const messageDiv = document.getElementById('validation-message'); // move early for use in all branches
100
+ let configData = null; // declare for wider scope
101
+
102
+ if (!selectedConfig) {
103
+ editor.value = '';
104
+ toggleEpBasedOnConfig();
105
+ if (messageDiv) messageDiv.style.display = 'none';
106
+ return;
107
+ } else if (selectedConfig === '__custom__') {
108
+ // Custom config: do not fetch, user must paste JSON
109
+ editor.value = '';
110
+ toggleEpBasedOnConfig();
111
+ if (messageDiv) messageDiv.style.display = 'none';
112
+ return;
113
+ }
114
+
115
+ // Prefer fetching the config file directly from the HuggingFace repository
116
+ const hfUrl = `https://huggingface.co/${selectedConfig}/raw/main/config.json`;
117
+ try {
118
+ const resp = await fetch(hfUrl, { mode: 'cors' });
119
+ if (resp.ok) {
120
+ configData = await resp.json();
121
+ editor.value = JSON.stringify(configData, null, 2);
122
+ } else {
123
+ throw new Error(`HF returned status ${resp.status}`);
124
+ }
125
+ } catch (hfErr) {
126
+ console.warn('Direct HF fetch failed, fallback to backend:', hfErr);
127
+ // Fall back to the backend endpoint (for local deployments without CORS, or private models)
128
+ try {
129
+ const response = await fetch(`/get-megatron-config/${encodeURIComponent(selectedConfig)}`);
130
+ configData = await response.json();
131
+ if (configData.error) {
132
+ editor.value = `Error: ${configData.error}`;
133
+ } else {
134
+ editor.value = JSON.stringify(configData, null, 2);
135
+ }
136
+ } catch (beErr) {
137
+ editor.value = 'Failed to fetch model configuration.';
138
+ console.error('Backend config fetch error:', beErr);
139
+ }
140
+ }
141
+
142
+ // Trigger validation and UI updates after loading new config
143
+ validateParallelismLive();
144
+ toggleEpBasedOnConfig();
145
+
146
+ // Show Kimi-K2-Instruct warning if needed
147
+ if (selectedConfig.includes('Kimi-K2-Instruct') && configData && configData.model_type !== 'deepseek_v3') {
148
+ messageDiv.textContent = 'Notice: For Kimi-K2-Instruct the config field "model_type" must be set to "deepseek_v3" before memory estimation.';
149
+ messageDiv.style.display = 'block';
150
+ } else if (messageDiv) {
151
+ messageDiv.style.display = 'none';
152
+ }
153
+ }
154
+
155
+
156
+ function getFormValues(isSubmission = false) {
157
+ const form = document.getElementById('config-form');
158
+ const formData = new FormData(form);
159
+ const modelSelect = document.getElementById('model-select');
160
+
161
+ const hfPath = modelSelect.value;
162
+ if (!hfPath) {
163
+ // We will now handle this case in the submitForm function instead of an alert.
164
+ return null;
165
+ }
166
+
167
+ const editor = document.getElementById('config-editor');
168
+ let customConfig = null;
169
+ try {
170
+ // Only parse if the editor has content
171
+ if (editor.value) {
172
+ customConfig = JSON.parse(editor.value);
173
+ }
174
+ } catch (e) {
175
+ // Only alert on final submission, not on live validation
176
+ if (isSubmission) {
177
+ // alert('Model Config is not valid JSON.'); // Removing alert
178
+ }
179
+ return null; // Return null if JSON is invalid
180
+ }
181
+
182
+ const vppInput = formData.get('vpp');
183
+ const etpInput = formData.get('etp');
184
+ const pipelineLayoutInput = formData.get('pipeline_model_parallel_layout');
185
+
186
+ // New: collect the modules the user selected in selective mode
187
+ const recomputeModules = formData.getAll('recompute_modules');
188
+
189
+ return {
190
+ hf_model_path: hfPath,
191
+ custom_hf_config: customConfig, // Renamed for clarity
192
+ num_gpus: parseInt(formData.get('num_gpus')),
193
+ mbs: parseInt(formData.get('mbs')),
194
+ seq_len: parseInt(formData.get('seq-len')),
195
+ use_distributed_optimizer: document.getElementById('use-distributed-optimizer').checked,
196
+ recompute_granularity: formData.get('recompute_granularity'),
197
+ recompute_method: formData.get('recompute_method'),
198
+ recompute_num_layers: parseInt(formData.get('recompute_num_layers')),
199
+ // New field
200
+ recompute_modules: recomputeModules,
201
+ tp: parseInt(formData.get('tp')),
202
+ pp: parseInt(formData.get('pp')),
203
+ ep: parseInt(formData.get('ep')) || 1, // Default to 1 if disabled/null
204
+ cp: parseInt(formData.get('cp')),
205
+ vpp: vppInput ? parseInt(vppInput) : null,
206
+ etp: etpInput ? parseInt(etpInput) : null,
207
+ num_layers_in_first_pipeline_stage: formData.get('num_layers_in_first_pipeline_stage') ? parseInt(formData.get('num_layers_in_first_pipeline_stage')) : null,
208
+ num_layers_in_last_pipeline_stage: formData.get('num_layers_in_last_pipeline_stage') ? parseInt(formData.get('num_layers_in_last_pipeline_stage')) : null,
209
+ pipeline_model_parallel_layout: pipelineLayoutInput ? pipelineLayoutInput.trim() : null,
210
+ overhead: parseInt(formData.get('overhead')),
211
+ // New: embedding/loss pipeline-split flags
212
+ account_for_embedding_in_pipeline_split: document.getElementById('account_for_embedding_in_pipeline_split').checked,
213
+ account_for_loss_in_pipeline_split: document.getElementById('account_for_loss_in_pipeline_split').checked,
214
+ };
215
+ }
216
+
217
+ async function submitForm() {
218
+ const messageDiv = document.getElementById('validation-message');
219
+ messageDiv.textContent = '';
220
+ messageDiv.style.display = 'none';
221
+
222
+ // Get all form values first. We use getFormValues(false) to avoid any legacy alerts
223
+ // and handle all validation directly within this function for clarity.
224
+ const formValues = getFormValues(false);
225
+
226
+ // === START SUBMISSION VALIDATION ===
227
+
228
+ // 1. Check if form values could be retrieved. This catches both missing model selection
229
+ // and invalid JSON, as getFormValues returns null in those cases.
230
+ if (!formValues) {
231
+ if (!document.getElementById('model-select').value) {
232
+ messageDiv.textContent = 'Validation Error: Please select a model config.';
233
+ } else {
234
+ messageDiv.textContent = 'Validation Error: Model Config is not valid JSON.';
235
+ }
236
+ messageDiv.style.display = 'block';
237
+ return;
238
+ }
239
+
240
+ // Custom config must have valid JSON
241
+ if (document.getElementById('model-select').value === '__custom__' && !formValues.custom_hf_config) {
242
+ messageDiv.textContent = 'Validation Error: Please paste a valid model configuration JSON for the custom model.';
243
+ messageDiv.style.display = 'block';
244
+ return;
245
+ }
246
+
247
+ // 2. Perform all numeric and parallelism validation.
248
+ const { num_gpus, tp, pp, ep, cp, etp, custom_hf_config } = formValues;
249
+ const num_kv_heads = custom_hf_config?.num_key_value_heads || null;
250
+
251
+ let errors = [];
252
+ if (tp * pp * cp > num_gpus) {
253
+ errors.push(`TP*PP*CP (${tp * pp * cp}) > GPUs (${num_gpus}).`);
254
+ }
255
+ if (etp){
256
+ if (etp * pp * cp * ep > num_gpus) {
257
+ errors.push(`ETP*PP*CP*EP (${etp * pp * cp * ep}) > GPUs (${num_gpus}).`);
258
+ }
259
+ } else {
260
+ if (tp * pp * cp * ep > num_gpus) {
261
+ errors.push(`TP*PP*CP*EP (${tp * pp * cp * ep}) > GPUs (${num_gpus}) when ETP is not set.`);
262
+ }
263
+ }
264
+ if (num_kv_heads && tp > num_kv_heads) {
265
+ errors.push(`TP (${tp}) > Num KV Heads (${num_kv_heads}).`);
266
+ }
267
+
268
+ if (errors.length > 0) {
269
+ messageDiv.textContent = 'Validation Error: ' + errors.join(' ');
270
+ messageDiv.style.display = 'block';
271
+ return;
272
+ }
273
+ // === END SUBMISSION VALIDATION ===
274
+
275
+ const loading = document.getElementById('loading');
276
+ const submitBtn = document.querySelector('#config-form button[type="submit"]');
277
+ loading.style.display = 'block';
278
+ if (submitBtn) submitBtn.disabled = true;
279
+
280
+ try {
281
+ const response = await fetch('/estimate_with_mbridge', {
282
+ method: 'POST',
283
+ headers: { 'Content-Type': 'application/json' },
284
+ body: JSON.stringify(formValues) // Send the now fully-validated formValues
285
+ });
286
+
287
+ console.log('Response Status:', response.status);
288
+
289
+ if (response.ok) {
290
+ const data = await response.json();
291
+
292
+ // FIX: Ensure history wrapper is visible before updating and showing details
293
+ document.getElementById('history-wrapper').style.display = 'block';
294
+
295
+ saveToHistory(formValues, data);
296
+ updateHistoryView();
297
+ const newEntryRow = document.querySelector('#history-table tbody tr:first-child');
298
+ if (newEntryRow) {
299
+ const detailBtn = newEntryRow.querySelector('.detail-btn');
300
+ if (detailBtn) {
301
+ // We need to pass the event object structure to handleHistoryAction
302
+ handleHistoryAction({ target: detailBtn });
303
+ }
304
+ }
305
+ } else {
306
+ const error = await response.text();
307
+ console.error('Server error response:', error);
308
+ // Since we removed the main results display, show error in the validation div
309
+ messageDiv.textContent = `Server Error: ${error}`;
310
+ messageDiv.style.display = 'block';
311
+ }
312
+ } catch (error) {
313
+ console.error('Fetch API Error:', error);
314
+ messageDiv.textContent = `Client Error: ${error.message}`;
315
+ messageDiv.style.display = 'block';
316
+ } finally {
317
+ loading.style.display = 'none';
318
+ if (submitBtn) submitBtn.disabled = false;
319
+ }
320
+ }
321
+
322
+ function renderTable(details, rawFullReport) {
323
+ if (!details || details.length === 0) {
324
+ return '<p>No detailed memory breakdown available.</p>';
325
+ }
326
+
327
+ const headers = Object.keys(details[0]);
328
+ headers.push('Breakdown');
329
+
330
+ let table = '<table><thead><tr>';
331
+ headers.forEach(h => table += `<th>${h}</th>`);
332
+ table += '</tr></thead><tbody>';
333
+
334
+ details.forEach(row => {
335
+ const ppRank = row.pp_rank;
336
+ // FIX: Look in the full raw report array passed in.
337
+ const rawDataForRank = rawFullReport ? rawFullReport.find(r => r.pp_rank === ppRank) : null;
338
+
339
+ // FIX: Change to `let` to allow modification for highlighting.
340
+ let modelBreakdown = (rawDataForRank && rawDataForRank.model_breakdown)
341
+ ? rawDataForRank.model_breakdown
342
+ : 'No breakdown available.';
343
+
344
+ // Add syntax-like highlighting for params and activations
345
+ // Basic HTML escaping for safety before inserting spans
346
+ modelBreakdown = modelBreakdown.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
347
+ modelBreakdown = modelBreakdown
348
+ .replace(/(n_params=[0-9.]+[a-zA-Z]*)/g, '<span class="highlight-red">$1</span>')
349
+ .replace(/(n_act=[0-9.]+[a-zA-Z]*)/g, '<span class="highlight-red">$1</span>');
350
+
351
+ // Main row with data
352
+ table += `<tr data-pp-rank="${ppRank}">`;
353
+ headers.forEach(h => {
354
+ if (h !== 'Breakdown') {
355
+ table += `<td>${row[h]}</td>`;
356
+ }
357
+ });
358
+ table += `<td><button class="action-btn raw-per-rank-btn" data-pp-rank="${ppRank}">Raw</button></td>`;
359
+ table += '</tr>';
360
+
361
+ // Hidden row for the breakdown
362
+ table += `<tr class="raw-breakdown-row" data-pp-rank="${ppRank}" style="display: none;">
363
+ <td colspan="${headers.length}">
364
+ <pre>${modelBreakdown}</pre>
365
+ </td>
366
+ </tr>`;
367
+ });
368
+
369
+ table += '</tbody></table>';
370
+ return table;
371
+ }
372
+
373
+ function saveToHistory(params, resultData) {
374
+ let history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
375
+ const historyEntry = {
376
+ params: params,
377
+ result: resultData, // Store the full result object { processed_report, raw_report }
378
+ id: new Date().getTime()
379
+ };
380
+ history.unshift(historyEntry); // Add to the beginning
381
+ if (history.length > 20) { // Keep history size manageable
382
+ history.pop();
383
+ }
384
+ localStorage.setItem('estimationHistory', JSON.stringify(history));
385
+ }
386
+
387
+ function updateHistoryView() {
388
+ const history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
389
+ const historyTableBody = document.querySelector('#history-table tbody');
390
+ const historyWrapper = document.getElementById('history-wrapper');
391
+ historyTableBody.innerHTML = '';
392
+
393
+ if (history.length === 0) {
394
+ historyWrapper.style.display = 'none';
395
+ return;
396
+ }
397
+
398
+ historyWrapper.style.display = 'block';
399
+
400
+ history.forEach(item => {
401
+ const row = document.createElement('tr');
402
+
403
+ const params = item.params;
404
+ const resultData = item.result || {};
405
+
406
+ // FIX: Handle both old and new data structures for compatibility.
407
+ const details = (resultData.report && resultData.report.details) ? resultData.report.details : (resultData.processed_report || []);
408
+ const pp0Result = details.find(r => r.pp_rank === 0) || details[0] || {};
409
+
410
+ const modelName = params.hf_model_path.split('/').pop();
411
+
412
+ // Build parallelism string, e.g., "TP2 PP2 VPP2"
413
+ const parallelismParts = [];
414
+ ['tp', 'pp', 'ep', 'cp', 'vpp', 'etp'].forEach(p => {
415
+ const value = params[p];
416
+ if (value && value > 1) {
417
+ parallelismParts.push(`${p.toUpperCase()}${value}`);
418
+ }
419
+ });
420
+ const parallelismInfo = parallelismParts.join(' ') || 'No Parallelism';
421
+
422
+ const overheadGb = params.overhead ? parseInt(params.overhead) : 0;
423
+ const baseTotal = details.length > 0 ? Math.max(...details.map(r => r.total_gb || 0)) : null;
424
+ const totalGb = baseTotal !== null ? (baseTotal + overheadGb).toFixed(2) : 'N/A';
425
+
426
+ const seqLen = params.seq_len || 0;
427
+ const formattedSeqLen = seqLen >= 1024 ? `${seqLen / 1024}k` : seqLen;
428
+ const sequenceInfo = `${params.mbs || 'N/A'}*${formattedSeqLen}`;
429
+
430
+ // New: build a description of the recompute setting
431
+ let recomputeInfo = '';
432
+ switch (params.recompute_granularity) {
433
+ case 'none':
434
+ recomputeInfo = 'Recompute: None';
435
+ break;
436
+ case 'full':
437
+ const method = params.recompute_method || 'uniform';
438
+ const layers = params.recompute_num_layers ? params.recompute_num_layers : '';
439
+ recomputeInfo = `Recompute: Full (${method}${layers ? ',' + layers + 'L' : ''})`;
440
+ break;
441
+ case 'selective':
442
+ const mods = Array.isArray(params.recompute_modules) && params.recompute_modules.length ? params.recompute_modules.join('+') : '';
443
+ recomputeInfo = `Recompute: Selective${mods ? ' (' + mods + ')' : ''}`;
444
+ break;
445
+ default:
446
+ recomputeInfo = '';
447
+ }
448
+
449
+ row.innerHTML = `
450
+ <td>
451
+ <div>${modelName}</div>
452
+ <div class="model-meta-info">
453
+ <span>GPUs: ${params.num_gpus || 'N/A'}</span>
454
+ <span>${parallelismInfo}</span>
455
+ <span>Sequence: ${sequenceInfo}</span>
456
+ ${recomputeInfo ? `<span>${recomputeInfo}</span>` : ''}
457
+ </div>
458
+ </td>
459
+ <td>${pp0Result.weight_grad_optim_gb || 'N/A'}</td>
460
+ <td>${pp0Result.activation_gb || 'N/A'}</td>
461
+ <td>${totalGb}</td>
462
+ <td>
463
+ <button class="restore-btn" data-id="${item.id}">Restore</button>
464
+ <button class="detail-btn" data-id="${item.id}">Detail</button>
465
+ <button class="delete-btn" data-id="${item.id}">Delete</button>
466
+ </td>
467
+ `;
468
+ historyTableBody.appendChild(row);
469
+ });
470
+ }
471
+
472
+ async function handleHistoryAction(e) {
473
+ const button = e.target.closest('button');
474
+ if (!button) return;
475
+
476
+ // Handle breakdown toggle first
477
+ if (button.classList.contains('breakdown-btn')) {
478
+ const ppRank = button.dataset.ppRank;
479
+ const detailTable = button.closest('table');
480
+ if (!detailTable) return;
481
+
482
+ const breakdownRow = detailTable.querySelector(`tr.breakdown-row[data-pp-rank="${ppRank}"]`);
483
+ if (!breakdownRow) return;
484
+
485
+ const isVisible = breakdownRow.style.display !== 'none';
486
+ breakdownRow.style.display = isVisible ? 'none' : 'table-row';
487
+ button.textContent = isVisible ? 'Breakdown' : 'Hide';
488
+ return; // Do not continue to other handlers
489
+ }
490
+
491
+ if (!button.matches('.detail-btn, .restore-btn, .delete-btn')) return;
492
+
493
+ const id = parseInt(button.dataset.id, 10);
494
+ const history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
495
+ const entry = history.find(item => item.id === id);
496
+
497
+ if (!entry) {
498
+ console.error('History entry not found for id:', id);
499
+ return;
500
+ }
501
+
502
+ const row = button.closest('tr');
503
+
504
+ if (button.classList.contains('detail-btn')) {
505
+ const isDetailsVisible = row.nextElementSibling && row.nextElementSibling.classList.contains('detail-row');
506
+
507
+ document.querySelectorAll('.detail-row').forEach(detailRow => {
508
+ const prevRow = detailRow.previousElementSibling;
509
+ const detailBtn = prevRow.querySelector('.detail-btn');
510
+ if (detailRow !== row.nextElementSibling) {
511
+ detailRow.remove();
512
+ if (detailBtn) detailBtn.textContent = 'Detail';
513
+ }
514
+ });
515
+
516
+ if (isDetailsVisible) {
517
+ row.nextElementSibling.remove();
518
+ button.textContent = 'Detail';
519
+ } else {
520
+ const detailRow = document.createElement('tr');
521
+ detailRow.classList.add('detail-row');
522
+ const detailCell = detailRow.insertCell();
523
+ detailCell.colSpan = row.cells.length;
524
+
525
+ // FIX: Handle both old and new data structures for compatibility.
526
+ const report = entry.result.report;
527
+ const details = (report && report.details) ? report.details : (entry.result.processed_report || []);
528
+ const modelBreakdown = (report && report.model_breakdown) ? report.model_breakdown : null;
529
+
530
+ if (details && details.length > 0) {
531
+ const newTable = document.createElement('table');
532
+ // Determine if breakdown information exists per-row or globally
533
+ let headers = Object.keys(details[0]);
534
+
535
+ // If old-format data, there is a 'model_breakdown' key on each detail row
536
+ const hasRowBreakdown = headers.includes('model_breakdown');
537
+
538
+ // Remove the raw model_breakdown column from headers to keep table compact
539
+ if (hasRowBreakdown) {
540
+ headers = headers.filter(h => h !== 'model_breakdown');
541
+ }
542
+
543
+ // Include global breakdown if provided, or row breakdowns if present
544
+ const includeBreakdown = hasRowBreakdown || (modelBreakdown && typeof modelBreakdown === 'string');
545
+
546
+ if (includeBreakdown) {
547
+ headers.push('Breakdown');
548
+ }
549
+
550
+ const headerRow = newTable.insertRow();
551
+ headers.forEach(h => {
552
+ const th = document.createElement('th');
553
+ th.textContent = h;
554
+ headerRow.appendChild(th);
555
+ });
556
+
557
+ details.forEach(detail => {
558
+ const newRow = newTable.insertRow();
559
+ headers.forEach(header => {
560
+ if (header === 'Breakdown') {
561
+ const cell = newRow.insertCell();
562
+ cell.innerHTML = `<button class="breakdown-btn" data-pp-rank="${detail.pp_rank}">Breakdown</button>`;
563
+ } else {
564
+ const cell = newRow.insertCell();
565
+ let value = detail[header];
566
+ if (typeof value === 'number' && !Number.isInteger(value)) {
567
+ value = value.toFixed(4);
568
+ }
569
+ cell.textContent = value;
570
+ }
571
+ });
572
+
573
+ // Hidden breakdown row
574
+ if (includeBreakdown) {
575
+ const breakdownRow = newTable.insertRow();
576
+ breakdownRow.classList.add('breakdown-row');
577
+ breakdownRow.dataset.ppRank = detail.pp_rank;
578
+ breakdownRow.style.display = 'none';
579
+ const breakdownCell = breakdownRow.insertCell();
580
+ breakdownCell.colSpan = headers.length;
581
+ const rowSpecificBreakdown = hasRowBreakdown ? (detail.model_breakdown || '') : modelBreakdown;
582
+ const htmlBreakdown = ansiToHtml(rowSpecificBreakdown);
583
+ breakdownCell.innerHTML = `<pre class="model-breakdown-view">${htmlBreakdown || 'No breakdown available.'}</pre>`;
584
+ }
585
+ });
586
+
587
+ detailCell.appendChild(newTable);
588
+ } else {
589
+ detailCell.innerHTML = 'No detailed per-rank results available.';
590
+ }
591
+
592
+ row.after(detailRow);
593
+ button.textContent = 'Hide';
594
+ }
595
+ } else if (button.classList.contains('restore-btn')) {
596
+ restoreForm(entry.params);
597
+ } else if (button.classList.contains('delete-btn')) {
598
+ deleteHistoryEntry(id);
599
+ }
600
+ }
601
+
602
+ function deleteHistoryEntry(id) {
603
+ let history = JSON.parse(localStorage.getItem('estimationHistory')) || [];
604
+ const updatedHistory = history.filter(item => item.id != id);
605
+ localStorage.setItem('estimationHistory', JSON.stringify(updatedHistory));
606
+ updateHistoryView();
607
+
608
+ // If history is now empty, hide the whole output container
609
+ if (updatedHistory.length === 0) {
610
+ // document.getElementById('output-container').style.display = 'none';
611
+ }
612
+ }
613
+
614
+ function clearHistory() {
615
+ localStorage.removeItem('estimationHistory');
616
+ updateHistoryView();
617
+ // document.getElementById('output-container').style.display = 'none';
618
+ }
619
+
620
+
621
+ function restoreForm(params) {
622
+ if (!params) return;
623
+
624
+ const setElementValue = (id, value, defaultValue = '') => {
625
+ const element = document.getElementById(id);
626
+ if (element) {
627
+ if (element.type === 'checkbox') {
628
+ element.checked = value ?? defaultValue;
629
+ } else {
630
+ element.value = value ?? defaultValue;
631
+ }
632
+ }
633
+ };
634
+
635
+ setElementValue('num-gpus', params.num_gpus, 8);
636
+ setElementValue('mbs', params.mbs, 1);
637
+ setElementValue('seq-len', params.seq_len, 4096);
638
+ setElementValue('use-distributed-optimizer', params.use_distributed_optimizer, true);
639
+ setElementValue('recompute_granularity', params.recompute_granularity, 'selective');
640
+ setElementValue('recompute_method', params.recompute_method, 'uniform');
641
+ setElementValue('recompute_num_layers', params.recompute_num_layers, 1);
642
+ setElementValue('tp', params.tp, 1);
643
+ setElementValue('pp', params.pp, 1);
644
+ setElementValue('ep', params.ep, 1);
645
+ setElementValue('cp', params.cp, 1);
646
+ setElementValue('vpp', params.vpp);
647
+ // Update VPP-dependent visibility after setting vpp
648
+ toggleVppDependentOptions();
649
+ setElementValue('etp', params.etp);
650
+ setElementValue('num_layers_in_first_pipeline_stage', params.num_layers_in_first_pipeline_stage);
651
+ setElementValue('num_layers_in_last_pipeline_stage', params.num_layers_in_last_pipeline_stage);
652
+ setElementValue('pipeline-layout', params.pipeline_model_parallel_layout);
653
+ setElementValue('overhead', params.overhead, 10);
654
+
655
+ // New: restore the checkbox states
656
+ setElementValue('account_for_embedding_in_pipeline_split', params.account_for_embedding_in_pipeline_split, false);
657
+ setElementValue('account_for_loss_in_pipeline_split', params.account_for_loss_in_pipeline_split, false);
658
+
659
+ const modelSelect = document.getElementById('model-select');
660
+ if (modelSelect && params.hf_model_path) {
661
+ modelSelect.value = params.hf_model_path;
662
+ }
663
+
664
+ // Manually trigger change event for UI updates
665
+ const recomputeSelect = document.getElementById('recompute_granularity');
666
+ if (recomputeSelect) {
667
+ recomputeSelect.dispatchEvent(new Event('change'));
668
+ }
669
+ }
670
+
671
+ function updateParallelismOptions() {
672
+ const numGpusInput = document.getElementById('num-gpus');
673
+ if (!numGpusInput) return;
674
+
675
+ const numGpus = parseInt(numGpusInput.value);
676
+ if (isNaN(numGpus) || numGpus <= 0) {
677
+ return; // Don't update if GPU count is invalid
678
+ }
679
+
680
+ const tpSelect = document.getElementById('tp');
681
+ const epSelect = document.getElementById('ep');
682
+ const cpSelect = document.getElementById('cp');
683
+
684
+ // PP is now a manual input, so we only handle TP, EP, CP here.
685
+ const selects = [tpSelect, epSelect, cpSelect];
686
+
687
+ const powersOfTwo = [1];
688
+ for (let i = 1; (1 << i) <= numGpus; i++) {
689
+ powersOfTwo.push(1 << i);
690
+ }
691
+
692
+ selects.forEach(select => {
693
+ if (!select) return;
694
+ const currentVal = select.value;
695
+ select.innerHTML = ''; // Clear existing options
696
+
697
+ powersOfTwo.forEach(val => {
698
+ const option = document.createElement('option');
699
+ option.value = val;
700
+ option.textContent = val;
701
+ select.appendChild(option);
702
+ });
703
+
704
+ // Try to restore the previous value, otherwise default to 1
705
+ if (powersOfTwo.includes(parseInt(currentVal))) {
706
+ select.value = currentVal;
707
+ } else {
708
+ select.value = 1;
709
+ }
710
+ });
711
+ }
712
+
713
+ function validateParallelismLive() {
714
+ const messageDiv = document.getElementById('validation-message');
715
+ // Pass isSubmission = false to getFormValues to prevent alerts during live validation
716
+ const formValues = getFormValues(false);
717
+
718
+ if (!formValues) {
719
+ messageDiv.textContent = '';
720
+ return true;
721
+ }
722
+
723
+ const { num_gpus, tp, pp, ep, cp, etp, custom_hf_config } = formValues;
724
+ // The key is the same in the HF config, so this logic remains valid.
725
+ const num_kv_heads = custom_hf_config?.num_key_value_heads || null;
726
+
727
+ let errors = [];
728
+ if (tp * pp * cp > num_gpus) {
729
+ errors.push(`TP*PP*CP (${tp*pp*cp}) > GPUs (${num_gpus}).`);
730
+ }
731
+ if (etp) {
732
+ if (etp * pp * cp * ep > num_gpus) {
733
+ errors.push(`ETP*PP*CP*EP (${etp*pp*cp*ep}) > GPUs (${num_gpus}).`);
734
+ }
735
+ } else {
736
+ if (tp * pp * cp * ep > num_gpus) {
737
+ errors.push(`TP*PP*CP*EP (${tp*pp*cp*ep}) > GPUs (${num_gpus}) when ETP is not set.`);
738
+ }
739
+ }
740
+ if (num_kv_heads && tp > num_kv_heads) {
741
+ errors.push(`TP (${tp}) > Num KV Heads (${num_kv_heads}).`);
742
+ }
743
+
744
+ if (errors.length > 0) {
745
+ messageDiv.textContent = 'Validation Error: ' + errors.join(' ');
746
+ messageDiv.style.display = 'block';
747
+ } else {
748
+ messageDiv.textContent = '';
749
+ messageDiv.style.display = 'none';
750
+ }
751
+ return errors.length === 0;
752
+ }
753
+
754
+ function toggleEpBasedOnConfig() {
755
+ const editor = document.getElementById('config-editor');
756
+ const epSelect = document.getElementById('ep');
757
+ if (!editor || !epSelect) return;
758
+
759
+ let config = null;
760
+ try {
761
+ if (editor.value) {
762
+ config = JSON.parse(editor.value);
763
+ }
764
+ } catch (e) {
765
+ // Invalid JSON, disable EP as a safety measure
766
+ epSelect.disabled = true;
767
+ return;
768
+ }
769
+
770
+ if (config && config.num_experts_per_tok) {
771
+ epSelect.disabled = false;
772
+ } else {
773
+ epSelect.disabled = true;
774
+ epSelect.value = 1; // Reset to 1 if disabled
775
+ }
776
+ }
777
+
778
+ // New: show/hide dependent options based on the vpp input
779
+ function toggleVppDependentOptions() {
780
+ const vppInput = document.getElementById('vpp');
781
+ const dependents = document.querySelectorAll('.vpp-dependent');
782
+ if (!vppInput) return;
783
+ const shouldShow = vppInput.value && parseInt(vppInput.value) > 0;
784
+ dependents.forEach(el => {
785
+ el.style.display = shouldShow ? 'block' : 'none';
786
+ });
787
+ }
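Both the live validation in script.js above and the backend endpoint enforce the same parallelism constraints. A minimal Python restatement of those checks, using the form-field names; num_kv_heads is the optional num_key_value_heads from the HF config, and the example values are illustrative only.

# Sketch of the sanity checks mirrored by validateParallelismLive() and the backend.
def check_parallelism(num_gpus, tp, pp, cp, ep=1, etp=None, num_kv_heads=None):
    errors = []
    if num_gpus <= 0 or num_gpus % 8 != 0:
        errors.append("num_gpus must be a positive multiple of 8")
    if tp * pp * cp > num_gpus:
        errors.append(f"TP*PP*CP ({tp * pp * cp}) > GPUs ({num_gpus})")
    # The expert product uses ETP when it is set, otherwise TP (matching the JS check).
    expert_product = (etp or tp) * pp * cp * ep
    if expert_product > num_gpus:
        errors.append(f"(E)TP*PP*CP*EP ({expert_product}) > GPUs ({num_gpus})")
    if num_kv_heads and tp > num_kv_heads:
        errors.append(f"TP ({tp}) > num KV heads ({num_kv_heads})")
    return errors

print(check_parallelism(num_gpus=16, tp=4, pp=2, cp=1, ep=2))  # -> []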
webui/style.css ADDED
@@ -0,0 +1,383 @@
1
+ body {
2
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
3
+ line-height: 1.6;
4
+ background-color: #f4f4f4;
5
+ color: #333;
6
+ margin: 0;
7
+ padding: 1em;
8
+ }
9
+
10
+ .container {
11
+ max-width: 1600px;
12
+ margin: auto;
13
+ background: #fff;
14
+ padding: 2em;
15
+ border-radius: 8px;
16
+ box-shadow: 0 0 20px rgba(0, 0, 0, 0.05);
17
+ }
18
+
19
+ .main-layout {
20
+ display: flex;
21
+ flex-direction: column; /* Main axis is vertical */
22
+ gap: 2em;
23
+ }
24
+
25
+ .top-section {
26
+ display: flex;
27
+ flex-direction: row; /* Children are horizontal */
28
+ gap: 2em;
29
+ }
30
+
31
+ .config-column, .output-column {
32
+ flex: 1; /* Each column takes up half the space */
33
+ display: flex;
34
+ flex-direction: column;
35
+ }
36
+
37
+ /* The editor wrapper should grow to fill the space */
38
+ .config-editor-wrapper {
39
+ flex-grow: 1;
40
+ display: flex;
41
+ flex-direction: column;
42
+ }
43
+
44
+ #config-editor {
45
+ flex-grow: 1; /* The textarea itself should grow */
46
+ width: 100%;
47
+ box-sizing: border-box; /* Include padding and border in the element's total width and height */
48
+ resize: vertical; /* Allow vertical resizing */
49
+ }
50
+
51
+
52
+ .bottom-section {
53
+ width: 100%;
54
+ }
55
+
56
+ .form-row {
57
+ display: flex;
58
+ gap: 1em;
59
+ align-items: flex-end;
60
+ }
61
+
62
+ .form-row .form-group {
63
+ flex: 1; /* Allow groups to grow and fill space */
64
+ margin-bottom: 0.8em;
65
+ }
66
+
67
+ .form-group {
68
+ margin-bottom: 0.8em; /* Reduced from default */
69
+ }
70
+
71
+ .form-group label {
72
+ display: block;
73
+ margin-bottom: 0.25em; /* Reduced */
74
+ font-weight: 500;
75
+ }
76
+
77
+ .form-group label.inline-label {
78
+ display: inline-block;
79
+ margin-left: 0.5em;
80
+ font-weight: normal;
81
+ }
82
+
83
+ .form-group input[type="number"],
84
+ .form-group select {
85
+ width: 100%;
86
+ padding: 6px 10px; /* Reduced padding */
87
+ border-radius: 4px;
88
+ border: 1px solid #ccc;
89
+ box-sizing: border-box;
90
+ }
91
+
92
+ button {
93
+ background-color: #3498db;
94
+ color: white;
95
+ padding: 10px 15px;
96
+ border: none;
97
+ border-radius: 4px;
98
+ cursor: pointer;
99
+ font-size: 16px;
100
+ margin-top: 10px;
101
+ }
102
+
103
+ button:hover {
104
+ background-color: #2980b9;
105
+ }
106
+
107
+ #results {
108
+ background-color: #ecf0f1;
109
+ padding: 15px;
110
+ border-radius: 4px;
111
+ white-space: pre-wrap;
112
+ word-wrap: break-word;
113
+ min-height: 100px;
114
+ }
115
+
116
+ .results-container {
117
+ margin-top: 20px;
118
+ }
119
+
120
+ /* New styles for results table */
121
+ table {
122
+ width: 100%;
123
+ border-collapse: collapse;
124
+ margin-top: 20px;
125
+ }
126
+
127
+ th, td {
128
+ border: 1px solid #ddd;
129
+ padding: 12px;
130
+ text-align: left;
131
+ }
132
+
133
+ th {
134
+ background-color: #f2f2f2;
135
+ font-weight: bold;
136
+ }
137
+
138
+ tbody tr:nth-child(even) {
139
+ background-color: #f9f9f9;
140
+ }
141
+
142
+ tbody tr:hover {
143
+ background-color: #f1f1f1;
144
+ }
145
+
146
+ .error {
147
+ color: #e74c3c;
148
+ font-weight: bold;
149
+ }
150
+
151
+ .button-container {
152
+ grid-column: 1 / -1; /* Span across all columns */
153
+ text-align: center;
154
+ margin-top: 20px;
155
+ }
156
+
157
+ /* History Section */
158
+ .history-container {
159
+ margin-top: 40px;
160
+ border-top: 1px solid #e0e0e0;
161
+ padding-top: 20px;
162
+ }
163
+
164
+ .history-container h2 {
165
+ display: flex;
166
+ justify-content: space-between;
167
+ align-items: center;
168
+ }
169
+
170
+ #history-list table {
171
+ margin-top: 10px;
172
+ }
173
+
174
+ .small-button {
175
+ padding: 4px 8px;
176
+ font-size: 0.8em;
177
+ background-color: #e74c3c;
178
+ }
179
+
180
+ .small-button:hover {
181
+ background-color: #c0392b;
182
+ }
183
+
184
+ .history-item-actions {
185
+ display: flex;
186
+ gap: 10px;
187
+ }
188
+
189
+ #output-container {
190
+ margin-top: 2em;
191
+ padding: 1.5em;
192
+ background-color: #f9f9f9;
193
+ border: 1px solid #ddd;
194
+ border-radius: 8px;
195
+ }
196
+
197
+ #results-wrapper h3, #history-wrapper h3 {
198
+ margin-top: 0;
199
+ border-bottom: 2px solid #eee;
200
+ padding-bottom: 0.5em;
201
+ margin-bottom: 1em;
202
+ }
203
+
204
+ #results-display table {
205
+ width: 100%;
206
+ border-collapse: collapse;
207
+ }
208
+
209
+ #results-display th, #results-display td {
210
+ padding: 8px 12px;
211
+ border: 1px solid #ddd;
212
+ text-align: left;
213
+ }
214
+
215
+ #results-display th {
216
+ background-color: #f2f2f2;
217
+ }
218
+
219
+ #history-table {
220
+ width: 100%;
221
+ border-collapse: collapse;
222
+ }
223
+
224
+ #history-table th, #history-table td {
225
+ padding: 8px 12px;
226
+ border: 1px solid #ddd;
227
+ text-align: left;
228
+ }
229
+
230
+ #history-table th {
231
+ background-color: #f2f2f2;
232
+ }
233
+
234
+ #history-table td:last-child {
235
+ text-align: right;
236
+ }
237
+
238
+ #raw-json-output {
239
+ background-color: #2d2d2d;
240
+ color: #f1f1f1;
241
+ padding: 1em;
242
+ border-radius: 5px;
243
+ max-height: 500px;
244
+ overflow-y: auto;
245
+ }
246
+
247
+ #clear-history {
248
+ background-color: #dc3545;
249
+ }
250
+
251
+ #clear-history:hover {
252
+ background-color: #c82333;
253
+ }
254
+
255
+ .error-message {
256
+ color: #dc3545;
257
+ background-color: #f8d7da;
258
+ border: 1px solid #f5c6cb;
259
+ padding: 0.75rem 1.25rem;
260
+ margin-top: 1rem;
261
+ margin-bottom: 1rem;
262
+ border-radius: 0.25rem;
263
+ text-align: center;
264
+ }
265
+
266
+ /* Responsive Design for smaller screens */
267
+ @media (max-width: 992px) {
268
+ .top-section {
269
+ flex-direction: column;
270
+ }
271
+ }
272
+
273
+ .history-detail-row td {
274
+ background-color: #333;
275
+ padding: 15px;
276
+ border-top: 2px solid #555;
277
+ text-align: left; /* Align content to the left */
278
+ }
279
+
280
+ .history-detail-row pre {
281
+ background-color: #1e1e1e;
282
+ color: #d4d4d4;
283
+ padding: 10px;
284
+ border-radius: 4px;
285
+ white-space: pre-wrap;
286
+ word-break: break-all;
287
+ }
288
+
289
+ .history-detail-row table {
290
+ width: 100%;
291
+ border-collapse: collapse;
292
+ margin: 0;
293
+ }
294
+
295
+ .history-detail-row table th {
296
+ background-color: #e0e0e0;
297
+ color: #333;
298
+ padding: 8px 12px;
299
+ border: 1px solid #555;
300
+ }
301
+
302
+ .history-detail-row table td {
303
+ color: #d4d4d4;
304
+ padding: 8px 12px;
305
+ border: 1px solid #555;
306
+ background-color: #2a2a2a;
307
+ }
308
+
309
+ .model-breakdown-view {
310
+ max-height: 400px; /* Or any other suitable height */
311
+ overflow-y: auto;
312
+ overflow-x: auto;
313
+ background-color: #2d2d2d;
314
+ color: #f1f1f1;
315
+ padding: 1em;
316
+ border-radius: 5px;
317
+ white-space: pre-wrap; /* Ensures the pre content wraps */
318
+ margin: 0;
319
+ font-family: monospace;
320
+ font-size: 0.85em;
321
+ }
322
+
323
+ .model-meta-info {
324
+ font-size: 0.9em;
325
+ color: #666;
326
+ margin-top: 4px;
327
+ }
328
+
329
+ .model-meta-info span {
330
+ margin-right: 15px;
331
+ }
332
+
333
+ .action-btn.raw-btn {
334
+ background-color: #555;
335
+ color: white;
336
+ }
337
+
338
+ .highlight-red {
339
+ color: #ff6b6b;
340
+ }
341
+
342
+ .ansi-red { color: #e74c3c; }
343
+ .ansi-green { color: #2ecc71; }
344
+ .ansi-yellow { color: #f1c40f; }
345
+ .ansi-blue { color: #3498db; }
346
+ .ansi-magenta { color: #9b59b6; }
347
+ .ansi-cyan { color: #1abc9c; }
348
+
349
+ .breakdown-row td {
350
+ text-align: left !important;
351
+ }
352
+
353
+ .footer {
354
+ margin-top: 2em;
355
+ font-size: 0.85em;
356
+ color: #555;
357
+ text-align: center;
358
+ }
359
+
360
+ .footer a {
361
+ color: #2a77d4;
362
+ text-decoration: none;
363
+ }
364
+
365
+ .footer a:hover {
366
+ text-decoration: underline;
367
+ }
368
+
369
+ .disclaimer {
370
+ margin-top: 0.5em;
371
+ font-style: italic;
372
+ }
373
+
374
+ .disclaimer-banner {
375
+ background-color: #fff3cd;
376
+ color: #856404;
377
+ border: 1px solid #ffeeba;
378
+ padding: 10px 15px;
379
+ border-radius: 4px;
380
+ margin: 15px 0;
381
+ font-weight: bold;
382
+ text-align: center;
383
+ }