6 | 6 | import torch |
7 | 7 | from torch import Tensor |
8 | 8 |
9 | | -from megatron.core import parallel_state, tensor_parallel |
| 9 | +from megatron.core import tensor_parallel |
10 | 10 | from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk |
11 | 11 | from megatron.core.dist_checkpointing.mapping import ShardedStateDict |
12 | 12 | from megatron.core.inference.contexts import BaseInferenceContext |
@@ -26,11 +26,9 @@
26 | 26 | from megatron.core.tensor_parallel import gather_from_sequence_parallel_region |
27 | 27 | from megatron.core.transformer.enums import CudaGraphScope, ModelType |
28 | 28 | from megatron.core.transformer.multi_token_prediction import ( |
29 | | - MTPLossAutoScaler, |
30 | | - MTPLossLoggingHelper, |
31 | 29 | MultiTokenPredictionBlock, |
32 | | - roll_tensor, |
33 | | - tie_word_embeddings_state_dict, |
| 30 | + mtp_on_this_rank, |
| 31 | + process_mtp_loss, |
34 | 32 | ) |
35 | 33 | from megatron.core.transformer.spec_utils import ModuleSpec |
36 | 34 | from megatron.core.transformer.transformer_block import TransformerBlock |
@@ -144,7 +142,9 @@ def __init__( |
144 | 142 | self.rotary_base = rotary_base |
145 | 143 | self.rotary_scaling = rope_scaling |
146 | 144 | self.mtp_block_spec = mtp_block_spec |
147 | | - self.mtp_process = mtp_block_spec is not None |
| 145 | + self.mtp_process = mtp_block_spec is not None and mtp_on_this_rank( |
| 146 | + self.config, ignore_virtual=False, vp_stage=vp_stage |
| 147 | + ) |
148 | 148 |
149 | 149 | if self.pre_process or self.mtp_process: |
150 | 150 | self.embedding = LanguageModelEmbedding( |
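With this change, `self.mtp_process` is true only when an MTP block spec is provided *and* `mtp_on_this_rank` reports that the current (virtual) pipeline stage actually hosts the MTP layers, so MTP-specific setup only happens on ranks that run the MTP block. The helper itself lives in `multi_token_prediction.py` and is not part of this diff; the sketch below is only a guess at the kind of placement check it performs, and both the last-stage rule and the simplified `vp_stage` handling are assumptions.

```python
# Minimal sketch of the placement check mtp_on_this_rank is assumed to perform.
# The real helper in megatron.core.transformer.multi_token_prediction may use a
# different rule; treat every detail here as an assumption.
from megatron.core import parallel_state


def mtp_on_this_rank_sketch(config, ignore_virtual: bool = False, vp_stage=None) -> bool:
    """Return True when this pipeline stage is assumed to host the MTP layers."""
    if config.mtp_num_layers is None:
        # The model has no MTP layers at all.
        return False
    # MTP layers follow the main decoder stack, so this sketch places them on the
    # last pipeline stage. A real implementation would also consult vp_stage to
    # select the correct virtual stage under interleaved pipeline scheduling.
    return parallel_state.is_pipeline_last_stage(ignore_virtual=ignore_virtual)
```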
@@ -609,56 +609,19 @@ def _postprocess( |
609 | 609 | return hidden_states |
610 | 610 |
611 | 611 | if self.config.mtp_num_layers is not None: |
612 | | - mtp_labels = labels.clone() |
613 | | - hidden_states_list = torch.chunk(hidden_states, 1 + self.config.mtp_num_layers, dim=0) |
614 | | - hidden_states = hidden_states_list[0] |
615 | | - if loss_mask is None: |
616 | | - # if loss_mask is not provided, use all ones as loss_mask |
617 | | - loss_mask = torch.ones_like(mtp_labels) |
618 | | - for mtp_layer_number in range(self.config.mtp_num_layers): |
619 | | - # output |
620 | | - mtp_logits, _ = self.output_layer( |
621 | | - hidden_states_list[mtp_layer_number + 1], |
622 | | - weight=output_weight, |
623 | | - runtime_gather_output=runtime_gather_output, |
624 | | - ) |
625 | | - # Calc loss for the current Multi-Token Prediction (MTP) layers. |
626 | | - mtp_labels, _ = roll_tensor( |
627 | | - mtp_labels, |
628 | | - shifts=-1, |
629 | | - dims=-1, |
630 | | - cp_group=self.cp_group, |
631 | | - packed_seq_params=packed_seq_params, |
632 | | - ) |
633 | | - loss_mask, num_tokens = roll_tensor( |
634 | | - loss_mask, |
635 | | - shifts=-1, |
636 | | - dims=-1, |
637 | | - cp_group=self.cp_group, |
638 | | - packed_seq_params=packed_seq_params, |
639 | | - ) |
640 | | - mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits) |
641 | | - mtp_loss = loss_mask * mtp_loss |
642 | | - if self.training: |
643 | | - # TODO(shifangx): remove the use of parallel_state here |
644 | | - # after moving loss logging to loss_func in pretrain_gpt.py |
645 | | - MTPLossLoggingHelper.save_loss_to_tracker( |
646 | | - torch.sum(mtp_loss) / num_tokens, |
647 | | - mtp_layer_number, |
648 | | - self.config.mtp_num_layers, |
649 | | - avg_group=parallel_state.get_data_parallel_group( |
650 | | - with_context_parallel=True |
651 | | - ), |
652 | | - ) |
653 | | - mtp_loss_scale = self.config.mtp_loss_scaling_factor / self.config.mtp_num_layers |
654 | | - if self.config.calculate_per_token_loss: |
655 | | - hidden_states = MTPLossAutoScaler.apply( |
656 | | - hidden_states, mtp_loss_scale * mtp_loss |
657 | | - ) |
658 | | - else: |
659 | | - hidden_states = MTPLossAutoScaler.apply( |
660 | | - hidden_states, mtp_loss_scale * mtp_loss / num_tokens |
661 | | - ) |
| 612 | + hidden_states = process_mtp_loss( |
| 613 | + hidden_states=hidden_states, |
| 614 | + labels=labels, |
| 615 | + loss_mask=loss_mask, |
| 616 | + output_layer=self.output_layer, |
| 617 | + output_weight=output_weight, |
| 618 | + runtime_gather_output=runtime_gather_output, |
| 619 | + is_training=self.training, |
| 620 | + compute_language_model_loss=self.compute_language_model_loss, |
| 621 | + config=self.config, |
| 622 | + cp_group=self.pg_collection.cp, |
| 623 | + packed_seq_params=packed_seq_params, |
| 624 | + ) |
662 | 625 | sequence_parallel_override = False |
663 | 626 |
664 | 627 | if in_inference_mode and inference_context.config.materialize_only_last_token_logits: |
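The inline MTP loss computation above is collapsed into a single `process_mtp_loss` call. Judging from the removed lines, the helper presumably splits off one extra chunk of `hidden_states` per MTP layer, rolls the labels and loss mask left by one token per layer, computes each layer's masked cross-entropy, logs it during training, and attaches the scaled loss to the trunk `hidden_states` through `MTPLossAutoScaler` so it backpropagates with the main loss. The sketch below is reconstructed from that removed code; the actual helper in `multi_token_prediction.py` may differ, for example in its exact signature or in how the loss-averaging group is obtained.

```python
# Sketch of what process_mtp_loss is assumed to encapsulate, reconstructed from
# the inline code removed above; the real helper may differ in the details.
import torch
from megatron.core import parallel_state
from megatron.core.transformer.multi_token_prediction import (
    MTPLossAutoScaler,
    MTPLossLoggingHelper,
    roll_tensor,
)


def process_mtp_loss_sketch(
    hidden_states, labels, loss_mask, output_layer, output_weight,
    runtime_gather_output, is_training, compute_language_model_loss,
    config, cp_group, packed_seq_params,
):
    mtp_labels = labels.clone()
    # The trunk hidden states arrive concatenated with one extra chunk per MTP layer.
    hidden_states_list = torch.chunk(hidden_states, 1 + config.mtp_num_layers, dim=0)
    hidden_states = hidden_states_list[0]
    if loss_mask is None:
        # If no loss mask is provided, count every token.
        loss_mask = torch.ones_like(mtp_labels)
    for mtp_layer_number in range(config.mtp_num_layers):
        # Project the MTP layer's hidden states to logits.
        mtp_logits, _ = output_layer(
            hidden_states_list[mtp_layer_number + 1],
            weight=output_weight,
            runtime_gather_output=runtime_gather_output,
        )
        # Each MTP head predicts one more token ahead, so shift labels and mask left.
        mtp_labels, _ = roll_tensor(
            mtp_labels, shifts=-1, dims=-1, cp_group=cp_group,
            packed_seq_params=packed_seq_params,
        )
        loss_mask, num_tokens = roll_tensor(
            loss_mask, shifts=-1, dims=-1, cp_group=cp_group,
            packed_seq_params=packed_seq_params,
        )
        mtp_loss = loss_mask * compute_language_model_loss(mtp_labels, mtp_logits)
        if is_training:
            # Record the per-layer MTP loss for logging.
            MTPLossLoggingHelper.save_loss_to_tracker(
                torch.sum(mtp_loss) / num_tokens,
                mtp_layer_number,
                config.mtp_num_layers,
                avg_group=parallel_state.get_data_parallel_group(with_context_parallel=True),
            )
        # Attach the scaled MTP loss to the trunk hidden states via MTPLossAutoScaler,
        # an autograd helper that injects the auxiliary loss gradient in backward.
        mtp_loss_scale = config.mtp_loss_scaling_factor / config.mtp_num_layers
        if config.calculate_per_token_loss:
            hidden_states = MTPLossAutoScaler.apply(hidden_states, mtp_loss_scale * mtp_loss)
        else:
            hidden_states = MTPLossAutoScaler.apply(
                hidden_states, mtp_loss_scale * mtp_loss / num_tokens
            )
    return hidden_states
```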
@@ -715,27 +678,6 @@ def _postprocess( |
715 | 678 |
716 | 679 | return loss |
717 | 680 |
718 | | - def shared_embedding_or_output_weight(self) -> Tensor: |
719 | | - """Gets the embedding weight or output logit weights when share input embedding and |
720 | | - output weights set to True or when use Multi-Token Prediction (MTP) feature. |
721 | | -
722 | | - Returns: |
723 | | - Tensor: During pre processing or MTP process it returns the input embeddings weight. |
724 | | - Otherwise, during post processing it returns the final output layers weight. |
725 | | - """ |
726 | | - if self.pre_process or self.mtp_process: |
727 | | - # Multi-Token Prediction (MTP) need both embedding layer and output layer. |
728 | | - # So there will be both embedding layer and output layer in the mtp process stage. |
729 | | - # In this case, if share_embeddings_and_output_weights is True, the shared weights |
730 | | - # will be stored in embedding layer, and output layer will not have any weight. |
731 | | - assert hasattr( |
732 | | - self, 'embedding' |
733 | | - ), f"embedding is needed in this pipeline stage, but it is not initialized." |
734 | | - return self.embedding.word_embeddings.weight |
735 | | - elif self.post_process: |
736 | | - return self.output_layer.weight |
737 | | - return None |
738 | | - |
739 | 681 | def build_schedule_plan( |
740 | 682 | self, |
741 | 683 | input_ids: Tensor, |
@@ -826,20 +768,4 @@ def sharded_state_dict( |
826 | 768 | output_extra_state and output_extra_state.data |
827 | 769 | ), f'Expected output layer extra state to be empty, got: {output_extra_state}' |
828 | 770 |
829 | | - # Multi-Token Prediction (MTP) need embedding layer in mtp process stage. |
830 | | - # If MTP is not placed in the pre processing stage, we need to maintain a copy of |
831 | | - # embedding layer in the mtp process stage and tie it to the embedding in the pre |
832 | | - # processing stage. |
833 | | - # Now MTP loss is computed in post processing stage, so the output_layer is not needed. |
834 | | - if self.mtp_process and not self.pre_process: |
835 | | - emb_weight_key = f'{prefix}embedding.word_embeddings.weight' |
836 | | - emb_weight = self.embedding.word_embeddings.weight |
837 | | - tie_word_embeddings_state_dict( |
838 | | - sharded_state_dict, |
839 | | - emb_weight, |
840 | | - emb_weight_key, |
841 | | - tp_group=self.tp_group, |
842 | | - dp_cp_group=metadata['dp_cp_group'], |
843 | | - ) |
844 | | - |
845 | 771 | return sharded_state_dict |