Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
c2846a3
Add LoRA co-training support for HF EAGLE speculative decoding
yeyu-nvidia Mar 17, 2026
6257ce3
Add peft to speculative_decoding example requirements
yeyu-nvidia Mar 17, 2026
a08826b
Fix AttributeError for eagle_base_lora attributes in EagleModel.modify()
yeyu-nvidia Mar 17, 2026
9dec4f9
Fix TINY_EAGLE_CFG to pass CPU unit tests
yeyu-nvidia Mar 17, 2026
dccbdb4
Fix CPU unit test failures for EAGLE LoRA co-training
yeyu-nvidia Mar 17, 2026
99a11d3
Fix CPU unit test failures for EAGLE LoRA co-training
yeyu-nvidia Mar 18, 2026
045d984
Make peft import lazy in hf_spec_export to avoid ModuleNotFoundError
yeyu-nvidia Mar 18, 2026
7b7ce40
Fix AttributeError for LlamaConfig.dtype in older transformers versions
yeyu-nvidia Mar 18, 2026
6a5f2c1
Fix LlamaConfig.dtype compatibility across transformers versions
yeyu-nvidia Mar 18, 2026
71b0de9
Fix second LlamaConfig.dtype access in _compute_ttt_attention_mask
yeyu-nvidia Mar 18, 2026
b6f4bfb
Fix dtype mismatch in _compute_ttt_attention_mask for float32 models
yeyu-nvidia Mar 18, 2026
21834dd
Fix ruff line-length formatting in _compute_ttt_attention_mask
yeyu-nvidia Mar 18, 2026
3325aba
Address PR review feedback for eagle_base_lora feature
yeyu-nvidia Mar 19, 2026
ea3875d
Address remaining PR review feedback
yeyu-nvidia Mar 19, 2026
8503349
Expose eagle_base_lora co-training args in example scripts
yeyu-nvidia Mar 19, 2026
d4ecf56
Fix case pattern ordering bug in launch_train.sh for eagle_base_lora …
yeyu-nvidia Mar 19, 2026
14057c4
Revert detach and raise preservation loss weight to 1.0 for LoRA co-t…
yeyu-nvidia Mar 19, 2026
6616175
Add LoRA LR multiplier and detach base logits in EAGLE loss
yeyu-nvidia Mar 20, 2026
4779595
style: apply ruff formatting to eagle_utils.py
yeyu-nvidia Mar 20, 2026
0e179f2
Revert preservation loss weight to 0.1
yeyu-nvidia Mar 20, 2026
57bdd75
Add LM loss as direct training signal for LoRA co-training
yeyu-nvidia Mar 20, 2026
e941fca
Free cached GPU memory before AR validation to avoid OOM
yeyu-nvidia Mar 20, 2026
78a2af9
Enable EAGLE loss gradient flow through aux_hiddens to LoRA
yeyu-nvidia Mar 20, 2026
4ca9927
Use scaled gradient leak for LoRA through aux hidden states
yeyu-nvidia Mar 20, 2026
230a763
Switch to alternating EAGLE/LoRA training phases
yeyu-nvidia Mar 23, 2026
f6c527f
Split LoRA training into separate EAGLE-loss and preservation phases
yeyu-nvidia Mar 23, 2026
e268677
Add merge_lora.py script to fuse trained LoRA weights into base model
yeyu-nvidia Mar 23, 2026
c22257b
Skip EAGLE forward in Phase C (preservation-only) to save compute
yeyu-nvidia Mar 23, 2026
2c90281
Log per-phase losses (eagle, lora_eagle, preservation) to wandb
yeyu-nvidia Mar 23, 2026
fa042f1
Fix inhomogeneous train_acc array from Phase C empty lists
yeyu-nvidia Mar 23, 2026
2407c8b
Fix merge_lora.py: use LoraLayer.delete_adapter instead of private API
yeyu-nvidia Mar 24, 2026
3e4d41a
Rewrite merge_lora.py to use PeftModel.merge_and_unload()
yeyu-nvidia Mar 24, 2026
51e7d57
Fix merge_lora.py key mismatch: keep model. prefix and add .default
yeyu-nvidia Mar 24, 2026
3c31fa2
Export LoRA keys in PeftModel-compatible format (.default suffix)
yeyu-nvidia Mar 24, 2026
d621c38
Fail merge_lora.py on missing LoRA adapter keys instead of silently c…
yeyu-nvidia Mar 24, 2026
b43bc70
Infer LoRA target_modules from exported keys in adapter config
yeyu-nvidia Mar 24, 2026
ddb6275
Fix target_modules inference: use [-4] index after .default rename
yeyu-nvidia Mar 24, 2026
e2d07bd
Fix merge_lora.py: add base_model.model. prefix for PeftModel compati…
yeyu-nvidia Mar 24, 2026
e15a042
Fix merge_lora.py for peft 0.18+: no prefix, strip .default from keys
yeyu-nvidia Mar 24, 2026
a4fc244
Fix merge_lora.py: suppress spurious peft warning, verify via norm check
yeyu-nvidia Mar 24, 2026
a74bf92
Fix merge_lora.py: keep .default in keys, verify lora_B norms
yeyu-nvidia Mar 24, 2026
f5374a4
Fix merge_lora.py to use explicit key mapping and pin peft==0.18.1
yeyu-nvidia Mar 24, 2026
0d6ad77
Simplify LoRA co-training to 2 phases with hidden-state gradient flow
yeyu-nvidia Mar 30, 2026
7b61e69
style: fix ruff formatting in aux hidden states hook
yeyu-nvidia Mar 30, 2026
4317669
Move CPU buffers to CUDA before Trainer init to fix DDP multi-node
yeyu-nvidia Mar 31, 2026
f242e8d
Simplify LoRA co-training to single phase: eagle + preservation every…
yeyu-nvidia Mar 31, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 55 additions & 3 deletions examples/speculative_decoding/eagle_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,57 @@ def make_eagle_supervised_data_module(
class EagleTrainerWithAccLog(Trainer):
"""Wrapper around Trainer that logs training accuracy."""

def __init__(self, *args, lora_lr_multiplier: float = 1.0, **kwargs):
    """Trainer that can train LoRA parameters at a scaled learning rate.

    Args:
        lora_lr_multiplier: Factor applied to the base learning rate for
            trainable parameters whose name contains ``"lora_"``. The
            default of 1.0 leaves the optimizer untouched.
    """
    super().__init__(*args, **kwargs)
    self.lora_lr_multiplier = lora_lr_multiplier

def create_optimizer(self):
    """Create the optimizer, giving LoRA parameters a scaled learning rate.

    Delegates to the parent implementation first. When
    ``lora_lr_multiplier`` differs from 1.0 and trainable LoRA parameters
    exist, each optimizer param group is split so that parameters whose
    name contains ``"lora_"`` land in a group whose ``lr`` is multiplied
    by the configured factor; all other group settings (weight decay,
    betas, ...) are inherited unchanged.

    Returns:
        The (possibly regrouped) optimizer.
    """
    super().create_optimizer()
    if self.lora_lr_multiplier == 1.0:
        return self.optimizer

    # Identify trainable LoRA parameters by object identity so the group
    # membership test below is O(1) per parameter.
    lora_param_ids = {
        id(param)
        for name, param in self.model.named_parameters()
        if "lora_" in name and param.requires_grad
    }
    if not lora_param_ids:
        return self.optimizer

    regrouped = []
    for group in self.optimizer.param_groups:
        lora_params, other_params = [], []
        for param in group["params"]:
            (lora_params if id(param) in lora_param_ids else other_params).append(param)
        scaled_lr = group["lr"] * self.lora_lr_multiplier
        if not lora_params:
            # No LoRA parameters here — keep the group as-is.
            regrouped.append(group)
        elif not other_params:
            # Group is purely LoRA: just bump its learning rate.
            regrouped.append({**group, "lr": scaled_lr})
        else:
            # Mixed group: split, non-LoRA first to preserve ordering.
            regrouped.append({**group, "params": other_params})
            regrouped.append({**group, "params": lora_params, "lr": scaled_lr})
    self.optimizer.param_groups = regrouped
    return self.optimizer

def compute_loss(self, *args, **kwargs):
    """Compute the loss, recording train accs and per-component losses on the trainer state."""
    state = self.state
    # Lazily create the accumulators on first call; a logging callback is
    # expected to consume and reset them.
    if not hasattr(state, "training_accs"):
        state.training_accs = []
    if not hasattr(state, "component_losses"):
        state.component_losses = {"eagle": [], "preservation": []}
    # NOTE(review): num_items_in_batch is dropped rather than forwarded —
    # presumably because the model forward does not accept it; confirm.
    kwargs.pop("num_items_in_batch", None)
    loss, outputs = super().compute_loss(return_outputs=True, *args, **kwargs)
    # Only keep steps that produced at least one non-zero accuracy entry
    # (skips phases that yield empty/zeroed accuracy lists).
    if hasattr(outputs, "train_acc") and any(outputs.train_acc):
        state.training_accs.append(outputs.train_acc)
    # Record each loss component that the model reported this step.
    for component_key, output_attr in (
        ("eagle", "eagle_loss"),
        ("preservation", "preservation_loss"),
    ):
        component_val = getattr(outputs, output_attr, None)
        if component_val is not None:
            state.component_losses[component_key].append(component_val.item())
    return loss


Expand Down Expand Up @@ -230,8 +273,16 @@ def on_log(self, args, state, control, **kwargs):
if self.estimate_ar:
wandb.log({"estimated_training_ar": est_ar}, step=state.global_step)

# reset training_accs
# Log per-component losses
if hasattr(state, "component_losses"):
for key, vals in state.component_losses.items():
if vals:
wandb.log({f"{key}_loss": np.mean(vals)}, step=state.global_step)

# reset training_accs and component_losses
state.training_accs = []
if hasattr(state, "component_losses"):
state.component_losses = {"eagle": [], "preservation": []}
return control

def on_step_end(self, args, state, control, **kwargs):
Expand All @@ -240,6 +291,7 @@ def on_step_end(self, args, state, control, **kwargs):
return control
if state.global_step % self.ar_validate_steps == 0 and state.global_step > 0:
print_rank_0("Running AR validation...")
torch.cuda.empty_cache()
try:
ars = validate_ar(
model=kwargs["model"],
Expand Down
44 changes: 44 additions & 0 deletions examples/speculative_decoding/launch_train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,30 @@ while [ $# -gt 0 ]; do
if [[ "$1" != *=* ]]; then shift; fi
FSDP="${1#*=}"
;;
# Arms accept both "--flag value" and "--flag=value": when there is no '=',
# shift once so $1 becomes the value; "${1#*=}" then strips any "flag=" prefix.
--eagle_base_lora_rank*)
    if [[ "$1" != *=* ]]; then shift; fi
    EAGLE_BASE_LORA_RANK="${1#*=}"
    ;;
--eagle_base_lora_alpha*)
    if [[ "$1" != *=* ]]; then shift; fi
    EAGLE_BASE_LORA_ALPHA="${1#*=}"
    ;;
--eagle_base_lora_target_modules*)
    if [[ "$1" != *=* ]]; then shift; fi
    EAGLE_BASE_LORA_TARGET_MODULES="${1#*=}"
    ;;
--eagle_base_lora_preservation_loss_weight*)
    if [[ "$1" != *=* ]]; then shift; fi
    EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT="${1#*=}"
    ;;
--eagle_base_lora_lr_multiplier*)
    if [[ "$1" != *=* ]]; then shift; fi
    EAGLE_BASE_LORA_LR_MULTIPLIER="${1#*=}"
    ;;
# NOTE: this catch-all prefix pattern must remain LAST among the
# --eagle_base_lora* arms; placed earlier it would shadow every more
# specific --eagle_base_lora_<suffix> pattern above.
--eagle_base_lora*)
    if [[ "$1" != *=* ]]; then shift; fi
    EAGLE_BASE_LORA="${1#*=}"
    ;;
*)
>&2 printf "Error: Invalid argument ${1#*=}\n"
exit 1
Expand Down Expand Up @@ -184,6 +208,12 @@ DRAFT_VOCAB_CACHE=${DRAFT_VOCAB_CACHE:-""}
MIX_HIDDEN_STATES=${MIX_HIDDEN_STATES:-"False"}
DISABLE_TORCH_COMPILE=${DISABLE_TORCH_COMPILE:-"False"}
NUM_TTT_STEPS=${NUM_TTT_STEPS:-3}
# Defaults for base-model LoRA co-training (only forwarded to main.py when
# EAGLE_BASE_LORA is "True"; see the LORA_ARGS assembly below).
EAGLE_BASE_LORA=${EAGLE_BASE_LORA:-"False"}
EAGLE_BASE_LORA_RANK=${EAGLE_BASE_LORA_RANK:-64}
EAGLE_BASE_LORA_ALPHA=${EAGLE_BASE_LORA_ALPHA:-16.0}
# Empty means: let main.py fall back to peft's default target modules.
EAGLE_BASE_LORA_TARGET_MODULES=${EAGLE_BASE_LORA_TARGET_MODULES:-""}
EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT=${EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT:-1.0}
EAGLE_BASE_LORA_LR_MULTIPLIER=${EAGLE_BASE_LORA_LR_MULTIPLIER:-1.0}

USE_FAKE_BASE_FOR_OFFLINE=${USE_FAKE_BASE_FOR_OFFLINE:-"False"}
TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-"False"}
Expand Down Expand Up @@ -218,6 +248,19 @@ else
VLM_ARGS=""
fi

# Assemble the LoRA CLI flags forwarded to main.py; empty string when
# co-training is disabled so the final command is unchanged.
if [[ "$EAGLE_BASE_LORA" == "True" ]]; then
    # Backslash-newlines inside double quotes are line continuations, so
    # LORA_ARGS becomes a single space-separated flag string.
    LORA_ARGS="--eagle_base_lora True \
        --eagle_base_lora_rank $EAGLE_BASE_LORA_RANK \
        --eagle_base_lora_alpha $EAGLE_BASE_LORA_ALPHA \
        --eagle_base_lora_preservation_loss_weight $EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT \
        --eagle_base_lora_lr_multiplier $EAGLE_BASE_LORA_LR_MULTIPLIER"
    # Only pass target modules when explicitly set; otherwise main.py uses
    # peft's defaults.
    if [[ "$EAGLE_BASE_LORA_TARGET_MODULES" != "" ]]; then
        LORA_ARGS="$LORA_ARGS --eagle_base_lora_target_modules $EAGLE_BASE_LORA_TARGET_MODULES"
    fi
else
    LORA_ARGS=""
fi

if [[ "$TOTAL_GPU" -gt 1 && "$FSDP" == "True" ]]; then
#Use FSDP2 when multi GPU available
FSDP_ARGS="--fsdp 'full_shard' --fsdp_config ${SCRIPT_DIR}/fsdp_config.json"
Expand Down Expand Up @@ -283,6 +326,7 @@ CMD="accelerate launch $MULTI_NODE_ARGS --mixed_precision bf16 ${SCRIPT_DIR}/mai
--cp_size $CP_SIZE \
--dp_shard_size $DP_SHARD_SIZE \
--num_ttt_steps $NUM_TTT_STEPS \
$LORA_ARGS \
"

start_time=$(date +%s)
Expand Down
68 changes: 68 additions & 0 deletions examples/speculative_decoding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,49 @@ class EagleArguments:
default=3,
metadata={"help": "Number of train-time-test steps to use during training."},
)
# --- Base-model LoRA co-training options (consumed by main.py's train()) ---
# Master switch; requires `peft` and is incompatible with offline training.
eagle_base_lora: bool = field(
    default=False,
    metadata={
        "help": (
            "Whether to add LoRA adapters to the base model for co-training with the EAGLE "
            "draft module. Requires the `peft` library. Incompatible with offline training."
        )
    },
)
# LoRA rank r of the injected adapter matrices.
eagle_base_lora_rank: int = field(
    default=64,
    metadata={"help": "LoRA rank for the base model adapters."},
)
# LoRA alpha scaling factor.
eagle_base_lora_alpha: float = field(
    default=16.0,
    metadata={"help": "LoRA alpha (scaling) for the base model adapters."},
)
# None defers to peft's default target modules; otherwise a comma-separated
# pattern list that train() splits into a Python list.
eagle_base_lora_target_modules: str | None = field(
    default=None,
    metadata={
        "help": (
            "Comma-separated list of module name patterns to apply LoRA to in the base model "
            "(e.g. 'q_proj,v_proj'). Defaults to peft's default target modules."
        )
    },
)
# Weight of the KL preservation term that keeps the LoRA-adapted base model
# close to the original base model outputs.
eagle_base_lora_preservation_loss_weight: float = field(
    default=1.0,
    metadata={
        "help": (
            "Weight for the preservation loss that minimizes KL divergence between the "
            "LoRA-adapted base model output and the original base model output."
        )
    },
)
# Multiplier applied to the base LR for LoRA parameters (see
# EagleTrainerWithAccLog in eagle_utils.py).
eagle_base_lora_lr_multiplier: float = field(
    default=1.0,
    metadata={
        "help": (
            "Learning rate multiplier for LoRA parameters relative to the base learning rate."
        )
    },
)


def train():
Expand Down Expand Up @@ -217,13 +260,23 @@ def train():
json.load(open(eagle_args.eagle_config)) if eagle_args.eagle_config else {}
)

lora_target_modules = (
eagle_args.eagle_base_lora_target_modules.split(",")
if eagle_args.eagle_base_lora_target_modules
else None
)
config = {
"eagle_decoder_type": eagle_args.eagle_decoder_type,
"eagle_offline": use_offline_training,
"eagle_mix_hidden_states": eagle_args.mix_hidden_states,
"eagle_use_torch_compile": not eagle_args.disable_torch_compile,
"eagle_ttt_steps": eagle_args.num_ttt_steps,
"eagle_architecture_config": custom_config,
"eagle_base_lora": eagle_args.eagle_base_lora,
"eagle_base_lora_rank": eagle_args.eagle_base_lora_rank,
"eagle_base_lora_alpha": eagle_args.eagle_base_lora_alpha,
"eagle_base_lora_target_modules": lora_target_modules,
"eagle_base_lora_preservation_loss_weight": eagle_args.eagle_base_lora_preservation_loss_weight,
}

mtsp.convert(model, [("eagle", config)])
Expand All @@ -239,6 +292,20 @@ def train():
else:
raise Exception(f"{training_args.mode} is not supported!")

# Move any remaining CPU buffers to CUDA so DDP (NCCL-only) can broadcast
# them. We iterate named_buffers and reassign via the owning module to
# keep the module tree consistent. Parameters are left on CPU — the HF
# Trainer will move them during init.
if torch.cuda.is_available():
_target_dev = torch.device("cuda", 0)
for name, buf in list(model.named_buffers()):
if buf.device.type == "cpu":
parts = name.split(".")
mod = model
for p in parts[:-1]:
mod = getattr(mod, p)
setattr(mod, parts[-1], buf.to(_target_dev))

print_rank_0("Loading dataset...")
if training_args.mode == "eagle3":
data_module = make_eagle_supervised_data_module(
Expand All @@ -250,6 +317,7 @@ def train():
processing_class=tokenizer,
args=training_args,
callbacks=[EagleTrainingPlot(training_args.ar_validate_steps, training_args.estimate_ar)],
lora_lr_multiplier=eagle_args.eagle_base_lora_lr_multiplier,
**data_module,
)

Expand Down
1 change: 1 addition & 0 deletions examples/speculative_decoding/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
accelerate==1.12.0
peft==0.18.1
transformers==5.0.0rc1
Loading
Loading